
[RFC] pass write value to in_range pointers

Message ID 20090622151631.GA14780@redhat.com (mailing list archive)
State New, archived

Commit Message

Michael S. Tsirkin June 22, 2009, 3:16 p.m. UTC
It seems that a lot of complexity and trickiness with iosignalfd is
handling the group/item relationship, which comes about because kvm does
not currently let a device on the bus claim a write transaction based on the
value written.  This could be greatly simplified if the value written
was passed to the in_range check for write operation.  We could then
simply make each kvm_iosignalfd a device on the bus.

What does everyone think of the following lightly tested patch?
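
(To make this concrete, here is a minimal sketch of what a per-iosignalfd device could look like under the proposed callback signature. The struct layout, field names, and match logic below are illustrative assumptions, not the actual iosignalfd code.)

/*
 * Illustrative sketch only, not part of the patch: with write_val
 * available, each iosignalfd can be a plain kvm_io_device that claims
 * only the writes whose value matches its trigger.  The struct and
 * field names here are assumptions.
 */
struct kvm_iosignalfd {
	struct kvm_io_device dev;
	struct eventfd_ctx *eventfd;
	gpa_t addr;
	int len;
	u64 match;		/* value to trigger on */
	bool wildcard;		/* trigger on any value */
};

static int iosignalfd_in_range(struct kvm_io_device *this, gpa_t addr,
			       int len, int is_write, void *write_val)
{
	struct kvm_iosignalfd *p = this->private;
	u64 val = 0;

	if (!is_write || addr != p->addr || len != p->len)
		return 0;
	if (p->wildcard)
		return 1;
	memcpy(&val, write_val, len);	/* len is 1, 2, 4 or 8 */
	return val == p->match;
}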

-->

Subject: kvm: pass value to in_range callback

For write transactions, pass the value written to in_range checks so
that we can make each iosignalfd a separate device on kvm bus.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

---



Comments

Gregory Haskins June 22, 2009, 3:45 p.m. UTC | #1
Michael S. Tsirkin wrote:
> It seems that a lot of complexity and trickiness with iosignalfd is
> handling the group/item relationship, which comes about because kvm does
> not currently let a device on the bus claim a write transaction based on the
> value written.  This could be greatly simplified if the value written
> was passed to the in_range check for write operation.  We could then
> simply make each kvm_iosignalfd a device on the bus.
>
> What does everyone think of the following lightly tested patch?
>   

Hi Michael,
  It's interesting, but I am not convinced it's necessary.  We created the
group/item layout because iosignalfds are unique in that they are
probably the only IO device that wants to do some kind of address
aliasing.  With what you are proposing here, you are adding aliasing
support to the general infrastructure which I am not (yet) convinced is
necessary.  If there isn't a use case for other devices to have
aliasing, I would think the logic is best contained in iosignalfd.  Do
you have something in mind?

Kind Regards,
-Greg

Michael S. Tsirkin June 22, 2009, 4:08 p.m. UTC | #2
On Mon, Jun 22, 2009 at 11:45:00AM -0400, Gregory Haskins wrote:
> Michael S. Tsirkin wrote:
> > It seems that a lot of complexity and trickiness with iosignalfd is
> > handling the group/item relationship, which comes about because kvm does
> > not currently let a device on the bus claim a write transaction based on the
> > value written.  This could be greatly simplified if the value written
> > was passed to the in_range check for write operation.  We could then
> > simply make each kvm_iosignalfd a device on the bus.
> >
> > What does everyone think of the following lightly tested patch?
> >   
> 
> Hi Michael,
>   Its interesting, but I am not convinced its necessary.  We created the
> group/item layout because iosignalfds are unique in that they are
> probably the only IO device that wants to do some kind of address
> aliasing.

We actually already have aliasing: is_write flag is used for this
purpose. Actually, it's possible to remove is_write by passing
a null pointer in write_val for reads. I like this a bit less as
the code generated is less compact ... Avi, what do you think?
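
(A sketch of that variant, assuming a NULL write_val denotes a read; pit_in_range() here merely stands in for any device callback:)

static int pit_in_range(struct kvm_io_device *this, gpa_t addr,
			int len, void *write_val)
{
	/* write_val == NULL means a read; the PIT claims the range
	 * for both directions, so it ignores write_val entirely. */
	return addr >= KVM_PIT_BASE_ADDRESS &&
	       addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH;
}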

>  With what you are proposing here, you are adding aliasing
> support to the general infrastructure which I am not (yet) convinced is
> necessary.

Infrastructure is a big name for something that adds a total of 10 lines to kvm.
And it should at least halve the size of your 450-line patch.

> If there isn't a use case for other devices to have
> aliasing, I would think the logic is best contained in iosignalfd.  Do
> you have something in mind?

One is enough :)
Seriously, do you see that this saves you all of RCU, linked lists and
counters? You don't need to keep track of iofds, you don't need to
implement your own lookup logic - you just use the kvm device
and that's it.
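
(Concretely, assigning an iosignalfd would reduce to a plain registration. A sketch, reusing the assumed kvm_iosignalfd layout from the commit message above and the kvm_io_bus_register_dev() helper visible in the patch:)

static void kvm_assign_iosignalfd(struct kvm *kvm, struct kvm_iosignalfd *p)
{
	p->dev.in_range = iosignalfd_in_range;	/* value-aware claim */
	p->dev.write = iosignalfd_write;	/* assumed: signals p->eventfd */
	p->dev.private = p;
	kvm_io_bus_register_dev(&kvm->pio_bus, &p->dev);
}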

Gregory Haskins June 22, 2009, 4:29 p.m. UTC | #3
Michael S. Tsirkin wrote:
> On Mon, Jun 22, 2009 at 11:45:00AM -0400, Gregory Haskins wrote:
>   
>> Michael S. Tsirkin wrote:
>>     
>>> It seems that a lot of complexity and trickiness with iosignalfd is
>>> handling the group/item relationship, which comes about because kvm does
>>> not currently let a device on the bus claim a write transaction based on the
>>> value written.  This could be greatly simplified if the value written
>>> was passed to the in_range check for write operation.  We could then
>>> simply make each kvm_iosignalfd a device on the bus.
>>>
>>> What does everyone think of the following lightly tested patch?
>>>   
>>>       
>> Hi Michael,
>>   Its interesting, but I am not convinced its necessary.  We created the
>> group/item layout because iosignalfds are unique in that they are
>> probably the only IO device that wants to do some kind of address
>> aliasing.
>>     
>
> We actually already have aliasing: is_write flag is used for this
> purpose.

Yes, but read/write address aliasing is not the same thing as
multi-match data aliasing.  Besides, your proposal also breaks some of
the natural relationship models (e.g. all the aliased iosignal_items
always belong to the same underlying device.  io_bus entries have an
arbitrary topology).


>  Actually, it's possible to remove is_write by passing
> a null pointer in write_val for reads. I like this a bit less as
> the code generated is less compact ... Avi, what do you think?
>
>   
>>  With what you are proposing here, you are adding aliasing
>> support to the general infrastructure which I am not (yet) convinced is
>> necessary.
>>     
>
> Infrastructure is a big name for something that adds a total of 10 lines to kvm.
> And it should at least halve the size of your 450-line patch.
>   

Your patch isn't complete until some critical missing features are added
to io_bus, though, so it's not really just 10 lines.  For one, it will
need to support much more than 6 devices.  It will also need to support
multiple matches.  Also, you are proposing a general interface change
that doesn't make sense to all but one device type.  So now every
io-device developer that comes along will scratch their head at what to
do with that field.

None of these are insurmountable hurdles, but my point is that today the
complexity is encapsulated in the proper place, IMO.  E.g., the one and
only device that cares to do this "weird" thing handles it behind an
interface that makes sense to all parties involved.
>   
>> If there isn't a use case for other devices to have
>> aliasing, I would think the logic is best contained in iosignalfd.  Do
>> you have something in mind?
>>     
>
> One is enough :)
>   

I am not convinced yet. ;)  It appears to me that we are leaking
iosignalfd-isms into the general code.  If there is another device that
wants to do something similar, ok.  But I can't think of any.

> Seriously, do you see that this saves you all of RCU, linked lists and
> counters?

Well, also keep in mind we will probably be converting io_bus to RCU
very soon, so we are going the opposite direction ;)

Kind Regards,
-Greg
Michael S. Tsirkin June 22, 2009, 5:27 p.m. UTC | #4
On Mon, Jun 22, 2009 at 12:29:10PM -0400, Gregory Haskins wrote:
> Michael S. Tsirkin wrote:
> > On Mon, Jun 22, 2009 at 11:45:00AM -0400, Gregory Haskins wrote:
> >   
> >> Michael S. Tsirkin wrote:
> >>     
> >>> It seems that a lot of complexity and trickiness with iosignalfd is
> >>> handling the group/item relationship, which comes about because kvm does
> >>> not currently let a device on the bus claim a write transaction based on the
> >>> value written.  This could be greatly simplified if the value written
> >>> was passed to the in_range check for write operation.  We could then
> >>> simply make each kvm_iosignalfd a device on the bus.
> >>>
> >>> What does everyone think of the following lightly tested patch?
> >>>   
> >>>       
> >> Hi Michael,
> >>   Its interesting, but I am not convinced its necessary.  We created the
> >> group/item layout because iosignalfds are unique in that they are
> >> probably the only IO device that wants to do some kind of address
> >> aliasing.
> >>     
> >
> > We actually already have aliasing: is_write flag is used for this
> > purpose.
> 
> Yes, but read/write address aliasing is not the same thing is
> multi-match data aliasing.

What's the big difference?

> Besides, your proposal also breaks

s/break/removes limitation/ :)

> some of
> the natural relationship models
> (e.g. all the aliased iosignal_items
> always belong to the same underlying device.  io_bus entries have an
> arbitrary topology).

iosignal_items are an artifact; they are not seen by the user -
they are just a workaround for an API limitation.

And they are only grouped if the same PIO offset is used for all accesses,
which is not always the case. If a device uses several PIO offsets
(as virtio does), you create separate devices for a single guest device too.

> 
> >  Actually, it's possible to remove is_write by passing
> > a null pointer in write_val for reads. I like this a bit less as
> > the code generated is less compact ... Avi, what do you think?
> >
> >   
> >>  With what you are proposing here, you are adding aliasing
> >> support to the general infrastructure which I am not (yet) convinced is
> >> necessary.
> >>     
> >
> > Infrastructure is a big name for something that adds a total of 10 lines to kvm.
> > And it should at least halve the size of your 450-line patch.
> >   
> 
> Your patch isn't complete until some critical missing features are added
> to io_bus, though, so its not really just 10 lines.
>  For one, it will
> need to support much more than 6 devices.

Isn't this like a #define change? With the item patch we are still
limited in the number of groups we can create.

What we gain is a simple array/list instead of a tree of
linked lists that makes Swiss cheese out of the CPU data cache.

>  It will also need to support
> multiple matches.

What, signal many fds on the same address/value pair?
I see this as a bug. Why is this a good thing to support?
Just increases the chance of leaking this fd.

> Also you are proposing an general interface change
> that doesn't make sense to all but one device type.  So now every
> io-device developer that comes along will scratch their head at what to
> do with that field.

What do they do with is_write now? Ignore it. It's used in a grand
total of 1 place.

> 
> None of these are insurmountable hurdles, but my point is that today the
> complexity is encapsulated in the proper place IMO.

It's better to get rid of complexity than encapsulate it.

>  E.g. The one and
> only device that cares to do this "weird" thing handles it behind an
> interface that makes sense to all parties involved.
> >   
> >> If there isn't a use case for other devices to have
> >> aliasing, I would think the logic is best contained in iosignalfd.  Do
> >> you have something in mind?
> >>     
> >
> > One is enough :)
> >   
> 
> I am not convinced yet. ;)  It appears to me that we are leaking
> iosignalfd-isms into the general code.  If there is another device that
> wants to do something similar, ok.  But I can't think of any.

You never know. is_write was used by a grand total of 1 user: coalesced_mmio,
then your patch comes along ...


> > Seriously, do you see that this saves you all of RCU, linked lists and
> > counters?
> 
> Well, also keep in mind we will probably be converting io_bus to RCU
> very soon, so we are going the opposite direction ;)
> 
> Kind Regards,
> -Greg
> 

Same direction. Let's put RCU in iobus, we don't need another one on
top of it. That's encapsulating complexity.

Gregory Haskins June 23, 2009, 4:04 a.m. UTC | #5
Michael S. Tsirkin wrote:
> On Mon, Jun 22, 2009 at 12:29:10PM -0400, Gregory Haskins wrote:
>   
>> Michael S. Tsirkin wrote:
>>     
>>> On Mon, Jun 22, 2009 at 11:45:00AM -0400, Gregory Haskins wrote:
>>>   
>>>       
>>>> Michael S. Tsirkin wrote:
>>>>     
>>>>         
>>>>> It seems that a lot of complexity and trickiness with iosignalfd is
>>>>> handling the group/item relationship, which comes about because kvm does
>>>>> not currently let a device on the bus claim a write transaction based on the
>>>>> value written.  This could be greatly simplified if the value written
>>>>> was passed to the in_range check for write operation.  We could then
>>>>> simply make each kvm_iosignalfd a device on the bus.
>>>>>
>>>>> What does everyone think of the following lightly tested patch?
>>>>>   
>>>>>       
>>>>>           
>>>> Hi Michael,
>>>>   Its interesting, but I am not convinced its necessary.  We created the
>>>> group/item layout because iosignalfds are unique in that they are
>>>> probably the only IO device that wants to do some kind of address
>>>> aliasing.
>>>>     
>>>>         
>>> We actually already have aliasing: is_write flag is used for this
>>> purpose.
>>>       
>> Yes, but read/write address aliasing is not the same thing is
>> multi-match data aliasing.
>>     
>
> What's the big difference?
>   

Well, for one, it's not very clear what the benefit of the read/write
aliasing even is. ;)  Apparently coalesced_mmio uses it, but even so I
doubt that is for the purposes of having one device do reads while
another does writes.  I could be wrong, though.
>   
>> Besides, your proposal also breaks
>>     
>
> s/break/removes limitation/ :)
>
>   
>> some of
>> the natural relationship models
>> (e.g. all the aliased iosignal_items
>> always belong to the same underlying device.  io_bus entries have an
>> arbitrary topology).
>>     
>
> iosignal_item is an artifact, they are not seen by user -
> they are just a work around an API limitation.
>   

Well, not really.  The "limitation" is a natural attribute of real
hardware too.  I don't know of a whole bunch of real hardware that
aliases multiple address decoders to the same value, do you?  How would
you handle reads?

The way I think of these items is not as unique devices in the io_bus
sense.  We have one address decoder (the group) listening on the bus to
a unique address.  That group provides a data-decoding service, where
you can program what event gets triggered for which value (or wildcard =
all values).
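
(A sketch of that model, illustrative only and not the actual iosignalfd code: the group is the single device on the bus, and the data decode happens inside its write handler:)

struct iosignalfd_item {
	struct list_head list;
	struct eventfd_ctx *eventfd;
	u64 match;
	bool wildcard;
};

struct iosignalfd_group {
	struct kvm_io_device dev;	/* the one address decoder */
	gpa_t addr;
	int len;
	struct list_head items;		/* data-match entries */
};

static void iosignalfd_group_write(struct kvm_io_device *this, gpa_t addr,
				   int len, const void *val)
{
	struct iosignalfd_group *group = this->private;
	struct iosignalfd_item *item;
	u64 data = 0;

	memcpy(&data, val, len);
	list_for_each_entry(item, &group->items, list)
		if (item->wildcard || item->match == data)
			eventfd_signal(item->eventfd, 1);
}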

It is unnatural to turn it around and say that the io_bus now scans
exhaustively on the address/data tuple, and we support aliasing such
that multiple decoders may coexist.  There isn't any other device
(hardware or software) that will actually do this aside from iosignalfd,
to my knowledge.  So why distort the entire io_bus subsystem's
addressing scheme to match?  I don't get the motivation, as it appears
to me to be already handled ideally as it is.

Perhaps I am contributing to this confusion with my decision to make the
group creation implicit with the first item creation.  Would you feel
better about this if we made the distinction between group and item more
explicit? (E.g. IOSIGNALFD_GROUP_ASSIGN and IOSIGNALFD_ITEM_ASSIGN
verbs).  I am fine with that change, I suppose.


> And they are only grouped if the same PIO offset is used for all accesses.
> Why is not always the case. If a device uses several PIO offsets
> (as virtio does), you create separate devices for a single guest device too.
>   

That's ok.  As I stated above, I view this as a data-match service
against a single address decoder, not as multiple decoders aliased to
the same address/data pair.  It's perfectly fine for a logical device to
employ multiple decoders, but it's unnatural for there to be multiple
decoders to the same address.  It's also unnatural to use the data as
part of the decode criteria for a general IO cycle.

>   
>>>  Actually, it's possible to remove is_write by passing
>>> a null pointer in write_val for reads. I like this a bit less as
>>> the code generated is less compact ... Avi, what do you think?
>>>
>>>   
>>>       
>>>>  With what you are proposing here, you are adding aliasing
>>>> support to the general infrastructure which I am not (yet) convinced is
>>>> necessary.
>>>>     
>>>>         
>>> Infrastructure is a big name for something that adds a total of 10 lines to kvm.
>>> And it should at least halve the size of your 450-line patch.
>>>   
>>>       
>> Your patch isn't complete until some critical missing features are added
>> to io_bus, though, so its not really just 10 lines.
>>  For one, it will
>> need to support much more than 6 devices.
>>     
>
> Isn't this like a #define change? With the item patch we are still
> limited in the number of groups we can create.
>   

Well, no, because we want to support ~512 of these things, so the array
is not going to scale.  Right now iosignalfd is implemented as a
tree+list, which is a little better for scalability but even that needs
to change soon.  We probably ultimately want to use either an rbtree or a
radix tree for all of these things.  But that is an optimization patch
for a later day ;)

> What we gain is a simple array/list instead of a tree of
> linked lists that makes cheshire cheese out of CPU data cache.
>
>   

Yeah, that is a downside, I admit.  But the current io_bus array will
probably not scale going forward either, so it needs to be addressed on
that side as well.  We can always implement a SW cache, if need be, to
account for this.

>>  It will also need to support
>> multiple matches.
>>     
>
> What, signal many fds on the same address/value pair?
> I see this as a bug. Why is this a good thing to support?
> Just increases the chance of leaking this fd.
>   

I believe Avi asked for this feature specifically, so I will defer to him.


>   
>> Also you are proposing an general interface change
>> that doesn't make sense to all but one device type.  So now every
>> io-device developer that comes along will scratch their head at what to
>> do with that field.
>>     
>
> What do they do with is_write now? Ignore it. It's used in a whole
> of 1 place.
>
>   
>> None of these are insurmountable hurdles, but my point is that today the
>> complexity is encapsulated in the proper place IMO.
>>     
>
> It's better to get rid of complexity than encapsulate it.
>   

All you are doing is moving the complexity to a place where I don't see
it serving a benefit to any code other than the one you just took it
away from.  Like it or not, io_bus will have to address scale and
data-matching to do what you want.  And the address/data match doesn't
make sense to anyone else afaict.  In addition, data-matching at that
level is even harder because we can make little simplifications with the
way we do things now (like enforcing that all addr/lens conform to the
group addr/len).  If you go to a general address/data tuple decoder, you
will presumably need to have a much more flexible decoder design
(overlapping regions, etc).

Again, not insurmountable... but I am not seeing an advantage that makes
this change worthwhile either.


>   
>>  E.g. The one and
>> only device that cares to do this "weird" thing handles it behind an
>> interface that makes sense to all parties involved.
>>     
>>>   
>>>       
>>>> If there isn't a use case for other devices to have
>>>> aliasing, I would think the logic is best contained in iosignalfd.  Do
>>>> you have something in mind?
>>>>     
>>>>         
>>> One is enough :)
>>>   
>>>       
>> I am not convinced yet. ;)  It appears to me that we are leaking
>> iosignalfd-isms into the general code.  If there is another device that
>> wants to do something similar, ok.  But I can't think of any.
>>     
>
> You never know. is_write was used by a whole of 1 user: coalesced_mmio,
> then your patch comes along ...
>   

Can it wait until that happens, then?  :)  I'm already at v8 and a v9 is
brewing.  It's not like we can't change later if something better comes
along.

>
>   
>>> Seriously, do you see that this saves you all of RCU, linked lists and
>>> counters?
>>>       
>> Well, also keep in mind we will probably be converting io_bus to RCU
>> very soon, so we are going the opposite direction ;)
>>
>> Kind Regards,
>> -Greg
>>
>>     
>
> Same direction. Let's put RCU in iobus, we don't need another one on
> top of it. That's encapsulating complexity.
>   

Is this really all that complicated?  The iosignalfd code is fairly
trivial even with the group/item nesting, IMO.  I also think it makes
sense for it to live where it is.

Thanks Michael,
-Greg
Avi Kivity June 23, 2009, 9:52 a.m. UTC | #6
On 06/22/2009 07:08 PM, Michael S. Tsirkin wrote:
> On Mon, Jun 22, 2009 at 11:45:00AM -0400, Gregory Haskins wrote:
>    
>> Michael S. Tsirkin wrote:
>>      
>>> It seems that a lot of complexity and trickiness with iosignalfd is
>>> handling the group/item relationship, which comes about because kvm does
>>> not currently let a device on the bus claim a write transaction based on the
>>> value written.  This could be greatly simplified if the value written
>>> was passed to the in_range check for write operation.  We could then
>>> simply make each kvm_iosignalfd a device on the bus.
>>>
>>> What does everyone think of the following lightly tested patch?
>>>
>>>        
>> Hi Michael,
>>    Its interesting, but I am not convinced its necessary.  We created the
>> group/item layout because iosignalfds are unique in that they are
>> probably the only IO device that wants to do some kind of address
>> aliasing.
>>      
>
> We actually already have aliasing: is_write flag is used for this
> purpose. Actually, it's possible to remove is_write by passing
> a null pointer in write_val for reads. I like this a bit less as
> the code generated is less compact ... Avi, what do you think?
>    

Greg, won't Michael's patch eliminate a big chunk from your iosignalfd 
patches?  Seems like a win to me.

> One is enough :)
> Seriously, do you see that this saves you all of RCU, linked lists and
> counters? You don't need to keep track of iofds, you don't need to
> implement your own lookup logic - you just use the kvm device
> and that's it.
>
>    

Yup.
Avi Kivity June 23, 2009, 9:54 a.m. UTC | #7
On 06/22/2009 07:29 PM, Gregory Haskins wrote:
>> We actually already have aliasing: is_write flag is used for this
>> purpose.
>>      
>
> Yes, but read/write address aliasing is not the same thing is
> multi-match data aliasing.  Besides, your proposal also breaks some of
> the natural relationship models (e.g. all the aliased iosignal_items
> always belong to the same underlying device.  io_bus entries have an
> arbitrary topology).
>    

It's all one big hack; we want to get the correct function called with
as little fuss as possible.
Gregory Haskins June 23, 2009, 11:41 a.m. UTC | #8
Avi Kivity wrote:
> On 06/22/2009 07:08 PM, Michael S. Tsirkin wrote:
>> On Mon, Jun 22, 2009 at 11:45:00AM -0400, Gregory Haskins wrote:
>>   
>>> Michael S. Tsirkin wrote:
>>>     
>>>> It seems that a lot of complexity and trickiness with iosignalfd is
>>>> handling the group/item relationship, which comes about because kvm
>>>> does
>>>> not currently let a device on the bus claim a write transaction
>>>> based on the
>>>> value written.  This could be greatly simplified if the value written
>>>> was passed to the in_range check for write operation.  We could then
>>>> simply make each kvm_iosignalfd a device on the bus.
>>>>
>>>> What does everyone think of the following lightly tested patch?
>>>>
>>>>        
>>> Hi Michael,
>>>    Its interesting, but I am not convinced its necessary.  We
>>> created the
>>> group/item layout because iosignalfds are unique in that they are
>>> probably the only IO device that wants to do some kind of address
>>> aliasing.
>>>      
>>
>> We actually already have aliasing: is_write flag is used for this
>> purpose. Actually, it's possible to remove is_write by passing
>> a null pointer in write_val for reads. I like this a bit less as
>> the code generated is less compact ... Avi, what do you think?
>>    
>
> Greg, won't Michael's patch eliminate a big chunk from your iosignalfd
> patches?  Seems like a win to me.

Well, it really just moves that hunk from eventfd.c to kvm_main.c, where
I don't think anyone else but iosignalfd will use it.  But if that is
what everyone wants, I guess I have no choice.

On Tue, Jun 23, 2009 at 12:04:06AM -0400, Gregory Haskins wrote:
> >>  It will also need to support
> >> multiple matches.
> >>     
> >
> > What, signal many fds on the same address/value pair?
> > I see this as a bug. Why is this a good thing to support?
> > Just increases the chance of leaking this fd.
> >   
> 
> I believe Avi asked for this feature specifically, so I will defer to him.

Hmm. That's hard to implement in my model. Avi, can we give up
this feature? I don't think anyone needs this specifically ...
Michael S. Tsirkin June 23, 2009, 11:46 a.m. UTC | #10
On Tue, Jun 23, 2009 at 07:41:12AM -0400, Gregory Haskins wrote:
> Avi Kivity wrote:
> > On 06/22/2009 07:08 PM, Michael S. Tsirkin wrote:
> >> On Mon, Jun 22, 2009 at 11:45:00AM -0400, Gregory Haskins wrote:
> >>   
> >>> Michael S. Tsirkin wrote:
> >>>     
> >>>> It seems that a lot of complexity and trickiness with iosignalfd is
> >>>> handling the group/item relationship, which comes about because kvm
> >>>> does
> >>>> not currently let a device on the bus claim a write transaction
> >>>> based on the
> >>>> value written.  This could be greatly simplified if the value written
> >>>> was passed to the in_range check for write operation.  We could then
> >>>> simply make each kvm_iosignalfd a device on the bus.
> >>>>
> >>>> What does everyone think of the following lightly tested patch?
> >>>>
> >>>>        
> >>> Hi Michael,
> >>>    Its interesting, but I am not convinced its necessary.  We
> >>> created the
> >>> group/item layout because iosignalfds are unique in that they are
> >>> probably the only IO device that wants to do some kind of address
> >>> aliasing.
> >>>      
> >>
> >> We actually already have aliasing: is_write flag is used for this
> >> purpose. Actually, it's possible to remove is_write by passing
> >> a null pointer in write_val for reads. I like this a bit less as
> >> the code generated is less compact ... Avi, what do you think?
> >>    
> >
> > Greg, won't Michael's patch eliminate a big chunk from your iosignalfd
> > patches?  Seems like a win to me.
> 
> Well, it really just moves that hunk from eventfd.c to kvm_main.c, where
> I don't think anyone else will use it by iosignalfd.  But if that is
> what everyone wants, I guess I have no choice.

Wait a bit before you start rebasing, though, please.
I just had a brainwave and am rewriting this patch.

Gregory Haskins June 23, 2009, 11:52 a.m. UTC | #11
Michael S. Tsirkin wrote:
> On Tue, Jun 23, 2009 at 12:04:06AM -0400, Gregory Haskins wrote:
>   
>>>>  It will also need to support
>>>> multiple matches.
>>>>     
>>>>         
>>> What, signal many fds on the same address/value pair?
>>> I see this as a bug. Why is this a good thing to support?
>>> Just increases the chance of leaking this fd.
>>>   
>>>       
>> I believe Avi asked for this feature specifically, so I will defer to him.
>>     
>
> Hmm. That's hard to implement in my model. Avi, can we give up
> this feature? I don't think anyone needs this specifically ...
>
>   

If we can drop this, it will simplify the mods I will need to do to
io_bus to support Michael's new interface.

Out of curiosity, what is it you are trying to build, Michael?

-Greg
Avi Kivity June 23, 2009, 11:53 a.m. UTC | #12
On 06/23/2009 07:04 AM, Gregory Haskins wrote:
> Well, for one its not very clear what the benefit of the read/write
> aliasing even is. ;)  Apparently coalesced_mmio uses it, but even so I
> doubt that is for the purposes of having one device do reads while
> another does writes.  I could be wrong, though.
>    

Coalesced mmio cannot handle reads.
Avi Kivity June 23, 2009, 11:56 a.m. UTC | #13
On 06/23/2009 02:44 PM, Michael S. Tsirkin wrote:
> On Tue, Jun 23, 2009 at 12:04:06AM -0400, Gregory Haskins wrote:
>    
>>>>   It will also need to support
>>>> multiple matches.
>>>>
>>>>          
>>> What, signal many fds on the same address/value pair?
>>> I see this as a bug. Why is this a good thing to support?
>>> Just increases the chance of leaking this fd.
>>>
>>>        
>> I believe Avi asked for this feature specifically, so I will defer to him.
>>      
>
> Hmm. That's hard to implement in my model. Avi, can we give up
> this feature? I don't think anyone needs this specifically ...
>    

I think we can make do with passing that single eventfd to multiple 
consumers.  It means their event count reads may return zero, but I 
guess we can live with that.
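
(A small userspace sketch of the semantics being accepted here: two consumers share one eventfd, whoever reads first drains the counter, and with EFD_NONBLOCK the loser sees EAGAIN instead of an event count:)

#include <sys/eventfd.h>
#include <errno.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = eventfd(0, EFD_NONBLOCK);
	eventfd_t n;

	eventfd_write(fd, 1);	/* the kvm side signals once */

	if (read(fd, &n, sizeof(n)) == sizeof(n))	/* consumer A wins */
		printf("A: count %llu\n", (unsigned long long)n);
	if (read(fd, &n, sizeof(n)) < 0 && errno == EAGAIN)
		printf("B: counter already drained\n");	/* consumer B */

	close(fd);
	return 0;
}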

I do want to retain flexibility in how we route events.
Gregory Haskins June 23, 2009, 12:01 p.m. UTC | #14
Avi Kivity wrote:
> On 06/23/2009 02:44 PM, Michael S. Tsirkin wrote:
>> On Tue, Jun 23, 2009 at 12:04:06AM -0400, Gregory Haskins wrote:
>>   
>>>>>   It will also need to support
>>>>> multiple matches.
>>>>>
>>>>>          
>>>> What, signal many fds on the same address/value pair?
>>>> I see this as a bug. Why is this a good thing to support?
>>>> Just increases the chance of leaking this fd.
>>>>
>>>>        
>>> I believe Avi asked for this feature specifically, so I will defer
>>> to him.
>>>      
>>
>> Hmm. That's hard to implement in my model. Avi, can we give up
>> this feature? I don't think anyone needs this specifically ...
>>    
>
> I think we can make do with passing that single eventfd to multiple
> consumers.  It means their event count reads may return zero, but I
> guess we can live with that.
>
> I do want to retain flexibility in how we route events.
>

Ok, so for now I will just crank up the io_bus array, and we can address
scale another day.  Can I just drop patch 2/3 and let the io_bus govern
the limit?

-Greg
Avi Kivity June 23, 2009, 12:19 p.m. UTC | #15
On 06/23/2009 03:01 PM, Gregory Haskins wrote:
> Ok, so for now I will just crank up the io_bus array, and we can address
> scale another day.  Can I just drop patch 2/3 and let the io_bus govern
> the limit?
>    

So long as we have a runtime-discoverable limit, yes.
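
(Userspace could probe such a limit with KVM_CHECK_EXTENSION. A sketch; the capability name and number below are invented for illustration, only the ioctl itself is real, and error handling is omitted:)

#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/kvm.h>

#define KVM_CAP_IO_BUS_DEVS 999		/* hypothetical capability */

static int io_bus_dev_limit(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int n = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_IO_BUS_DEVS);

	close(kvm);
	return n > 0 ? n : 6;	/* fall back to the historic limit of 6 */
}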

Patch

diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h
index 55e8846..98a25af 100644
--- a/virt/kvm/iodev.h
+++ b/virt/kvm/iodev.h
@@ -28,7 +28,7 @@  struct kvm_io_device {
 		      int len,
 		      const void *val);
 	int (*in_range)(struct kvm_io_device *this, gpa_t addr, int len,
-			int is_write);
+			int is_write, void *write_val);
 	void (*destructor)(struct kvm_io_device *this);
 
 	void             *private;
@@ -51,9 +51,10 @@  static inline void kvm_iodevice_write(struct kvm_io_device *dev,
 }
 
 static inline int kvm_iodevice_inrange(struct kvm_io_device *dev,
-				       gpa_t addr, int len, int is_write)
+				       gpa_t addr, int len, int is_write,
+				       void *write_val)
 {
-	return dev->in_range(dev, addr, len, is_write);
+	return dev->in_range(dev, addr, len, is_write, write_val);
 }
 
 static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 80c57b0..8cfdf9d 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -211,11 +211,13 @@  int kvm_dev_ioctl_check_extension(long ext)
 }
 
 static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
-					gpa_t addr, int len, int is_write)
+					gpa_t addr, int len, int is_write,
+					void *write_val)
 {
 	struct kvm_io_device *dev;
 
-	dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len, is_write);
+	dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len, is_write,
+	                          write_val);
 
 	return dev;
 }
@@ -247,7 +247,8 @@  static int handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	kvm_run->exit_reason = KVM_EXIT_MMIO;
 	return 0;
 mmio:
-	mmio_dev = vcpu_find_mmio_dev(vcpu, p->addr, p->size, !p->dir);
+	mmio_dev = vcpu_find_mmio_dev(vcpu, p->addr, p->size,
+				      !p->dir, &p->data);
 	if (mmio_dev) {
 		if (!p->dir)
 			kvm_iodevice_write(mmio_dev, p->addr, p->size,
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 4d6f0d2..5ba21ff 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -485,7 +485,7 @@  static void pit_ioport_read(struct kvm_io_device *this,
 }
 
 static int pit_in_range(struct kvm_io_device *this, gpa_t addr,
-			int len, int is_write)
+			int len, int is_write, void *write_val)
 {
 	return ((addr >= KVM_PIT_BASE_ADDRESS) &&
 		(addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH));
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index ae99d83..456fd53 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -721,7 +721,7 @@  static void apic_mmio_write(struct kvm_io_device *this,
 }
 
 static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr,
-			   int len, int size)
+			   int len, int is_write, void *write_val)
 {
 	struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
 	int ret = 0;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 249540f..9d29017 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2043,13 +2043,13 @@  static void kvm_init_msr_list(void)
  */
 static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
 						gpa_t addr, int len,
-						int is_write)
+						int is_write, void *write_val)
 {
 	struct kvm_io_device *dev;
 
 	if (vcpu->arch.apic) {
 		dev = &vcpu->arch.apic->dev;
-		if (dev->in_range(dev, addr, len, is_write))
+		if (dev->in_range(dev, addr, len, is_write, write_val))
 			return dev;
 	}
 	return NULL;
@@ -2058,14 +2058,14 @@  static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
 
 static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
 						gpa_t addr, int len,
-						int is_write)
+						int is_write, void *write_val)
 {
 	struct kvm_io_device *dev;
 
-	dev = vcpu_find_pervcpu_dev(vcpu, addr, len, is_write);
+	dev = vcpu_find_pervcpu_dev(vcpu, addr, len, is_write, write_val);
 	if (dev == NULL)
 		dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len,
-					  is_write);
+					  is_write, write_val);
 	return dev;
 }
 
@@ -2161,7 +2161,7 @@  mmio:
 	 * Is this MMIO handled locally?
 	 */
 	mutex_lock(&vcpu->kvm->lock);
-	mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0);
+	mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0, NULL);
 	if (mmio_dev) {
 		kvm_iodevice_read(mmio_dev, gpa, bytes, val);
 		mutex_unlock(&vcpu->kvm->lock);
@@ -2216,7 +2216,7 @@  mmio:
 	 * Is this MMIO handled locally?
 	 */
 	mutex_lock(&vcpu->kvm->lock);
-	mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1);
+	mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1, val);
 	if (mmio_dev) {
 		kvm_iodevice_write(mmio_dev, gpa, bytes, val);
 		mutex_unlock(&vcpu->kvm->lock);
@@ -2576,9 +2576,10 @@  static void pio_string_write(struct kvm_io_device *pio_dev,
 
 static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
 					       gpa_t addr, int len,
-					       int is_write)
+					       int is_write, void *write_val)
 {
-	return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr, len, is_write);
+	return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr, len, is_write,
+	                           write_val);
 }
 
 int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
@@ -2608,7 +2608,8 @@  int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
 	val = kvm_register_read(vcpu, VCPU_REGS_RAX);
 	memcpy(vcpu->arch.pio_data, &val, 4);
 
-	pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in);
+	pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in,
+				    vcpu->arch.pio_data);
 	if (pio_dev) {
 		kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
 		complete_pio(vcpu);
@@ -2624,7 +2625,6 @@  int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
 {
 	unsigned now, in_page;
 	int ret = 0;
-	struct kvm_io_device *pio_dev;
 
 	vcpu->run->exit_reason = KVM_EXIT_IO;
 	vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
@@ -2672,9 +2672,6 @@  int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
 
 	vcpu->arch.pio.guest_gva = address;
 
-	pio_dev = vcpu_find_pio_dev(vcpu, port,
-				    vcpu->arch.pio.cur_count,
-				    !vcpu->arch.pio.in);
 	if (!vcpu->arch.pio.in) {
 		/* string PIO write */
 		ret = pio_copy_data(vcpu);
@@ -2682,13 +2679,20 @@  int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
 			kvm_inject_gp(vcpu, 0);
 			return 1;
 		}
-		if (ret == 0 && pio_dev) {
-			pio_string_write(pio_dev, vcpu);
-			complete_pio(vcpu);
-			if (vcpu->arch.pio.count == 0)
-				ret = 1;
+		if (ret == 0) {
+			struct kvm_io_device *pio_dev;
+			pio_dev = vcpu_find_pio_dev(vcpu, port,
+						    vcpu->arch.pio.cur_count,
+						    1, vcpu->arch.pio_data);
+			if (pio_dev) {
+				pio_string_write(pio_dev, vcpu);
+				complete_pio(vcpu);
+				if (vcpu->arch.pio.count == 0)
+					ret = 1;
+			}
 		}
-	} else if (pio_dev)
+	} else if (vcpu_find_pio_dev(vcpu, port, vcpu->arch.pio.cur_count, 0,
+				     NULL))
 		pr_unimpl(vcpu, "no string pio read support yet, "
 		       "port %x size %d count %ld\n",
 			port, size, count);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index aacc544..0fb7938 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -60,7 +60,8 @@  struct kvm_io_bus {
 void kvm_io_bus_init(struct kvm_io_bus *bus);
 void kvm_io_bus_destroy(struct kvm_io_bus *bus);
 struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus,
-					  gpa_t addr, int len, int is_write);
+					  gpa_t addr, int len, int is_write,
+					  void *write_val);
 void kvm_io_bus_register_dev(struct kvm_io_bus *bus,
 			     struct kvm_io_device *dev);
 
diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
index 5ae620d..3a1cbfd 100644
--- a/virt/kvm/coalesced_mmio.c
+++ b/virt/kvm/coalesced_mmio.c
@@ -15,7 +15,8 @@ 
 #include "coalesced_mmio.h"
 
 static int coalesced_mmio_in_range(struct kvm_io_device *this,
-				   gpa_t addr, int len, int is_write)
+				   gpa_t addr, int len, int is_write,
+				   void *write_val)
 {
 	struct kvm_coalesced_mmio_dev *dev =
 				(struct kvm_coalesced_mmio_dev*)this->private;
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 1eddae9..2adbb1b 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -221,7 +221,8 @@  void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode)
 }
 
 static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr,
-			   int len, int is_write)
+			   int len, int is_write,
+			   void *write_val)
 {
 	struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7645543..b97e390 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2445,14 +2445,15 @@  void kvm_io_bus_destroy(struct kvm_io_bus *bus)
 }
 
 struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus,
-					  gpa_t addr, int len, int is_write)
+					  gpa_t addr, int len, int is_write,
+					  void *write_val)
 {
 	int i;
 
 	for (i = 0; i < bus->dev_count; i++) {
 		struct kvm_io_device *pos = bus->devs[i];
 
-		if (pos->in_range(pos, addr, len, is_write))
+		if (pos->in_range(pos, addr, len, is_write, write_val))
 			return pos;
 	}
 
 	return NULL;
 }