diff mbox

[5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET

Message ID 1309927078-5983-5-git-send-email-levinsasha928@gmail.com (mailing list archive)
State New, archived
Headers show

Commit Message

Sasha Levin July 6, 2011, 4:37 a.m. UTC
The new flag allows passing a connected socket instead of an
eventfd to be notified of writes or reads to the specified memory region.

Instead of signaling an event, on write - the value written to the memory
region is written to the socket.
On read - a notification of the read is sent to the host, and a response
is expected with the value to be 'read'.

Using a socket instead of an eventfd is useful when any value can be
written to the memory region but we're interested in receiving the
actual value instead of just a notification.

A simple example for practical use is the serial port. We are not
interested in an exit every time a char is written to the port, but
we do need to know what was written so we could handle it on the guest.

Cc: Avi Kivity <avi@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Pekka Enberg <penberg@kernel.org>
Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
---
 Documentation/virtual/kvm/api.txt |   18 ++++-
 include/linux/kvm.h               |    9 ++
 virt/kvm/eventfd.c                |  153 ++++++++++++++++++++++++++++++++-----
 3 files changed, 161 insertions(+), 19 deletions(-)

Comments

Michael S. Tsirkin July 6, 2011, 11:42 a.m. UTC | #1
On Wed, Jul 06, 2011 at 07:37:58AM +0300, Sasha Levin wrote:
> The new flag allows passing a connected socket instead of an
> eventfd to be notified of writes or reads to the specified memory region.
> 
> Instead of signaling an event, On write - the value written to the memory
> region is written to the pipe.
> On read - a notification of the read is sent to the host, and a response
> is expected with the value to be 'read'.
> 
> Using a socket instead of an eventfd is usefull when any value can be
> written to the memory region but we're interested in recieving the
> actual value instead of just a notification.
> 
> A simple example for practical use is the serial port. we are not
> interested in an exit every time a char is written to the port, but
> we do need to know what was written so we could handle it on the guest.
> 
> Cc: Avi Kivity <avi@redhat.com>
> Cc: Ingo Molnar <mingo@elte.hu>
> Cc: Marcelo Tosatti <mtosatti@redhat.com>
> Cc: Michael S. Tsirkin <mst@redhat.com>
> Cc: Pekka Enberg <penberg@kernel.org>
> Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
> ---
>  Documentation/virtual/kvm/api.txt |   18 ++++-
>  include/linux/kvm.h               |    9 ++
>  virt/kvm/eventfd.c                |  153 ++++++++++++++++++++++++++++++++-----
>  3 files changed, 161 insertions(+), 19 deletions(-)
> 
> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index 317d86a..74f0946 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -1330,7 +1330,7 @@ Returns: 0 on success, !0 on error
>  
>  This ioctl attaches or detaches an ioeventfd to a legal pio/mmio address
>  within the guest.  A guest write in the registered address will signal the
> -provided event instead of triggering an exit.
> +provided event or write to the provided socket instead of triggering an exit.
>  
>  struct kvm_ioeventfd {
>  	__u64 datamatch;
> @@ -1341,6 +1341,13 @@ struct kvm_ioeventfd {
>  	__u8  pad[36];
>  };
>  
> +struct kvm_ioeventfd_data {
> +	__u64 data;
> +	__u64 addr;
> +	__u32 len;
> +	__u8  is_write;
> +};
> +
>  The following flags are defined:
>  
>  #define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch)
> @@ -1348,6 +1355,7 @@ The following flags are defined:
>  #define KVM_IOEVENTFD_FLAG_DEASSIGN  (1 << kvm_ioeventfd_flag_nr_deassign)
>  #define KVM_IOEVENTFD_FLAG_READ      (1 << kvm_ioeventfd_flag_nr_read)
>  #define KVM_IOEVENTFD_FLAG_NOWRITE   (1 << kvm_ioeventfd_flag_nr_nowrite)
> +#define KVM_IOEVENTFD_FLAG_SOCKET    (1 << kvm_ioeventfd_flag_nr_socket)
>  
>  If datamatch flag is set, the event will be signaled only if the written value
>  to the registered address is equal to datamatch in struct kvm_ioeventfd.
> @@ -1359,6 +1367,14 @@ passed in datamatch.
>  If the nowrite flag is set, the event won't be signaled when the specified address
>  is being written to.
>  
> +If the socket flag is set, fd is expected to be a connected AF_UNIX
> +SOCK_SEQPACKET socket.

Let's verify that then?

> +
> +	if (p->sock)
> +		socket_write(p->sock, &data, sizeof(data));
> +	else
> +		eventfd_signal(p->eventfd, 1);
> +
>  	return 0;
>  }

This still loses the data if socket would block and there's a signal.
I think we agreed to use non blocking operations and exit to
userspace in that case?


>  
> @@ -534,6 +607,7 @@ ioeventfd_read(struct kvm_io_device *this, gpa_t addr, int len,
>  		void *val)
>  {
>  	struct _ioeventfd *p = to_ioeventfd(this);
> +	struct kvm_ioeventfd_data data;
>  
>  	/* Exit if signaling on reads isn't requested */
>  	if (!p->track_reads)
> @@ -542,7 +616,21 @@ ioeventfd_read(struct kvm_io_device *this, gpa_t addr, int len,
>  	if (!ioeventfd_in_range(p, addr, len, val))
>  		return -EOPNOTSUPP;
>  
> -	eventfd_signal(p->eventfd, 1);
> +	data = (struct kvm_ioeventfd_data) {
> +		.addr = addr,
> +		.len = len,
> +		.is_write = 0,
> +	};
> +
> +	if (p->sock) {
> +		socket_write(p->sock, &data, sizeof(data));
> +		socket_read(p->sock, &data, sizeof(data));
> +		set_val(val, len, data.data);

Same here.

> +	} else {
> +		set_val(val, len, p->datamatch);
> +		eventfd_signal(p->eventfd, 1);
> +	}
> +
>  	return 0;
>  }
>
Avi Kivity July 6, 2011, 12:39 p.m. UTC | #2
On 07/06/2011 07:37 AM, Sasha Levin wrote:
> The new flag allows passing a connected socket instead of an
> eventfd to be notified of writes or reads to the specified memory region.
>
> Instead of signaling an event, On write - the value written to the memory
> region is written to the pipe.
> On read - a notification of the read is sent to the host, and a response
> is expected with the value to be 'read'.
>
> Using a socket instead of an eventfd is usefull when any value can be
> written to the memory region but we're interested in recieving the
> actual value instead of just a notification.
>
> A simple example for practical use is the serial port. we are not
> interested in an exit every time a char is written to the port, but
> we do need to know what was written so we could handle it on the guest.
>
>
>
> @@ -534,6 +607,7 @@ ioeventfd_read(struct kvm_io_device *this, gpa_t addr, int len,
>   		void *val)
>   {
>   	struct _ioeventfd *p = to_ioeventfd(this);
> +	struct kvm_ioeventfd_data data;
>
>   	/* Exit if signaling on reads isn't requested */
>   	if (!p->track_reads)
> @@ -542,7 +616,21 @@ ioeventfd_read(struct kvm_io_device *this, gpa_t addr, int len,
>   	if (!ioeventfd_in_range(p, addr, len, val))
>   		return -EOPNOTSUPP;
>
> -	eventfd_signal(p->eventfd, 1);
> +	data = (struct kvm_ioeventfd_data) {
> +		.addr = addr,
> +		.len = len,
> +		.is_write = 0,
> +	};
> +
> +	if (p->sock) {
> +		socket_write(p->sock,&data, sizeof(data));
> +		socket_read(p->sock,&data, sizeof(data));
> +		set_val(val, len, data.data);
> +	} else {
> +		set_val(val, len, p->datamatch);
> +		eventfd_signal(p->eventfd, 1);
> +	}
> +
>   	return 0;
>   }

If there are two reads on the same ioeventfd range, then the responses 
can mix up.  Need to make sure we get the right response.

Note that a mutex on the ioeventfd structure is insufficient, since the 
same socket may be used for multiple ranges.

One way out is to require that sockets not be shared among vcpus (there 
can be only one outstanding read per vcpu).  It seems heavy handed though.
Sasha Levin July 6, 2011, 12:58 p.m. UTC | #3
On Wed, 2011-07-06 at 15:39 +0300, Avi Kivity wrote:
> On 07/06/2011 07:37 AM, Sasha Levin wrote:
> > The new flag allows passing a connected socket instead of an
> > eventfd to be notified of writes or reads to the specified memory region.
> >
> > Instead of signaling an event, On write - the value written to the memory
> > region is written to the pipe.
> > On read - a notification of the read is sent to the host, and a response
> > is expected with the value to be 'read'.
> >
> > Using a socket instead of an eventfd is usefull when any value can be
> > written to the memory region but we're interested in recieving the
> > actual value instead of just a notification.
> >
> > A simple example for practical use is the serial port. we are not
> > interested in an exit every time a char is written to the port, but
> > we do need to know what was written so we could handle it on the guest.
> >
> >
> >
> > @@ -534,6 +607,7 @@ ioeventfd_read(struct kvm_io_device *this, gpa_t addr, int len,
> >   		void *val)
> >   {
> >   	struct _ioeventfd *p = to_ioeventfd(this);
> > +	struct kvm_ioeventfd_data data;
> >
> >   	/* Exit if signaling on reads isn't requested */
> >   	if (!p->track_reads)
> > @@ -542,7 +616,21 @@ ioeventfd_read(struct kvm_io_device *this, gpa_t addr, int len,
> >   	if (!ioeventfd_in_range(p, addr, len, val))
> >   		return -EOPNOTSUPP;
> >
> > -	eventfd_signal(p->eventfd, 1);
> > +	data = (struct kvm_ioeventfd_data) {
> > +		.addr = addr,
> > +		.len = len,
> > +		.is_write = 0,
> > +	};
> > +
> > +	if (p->sock) {
> > +		socket_write(p->sock,&data, sizeof(data));
> > +		socket_read(p->sock,&data, sizeof(data));
> > +		set_val(val, len, data.data);
> > +	} else {
> > +		set_val(val, len, p->datamatch);
> > +		eventfd_signal(p->eventfd, 1);
> > +	}
> > +
> >   	return 0;
> >   }
> 
> If there are two reads on the same ioeventfd range, then the responses 
> can mix up.  Need to make sure we get the right response.
> 
> Note that a mutex on the ioeventfd structure is insufficient, since the 
> same socket may be used for multiple ranges.
> 
> One way out is to require that sockets not be shared among vcpus (there 
> can be only one outstanding read per vcpu).  It seems heavy handed though.
> 

What about something as follows:

This requires an addition of a mutex to struct ioeventfd.

1. When adding a new ioeventfd, scan existing ioeventfds (we already do
it anyway) and check whether another ioeventfd is using the socket
already.

2. If the existing ioeventfd doesn't have a mutex assigned, create a new
mutex and assign it to both ioeventfds.

3. If the existing ioeventfd already has a mutex assigned, copy it to
the new ioeventfd.

4. When removing an ioeventfd, do everything the other way around :)

This mutex can be used to lock the write/read pair.
Avi Kivity July 6, 2011, 1 p.m. UTC | #4
On 07/06/2011 07:37 AM, Sasha Levin wrote:
> The new flag allows passing a connected socket instead of an
> eventfd to be notified of writes or reads to the specified memory region.
>
> Instead of signaling an event, On write - the value written to the memory
> region is written to the pipe.
> On read - a notification of the read is sent to the host, and a response
> is expected with the value to be 'read'.
>
> Using a socket instead of an eventfd is usefull when any value can be
> written to the memory region but we're interested in recieving the
> actual value instead of just a notification.
>
> A simple example for practical use is the serial port. we are not
> interested in an exit every time a char is written to the port, but
> we do need to know what was written so we could handle it on the guest.
>
> @@ -1341,6 +1341,13 @@ struct kvm_ioeventfd {
>   	__u8  pad[36];
>   };
>
> +struct kvm_ioeventfd_data {
> +	__u64 data;
> +	__u64 addr;
> +	__u32 len;
> +	__u8  is_write;
> +};

Please pad, let's say to 32 bytes so it's a nice power of two.
Avi Kivity July 6, 2011, 1:04 p.m. UTC | #5
On 07/06/2011 03:58 PM, Sasha Levin wrote:
> What about something as follows:
>
> This requires an addition of a mutex to struct ioeventfd.
>
> 1. When adding a new ioeventfd, scan exiting ioeventfds (we already do
> it anyway) and check whether another ioeventfd is using the socket
> already.
>
> 2. If the existing ioeventfd doesn't have a mutex assigned, create a new
> mutex and assign it to both ioeventfds.

That fails if there is a read already in progress on the old ioeventfd.

Just create or share a mutex every time.  Taking a mutex is cheap enough.

> 3. If the existing ioeventfd already has a mutex assigned, copy it to
> the new ioeventfd.
>
> 4. When removing an ioeventfd, do everything the other way around :)
>
> This mutex can be used to lock the write/read pair.
>

Not very elegant, but reasonable and simple.
Sasha Levin July 6, 2011, 3:01 p.m. UTC | #6
On Wed, 2011-07-06 at 14:42 +0300, Michael S. Tsirkin wrote:
> On Wed, Jul 06, 2011 at 07:37:58AM +0300, Sasha Levin wrote:
> > The new flag allows passing a connected socket instead of an
> > eventfd to be notified of writes or reads to the specified memory region.
> > 
> > Instead of signaling an event, On write - the value written to the memory
> > region is written to the pipe.
> > On read - a notification of the read is sent to the host, and a response
> > is expected with the value to be 'read'.
> > 
> > Using a socket instead of an eventfd is usefull when any value can be
> > written to the memory region but we're interested in recieving the
> > actual value instead of just a notification.
> > 
> > A simple example for practical use is the serial port. we are not
> > interested in an exit every time a char is written to the port, but
> > we do need to know what was written so we could handle it on the guest.
> > 
> > Cc: Avi Kivity <avi@redhat.com>
> > Cc: Ingo Molnar <mingo@elte.hu>
> > Cc: Marcelo Tosatti <mtosatti@redhat.com>
> > Cc: Michael S. Tsirkin <mst@redhat.com>
> > Cc: Pekka Enberg <penberg@kernel.org>
> > Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
> > ---
> >  Documentation/virtual/kvm/api.txt |   18 ++++-
> >  include/linux/kvm.h               |    9 ++
> >  virt/kvm/eventfd.c                |  153 ++++++++++++++++++++++++++++++++-----
> >  3 files changed, 161 insertions(+), 19 deletions(-)
> > 
> > diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> > index 317d86a..74f0946 100644
> > --- a/Documentation/virtual/kvm/api.txt
> > +++ b/Documentation/virtual/kvm/api.txt
> > @@ -1330,7 +1330,7 @@ Returns: 0 on success, !0 on error
> >  
> >  This ioctl attaches or detaches an ioeventfd to a legal pio/mmio address
> >  within the guest.  A guest write in the registered address will signal the
> > -provided event instead of triggering an exit.
> > +provided event or write to the provided socket instead of triggering an exit.
> >  
> >  struct kvm_ioeventfd {
> >  	__u64 datamatch;
> > @@ -1341,6 +1341,13 @@ struct kvm_ioeventfd {
> >  	__u8  pad[36];
> >  };
> >  
> > +struct kvm_ioeventfd_data {
> > +	__u64 data;
> > +	__u64 addr;
> > +	__u32 len;
> > +	__u8  is_write;
> > +};
> > +
> >  The following flags are defined:
> >  
> >  #define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch)
> > @@ -1348,6 +1355,7 @@ The following flags are defined:
> >  #define KVM_IOEVENTFD_FLAG_DEASSIGN  (1 << kvm_ioeventfd_flag_nr_deassign)
> >  #define KVM_IOEVENTFD_FLAG_READ      (1 << kvm_ioeventfd_flag_nr_read)
> >  #define KVM_IOEVENTFD_FLAG_NOWRITE   (1 << kvm_ioeventfd_flag_nr_nowrite)
> > +#define KVM_IOEVENTFD_FLAG_SOCKET    (1 << kvm_ioeventfd_flag_nr_socket)
> >  
> >  If datamatch flag is set, the event will be signaled only if the written value
> >  to the registered address is equal to datamatch in struct kvm_ioeventfd.
> > @@ -1359,6 +1367,14 @@ passed in datamatch.
> >  If the nowrite flag is set, the event won't be signaled when the specified address
> >  is being written to.
> >  
> > +If the socket flag is set, fd is expected to be a connected AF_UNIX
> > +SOCK_SEQPACKET socket.
> 
> Let's verify that then?
> 
> > +
> > +	if (p->sock)
> > +		socket_write(p->sock, &data, sizeof(data));
> > +	else
> > +		eventfd_signal(p->eventfd, 1);
> > +
> >  	return 0;
> >  }
> 
> This still loses the data if socket would block and there's a signal.
> I think we agreed to use non blocking operations and exit to
> userspace in that case?
> 
> 
> >  
> > @@ -534,6 +607,7 @@ ioeventfd_read(struct kvm_io_device *this, gpa_t addr, int len,
> >  		void *val)
> >  {
> >  	struct _ioeventfd *p = to_ioeventfd(this);
> > +	struct kvm_ioeventfd_data data;
> >  
> >  	/* Exit if signaling on reads isn't requested */
> >  	if (!p->track_reads)
> > @@ -542,7 +616,21 @@ ioeventfd_read(struct kvm_io_device *this, gpa_t addr, int len,
> >  	if (!ioeventfd_in_range(p, addr, len, val))
> >  		return -EOPNOTSUPP;
> >  
> > -	eventfd_signal(p->eventfd, 1);
> > +	data = (struct kvm_ioeventfd_data) {
> > +		.addr = addr,
> > +		.len = len,
> > +		.is_write = 0,
> > +	};
> > +
> > +	if (p->sock) {
> > +		socket_write(p->sock, &data, sizeof(data));
> > +		socket_read(p->sock, &data, sizeof(data));
> > +		set_val(val, len, data.data);
> 
> Same here.

The socket_read() here I should leave blocking, and spin on it until I
read something - right?

> > +	} else {
> > +		set_val(val, len, p->datamatch);
> > +		eventfd_signal(p->eventfd, 1);
> > +	}
> > +
> >  	return 0;
> >  }
> >
> 
>
Michael S. Tsirkin July 6, 2011, 5:58 p.m. UTC | #7
On Wed, Jul 06, 2011 at 06:01:46PM +0300, Sasha Levin wrote:
> On Wed, 2011-07-06 at 14:42 +0300, Michael S. Tsirkin wrote:
> > On Wed, Jul 06, 2011 at 07:37:58AM +0300, Sasha Levin wrote:
> > > The new flag allows passing a connected socket instead of an
> > > eventfd to be notified of writes or reads to the specified memory region.
> > > 
> > > Instead of signaling an event, On write - the value written to the memory
> > > region is written to the pipe.
> > > On read - a notification of the read is sent to the host, and a response
> > > is expected with the value to be 'read'.
> > > 
> > > Using a socket instead of an eventfd is usefull when any value can be
> > > written to the memory region but we're interested in recieving the
> > > actual value instead of just a notification.
> > > 
> > > A simple example for practical use is the serial port. we are not
> > > interested in an exit every time a char is written to the port, but
> > > we do need to know what was written so we could handle it on the guest.
> > > 
> > > Cc: Avi Kivity <avi@redhat.com>
> > > Cc: Ingo Molnar <mingo@elte.hu>
> > > Cc: Marcelo Tosatti <mtosatti@redhat.com>
> > > Cc: Michael S. Tsirkin <mst@redhat.com>
> > > Cc: Pekka Enberg <penberg@kernel.org>
> > > Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
> > > ---
> > >  Documentation/virtual/kvm/api.txt |   18 ++++-
> > >  include/linux/kvm.h               |    9 ++
> > >  virt/kvm/eventfd.c                |  153 ++++++++++++++++++++++++++++++++-----
> > >  3 files changed, 161 insertions(+), 19 deletions(-)
> > > 
> > > diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> > > index 317d86a..74f0946 100644
> > > --- a/Documentation/virtual/kvm/api.txt
> > > +++ b/Documentation/virtual/kvm/api.txt
> > > @@ -1330,7 +1330,7 @@ Returns: 0 on success, !0 on error
> > >  
> > >  This ioctl attaches or detaches an ioeventfd to a legal pio/mmio address
> > >  within the guest.  A guest write in the registered address will signal the
> > > -provided event instead of triggering an exit.
> > > +provided event or write to the provided socket instead of triggering an exit.
> > >  
> > >  struct kvm_ioeventfd {
> > >  	__u64 datamatch;
> > > @@ -1341,6 +1341,13 @@ struct kvm_ioeventfd {
> > >  	__u8  pad[36];
> > >  };
> > >  
> > > +struct kvm_ioeventfd_data {
> > > +	__u64 data;
> > > +	__u64 addr;
> > > +	__u32 len;
> > > +	__u8  is_write;
> > > +};
> > > +
> > >  The following flags are defined:
> > >  
> > >  #define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch)
> > > @@ -1348,6 +1355,7 @@ The following flags are defined:
> > >  #define KVM_IOEVENTFD_FLAG_DEASSIGN  (1 << kvm_ioeventfd_flag_nr_deassign)
> > >  #define KVM_IOEVENTFD_FLAG_READ      (1 << kvm_ioeventfd_flag_nr_read)
> > >  #define KVM_IOEVENTFD_FLAG_NOWRITE   (1 << kvm_ioeventfd_flag_nr_nowrite)
> > > +#define KVM_IOEVENTFD_FLAG_SOCKET    (1 << kvm_ioeventfd_flag_nr_socket)
> > >  
> > >  If datamatch flag is set, the event will be signaled only if the written value
> > >  to the registered address is equal to datamatch in struct kvm_ioeventfd.
> > > @@ -1359,6 +1367,14 @@ passed in datamatch.
> > >  If the nowrite flag is set, the event won't be signaled when the specified address
> > >  is being written to.
> > >  
> > > +If the socket flag is set, fd is expected to be a connected AF_UNIX
> > > +SOCK_SEQPACKET socket.
> > 
> > Let's verify that then?
> > 
> > > +
> > > +	if (p->sock)
> > > +		socket_write(p->sock, &data, sizeof(data));
> > > +	else
> > > +		eventfd_signal(p->eventfd, 1);
> > > +
> > >  	return 0;
> > >  }
> > 
> > This still loses the data if socket would block and there's a signal.
> > I think we agreed to use non blocking operations and exit to
> > userspace in that case?
> > 
> > 
> > >  
> > > @@ -534,6 +607,7 @@ ioeventfd_read(struct kvm_io_device *this, gpa_t addr, int len,
> > >  		void *val)
> > >  {
> > >  	struct _ioeventfd *p = to_ioeventfd(this);
> > > +	struct kvm_ioeventfd_data data;
> > >  
> > >  	/* Exit if signaling on reads isn't requested */
> > >  	if (!p->track_reads)
> > > @@ -542,7 +616,21 @@ ioeventfd_read(struct kvm_io_device *this, gpa_t addr, int len,
> > >  	if (!ioeventfd_in_range(p, addr, len, val))
> > >  		return -EOPNOTSUPP;
> > >  
> > > -	eventfd_signal(p->eventfd, 1);
> > > +	data = (struct kvm_ioeventfd_data) {
> > > +		.addr = addr,
> > > +		.len = len,
> > > +		.is_write = 0,
> > > +	};
> > > +
> > > +	if (p->sock) {
> > > +		socket_write(p->sock, &data, sizeof(data));
> > > +		socket_read(p->sock, &data, sizeof(data));
> > > +		set_val(val, len, data.data);
> > 
> > Same here.
> 
> The socket_read() here I should leave blocking, and spin on it until I
> read something - right?

I think it's best to exit to userspace.

> > > +	} else {
> > > +		set_val(val, len, p->datamatch);
> > > +		eventfd_signal(p->eventfd, 1);
> > > +	}
> > > +
> > >  	return 0;
> > >  }
> > >
> > 
> > 
> 
> 
> -- 
> 
> Sasha.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sasha Levin July 10, 2011, 5:34 a.m. UTC | #8
On Wed, 2011-07-06 at 20:58 +0300, Michael S. Tsirkin wrote:
> On Wed, Jul 06, 2011 at 06:01:46PM +0300, Sasha Levin wrote:
> > On Wed, 2011-07-06 at 14:42 +0300, Michael S. Tsirkin wrote:
> > > On Wed, Jul 06, 2011 at 07:37:58AM +0300, Sasha Levin wrote:
> > > > +	if (p->sock) {
> > > > +		socket_write(p->sock, &data, sizeof(data));
> > > > +		socket_read(p->sock, &data, sizeof(data));
> > > > +		set_val(val, len, data.data);
> > > 
> > > Same here.
> > 
> > The socket_read() here I should leave blocking, and spin on it until I
> > read something - right?
> 
> I think it's best to exit to userspace.

Since sock_recvmsg for AF_UNIX SEQPACKET is interruptible, if we fail
the read here we'll take a regular MMIO exit and will allow the usermode
to deal with the MMIO in a regular way.

I've discussed the issue of usermode might having to handle the same
MMIO read request twice with Michael, and the solution proposed was to
add a new type of exit to handle this special case.

After working on that solution a bit I saw it's adding a lot of code and
complexity for this small issue, and I'm now thinking we may be better
off with just handling reads twice in case of a signal just between
socket_write() and socket_read() - once through the socket and once
through a regular MMIO exit.
Michael S. Tsirkin July 10, 2011, 8:05 a.m. UTC | #9
On Sun, Jul 10, 2011 at 08:34:43AM +0300, Sasha Levin wrote:
> On Wed, 2011-07-06 at 20:58 +0300, Michael S. Tsirkin wrote:
> > On Wed, Jul 06, 2011 at 06:01:46PM +0300, Sasha Levin wrote:
> > > On Wed, 2011-07-06 at 14:42 +0300, Michael S. Tsirkin wrote:
> > > > On Wed, Jul 06, 2011 at 07:37:58AM +0300, Sasha Levin wrote:
> > > > > +	if (p->sock) {
> > > > > +		socket_write(p->sock, &data, sizeof(data));
> > > > > +		socket_read(p->sock, &data, sizeof(data));
> > > > > +		set_val(val, len, data.data);
> > > > 
> > > > Same here.
> > > 
> > > The socket_read() here I should leave blocking, and spin on it until I
> > > read something - right?
> > 
> > I think it's best to exit to userspace.
> 
> Since sock_recvmsg for AF_UNIX SEQPACKET is interruptible, if we fail
> the read here we'll take a regular MMIO exit and will allow the usermode
> to deal with the MMIO in a regular way.
> 
> I've discussed the issue of usermode might having to handle the same
> MMIO read request twice with Michael, and the solution proposed was to
> add a new type of exit to handle this special case.
> 
> After working on that solution a bit I saw it's adding a lot of code and
> complexity for this small issue, and I'm now thinking we may be better
> off with just handling reads twice in case of a signal just between
> socket_write() and socket_read() - once through the socket and once
> through a regular MMIO exit.

The problem with this approach would be reads with a side-effect though:
for example, the IRQ status read in virtio clears the status register.

I don't insist on a new type of exit, just pointing out the problem.


> -- 
> 
> Sasha.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sasha Levin July 12, 2011, 11:23 a.m. UTC | #10
On Sun, 2011-07-10 at 11:05 +0300, Michael S. Tsirkin wrote:
> On Sun, Jul 10, 2011 at 08:34:43AM +0300, Sasha Levin wrote:
> > On Wed, 2011-07-06 at 20:58 +0300, Michael S. Tsirkin wrote:
> > > On Wed, Jul 06, 2011 at 06:01:46PM +0300, Sasha Levin wrote:
> > > > On Wed, 2011-07-06 at 14:42 +0300, Michael S. Tsirkin wrote:
> > > > > On Wed, Jul 06, 2011 at 07:37:58AM +0300, Sasha Levin wrote:
> > > > > > +	if (p->sock) {
> > > > > > +		socket_write(p->sock, &data, sizeof(data));
> > > > > > +		socket_read(p->sock, &data, sizeof(data));
> > > > > > +		set_val(val, len, data.data);
> > > > > 
> > > > > Same here.
> > > > 
> > > > The socket_read() here I should leave blocking, and spin on it until I
> > > > read something - right?
> > > 
> > > I think it's best to exit to userspace.
> > 
> > Since sock_recvmsg for AF_UNIX SEQPACKET is interruptible, if we fail
> > the read here we'll take a regular MMIO exit and will allow the usermode
> > to deal with the MMIO in a regular way.
> > 
> > I've discussed the issue of usermode might having to handle the same
> > MMIO read request twice with Michael, and the solution proposed was to
> > add a new type of exit to handle this special case.
> > 
> > After working on that solution a bit I saw it's adding a lot of code and
> > complexity for this small issue, and I'm now thinking we may be better
> > off with just handling reads twice in case of a signal just between
> > socket_write() and socket_read() - once through the socket and once
> > through a regular MMIO exit.
> 
> The problem with this approach would be reads with a side-effect though:
> for example, the IRQ status read in virtio clears the status register.
> 
> I don't insist on a new type of exit, just pointing out the problem.

I agree with you, I don't have a better solution either.

I don't feel it's worth it adding so much code for read support to
properly work. Can we do this patch series without socket read support
at the moment?
Avi Kivity July 12, 2011, 11:26 a.m. UTC | #11
On 07/12/2011 02:23 PM, Sasha Levin wrote:
> >
> >  I don't insist on a new type of exit, just pointing out the problem.
>
> I agree with you, I don't have a better solution either.
>
> I don't feel it's worth it adding so much code for read support to
> properly work. Can we do this patch series without socket read support
> at the moment?

No.  As I said before, I don't want a fragmented ABI.
Pekka Enberg July 13, 2011, 6:37 a.m. UTC | #12
Hi,

On 07/12/2011 02:23 PM, Sasha Levin wrote:
>> >  I don't insist on a new type of exit, just pointing out the problem.
>>
>> I agree with you, I don't have a better solution either.
>>
>> I don't feel it's worth it adding so much code for read support to
>> properly work. Can we do this patch series without socket read support
>> at the moment?

On Tue, Jul 12, 2011 at 2:26 PM, Avi Kivity <avi@redhat.com> wrote:
> No.  As I said before, I don't want a fragmented ABI.

OK, what's the simplest thing we can do here to keep Avi happy and get
the functionality of Sasha's original patch that helps us avoid guest
exits for serial console? I agree with Avi that we don't want
fragmented ABI but it seems to me there's no clear idea how to fix
KVM_IOEVENTFD_FLAG_SOCKET corner cases, right?

                                 Pekka
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Pekka Enberg July 13, 2011, 6:45 a.m. UTC | #13
On Wed, Jul 13, 2011 at 9:37 AM, Pekka Enberg <penberg@kernel.org> wrote:
> On 07/12/2011 02:23 PM, Sasha Levin wrote:
>>> >  I don't insist on a new type of exit, just pointing out the problem.
>>>
>>> I agree with you, I don't have a better solution either.
>>>
>>> I don't feel it's worth it adding so much code for read support to
>>> properly work. Can we do this patch series without socket read support
>>> at the moment?
>
> On Tue, Jul 12, 2011 at 2:26 PM, Avi Kivity <avi@redhat.com> wrote:
>> No.  As I said before, I don't want a fragmented ABI.
>
> OK, what's the simplest thing we can do here to keep Avi happy and get
> the functionality of Sasha's original patch that helps us avoid guest
> exits for serial console? I agree with Avi that we don't want
> fragmented ABI but it seems to me there's no clear idea how to fix
> KVM_IOEVENTFD_FLAG_SOCKET corner cases, right?

And btw, I didn't follow the discussion closely, but introducing a new
type of exit for a feature that's designed to _avoid exits_ doesn't
seem like a smart thing to do. Is it even possible to support sockets
sanely for this?

                                 Pekka
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity July 13, 2011, 7:07 a.m. UTC | #14
On 07/13/2011 09:45 AM, Pekka Enberg wrote:
> >
> >  OK, what's the simplest thing we can do here to keep Avi happy and get
> >  the functionality of Sasha's original patch that helps us avoid guest
> >  exits for serial console? I agree with Avi that we don't want
> >  fragmented ABI but it seems to me there's no clear idea how to fix
> >  KVM_IOEVENTFD_FLAG_SOCKET corner cases, right?
>
> And btw, I didn't follow the discussion closely, but introducing a new
> type of exit for a feature that's designed to _avoid exits_ doesn't
> seem like a smart thing to do. Is it even possible to support sockets
> sanely for this?

First of all, this feature doesn't avoid exits.  The guest exits to the 
kernel just the same.  It just avoids userspace exits, trading them off 
for remote wakeups or context switches, as the case may be, which are 
both more expensive.

This feature is a win in the following cases:
- the socket is consumed in the kernel (like ioeventfd/irqfd and 
vhost-net, or vfio)
- the I/O triggers a lot of cpu work, _and_ there is a lot of spare cpu 
available on the host, so we can parallelize work.  Note it ends up 
being less efficient, but delivering higher throughput.  That's common 
in benchmarks but not so good for real workloads.
- the socket is consumed in another process (not thread) in which case 
we gain some security
- the writes are such high frequency that we gain something from the 
queueing that happens when we thread work.  That's where the gain for 
serial console can come from - though it would be better to just use 
virtio-serial instead.

Second, introducing a new type of exit doesn't mean we see more exits.

Third, the new type of exit is probably not needed - we can use the 
existing mmio/pio exits, and document that in certain cases the kernel 
will fall back to these instead of delivering the I/O via the sockets 
(userspace can then try writing itself).

There was a fourth too but I forgot it already.
Pekka Enberg July 13, 2011, 7:51 a.m. UTC | #15
On Sun, Jul 10, 2011 at 8:34 AM, Sasha Levin <levinsasha928@gmail.com> wrote:
> After working on that solution a bit I saw it's adding a lot of code and
> complexity for this small issue, and I'm now thinking we may be better
> off with just handling reads twice in case of a signal just between
> socket_write() and socket_read() - once through the socket and once
> through a regular MMIO exit.

I don't really understand the issue so can you elaborate where the
complexity comes from? Why can't we just switch to non-blocking read
and return -ENOSUPP if there's signal_pending() after socket_write()?
AFAICT, we can just let callers of kvm_iodevice_write() and
kvm_iodevice_read() deal with exits, no?

                                  Pekka
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Pekka Enberg July 13, 2011, 8:02 a.m. UTC | #16
On Wed, Jul 13, 2011 at 10:07 AM, Avi Kivity <avi@redhat.com> wrote:
> - the writes are such high frequency that we gain something from the
> queueing is happens when we thread work.  That's where the gain for serial
> console can come from - though it would be better to just use virtio-serial
> instead.

We'd like to keep 8250 emulation because it's the most robust method
for getting data out of the kernel when there's problems. It's also
compatible with earlyprintk and such.

                                 Pekka
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Pekka Enberg July 13, 2011, 10:04 a.m. UTC | #17
On Wed, Jul 13, 2011 at 10:51 AM, Pekka Enberg <penberg@kernel.org> wrote:
> On Sun, Jul 10, 2011 at 8:34 AM, Sasha Levin <levinsasha928@gmail.com> wrote:
>> After working on that solution a bit I saw it's adding a lot of code and
>> complexity for this small issue, and I'm now thinking we may be better
>> off with just handling reads twice in case of a signal just between
>> socket_write() and socket_read() - once through the socket and once
>> through a regular MMIO exit.
>
> I don't really understand the issue so can you elaborate where the
> complexity comes from? Why can't we just switch to non-blocking read
> and return -ENOSUPP if there's signal_pending() after socket_write()?
> AFAICT, we can just let callers of kvm_iodevice_write() and
> kvm_iodevice_read() deal with exits, no?

Oh, re-reading Michael's explanation, signal_pending() is irrelevant
here and all we need to do is return -ENOSUPP if either the read or
write fails. What's the part I'm totally missing here?

                                  Pekka
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sasha Levin July 13, 2011, 10:26 a.m. UTC | #18
On Wed, 2011-07-13 at 13:04 +0300, Pekka Enberg wrote:
> On Wed, Jul 13, 2011 at 10:51 AM, Pekka Enberg <penberg@kernel.org> wrote:
> > On Sun, Jul 10, 2011 at 8:34 AM, Sasha Levin <levinsasha928@gmail.com> wrote:
> >> After working on that solution a bit I saw it's adding a lot of code and
> >> complexity for this small issue, and I'm now thinking we may be better
> >> off with just handling reads twice in case of a signal just between
> >> socket_write() and socket_read() - once through the socket and once
> >> through a regular MMIO exit.
> >
> > I don't really understand the issue so can you elaborate where the
> > complexity comes from? Why can't we just switch to non-blocking read
> > and return -ENOSUPP if there's signal_pending() after socket_write()?
> > AFAICT, we can just let callers of kvm_iodevice_write() and
> > kvm_iodevice_read() deal with exits, no?
> 
> Oh, re-reading Michael's explanation, signal_pending() is irrelevant
> here and all we need to do is return -ENOSUPP if either the read or
> write fails. What's the part I'm totally missing here?

The problem is that if we received a signal during the read notification
- after the write and before receiving the read - we have to go back to
userspace.

This means that userspace will see same read request twice (once in the
socket and once in the MMIO exit).
Pekka Enberg July 13, 2011, 10:56 a.m. UTC | #19
On Wed, Jul 13, 2011 at 1:26 PM, Sasha Levin <levinsasha928@gmail.com> wrote:
> The problem is that if we received a signal during the read notification
> the write and before receiving the read, we have to go back to
> userspace.
>
> This means that userspace will see same read request twice (once in the
> socket and once in the MMIO exit).

So the problem is only in ioeventfd_read() if socket_write() succeeds
but socket_read() fails? If so, can we do the socket_read() somewhere
else in the code which is able to restart it?

AFAICT, both ioeventfd_read() and ioeventfd_write() should -ENOSUPP if
socket_write() fails. If socket_write() succeeds, we should return
-EINTR and teach vcpu_mmio_read() and kernel_pio() to KVM_EXIT_INTR in
those cases. We can just restart the read, no? The only
complication I can see is that ioeventfd_read() needs to keep track if
it did the socket_write() already or not.

                                   Pekka
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Pekka Enberg July 13, 2011, 11:14 a.m. UTC | #20
On Wed, Jul 13, 2011 at 1:56 PM, Pekka Enberg <penberg@kernel.org> wrote:
> On Wed, Jul 13, 2011 at 1:26 PM, Sasha Levin <levinsasha928@gmail.com> wrote:
>> The problem is that if we received a signal during the read notification
>> the write and before receiving the read, we have to go back to
>> userspace.
>>
>> This means that userspace will see same read request twice (once in the
>> socket and once in the MMIO exit).
>
> So the problem is only in ioeventfd_read() if socket_write() succeeds
> but socket_read() fails? If so, can we do the socket_read() somewhere
> else in the code which is able to restart it?
>
> AFAICT, both ioeventfd_read() and ioeventfd_write() should -ENOSUPP if
> socket_write() fails. If socket_write() succeeds, we should return
> -EINTR and teach vcpu_mmio_read() and kernel_pio() to KVM_EXIT_INTR in
> those cases. We'll can just restart the read, no? The only
> complication I can see is that ioeventfd_read() needs to keep track if
> it did the socket_write() already or not.

Maybe struct _ioeventfd could hold a flag that tells ioevenfd_read()
that the write part was already successful?
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity July 13, 2011, 12:57 p.m. UTC | #21
On 07/13/2011 11:02 AM, Pekka Enberg wrote:
> On Wed, Jul 13, 2011 at 10:07 AM, Avi Kivity<avi@redhat.com>  wrote:
> >  - the writes are such high frequency that we gain something from the
> >  queueing is happens when we thread work.  That's where the gain for serial
> >  console can come from - though it would be better to just use virtio-serial
> >  instead.
>
> We'd like to keep 8250 emulation because it's the most robust method
> for getting data out of the kernel when there's problems. It's also
> compatible with earlyprintk and such.

What do you hope to gain from the optimization? 100ms less boot time?

btw, it likely needs read support, if any read can depend on a previous 
write.
Pekka Enberg July 13, 2011, 1 p.m. UTC | #22
On Wed, Jul 13, 2011 at 3:57 PM, Avi Kivity <avi@redhat.com> wrote:
>> We'd like to keep 8250 emulation because it's the most robust method
>> for getting data out of the kernel when there's problems. It's also
>> compatible with earlyprintk and such.
>
> What do you hope to gain from the optimization? 100ms less boot time?

Hmm? I use serial console for logging into the guest and actually
using it all the time.

                                Pekka
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity July 13, 2011, 1:32 p.m. UTC | #23
On 07/13/2011 04:00 PM, Pekka Enberg wrote:
> On Wed, Jul 13, 2011 at 3:57 PM, Avi Kivity<avi@redhat.com>  wrote:
> >>  We'd like to keep 8250 emulation because it's the most robust method
> >>  for getting data out of the kernel when there's problems. It's also
> >>  compatible with earlyprintk and such.
> >
> >  What do you hope to gain from the optimization? 100ms less boot time?
>
> Hmm? I use serial console for logging into the guest and actually
> using it all the time.

Just how fast do you type?

It's not enough to make something faster, it has to be slow to begin 
with.  The interface is good, but use it where it helps.
Pekka Enberg July 14, 2011, 7:26 a.m. UTC | #24
On 07/13/2011 04:00 PM, Pekka Enberg wrote:
>> On Wed, Jul 13, 2011 at 3:57 PM, Avi Kivity<avi@redhat.com>  wrote:
>> >>  We'd like to keep 8250 emulation because it's the most robust method
>> >>  for getting data out of the kernel when there's problems. It's also
>> >>  compatible with earlyprintk and such.
>> >
>> >  What do you hope to gain from the optimization? 100ms less boot time?
>>
>> Hmm? I use serial console for logging into the guest and actually
>> using it all the time.

On 7/13/11 4:32 PM, Avi Kivity wrote:
> Just how fast do you type?
>
> It's not enough to make something faster, it has to be slow to being 
> with.  The interface is good, but use it where it helps.

I don't know why you bring up serial console input here. Sasha's
original KVM_IOEVENTFD_FLAG_PIPE patch clearly stated:

   A simple example for practical use is the serial port. we are not
   interested in an exit every time a char is written to the port, but
   we do need to know what was written so we could handle it on the
   guest.

So it's talking about the serial console output, not input.

                             Pekka
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sasha Levin July 14, 2011, 8:07 a.m. UTC | #25
On Thu, 2011-07-14 at 10:26 +0300, Pekka Enberg wrote:
> On 07/13/2011 04:00 PM, Pekka Enberg wrote:
> >> On Wed, Jul 13, 2011 at 3:57 PM, Avi Kivity<avi@redhat.com>  wrote:
> >> >>  We'd like to keep 8250 emulation because it's the most robust method
> >> >>  for getting data out of the kernel when there's problems. It's also
> >> >>  compatible with earlyprintk and such.
> >> >
> >> >  What do you hope to gain from the optimization? 100ms less boot time?
> >>
> >> Hmm? I use serial console for logging into the guest and actually
> >> using it all the time.
> 
> On 7/13/11 4:32 PM, Avi Kivity wrote:
> > Just how fast do you type?
> >
> > It's not enough to make something faster, it has to be slow to being 
> > with.  The interface is good, but use it where it helps.
> 
> I don't know why you bring up serial console input here. Sasha's
> original KVM_IOEVENTFD_FLAG_PIPE patch clearly stated:
> 
>    A simple example for practical use is the serial port. we are not
>    interested in an exit every time a char is written to the port, but
>    we do need to know what was written so we could handle it on the
>    guest.
> 
> So it's talking about the serial console output, not input.

Yes, I had originally thought of doing the pipefd interface to prevent
the overhead that console-intensive output incurs on actual
benchmarks. It's funny how much slowness spinning the cursor or printing
a table incurs.
Avi Kivity July 14, 2011, 8:09 a.m. UTC | #26
On 07/14/2011 10:26 AM, Pekka Enberg wrote:
>
> On 7/13/11 4:32 PM, Avi Kivity wrote:
>> Just how fast do you type?
>>
>> It's not enough to make something faster, it has to be slow to being 
>> with.  The interface is good, but use it where it helps.
>
> I don't know why you bring up serial console input here. Sasha's
> original KVM_IOEVENTFD_FLAG_PIPE patch clearly stated:
>
>   A simple example for practical use is the serial port. we are not
>   interested in an exit every time a char is written to the port, but
>   we do need to know what was written so we could handle it on the
>   guest.
>
> So it's talking about the serial console output, not input.
>

I'm talking about the real world.  Is any of this something that needs 
optimization?

1024 vcpus logging in via the serial console at 10Gbps.  Get real.
Pekka Enberg July 14, 2011, 8:14 a.m. UTC | #27
On Thu, Jul 14, 2011 at 11:09 AM, Avi Kivity <avi@redhat.com> wrote:
> On 07/14/2011 10:26 AM, Pekka Enberg wrote:
>>
>> On 7/13/11 4:32 PM, Avi Kivity wrote:
>>>
>>> Just how fast do you type?
>>>
>>> It's not enough to make something faster, it has to be slow to being
>>> with.  The interface is good, but use it where it helps.
>>
>> I don't know why you bring up serial console input here. Sasha's
>> original KVM_IOEVENTFD_FLAG_PIPE patch clearly stated:
>>
>>  A simple example for practical use is the serial port. we are not
>>  interested in an exit every time a char is written to the port, but
>>  we do need to know what was written so we could handle it on the
>>  guest.
>>
>> So it's talking about the serial console output, not input.
>>
>
> I'm talking about the real world.  Is any of this something that needs
> optimization?

Yes. See Sasha's other email. We should have mentioned that in the changelog.

> 1024 vcpus logging in via the serial console at 10Gbps.  Get real.

[...]

Huh, why are you bringing something like that up?

                                Pekka
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov July 14, 2011, 8:19 a.m. UTC | #28
On Thu, Jul 14, 2011 at 11:09:57AM +0300, Avi Kivity wrote:
> On 07/14/2011 10:26 AM, Pekka Enberg wrote:
> >
> >On 7/13/11 4:32 PM, Avi Kivity wrote:
> >>Just how fast do you type?
> >>
> >>It's not enough to make something faster, it has to be slow to
> >>being with.  The interface is good, but use it where it helps.
> >
> >I don't know why you bring up serial console input here. Sasha's
> >original KVM_IOEVENTFD_FLAG_PIPE patch clearly stated:
> >
> >  A simple example for practical use is the serial port. we are not
> >  interested in an exit every time a char is written to the port, but
> >  we do need to know what was written so we could handle it on the
> >  guest.
> >
> >So it's talking about the serial console output, not input.
> >
> 
> I'm talking about the real world.  Is any of this something that
> needs optimization?
> 
> 1024 vcpus logging in via the serial console at 10Gbps.  Get real.
> 
It is nice to have the ability to trace a guest execution from a host
(added bonus if you can trace the guest and the host simultaneously and
have trace output in one nice file), but serial interface is not fit
for that. PV interface is needed. May be share ftrace buffer between
the host and the guest?

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin July 14, 2011, 8:25 a.m. UTC | #29
On Thu, Jul 14, 2011 at 11:09:57AM +0300, Avi Kivity wrote:
> I'm talking about the real world.  Is any of this something that
> needs optimization?

This work might help enable vhost support for non-MSI guests
(required for linux 2.6.30 and back). It's not enough
for that though: we would also need level interrupt
support in irqfd. That might be doable by using
sockets for irqfd in a similar way.
Avi Kivity July 14, 2011, 8:28 a.m. UTC | #30
On 07/14/2011 11:14 AM, Pekka Enberg wrote:
> >
> >  I'm talking about the real world.  Is any of this something that needs
> >  optimization?
>
> Yes. See Sasha's other email. We should have mentioned that in the changelog.

It's completely unrealistic.  Sash and you are the only two people in 
the universe logging into the guest via a serial console and running 
benchmarks.

> >  1024 vcpus logging in via the serial console at 10Gbps.  Get real.
>
> [...]
>
> Huh, why are you bringing something like that up?

Because it's another example of an unrealistic approach on your part.  
No real user is interested in 1024 vcpus, and no real user is interested 
in optimized serial console.  It's good that you are bringing up new 
ideas and new code, but adding code just for its own sake is not a good 
thing.
Avi Kivity July 14, 2011, 8:29 a.m. UTC | #31
On 07/14/2011 11:25 AM, Michael S. Tsirkin wrote:
> On Thu, Jul 14, 2011 at 11:09:57AM +0300, Avi Kivity wrote:
> >  I'm talking about the real world.  Is any of this something that
> >  needs optimization?
>
> This work might help enable vhost support for non-MSI guests
> (required for linux 2.6.30 and back). It's not enough
> for that though: we would also need level interrupt
> support in irqfd. That might be doable by using
> sockets for irqfd in a similar way.

I don't have an issue with socket mmio (it's a great feature once it's 
done well), just with presenting the serial port as motivation.
Pekka Enberg July 14, 2011, 8:59 a.m. UTC | #32
On Thu, Jul 14, 2011 at 11:28 AM, Avi Kivity <avi@redhat.com> wrote:
> On 07/14/2011 11:14 AM, Pekka Enberg wrote:
>>
>> >
>> >  I'm talking about the real world.  Is any of this something that needs
>> >  optimization?
>>
>> Yes. See Sasha's other email. We should have mentioned that in the
>> changelog.
>
> It's completely unrealistic.  Sash and you are the only two people in the
> universe logging into the guest via a serial console and running benchmarks.

Why does that matter? Why should we keep the emulation slow if it's
possible to fix it? It's a fair question to ask if the benefits
outweigh the added complexity but asking us to keep serial emulation
slow because *you* think it's unrealistic is, well, unrealistic on
your part!

>> >  1024 vcpus logging in via the serial console at 10Gbps.  Get real.
>>
>> [...]
>>
>> Huh, why are you bringing something like that up?
>
> Because it's another example of an unrealistic approach on your part.  No
> real user is interested in 1024 vcpus, and no real user is interested in
> optimized serial console.  It's good that you are bringing up new ideas and
> new code, but adding code just for its own sake is not a good thing.

*You* brought up 1024 vcpus using serial console! Obviously optimizing
something like that is stupid but we never claimed that we wanted to
do something like that!

As for 1024 vcpus, we already had the discussion where we explained
why we thought it was a good idea not to have such a low hard vcpu
limit for vcpus.

                                   Pekka
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity July 14, 2011, 9:48 a.m. UTC | #33
On 07/14/2011 11:59 AM, Pekka Enberg wrote:
> On Thu, Jul 14, 2011 at 11:28 AM, Avi Kivity<avi@redhat.com>  wrote:
> >  On 07/14/2011 11:14 AM, Pekka Enberg wrote:
> >>
> >>  >
> >>  >    I'm talking about the real world.  Is any of this something that needs
> >>  >    optimization?
> >>
> >>  Yes. See Sasha's other email. We should have mentioned that in the
> >>  changelog.
> >
> >  It's completely unrealistic.  Sash and you are the only two people in the
> >  universe logging into the guest via a serial console and running benchmarks.
>
> Why does that matter? Why should we keep the emulation slow if it's
> possible to fix it?

Fixing things that don't need fixing has a cost.  In work, in risk, and 
in maintainability.  If you can share this cost among other users (which 
is certainly possible with socket mmio), it may still be worth it.  But 
just making something faster is not sufficient, it has to be faster for 
a significant number of users.

> It's a fair question to ask if the benefits
> outweigh the added complexity but asking as to keep serial emulation
> slow because *you* think it's unrealistic is well - unrealistic from
> your part!

Exactly where am I unrealistic?  Do you think there are many users who 
suffer from slow serial emulation?

> >>  >    1024 vcpus logging in via the serial console at 10Gbps.  Get real.
> >>
> >>  [...]
> >>
> >>  Huh, why are you bringing something like that up?
> >
> >  Because it's another example of an unrealistic approach on your part.  No
> >  real user is interested in 1024 vcpus, and no real user is interested in
> >  optimized serial console.  It's good that you are bringing up new ideas and
> >  new code, but adding code just for its own sake is not a good thing.
>
> *You* brought up 1024 vcpus using serial console! Obviously optimizing
> something like that is stupid but we never claimed that we wanted to
> do something like that!

Either of them, independently, is unrealistic.  The example of them 
together was just levantine exaggeration, it wasn't meant to be taken 
literally.

> As for 1024 vcpus, we already had the discussion where we explained
> why we thought it was a good idea not to have such a low hard vcpu
> limit for vcpus.

I can't say I was convinced.  It's pretty simple to patch the kernel if 
you want to engage in such experiments.  We did find something that 
works out (the soft/hard limits), but it's still overkill.

There's a large attitude mismatch between tools/kvm developers and kvm 
developers (or at least me): tools/kvm is growing rapidly, adding 
features and improving stability at a fast pace.  kvm on the other hand 
is mature and a lot more concerned with preserving and improving 
stability than with adding new features.  The fact is, kvm is already 
very feature rich and very performant, so we're at a very different 
place in the performance/features/stability scales.
Pekka Enberg July 14, 2011, 10:30 a.m. UTC | #34
Hi Avi,

On Thu, Jul 14, 2011 at 12:48 PM, Avi Kivity <avi@redhat.com> wrote:
>> Why does that matter? Why should we keep the emulation slow if it's
>> possible to fix it?
>
> Fixing things that don't need fixing has a cost.  In work, in risk, and in
> maintainability.  If you can share this cost among other users (which is
> certainly possible with socket mmio), it may still be worth it.  But just
> making something faster is not sufficient, it has to be faster for a
> significant number of users.

I don't think it needs to be faster for a *significant number* of users
but yes, I completely agree that we need to make sure KVM gains more
than the costs are.

On Thu, Jul 14, 2011 at 12:48 PM, Avi Kivity <avi@redhat.com> wrote:
>> It's a fair question to ask if the benefits
>> outweigh the added complexity but asking as to keep serial emulation
>> slow because *you* think it's unrealistic is well - unrealistic from
>> your part!
>
> Exactly where am I unrealistic?  Do you think there are many users who
> suffer from slow serial emulation?

Again, I don't agree with you that there needs to be many users for this
type of feature. That said, as a maintainer you seem to think that it
is and I'm obviously OK with that.

So if you've been saying 'this is too complex for too little gain' all
this time, I've misunderstood what you've been trying to say. The way
I've read your comments is "optimizing serial console is stupid
because it's a useless feature" which is obviously not true because we
find it useful!

>> *You* brought up 1024 vcpus using serial console! Obviously optimizing
>> something like that is stupid but we never claimed that we wanted to
>> do something like that!
>
> Either of them, independently, is unrealistic.  The example of them together
> was just levantine exaggeration, it wasn't meant to be taken literally.

I obviously don't agree that they're unrealistic independently.

We want to use 8250 emulation instead of virtio-serial because it's
more compatible with kernel debugging mechanisms. Also, it makes
debugging virtio code much easier when we don't need to use virtio to
deliver console output while debugging it. We want to make it fast so
that we don't need to switch over to another console type after early
boot.

What's unreasonable about that?

Reasonably fast 1024 VCPUs would be great for testing kernel
configurations. KVM is not there yet so we suggested that we raise the
hard limit from current 64 VCPUs so that it's easier for people such
as ourselves to improve things. I don't understand why you think
that's unreasonable either!

On Thu, Jul 14, 2011 at 12:48 PM, Avi Kivity <avi@redhat.com> wrote:
>> As for 1024 vcpus, we already had the discussion where we explained
>> why we thought it was a good idea not to have such a low hard vcpu
>> limit for vcpus.
>
> I can't say I was convinced.  It's pretty simple to patch the kernel if you
> want to engage in such experiments.  We did find something that works out
> (the soft/hard limits), but it's still overkill.

I thought you were convinced that KVM_CAP_MAX_VCPUS was reasonable. I
guess I misunderstood your position then.

On Thu, Jul 14, 2011 at 12:48 PM, Avi Kivity <avi@redhat.com> wrote:
> There's a large attitude mismatch between tools/kvm developers and kvm
> developers (or at least me): tools/kvm is growing rapidly, adding features
> and improving stability at a fast pace.  kvm on the other hand is mature and
> a lot more concerned with preserving and improving stability than with
> adding new features.  The fact is, kvm is already very feature rich and very
> performant, so we're at a very different place in the
> performance/features/stability scales.

Yes, we're at different places but we definitely appreciate the
stability and performance of KVM and have no interest in disrupting
that. I don't expect you to merge our patches when you think they're
risky or not worth the added complexity. So there's no attitude
mismatch there.

I simply don't agree with some of your requirements (significant
number of users) or some of the technical decisions (VCPU hard limit
at 64).

                                  Pekka
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity July 14, 2011, 11:54 a.m. UTC | #35
On 07/14/2011 01:30 PM, Pekka Enberg wrote:
> Hi Avi,
>
> On Thu, Jul 14, 2011 at 12:48 PM, Avi Kivity<avi@redhat.com>  wrote:
> >>  Why does that matter? Why should we keep the emulation slow if it's
> >>  possible to fix it?
> >
> >  Fixing things that don't need fixing has a cost.  In work, in risk, and in
> >  maintainability.  If you can share this cost among other users (which is
> >  certainly possible with socket mmio), it may still be worth it.  But just
> >  making something faster is not sufficient, it has to be faster for a
> >  significant number of users.
>
> I don't think it needs to be faster for *significant number* of users
> but yes, I completely agree that we need to make sure KVM gains more
> than the costs are.

Significant, for me, means it's measured in a percentage, not as digits 
on various limbs.  2% is a significant amount of users.  5 is not.

> On Thu, Jul 14, 2011 at 12:48 PM, Avi Kivity<avi@redhat.com>  wrote:
> >>  It's a fair question to ask if the benefits
> >>  outweigh the added complexity but asking us to keep serial emulation
> >>  slow because *you* think it's unrealistic is well - unrealistic from
> >>  your part!
> >
> >  Exactly where am I unrealistic?  Do you think there are many users who
> >  suffer from slow serial emulation?
>
> Again, I don't agree with you that there needs to be many users for this
> type of feature. That said, as a maintainer you seem to think that it
> is and I'm obviously OK with that.
>
> So if you've been saying 'this is too complex for too little gain' all
> this time, I've misunderstood what you've been trying to say. The way
> I've read your comments is "optimizing serial console is stupid
> because it's a useless feature" which obviously not true because we
> find it useful!

More or less.  Note "this" here is not socket mmio - but I want to see a 
real use case for socket mmio.

> >>  *You* brought up 1024 vcpus using serial console! Obviously optimizing
> >>  something like that is stupid but we never claimed that we wanted to
> >>  do something like that!
> >
> >  Either of them, independently, is unrealistic.  The example of them together
> >  was just levantine exaggeration, it wasn't meant to be taken literally.
>
> I obviously don't agree that they're unrealistic independently.
>
> We want to use 8250 emulation instead of virtio-serial because it's
> more compatible with kernel debugging mechanisms. Also, it makes
> debugging virtio code much easier when we don't need to use virtio to
> deliver console output while debugging it. We want to make it fast so
> that we don't need to switch over to another console type after early
> boot.
>
> What's unreasonable about that?

Does virtio debugging really need super-fast serial?  Does it need 
serial at all?

> Reasonably fast 1024 VCPUs would be great for testing kernel
> configurations. KVM is not there yet so we suggested that we raise the
> hard limit from current 64 VCPUs so that it's easier for people such
> as ourselves to improve things. I don't understand why you think
> that's unreasonable either!

You will never get reasonably fast 1024 vcpus on your laptop.  As soon 
as your vcpus start doing useful work, they will thrash.  The guest 
kernel expects reasonable latency on cross-cpu operations, and kvm won't 
be able to provide it with such overcommit.  The PLE stuff attempts to 
mitigate some of the problem, but it's not going to work for such huge 
overcommit.

Every contended spin_lock() or IPI will turn into a huge spin in the 
worst case or a context switch in the best case.  Performance will tank 
unless you're running some shared-nothing process-per-vcpu workload in 
the guest.

The only way to get reasonable 1024 vcpu performance is to run it on a 
1024 cpu host.  People who have such machines are usually interested in 
realistic workloads, and that means much smaller guests.  If you do want 
to run 1024-on-1024, there is a lot of work in getting NUMA to function 
correctly; what we have now is not sufficient for large machines with 
large NUMA factors.

> On Thu, Jul 14, 2011 at 12:48 PM, Avi Kivity<avi@redhat.com>  wrote:
> >>  As for 1024 vcpus, we already had the discussion where we explained
> >>  why we thought it was a good idea not to have such a low hard vcpu
> >>  limit for vcpus.
> >
> >  I can't say I was convinced.  It's pretty simple to patch the kernel if you
> >  want to engage in such experiments.  We did find something that works out
> >  (the soft/hard limits), but it's still overkill.
>
> I thought you were convinced that KVM_CAP_MAX_VCPUS was reasonable. I
> guess I misunderstood your position then.

It's "okay", but no more.  If I were Linus I'd say it's scalability 
masturbation.

> On Thu, Jul 14, 2011 at 12:48 PM, Avi Kivity<avi@redhat.com>  wrote:
> >  There's a large attitude mismatch between tools/kvm developers and kvm
> >  developers (or at least me): tools/kvm is growing rapidly, adding features
> >  and improving stability at a fast pace.  kvm on the other hand is mature and
> >  a lot more concerned with preserving and improving stability than with
> >  adding new features.  The fact is, kvm is already very feature rich and very
> >  performant, so we're at a very different place in the
> >  performance/features/stability scales.
>
> Yes, we're at different places but we definitely appreciate the
> stability and performance of KVM and have no interest in disrupting
> that. I don't expect you to merge our patches when you think they're
> risky or not worth the added complexity. So there's no attitude
> mismatch there.
>
> I simply don't agree with some of your requirements (significant
> number of users) or some of the technical decisions (VCPU hard limit
> at 64).

It's great to have a disagreement without descending into ugly 
flamewars, I appreciate that.
Sasha Levin July 14, 2011, 12:32 p.m. UTC | #36
On Thu, 2011-07-14 at 14:54 +0300, Avi Kivity wrote:
> On 07/14/2011 01:30 PM, Pekka Enberg wrote:
> > We want to use 8250 emulation instead of virtio-serial because it's
> > more compatible with kernel debugging mechanisms. Also, it makes
> > debugging virtio code much easier when we don't need to use virtio to
> > deliver console output while debugging it. We want to make it fast so
> > that we don't need to switch over to another console type after early
> > boot.
> >
> > What's unreasonable about that?
> 
> Does virtio debugging really need super-fast serial?  Does it need 
> serial at all?
> 

Does it need super-fast serial? No, although it's nice. Does it need
serial at all? Definitely.

It's not just virtio which can fail running on virtio-console, it's also
the threadpool, the eventfd mechanism and even the PCI management
module. You can't really debug it if you can't depend on your debugging
mechanism to properly work.

So far, serial is the simplest, most effective, and never-failing method
we had for working on guests, I don't see how we can work without it at
the moment.


> > Reasonably fast 1024 VCPUs would be great for testing kernel
> > configurations. KVM is not there yet so we suggested that we raise the
> > hard limit from current 64 VCPUs so that it's easier for people such
> > as ourselves to improve things. I don't understand why you think
> > that's unreasonable either!
> 
> You will never get reasonably fast 1024 vcpus on your laptop.  As soon 
> as your vcpus start doing useful work, they will thrash.  The guest 
> kernel expects reasonable latency on cross-cpu operations, and kvm won't 
> be able to provide it with such overcommit.  The PLE stuff attempts to 
> mitigate some of the problem, but it's not going to work for such huge 
> overcommit.
> 

I agree here that the performance even with 256 vcpus would be terrible
and no 'real' users would be doing that until the infrastructure could
provide reasonable performance.

The two uses I see for it are:

1. Stressing out the usermode code. One of the reasons qemu can't
properly do 64 vcpus now is not just due to the KVM kernel code, it's
also due to qemu itself. We're trying to avoid doing the same
with /tools/kvm.

2. Preventing future scalability problems. Currently we can't do 1024
vcpus because it breaks coalesced MMIO - which is IMO not a valid reason
for not scaling up to 1024 vcpus (and by scaling I mean running without
errors, without regards to performance).
Pekka Enberg July 14, 2011, 12:37 p.m. UTC | #37
Hi Avi,

On Thu, Jul 14, 2011 at 2:54 PM, Avi Kivity <avi@redhat.com> wrote:
>> I don't think it needs to be faster for *significant number* of users
>> but yes, I completely agree that we need to make sure KVM gains more
>> than the costs are.
>
> Significant, for me, means it's measured in a percentage, not as digits on
> various limbs.  2% is a significant amount of users.  5 is not.

Just to make my position clear: I think it's enough that there's one
user that benefits as long as complexity isn't too high and I think
x86/Voyager is a pretty good example of that. So while you can argue
*for* complexity if there are enough users, when there's only few
users, it's really about whether a feature adds significant complexity
or not.

>> We want to use 8250 emulation instead of virtio-serial because it's
>> more compatible with kernel debugging mechanisms. Also, it makes
>> debugging virtio code much easier when we don't need to use virtio to
>> deliver console output while debugging it. We want to make it fast so
>> that we don't need to switch over to another console type after early
>> boot.
>>
>> What's unreasonable about that?
>
> Does virtio debugging really need super-fast serial?  Does it need serial at
> all?

Text mode guests should be super-fast, not virtio debugging. We want
to use 8250 instead of virtio serial or virtio console (which we
support, btw) because it's more compatible with Linux. Debugging
virtio with 8250 has turned out to be useful in the past.

>> I thought you were convinced that KVM_CAP_MAX_VCPUS was reasonable. I
>> guess I misunderstood your position then.
>
> It's "okay", but no more.  If I were Linus I'd say it's scalability
> masturbation.

That's definitely not the intent and I suppose we have different
opinions on what's 'reasonably fast'.

                                Pekka
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity July 14, 2011, 12:46 p.m. UTC | #38
On 07/14/2011 03:32 PM, Sasha Levin wrote:
> On Thu, 2011-07-14 at 14:54 +0300, Avi Kivity wrote:
> >  On 07/14/2011 01:30 PM, Pekka Enberg wrote:
> >  >  We want to use 8250 emulation instead of virtio-serial because it's
> >  >  more compatible with kernel debugging mechanisms. Also, it makes
> >  >  debugging virtio code much easier when we don't need to use virtio to
> >  >  deliver console output while debugging it. We want to make it fast so
> >  >  that we don't need to switch over to another console type after early
> >  >  boot.
> >  >
> >  >  What's unreasonable about that?
> >
> >  Does virtio debugging really need super-fast serial?  Does it need
> >  serial at all?
> >
>
> Does it need super-fast serial? No, although it's nice. Does it need
> serial at all? Definitely.

Why?  virtio is mature.  It's not some early boot thing which fails and 
kills the guest.  Even if you get an oops, usually the guest is still alive.

> It's not just virtio which can fail running on virtio-console, it's also
> the threadpool, the eventfd mechanism and even the PCI management
> module. You can't really debug it if you can't depend on your debugging
> mechanism to properly work.

Wait, those are guest things, not host things.

> So far, serial is the simplest, most effective, and never-failing method
> we had for working on guests, I don't see how we can work without it at
> the moment.

I really can't remember the last time I used the serial console for the 
guest.  In the early early days, sure, but now?

> I agree here that the performance even with 256 vcpus would be terrible
> and no 'real' users would be doing that until the infrastructure could
> provide reasonable performance.
>
> The two uses I see for it are:
>
> 1. Stressing out the usermode code. One of the reasons qemu can't
> properly do 64 vcpus now is not just due to the KVM kernel code, it's
> also due to qemu itself. We're trying to avoid doing the same
> with /tools/kvm.

It won't help without a 1024 cpu host.  As soon as you put a real 
workload on the guest, it will thrash and any scaling issue in qemu or 
tools/kvm will be drowned in the noise.

> 2. Preventing future scalability problems. Currently we can't do 1024
> vcpus because it breaks coalesced MMIO - which is IMO not a valid reason
> for not scaling up to 1024 vcpus (and by scaling I mean running without
> errors, without regards to performance).

That's not what scaling means (not to say that it wouldn't be nice to 
fix coalesced mmio).

btw, why are you so eager to run 1024 vcpu guests? usually, if you have 
a need for such large systems, you're really performance sensitive.  
It's not a good case for virtualization.
Avi Kivity July 14, 2011, 12:48 p.m. UTC | #39
On 07/14/2011 03:37 PM, Pekka Enberg wrote:
> Hi Avi,
>
> On Thu, Jul 14, 2011 at 2:54 PM, Avi Kivity<avi@redhat.com>  wrote:
> >>  I don't think it needs to be faster for *significant number* of users
> >>  but yes, I completely agree that we need to make sure KVM gains more
> >>  than the costs are.
> >
> >  Significant, for me, means it's measured in a percentage, not as digits on
> >  various limbs.  2% is a significant amount of users.  5 is not.
>
> Just to make my position clear: I think it's enough that there's one
> user that benefits as long as complexity isn't too high and I think
> x86/Voyager is a pretty good example of that. So while you can argue
> *for* complexity if there are enough users, when there's only few
> users, it's really about whether a feature adds significant complexity
> or not.

Everything adds complexity.  And I think x86/voyager was a mistake.

> >>  We want to use 8250 emulation instead of virtio-serial because it's
> >>  more compatible with kernel debugging mechanisms. Also, it makes
> >>  debugging virtio code much easier when we don't need to use virtio to
> >>  deliver console output while debugging it. We want to make it fast so
> >>  that we don't need to switch over to another console type after early
> >>  boot.
> >>
> >>  What's unreasonable about that?
> >
> >  Does virtio debugging really need super-fast serial?  Does it need serial at
> >  all?
>
> Text mode guests should be super-fast, not virtio debugging. We want
> to use 8250 instead of virtio serial or virtio console (which we
> support, btw) because it's more compatible with Linux. Debugging
> virtio with 8250 has turned out to be useful in the past.

Use virtio-console when you're in production (it will be much much 
faster than socket-mmio 8250), and 8250 when debugging.
Pekka Enberg July 14, 2011, 12:52 p.m. UTC | #40
On Thu, 2011-07-14 at 15:48 +0300, Avi Kivity wrote:
> Use virtio-console when you're in production (it will be much much 
> faster than socket-mmio 8250), and 8250 when debugging.

This is exactly what we try to avoid. We want to use same code for
production and debugging when possible. And btw, when I have my kernel
hacker hat on, production is debugging so it's important that that's
fast too.

			Pekka

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity July 14, 2011, 12:54 p.m. UTC | #41
On 07/14/2011 03:52 PM, Pekka Enberg wrote:
> On Thu, 2011-07-14 at 15:48 +0300, Avi Kivity wrote:
> >  Use virtio-console when you're in production (it will be much much
> >  faster than socket-mmio 8250), and 8250 when debugging.
>
> This is exactly what we try to avoid. We want to use same code for
> production and debugging when possible. And btw, when I have my kernel
> hacker hat on, production is debugging so it's important that that's
> fast too.

But again, socket-mmio 8250 will never match virtio-console.
Sasha Levin July 14, 2011, 1 p.m. UTC | #42
On Thu, 2011-07-14 at 15:46 +0300, Avi Kivity wrote:
> On 07/14/2011 03:32 PM, Sasha Levin wrote:
> > On Thu, 2011-07-14 at 14:54 +0300, Avi Kivity wrote:
> > >  On 07/14/2011 01:30 PM, Pekka Enberg wrote:
> > >  >  We want to use 8250 emulation instead of virtio-serial because it's
> > >  >  more compatible with kernel debugging mechanisms. Also, it makes
> > >  >  debugging virtio code much easier when we don't need to use virtio to
> > >  >  deliver console output while debugging it. We want to make it fast so
> > >  >  that we don't need to switch over to another console type after early
> > >  >  boot.
> > >  >
> > >  >  What's unreasonable about that?
> > >
> > >  Does virtio debugging really need super-fast serial?  Does it need
> > >  serial at all?
> > >
> >
> > Does it need super-fast serial? No, although it's nice. Does it need
> > serial at all? Definitely.
> 
> Why?  virtio is mature.  It's not some early boot thing which fails and 
> kills the guest.  Even if you get an oops, usually the guest is still alive.

virtio is mature, /tools/kvm isn't :)

> 
> > It's not just virtio which can fail running on virtio-console, it's also
> > the threadpool, the eventfd mechanism and even the PCI management
> > module. You can't really debug it if you can't depend on your debugging
> > mechanism to properly work.
> 
> Wait, those are guest things, not host things.

Yes, as you said in the previous mail, both KVM and virtio are very
stable. /tools/kvm was the one who was being debugged most of the time.


> > So far, serial is the simplest, most effective, and never-failing method
> > we had for working on guests, I don't see how we can work without it at
> > the moment.
> 
> I really can't remember the last time I used the serial console for the 
> guest.  In the early early days, sure, but now?
> 

I don't know, if it works fine why not use it when you need simple
serial connection?

It's also useful for kernel hackers who break early boot things :)

> > I agree here that the performance even with 256 vcpus would be terrible
> > and no 'real' users would be doing that until the infrastructure could
> > provide reasonable performance.
> >
> > The two uses I see for it are:
> >
> > 1. Stressing out the usermode code. One of the reasons qemu can't
> > properly do 64 vcpus now is not just due to the KVM kernel code, it's
> > also due to qemu itself. We're trying to avoid doing the same
> > with /tools/kvm.
> 
> It won't help without a 1024 cpu host.  As soon as you put a real 
> workload on the guest, it will thrash and any scaling issue in qemu or 
> tools/kvm will be drowned in the noise.
> 
> > 2. Preventing future scalability problems. Currently we can't do 1024
> > vcpus because it breaks coalesced MMIO - which is IMO not a valid reason
> > for not scaling up to 1024 vcpus (and by scaling I mean running without
> > errors, without regards to performance).
> 
> That's not what scaling means (not to say that it wouldn't be nice to 
> fix coalesced mmio).
> 
> btw, why are you so eager to run 1024 vcpu guests? usually, if you have 
> a need for such large systems, you're really performance sensitive.  
> It's not a good case for virtualization.
> 
> 

I may have gone too far with 1024, I have only tested it on 254 vcpus so
far - I'll change that in my patch.

It's also not just a KVM issue. Take for example the RCU issue which we
were able to detect with /tools/kvm just by trying more than 30 vcpus
and noticing that RCU was broken with a recent kernel.

Testing the kernel on guests with large amount of vcpus or virtual
memory might prove beneficial not only for KVM itself.
Avi Kivity July 14, 2011, 1:05 p.m. UTC | #43
On 07/14/2011 04:00 PM, Sasha Levin wrote:
> >
> >  Why?  virtio is mature.  It's not some early boot thing which fails and
> >  kills the guest.  Even if you get an oops, usually the guest is still alive.
>
> virtio is mature, /tools/kvm isn't :)
> >
> >  >  It's not just virtio which can fail running on virtio-console, it's also
> >  >  the threadpool, the eventfd mechanism and even the PCI management
> >  >  module. You can't really debug it if you can't depend on your debugging
> >  >  mechanism to properly work.
> >
> >  Wait, those are guest things, not host things.
>
> Yes, as you said in the previous mail, both KVM and virtio are very
> stable. /tools/kvm was the one who was being debugged most of the time.

I still don't follow.  The guest oopses? dmesg | less.  An issue with 
tools/kvm? gdb -p `pgrep kvm`.

> >  >  So far, serial is the simplest, most effective, and never-failing method
> >  >  we had for working on guests, I don't see how we can work without it at
> >  >  the moment.
> >
> >  I really can't remember the last time I used the serial console for the
> >  guest.  In the early early days, sure, but now?
> >
>
> I don't know, if it works fine why not use it when you need simple
> serial connection?
>
> It's also useful for kernel hackers who break early boot things :)

I'm not advocating removing it!  I'm just questioning the need for 
optimization.

> >  That's not what scaling means (not to say that it wouldn't be nice to
> >  fix coalesced mmio).
> >
> >  btw, why are you so eager to run 1024 vcpu guests? usually, if you have
> >  a need for such large systems, you're really performance sensitive.
> >  It's not a good case for virtualization.
> >
> >
>
> I may have went too far with 1024, I have only tested it on 254 vcpus so
> far - I'll change that in my patch.
>
> It's also not just a KVM issue. Take for example the RCU issue which we
> were able to detect with /tools/kvm just by trying more than 30 vcpus
> and noticing that RCU was broken with a recent kernel.
>
> Testing the kernel on guests with large amount of vcpus or virtual
> memory might prove beneficial not only for KVM itself.

Non-performance testing of really large guests is a valid use case, I agree.
Pekka Enberg July 14, 2011, 1:17 p.m. UTC | #44
On Thu, 2011-07-14 at 16:05 +0300, Avi Kivity wrote:
> On 07/14/2011 04:00 PM, Sasha Levin wrote:
> > >
> > >  Why?  virtio is mature.  It's not some early boot thing which fails and
> > >  kills the guest.  Even if you get an oops, usually the guest is still alive.
> >
> > virtio is mature, /tools/kvm isn't :)
> > >
> > >  >  It's not just virtio which can fail running on virtio-console, it's also
> > >  >  the threadpool, the eventfd mechanism and even the PCI management
> > >  >  module. You can't really debug it if you can't depend on your debugging
> > >  >  mechanism to properly work.
> > >
> > >  Wait, those are guest things, not host things.
> >
> > Yes, as you said in the previous mail, both KVM and virtio are very
> > stable. /tools/kvm was the one who was being debugged most of the time.
> 
> I still don't follow.  The guest oopses? dmesg | less.  An issue with 
> tools/kvm? gdb -p `pgrep kvm`.

When I was debugging tools/kvm virtio code, I used to 'instrument' the
guest kernel with printk() calls which helped a lot.

Also, a bug in tools/kvm can manifest in many interesting ways in the
guest kernel during boot, for example. You can't do dmesg then and gdb
won't save you. I think you've lived too long in the stable KVM and Qemu
land to remember how important reliable printk() is for development.

			Pekka

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity July 14, 2011, 1:23 p.m. UTC | #45
On 07/14/2011 04:17 PM, Pekka Enberg wrote:
> >
> >  I still don't follow.  The guest oopses? dmesg | less.  An issue with
> >  tools/kvm? gdb -p `pgrep kvm`.
>
> When I was debugging tools/kvm virtio code, I used to 'instrument' the
> guest kernel with printk() calls which helped a lot.
>

Sure, but do you really need it spewing out the serial port all the time?

> Also, a bug in tools/kvm can manifest in many interesting ways in the
> guest kernel during boot, for example. You can't do dmesg then and gdb
> won't save you. I think you've lived too long in the table KVM and Qemu
> land to remember how important reliable printk() is for development.

I guess.  Also I've switched to trace_printk() since it's much nicer 
(and intergrates with other ftrace features).

And again, I'm not against tools/kvm optimizing serial.  I just want 
better justification for socket-mmio.
Anthony Liguori July 20, 2011, 2:42 a.m. UTC | #46
On 07/05/2011 11:37 PM, Sasha Levin wrote:
> The new flag allows passing a connected socket instead of an
> eventfd to be notified of writes or reads to the specified memory region.
>
> Instead of signaling an event, On write - the value written to the memory
> region is written to the pipe.
> On read - a notification of the read is sent to the host, and a response
> is expected with the value to be 'read'.
>
> Using a socket instead of an eventfd is useful when any value can be
> written to the memory region but we're interested in receiving the
> actual value instead of just a notification.
>
> A simple example for practical use is the serial port. we are not
> interested in an exit every time a char is written to the port, but
> we do need to know what was written so we could handle it on the guest.

I suspect a normal uart is a bad use case for this.

The THR can only hold a single value, a guest is not supposed to write 
to the THR again until the THRE flag in the LSR is set which indicates 
that the THR is empty.

In fact, when THRE is raised, the device emulation should raise an 
interrupt and that's what should trigger the guest to write another 
value to the THR.

So in practice, I don't see how it would work without relying on some 
specific behavior of the current Linux uart guest code.

But that said, I think this is an interesting mechanism.  I'd be curious 
how well it worked for VGA planar access which is what the current 
coalesced I/O is most useful for.

Regards,

Anthony Liguori

>
> Cc: Avi Kivity<avi@redhat.com>
> Cc: Ingo Molnar<mingo@elte.hu>
> Cc: Marcelo Tosatti<mtosatti@redhat.com>
> Cc: Michael S. Tsirkin<mst@redhat.com>
> Cc: Pekka Enberg<penberg@kernel.org>
> Signed-off-by: Sasha Levin<levinsasha928@gmail.com>
> ---
>   Documentation/virtual/kvm/api.txt |   18 ++++-
>   include/linux/kvm.h               |    9 ++
>   virt/kvm/eventfd.c                |  153 ++++++++++++++++++++++++++++++++-----
>   3 files changed, 161 insertions(+), 19 deletions(-)
>
> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index 317d86a..74f0946 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -1330,7 +1330,7 @@ Returns: 0 on success, !0 on error
>
>   This ioctl attaches or detaches an ioeventfd to a legal pio/mmio address
>   within the guest.  A guest write in the registered address will signal the
> -provided event instead of triggering an exit.
> +provided event or write to the provided socket instead of triggering an exit.
>
>   struct kvm_ioeventfd {
>   	__u64 datamatch;
> @@ -1341,6 +1341,13 @@ struct kvm_ioeventfd {
>   	__u8  pad[36];
>   };
>
> +struct kvm_ioeventfd_data {
> +	__u64 data;
> +	__u64 addr;
> +	__u32 len;
> +	__u8  is_write;
> +};
> +
>   The following flags are defined:
>
>   #define KVM_IOEVENTFD_FLAG_DATAMATCH (1<<  kvm_ioeventfd_flag_nr_datamatch)
> @@ -1348,6 +1355,7 @@ The following flags are defined:
>   #define KVM_IOEVENTFD_FLAG_DEASSIGN  (1<<  kvm_ioeventfd_flag_nr_deassign)
>   #define KVM_IOEVENTFD_FLAG_READ      (1<<  kvm_ioeventfd_flag_nr_read)
>   #define KVM_IOEVENTFD_FLAG_NOWRITE   (1<<  kvm_ioeventfd_flag_nr_nowrite)
> +#define KVM_IOEVENTFD_FLAG_SOCKET    (1<<  kvm_ioeventfd_flag_nr_socket)
>
>   If datamatch flag is set, the event will be signaled only if the written value
>   to the registered address is equal to datamatch in struct kvm_ioeventfd.
> @@ -1359,6 +1367,14 @@ passed in datamatch.
>   If the nowrite flag is set, the event won't be signaled when the specified address
>   is being written to.
>
> +If the socket flag is set, fd is expected to be a connected AF_UNIX
> +SOCK_SEQPACKET socket. Once a guest write in the registered address is
> +detected - a struct kvm_ioeventfd_data which describes the write will be
> +written to the socket.
> +On read, struct kvm_ioeventfd_data will be written with 'is_write = 0', and
> +would wait for a response with a struct kvm_ioeventfd_data containing the
> +value which should be 'read' by the guest.
> +
>
>   5. The kvm_run structure
>
> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> index 8a12711..ff3d808 100644
> --- a/include/linux/kvm.h
> +++ b/include/linux/kvm.h
> @@ -389,6 +389,7 @@ enum {
>   	kvm_ioeventfd_flag_nr_deassign,
>   	kvm_ioeventfd_flag_nr_read,
>   	kvm_ioeventfd_flag_nr_nowrite,
> +	kvm_ioeventfd_flag_nr_socket,
>   	kvm_ioeventfd_flag_nr_max,
>   };
>
> @@ -397,6 +398,7 @@ enum {
>   #define KVM_IOEVENTFD_FLAG_DEASSIGN  (1<<  kvm_ioeventfd_flag_nr_deassign)
>   #define KVM_IOEVENTFD_FLAG_READ      (1<<  kvm_ioeventfd_flag_nr_read)
>   #define KVM_IOEVENTFD_FLAG_NOWRITE   (1<<  kvm_ioeventfd_flag_nr_nowrite)
> +#define KVM_IOEVENTFD_FLAG_SOCKET    (1<<  kvm_ioeventfd_flag_nr_socket)
>
>   #define KVM_IOEVENTFD_VALID_FLAG_MASK  ((1<<  kvm_ioeventfd_flag_nr_max) - 1)
>
> @@ -409,6 +411,13 @@ struct kvm_ioeventfd {
>   	__u8  pad[36];
>   };
>
> +struct kvm_ioeventfd_data {
> +	__u64 data;
> +	__u64 addr;
> +	__u32 len;
> +	__u8  is_write;
> +};
> +
>   /* for KVM_ENABLE_CAP */
>   struct kvm_enable_cap {
>   	/* in */
> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> index 5f2d203..d1d63b3 100644
> --- a/virt/kvm/eventfd.c
> +++ b/virt/kvm/eventfd.c
> @@ -32,6 +32,7 @@
>   #include<linux/eventfd.h>
>   #include<linux/kernel.h>
>   #include<linux/slab.h>
> +#include<linux/net.h>
>
>   #include "iodev.h"
>
> @@ -413,10 +414,11 @@ module_exit(irqfd_module_exit);
>
>   /*
>    * --------------------------------------------------------------------
> - * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal.
> + * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal or
> + *            a socket write.
>    *
> - * userspace can register a PIO/MMIO address with an eventfd for receiving
> - * notification when the memory has been touched.
> + * userspace can register a PIO/MMIO address with an eventfd or a
> + * socket for receiving notification when the memory has been touched.
>    * --------------------------------------------------------------------
>    */
>
> @@ -424,7 +426,10 @@ struct _ioeventfd {
>   	struct list_head     list;
>   	u64                  addr;
>   	int                  length;
> -	struct eventfd_ctx  *eventfd;
> +	union {
> +		struct socket       *sock;
> +		struct eventfd_ctx  *eventfd;
> +	};
>   	u64                  datamatch;
>   	struct kvm_io_device dev;
>   	bool                 wildcard;
> @@ -441,7 +446,11 @@ to_ioeventfd(struct kvm_io_device *dev)
>   static void
>   ioeventfd_release(struct _ioeventfd *p)
>   {
> -	eventfd_ctx_put(p->eventfd);
> +	if (p->eventfd)
> +		eventfd_ctx_put(p->eventfd);
> +	else
> +		sockfd_put(p->sock);
> +
>   	list_del(&p->list);
>   	kfree(p);
>   }
> @@ -510,12 +519,65 @@ ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
>   	return _val == p->datamatch ? true : false;
>   }
>
> +static ssize_t socket_write(struct socket *sock, const void *buf, size_t count)
> +{
> +	mm_segment_t old_fs;
> +	ssize_t res;
> +	struct msghdr msg;
> +	struct iovec iov;
> +
> +	iov = (struct iovec) {
> +		.iov_base = (void *)buf,
> +		.iov_len  = count,
> +	};
> +
> +	msg = (struct msghdr) {
> +		.msg_iov =&iov,
> +		.msg_iovlen = 1,
> +	};
> +
> +	old_fs = get_fs();
> +	set_fs(get_ds());
> +	/* The cast to a user pointer is valid due to the set_fs() */
> +	res = sock_sendmsg(sock,&msg, count);
> +	set_fs(old_fs);
> +
> +	return res;
> +}
> +
> +static ssize_t socket_read(struct socket *sock, void *buf, size_t count)
> +{
> +	mm_segment_t old_fs;
> +	ssize_t res;
> +	struct msghdr msg;
> +	struct iovec iov;
> +
> +	iov = (struct iovec) {
> +		.iov_base = (void *)buf,
> +		.iov_len  = count,
> +	};
> +
> +	msg = (struct msghdr) {
> +		.msg_iov =&iov,
> +		.msg_iovlen = 1,
> +	};
> +
> +	old_fs = get_fs();
> +	set_fs(get_ds());
> +	/* The cast to a user pointer is valid due to the set_fs() */
> +	res = sock_recvmsg(sock,&msg, count, 0);
> +	set_fs(old_fs);
> +
> +	return res;
> +}
> +
>   /* MMIO/PIO writes trigger an event if the addr/val match */
>   static int
>   ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
>   		const void *val)
>   {
>   	struct _ioeventfd *p = to_ioeventfd(this);
> +	struct kvm_ioeventfd_data data;
>
>   	/* Exit if signaling on writes isn't requested */
>   	if (!p->track_writes)
> @@ -524,7 +586,18 @@ ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
>   	if (!ioeventfd_in_range(p, addr, len, val))
>   		return -EOPNOTSUPP;
>
> -	eventfd_signal(p->eventfd, 1);
> +	data = (struct kvm_ioeventfd_data) {
> +		.data = get_val(val, len),
> +		.addr = addr,
> +		.len = len,
> +		.is_write = 1,
> +	};
> +
> +	if (p->sock)
> +		socket_write(p->sock,&data, sizeof(data));
> +	else
> +		eventfd_signal(p->eventfd, 1);
> +
>   	return 0;
>   }
>
> @@ -534,6 +607,7 @@ ioeventfd_read(struct kvm_io_device *this, gpa_t addr, int len,
>   		void *val)
>   {
>   	struct _ioeventfd *p = to_ioeventfd(this);
> +	struct kvm_ioeventfd_data data;
>
>   	/* Exit if signaling on reads isn't requested */
>   	if (!p->track_reads)
> @@ -542,7 +616,21 @@ ioeventfd_read(struct kvm_io_device *this, gpa_t addr, int len,
>   	if (!ioeventfd_in_range(p, addr, len, val))
>   		return -EOPNOTSUPP;
>
> -	eventfd_signal(p->eventfd, 1);
> +	data = (struct kvm_ioeventfd_data) {
> +		.addr = addr,
> +		.len = len,
> +		.is_write = 0,
> +	};
> +
> +	if (p->sock) {
> +		socket_write(p->sock,&data, sizeof(data));
> +		socket_read(p->sock,&data, sizeof(data));
> +		set_val(val, len, data.data);
> +	} else {
> +		set_val(val, len, p->datamatch);
> +		eventfd_signal(p->eventfd, 1);
> +	}
> +
>   	return 0;
>   }
>
> @@ -585,7 +673,7 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
>   	int                       pio = args->flags&  KVM_IOEVENTFD_FLAG_PIO;
>   	enum kvm_bus              bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS;
>   	struct _ioeventfd        *p;
> -	struct eventfd_ctx       *eventfd;
> +	struct eventfd_ctx       *eventfd = NULL;
>   	int                       ret;
>
>   	/* check for range overflow */
> @@ -596,10 +684,6 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
>   	if (args->flags&  ~KVM_IOEVENTFD_VALID_FLAG_MASK)
>   		return -EINVAL;
>
> -	eventfd = eventfd_ctx_fdget(args->fd);
> -	if (IS_ERR(eventfd))
> -		return PTR_ERR(eventfd);
> -
>   	p = kzalloc(sizeof(*p), GFP_KERNEL);
>   	if (!p) {
>   		ret = -ENOMEM;
> @@ -611,6 +695,20 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
>   	p->length  = args->len;
>   	p->eventfd = eventfd;
>
> +	if (args->flags&  KVM_IOEVENTFD_FLAG_SOCKET) {
> +		ret = 0;
> +		p->sock = sockfd_lookup(args->fd,&ret);
> +		if (ret)
> +			goto fail;
> +	} else {
> +		ret = -EINVAL;
> +		eventfd = eventfd_ctx_fdget(args->fd);
> +		if (IS_ERR(eventfd))
> +			goto fail;
> +
> +		p->eventfd = eventfd;
> +	}
> +
>   	/* The datamatch feature is optional, otherwise this is a wildcard */
>   	if (args->flags&  KVM_IOEVENTFD_FLAG_DATAMATCH)
>   		p->datamatch = args->datamatch;
> @@ -649,8 +747,14 @@ unlock_fail:
>   	mutex_unlock(&kvm->slots_lock);
>
>   fail:
> +	if (eventfd)
> +		eventfd_ctx_put(eventfd);
> +
> +	if (p->sock)
> +		sockfd_put(p->sock);
> +
> +
>   	kfree(p);
> -	eventfd_ctx_put(eventfd);
>
>   	return ret;
>   }
> @@ -661,12 +765,21 @@ kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
>   	int                       pio = args->flags&  KVM_IOEVENTFD_FLAG_PIO;
>   	enum kvm_bus              bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS;
>   	struct _ioeventfd        *p, *tmp;
> -	struct eventfd_ctx       *eventfd;
> +	struct eventfd_ctx       *eventfd = NULL;
> +	struct socket            *sock = NULL;
>   	int                       ret = -ENOENT;
>
> -	eventfd = eventfd_ctx_fdget(args->fd);
> -	if (IS_ERR(eventfd))
> -		return PTR_ERR(eventfd);
> +	if (args->flags&  KVM_IOEVENTFD_FLAG_SOCKET) {
> +		ret = 0;
> +		sock = sockfd_lookup(args->fd,&ret);
> +		if (ret)
> +			return PTR_ERR(sock);
> +	} else {
> +		ret = -EINVAL;
> +		eventfd = eventfd_ctx_fdget(args->fd);
> +		if (IS_ERR(eventfd))
> +			return PTR_ERR(eventfd);
> +	}
>
>   	mutex_lock(&kvm->slots_lock);
>
> @@ -674,6 +787,7 @@ kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
>   		bool wildcard = !(args->flags&  KVM_IOEVENTFD_FLAG_DATAMATCH);
>
>   		if (p->eventfd != eventfd  ||
> +		    p->sock != sock        ||
>   		    p->addr != args->addr  ||
>   		    p->length != args->len ||
>   		    p->wildcard != wildcard)
> @@ -690,7 +804,10 @@ kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
>
>   	mutex_unlock(&kvm->slots_lock);
>
> -	eventfd_ctx_put(eventfd);
> +	if (eventfd)
> +		eventfd_ctx_put(eventfd);
> +	if (sock)
> +		sockfd_put(sock);
>
>   	return ret;
>   }

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Anthony Liguori July 20, 2011, 2:49 a.m. UTC | #47
On 07/13/2011 02:07 AM, Avi Kivity wrote:
> On 07/13/2011 09:45 AM, Pekka Enberg wrote:
>> >
>> > OK, what's the simplest thing we can do here to keep Avi happy and get
>> > the functionality of Sasha's original patch that helps us avoid guest
>> > exits for serial console? I agree with Avi that we don't want
>> > fragmented ABI but it seems to me there's no clear idea how to fix
>> > KVM_IOEVENTFD_FLAG_SOCKET corner cases, right?
>>
>> And btw, I didn't follow the discussion closely, but introducing a new
>> type of exit for a feature that's designed to _avoid exits_ doesn't
>> seem like a smart thing to do. Is it even possible to support sockets
>> sanely for this?
>
> First of all, this feature doesn't avoid exits.

I don't think it's possible to do correct UART emulation and avoid an exit.

And yeah, looking at your serial emulation, you're not properly updating 
the LSR register.  The fact that this works with Linux is just luck. 
You will encounter guests that poll LSR.THRE in order to write more data 
to the THR and you will no longer be able to treat THR as having no side 
effects.

Worse yet, if the Linux driver ever changes in the future in a way 
that's fully compliant to the UART spec, it'll break your emulation.

If you're not going to emulate a UART properly, why not just use a 
paravirtual serial device that you can keep simple?

Regards,

Anthony Liguori
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Anthony Liguori July 20, 2011, 2:52 a.m. UTC | #48
On 07/14/2011 08:23 AM, Avi Kivity wrote:
> On 07/14/2011 04:17 PM, Pekka Enberg wrote:
>> >
>> > I still don't follow. The guest oopses? dmesg | less. An issue with
>> > tools/kvm? gdb -p `pgrep kvm`.
>>
>> When I was debugging tools/kvm virtio code, I used to 'instrument' the
>> guest kernel with printk() calls which helped a lot.
>>
>
> Sure, but do you really need it spewing out the serial port all the time?
>
>> Also, a bug in tools/kvm can manifest in many interesting ways in the
>> guest kernel during boot, for example. You can't do dmesg then and gdb
>> won't save you. I think you've lived too long in the table KVM and Qemu
>> land to remember how important reliable printk() is for development.
>
> I guess. Also I've switched to trace_printk() since it's much nicer (and
> integrates with other ftrace features).
>
> And again, I'm not against tools/kvm optimizing serial. I just want
> better justification for socket-mmio.

Just to be really thorough, the optimization is incorrect for UART 
emulation.  Maybe for a simple PIO based console where there were no 
guest visible side effects to a character write, this would be okay, but 
that is not how a UART works.

Just implement virtio-serial :-)  You can move data as fast as you'd 
like through it.

And if virtio-serial is too complicated, make a simpler version that 
doesn't have such crazy semantics.

Regards,

Anthony Liguori

>

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sasha Levin July 20, 2011, 6:16 a.m. UTC | #49
On Tue, 2011-07-19 at 21:52 -0500, Anthony Liguori wrote:
> On 07/14/2011 08:23 AM, Avi Kivity wrote:
> > On 07/14/2011 04:17 PM, Pekka Enberg wrote:
> >> >
> >> > I still don't follow. The guest oopses? dmesg | less. An issue with
> >> > tools/kvm? gdb -p `pgrep kvm`.
> >>
> >> When I was debugging tools/kvm virtio code, I used to 'instrument' the
> >> guest kernel with printk() calls which helped a lot.
> >>
> >
> > Sure, but do you really need it spewing out the serial port all the time?
> >
> >> Also, a bug in tools/kvm can manifest in many interesting ways in the
> >> guest kernel during boot, for example. You can't do dmesg then and gdb
> >> won't save you. I think you've lived too long in the stable KVM and Qemu
> >> land to remember how important reliable printk() is for development.
> >
> > I guess. Also I've switched to trace_printk() since it's much nicer (and
> > integrates with other ftrace features).
> >
> > And again, I'm not against tools/kvm optimizing serial. I just want
> > better justification for socket-mmio.
> 
> Just to be really thorough, the optimization is incorrect for UART 
> emulation.  Maybe for a simple PIO based console where there were no 
> guest visible side effects to a character write, this would be okay, but 
> that is not how a UART works.

To be honest, I haven't checked if the implementation we have is correct
- I've just checked that it would benefit from the implementation of
socket ioeventfds.

I'll have to look at it again based on your review and probably fix it
to work correctly :)

> 
> Just implement virtio-serial :-)  You can move data as fast as you'd 
> like through it.
> 
> And if virtio-serial is too complicated, make a simpler version that 
> doesn't have such crazy semantics.

We do have virtio-serial support in /tools/kvm, you can try it using
'--console virtio'.

Currently I prefer using simple serial most of the time since it's the
most basic and simplest method of getting output from a guest, which is
quite useful when developing the core of /tools/kvm.
Avi Kivity July 20, 2011, 8:19 a.m. UTC | #50
On 07/20/2011 05:42 AM, Anthony Liguori wrote:
>
> I suspect a normal uart is a bad use case for this.
>
> The THR can only hold a single value, a guest is not supposed to write 
> to the THR again until the THRE flag in the LSR is set which indicates 
> that the THR is empty.
>
> In fact, when THRE is raised, the device emulation should raise an 
> interrupt and that's what should trigger the guest to write another 
> value to the THR.
>
> So in practice, I don't see how it would work without relying on some 
> specific behavior of the current Linux uart guest code.
>
> But that said, I think this is an interesting mechanism.  I'd be 
> curious how well it worked for VGA planar access which is what the 
> current coalesced I/O is most useful for.

Probably the interesting use case is to forward an entire BAR to a 
separate process or thread, perhaps implemented in the kernel.  However 
synchronization will be an issue - a PCI read is supposed to flush all 
pending PCI writes, and this is hard to enforce when the socket 
consumers are out of process.
Pekka Enberg July 20, 2011, 9:42 a.m. UTC | #51
On Wed, 2011-07-20 at 09:16 +0300, Sasha Levin wrote:
> > Just implement virtio-serial :-)  You can move data as fast as you'd 
> > like through it.
> > 
> > And if virtio-serial is too complicated, make a simpler version that 
> > doesn't have such crazy semantics.
> 
> We do have virtio-serial support in /tools/kvm, you can try it using
> '--console virtio'.

That's virtio-console. I thought virtio-serial was a separate driver.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Pekka Enberg July 20, 2011, 9:44 a.m. UTC | #52
On Tue, 2011-07-19 at 21:49 -0500, Anthony Liguori wrote:
> If you're not going to emulate a UART properly, why not just use a 
> paravirtual serial device that you can keep simple?

We want to emulate it correctly to make it robust. The problems you
pointed out were caused by me being under heavy dose of ignorance when I
wrote the emulation code. ;-)

			Pekka

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Anthony Liguori July 20, 2011, 9:10 p.m. UTC | #53
On 07/20/2011 04:44 AM, Pekka Enberg wrote:
> On Tue, 2011-07-19 at 21:49 -0500, Anthony Liguori wrote:
>> If you're not going to emulate a UART properly, why not just use a
>> paravirtual serial device that you can keep simple?
>
> We want to emulate it correctly to make it robust. The problems you
> pointed out were caused by me being under heavy dose of ignorance when I
> wrote the emulation code. ;-)

A UART has a fixed frequency associated with it and the divider latch 
will select the actual frequency based on a division of the fixed frequency.

There is no way to send a byte over the serial line faster than the 
fixed frequency.  The UART is so simple that it cannot DMA and it does 
not have very much memory on it for buffering so this limitation is 
visible to the guest.  This means the driver has to be very involved in 
the flow control of the device.

There's no way around this.  You simply can't do 10gb/s over a serial 
line and still have something that looks and acts like a UART.

BTW, it used to be we were pretty loose with how fast we'd move data 
through the serial port in QEMU and Linux would actually throw warnings 
if it saw more data moving through the serial port than is theoretically 
possible on bare metal.  Those warnings were removed at some point but 
I'm sure if you dug up and kernel and ran it, you would run into them 
with your current emulation.

Regards,

Anthony Liguori

>
> 			Pekka
>

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sasha Levin July 25, 2011, 12:10 p.m. UTC | #54
On Wed, 2011-07-13 at 10:07 +0300, Avi Kivity wrote:
> Second, introducing a new type of exit doesn't mean we see more exits.
> 
> Third, the new type of exit is probably not needed - we can use the 
> existing mmio/pio exits, and document that in certain cases the kernel 
> will fall back to these instead of delivering the I/O via the sockets 
> (userspace can then try writing itself).

Just waking this up since I want to send a new version and just want to
cover some things before that.

The problem with the original implementation was that if we receive a
signal while we wait for the host to provide a value to be read, we must
abort the operation and exit to do the signal.

What this caused was that read operations with side effects would break
(for example, when reading a byte would change the value in that byte).

The original plan was to notify the host that we ignored his answer via
the socket, and it should provide the response again via regular MMIO
exit, but I couldn't find a good way to pass it through the MMIO exit.
Also, this would complicate this operation on the host quite a bit.

What I did instead was to assume that if the socket write notifying the
host of a read operation went through ok, we can block on the socket
read request.

Does it sound ok? I know it's not what was originally planned, but to me
it looked like the most efficient approach.
Avi Kivity July 25, 2011, 12:16 p.m. UTC | #55
On 07/25/2011 03:10 PM, Sasha Levin wrote:
> On Wed, 2011-07-13 at 10:07 +0300, Avi Kivity wrote:
> >  Second, introducing a new type of exit doesn't mean we see more exits.
> >
> >  Third, the new type of exit is probably not needed - we can use the
> >  existing mmio/pio exits, and document that in certain cases the kernel
> >  will fall back to these instead of delivering the I/O via the sockets
> >  (userspace can then try writing itself).
>
> Just waking this up since I want to send a new version and just want to
> cover some things before that.
>
> The problem with the original implementation was that if we receive a
> signal while we wait for the host to provide a value to be read, we must
> abort the operation and exit to do the signal.
>
> What this caused was that read operations with side effects would break
> (for example, when reading a byte would change the value in that byte).
>
> The original plan was to notify the host that we ignored his answer via
> the socket, and it should provide the response again via regular MMIO
> exit, but I couldn't find a good way to pass it through the MMIO exit.
> Also, This would complicate this operation on the host quite a bit.
>
> What I did instead was to assume that if the socket write notifying the
> host of a read operation went through ok, we can block on the socket
> read request.
>
> Does it sound ok? I know it's not what was originally planned, but to me
> it looked like the most efficient approach.

You can't block when a signal is pending.  You can block, however, after 
you've exited with -EINTR and re-entered.

We need to document that if a vcpu exited with -EINTR, then any socket 
memory transactions need to be flushed before the vcpu's state can be 
considered stable (for live migration).  In fact it's true for any kind 
of exit.
Sasha Levin July 25, 2011, 12:26 p.m. UTC | #56
On Mon, 2011-07-25 at 15:16 +0300, Avi Kivity wrote:
> On 07/25/2011 03:10 PM, Sasha Levin wrote:
> > On Wed, 2011-07-13 at 10:07 +0300, Avi Kivity wrote:
> > >  Second, introducing a new type of exit doesn't mean we see more exits.
> > >
> > >  Third, the new type of exit is probably not needed - we can use the
> > >  existing mmio/pio exits, and document that in certain cases the kernel
> > >  will fall back to these instead of delivering the I/O via the sockets
> > >  (userspace can then try writing itself).
> >
> > Just waking this up since I want to send a new version and just want to
> > cover some things before that.
> >
> > The problem with the original implementation was that if we receive a
> > signal while we wait for the host to provide a value to be read, we must
> > abort the operation and exit to do the signal.
> >
> > What this caused was that read operations with side effects would break
> > (for example, when reading a byte would change the value in that byte).
> >
> > The original plan was to notify the host that we ignored his answer via
> > the socket, and it should provide the response again via regular MMIO
> > exit, but I couldn't find a good way to pass it through the MMIO exit.
> > Also, this would complicate this operation on the host quite a bit.
> >
> > What I did instead was to assume that if the socket write notifying the
> > host of a read operation went through ok, we can block on the socket
> > read request.
> >
> > Does it sound ok? I know it's not what was originally planned, but to me
> > it looked like the most efficient approach.
> 
> You can't block when a signal is pending.  You can block, however, after 
> you've exited with -EINTR and re-entered.
> 

What would happen with the MMIO then? I need to provide an answer before
I leave the read/write functions to exit with -EINTR, no?

> We need to document that if a vcpu exited with -EINTR, then any socket 
> memory transactions need to be flushed before the vcpu's state can be 
> considered stable (for live migration).  In fact it's true for any kind 
> of exit.
>
Avi Kivity July 25, 2011, 1:04 p.m. UTC | #57
On 07/25/2011 03:26 PM, Sasha Levin wrote:
> >
> >  You can't block when a signal is pending.  You can block, however, after
> >  you've exited with -EINTR and re-entered.
> >
>
> What would happen with the MMIO then? I need to provide an answer before
> I leave the read/write functions to exit with -EINTR, no?

If you exit, no result is needed until you reenter.  You just have to 
remember that on the next KVM_RUN, instead of running the vcpu, you have 
to talk to the socket (either write or read, depending on where the 
signal got you).
diff mbox

Patch

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 317d86a..74f0946 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1330,7 +1330,7 @@  Returns: 0 on success, !0 on error
 
 This ioctl attaches or detaches an ioeventfd to a legal pio/mmio address
 within the guest.  A guest write in the registered address will signal the
-provided event instead of triggering an exit.
+provided event or write to the provided socket instead of triggering an exit.
 
 struct kvm_ioeventfd {
 	__u64 datamatch;
@@ -1341,6 +1341,13 @@  struct kvm_ioeventfd {
 	__u8  pad[36];
 };
 
+struct kvm_ioeventfd_data {
+	__u64 data;
+	__u64 addr;
+	__u32 len;
+	__u8  is_write;
+};
+
 The following flags are defined:
 
 #define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch)
@@ -1348,6 +1355,7 @@  The following flags are defined:
 #define KVM_IOEVENTFD_FLAG_DEASSIGN  (1 << kvm_ioeventfd_flag_nr_deassign)
 #define KVM_IOEVENTFD_FLAG_READ      (1 << kvm_ioeventfd_flag_nr_read)
 #define KVM_IOEVENTFD_FLAG_NOWRITE   (1 << kvm_ioeventfd_flag_nr_nowrite)
+#define KVM_IOEVENTFD_FLAG_SOCKET    (1 << kvm_ioeventfd_flag_nr_socket)
 
 If datamatch flag is set, the event will be signaled only if the written value
 to the registered address is equal to datamatch in struct kvm_ioeventfd.
@@ -1359,6 +1367,14 @@  passed in datamatch.
 If the nowrite flag is set, the event won't be signaled when the specified address
 is being written to.
 
+If the socket flag is set, fd is expected to be a connected AF_UNIX
+SOCK_SEQPACKET socket. Once a guest write in the registered address is
+detected - a struct kvm_ioeventfd_data which describes the write will be
+written to the socket.
+On read, struct kvm_ioeventfd_data will be written with 'is_write = 0', and
+would wait for a response with a struct kvm_ioeventfd_data containing the
+value which should be 'read' by the guest.
+
 
 5. The kvm_run structure
 
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 8a12711..ff3d808 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -389,6 +389,7 @@  enum {
 	kvm_ioeventfd_flag_nr_deassign,
 	kvm_ioeventfd_flag_nr_read,
 	kvm_ioeventfd_flag_nr_nowrite,
+	kvm_ioeventfd_flag_nr_socket,
 	kvm_ioeventfd_flag_nr_max,
 };
 
@@ -397,6 +398,7 @@  enum {
 #define KVM_IOEVENTFD_FLAG_DEASSIGN  (1 << kvm_ioeventfd_flag_nr_deassign)
 #define KVM_IOEVENTFD_FLAG_READ      (1 << kvm_ioeventfd_flag_nr_read)
 #define KVM_IOEVENTFD_FLAG_NOWRITE   (1 << kvm_ioeventfd_flag_nr_nowrite)
+#define KVM_IOEVENTFD_FLAG_SOCKET    (1 << kvm_ioeventfd_flag_nr_socket)
 
 #define KVM_IOEVENTFD_VALID_FLAG_MASK  ((1 << kvm_ioeventfd_flag_nr_max) - 1)
 
@@ -409,6 +411,13 @@  struct kvm_ioeventfd {
 	__u8  pad[36];
 };
 
+struct kvm_ioeventfd_data {
+	__u64 data;
+	__u64 addr;
+	__u32 len;
+	__u8  is_write;
+};
+
 /* for KVM_ENABLE_CAP */
 struct kvm_enable_cap {
 	/* in */
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 5f2d203..d1d63b3 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -32,6 +32,7 @@ 
 #include <linux/eventfd.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
+#include <linux/net.h>
 
 #include "iodev.h"
 
@@ -413,10 +414,11 @@  module_exit(irqfd_module_exit);
 
 /*
  * --------------------------------------------------------------------
- * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal.
+ * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal or
+ *            a socket write.
  *
- * userspace can register a PIO/MMIO address with an eventfd for receiving
- * notification when the memory has been touched.
+ * userspace can register a PIO/MMIO address with an eventfd or a
+ * socket for receiving notification when the memory has been touched.
  * --------------------------------------------------------------------
  */
 
@@ -424,7 +426,10 @@  struct _ioeventfd {
 	struct list_head     list;
 	u64                  addr;
 	int                  length;
-	struct eventfd_ctx  *eventfd;
+	union {
+		struct socket       *sock;
+		struct eventfd_ctx  *eventfd;
+	};
 	u64                  datamatch;
 	struct kvm_io_device dev;
 	bool                 wildcard;
@@ -441,7 +446,11 @@  to_ioeventfd(struct kvm_io_device *dev)
 static void
 ioeventfd_release(struct _ioeventfd *p)
 {
-	eventfd_ctx_put(p->eventfd);
+	if (p->eventfd)
+		eventfd_ctx_put(p->eventfd);
+	else
+		sockfd_put(p->sock);
+
 	list_del(&p->list);
 	kfree(p);
 }
@@ -510,12 +519,65 @@  ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
 	return _val == p->datamatch ? true : false;
 }
 
+static ssize_t socket_write(struct socket *sock, const void *buf, size_t count)
+{
+	mm_segment_t old_fs;
+	ssize_t res;
+	struct msghdr msg;
+	struct iovec iov;
+
+	iov = (struct iovec) {
+		.iov_base = (void *)buf,
+		.iov_len  = count,
+	};
+
+	msg = (struct msghdr) {
+		.msg_iov = &iov,
+		.msg_iovlen = 1,
+	};
+
+	old_fs = get_fs();
+	set_fs(get_ds());
+	/* The cast to a user pointer is valid due to the set_fs() */
+	res = sock_sendmsg(sock, &msg, count);
+	set_fs(old_fs);
+
+	return res;
+}
+
+static ssize_t socket_read(struct socket *sock, void *buf, size_t count)
+{
+	mm_segment_t old_fs;
+	ssize_t res;
+	struct msghdr msg;
+	struct iovec iov;
+
+	iov = (struct iovec) {
+		.iov_base = (void *)buf,
+		.iov_len  = count,
+	};
+
+	msg = (struct msghdr) {
+		.msg_iov = &iov,
+		.msg_iovlen = 1,
+	};
+
+	old_fs = get_fs();
+	set_fs(get_ds());
+	/* The cast to a user pointer is valid due to the set_fs() */
+	res = sock_recvmsg(sock, &msg, count, 0);
+	set_fs(old_fs);
+
+	return res;
+}
+
 /* MMIO/PIO writes trigger an event if the addr/val match */
 static int
 ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
 		const void *val)
 {
 	struct _ioeventfd *p = to_ioeventfd(this);
+	struct kvm_ioeventfd_data data;
 
 	/* Exit if signaling on writes isn't requested */
 	if (!p->track_writes)
@@ -524,7 +586,18 @@  ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
 	if (!ioeventfd_in_range(p, addr, len, val))
 		return -EOPNOTSUPP;
 
-	eventfd_signal(p->eventfd, 1);
+	data = (struct kvm_ioeventfd_data) {
+		.data = get_val(val, len),
+		.addr = addr,
+		.len = len,
+		.is_write = 1,
+	};
+
+	if (p->sock)
+		socket_write(p->sock, &data, sizeof(data));
+	else
+		eventfd_signal(p->eventfd, 1);
+
 	return 0;
 }
 
@@ -534,6 +607,7 @@  ioeventfd_read(struct kvm_io_device *this, gpa_t addr, int len,
 		void *val)
 {
 	struct _ioeventfd *p = to_ioeventfd(this);
+	struct kvm_ioeventfd_data data;
 
 	/* Exit if signaling on reads isn't requested */
 	if (!p->track_reads)
@@ -542,7 +616,21 @@  ioeventfd_read(struct kvm_io_device *this, gpa_t addr, int len,
 	if (!ioeventfd_in_range(p, addr, len, val))
 		return -EOPNOTSUPP;
 
-	eventfd_signal(p->eventfd, 1);
+	data = (struct kvm_ioeventfd_data) {
+		.addr = addr,
+		.len = len,
+		.is_write = 0,
+	};
+
+	if (p->sock) {
+		socket_write(p->sock, &data, sizeof(data));
+		socket_read(p->sock, &data, sizeof(data));
+		set_val(val, len, data.data);
+	} else {
+		set_val(val, len, p->datamatch);
+		eventfd_signal(p->eventfd, 1);
+	}
+
 	return 0;
 }
 
@@ -585,7 +673,7 @@  kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 	int                       pio = args->flags & KVM_IOEVENTFD_FLAG_PIO;
 	enum kvm_bus              bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS;
 	struct _ioeventfd        *p;
-	struct eventfd_ctx       *eventfd;
+	struct eventfd_ctx       *eventfd = NULL;
 	int                       ret;
 
 	/* check for range overflow */
@@ -596,10 +684,6 @@  kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 	if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK)
 		return -EINVAL;
 
-	eventfd = eventfd_ctx_fdget(args->fd);
-	if (IS_ERR(eventfd))
-		return PTR_ERR(eventfd);
-
 	p = kzalloc(sizeof(*p), GFP_KERNEL);
 	if (!p) {
 		ret = -ENOMEM;
@@ -611,6 +695,20 @@  kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 	p->length  = args->len;
 	p->eventfd = eventfd;
 
+	if (args->flags & KVM_IOEVENTFD_FLAG_SOCKET) {
+		ret = 0;
+		p->sock = sockfd_lookup(args->fd, &ret);
+		if (ret)
+			goto fail;
+	} else {
+		ret = -EINVAL;
+		eventfd = eventfd_ctx_fdget(args->fd);
+		if (IS_ERR(eventfd))
+			goto fail;
+
+		p->eventfd = eventfd;
+	}
+
 	/* The datamatch feature is optional, otherwise this is a wildcard */
 	if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH)
 		p->datamatch = args->datamatch;
@@ -649,8 +747,14 @@  unlock_fail:
 	mutex_unlock(&kvm->slots_lock);
 
 fail:
+	if (eventfd)
+		eventfd_ctx_put(eventfd);
+
+	if (p->sock)
+		sockfd_put(p->sock);
+
+
 	kfree(p);
-	eventfd_ctx_put(eventfd);
 
 	return ret;
 }
@@ -661,12 +765,21 @@  kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 	int                       pio = args->flags & KVM_IOEVENTFD_FLAG_PIO;
 	enum kvm_bus              bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS;
 	struct _ioeventfd        *p, *tmp;
-	struct eventfd_ctx       *eventfd;
+	struct eventfd_ctx       *eventfd = NULL;
+	struct socket            *sock = NULL;
 	int                       ret = -ENOENT;
 
-	eventfd = eventfd_ctx_fdget(args->fd);
-	if (IS_ERR(eventfd))
-		return PTR_ERR(eventfd);
+	if (args->flags & KVM_IOEVENTFD_FLAG_SOCKET) {
+		ret = 0;
+		sock = sockfd_lookup(args->fd, &ret);
+		if (ret)
+			return PTR_ERR(sock);
+	} else {
+		ret = -EINVAL;
+		eventfd = eventfd_ctx_fdget(args->fd);
+		if (IS_ERR(eventfd))
+			return PTR_ERR(eventfd);
+	}
 
 	mutex_lock(&kvm->slots_lock);
 
@@ -674,6 +787,7 @@  kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 		bool wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH);
 
 		if (p->eventfd != eventfd  ||
+		    p->sock != sock        ||
 		    p->addr != args->addr  ||
 		    p->length != args->len ||
 		    p->wildcard != wildcard)
@@ -690,7 +804,10 @@  kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 
 	mutex_unlock(&kvm->slots_lock);
 
-	eventfd_ctx_put(eventfd);
+	if (eventfd)
+		eventfd_ctx_put(eventfd);
+	if (sock)
+		sockfd_put(sock);
 
 	return ret;
 }