diff mbox series

[RFC,v2,3/4] KVM: add support for ioregionfd cmds/replies serialization

Message ID 294d8a0e08eff4ec9c8f8f62492f29163e6c4319.1611850291.git.eafanasova@gmail.com (mailing list archive)
State New, archived
Headers show
Series Introduce MMIO/PIO dispatch file descriptors (ioregionfd) | expand

Commit Message

Elena Afanasova Jan. 28, 2021, 6:32 p.m. UTC
Add ioregionfd context and kvm_io_device_ops->prepare/finish()
in order to serialize all bytes requested by the guest.

Signed-off-by: Elena Afanasova <eafanasova@gmail.com>
---
 arch/x86/kvm/x86.c       |  19 ++++++++
 include/kvm/iodev.h      |  14 ++++++
 include/linux/kvm_host.h |   4 ++
 virt/kvm/ioregion.c      | 102 +++++++++++++++++++++++++++++++++------
 virt/kvm/kvm_main.c      |  32 ++++++++++++
 5 files changed, 157 insertions(+), 14 deletions(-)

Comments

Stefan Hajnoczi Jan. 30, 2021, 6:54 p.m. UTC | #1
On Thu, Jan 28, 2021 at 09:32:22PM +0300, Elena Afanasova wrote:
> Add ioregionfd context and kvm_io_device_ops->prepare/finish()
> in order to serialize all bytes requested by guest.
> 
> Signed-off-by: Elena Afanasova <eafanasova@gmail.com>
> ---
>  arch/x86/kvm/x86.c       |  19 ++++++++
>  include/kvm/iodev.h      |  14 ++++++
>  include/linux/kvm_host.h |   4 ++
>  virt/kvm/ioregion.c      | 102 +++++++++++++++++++++++++++++++++------
>  virt/kvm/kvm_main.c      |  32 ++++++++++++
>  5 files changed, 157 insertions(+), 14 deletions(-)
> 
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index a04516b531da..393fb0f4bf46 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -5802,6 +5802,8 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
>  	int ret = 0;
>  	bool is_apic;
>  
> +	kvm_io_bus_prepare(vcpu, KVM_MMIO_BUS, addr, len);
> +
>  	do {
>  		n = min(len, 8);
>  		is_apic = lapic_in_kernel(vcpu) &&
> @@ -5823,8 +5825,10 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
>  	if (ret == -EINTR) {
>  		vcpu->run->exit_reason = KVM_EXIT_INTR;
>  		++vcpu->stat.signal_exits;
> +		return handled;
>  	}
>  #endif
> +	kvm_io_bus_finish(vcpu, KVM_MMIO_BUS, addr, len);

Hmm...it would be nice for kvm_io_bus_prepare() to return the idx or the
device pointer so the devices don't need to be searched in
read/write/finish. However, it's complicated by the loop which may
access multiple devices.

> @@ -9309,6 +9325,7 @@ static int complete_ioregion_mmio(struct kvm_vcpu *vcpu)
>  		vcpu->mmio_cur_fragment++;
>  	}
>  
> +	vcpu->ioregion_ctx.dev->ops->finish(vcpu->ioregion_ctx.dev);
>  	vcpu->mmio_needed = 0;
>  	if (!vcpu->ioregion_ctx.in) {
>  		srcu_read_unlock(&vcpu->kvm->srcu, idx);
> @@ -9333,6 +9350,7 @@ static int complete_ioregion_pio(struct kvm_vcpu *vcpu)
>  		vcpu->ioregion_ctx.val += vcpu->ioregion_ctx.len;
>  	}
>  
> +	vcpu->ioregion_ctx.dev->ops->finish(vcpu->ioregion_ctx.dev);
>  	if (vcpu->ioregion_ctx.in)
>  		r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
>  	srcu_read_unlock(&vcpu->kvm->srcu, idx);
> @@ -9352,6 +9370,7 @@ static int complete_ioregion_fast_pio(struct kvm_vcpu *vcpu)
>  	complete_ioregion_access(vcpu, vcpu->ioregion_ctx.addr,
>  				 vcpu->ioregion_ctx.len,
>  				 vcpu->ioregion_ctx.val);
> +	vcpu->ioregion_ctx.dev->ops->finish(vcpu->ioregion_ctx.dev);
>  	srcu_read_unlock(&vcpu->kvm->srcu, idx);
>  
>  	if (vcpu->ioregion_ctx.in) {

Normally userspace will invoke ioctl(KVM_RUN) and reach one of these
completion functions, but what if the vcpu fd is closed instead?
->finish() should still be called to avoid leaks.

> diff --git a/include/kvm/iodev.h b/include/kvm/iodev.h
> index d75fc4365746..db8a3c69b7bb 100644
> --- a/include/kvm/iodev.h
> +++ b/include/kvm/iodev.h
> @@ -25,6 +25,8 @@ struct kvm_io_device_ops {
>  		     gpa_t addr,
>  		     int len,
>  		     const void *val);
> +	void (*prepare)(struct kvm_io_device *this);
> +	void (*finish)(struct kvm_io_device *this);
>  	void (*destructor)(struct kvm_io_device *this);
>  };
>  
> @@ -55,6 +57,18 @@ static inline int kvm_iodevice_write(struct kvm_vcpu *vcpu,
>  				 : -EOPNOTSUPP;
>  }
>  
> +static inline void kvm_iodevice_prepare(struct kvm_io_device *dev)
> +{
> +	if (dev->ops->prepare)
> +		dev->ops->prepare(dev);
> +}
> +
> +static inline void kvm_iodevice_finish(struct kvm_io_device *dev)
> +{
> +	if (dev->ops->finish)
> +		dev->ops->finish(dev);
> +}

A performance optimization: keep a separate list of struct
kvm_io_devices that implement prepare/finish. That way the search
doesn't need to iterate over devices that don't support this interface.

Before implementing an optimization like this it would be good to check
how this patch affects performance on guests with many in-kernel devices
(e.g. a guest that has many multi-queue virtio-net/blk devices with
ioeventfd). ioregionfd shouldn't reduce performance of existing KVM
configurations, so it's worth measuring.

> diff --git a/virt/kvm/ioregion.c b/virt/kvm/ioregion.c
> index da38124e1418..3474090ccc8c 100644
> --- a/virt/kvm/ioregion.c
> +++ b/virt/kvm/ioregion.c
> @@ -1,6 +1,6 @@
>  // SPDX-License-Identifier: GPL-2.0-only
>  #include <linux/kvm_host.h>
> -#include <linux/fs.h>
> +#include <linux/wait.h>
>  #include <kvm/iodev.h>
>  #include "eventfd.h"
>  #include <uapi/linux/ioregion.h>
> @@ -12,15 +12,23 @@ kvm_ioregionfd_init(struct kvm *kvm)
>  	INIT_LIST_HEAD(&kvm->ioregions_pio);
>  }
>  
> +/* Serializes ioregionfd cmds/replies */

Please expand on this comment:

  ioregions that share the same rfd are serialized so that only one vCPU
  thread sends a struct ioregionfd_cmd to userspace at a time. This
  ensures that the struct ioregionfd_resp received from userspace will
  be processed by the one and only vCPU thread that sent it.

  A waitqueue is used to wake up waiting vCPU threads in order. Most of
  the time the waitqueue is unused and the lock is not contended.
  For best performance userspace should set up ioregionfds so that there
  is no contention (e.g. dedicated ioregionfds for queue doorbell
  registers on multi-queue devices).

A comment along these lines will give readers an idea of why the code
does this.
Elena Afanasova Feb. 3, 2021, 2:10 p.m. UTC | #2
On Sat, 2021-01-30 at 18:54 +0000, Stefan Hajnoczi wrote:
> On Thu, Jan 28, 2021 at 09:32:22PM +0300, Elena Afanasova wrote:
> > Add ioregionfd context and kvm_io_device_ops->prepare/finish()
> > in order to serialize all bytes requested by guest.
> > 
> > Signed-off-by: Elena Afanasova <eafanasova@gmail.com>
> > ---
> >  arch/x86/kvm/x86.c       |  19 ++++++++
> >  include/kvm/iodev.h      |  14 ++++++
> >  include/linux/kvm_host.h |   4 ++
> >  virt/kvm/ioregion.c      | 102 +++++++++++++++++++++++++++++++++
> > ------
> >  virt/kvm/kvm_main.c      |  32 ++++++++++++
> >  5 files changed, 157 insertions(+), 14 deletions(-)
> > 
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index a04516b531da..393fb0f4bf46 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -5802,6 +5802,8 @@ static int vcpu_mmio_write(struct kvm_vcpu
> > *vcpu, gpa_t addr, int len,
> >  	int ret = 0;
> >  	bool is_apic;
> >  
> > +	kvm_io_bus_prepare(vcpu, KVM_MMIO_BUS, addr, len);
> > +
> >  	do {
> >  		n = min(len, 8);
> >  		is_apic = lapic_in_kernel(vcpu) &&
> > @@ -5823,8 +5825,10 @@ static int vcpu_mmio_write(struct kvm_vcpu
> > *vcpu, gpa_t addr, int len,
> >  	if (ret == -EINTR) {
> >  		vcpu->run->exit_reason = KVM_EXIT_INTR;
> >  		++vcpu->stat.signal_exits;
> > +		return handled;
> >  	}
> >  #endif
> > +	kvm_io_bus_finish(vcpu, KVM_MMIO_BUS, addr, len);
> 
> Hmm...it would be nice for kvm_io_bus_prepare() to return the idx or
> the
> device pointer so the devices don't need to be searched in
> read/write/finish. However, it's complicated by the loop which may
> access multiple devices.
> 
Agree

> > @@ -9309,6 +9325,7 @@ static int complete_ioregion_mmio(struct
> > kvm_vcpu *vcpu)
> >  		vcpu->mmio_cur_fragment++;
> >  	}
> >  
> > +	vcpu->ioregion_ctx.dev->ops->finish(vcpu->ioregion_ctx.dev);
> >  	vcpu->mmio_needed = 0;
> >  	if (!vcpu->ioregion_ctx.in) {
> >  		srcu_read_unlock(&vcpu->kvm->srcu, idx);
> > @@ -9333,6 +9350,7 @@ static int complete_ioregion_pio(struct
> > kvm_vcpu *vcpu)
> >  		vcpu->ioregion_ctx.val += vcpu->ioregion_ctx.len;
> >  	}
> >  
> > +	vcpu->ioregion_ctx.dev->ops->finish(vcpu->ioregion_ctx.dev);
> >  	if (vcpu->ioregion_ctx.in)
> >  		r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
> >  	srcu_read_unlock(&vcpu->kvm->srcu, idx);
> > @@ -9352,6 +9370,7 @@ static int complete_ioregion_fast_pio(struct
> > kvm_vcpu *vcpu)
> >  	complete_ioregion_access(vcpu, vcpu->ioregion_ctx.addr,
> >  				 vcpu->ioregion_ctx.len,
> >  				 vcpu->ioregion_ctx.val);
> > +	vcpu->ioregion_ctx.dev->ops->finish(vcpu->ioregion_ctx.dev);
> >  	srcu_read_unlock(&vcpu->kvm->srcu, idx);
> >  
> >  	if (vcpu->ioregion_ctx.in) {
> 
> Normally userspace will invoke ioctl(KVM_RUN) and reach one of these
> completion functions, but what if the vcpu fd is closed instead?
> ->finish() should still be called to avoid leaks.
> 
Will fix

> > diff --git a/include/kvm/iodev.h b/include/kvm/iodev.h
> > index d75fc4365746..db8a3c69b7bb 100644
> > --- a/include/kvm/iodev.h
> > +++ b/include/kvm/iodev.h
> > @@ -25,6 +25,8 @@ struct kvm_io_device_ops {
> >  		     gpa_t addr,
> >  		     int len,
> >  		     const void *val);
> > +	void (*prepare)(struct kvm_io_device *this);
> > +	void (*finish)(struct kvm_io_device *this);
> >  	void (*destructor)(struct kvm_io_device *this);
> >  };
> >  
> > @@ -55,6 +57,18 @@ static inline int kvm_iodevice_write(struct
> > kvm_vcpu *vcpu,
> >  				 : -EOPNOTSUPP;
> >  }
> >  
> > +static inline void kvm_iodevice_prepare(struct kvm_io_device *dev)
> > +{
> > +	if (dev->ops->prepare)
> > +		dev->ops->prepare(dev);
> > +}
> > +
> > +static inline void kvm_iodevice_finish(struct kvm_io_device *dev)
> > +{
> > +	if (dev->ops->finish)
> > +		dev->ops->finish(dev);
> > +}
> 
> A performance optimization: keep a separate list of struct
> kvm_io_devices that implement prepare/finish. That way the search
> doesn't need to iterate over devices that don't support this
> interface.
> 
Thanks for the idea

> Before implementing an optimization like this it would be good to
> check
> how this patch affects performance on guests with many in-kernel
> devices
> (e.g. a guest that has many multi-queue virtio-net/blk devices with
> ioeventfd). ioregionfd shouldn't reduce performance of existing KVM
> configurations, so it's worth measuring.
> 
> > diff --git a/virt/kvm/ioregion.c b/virt/kvm/ioregion.c
> > index da38124e1418..3474090ccc8c 100644
> > --- a/virt/kvm/ioregion.c
> > +++ b/virt/kvm/ioregion.c
> > @@ -1,6 +1,6 @@
> >  // SPDX-License-Identifier: GPL-2.0-only
> >  #include <linux/kvm_host.h>
> > -#include <linux/fs.h>
> > +#include <linux/wait.h>
> >  #include <kvm/iodev.h>
> >  #include "eventfd.h"
> >  #include <uapi/linux/ioregion.h>
> > @@ -12,15 +12,23 @@ kvm_ioregionfd_init(struct kvm *kvm)
> >  	INIT_LIST_HEAD(&kvm->ioregions_pio);
> >  }
> >  
> > +/* Serializes ioregionfd cmds/replies */
> 
> Please expand on this comment:
> 
>   ioregions that share the same rfd are serialized so that only one
> vCPU
>   thread sends a struct ioregionfd_cmd to userspace at a time. This
>   ensures that the struct ioregionfd_resp received from userspace
> will
>   be processed by the one and only vCPU thread that sent it.
> 
>   A waitqueue is used to wake up waiting vCPU threads in order. Most
> of
>   the time the waitqueue is unused and the lock is not contended.
>   For best performance userspace should set up ioregionfds so that
> there
>   is no contention (e.g. dedicated ioregionfds for queue doorbell
>   registers on multi-queue devices).
> 
> A comment along these lines will give readers an idea of why the code
> does this.

Ok, thank you
diff mbox series

Patch

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a04516b531da..393fb0f4bf46 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5802,6 +5802,8 @@  static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
 	int ret = 0;
 	bool is_apic;
 
+	kvm_io_bus_prepare(vcpu, KVM_MMIO_BUS, addr, len);
+
 	do {
 		n = min(len, 8);
 		is_apic = lapic_in_kernel(vcpu) &&
@@ -5823,8 +5825,10 @@  static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
 	if (ret == -EINTR) {
 		vcpu->run->exit_reason = KVM_EXIT_INTR;
 		++vcpu->stat.signal_exits;
+		return handled;
 	}
 #endif
+	kvm_io_bus_finish(vcpu, KVM_MMIO_BUS, addr, len);
 
 	return handled;
 }
@@ -5836,6 +5840,8 @@  static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
 	int ret = 0;
 	bool is_apic;
 
+	kvm_io_bus_prepare(vcpu, KVM_MMIO_BUS, addr, len);
+
 	do {
 		n = min(len, 8);
 		is_apic = lapic_in_kernel(vcpu) &&
@@ -5858,8 +5864,10 @@  static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
 	if (ret == -EINTR) {
 		vcpu->run->exit_reason = KVM_EXIT_INTR;
 		++vcpu->stat.signal_exits;
+		return handled;
 	}
 #endif
+	kvm_io_bus_finish(vcpu, KVM_MMIO_BUS, addr, len);
 
 	return handled;
 }
@@ -6442,6 +6450,10 @@  static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
 {
 	int r = 0, i;
 
+	kvm_io_bus_prepare(vcpu, KVM_PIO_BUS,
+			   vcpu->arch.pio.port,
+			   vcpu->arch.pio.size);
+
 	for (i = 0; i < vcpu->arch.pio.count; i++) {
 		if (vcpu->arch.pio.in)
 			r = kvm_io_bus_read(vcpu, KVM_PIO_BUS,
@@ -6458,8 +6470,12 @@  static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
 #ifdef CONFIG_KVM_IOREGION
 	if (vcpu->ioregion_interrupted && r == -EINTR) {
 		vcpu->ioregion_ctx.pio = i;
+		return r;
 	}
 #endif
+	kvm_io_bus_finish(vcpu, KVM_PIO_BUS,
+			  vcpu->arch.pio.port,
+			  vcpu->arch.pio.size);
 
 	return r;
 }
@@ -9309,6 +9325,7 @@  static int complete_ioregion_mmio(struct kvm_vcpu *vcpu)
 		vcpu->mmio_cur_fragment++;
 	}
 
+	vcpu->ioregion_ctx.dev->ops->finish(vcpu->ioregion_ctx.dev);
 	vcpu->mmio_needed = 0;
 	if (!vcpu->ioregion_ctx.in) {
 		srcu_read_unlock(&vcpu->kvm->srcu, idx);
@@ -9333,6 +9350,7 @@  static int complete_ioregion_pio(struct kvm_vcpu *vcpu)
 		vcpu->ioregion_ctx.val += vcpu->ioregion_ctx.len;
 	}
 
+	vcpu->ioregion_ctx.dev->ops->finish(vcpu->ioregion_ctx.dev);
 	if (vcpu->ioregion_ctx.in)
 		r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
@@ -9352,6 +9370,7 @@  static int complete_ioregion_fast_pio(struct kvm_vcpu *vcpu)
 	complete_ioregion_access(vcpu, vcpu->ioregion_ctx.addr,
 				 vcpu->ioregion_ctx.len,
 				 vcpu->ioregion_ctx.val);
+	vcpu->ioregion_ctx.dev->ops->finish(vcpu->ioregion_ctx.dev);
 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
 
 	if (vcpu->ioregion_ctx.in) {
diff --git a/include/kvm/iodev.h b/include/kvm/iodev.h
index d75fc4365746..db8a3c69b7bb 100644
--- a/include/kvm/iodev.h
+++ b/include/kvm/iodev.h
@@ -25,6 +25,8 @@  struct kvm_io_device_ops {
 		     gpa_t addr,
 		     int len,
 		     const void *val);
+	void (*prepare)(struct kvm_io_device *this);
+	void (*finish)(struct kvm_io_device *this);
 	void (*destructor)(struct kvm_io_device *this);
 };
 
@@ -55,6 +57,18 @@  static inline int kvm_iodevice_write(struct kvm_vcpu *vcpu,
 				 : -EOPNOTSUPP;
 }
 
+static inline void kvm_iodevice_prepare(struct kvm_io_device *dev)
+{
+	if (dev->ops->prepare)
+		dev->ops->prepare(dev);
+}
+
+static inline void kvm_iodevice_finish(struct kvm_io_device *dev)
+{
+	if (dev->ops->finish)
+		dev->ops->finish(dev);
+}
+
 static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
 {
 	if (dev->ops->destructor)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 5cfdecfca6db..f6b9ff4c468d 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -194,6 +194,10 @@  void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
 			       struct kvm_io_device *dev);
 struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
 					 gpa_t addr);
+void kvm_io_bus_prepare(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
+			int len);
+void kvm_io_bus_finish(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
+		       int len);
 
 #ifdef CONFIG_KVM_ASYNC_PF
 struct kvm_async_pf {
diff --git a/virt/kvm/ioregion.c b/virt/kvm/ioregion.c
index da38124e1418..3474090ccc8c 100644
--- a/virt/kvm/ioregion.c
+++ b/virt/kvm/ioregion.c
@@ -1,6 +1,6 @@ 
 // SPDX-License-Identifier: GPL-2.0-only
 #include <linux/kvm_host.h>
-#include <linux/fs.h>
+#include <linux/wait.h>
 #include <kvm/iodev.h>
 #include "eventfd.h"
 #include <uapi/linux/ioregion.h>
@@ -12,15 +12,23 @@  kvm_ioregionfd_init(struct kvm *kvm)
 	INIT_LIST_HEAD(&kvm->ioregions_pio);
 }
 
+/* Serializes ioregionfd cmds/replies */
+struct ioregionfd {
+	wait_queue_head_t	  wq;
+	struct file		 *rf;
+	struct kref		  kref;
+	bool			  busy;
+};
+
 struct ioregion {
-	struct list_head     list;
-	u64                  paddr;  /* guest physical address */
-	u64                  size;   /* size in bytes */
-	struct file         *rf;
-	struct file         *wf;
-	u64                  user_data; /* opaque token used by userspace */
-	struct kvm_io_device dev;
-	bool                 posted_writes;
+	struct list_head	  list;
+	u64			  paddr;   /* guest physical address */
+	u64			  size;    /* size in bytes */
+	struct file		 *wf;
+	u64			  user_data; /* opaque token used by userspace */
+	struct kvm_io_device	  dev;
+	bool			  posted_writes;
+	struct ioregionfd	 *ctx;
 };
 
 static inline struct ioregion *
@@ -29,13 +37,22 @@  to_ioregion(struct kvm_io_device *dev)
 	return container_of(dev, struct ioregion, dev);
 }
 
+/* assumes kvm->slots_lock held */
+static void ctx_free(struct kref *kref)
+{
+	struct ioregionfd *ctx = container_of(kref, struct ioregionfd, kref);
+
+	kfree(ctx);
+}
+
 /* assumes kvm->slots_lock held */
 static void
 ioregion_release(struct ioregion *p)
 {
-	fput(p->rf);
+	fput(p->ctx->rf);
 	fput(p->wf);
 	list_del(&p->list);
+	kref_put(&p->ctx->kref, ctx_free);
 	kfree(p);
 }
 
@@ -94,6 +111,28 @@  ioregion_save_ctx(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
 	vcpu->ioregion_ctx.in = in;
 }
 
+static void
+ioregion_prepare(struct kvm_io_device *this)
+{
+	struct ioregion *p = to_ioregion(this);
+
+	spin_lock(&p->ctx->wq.lock);
+	wait_event_interruptible_exclusive_locked(p->ctx->wq, !p->ctx->busy);
+	p->ctx->busy = true;
+	spin_unlock(&p->ctx->wq.lock);
+}
+
+static void
+ioregion_finish(struct kvm_io_device *this)
+{
+	struct ioregion *p = to_ioregion(this);
+
+	spin_lock(&p->ctx->wq.lock);
+	p->ctx->busy = false;
+	wake_up_locked(&p->ctx->wq);
+	spin_unlock(&p->ctx->wq.lock);
+}
+
 static int
 ioregion_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
 	      int len, void *val)
@@ -142,7 +181,7 @@  ioregion_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
 
 get_repl:
 	memset(&buf, 0, sizeof(buf));
-	ret = kernel_read(p->rf, &buf.resp, sizeof(buf.resp), 0);
+	ret = kernel_read(p->ctx->rf, &buf.resp, sizeof(buf.resp), 0);
 	state += (ret == sizeof(buf.resp));
 	if (signal_pending(current)) {
 		ioregion_save_ctx(vcpu, this, 1, addr, len, buf.resp.data, state, val);
@@ -209,7 +248,7 @@  ioregion_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
 get_repl:
 	if (!p->posted_writes) {
 		memset(&buf, 0, sizeof(buf));
-		ret = kernel_read(p->rf, &buf.resp, sizeof(buf.resp), 0);
+		ret = kernel_read(p->ctx->rf, &buf.resp, sizeof(buf.resp), 0);
 		state += (ret == sizeof(buf.resp));
 		if (signal_pending(current)) {
 			ioregion_save_ctx(vcpu, this, 0, addr, len,
@@ -240,6 +279,8 @@  ioregion_destructor(struct kvm_io_device *this)
 static const struct kvm_io_device_ops ioregion_ops = {
 	.read       = ioregion_read,
 	.write      = ioregion_write,
+	.prepare    = ioregion_prepare,
+	.finish     = ioregion_finish,
 	.destructor = ioregion_destructor,
 };
 
@@ -295,6 +336,34 @@  get_bus_from_flags(__u32 flags)
 	return KVM_MMIO_BUS;
 }
 
+/* assumes kvm->slots_lock held */
+static bool
+ioregion_get_ctx(struct kvm *kvm, struct ioregion *p, struct file *rf, int bus_idx)
+{
+	struct ioregion *_p;
+	struct list_head *ioregions;
+
+	ioregions = get_ioregion_list(kvm, bus_idx);
+	list_for_each_entry(_p, ioregions, list)
+		if (file_inode(_p->ctx->rf)->i_ino == file_inode(rf)->i_ino) {
+			p->ctx = _p->ctx;
+			kref_get(&p->ctx->kref);
+			return true;
+		}
+
+	p->ctx = kzalloc(sizeof(*p->ctx), GFP_KERNEL_ACCOUNT);
+	if (!p->ctx) {
+		kfree(p);
+		return false;
+	}
+	p->ctx->rf = rf;
+	p->ctx->busy = false;
+	init_waitqueue_head(&p->ctx->wq);
+	kref_get(&p->ctx->kref);
+
+	return true;
+}
+
 int
 kvm_set_ioregion(struct kvm *kvm, struct kvm_ioregion *args)
 {
@@ -327,11 +396,10 @@  kvm_set_ioregion(struct kvm *kvm, struct kvm_ioregion *args)
 	}
 
 	INIT_LIST_HEAD(&p->list);
+	p->wf = wfile;
 	p->paddr = args->guest_paddr;
 	p->size = args->memory_size;
 	p->user_data = args->user_data;
-	p->rf = rfile;
-	p->wf = wfile;
 	p->posted_writes = args->flags & KVM_IOREGION_POSTED_WRITES;
 	bus_idx = get_bus_from_flags(args->flags);
 
@@ -341,6 +409,12 @@  kvm_set_ioregion(struct kvm *kvm, struct kvm_ioregion *args)
 		ret = -EEXIST;
 		goto unlock_fail;
 	}
+
+	if (!ioregion_get_ctx(kvm, p, rfile, bus_idx)) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
 	kvm_iodevice_init(&p->dev, &ioregion_ops);
 	ret = kvm_io_bus_register_dev(kvm, bus_idx, p->paddr, p->size,
 				      &p->dev);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index df387857f51f..096504a6cc62 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4308,6 +4308,38 @@  int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
 	return r < 0 ? r : 0;
 }
 
+void kvm_io_bus_prepare(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len)
+{
+	struct kvm_io_bus *bus;
+	int idx;
+
+	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
+	if (!bus)
+		return;
+
+	idx = kvm_io_bus_get_first_dev(bus, addr, len);
+	if (idx < 0)
+		return;
+
+	kvm_iodevice_prepare(bus->range[idx].dev);
+}
+
+void kvm_io_bus_finish(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len)
+{
+	struct kvm_io_bus *bus;
+	int idx;
+
+	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
+	if (!bus)
+		return;
+
+	idx = kvm_io_bus_get_first_dev(bus, addr, len);
+	if (idx < 0)
+		return;
+
+	kvm_iodevice_finish(bus->range[idx].dev);
+}
+
 /* Caller must hold slots_lock. */
 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 			    int len, struct kvm_io_device *dev)