diff mbox

[v16,5/5] virtio-balloon: VIRTIO_BALLOON_F_CTRL_VQ

Message ID 1506744354-20979-6-git-send-email-wei.w.wang@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Wang, Wei W Sept. 30, 2017, 4:05 a.m. UTC
Add a new vq, ctrl_vq, to handle commands between the host and guest.
With this feature, we will be able to have the control plane and data
plane separated. In other words, the control related commands of each
feature will be sent via the ctrl_vq, meanwhile each feature may have
its own vq used as a data plane.

Free page report is the the first new feature controlled via ctrl_vq,
and a new cmd class, VIRTIO_BALLOON_CTRLQ_CLASS_FREE_PAGE, is added.
Currently, this feature has two cmds:
VIRTIO_BALLOON_FREE_PAGE_F_START: This cmd is sent from host to guest
to start the free page report work.
VIRTIO_BALLOON_FREE_PAGE_F_STOP: This cmd is bidirectional. The guest
would send the cmd to the host to indicate the reporting work is done.
The host would send the cmd to the guest to actively request the stop
of the reporting work.

The free_page_vq is used to transmit the guest free page blocks to the
host.

Signed-off-by: Wei Wang <wei.w.wang@intel.com>
Signed-off-by: Liang Li <liang.z.li@intel.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Michal Hocko <mhocko@kernel.org>
---
 drivers/virtio/virtio_balloon.c     | 249 +++++++++++++++++++++++++++++++++---
 include/uapi/linux/virtio_balloon.h |  15 +++
 2 files changed, 244 insertions(+), 20 deletions(-)

Comments

Michael S. Tsirkin Oct. 1, 2017, 3:18 a.m. UTC | #1
On Sat, Sep 30, 2017 at 12:05:54PM +0800, Wei Wang wrote:
> Add a new vq, ctrl_vq, to handle commands between the host and guest.
> With this feature, we will be able to have the control plane and data
> plane separated. In other words, the control related commands of each
> feature will be sent via the ctrl_vq, meanwhile each feature may have
> its own vq used as a data plane.
> 
> Free page report is the the first new feature controlled via ctrl_vq,
> and a new cmd class, VIRTIO_BALLOON_CTRLQ_CLASS_FREE_PAGE, is added.
> Currently, this feature has two cmds:
> VIRTIO_BALLOON_FREE_PAGE_F_START: This cmd is sent from host to guest
> to start the free page report work.
> VIRTIO_BALLOON_FREE_PAGE_F_STOP: This cmd is bidirectional. The guest
> would send the cmd to the host to indicate the reporting work is done.
> The host would send the cmd to the guest to actively request the stop
> of the reporting work.
> 
> The free_page_vq is used to transmit the guest free page blocks to the
> host.
> 
> Signed-off-by: Wei Wang <wei.w.wang@intel.com>
> Signed-off-by: Liang Li <liang.z.li@intel.com>
> Cc: Michael S. Tsirkin <mst@redhat.com>
> Cc: Michal Hocko <mhocko@kernel.org>
> ---
>  drivers/virtio/virtio_balloon.c     | 249 +++++++++++++++++++++++++++++++++---
>  include/uapi/linux/virtio_balloon.h |  15 +++
>  2 files changed, 244 insertions(+), 20 deletions(-)
> 
> diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
> index 6952e19..70dc4ae 100644
> --- a/drivers/virtio/virtio_balloon.c
> +++ b/drivers/virtio/virtio_balloon.c
> @@ -55,7 +55,13 @@ static struct vfsmount *balloon_mnt;
>  
>  struct virtio_balloon {
>  	struct virtio_device *vdev;
> -	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq;
> +	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *ctrl_vq,
> +			 *free_page_vq;
> +
> +	/* Balloon's own wq for cpu-intensive work items */
> +	struct workqueue_struct *balloon_wq;
> +	/* The work items submitted to the balloon wq are listed here */
> +	struct work_struct report_free_page_work;
>  
>  	/* The balloon servicing is delegated to a freezable workqueue. */
>  	struct work_struct update_balloon_stats_work;
> @@ -65,6 +71,9 @@ struct virtio_balloon {
>  	spinlock_t stop_update_lock;
>  	bool stop_update;
>  
> +	/* Stop reporting free pages */
> +	bool report_free_page_stop;
> +
>  	/* Waiting for host to ack the pages we released. */
>  	wait_queue_head_t acked;
>  
> @@ -93,6 +102,11 @@ struct virtio_balloon {
>  
>  	/* To register callback in oom notifier call chain */
>  	struct notifier_block nb;
> +
> +	/* Host to guest ctrlq cmd buf for free page report */
> +	struct virtio_balloon_ctrlq_cmd free_page_cmd_in;
> +	/* Guest to Host ctrlq cmd buf for free page report */
> +	struct virtio_balloon_ctrlq_cmd free_page_cmd_out;
>  };
>  
>  static struct virtio_device_id id_table[] = {
> @@ -186,6 +200,24 @@ static int send_balloon_page_sg(struct virtio_balloon *vb,
>  	return err;
>  }
>  
> +static int send_free_page_sg(struct virtqueue *vq, void *addr, uint32_t size)
> +{
> +	int ret = 0;
> +
> +	/*
> +	 * Since this is an optimization feature, losing a couplle of free

typo

> +	 * pages to report isn't important. We simply resturn without adding
> +	 * the page if the vq is full.
> +	 */
> +	if (vq->num_free) {
> +		ret = add_one_sg(vq, addr, size);
> +		if (!ret)
> +			virtqueue_kick(vq);
> +	}
> +
> +	return ret;
> +}
> +
>  /*
>   * Send balloon pages in sgs to host. The balloon pages are recorded in the
>   * page xbitmap. Each bit in the bitmap corresponds to a page of PAGE_SIZE.
> @@ -542,42 +574,210 @@ static void update_balloon_size_func(struct work_struct *work)
>  		queue_work(system_freezable_wq, work);
>  }
>  
> -static int init_vqs(struct virtio_balloon *vb)
> +static bool virtio_balloon_send_free_pages(void *opaque, unsigned long pfn,
> +					   unsigned long nr_pages)
> +{
> +	struct virtio_balloon *vb = (struct virtio_balloon *)opaque;
> +	void *addr = (void *)pfn_to_kaddr(pfn);
> +	uint32_t len = nr_pages << PAGE_SHIFT;
> +
> +	if (vb->report_free_page_stop)
> +		return false;
> +
> +	/* If the vq is broken, stop reporting the free pages. */
> +	if (send_free_page_sg(vb->free_page_vq, addr, len) < 0)
> +		return false;
> +
> +	return true;
> +}
> +
> +static void ctrlq_add_cmd(struct virtqueue *vq,
> +			  struct virtio_balloon_ctrlq_cmd *cmd,
> +			  bool inbuf)
>  {
> -	struct virtqueue *vqs[3];
> -	vq_callback_t *callbacks[] = { balloon_ack, balloon_ack, stats_request };
> -	static const char * const names[] = { "inflate", "deflate", "stats" };
> -	int err, nvqs;
> +	struct scatterlist sg;
> +	int err;
> +
> +	sg_init_one(&sg, cmd, sizeof(struct virtio_balloon_ctrlq_cmd));
> +	if (inbuf)
> +		err = virtqueue_add_inbuf(vq, &sg, 1, cmd, GFP_KERNEL);
> +	else
> +		err = virtqueue_add_outbuf(vq, &sg, 1, cmd, GFP_KERNEL);
> +
> +	/* Sanity check: this can't really happen */
> +	WARN_ON(err);
> +}
> +
> +static void ctrlq_send_cmd(struct virtio_balloon *vb,
> +			  struct virtio_balloon_ctrlq_cmd *cmd,
> +			  bool inbuf)
> +{
> +	struct virtqueue *vq = vb->ctrl_vq;
> +
> +	ctrlq_add_cmd(vq, cmd, inbuf);
> +	if (!inbuf) {
> +		/*
> +		 * All the input cmd buffers are replenished here.
> +		 * This is necessary because the input cmd buffers are lost
> +		 * after live migration. The device needs to rewind all of
> +		 * them from the ctrl_vq.

Confused. Live migration somehow loses state? Why is that and why
is it a good idea? And how do you know this is migration even?
Looks like all you know is you got free page end. Could be any
reason for this.


> +		 */
> +		ctrlq_add_cmd(vq, &vb->free_page_cmd_in, true);
> +	}
> +	virtqueue_kick(vq);
> +}
>  
> +static void report_free_page_end(struct virtio_balloon *vb)
> +{
>  	/*
> -	 * We expect two virtqueues: inflate and deflate, and
> -	 * optionally stat.
> +	 * The host may have already requested to stop the reporting before we
> +	 * finish, so no need to notify the host in this case.
>  	 */
> -	nvqs = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ) ? 3 : 2;
> -	err = virtio_find_vqs(vb->vdev, nvqs, vqs, callbacks, names, NULL);
> +	if (vb->report_free_page_stop)
> +		return;
> +
> +	vb->free_page_cmd_out.class = VIRTIO_BALLOON_CTRLQ_CLASS_FREE_PAGE;
> +	vb->free_page_cmd_out.cmd = VIRTIO_BALLOON_FREE_PAGE_F_STOP;
> +	ctrlq_send_cmd(vb, &vb->free_page_cmd_out, false);
> +	vb->report_free_page_stop = true;
> +}
> +
> +static void report_free_page(struct work_struct *work)
> +{
> +	struct virtio_balloon *vb;
> +
> +	vb = container_of(work, struct virtio_balloon, report_free_page_work);
> +	walk_free_mem_block(vb, 0, &virtio_balloon_send_free_pages);
> +	report_free_page_end(vb);
> +}
> +
> +static void ctrlq_handle(struct virtqueue *vq)
> +{
> +	struct virtio_balloon *vb = vq->vdev->priv;
> +	struct virtio_balloon_ctrlq_cmd *msg;
> +	unsigned int class, cmd, len;
> +
> +	msg = (struct virtio_balloon_ctrlq_cmd *)virtqueue_get_buf(vq, &len);
> +	if (unlikely(!msg))
> +		return;
> +
> +	/* The outbuf is sent by the host for recycling, so just return. */
> +	if (msg == &vb->free_page_cmd_out)
> +		return;
> +
> +	class = virtio32_to_cpu(vb->vdev, msg->class);
> +	cmd =  virtio32_to_cpu(vb->vdev, msg->cmd);
> +
> +	switch (class) {
> +	case VIRTIO_BALLOON_CTRLQ_CLASS_FREE_PAGE:
> +		if (cmd == VIRTIO_BALLOON_FREE_PAGE_F_STOP) {
> +			vb->report_free_page_stop = true;
> +		} else if (cmd == VIRTIO_BALLOON_FREE_PAGE_F_START) {
> +			vb->report_free_page_stop = false;
> +			queue_work(vb->balloon_wq, &vb->report_free_page_work);
> +		}
> +		vb->free_page_cmd_in.class =
> +					VIRTIO_BALLOON_CTRLQ_CLASS_FREE_PAGE;
> +		ctrlq_send_cmd(vb, &vb->free_page_cmd_in, true);
> +	break;
> +	default:
> +		dev_warn(&vb->vdev->dev, "%s: cmd class not supported\n",
> +			 __func__);
> +	}

Manipulating report_free_page_stop without any locks looks
very suspicious.
Also, what if we get two start commands? we should restart
from beginning, should we not?

> +}
> +
> +static int init_vqs(struct virtio_balloon *vb)
> +{
> +	struct virtqueue **vqs;
> +	vq_callback_t **callbacks;
> +	const char **names;
> +	struct scatterlist sg;
> +	int i, nvqs, err = -ENOMEM;
> +
> +	/* Inflateq and deflateq are used unconditionally */
> +	nvqs = 2;
> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ))
> +		nvqs++;
> +	/* If ctrlq is enabled, the free page vq will also be created */
> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_CTRL_VQ))
> +		nvqs += 2;

Since you made it generic, free page should
have its own flag not rely on ctrl vq.


> +
> +	/* Allocate space for find_vqs parameters */
> +	vqs = kcalloc(nvqs, sizeof(*vqs), GFP_KERNEL);
> +	if (!vqs)
> +		goto err_vq;
> +	callbacks = kmalloc_array(nvqs, sizeof(*callbacks), GFP_KERNEL);
> +	if (!callbacks)
> +		goto err_callback;
> +	names = kmalloc_array(nvqs, sizeof(*names), GFP_KERNEL);
> +	if (!names)
> +		goto err_names;
> +
> +	callbacks[0] = balloon_ack;
> +	names[0] = "inflate";
> +	callbacks[1] = balloon_ack;
> +	names[1] = "deflate";
> +
> +	i = 2;
> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
> +		callbacks[i] = stats_request;
> +		names[i] = "stats";
> +		i++;
> +	}
> +
> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_CTRL_VQ)) {
> +		callbacks[i] = ctrlq_handle;
> +		names[i++] = "ctrlq";
> +		callbacks[i] = NULL;
> +		names[i] = "free_page_vq";
> +	}
> +
> +	err = vb->vdev->config->find_vqs(vb->vdev, nvqs, vqs, callbacks, names,
> +					 NULL, NULL);
>  	if (err)
> -		return err;
> +		goto err_find;
>  
>  	vb->inflate_vq = vqs[0];
>  	vb->deflate_vq = vqs[1];
> +	i = 2;
>  	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
> -		struct scatterlist sg;
> -		unsigned int num_stats;
> -		vb->stats_vq = vqs[2];
> -
> +		vb->stats_vq = vqs[i++];
>  		/*
>  		 * Prime this virtqueue with one buffer so the hypervisor can
>  		 * use it to signal us later (it can't be broken yet!).
>  		 */
> -		num_stats = update_balloon_stats(vb);
> -
> -		sg_init_one(&sg, vb->stats, sizeof(vb->stats[0]) * num_stats);
> +		sg_init_one(&sg, vb->stats, sizeof(vb->stats));
>  		if (virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb, GFP_KERNEL)
> -		    < 0)
> -			BUG();
> +		    < 0) {
> +			dev_warn(&vb->vdev->dev, "%s: add stat_vq failed\n",
> +				 __func__);
> +			goto err_find;
> +		}
>  		virtqueue_kick(vb->stats_vq);
>  	}
> +
> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_CTRL_VQ)) {
> +		vb->ctrl_vq = vqs[i++];
> +		vb->free_page_vq = vqs[i];
> +		/* Prime the ctrlq with an inbuf for the host to send a cmd */
> +		vb->free_page_cmd_in.class =
> +					VIRTIO_BALLOON_CTRLQ_CLASS_FREE_PAGE;
> +		ctrlq_send_cmd(vb, &vb->free_page_cmd_in, true);
> +	}
> +
> +	kfree(names);
> +	kfree(callbacks);
> +	kfree(vqs);
>  	return 0;
> +
> +err_find:
> +	kfree(names);
> +err_names:
> +	kfree(callbacks);
> +err_callback:
> +	kfree(vqs);
> +err_vq:
> +	return err;
>  }
>  
>  #ifdef CONFIG_BALLOON_COMPACTION
> @@ -706,6 +906,13 @@ static int virtballoon_probe(struct virtio_device *vdev)
>  	if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_SG))
>  		xb_init(&vb->page_xb);
>  
> +	if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_CTRL_VQ)) {
> +		vb->balloon_wq = alloc_workqueue("balloon-wq",
> +					WQ_FREEZABLE | WQ_CPU_INTENSIVE, 0);
> +		INIT_WORK(&vb->report_free_page_work, report_free_page);
> +		vb->report_free_page_stop = true;
> +	}
> +
>  	vb->nb.notifier_call = virtballoon_oom_notify;
>  	vb->nb.priority = VIRTBALLOON_OOM_NOTIFY_PRIORITY;
>  	err = register_oom_notifier(&vb->nb);
> @@ -770,6 +977,7 @@ static void virtballoon_remove(struct virtio_device *vdev)
>  	spin_unlock_irq(&vb->stop_update_lock);
>  	cancel_work_sync(&vb->update_balloon_size_work);
>  	cancel_work_sync(&vb->update_balloon_stats_work);
> +	cancel_work_sync(&vb->report_free_page_work);
>  
>  	remove_common(vb);
>  #ifdef CONFIG_BALLOON_COMPACTION
> @@ -823,6 +1031,7 @@ static unsigned int features[] = {
>  	VIRTIO_BALLOON_F_STATS_VQ,
>  	VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
>  	VIRTIO_BALLOON_F_SG,
> +	VIRTIO_BALLOON_F_CTRL_VQ,
>  };
>  
>  static struct virtio_driver virtio_balloon_driver = {
> diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
> index 37780a7..dbf0616 100644
> --- a/include/uapi/linux/virtio_balloon.h
> +++ b/include/uapi/linux/virtio_balloon.h
> @@ -35,6 +35,7 @@
>  #define VIRTIO_BALLOON_F_STATS_VQ	1 /* Memory Stats virtqueue */
>  #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM	2 /* Deflate balloon on OOM */
>  #define VIRTIO_BALLOON_F_SG		3 /* Use sg instead of PFN lists */
> +#define VIRTIO_BALLOON_F_CTRL_VQ	4 /* Control Virtqueue */
>  
>  /* Size of a PFN in the balloon interface. */
>  #define VIRTIO_BALLOON_PFN_SHIFT 12
> @@ -83,4 +84,18 @@ struct virtio_balloon_stat {
>  	__virtio64 val;
>  } __attribute__((packed));
>  
> +enum {
> +	VIRTIO_BALLOON_CTRLQ_CLASS_FREE_PAGE = 0,
> +	VIRTIO_BALLOON_CTRLQ_CLASS_MAX,
> +};
> +
> +struct virtio_balloon_ctrlq_cmd {
> +	__virtio32 class;
> +	__virtio32 cmd;
> +};
> +
> +/* Ctrlq commands related to VIRTIO_BALLOON_CTRLQ_CLASS_FREE_PAGE */
> +#define VIRTIO_BALLOON_FREE_PAGE_F_STOP		0
> +#define VIRTIO_BALLOON_FREE_PAGE_F_START	1
> +
>  #endif /* _LINUX_VIRTIO_BALLOON_H */

The stop command does not appear to be thought through.

Let's assume e.g. you started migration. You ask guest for free pages.
Then you cancel it.  There are a bunch of pages in free vq and you are
getting more.  You now want to start migration again. What to do?

A bunch of vq flushing and waiting will maybe do the trick, but waiting
on guest is never a great idea.

I previously suggested pushing the stop/start commands from guest to
host on the free page vq, and including an ID in host to guest and
guest to host commands. This way ctrl vq is just for host to guest
commands, and host matches commands and knows which command
is a free page in response to.

I still think it's a good idea but go ahead and propose something
else that works.



> -- 
> 2.7.4
Wang, Wei W Oct. 2, 2017, 4:38 p.m. UTC | #2
On Sunday, October 1, 2017 11:19 AM, Michael S. Tsirkin wrote:
> On Sat, Sep 30, 2017 at 12:05:54PM +0800, Wei Wang wrote:
> > +static void ctrlq_send_cmd(struct virtio_balloon *vb,
> > +			  struct virtio_balloon_ctrlq_cmd *cmd,
> > +			  bool inbuf)
> > +{
> > +	struct virtqueue *vq = vb->ctrl_vq;
> > +
> > +	ctrlq_add_cmd(vq, cmd, inbuf);
> > +	if (!inbuf) {
> > +		/*
> > +		 * All the input cmd buffers are replenished here.
> > +		 * This is necessary because the input cmd buffers are lost
> > +		 * after live migration. The device needs to rewind all of
> > +		 * them from the ctrl_vq.
> 
> Confused. Live migration somehow loses state? Why is that and why is it a good
> idea? And how do you know this is migration even?
> Looks like all you know is you got free page end. Could be any reason for this.


I think this would be something that the current live migration lacks - what the
device read from the vq is not transferred during live migration, an example is the 
stat_vq_elem: 
Line 476 at https://github.com/qemu/qemu/blob/master/hw/virtio/virtio-balloon.c

For all the things that are added to the vq and need to be held by the device
to use later need to consider the situation that live migration might happen at any
time and they need to be re-taken from the vq by the device on the destination
machine.

So, even without this live migration optimization feature, I think all the things that are 
added to the vq for the device to hold, need a way for the device to rewind back from
the vq - re-adding all the elements to the vq is a trick to keep a record of all of them
on the vq so that the device side rewinding can work. 

Please let me know if anything is missed or if you have other suggestions.


> > +static void ctrlq_handle(struct virtqueue *vq) {
> > +	struct virtio_balloon *vb = vq->vdev->priv;
> > +	struct virtio_balloon_ctrlq_cmd *msg;
> > +	unsigned int class, cmd, len;
> > +
> > +	msg = (struct virtio_balloon_ctrlq_cmd *)virtqueue_get_buf(vq, &len);
> > +	if (unlikely(!msg))
> > +		return;
> > +
> > +	/* The outbuf is sent by the host for recycling, so just return. */
> > +	if (msg == &vb->free_page_cmd_out)
> > +		return;
> > +
> > +	class = virtio32_to_cpu(vb->vdev, msg->class);
> > +	cmd =  virtio32_to_cpu(vb->vdev, msg->cmd);
> > +
> > +	switch (class) {
> > +	case VIRTIO_BALLOON_CTRLQ_CLASS_FREE_PAGE:
> > +		if (cmd == VIRTIO_BALLOON_FREE_PAGE_F_STOP) {
> > +			vb->report_free_page_stop = true;
> > +		} else if (cmd == VIRTIO_BALLOON_FREE_PAGE_F_START) {
> > +			vb->report_free_page_stop = false;
> > +			queue_work(vb->balloon_wq, &vb-
> >report_free_page_work);
> > +		}
> > +		vb->free_page_cmd_in.class =
> > +
> 	VIRTIO_BALLOON_CTRLQ_CLASS_FREE_PAGE;
> > +		ctrlq_send_cmd(vb, &vb->free_page_cmd_in, true);
> > +	break;
> > +	default:
> > +		dev_warn(&vb->vdev->dev, "%s: cmd class not supported\n",
> > +			 __func__);
> > +	}
> 
> Manipulating report_free_page_stop without any locks looks very suspicious.

> Also, what if we get two start commands? we should restart from beginning,
> should we not?
> 


Yes, it will start to report free pages from the beginning.
walk_free_mem_block() doesn't maintain any internal status, so the invoking of
it will always start from the beginning.


> > +/* Ctrlq commands related to VIRTIO_BALLOON_CTRLQ_CLASS_FREE_PAGE
> */
> > +#define VIRTIO_BALLOON_FREE_PAGE_F_STOP		0
> > +#define VIRTIO_BALLOON_FREE_PAGE_F_START	1
> > +
> >  #endif /* _LINUX_VIRTIO_BALLOON_H */
> 
> The stop command does not appear to be thought through.
> 
> Let's assume e.g. you started migration. You ask guest for free pages.
> Then you cancel it.  There are a bunch of pages in free vq and you are getting
> more.  You now want to start migration again. What to do?
> 
> A bunch of vq flushing and waiting will maybe do the trick, but waiting on guest
> is never a great idea.
> 


I think the device can flush (pop out what's left in the vq and push them back) the
vq right after the Stop command is sent to the guest, rather than doing the flush
when the 2nd initiation of live migration begins. The entries pushed back to the vq
will be in the used ring, what would the device need to wait for?


> I previously suggested pushing the stop/start commands from guest to host on
> the free page vq, and including an ID in host to guest and guest to host
> commands. This way ctrl vq is just for host to guest commands, and host
> matches commands and knows which command is a free page in response to.
> 
> I still think it's a good idea but go ahead and propose something else that works.
> 

Thanks for the suggestion. Probably I haven't fully understood it. Please see the example
below:

1) host-to-guest ctrl_vq:
StartCMD, ID=1

2) guest-to-host free_page_vq:
free_page, ID=1
free_page, ID=1
free_page, ID=1
free_page, ID=1

3) host-to-guest ctrl_vq:
StopCMD, ID=1

4) initiate the 2nd try of live migration via host-to-guest ctrl_vq:
StartCMD, ID=2

5) the guest-to-host free_page_vq might look like this:
free_page, ID=1
free_page, ID=1
free_page, ID=2
free_page, ID=2

The device will need to drop (pop out the two entries and push them back)
the first 2 obsolete free pages which are sent by ID=1.

I haven't found the benefits above yet. The device will perform the same operations
to get rid of the old free pages. If we drop the old free pages after the StopCMD (
ID may also not be needed in this case), the overhead won't be added to the live
migration time.

Would you have any thought about this?


Best,
Wei
Michael S. Tsirkin Oct. 10, 2017, 3:15 p.m. UTC | #3
On Mon, Oct 02, 2017 at 04:38:01PM +0000, Wang, Wei W wrote:
> On Sunday, October 1, 2017 11:19 AM, Michael S. Tsirkin wrote:
> > On Sat, Sep 30, 2017 at 12:05:54PM +0800, Wei Wang wrote:
> > > +static void ctrlq_send_cmd(struct virtio_balloon *vb,
> > > +			  struct virtio_balloon_ctrlq_cmd *cmd,
> > > +			  bool inbuf)
> > > +{
> > > +	struct virtqueue *vq = vb->ctrl_vq;
> > > +
> > > +	ctrlq_add_cmd(vq, cmd, inbuf);
> > > +	if (!inbuf) {
> > > +		/*
> > > +		 * All the input cmd buffers are replenished here.
> > > +		 * This is necessary because the input cmd buffers are lost
> > > +		 * after live migration. The device needs to rewind all of
> > > +		 * them from the ctrl_vq.
> > 
> > Confused. Live migration somehow loses state? Why is that and why is it a good
> > idea? And how do you know this is migration even?
> > Looks like all you know is you got free page end. Could be any reason for this.
> 
> 
> I think this would be something that the current live migration lacks - what the
> device read from the vq is not transferred during live migration, an example is the 
> stat_vq_elem: 
> Line 476 at https://github.com/qemu/qemu/blob/master/hw/virtio/virtio-balloon.c

This does not touch guest memory though it just manipulates
internal state to make it easier to migrate.
It's transparent to guest as migration should be.

> For all the things that are added to the vq and need to be held by the device
> to use later need to consider the situation that live migration might happen at any
> time and they need to be re-taken from the vq by the device on the destination
> machine.
> 
> So, even without this live migration optimization feature, I think all the things that are 
> added to the vq for the device to hold, need a way for the device to rewind back from
> the vq - re-adding all the elements to the vq is a trick to keep a record of all of them
> on the vq so that the device side rewinding can work. 
> 
> Please let me know if anything is missed or if you have other suggestions.

IMO migration should pass enough data source to destination for
destination to continue where source left off without guest help.

> 
> > > +static void ctrlq_handle(struct virtqueue *vq) {
> > > +	struct virtio_balloon *vb = vq->vdev->priv;
> > > +	struct virtio_balloon_ctrlq_cmd *msg;
> > > +	unsigned int class, cmd, len;
> > > +
> > > +	msg = (struct virtio_balloon_ctrlq_cmd *)virtqueue_get_buf(vq, &len);
> > > +	if (unlikely(!msg))
> > > +		return;
> > > +
> > > +	/* The outbuf is sent by the host for recycling, so just return. */
> > > +	if (msg == &vb->free_page_cmd_out)
> > > +		return;
> > > +
> > > +	class = virtio32_to_cpu(vb->vdev, msg->class);
> > > +	cmd =  virtio32_to_cpu(vb->vdev, msg->cmd);
> > > +
> > > +	switch (class) {
> > > +	case VIRTIO_BALLOON_CTRLQ_CLASS_FREE_PAGE:
> > > +		if (cmd == VIRTIO_BALLOON_FREE_PAGE_F_STOP) {
> > > +			vb->report_free_page_stop = true;
> > > +		} else if (cmd == VIRTIO_BALLOON_FREE_PAGE_F_START) {
> > > +			vb->report_free_page_stop = false;
> > > +			queue_work(vb->balloon_wq, &vb-
> > >report_free_page_work);
> > > +		}
> > > +		vb->free_page_cmd_in.class =
> > > +
> > 	VIRTIO_BALLOON_CTRLQ_CLASS_FREE_PAGE;
> > > +		ctrlq_send_cmd(vb, &vb->free_page_cmd_in, true);
> > > +	break;
> > > +	default:
> > > +		dev_warn(&vb->vdev->dev, "%s: cmd class not supported\n",
> > > +			 __func__);
> > > +	}
> > 
> > Manipulating report_free_page_stop without any locks looks very suspicious.
> 
> > Also, what if we get two start commands? we should restart from beginning,
> > should we not?
> > 
> 
> 
> Yes, it will start to report free pages from the beginning.
> walk_free_mem_block() doesn't maintain any internal status, so the invoking of
> it will always start from the beginning.

Well yes but it will first complete the previous walk.

> 
> > > +/* Ctrlq commands related to VIRTIO_BALLOON_CTRLQ_CLASS_FREE_PAGE
> > */
> > > +#define VIRTIO_BALLOON_FREE_PAGE_F_STOP		0
> > > +#define VIRTIO_BALLOON_FREE_PAGE_F_START	1
> > > +
> > >  #endif /* _LINUX_VIRTIO_BALLOON_H */
> > 
> > The stop command does not appear to be thought through.
> > 
> > Let's assume e.g. you started migration. You ask guest for free pages.
> > Then you cancel it.  There are a bunch of pages in free vq and you are getting
> > more.  You now want to start migration again. What to do?
> > 
> > A bunch of vq flushing and waiting will maybe do the trick, but waiting on guest
> > is never a great idea.
> > 
> 
> 
> I think the device can flush (pop out what's left in the vq and push them back) the
> vq right after the Stop command is sent to the guest, rather than doing the flush
> when the 2nd initiation of live migration begins. The entries pushed back to the vq
> will be in the used ring, what would the device need to wait for?

You will be getting stale pages in available ring which were possibly
taken out of free list since memory is not tracked when migration is not
going on.



> > I previously suggested pushing the stop/start commands from guest to host on
> > the free page vq, and including an ID in host to guest and guest to host
> > commands. This way ctrl vq is just for host to guest commands, and host
> > matches commands and knows which command is a free page in response to.
> > 
> > I still think it's a good idea but go ahead and propose something else that works.
> > 
> 
> Thanks for the suggestion. Probably I haven't fully understood it. Please see the example
> below:
> 
> 1) host-to-guest ctrl_vq:
> StartCMD, ID=1
> 
> 2) guest-to-host free_page_vq:
> free_page, ID=1
> free_page, ID=1
> free_page, ID=1
> free_page, ID=1
> 
> 3) host-to-guest ctrl_vq:
> StopCMD, ID=1
> 
> 4) initiate the 2nd try of live migration via host-to-guest ctrl_vq:
> StartCMD, ID=2
> 
> 5) the guest-to-host free_page_vq might look like this:
> free_page, ID=1
> free_page, ID=1
> free_page, ID=2
> free_page, ID=2
> 
> The device will need to drop (pop out the two entries and push them back)
> the first 2 obsolete free pages which are sent by ID=1.

yes. But you do not have to attach id to each page.

It can be:

ID=1
free_page
free_page
ID=2
free_page
free_page



> I haven't found the benefits above yet. The device will perform the same operations
> to get rid of the old free pages. If we drop the old free pages after the StopCMD (
> ID may also not be needed in this case), the overhead won't be added to the live
> migration time.
> Would you have any thought about this?
> 
> 
> Best,
> Wei
> 

As these are separate vqs there is not clean way to know whether
free_page was queued before or after stop command.
Sending the ID helps detect where the free pages for a given start
command are.
Wang, Wei W Oct. 11, 2017, 6:03 a.m. UTC | #4
On 10/10/2017 11:15 PM, Michael S. Tsirkin wrote:
> On Mon, Oct 02, 2017 at 04:38:01PM +0000, Wang, Wei W wrote:
>> On Sunday, October 1, 2017 11:19 AM, Michael S. Tsirkin wrote:
>>> On Sat, Sep 30, 2017 at 12:05:54PM +0800, Wei Wang wrote:
>>>> +static void ctrlq_send_cmd(struct virtio_balloon *vb,
>>>> +			  struct virtio_balloon_ctrlq_cmd *cmd,
>>>> +			  bool inbuf)
>>>> +{
>>>> +	struct virtqueue *vq = vb->ctrl_vq;
>>>> +
>>>> +	ctrlq_add_cmd(vq, cmd, inbuf);
>>>> +	if (!inbuf) {
>>>> +		/*
>>>> +		 * All the input cmd buffers are replenished here.
>>>> +		 * This is necessary because the input cmd buffers are lost
>>>> +		 * after live migration. The device needs to rewind all of
>>>> +		 * them from the ctrl_vq.
>>> Confused. Live migration somehow loses state? Why is that and why is it a good
>>> idea? And how do you know this is migration even?
>>> Looks like all you know is you got free page end. Could be any reason for this.
>>
>> I think this would be something that the current live migration lacks - what the
>> device read from the vq is not transferred during live migration, an example is the
>> stat_vq_elem:
>> Line 476 at https://github.com/qemu/qemu/blob/master/hw/virtio/virtio-balloon.c
> This does not touch guest memory though it just manipulates
> internal state to make it easier to migrate.
> It's transparent to guest as migration should be.
>
>> For all the things that are added to the vq and need to be held by the device
>> to use later need to consider the situation that live migration might happen at any
>> time and they need to be re-taken from the vq by the device on the destination
>> machine.
>>
>> So, even without this live migration optimization feature, I think all the things that are
>> added to the vq for the device to hold, need a way for the device to rewind back from
>> the vq - re-adding all the elements to the vq is a trick to keep a record of all of them
>> on the vq so that the device side rewinding can work.
>>
>> Please let me know if anything is missed or if you have other suggestions.
> IMO migration should pass enough data source to destination for
> destination to continue where source left off without guest help.
>

I'm afraid it would be difficult to pass the entire VirtQueueElement to 
the destination. I think
that would also be the reason that stats_vq_elem chose to rewind from 
the guest vq, which re-do the
virtqueue_pop() --> virtqueue_map_desc() steps (the QEMU virtual address 
to the guest physical
address relationship may be changed on the destination).


How about another direction which would be easier - using two 32-bit 
device specific configuration registers,
Host2Guest and Guest2Host command registers, to replace the ctrlq for 
command exchange:

The flow can be as follows:

1) Before Host sending a StartCMD, it flushes the free_page_vq in case 
any old free page hint is left there;

2) Host writes StartCMD to the Host2Guest register, and notifies the guest;

3) Upon receiving a configuration notification, Guest reads the 
Host2Guest register, and detaches all the used buffers from free_page_vq;
(then for each StartCMD, the free_page_vq will always have no obsolete 
free page hints, right? )

4) Guest start report free pages:
     4.1) Host may actively write StopCMD to the Host2Guest register 
before the guest finishes; or
     4.2) Guest finishes reporting, write StopCMD  the Guest2HOST 
register, which traps to QEMU, to stop.


Best,
Wei
Michael S. Tsirkin Oct. 11, 2017, 1:49 p.m. UTC | #5
On Wed, Oct 11, 2017 at 02:03:20PM +0800, Wei Wang wrote:
> On 10/10/2017 11:15 PM, Michael S. Tsirkin wrote:
> > On Mon, Oct 02, 2017 at 04:38:01PM +0000, Wang, Wei W wrote:
> > > On Sunday, October 1, 2017 11:19 AM, Michael S. Tsirkin wrote:
> > > > On Sat, Sep 30, 2017 at 12:05:54PM +0800, Wei Wang wrote:
> > > > > +static void ctrlq_send_cmd(struct virtio_balloon *vb,
> > > > > +			  struct virtio_balloon_ctrlq_cmd *cmd,
> > > > > +			  bool inbuf)
> > > > > +{
> > > > > +	struct virtqueue *vq = vb->ctrl_vq;
> > > > > +
> > > > > +	ctrlq_add_cmd(vq, cmd, inbuf);
> > > > > +	if (!inbuf) {
> > > > > +		/*
> > > > > +		 * All the input cmd buffers are replenished here.
> > > > > +		 * This is necessary because the input cmd buffers are lost
> > > > > +		 * after live migration. The device needs to rewind all of
> > > > > +		 * them from the ctrl_vq.
> > > > Confused. Live migration somehow loses state? Why is that and why is it a good
> > > > idea? And how do you know this is migration even?
> > > > Looks like all you know is you got free page end. Could be any reason for this.
> > > 
> > > I think this would be something that the current live migration lacks - what the
> > > device read from the vq is not transferred during live migration, an example is the
> > > stat_vq_elem:
> > > Line 476 at https://github.com/qemu/qemu/blob/master/hw/virtio/virtio-balloon.c
> > This does not touch guest memory though it just manipulates
> > internal state to make it easier to migrate.
> > It's transparent to guest as migration should be.
> > 
> > > For all the things that are added to the vq and need to be held by the device
> > > to use later need to consider the situation that live migration might happen at any
> > > time and they need to be re-taken from the vq by the device on the destination
> > > machine.
> > > 
> > > So, even without this live migration optimization feature, I think all the things that are
> > > added to the vq for the device to hold, need a way for the device to rewind back from
> > > the vq - re-adding all the elements to the vq is a trick to keep a record of all of them
> > > on the vq so that the device side rewinding can work.
> > > 
> > > Please let me know if anything is missed or if you have other suggestions.
> > IMO migration should pass enough data source to destination for
> > destination to continue where source left off without guest help.
> > 
> 
> I'm afraid it would be difficult to pass the entire VirtQueueElement to the
> destination. I think
> that would also be the reason that stats_vq_elem chose to rewind from the
> guest vq, which re-do the
> virtqueue_pop() --> virtqueue_map_desc() steps (the QEMU virtual address to
> the guest physical
> address relationship may be changed on the destination).

Yes but note how that rewind does not involve modifying the ring.
It just rolls back some indices.


> 
> How about another direction which would be easier - using two 32-bit device
> specific configuration registers,
> Host2Guest and Guest2Host command registers, to replace the ctrlq for
> command exchange:
> 
> The flow can be as follows:
> 
> 1) Before Host sending a StartCMD, it flushes the free_page_vq in case any
> old free page hint is left there;

> 2) Host writes StartCMD to the Host2Guest register, and notifies the guest;
> 
> 3) Upon receiving a configuration notification, Guest reads the Host2Guest
> register, and detaches all the used buffers from free_page_vq;
> (then for each StartCMD, the free_page_vq will always have no obsolete free
> page hints, right? )
> 
> 4) Guest start report free pages:
>     4.1) Host may actively write StopCMD to the Host2Guest register before
> the guest finishes; or
>     4.2) Guest finishes reporting, write StopCMD  the Guest2HOST register,
> which traps to QEMU, to stop.
> 
> 
> Best,
> Wei

I am not sure it matters whether a VQ or the config are used to start/stop.
But I think flushing is very fragile. You will easily run into races
if one of the actors gets out of sync and keeps adding data.
I think adding an ID in the free vq stream is a more robust
approach.
Wang, Wei W Oct. 12, 2017, 3:54 a.m. UTC | #6
On 10/11/2017 09:49 PM, Michael S. Tsirkin wrote:
> On Wed, Oct 11, 2017 at 02:03:20PM +0800, Wei Wang wrote:
>> On 10/10/2017 11:15 PM, Michael S. Tsirkin wrote:
>>> On Mon, Oct 02, 2017 at 04:38:01PM +0000, Wang, Wei W wrote:
>>>> On Sunday, October 1, 2017 11:19 AM, Michael S. Tsirkin wrote:
>>>>> On Sat, Sep 30, 2017 at 12:05:54PM +0800, Wei Wang wrote:
>>>>>> +static void ctrlq_send_cmd(struct virtio_balloon *vb,
>>>>>> +			  struct virtio_balloon_ctrlq_cmd *cmd,
>>>>>> +			  bool inbuf)
>>>>>> +{
>>>>>> +	struct virtqueue *vq = vb->ctrl_vq;
>>>>>> +
>>>>>> +	ctrlq_add_cmd(vq, cmd, inbuf);
>>>>>> +	if (!inbuf) {
>>>>>> +		/*
>>>>>> +		 * All the input cmd buffers are replenished here.
>>>>>> +		 * This is necessary because the input cmd buffers are lost
>>>>>> +		 * after live migration. The device needs to rewind all of
>>>>>> +		 * them from the ctrl_vq.
>>>>> Confused. Live migration somehow loses state? Why is that and why is it a good
>>>>> idea? And how do you know this is migration even?
>>>>> Looks like all you know is you got free page end. Could be any reason for this.
>>>> I think this would be something that the current live migration lacks - what the
>>>> device read from the vq is not transferred during live migration, an example is the
>>>> stat_vq_elem:
>>>> Line 476 at https://github.com/qemu/qemu/blob/master/hw/virtio/virtio-balloon.c
>>> This does not touch guest memory though it just manipulates
>>> internal state to make it easier to migrate.
>>> It's transparent to guest as migration should be.
>>>
>>>> For all the things that are added to the vq and need to be held by the device
>>>> to use later need to consider the situation that live migration might happen at any
>>>> time and they need to be re-taken from the vq by the device on the destination
>>>> machine.
>>>>
>>>> So, even without this live migration optimization feature, I think all the things that are
>>>> added to the vq for the device to hold, need a way for the device to rewind back from
>>>> the vq - re-adding all the elements to the vq is a trick to keep a record of all of them
>>>> on the vq so that the device side rewinding can work.
>>>>
>>>> Please let me know if anything is missed or if you have other suggestions.
>>> IMO migration should pass enough data source to destination for
>>> destination to continue where source left off without guest help.
>>>
>> I'm afraid it would be difficult to pass the entire VirtQueueElement to the
>> destination. I think
>> that would also be the reason that stats_vq_elem chose to rewind from the
>> guest vq, which re-do the
>> virtqueue_pop() --> virtqueue_map_desc() steps (the QEMU virtual address to
>> the guest physical
>> address relationship may be changed on the destination).
> Yes but note how that rewind does not involve modifying the ring.
> It just rolls back some indices.

Yes, it rolls back the indices, then the following 
virtio_balloon_receive_stats()
can re-pop out the previous entry given by the guest.

Recall how stats_vq_elem works: there is only one stats buffer, which is 
used by the
guest to report stats, and also used by the host to ask the guest for 
stats report.

So the host can roll back one previous entry and what it gets will 
always be stat_vq_elem.


Our case is a little more complex than that - we have both free_page_cmd_in
(for host to guest command) and free_page_cmd_out (for guest to host 
command) buffer
passed via ctrl_vq. When the host rolls back one entry, it may get the 
free_page_cmd_out
buffer which can't be used as the host to guest buffer (i.e. 
free_page_elem held by the device).

So a trick in the driver is to refill the free_page_cmd_in buffer every 
time after the free_page_cmd_out
was sent to the host, so that when the host rewind one previous entry, 
it can always get the
free_page_cmd_in buffer (may be not a very nice method).



>
>> How about another direction which would be easier - using two 32-bit device
>> specific configuration registers,
>> Host2Guest and Guest2Host command registers, to replace the ctrlq for
>> command exchange:
>>
>> The flow can be as follows:
>>
>> 1) Before Host sending a StartCMD, it flushes the free_page_vq in case any
>> old free page hint is left there;
>> 2) Host writes StartCMD to the Host2Guest register, and notifies the guest;
>>
>> 3) Upon receiving a configuration notification, Guest reads the Host2Guest
>> register, and detaches all the used buffers from free_page_vq;
>> (then for each StartCMD, the free_page_vq will always have no obsolete free
>> page hints, right? )
>>
>> 4) Guest start report free pages:
>>      4.1) Host may actively write StopCMD to the Host2Guest register before
>> the guest finishes; or
>>      4.2) Guest finishes reporting, write StopCMD  the Guest2HOST register,
>> which traps to QEMU, to stop.
>>
>>
>> Best,
>> Wei
> I am not sure it matters whether a VQ or the config are used to start/stop.


Not matters, in terms of the flushing issue. The config method could 
avoid the above rewind issue.


> But I think flushing is very fragile. You will easily run into races
> if one of the actors gets out of sync and keeps adding data.
> I think adding an ID in the free vq stream is a more robust
> approach.
>

Adding ID to the free vq would need the device to distinguish whether it 
receives an ID or a free page hint,
so an extra protocol is needed for the two sides to talk. Currently, we 
directly assign the free page
address to desc->addr. With ID support, we would need to first allocate 
buffer for the protocol header,
and add the free page address to the header, then desc->addr = &header.

How about putting the ID to the command path? This would avoid the above 
trouble.

For example, using the 32-bit config registers:
first 16-bit: Command field
send 16-bit: ID field

Then, the working flow would look like this:

1) Host writes "Start, 1" to the Host2Guest register and notify;

2) Guest reads Host2Guest register, and ACKs by writing "Start, 1" to 
Guest2Host register;

3) Guest starts report free pages;

4) Each time when the host receives a free page hint from the 
free_page_vq, it compares the ID fields of
the Host2Guest and Guest2Host register. If matching, then filter out the 
free page from the migration dirty bitmap,
otherwise, simply push back without doing the filtering.


Best,
Wei
Michael S. Tsirkin Oct. 13, 2017, 1:38 p.m. UTC | #7
On Thu, Oct 12, 2017 at 11:54:56AM +0800, Wei Wang wrote:
> > But I think flushing is very fragile. You will easily run into races
> > if one of the actors gets out of sync and keeps adding data.
> > I think adding an ID in the free vq stream is a more robust
> > approach.
> > 
> 
> Adding ID to the free vq would need the device to distinguish whether it
> receives an ID or a free page hint,

Not really.  It's pretty simple: a 64 bit buffer is an ID. A 4K and bigger one
is a page.


> so an extra protocol is needed for the two sides to talk. Currently, we
> directly assign the free page
> address to desc->addr. With ID support, we would need to first allocate
> buffer for the protocol header,
> and add the free page address to the header, then desc->addr = &header.


I do not think you should add ID on each page. What would be the point?
Add it each time you detect a new start command.

> How about putting the ID to the command path? This would avoid the above
> trouble.
> 
> For example, using the 32-bit config registers:
> first 16-bit: Command field
> send 16-bit: ID field
> 
> Then, the working flow would look like this:
> 
> 1) Host writes "Start, 1" to the Host2Guest register and notify;
> 
> 2) Guest reads Host2Guest register, and ACKs by writing "Start, 1" to
> Guest2Host register;
> 
> 3) Guest starts report free pages;
> 
> 4) Each time when the host receives a free page hint from the free_page_vq,
> it compares the ID fields of
> the Host2Guest and Guest2Host register. If matching, then filter out the
> free page from the migration dirty bitmap,
> otherwise, simply push back without doing the filtering.
> 
> 
> Best,
> Wei


All fine but config and vq ops are asynchronous. Host has no idea when
were entries added to vq. So the ID sent to host needs to be through vq.
And I would make it a 64 or at least 32 bit ID, not a 16 bit one,
to avoid wrap-around.
Wang, Wei W Oct. 19, 2017, 8:07 a.m. UTC | #8
On 10/13/2017 09:38 PM, Michael S. Tsirkin wrote:
> On Thu, Oct 12, 2017 at 11:54:56AM +0800, Wei Wang wrote:
>>> But I think flushing is very fragile. You will easily run into races
>>> if one of the actors gets out of sync and keeps adding data.
>>> I think adding an ID in the free vq stream is a more robust
>>> approach.
>>>
>> Adding ID to the free vq would need the device to distinguish whether it
>> receives an ID or a free page hint,
> Not really.  It's pretty simple: a 64 bit buffer is an ID. A 4K and bigger one
> is a page.

I think we can also use the previous method, free page via in_buf, and 
id via out_buf.

Best,
Wei
diff mbox

Patch

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 6952e19..70dc4ae 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -55,7 +55,13 @@  static struct vfsmount *balloon_mnt;
 
 struct virtio_balloon {
 	struct virtio_device *vdev;
-	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq;
+	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *ctrl_vq,
+			 *free_page_vq;
+
+	/* Balloon's own wq for cpu-intensive work items */
+	struct workqueue_struct *balloon_wq;
+	/* The work items submitted to the balloon wq are listed here */
+	struct work_struct report_free_page_work;
 
 	/* The balloon servicing is delegated to a freezable workqueue. */
 	struct work_struct update_balloon_stats_work;
@@ -65,6 +71,9 @@  struct virtio_balloon {
 	spinlock_t stop_update_lock;
 	bool stop_update;
 
+	/* Stop reporting free pages */
+	bool report_free_page_stop;
+
 	/* Waiting for host to ack the pages we released. */
 	wait_queue_head_t acked;
 
@@ -93,6 +102,11 @@  struct virtio_balloon {
 
 	/* To register callback in oom notifier call chain */
 	struct notifier_block nb;
+
+	/* Host to guest ctrlq cmd buf for free page report */
+	struct virtio_balloon_ctrlq_cmd free_page_cmd_in;
+	/* Guest to Host ctrlq cmd buf for free page report */
+	struct virtio_balloon_ctrlq_cmd free_page_cmd_out;
 };
 
 static struct virtio_device_id id_table[] = {
@@ -186,6 +200,24 @@  static int send_balloon_page_sg(struct virtio_balloon *vb,
 	return err;
 }
 
+static int send_free_page_sg(struct virtqueue *vq, void *addr, uint32_t size)
+{
+	int ret = 0;
+
+	/*
+	 * Since this is an optimization feature, losing a couplle of free
+	 * pages to report isn't important. We simply resturn without adding
+	 * the page if the vq is full.
+	 */
+	if (vq->num_free) {
+		ret = add_one_sg(vq, addr, size);
+		if (!ret)
+			virtqueue_kick(vq);
+	}
+
+	return ret;
+}
+
 /*
  * Send balloon pages in sgs to host. The balloon pages are recorded in the
  * page xbitmap. Each bit in the bitmap corresponds to a page of PAGE_SIZE.
@@ -542,42 +574,210 @@  static void update_balloon_size_func(struct work_struct *work)
 		queue_work(system_freezable_wq, work);
 }
 
-static int init_vqs(struct virtio_balloon *vb)
+static bool virtio_balloon_send_free_pages(void *opaque, unsigned long pfn,
+					   unsigned long nr_pages)
+{
+	struct virtio_balloon *vb = (struct virtio_balloon *)opaque;
+	void *addr = (void *)pfn_to_kaddr(pfn);
+	uint32_t len = nr_pages << PAGE_SHIFT;
+
+	if (vb->report_free_page_stop)
+		return false;
+
+	/* If the vq is broken, stop reporting the free pages. */
+	if (send_free_page_sg(vb->free_page_vq, addr, len) < 0)
+		return false;
+
+	return true;
+}
+
+static void ctrlq_add_cmd(struct virtqueue *vq,
+			  struct virtio_balloon_ctrlq_cmd *cmd,
+			  bool inbuf)
 {
-	struct virtqueue *vqs[3];
-	vq_callback_t *callbacks[] = { balloon_ack, balloon_ack, stats_request };
-	static const char * const names[] = { "inflate", "deflate", "stats" };
-	int err, nvqs;
+	struct scatterlist sg;
+	int err;
+
+	sg_init_one(&sg, cmd, sizeof(struct virtio_balloon_ctrlq_cmd));
+	if (inbuf)
+		err = virtqueue_add_inbuf(vq, &sg, 1, cmd, GFP_KERNEL);
+	else
+		err = virtqueue_add_outbuf(vq, &sg, 1, cmd, GFP_KERNEL);
+
+	/* Sanity check: this can't really happen */
+	WARN_ON(err);
+}
+
+static void ctrlq_send_cmd(struct virtio_balloon *vb,
+			  struct virtio_balloon_ctrlq_cmd *cmd,
+			  bool inbuf)
+{
+	struct virtqueue *vq = vb->ctrl_vq;
+
+	ctrlq_add_cmd(vq, cmd, inbuf);
+	if (!inbuf) {
+		/*
+		 * All the input cmd buffers are replenished here.
+		 * This is necessary because the input cmd buffers are lost
+		 * after live migration. The device needs to rewind all of
+		 * them from the ctrl_vq.
+		 */
+		ctrlq_add_cmd(vq, &vb->free_page_cmd_in, true);
+	}
+	virtqueue_kick(vq);
+}
 
+static void report_free_page_end(struct virtio_balloon *vb)
+{
 	/*
-	 * We expect two virtqueues: inflate and deflate, and
-	 * optionally stat.
+	 * The host may have already requested to stop the reporting before we
+	 * finish, so no need to notify the host in this case.
 	 */
-	nvqs = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ) ? 3 : 2;
-	err = virtio_find_vqs(vb->vdev, nvqs, vqs, callbacks, names, NULL);
+	if (vb->report_free_page_stop)
+		return;
+
+	vb->free_page_cmd_out.class = VIRTIO_BALLOON_CTRLQ_CLASS_FREE_PAGE;
+	vb->free_page_cmd_out.cmd = VIRTIO_BALLOON_FREE_PAGE_F_STOP;
+	ctrlq_send_cmd(vb, &vb->free_page_cmd_out, false);
+	vb->report_free_page_stop = true;
+}
+
+static void report_free_page(struct work_struct *work)
+{
+	struct virtio_balloon *vb;
+
+	vb = container_of(work, struct virtio_balloon, report_free_page_work);
+	walk_free_mem_block(vb, 0, &virtio_balloon_send_free_pages);
+	report_free_page_end(vb);
+}
+
+static void ctrlq_handle(struct virtqueue *vq)
+{
+	struct virtio_balloon *vb = vq->vdev->priv;
+	struct virtio_balloon_ctrlq_cmd *msg;
+	unsigned int class, cmd, len;
+
+	msg = (struct virtio_balloon_ctrlq_cmd *)virtqueue_get_buf(vq, &len);
+	if (unlikely(!msg))
+		return;
+
+	/* The outbuf is sent by the host for recycling, so just return. */
+	if (msg == &vb->free_page_cmd_out)
+		return;
+
+	class = virtio32_to_cpu(vb->vdev, msg->class);
+	cmd =  virtio32_to_cpu(vb->vdev, msg->cmd);
+
+	switch (class) {
+	case VIRTIO_BALLOON_CTRLQ_CLASS_FREE_PAGE:
+		if (cmd == VIRTIO_BALLOON_FREE_PAGE_F_STOP) {
+			vb->report_free_page_stop = true;
+		} else if (cmd == VIRTIO_BALLOON_FREE_PAGE_F_START) {
+			vb->report_free_page_stop = false;
+			queue_work(vb->balloon_wq, &vb->report_free_page_work);
+		}
+		vb->free_page_cmd_in.class =
+					VIRTIO_BALLOON_CTRLQ_CLASS_FREE_PAGE;
+		ctrlq_send_cmd(vb, &vb->free_page_cmd_in, true);
+	break;
+	default:
+		dev_warn(&vb->vdev->dev, "%s: cmd class not supported\n",
+			 __func__);
+	}
+}
+
+static int init_vqs(struct virtio_balloon *vb)
+{
+	struct virtqueue **vqs;
+	vq_callback_t **callbacks;
+	const char **names;
+	struct scatterlist sg;
+	int i, nvqs, err = -ENOMEM;
+
+	/* Inflateq and deflateq are used unconditionally */
+	nvqs = 2;
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ))
+		nvqs++;
+	/* If ctrlq is enabled, the free page vq will also be created */
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_CTRL_VQ))
+		nvqs += 2;
+
+	/* Allocate space for find_vqs parameters */
+	vqs = kcalloc(nvqs, sizeof(*vqs), GFP_KERNEL);
+	if (!vqs)
+		goto err_vq;
+	callbacks = kmalloc_array(nvqs, sizeof(*callbacks), GFP_KERNEL);
+	if (!callbacks)
+		goto err_callback;
+	names = kmalloc_array(nvqs, sizeof(*names), GFP_KERNEL);
+	if (!names)
+		goto err_names;
+
+	callbacks[0] = balloon_ack;
+	names[0] = "inflate";
+	callbacks[1] = balloon_ack;
+	names[1] = "deflate";
+
+	i = 2;
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
+		callbacks[i] = stats_request;
+		names[i] = "stats";
+		i++;
+	}
+
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_CTRL_VQ)) {
+		callbacks[i] = ctrlq_handle;
+		names[i++] = "ctrlq";
+		callbacks[i] = NULL;
+		names[i] = "free_page_vq";
+	}
+
+	err = vb->vdev->config->find_vqs(vb->vdev, nvqs, vqs, callbacks, names,
+					 NULL, NULL);
 	if (err)
-		return err;
+		goto err_find;
 
 	vb->inflate_vq = vqs[0];
 	vb->deflate_vq = vqs[1];
+	i = 2;
 	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
-		struct scatterlist sg;
-		unsigned int num_stats;
-		vb->stats_vq = vqs[2];
-
+		vb->stats_vq = vqs[i++];
 		/*
 		 * Prime this virtqueue with one buffer so the hypervisor can
 		 * use it to signal us later (it can't be broken yet!).
 		 */
-		num_stats = update_balloon_stats(vb);
-
-		sg_init_one(&sg, vb->stats, sizeof(vb->stats[0]) * num_stats);
+		sg_init_one(&sg, vb->stats, sizeof(vb->stats));
 		if (virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb, GFP_KERNEL)
-		    < 0)
-			BUG();
+		    < 0) {
+			dev_warn(&vb->vdev->dev, "%s: add stat_vq failed\n",
+				 __func__);
+			goto err_find;
+		}
 		virtqueue_kick(vb->stats_vq);
 	}
+
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_CTRL_VQ)) {
+		vb->ctrl_vq = vqs[i++];
+		vb->free_page_vq = vqs[i];
+		/* Prime the ctrlq with an inbuf for the host to send a cmd */
+		vb->free_page_cmd_in.class =
+					VIRTIO_BALLOON_CTRLQ_CLASS_FREE_PAGE;
+		ctrlq_send_cmd(vb, &vb->free_page_cmd_in, true);
+	}
+
+	kfree(names);
+	kfree(callbacks);
+	kfree(vqs);
 	return 0;
+
+err_find:
+	kfree(names);
+err_names:
+	kfree(callbacks);
+err_callback:
+	kfree(vqs);
+err_vq:
+	return err;
 }
 
 #ifdef CONFIG_BALLOON_COMPACTION
@@ -706,6 +906,13 @@  static int virtballoon_probe(struct virtio_device *vdev)
 	if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_SG))
 		xb_init(&vb->page_xb);
 
+	if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_CTRL_VQ)) {
+		vb->balloon_wq = alloc_workqueue("balloon-wq",
+					WQ_FREEZABLE | WQ_CPU_INTENSIVE, 0);
+		INIT_WORK(&vb->report_free_page_work, report_free_page);
+		vb->report_free_page_stop = true;
+	}
+
 	vb->nb.notifier_call = virtballoon_oom_notify;
 	vb->nb.priority = VIRTBALLOON_OOM_NOTIFY_PRIORITY;
 	err = register_oom_notifier(&vb->nb);
@@ -770,6 +977,7 @@  static void virtballoon_remove(struct virtio_device *vdev)
 	spin_unlock_irq(&vb->stop_update_lock);
 	cancel_work_sync(&vb->update_balloon_size_work);
 	cancel_work_sync(&vb->update_balloon_stats_work);
+	cancel_work_sync(&vb->report_free_page_work);
 
 	remove_common(vb);
 #ifdef CONFIG_BALLOON_COMPACTION
@@ -823,6 +1031,7 @@  static unsigned int features[] = {
 	VIRTIO_BALLOON_F_STATS_VQ,
 	VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
 	VIRTIO_BALLOON_F_SG,
+	VIRTIO_BALLOON_F_CTRL_VQ,
 };
 
 static struct virtio_driver virtio_balloon_driver = {
diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
index 37780a7..dbf0616 100644
--- a/include/uapi/linux/virtio_balloon.h
+++ b/include/uapi/linux/virtio_balloon.h
@@ -35,6 +35,7 @@ 
 #define VIRTIO_BALLOON_F_STATS_VQ	1 /* Memory Stats virtqueue */
 #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM	2 /* Deflate balloon on OOM */
 #define VIRTIO_BALLOON_F_SG		3 /* Use sg instead of PFN lists */
+#define VIRTIO_BALLOON_F_CTRL_VQ	4 /* Control Virtqueue */
 
 /* Size of a PFN in the balloon interface. */
 #define VIRTIO_BALLOON_PFN_SHIFT 12
@@ -83,4 +84,18 @@  struct virtio_balloon_stat {
 	__virtio64 val;
 } __attribute__((packed));
 
+enum {
+	VIRTIO_BALLOON_CTRLQ_CLASS_FREE_PAGE = 0,
+	VIRTIO_BALLOON_CTRLQ_CLASS_MAX,
+};
+
+struct virtio_balloon_ctrlq_cmd {
+	__virtio32 class;
+	__virtio32 cmd;
+};
+
+/* Ctrlq commands related to VIRTIO_BALLOON_CTRLQ_CLASS_FREE_PAGE */
+#define VIRTIO_BALLOON_FREE_PAGE_F_STOP		0
+#define VIRTIO_BALLOON_FREE_PAGE_F_START	1
+
 #endif /* _LINUX_VIRTIO_BALLOON_H */