[v11,6/6] virtio-balloon: VIRTIO_BALLOON_F_CMD_VQ

Message ID 1497004901-30593-7-git-send-email-wei.w.wang@intel.com (mailing list archive)
State New, archived

Commit Message

Wang, Wei W June 9, 2017, 10:41 a.m. UTC
Add a new vq, cmdq, to handle requests between the device and driver.

This patch implements two commands that are sent from the device and
handled in the driver.
1) cmd VIRTIO_BALLOON_CMDQ_REPORT_STATS: this command is used to report
the guest memory statistics to the host. The stats_vq mechanism is not
used when the cmdq mechanism is enabled.
2) cmd VIRTIO_BALLOON_CMDQ_REPORT_UNUSED_PAGES: this command is used to
report the guest unused pages to the host.

Signed-off-by: Wei Wang <wei.w.wang@intel.com>
---
 drivers/virtio/virtio_balloon.c     | 363 ++++++++++++++++++++++++++++++++----
 include/uapi/linux/virtio_balloon.h |  13 ++
 2 files changed, 337 insertions(+), 39 deletions(-)

Comments

Dave Hansen June 12, 2017, 2:07 p.m. UTC | #1
On 06/09/2017 03:41 AM, Wei Wang wrote:
> +	for_each_populated_zone(zone) {
> +		for (order = MAX_ORDER - 1; order > 0; order--) {
> +			for (migratetype = 0; migratetype < MIGRATE_TYPES;
> +			     migratetype++) {
> +				do {
> +					ret = report_unused_page_block(zone,
> +						order, migratetype, &page);
> +					if (!ret) {
> +						pfn = (u64)page_to_pfn(page);
> +						add_one_chunk(vb, vq,
> +						PAGE_CHUNK_UNUSED_PAGE,
> +						pfn << VIRTIO_BALLOON_PFN_SHIFT,
> +						(u64)(1 << order) *
> +						VIRTIO_BALLOON_PAGES_PER_PAGE);
> +					}
> +				} while (!ret);
> +			}
> +		}
> +	}

This is pretty unreadable.    Please add some indentation.  If you go
over 80 cols, then you might need to break this up into a separate
function.  But, either way, it can't be left like this.
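
Maybe something like this (untested sketch; the helper name is just for
illustration):

	static void add_unused_page_chunks(struct virtio_balloon *vb,
					   struct virtqueue *vq,
					   struct zone *zone,
					   unsigned int order,
					   unsigned int migratetype)
	{
		struct page *page = NULL;
		u64 pfn;

		/* Report blocks until none of this order/migratetype is left */
		while (!report_unused_page_block(zone, order, migratetype,
						 &page)) {
			pfn = (u64)page_to_pfn(page);
			add_one_chunk(vb, vq, PAGE_CHUNK_UNUSED_PAGE,
				      pfn << VIRTIO_BALLOON_PFN_SHIFT,
				      (u64)(1 << order) *
				      VIRTIO_BALLOON_PAGES_PER_PAGE);
		}
	}

Then the nested loops fit in 80 columns:

	for_each_populated_zone(zone)
		for (order = MAX_ORDER - 1; order > 0; order--)
			for (migratetype = 0; migratetype < MIGRATE_TYPES;
			     migratetype++)
				add_unused_page_chunks(vb, vq, zone, order,
						       migratetype);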
Wang, Wei W June 13, 2017, 10:17 a.m. UTC | #2
On 06/12/2017 10:07 PM, Dave Hansen wrote:
> On 06/09/2017 03:41 AM, Wei Wang wrote:
>> +	for_each_populated_zone(zone) {
>> +		for (order = MAX_ORDER - 1; order > 0; order--) {
>> +			for (migratetype = 0; migratetype < MIGRATE_TYPES;
>> +			     migratetype++) {
>> +				do {
>> +					ret = report_unused_page_block(zone,
>> +						order, migratetype, &page);
>> +					if (!ret) {
>> +						pfn = (u64)page_to_pfn(page);
>> +						add_one_chunk(vb, vq,
>> +					PAGE_CHUNK_UNUSED_PAGE,
>> +						pfn << VIRTIO_BALLOON_PFN_SHIFT,
>> +						(u64)(1 << order) *
>> +						VIRTIO_BALLOON_PAGES_PER_PAGE);
>> +					}
>> +				} while (!ret);
>> +			}
>> +		}
>> +	}
> This is pretty unreadable.    Please add some indentation.  If you go
> over 80 cols, then you might need to break this up into a separate
> function.  But, either way, it can't be left like this.

OK, I'll re-arrange it.

Best,
Wei
Michael S. Tsirkin June 20, 2017, 4:18 p.m. UTC | #3
On Fri, Jun 09, 2017 at 06:41:41PM +0800, Wei Wang wrote:
> -	if (!virtqueue_indirect_desc_table_add(vq, desc, num)) {
> +	if (!virtqueue_indirect_desc_table_add(vq, desc, *num)) {
>  		virtqueue_kick(vq);
> -		wait_event(vb->acked, virtqueue_get_buf(vq, &len));
> -		vb->balloon_page_chunk.chunk_num = 0;
> +		if (busy_wait)
> +			while (!virtqueue_get_buf(vq, &len) &&
> +			       !virtqueue_is_broken(vq))
> +				cpu_relax();
> +		else
> +			wait_event(vb->acked, virtqueue_get_buf(vq, &len));


This is something I didn't previously notice.
As you always keep a single buffer in flight, you do not
really need indirect at all. Just add all descriptors
in the ring directly, then kick.

E.g.
	virtqueue_add_first
	virtqueue_add_next
	virtqueue_add_last

?

You also want a flag to avoid allocations but there's no need to do it
per descriptor, set it on vq.
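
Usage from the balloon driver would then be roughly (sketch only; the
chunk array and header variables are placeholders):

	unsigned int idx, i;

	/* Chain descriptors directly in the ring, no indirect table. */
	virtqueue_add_first(vq, hdr_addr, hdr_len, false, &idx);
	for (i = 0; i < num_chunks - 1; i++)
		virtqueue_add_next(vq, chunk_addr[i], chunk_len[i],
				   false, &idx);
	virtqueue_add_last(vq, chunk_addr[i], chunk_len[i], false, &idx);
	virtqueue_kick(vq);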
Wang, Wei W June 21, 2017, 3:28 a.m. UTC | #4
On 06/21/2017 12:18 AM, Michael S. Tsirkin wrote:
> On Fri, Jun 09, 2017 at 06:41:41PM +0800, Wei Wang wrote:
>> -	if (!virtqueue_indirect_desc_table_add(vq, desc, num)) {
>> +	if (!virtqueue_indirect_desc_table_add(vq, desc, *num)) {
>>   		virtqueue_kick(vq);
>> -		wait_event(vb->acked, virtqueue_get_buf(vq, &len));
>> -		vb->balloon_page_chunk.chunk_num = 0;
>> +		if (busy_wait)
>> +			while (!virtqueue_get_buf(vq, &len) &&
>> +			       !virtqueue_is_broken(vq))
>> +				cpu_relax();
>> +		else
>> +			wait_event(vb->acked, virtqueue_get_buf(vq, &len));
>
> This is something I didn't previously notice.
> As you always keep a single buffer in flight, you do not
> really need indirect at all. Just add all descriptors
> in the ring directly, then kick.
>
> E.g.
> 	virtqueue_add_first
> 	virtqueue_add_next
> 	virtqueue_add_last
>
> ?
>
> You also want a flag to avoid allocations but there's no need to do it
> per descriptor, set it on vq.
>

Without using the indirect table, I'm thinking about changing to use
the standard sg (i.e. struct scatterlist), instead of vring_desc, so that
we don't need to modify or add any new functions of virtqueue_add().

In this case, we will kmalloc an array of sgs in probe(), and we can add
the sgs one by one to the vq, which won't trigger the allocation of an
indirect table inside virtqueue_add(), and then kick when all are added.
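
For example, roughly like this (sketch; the sg array and chunk buffers
are illustrative, preallocated in probe()):

	struct scatterlist *sgs = vb->cmdq_sgs;
	unsigned int i;

	for (i = 0; i < num; i++) {
		sg_init_one(&sgs[i], bufs[i], lens[i]);
		/*
		 * A single sg per call keeps virtqueue_add() on the
		 * direct descriptor path, so no indirect table is
		 * allocated.
		 */
		virtqueue_add_outbuf(vq, &sgs[i], 1, vb, GFP_KERNEL);
	}
	virtqueue_kick(vq);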

Best,
Wei
Michael S. Tsirkin June 21, 2017, 12:28 p.m. UTC | #5
On Wed, Jun 21, 2017 at 11:28:00AM +0800, Wei Wang wrote:
> On 06/21/2017 12:18 AM, Michael S. Tsirkin wrote:
> > On Fri, Jun 09, 2017 at 06:41:41PM +0800, Wei Wang wrote:
> > > -	if (!virtqueue_indirect_desc_table_add(vq, desc, num)) {
> > > +	if (!virtqueue_indirect_desc_table_add(vq, desc, *num)) {
> > >   		virtqueue_kick(vq);
> > > -		wait_event(vb->acked, virtqueue_get_buf(vq, &len));
> > > -		vb->balloon_page_chunk.chunk_num = 0;
> > > +		if (busy_wait)
> > > +			while (!virtqueue_get_buf(vq, &len) &&
> > > +			       !virtqueue_is_broken(vq))
> > > +				cpu_relax();
> > > +		else
> > > +			wait_event(vb->acked, virtqueue_get_buf(vq, &len));
> > 
> > This is something I didn't previously notice.
> > As you always keep a single buffer in flight, you do not
> > really need indirect at all. Just add all descriptors
> > in the ring directly, then kick.
> > 
> > E.g.
> > 	virtqueue_add_first
> > 	virtqueue_add_next
> > 	virtqueue_add_last
> > 
> > ?
> > 
> > You also want a flag to avoid allocations but there's no need to do it
> > per descriptor, set it on vq.
> > 
> 
> Without using the indirect table, I'm thinking about changing to use
> the standard sg (i.e. struct scatterlist), instead of vring_desc, so that
> we don't need to modify or add any new functions of virtqueue_add().
> 
> In this case, we will kmalloc an array of sgs in probe(), and we can add
> the sgs one by one to the vq, which won't trigger the allocation of an
> indirect table inside virtqueue_add(), and then kick when all are added.
> 
> Best,
> Wei

And allocate headers too? This can work. API extensions aren't
necessarily a bad idea though. The API I suggest above is preferable
for the simple reason that it can work without INDIRECT flag
support in hypervisor.

I wonder which APIs Nitesh would find useful.
Wang, Wei W June 22, 2017, 8:40 a.m. UTC | #6
On 06/21/2017 08:28 PM, Michael S. Tsirkin wrote:
> On Wed, Jun 21, 2017 at 11:28:00AM +0800, Wei Wang wrote:
>> On 06/21/2017 12:18 AM, Michael S. Tsirkin wrote:
>>> On Fri, Jun 09, 2017 at 06:41:41PM +0800, Wei Wang wrote:
>>>> -	if (!virtqueue_indirect_desc_table_add(vq, desc, num)) {
>>>> +	if (!virtqueue_indirect_desc_table_add(vq, desc, *num)) {
>>>>    		virtqueue_kick(vq);
>>>> -		wait_event(vb->acked, virtqueue_get_buf(vq, &len));
>>>> -		vb->balloon_page_chunk.chunk_num = 0;
>>>> +		if (busy_wait)
>>>> +			while (!virtqueue_get_buf(vq, &len) &&
>>>> +			       !virtqueue_is_broken(vq))
>>>> +				cpu_relax();
>>>> +		else
>>>> +			wait_event(vb->acked, virtqueue_get_buf(vq, &len));
>>> This is something I didn't previously notice.
>>> As you always keep a single buffer in flight, you do not
>>> really need indirect at all. Just add all descriptors
>>> in the ring directly, then kick.
>>>
>>> E.g.
>>> 	virtqueue_add_first
>>> 	virtqueue_add_next
>>> 	virtqueue_add_last
>>>
>>> ?
>>>
>>> You also want a flag to avoid allocations but there's no need to do it
>>> per descriptor, set it on vq.
>>>
>> Without using the indirect table, I'm thinking about changing to use
>> the standard sg (i.e. struct scatterlist), instead of vring_desc, so that
>> we don't need to modify or add any new functions of virtqueue_add().
>>
>> In this case, we will kmalloc an array of sgs in probe(), and we can add
>> the sgs one by one to the vq, which won't trigger the allocation of an
>> indirect table inside virtqueue_add(), and then kick when all are added.
>>
>> Best,
>> Wei
> And allocate headers too? This can work. API extensions aren't
> necessarily a bad idea though. The API I suggest above is preferable
> for the simple reason that it can work without INDIRECT flag
> support in hypervisor.

OK, probably we don't need to add a desc to the vq - we can just use
the vq's desc, like this:

int virtqueue_add_first(struct virtqueue *_vq,
                                      uint64_t addr,
                                      uint32_t len,
                                      bool in,
                                      unsigned int *idx) {

     ...
    uint16_t desc_flags = in ? VRING_DESC_F_NEXT | VRING_DESC_F_WRITE :
                                              VRING_DESC_F_NEXT;

     vq->vring.desc[vq->free_head].addr = addr;
     vq->vring.desc[vq->free_head].len = len;
     vq->vring.desc[vq->free_head].flags = cpu_to_virtio16(_vq->vdev,
                                                           desc_flags);
     /* return to the caller the desc id */
     *idx = vq->free_head;
     ...
}

int virtqueue_add_next(struct virtqueue *_vq,
                                      uint64_t addr,
                                      uint32_t len,
                                      bool in,
                                      bool end,
                                      unsigned int *idx) {
     ...
     vq->vring.desc[*idx].next = vq->free_head;
     vq->vring.desc[vq->free_head].addr = addr;
     ...
     if (end)
         remove the VRING_DESC_F_NEXT flag
}


What do you think? We can also combine the two functions into one.



Best,
Wei
Michael S. Tsirkin June 28, 2017, 3:01 p.m. UTC | #7
On Thu, Jun 22, 2017 at 04:40:39PM +0800, Wei Wang wrote:
> On 06/21/2017 08:28 PM, Michael S. Tsirkin wrote:
> > On Wed, Jun 21, 2017 at 11:28:00AM +0800, Wei Wang wrote:
> > > On 06/21/2017 12:18 AM, Michael S. Tsirkin wrote:
> > > > On Fri, Jun 09, 2017 at 06:41:41PM +0800, Wei Wang wrote:
> > > > > -	if (!virtqueue_indirect_desc_table_add(vq, desc, num)) {
> > > > > +	if (!virtqueue_indirect_desc_table_add(vq, desc, *num)) {
> > > > >    		virtqueue_kick(vq);
> > > > > -		wait_event(vb->acked, virtqueue_get_buf(vq, &len));
> > > > > -		vb->balloon_page_chunk.chunk_num = 0;
> > > > > +		if (busy_wait)
> > > > > +			while (!virtqueue_get_buf(vq, &len) &&
> > > > > +			       !virtqueue_is_broken(vq))
> > > > > +				cpu_relax();
> > > > > +		else
> > > > > +			wait_event(vb->acked, virtqueue_get_buf(vq, &len));
> > > > This is something I didn't previously notice.
> > > > As you always keep a single buffer in flight, you do not
> > > > really need indirect at all. Just add all descriptors
> > > > in the ring directly, then kick.
> > > > 
> > > > E.g.
> > > > 	virtqueue_add_first
> > > > 	virtqueue_add_next
> > > > 	virtqueue_add_last
> > > > 
> > > > ?
> > > > 
> > > > You also want a flag to avoid allocations but there's no need to do it
> > > > per descriptor, set it on vq.
> > > > 
> > > Without using the indirect table, I'm thinking about changing to use
> > > the standard sg (i.e. struct scatterlist), instead of vring_desc, so that
> > > we don't need to modify or add any new functions of virtqueue_add().
> > > 
> > > In this case, we will kmalloc an array of sgs in probe(), and we can add
> > > the sgs one by one to the vq, which won't trigger the allocation of an
> > > indirect table inside virtqueue_add(), and then kick when all are added.
> > > 
> > > Best,
> > > Wei
> > And allocate headers too? This can work. API extensions aren't
> > necessarily a bad idea though. The API I suggest above is preferable
> > for the simple reason that it can work without INDIRECT flag
> > support in hypervisor.
> 
> OK, probably we don't need to add a desc to the vq - we can just use
> the vq's desc, like this:
> 
> int virtqueue_add_first(struct virtqueue *_vq,
>                                      uint64_t addr,
>                                      uint32_t len,
>                                      bool in,
>                                      unsigned int *idx) {
> 
>     ...
>    uint16_t desc_flags = in ? VRING_DESC_F_NEXT | VRING_DESC_F_WRITE :
>                                              VRING_DESC_F_NEXT;
> 
>     vq->vring.desc[vq->free_head].addr = addr;
>     vq->vring.desc[vq->free_head].len = len;
>     vq->vring.desc[vq->free_head].flags = cpu_to_virtio16(_vq->vdev, desc_flags);
>     /* return to the caller the desc id */
>     *idx = vq->free_head;
>     ...
> }
> 
> int virtqueue_add_next(struct virtqueue *_vq,
>                                      uint64_t addr,
>                                      uint32_t len,
>                                      bool in,
>                                      bool end,
>                                      unsigned int *idx) {
>     ...
>     vq->vring.desc[*idx].next = vq->free_head;
>     vq->vring.desc[vq->free_head].addr = addr;
>     ...
>     if (end)
>         remove the VRING_DESC_F_NEXT flag
> }
> 

And I would say add-last.

> 
> What do you think? We can also combine the two functions into one.
> 
> 
> 
> Best,
> Wei

With an enum? Yes that's also an option.
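
E.g. something like (sketch only, names illustrative):

	enum vq_desc_pos {
		VQ_DESC_FIRST,
		VQ_DESC_MID,
		VQ_DESC_LAST,	/* clears VRING_DESC_F_NEXT */
	};

	int virtqueue_add_desc(struct virtqueue *_vq, uint64_t addr,
			       uint32_t len, bool in,
			       enum vq_desc_pos pos, unsigned int *idx);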
Wang, Wei W July 12, 2017, 12:57 p.m. UTC | #8
On 06/28/2017 11:01 PM, Michael S. Tsirkin wrote:
> On Thu, Jun 22, 2017 at 04:40:39PM +0800, Wei Wang wrote:
>> On 06/21/2017 08:28 PM, Michael S. Tsirkin wrote:
>>> On Wed, Jun 21, 2017 at 11:28:00AM +0800, Wei Wang wrote:
>>>> On 06/21/2017 12:18 AM, Michael S. Tsirkin wrote:
>>>>> On Fri, Jun 09, 2017 at 06:41:41PM +0800, Wei Wang wrote:
>>>>>> -	if (!virtqueue_indirect_desc_table_add(vq, desc, num)) {
>>>>>> +	if (!virtqueue_indirect_desc_table_add(vq, desc, *num)) {
>>>>>>     		virtqueue_kick(vq);
>>>>>> -		wait_event(vb->acked, virtqueue_get_buf(vq, &len));
>>>>>> -		vb->balloon_page_chunk.chunk_num = 0;
>>>>>> +		if (busy_wait)
>>>>>> +			while (!virtqueue_get_buf(vq, &len) &&
>>>>>> +			       !virtqueue_is_broken(vq))
>>>>>> +				cpu_relax();
>>>>>> +		else
>>>>>> +			wait_event(vb->acked, virtqueue_get_buf(vq, &len));
>>>>> This is something I didn't previously notice.
>>>>> As you always keep a single buffer in flight, you do not
>>>>> really need indirect at all. Just add all descriptors
>>>>> in the ring directly, then kick.
>>>>>
>>>>> E.g.
>>>>> 	virtqueue_add_first
>>>>> 	virtqueue_add_next
>>>>> 	virtqueue_add_last
>>>>>
>>>>> ?
>>>>>
>>>>> You also want a flag to avoid allocations but there's no need to do it
>>>>> per descriptor, set it on vq.
>>>>>
>>>> Without using the indirect table, I'm thinking about changing to use
>>>> the standard sg (i.e. struct scatterlist), instead of vring_desc, so that
>>>> we don't need to modify or add any new functions of virtqueue_add().
>>>>
>>>> In this case, we will kmalloc an array of sgs in probe(), and we can add
>>>> the sgs one by one to the vq, which won't trigger the allocation of an
>>>> indirect table inside virtqueue_add(), and then kick when all are added.
>>>>
>>>> Best,
>>>> Wei
>>> And allocate headers too? This can work. API extensions aren't
>>> necessarily a bad idea though. The API I suggest above is preferable
>>> for the simple reason that it can work without INDIRECT flag
>>> support in hypervisor.
>> OK, probably we don't need to add a desc to the vq - we can just use
>> the vq's desc, like this:
>>
>> int virtqueue_add_first(struct virtqueue *_vq,
>>                                       uint64_t addr,
>>                                       uint32_t len,
>>                                       bool in,
>>                                       unsigned int *idx) {
>>
>>      ...
>>     uint16_t desc_flags = in ? VRING_DESC_F_NEXT | VRING_DESC_F_WRITE :
>>                                               VRING_DESC_F_NEXT;
>>
>>      vq->vring.desc[vq->free_head].addr = addr;
>>      vq->vring.desc[vq->free_head].len = len;
>>      vq->vring.desc[vq->free_head].flags = cpu_to_virtio16(_vq->vdev, desc_flags);
>>      /* return to the caller the desc id */
>>      *idx = vq->free_head;
>>      ...
>> }
>>
>> int virtqueue_add_next(struct virtqueue *_vq,
>>                                       uint64_t addr,
>>                                       uint32_t len,
>>                                       bool in,
>>                                       bool end,
>>                                       unsigned int *idx) {
>>      ...
>>      vq->vring.desc[*idx].next = vq->free_head;
>>      vq->vring.desc[vq->free_head].addr = addr;
>>      ...
>>      if (end)
>>          remove the VRING_DESC_F_NEXT flag
>> }
>>
> And I would say add-last.
>
>> What do you think? We can also combine the two functions into one.
>>
>>
>>
>> Best,
>> Wei
> With an enum? Yes that's also an option.
>

Thanks for the suggestion. I shifted it a little bit; please have a look at
the latest v12 patches that I just sent out.

Best,
Wei

Patch

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 0cf945c..4ac90a5 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -51,6 +51,10 @@  MODULE_PARM_DESC(oom_pages, "pages to free on OOM");
 static struct vfsmount *balloon_mnt;
 #endif
 
+/* Types of pages to chunk */
+#define PAGE_CHUNK_TYPE_BALLOON 0	/* Chunk of inflate/deflate pages */
+#define PAGE_CHUNK_UNUSED_PAGE  1	/* Chunk of unused pages */
+
 /* The size of one page_bmap used to record inflated/deflated pages. */
 #define VIRTIO_BALLOON_PAGE_BMAP_SIZE	(8 * PAGE_SIZE)
 /*
@@ -81,12 +85,25 @@  struct virtio_balloon_page_chunk {
 	unsigned long *page_bmap[VIRTIO_BALLOON_PAGE_BMAP_MAX_NUM];
 };
 
+struct virtio_balloon_cmdq_unused_page {
+	struct virtio_balloon_cmdq_hdr hdr;
+	struct vring_desc *desc_table;
+	/* Number of added descriptors */
+	unsigned int num;
+};
+
+struct virtio_balloon_cmdq_stats {
+	struct virtio_balloon_cmdq_hdr hdr;
+	struct virtio_balloon_stat stats[VIRTIO_BALLOON_S_NR];
+};
+
 struct virtio_balloon {
 	struct virtio_device *vdev;
-	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq;
+	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *cmd_vq;
 
 	/* The balloon servicing is delegated to a freezable workqueue. */
 	struct work_struct update_balloon_stats_work;
+	struct work_struct cmdq_handle_work;
 	struct work_struct update_balloon_size_work;
 
 	/* Prevent updating balloon when it is being canceled. */
@@ -115,8 +132,10 @@  struct virtio_balloon {
 	unsigned int num_pfns;
 	__virtio32 pfns[VIRTIO_BALLOON_ARRAY_PFNS_MAX];
 
-	/* Memory statistics */
-	struct virtio_balloon_stat stats[VIRTIO_BALLOON_S_NR];
+	/* Cmdq msg buffer for memory statistics */
+	struct virtio_balloon_cmdq_stats cmdq_stats;
+	/* Cmdq msg buffer for reporting unused pages */
+	struct virtio_balloon_cmdq_unused_page cmdq_unused_page;
 
 	/* To register callback in oom notifier call chain */
 	struct notifier_block nb;
@@ -208,31 +227,77 @@  static void clear_page_bmap(struct virtio_balloon *vb,
 		       VIRTIO_BALLOON_PAGE_BMAP_SIZE);
 }
 
-static void send_page_chunks(struct virtio_balloon *vb, struct virtqueue *vq)
+static void send_page_chunks(struct virtio_balloon *vb, struct virtqueue *vq,
+			     int type, bool busy_wait)
 {
-	unsigned int len, num;
-	struct vring_desc *desc = vb->balloon_page_chunk.desc_table;
+	unsigned int len, *num, reset_num;
+	struct vring_desc *desc;
+
+	switch (type) {
+	case PAGE_CHUNK_TYPE_BALLOON:
+		desc = vb->balloon_page_chunk.desc_table;
+		num = &vb->balloon_page_chunk.chunk_num;
+		reset_num = 0;
+		break;
+	case PAGE_CHUNK_UNUSED_PAGE:
+		desc = vb->cmdq_unused_page.desc_table;
+		num = &vb->cmdq_unused_page.num;
+		/*
+		 * The first desc is used for the cmdq_hdr, so chunks will be
+		 * added from the second desc.
+		 */
+		reset_num = 1;
+		break;
+	default:
+		dev_warn(&vb->vdev->dev, "%s: unknown page chunk type %d\n",
+			 __func__, type);
+		return;
+	}
 
-	num = vb->balloon_page_chunk.chunk_num;
-	if (!virtqueue_indirect_desc_table_add(vq, desc, num)) {
+	if (!virtqueue_indirect_desc_table_add(vq, desc, *num)) {
 		virtqueue_kick(vq);
-		wait_event(vb->acked, virtqueue_get_buf(vq, &len));
-		vb->balloon_page_chunk.chunk_num = 0;
+		if (busy_wait)
+			while (!virtqueue_get_buf(vq, &len) &&
+			       !virtqueue_is_broken(vq))
+				cpu_relax();
+		else
+			wait_event(vb->acked, virtqueue_get_buf(vq, &len));
+		/*
+		 * Now the descriptors have been delivered to the host. Reset
+		 * the field that records the number of added descriptors, so
+		 * that newly added descriptors can be re-counted.
+		 */
+		*num = reset_num;
 	}
 }
 
 /* Add a chunk to the buffer. */
 static void add_one_chunk(struct virtio_balloon *vb, struct virtqueue *vq,
-			  u64 base_addr, u32 size)
+			  int type, u64 base_addr, u32 size)
 {
-	unsigned int *num = &vb->balloon_page_chunk.chunk_num;
-	struct vring_desc *desc = &vb->balloon_page_chunk.desc_table[*num];
+	unsigned int *num;
+	struct vring_desc *desc;
+
+	switch (type) {
+	case PAGE_CHUNK_TYPE_BALLOON:
+		num = &vb->balloon_page_chunk.chunk_num;
+		desc = &vb->balloon_page_chunk.desc_table[*num];
+		break;
+	case PAGE_CHUNK_UNUSED_PAGE:
+		num = &vb->cmdq_unused_page.num;
+		desc = &vb->cmdq_unused_page.desc_table[*num];
+		break;
+	default:
+		dev_warn(&vb->vdev->dev, "%s: chunk %d of unknown pages\n",
+			 __func__, type);
+		return;
+	}
 
 	desc->addr = cpu_to_virtio64(vb->vdev, base_addr);
 	desc->len = cpu_to_virtio32(vb->vdev, size);
 	*num += 1;
 	if (*num == VIRTIO_BALLOON_MAX_PAGE_CHUNKS)
-		send_page_chunks(vb, vq);
+		send_page_chunks(vb, vq, type, false);
 }
 
 static void convert_bmap_to_chunks(struct virtio_balloon *vb,
@@ -264,7 +329,8 @@  static void convert_bmap_to_chunks(struct virtio_balloon *vb,
 		chunk_base_addr = (pfn_start + next_one) <<
 				  VIRTIO_BALLOON_PFN_SHIFT;
 		if (chunk_size) {
-			add_one_chunk(vb, vq, chunk_base_addr, chunk_size);
+			add_one_chunk(vb, vq, PAGE_CHUNK_TYPE_BALLOON,
+				      chunk_base_addr, chunk_size);
 			pos += next_zero + 1;
 		}
 	}
@@ -311,7 +377,7 @@  static void tell_host_from_page_bmap(struct virtio_balloon *vb,
 				       pfn_num);
 	}
 	if (vb->balloon_page_chunk.chunk_num > 0)
-		send_page_chunks(vb, vq);
+		send_page_chunks(vb, vq, PAGE_CHUNK_TYPE_BALLOON, false);
 }
 
 static void set_page_pfns(struct virtio_balloon *vb,
@@ -516,8 +582,8 @@  static inline void update_stat(struct virtio_balloon *vb, int idx,
 			       u16 tag, u64 val)
 {
 	BUG_ON(idx >= VIRTIO_BALLOON_S_NR);
-	vb->stats[idx].tag = cpu_to_virtio16(vb->vdev, tag);
-	vb->stats[idx].val = cpu_to_virtio64(vb->vdev, val);
+	vb->cmdq_stats.stats[idx].tag = cpu_to_virtio16(vb->vdev, tag);
+	vb->cmdq_stats.stats[idx].val = cpu_to_virtio64(vb->vdev, val);
 }
 
 #define pages_to_bytes(x) ((u64)(x) << PAGE_SHIFT)
@@ -582,7 +648,8 @@  static void stats_handle_request(struct virtio_balloon *vb)
 	vq = vb->stats_vq;
 	if (!virtqueue_get_buf(vq, &len))
 		return;
-	sg_init_one(&sg, vb->stats, sizeof(vb->stats[0]) * num_stats);
+	sg_init_one(&sg, vb->cmdq_stats.stats,
+		    sizeof(vb->cmdq_stats.stats[0]) * num_stats);
 	virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL);
 	virtqueue_kick(vq);
 }
@@ -686,43 +753,216 @@  static void update_balloon_size_func(struct work_struct *work)
 		queue_work(system_freezable_wq, work);
 }
 
+static void cmdq_handle_stats(struct virtio_balloon *vb)
+{
+	struct scatterlist sg;
+	unsigned int num_stats;
+
+	spin_lock(&vb->stop_update_lock);
+	if (!vb->stop_update) {
+		num_stats = update_balloon_stats(vb);
+		sg_init_one(&sg, &vb->cmdq_stats,
+			    sizeof(struct virtio_balloon_cmdq_hdr) +
+			    sizeof(struct virtio_balloon_stat) * num_stats);
+		virtqueue_add_outbuf(vb->cmd_vq, &sg, 1, vb, GFP_KERNEL);
+		virtqueue_kick(vb->cmd_vq);
+	}
+	spin_unlock(&vb->stop_update_lock);
+}
+
+/*
+ * The header part of the message buffer is given to the device to send a
+ * command to the driver.
+ */
+static void host_cmd_buf_add(struct virtio_balloon *vb,
+			   struct virtio_balloon_cmdq_hdr *hdr)
+{
+	struct scatterlist sg;
+
+	hdr->flags = 0;
+	sg_init_one(&sg, hdr, VIRTIO_BALLOON_CMDQ_HDR_SIZE);
+
+	if (virtqueue_add_inbuf(vb->cmd_vq, &sg, 1, hdr, GFP_KERNEL) < 0) {
+		__virtio_clear_bit(vb->vdev,
+				   VIRTIO_BALLOON_F_CMD_VQ);
+		dev_warn(&vb->vdev->dev, "%s: add cmdq msg buf err\n",
+			 __func__);
+		return;
+	}
+
+	virtqueue_kick(vb->cmd_vq);
+}
+
+static void cmdq_handle_unused_pages(struct virtio_balloon *vb)
+{
+	struct virtqueue *vq = vb->cmd_vq;
+	struct vring_desc *hdr_desc = &vb->cmdq_unused_page.desc_table[0];
+	unsigned long hdr_pa;
+	unsigned int order = 0, migratetype = 0;
+	struct zone *zone = NULL;
+	struct page *page = NULL;
+	u64 pfn;
+	int ret = 0;
+
+	/* Put the hdr to the first desc */
+	hdr_pa = virt_to_phys((void *)&vb->cmdq_unused_page.hdr);
+	hdr_desc->addr = cpu_to_virtio64(vb->vdev, hdr_pa);
+	hdr_desc->len = cpu_to_virtio32(vb->vdev,
+				sizeof(struct virtio_balloon_cmdq_hdr));
+	vb->cmdq_unused_page.num = 1;
+
+	for_each_populated_zone(zone) {
+		for (order = MAX_ORDER - 1; order > 0; order--) {
+			for (migratetype = 0; migratetype < MIGRATE_TYPES;
+			     migratetype++) {
+				do {
+					ret = report_unused_page_block(zone,
+						order, migratetype, &page);
+					if (!ret) {
+						pfn = (u64)page_to_pfn(page);
+						add_one_chunk(vb, vq,
+						PAGE_CHUNK_UNUSED_PAGE,
+						pfn << VIRTIO_BALLOON_PFN_SHIFT,
+						(u64)(1 << order) *
+						VIRTIO_BALLOON_PAGES_PER_PAGE);
+					}
+				} while (!ret);
+			}
+		}
+	}
+
+	/* Set the cmd completion flag. */
+	vb->cmdq_unused_page.hdr.flags |=
+				cpu_to_le32(VIRTIO_BALLOON_CMDQ_F_COMPLETION);
+	send_page_chunks(vb, vq, PAGE_CHUNK_UNUSED_PAGE, true);
+}
+
+static void cmdq_handle(struct virtio_balloon *vb)
+{
+	struct virtqueue *vq;
+	struct virtio_balloon_cmdq_hdr *hdr;
+	unsigned int len;
+
+	vq = vb->cmd_vq;
+	while ((hdr = (struct virtio_balloon_cmdq_hdr *)
+			virtqueue_get_buf(vq, &len)) != NULL) {
+		switch (hdr->cmd) {
+		case VIRTIO_BALLOON_CMDQ_REPORT_STATS:
+			cmdq_handle_stats(vb);
+			break;
+		case VIRTIO_BALLOON_CMDQ_REPORT_UNUSED_PAGES:
+			cmdq_handle_unused_pages(vb);
+			break;
+		default:
+			dev_warn(&vb->vdev->dev, "%s: wrong cmd\n", __func__);
+			return;
+		}
+		/*
+		 * Replenish all the command buffers to the device after a
+		 * command is handled, so that the device can conveniently
+		 * rewind the cmdq and get back all the command buffers
+		 * after live migration.
+		 */
+		host_cmd_buf_add(vb, &vb->cmdq_stats.hdr);
+		host_cmd_buf_add(vb, &vb->cmdq_unused_page.hdr);
+	}
+}
+
+static void cmdq_handle_work_func(struct work_struct *work)
+{
+	struct virtio_balloon *vb;
+
+	vb = container_of(work, struct virtio_balloon,
+			  cmdq_handle_work);
+	cmdq_handle(vb);
+}
+
+static void cmdq_callback(struct virtqueue *vq)
+{
+	struct virtio_balloon *vb = vq->vdev->priv;
+
+	queue_work(system_freezable_wq, &vb->cmdq_handle_work);
+}
+
 static int init_vqs(struct virtio_balloon *vb)
 {
-	struct virtqueue *vqs[3];
-	vq_callback_t *callbacks[] = { balloon_ack, balloon_ack, stats_request };
-	static const char * const names[] = { "inflate", "deflate", "stats" };
-	int err, nvqs;
+	struct virtqueue **vqs;
+	vq_callback_t **callbacks;
+	const char **names;
+	int err = -ENOMEM;
+	int nvqs;
+
+	/* Inflateq and deflateq are used unconditionally */
+	nvqs = 2;
+
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_CMD_VQ) ||
+	    virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ))
+		nvqs++;
+
+	/* Allocate space for find_vqs parameters */
+	vqs = kcalloc(nvqs, sizeof(*vqs), GFP_KERNEL);
+	if (!vqs)
+		goto err_vq;
+	callbacks = kmalloc_array(nvqs, sizeof(*callbacks), GFP_KERNEL);
+	if (!callbacks)
+		goto err_callback;
+	names = kmalloc_array(nvqs, sizeof(*names), GFP_KERNEL);
+	if (!names)
+		goto err_names;
+
+	callbacks[0] = balloon_ack;
+	names[0] = "inflate";
+	callbacks[1] = balloon_ack;
+	names[1] = "deflate";
 
 	/*
-	 * We expect two virtqueues: inflate and deflate, and
-	 * optionally stat.
+	 * The stats_vq is used only when cmdq is not supported (or disabled)
+	 * by the device.
 	 */
-	nvqs = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ) ? 3 : 2;
-	err = vb->vdev->config->find_vqs(vb->vdev, nvqs, vqs, callbacks, names,
-			NULL);
-	if (err)
-		return err;
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_CMD_VQ)) {
+		callbacks[2] = cmdq_callback;
+		names[2] = "cmdq";
+	} else if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
+		callbacks[2] = stats_request;
+		names[2] = "stats";
+	}
 
+	err = vb->vdev->config->find_vqs(vb->vdev, nvqs, vqs, callbacks,
+					 names, NULL);
+	if (err)
+		goto err_find;
 	vb->inflate_vq = vqs[0];
 	vb->deflate_vq = vqs[1];
-	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
+
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_CMD_VQ)) {
+		vb->cmd_vq = vqs[2];
+		/* Prime the cmdq with the command header buffers. */
+		host_cmd_buf_add(vb, &vb->cmdq_stats.hdr);
+		host_cmd_buf_add(vb, &vb->cmdq_unused_page.hdr);
+	} else if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
 		struct scatterlist sg;
-		unsigned int num_stats;
-		vb->stats_vq = vqs[2];
 
+		vb->stats_vq = vqs[2];
 		/*
 		 * Prime this virtqueue with one buffer so the hypervisor can
 		 * use it to signal us later (it can't be broken yet!).
 		 */
-		num_stats = update_balloon_stats(vb);
-
-		sg_init_one(&sg, vb->stats, sizeof(vb->stats[0]) * num_stats);
+		sg_init_one(&sg, vb->cmdq_stats.stats,
+			    sizeof(vb->cmdq_stats.stats));
 		if (virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb, GFP_KERNEL)
 		    < 0)
 			BUG();
 		virtqueue_kick(vb->stats_vq);
 	}
-	return 0;
+
+err_find:
+	kfree(names);
+err_names:
+	kfree(callbacks);
+err_callback:
+	kfree(vqs);
+err_vq:
+	return err;
 }
 
 #ifdef CONFIG_BALLOON_COMPACTION
@@ -730,7 +970,8 @@  static int init_vqs(struct virtio_balloon *vb)
 static void tell_host_one_page(struct virtio_balloon *vb,
 			       struct virtqueue *vq, struct page *page)
 {
-	add_one_chunk(vb, vq, page_to_pfn(page) << VIRTIO_BALLOON_PFN_SHIFT,
+	add_one_chunk(vb, vq, PAGE_CHUNK_TYPE_BALLOON,
+		      page_to_pfn(page) << VIRTIO_BALLOON_PFN_SHIFT,
 		      VIRTIO_BALLOON_PAGES_PER_PAGE);
 }
 
@@ -865,6 +1106,40 @@  static int balloon_page_chunk_init(struct virtio_balloon *vb)
 	return -ENOMEM;
 }
 
+/*
+ * Only one command of each type is in flight at a time. So, we allocate
+ * one message buffer for each type of command. The header part of the message
+ * buffer will be offered to the device, so that the device can send a command
+ * using the corresponding command buffer to the driver later.
+ */
+static int cmdq_init(struct virtio_balloon *vb)
+{
+	vb->cmdq_unused_page.desc_table = alloc_indirect(vb->vdev,
+						VIRTIO_BALLOON_MAX_PAGE_CHUNKS,
+						GFP_KERNEL);
+	if (!vb->cmdq_unused_page.desc_table) {
+		dev_warn(&vb->vdev->dev, "%s: failed\n", __func__);
+		__virtio_clear_bit(vb->vdev,
+				   VIRTIO_BALLOON_F_CMD_VQ);
+		return -ENOMEM;
+	}
+	vb->cmdq_unused_page.num = 0;
+
+	/*
+	 * The header is initialized to let the device know which type of
+	 * command buffer it receives. The device will later use a buffer
+	 * according to the type of command that it needs to send.
+	 */
+	vb->cmdq_stats.hdr.cmd = VIRTIO_BALLOON_CMDQ_REPORT_STATS;
+	vb->cmdq_stats.hdr.flags = 0;
+	vb->cmdq_unused_page.hdr.cmd = VIRTIO_BALLOON_CMDQ_REPORT_UNUSED_PAGES;
+	vb->cmdq_unused_page.hdr.flags = 0;
+
+	INIT_WORK(&vb->cmdq_handle_work, cmdq_handle_work_func);
+
+	return 0;
+}
+
 static int virtballoon_validate(struct virtio_device *vdev)
 {
 	struct virtio_balloon *vb = NULL;
@@ -883,6 +1158,11 @@  static int virtballoon_validate(struct virtio_device *vdev)
 			goto err_page_chunk;
 	}
 
+	if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_CMD_VQ)) {
+		err = cmdq_init(vb);
+		if (err < 0)
+			goto err_vb;
+	}
 	return 0;
 
 err_page_chunk:
@@ -902,7 +1182,10 @@  static int virtballoon_probe(struct virtio_device *vdev)
 		return -EINVAL;
 	}
 
-	INIT_WORK(&vb->update_balloon_stats_work, update_balloon_stats_func);
+	if (!virtio_has_feature(vdev, VIRTIO_BALLOON_F_CMD_VQ) &&
+	    virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ))
+		INIT_WORK(&vb->update_balloon_stats_work,
+			  update_balloon_stats_func);
 	INIT_WORK(&vb->update_balloon_size_work, update_balloon_size_func);
 	spin_lock_init(&vb->stop_update_lock);
 	vb->stop_update = false;
@@ -980,6 +1263,7 @@  static void virtballoon_remove(struct virtio_device *vdev)
 	spin_unlock_irq(&vb->stop_update_lock);
 	cancel_work_sync(&vb->update_balloon_size_work);
 	cancel_work_sync(&vb->update_balloon_stats_work);
+	cancel_work_sync(&vb->cmdq_handle_work);
 
 	remove_common(vb);
 	free_page_bmap(vb);
@@ -1029,6 +1313,7 @@  static unsigned int features[] = {
 	VIRTIO_BALLOON_F_STATS_VQ,
 	VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
 	VIRTIO_BALLOON_F_PAGE_CHUNKS,
+	VIRTIO_BALLOON_F_CMD_VQ,
 };
 
 static struct virtio_driver virtio_balloon_driver = {
diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
index 5ed3c7b..cb66c1a 100644
--- a/include/uapi/linux/virtio_balloon.h
+++ b/include/uapi/linux/virtio_balloon.h
@@ -35,6 +35,7 @@ 
 #define VIRTIO_BALLOON_F_STATS_VQ	1 /* Memory Stats virtqueue */
 #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM	2 /* Deflate balloon on OOM */
 #define VIRTIO_BALLOON_F_PAGE_CHUNKS	3 /* Inflate/Deflate pages in chunks */
+#define VIRTIO_BALLOON_F_CMD_VQ		4 /* Command virtqueue */
 
 /* Size of a PFN in the balloon interface. */
 #define VIRTIO_BALLOON_PFN_SHIFT 12
@@ -83,4 +84,16 @@  struct virtio_balloon_stat {
 	__virtio64 val;
 } __attribute__((packed));
 
+/* Use the memory of a vring_desc to place the cmdq header */
+#define VIRTIO_BALLOON_CMDQ_HDR_SIZE sizeof(struct vring_desc)
+
+struct virtio_balloon_cmdq_hdr {
+#define VIRTIO_BALLOON_CMDQ_REPORT_STATS	0
+#define VIRTIO_BALLOON_CMDQ_REPORT_UNUSED_PAGES	1
+	__le32 cmd;
+/* Flag to indicate the completion of handling a command */
+#define VIRTIO_BALLOON_CMDQ_F_COMPLETION	1
+	__le32 flags;
+};
+
 #endif /* _LINUX_VIRTIO_BALLOON_H */