
[v5,4/5] virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_HINT

Message ID 1521197309-13544-5-git-send-email-wei.w.wang@intel.com (mailing list archive)
State New, archived

Commit Message

Wei Wang March 16, 2018, 10:48 a.m. UTC
The new feature enables the virtio-balloon device to receive hints of
guest free pages from the free page vq.

balloon_free_page_start - start guest free page hint reporting.
balloon_free_page_stop - stop guest free page hint reporting.

Note: balloon will report pages which were free at the time
of this call. As the reporting happens asynchronously, dirty bit logging
must be enabled before this call is made. Guest reporting must be
disabled before the migration dirty bitmap is synchronized.
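
For illustration, the expected ordering at the call site would be along
these lines (a sketch only; memory_global_dirty_log_start() and
migration_bitmap_sync() are the assumed migration-side hooks, not part
of this patch):

    /* migration side (sketch) */
    memory_global_dirty_log_start();    /* enable dirty logging first */
    balloon_free_page_start();          /* then start guest hinting */
    /* ... send RAM while the guest reports free pages ... */
    balloon_free_page_stop();           /* stop hinting first */
    migration_bitmap_sync();            /* then sync the dirty bitmap */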

Signed-off-by: Wei Wang <wei.w.wang@intel.com>
Signed-off-by: Liang Li <liang.z.li@intel.com>
CC: Michael S. Tsirkin <mst@redhat.com>
CC: Dr. David Alan Gilbert <dgilbert@redhat.com>
CC: Juan Quintela <quintela@redhat.com>
---
 balloon.c                                       |  58 +++++--
 hw/virtio/virtio-balloon.c                      | 217 ++++++++++++++++++++++--
 include/hw/virtio/virtio-balloon.h              |  20 ++-
 include/standard-headers/linux/virtio_balloon.h |   7 +
 include/sysemu/balloon.h                        |  15 +-
 5 files changed, 288 insertions(+), 29 deletions(-)

Comments

Michael S. Tsirkin March 16, 2018, 3:16 p.m. UTC | #1
On Fri, Mar 16, 2018 at 06:48:28PM +0800, Wei Wang wrote:
> The new feature enables the virtio-balloon device to receive hints of
> guest free pages from the free page vq.
> 
> balloon_free_page_start - start guest free page hint reporting.
> balloon_free_page_stop - stop guest free page hint reporting.
> 
> Note: balloon will report pages which were free at the time
> of this call. As the reporting happens asynchronously, dirty bit logging
> must be enabled before this call is made. Guest reporting must be
> disabled before the migration dirty bitmap is synchronized.
> 
> Signed-off-by: Wei Wang <wei.w.wang@intel.com>
> Signed-off-by: Liang Li <liang.z.li@intel.com>
> CC: Michael S. Tsirkin <mst@redhat.com>
> CC: Dr. David Alan Gilbert <dgilbert@redhat.com>
> CC: Juan Quintela <quintela@redhat.com>
> ---
>  balloon.c                                       |  58 +++++--
>  hw/virtio/virtio-balloon.c                      | 217 ++++++++++++++++++++++--
>  include/hw/virtio/virtio-balloon.h              |  20 ++-
>  include/standard-headers/linux/virtio_balloon.h |   7 +
>  include/sysemu/balloon.h                        |  15 +-
>  5 files changed, 288 insertions(+), 29 deletions(-)
> 
> diff --git a/balloon.c b/balloon.c
> index 6bf0a96..87a0410 100644
> --- a/balloon.c
> +++ b/balloon.c
> @@ -36,6 +36,9 @@
>  
>  static QEMUBalloonEvent *balloon_event_fn;
>  static QEMUBalloonStatus *balloon_stat_fn;
> +static QEMUBalloonFreePageSupport *balloon_free_page_support_fn;
> +static QEMUBalloonFreePageStart *balloon_free_page_start_fn;
> +static QEMUBalloonFreePageStop *balloon_free_page_stop_fn;
>  static void *balloon_opaque;
>  static bool balloon_inhibited;
>  
> @@ -64,19 +67,51 @@ static bool have_balloon(Error **errp)
>      return true;
>  }
>  
> -int qemu_add_balloon_handler(QEMUBalloonEvent *event_func,
> -                             QEMUBalloonStatus *stat_func, void *opaque)
> +bool balloon_free_page_support(void)
>  {
> -    if (balloon_event_fn || balloon_stat_fn || balloon_opaque) {
> -        /* We're already registered one balloon handler.  How many can
> -         * a guest really have?
> -         */
> -        return -1;
> +    return balloon_free_page_support_fn &&
> +           balloon_free_page_support_fn(balloon_opaque);
> +}
> +
> +/*
> + * Balloon will report pages which were free at the time of this call. As the
> + * reporting happens asynchronously, dirty bit logging must be enabled before
> + * this call is made.
> + */
> +void balloon_free_page_start(void)
> +{
> +    balloon_free_page_start_fn(balloon_opaque);
> +}
> +
> +/*
> + * Guest reporting must be disabled before the migration dirty bitmap is
> + * synchronized.
> + */
> +void balloon_free_page_stop(void)
> +{
> +    balloon_free_page_stop_fn(balloon_opaque);
> +}
> +
> +void qemu_add_balloon_handler(QEMUBalloonEvent *event_fn,
> +                              QEMUBalloonStatus *stat_fn,
> +                              QEMUBalloonFreePageSupport *free_page_support_fn,
> +                              QEMUBalloonFreePageStart *free_page_start_fn,
> +                              QEMUBalloonFreePageStop *free_page_stop_fn,
> +                              void *opaque)
> +{
> +    if (balloon_event_fn || balloon_stat_fn || balloon_free_page_support_fn ||
> +        balloon_free_page_start_fn || balloon_free_page_stop_fn ||
> +        balloon_opaque) {
> +        /* We already registered one balloon handler. */
> +        return;
>      }
> -    balloon_event_fn = event_func;
> -    balloon_stat_fn = stat_func;
> +
> +    balloon_event_fn = event_fn;
> +    balloon_stat_fn = stat_fn;
> +    balloon_free_page_support_fn = free_page_support_fn;
> +    balloon_free_page_start_fn = free_page_start_fn;
> +    balloon_free_page_stop_fn = free_page_stop_fn;
>      balloon_opaque = opaque;
> -    return 0;
>  }
>  
>  void qemu_remove_balloon_handler(void *opaque)
> @@ -86,6 +121,9 @@ void qemu_remove_balloon_handler(void *opaque)
>      }
>      balloon_event_fn = NULL;
>      balloon_stat_fn = NULL;
> +    balloon_free_page_support_fn = NULL;
> +    balloon_free_page_start_fn = NULL;
> +    balloon_free_page_stop_fn = NULL;
>      balloon_opaque = NULL;
>  }
>  
> diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
> index f456cea..30c7504 100644
> --- a/hw/virtio/virtio-balloon.c
> +++ b/hw/virtio/virtio-balloon.c
> @@ -31,6 +31,7 @@
>  
>  #include "hw/virtio/virtio-bus.h"
>  #include "hw/virtio/virtio-access.h"
> +#include "migration/misc.h"
>  
>  #define BALLOON_PAGE_SIZE  (1 << VIRTIO_BALLOON_PFN_SHIFT)
>  
> @@ -308,6 +309,127 @@ out:
>      }
>  }
>  
> +static void *virtio_balloon_poll_free_page_hints(void *opaque)
> +{
> +    VirtQueueElement *elem;
> +    VirtIOBalloon *dev = opaque;
> +    VirtQueue *vq = dev->free_page_vq;
> +    uint32_t id;
> +    size_t size;
> +
> +    /* The optimization thread runs only when the guest is running. */
> +    while (runstate_is_running()) {

Note that this check is not guaranteed to be correct
when checked like this outside BQL.

I think you are better off relying on status
callback to synchronise with the backend thread.
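
A minimal sketch of that idea (illustrative only; it reuses the
existing set_status callback, which runs under the BQL, so vm_running
is stable there):

    static void virtio_balloon_set_status(VirtIODevice *vdev, uint8_t status)
    {
        VirtIOBalloon *s = VIRTIO_BALLOON(vdev);

        if ((status & VIRTIO_CONFIG_S_DRIVER_OK) && vdev->vm_running) {
            virtio_balloon_free_page_start(s);  /* start/resume hinting */
        } else {
            virtio_balloon_free_page_stop(s);   /* quiesce the thread */
        }
    }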


> +        qemu_spin_lock(&dev->free_page_lock);
> +        /*
> +         * If the migration thread actively stops the reporting, exit
> +         * immediately.
> +         */
> +        if (dev->free_page_report_status >= FREE_PAGE_REPORT_S_STOP) {
> +            qemu_spin_unlock(&dev->free_page_lock);
> +            break;
> +        }
> +
> +        elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
> +        if (!elem) {
> +            qemu_spin_unlock(&dev->free_page_lock);
> +            continue;
> +        }
> +
> +        if (elem->out_num) {
> +            size = iov_to_buf(elem->out_sg, elem->out_num, 0, &id, sizeof(id));
> +            virtqueue_push(vq, elem, size);
> +            g_free(elem);
> +            if (unlikely(size != sizeof(id))) {
> +                warn_report("%s: received an incorrect cmd id", __func__);

virtio_error?
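
I.e. something along these lines (a sketch), which would also mark the
device as broken so the queue is not processed further:

    VirtIODevice *vdev = VIRTIO_DEVICE(dev);

    virtio_error(vdev, "%s: received an incorrect cmd id", __func__);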

> +                qemu_spin_unlock(&dev->free_page_lock);
> +                break;
> +            }
> +            if (id == dev->free_page_report_cmd_id) {

id is LE above, but used in native endian here.
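
A sketch of one possible fix, converting the id from the driver's
(virtio) endianness before it is compared:

    /* sketch: load id with the device's endianness, then use it natively */
    id = virtio_ldl_p(VIRTIO_DEVICE(dev), &id);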

> +                dev->free_page_report_status = FREE_PAGE_REPORT_S_START;
> +            } else if (dev->free_page_report_status ==
> +                       FREE_PAGE_REPORT_S_START) {
> +                /*
> +                 * Stop the optimization only when it has started. This avoids
> +                 * a stale stop sign for the previous command.
> +                 */
> +                dev->free_page_report_status = FREE_PAGE_REPORT_S_STOP;
> +                qemu_spin_unlock(&dev->free_page_lock);
> +                break;
> +            }

And else? What if it does not match and status is not start?
Don't you need to skip the elem decoding?


> +        }
> +
> +        if (elem->in_num) {
> +            if (dev->free_page_report_status == FREE_PAGE_REPORT_S_START &&
> +                !dev->poison_val) {

poison generally disables everything? Add a TODO to handle
it in the future pls.


> +                qemu_guest_free_page_hint(elem->in_sg[0].iov_base,
> +                                          elem->in_sg[0].iov_len);
> +            }
> +            virtqueue_push(vq, elem, 0);
> +            g_free(elem);
> +        }
> +        qemu_spin_unlock(&dev->free_page_lock);
> +    }
> +    return NULL;
> +}
> +
> +static bool virtio_balloon_free_page_support(void *opaque)
> +{
> +    VirtIOBalloon *s = opaque;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> +
> +    return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT);
> +}
> +
> +static void virtio_balloon_free_page_start(void *opaque)
> +{
> +    VirtIOBalloon *s = opaque;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> +
> +    if (!runstate_is_running()) {
> +        return;
> +    }
> +
> +    if (unlikely(s->free_page_report_cmd_id == UINT_MAX)) {

Don't bother with these annotations for non-data-path things.

> +        s->free_page_report_cmd_id =
> +                       VIRTIO_BALLOON_FREE_PAGE_REPORT_CMD_ID_MIN;
> +    } else {
> +        s->free_page_report_cmd_id++;
> +    }
> +
> +    s->free_page_report_status = FREE_PAGE_REPORT_S_REQUESTED;
> +    virtio_notify_config(vdev);
> +    qemu_thread_create(&s->free_page_thread, "balloon_fpo",
> +                       virtio_balloon_poll_free_page_hints, s,
> +                       QEMU_THREAD_JOINABLE);
> +}
> +
> +static void virtio_balloon_free_page_stop(void *opaque)
> +{
> +    VirtIOBalloon *s = opaque;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> +
> +    qemu_spin_lock(&s->free_page_lock);
> +    switch (s->free_page_report_status) {
> +    case FREE_PAGE_REPORT_S_REQUESTED:
> +    case FREE_PAGE_REPORT_S_START:
> +        /*
> +         * The guest hasn't done the reporting, so host sends a notification
> +         * to the guest to actively stop the reporting before joining the
> +         * optimization thread.
> +         */
> +        s->free_page_report_status = FREE_PAGE_REPORT_S_STOP;
> +        virtio_notify_config(vdev);
> +    case FREE_PAGE_REPORT_S_STOP:
> +        /* The guest has stopped the reporting. Join the optimization thread */
> +        qemu_thread_join(&s->free_page_thread);
> +        s->free_page_report_status = FREE_PAGE_REPORT_S_EXIT;
> +    case FREE_PAGE_REPORT_S_EXIT:
> +        /* The optimization thread has gone. No further actions needed. */
> +        break;
> +    }
> +    qemu_spin_unlock(&s->free_page_lock);
> +}
> +
>  static void virtio_balloon_get_config(VirtIODevice *vdev, uint8_t *config_data)
>  {
>      VirtIOBalloon *dev = VIRTIO_BALLOON(vdev);
> @@ -315,6 +437,15 @@ static void virtio_balloon_get_config(VirtIODevice *vdev, uint8_t *config_data)
>  
>      config.num_pages = cpu_to_le32(dev->num_pages);
>      config.actual = cpu_to_le32(dev->actual);
> +    config.poison_val = cpu_to_le32(dev->poison_val);
> +
> +    if (dev->free_page_report_status >= FREE_PAGE_REPORT_S_STOP) {
> +        config.free_page_report_cmd_id =
> +                       cpu_to_le32(VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID);
> +    } else {
> +        config.free_page_report_cmd_id =
> +                       cpu_to_le32(dev->free_page_report_cmd_id);
> +    }
>  
>      trace_virtio_balloon_get_config(config.num_pages, config.actual);
>      memcpy(config_data, &config, sizeof(struct virtio_balloon_config));
> @@ -368,6 +499,7 @@ static void virtio_balloon_set_config(VirtIODevice *vdev,
>                          ((ram_addr_t) dev->actual << VIRTIO_BALLOON_PFN_SHIFT),
>                          &error_abort);
>      }
> +    dev->poison_val = le32_to_cpu(config.poison_val);

We probably should not assume this value is correct if guest didn't
ack the appropriate feature.
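
I.e. a guard along these lines (a sketch):

    if (virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_POISON)) {
        dev->poison_val = le32_to_cpu(config.poison_val);
    }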


>      trace_virtio_balloon_set_config(dev->actual, oldactual);
>  }
>  
> @@ -377,6 +509,11 @@ static uint64_t virtio_balloon_get_features(VirtIODevice *vdev, uint64_t f,
>      VirtIOBalloon *dev = VIRTIO_BALLOON(vdev);
>      f |= dev->host_features;
>      virtio_add_feature(&f, VIRTIO_BALLOON_F_STATS_VQ);
> +
> +    if (dev->host_features & 1ULL << VIRTIO_BALLOON_F_FREE_PAGE_HINT) {
> +        virtio_add_feature(&f, VIRTIO_BALLOON_F_PAGE_POISON);
> +    }
> +
>      return f;
>  }
>  
> @@ -413,6 +550,18 @@ static int virtio_balloon_post_load_device(void *opaque, int version_id)
>      return 0;
>  }
>  
> +static const VMStateDescription vmstate_virtio_balloon_free_page_report = {
> +    .name = "virtio-balloon-device/free-page-report",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .needed = virtio_balloon_free_page_support,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_UINT32(free_page_report_cmd_id, VirtIOBalloon),
> +        VMSTATE_UINT32(poison_val, VirtIOBalloon),
> +        VMSTATE_END_OF_LIST()
> +    }
> +};
> +
>  static const VMStateDescription vmstate_virtio_balloon_device = {
>      .name = "virtio-balloon-device",
>      .version_id = 1,
> @@ -423,30 +572,31 @@ static const VMStateDescription vmstate_virtio_balloon_device = {
>          VMSTATE_UINT32(actual, VirtIOBalloon),
>          VMSTATE_END_OF_LIST()
>      },
> +    .subsections = (const VMStateDescription * []) {
> +        &vmstate_virtio_balloon_free_page_report,
> +        NULL
> +    }
>  };
>  
>  static void virtio_balloon_device_realize(DeviceState *dev, Error **errp)
>  {
>      VirtIODevice *vdev = VIRTIO_DEVICE(dev);
>      VirtIOBalloon *s = VIRTIO_BALLOON(dev);
> -    int ret;
>  
>      virtio_init(vdev, "virtio-balloon", VIRTIO_ID_BALLOON,
>                  sizeof(struct virtio_balloon_config));
>  
> -    ret = qemu_add_balloon_handler(virtio_balloon_to_target,
> -                                   virtio_balloon_stat, s);
> -
> -    if (ret < 0) {
> -        error_setg(errp, "Only one balloon device is supported");
> -        virtio_cleanup(vdev);
> -        return;
> -    }
> -
>      s->ivq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output);
>      s->dvq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output);
>      s->svq = virtio_add_queue(vdev, 128, virtio_balloon_receive_stats);
> -
> +    if (virtio_has_feature(s->host_features,
> +                           VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
> +        s->free_page_vq = virtio_add_queue(vdev, VIRTQUEUE_MAX_SIZE, NULL);
> +        s->free_page_report_status = FREE_PAGE_REPORT_S_EXIT;
> +        s->free_page_report_cmd_id =
> +                           VIRTIO_BALLOON_FREE_PAGE_REPORT_CMD_ID_MIN - 1;
> +        qemu_spin_init(&s->free_page_lock);
> +    }
>      reset_stats(s);
>  }
>  
> @@ -455,6 +605,9 @@ static void virtio_balloon_device_unrealize(DeviceState *dev, Error **errp)
>      VirtIODevice *vdev = VIRTIO_DEVICE(dev);
>      VirtIOBalloon *s = VIRTIO_BALLOON(dev);
>  
> +    if (virtio_balloon_free_page_support(s)) {
> +        virtio_balloon_free_page_stop(s);
> +    }
>      balloon_stats_destroy_timer(s);
>      qemu_remove_balloon_handler(s);
>      virtio_cleanup(vdev);
> @@ -464,6 +617,10 @@ static void virtio_balloon_device_reset(VirtIODevice *vdev)
>  {
>      VirtIOBalloon *s = VIRTIO_BALLOON(vdev);
>  
> +    if (virtio_balloon_free_page_support(s)) {
> +        virtio_balloon_free_page_stop(s);
> +    }
> +
>      if (s->stats_vq_elem != NULL) {
>          virtqueue_unpop(s->svq, s->stats_vq_elem, 0);
>          g_free(s->stats_vq_elem);
> @@ -475,11 +632,37 @@ static void virtio_balloon_set_status(VirtIODevice *vdev, uint8_t status)
>  {
>      VirtIOBalloon *s = VIRTIO_BALLOON(vdev);
>  
> -    if (!s->stats_vq_elem && vdev->vm_running &&
> -        (status & VIRTIO_CONFIG_S_DRIVER_OK) && virtqueue_rewind(s->svq, 1)) {
> -        /* poll stats queue for the element we have discarded when the VM
> -         * was stopped */
> -        virtio_balloon_receive_stats(vdev, s->svq);
> +    if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
> +        if (!s->stats_vq_elem && vdev->vm_running &&
> +            virtqueue_rewind(s->svq, 1)) {
> +            /*
> +             * Poll stats queue for the element we have discarded when the VM
> +             * was stopped.
> +             */
> +            virtio_balloon_receive_stats(vdev, s->svq);
> +        }
> +
> +        if (virtio_balloon_free_page_support(s)) {
> +            qemu_add_balloon_handler(virtio_balloon_to_target,
> +                                     virtio_balloon_stat,
> +                                     virtio_balloon_free_page_support,
> +                                     virtio_balloon_free_page_start,
> +                                     virtio_balloon_free_page_stop,
> +                                     s);
> +            /*
> +             * This handles the case that the guest is being stopped (e.g. by
> +             * qmp commands) while the driver is still reporting hints. When
> +             * the guest is woken up, it will continue to report hints, which
> +             * are not needed. So when the wakeup notifier invokes the
> +             * set_status callback here, we get the chance to make sure that
> +             * the free page optimization thread is exited via
> +             * virtio_balloon_free_page_stop.
> +             */
> +            virtio_balloon_free_page_stop(s);

I don't understand the logic here at all.
So if status is set to DRIVER_OK, then we stop reporting?


> +        } else {
> +            qemu_add_balloon_handler(virtio_balloon_to_target,
> +                                     virtio_balloon_stat, NULL, NULL, NULL, s);
> +        }
>      }
>  }
>  
> @@ -509,6 +692,8 @@ static const VMStateDescription vmstate_virtio_balloon = {
>  static Property virtio_balloon_properties[] = {
>      DEFINE_PROP_BIT("deflate-on-oom", VirtIOBalloon, host_features,
>                      VIRTIO_BALLOON_F_DEFLATE_ON_OOM, false),
> +    DEFINE_PROP_BIT("free-page-hint", VirtIOBalloon, host_features,
> +                    VIRTIO_BALLOON_F_FREE_PAGE_HINT, false),
>      DEFINE_PROP_END_OF_LIST(),
>  };
>  
> diff --git a/include/hw/virtio/virtio-balloon.h b/include/hw/virtio/virtio-balloon.h
> index 1ea13bd..cfdba37 100644
> --- a/include/hw/virtio/virtio-balloon.h
> +++ b/include/hw/virtio/virtio-balloon.h
> @@ -23,6 +23,8 @@
>  #define VIRTIO_BALLOON(obj) \
>          OBJECT_CHECK(VirtIOBalloon, (obj), TYPE_VIRTIO_BALLOON)
>  
> +#define VIRTIO_BALLOON_FREE_PAGE_REPORT_CMD_ID_MIN 0x80000000
> +
>  typedef struct virtio_balloon_stat VirtIOBalloonStat;
>  
>  typedef struct virtio_balloon_stat_modern {
> @@ -31,15 +33,31 @@ typedef struct virtio_balloon_stat_modern {
>         uint64_t val;
>  } VirtIOBalloonStatModern;
>  
> +enum virtio_balloon_free_page_report_status {
> +    FREE_PAGE_REPORT_S_REQUESTED,
> +    FREE_PAGE_REPORT_S_START,
> +    FREE_PAGE_REPORT_S_STOP,
> +    FREE_PAGE_REPORT_S_EXIT,
> +};
> +
>  typedef struct VirtIOBalloon {
>      VirtIODevice parent_obj;
> -    VirtQueue *ivq, *dvq, *svq;
> +    VirtQueue *ivq, *dvq, *svq, *free_page_vq;
> +    uint32_t free_page_report_status;
>      uint32_t num_pages;
>      uint32_t actual;
> +    uint32_t free_page_report_cmd_id;
> +    uint32_t poison_val;
>      uint64_t stats[VIRTIO_BALLOON_S_NR];
>      VirtQueueElement *stats_vq_elem;
>      size_t stats_vq_offset;
>      QEMUTimer *stats_timer;
> +    QemuThread free_page_thread;
> +    /*
> +     * Lock to synchronize threads to access the free page reporting related
> +     * fields (e.g. free_page_report_status).
> +     */
> +    QemuSpin free_page_lock;
>      int64_t stats_last_update;
>      int64_t stats_poll_interval;
>      uint32_t host_features;
> diff --git a/include/standard-headers/linux/virtio_balloon.h b/include/standard-headers/linux/virtio_balloon.h
> index 7b0a41b..f89e80f 100644
> --- a/include/standard-headers/linux/virtio_balloon.h
> +++ b/include/standard-headers/linux/virtio_balloon.h
> @@ -34,15 +34,22 @@
>  #define VIRTIO_BALLOON_F_MUST_TELL_HOST	0 /* Tell before reclaiming pages */
>  #define VIRTIO_BALLOON_F_STATS_VQ	1 /* Memory Stats virtqueue */
>  #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM	2 /* Deflate balloon on OOM */
> +#define VIRTIO_BALLOON_F_FREE_PAGE_HINT 3 /* VQ to report free pages */
> +#define VIRTIO_BALLOON_F_PAGE_POISON	4 /* Guest is using page poisoning */
>  
>  /* Size of a PFN in the balloon interface. */
>  #define VIRTIO_BALLOON_PFN_SHIFT 12
>  
> +#define VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID 0
>  struct virtio_balloon_config {
>  	/* Number of pages host wants Guest to give up. */
>  	uint32_t num_pages;
>  	/* Number of pages we've actually got in balloon. */
>  	uint32_t actual;
> +	/* Free page report command id, readonly by guest */
> +	uint32_t free_page_report_cmd_id;
> +	/* Stores PAGE_POISON if page poisoning is in use */
> +	uint32_t poison_val;
>  };
>  
>  #define VIRTIO_BALLOON_S_SWAP_IN  0   /* Amount of memory swapped in */
> diff --git a/include/sysemu/balloon.h b/include/sysemu/balloon.h
> index 66543ae..6561a08 100644
> --- a/include/sysemu/balloon.h
> +++ b/include/sysemu/balloon.h
> @@ -18,11 +18,22 @@
>  
>  typedef void (QEMUBalloonEvent)(void *opaque, ram_addr_t target);
>  typedef void (QEMUBalloonStatus)(void *opaque, BalloonInfo *info);
> +typedef bool (QEMUBalloonFreePageSupport)(void *opaque);
> +typedef void (QEMUBalloonFreePageStart)(void *opaque);
> +typedef void (QEMUBalloonFreePageStop)(void *opaque);
>  
> -int qemu_add_balloon_handler(QEMUBalloonEvent *event_func,
> -			     QEMUBalloonStatus *stat_func, void *opaque);
>  void qemu_remove_balloon_handler(void *opaque);
>  bool qemu_balloon_is_inhibited(void);
>  void qemu_balloon_inhibit(bool state);
> +bool balloon_free_page_support(void);
> +void balloon_free_page_start(void);
> +void balloon_free_page_stop(void);
> +
> +void qemu_add_balloon_handler(QEMUBalloonEvent *event_fn,
> +                              QEMUBalloonStatus *stat_fn,
> +                              QEMUBalloonFreePageSupport *free_page_support_fn,
> +                              QEMUBalloonFreePageStart *free_page_start_fn,
> +                              QEMUBalloonFreePageStop *free_page_stop_fn,
> +                              void *opaque);
>  
>  #endif
> -- 
> 1.8.3.1
Wei Wang March 18, 2018, 10:36 a.m. UTC | #2
On 03/16/2018 11:16 PM, Michael S. Tsirkin wrote:
> On Fri, Mar 16, 2018 at 06:48:28PM +0800, Wei Wang wrote:
>> The new feature enables the virtio-balloon device to receive hints of
>> guest free pages from the free page vq.
>>
>> balloon_free_page_start - start guest free page hint reporting.
>> balloon_free_page_stop - stop guest free page hint reporting.
>>
>> Note: balloon will report pages which were free at the time
>> of this call. As the reporting happens asynchronously, dirty bit logging
>> must be enabled before this call is made. Guest reporting must be
>> disabled before the migration dirty bitmap is synchronized.
>>
>> Signed-off-by: Wei Wang <wei.w.wang@intel.com>
>> Signed-off-by: Liang Li <liang.z.li@intel.com>
>> CC: Michael S. Tsirkin <mst@redhat.com>
>> CC: Dr. David Alan Gilbert <dgilbert@redhat.com>
>> CC: Juan Quintela <quintela@redhat.com>
>> ---
>>   balloon.c                                       |  58 +++++--
>>   hw/virtio/virtio-balloon.c                      | 217 ++++++++++++++++++++++--
>>   include/hw/virtio/virtio-balloon.h              |  20 ++-
>>   include/standard-headers/linux/virtio_balloon.h |   7 +
>>   include/sysemu/balloon.h                        |  15 +-
>>   5 files changed, 288 insertions(+), 29 deletions(-)
>>
>> diff --git a/balloon.c b/balloon.c
>> index 6bf0a96..87a0410 100644
>> --- a/balloon.c
>> +++ b/balloon.c
>> @@ -36,6 +36,9 @@
>>   
>>
>> +static void *virtio_balloon_poll_free_page_hints(void *opaque)
>> +{
>> +    VirtQueueElement *elem;
>> +    VirtIOBalloon *dev = opaque;
>> +    VirtQueue *vq = dev->free_page_vq;
>> +    uint32_t id;
>> +    size_t size;
>> +
>> +    /* The optimization thread runs only when the guest is running. */
>> +    while (runstate_is_running()) {
> Note that this check is not guaranteed to be correct
> when checked like this outside BQL.
>
> I think you are better off relying on status
> callback to synchronise with the backend thread.
>

It's actually OK here; I think we don't need the guarantee. The device 
is just the consumer of the vq; essentially, it doesn't have a dependency 
(i.e. won't block or cause errors) on the guest state.
For example:
1) when the device executes "while (runstate_is_running())" and finds 
that the guest is running, it proceeds;
2) the guest is stopped immediately right after the "while 
(runstate_is_running())" check;
3) the device side execution reaches virtqueue_pop(), and finds 
"elem==NULL", since the driver (provider) stops adding hints. Then it 
continues by going back to "while (runstate_is_running())", and now it 
finds the guest is stopped, and then exits.

Essentially, this runstate check is just an optimization for the case 
where the driver has stopped providing hints while the device-side 
optimization thread is still polling the empty vq (i.e. effort in vain). 
Probably, it would be better to check the runstate in the "elem == NULL" 
case:

while (1) {
     ...
     elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
     if (!elem) {
             qemu_spin_unlock(&dev->free_page_lock);
             if (runstate_is_running())
                 continue;
             else
                 break;
     }
     ...
}


>> +                dev->free_page_report_status = FREE_PAGE_REPORT_S_START;
>> +            } else if (dev->free_page_report_status ==
>> +                       FREE_PAGE_REPORT_S_START) {
>> +                /*
>> +                 * Stop the optimization only when it has started. This avoids
>> +                 * a stale stop sign for the previous command.
>> +                 */
>> +                dev->free_page_report_status = FREE_PAGE_REPORT_S_STOP;
>> +                qemu_spin_unlock(&dev->free_page_lock);
>> +                break;
>> +            }
> And else? What if it does not match and status is not start?
> Don't you need to skip in elem decoding?

No, actually we don't need "else". Please see the code inside if 
(elem->in_num) below. If the status isn't set to START, 
qemu_guest_free_page_hint will not be called to decode the elem.


>
>> +        }
>> +
>> +        if (elem->in_num) {
>> +            if (dev->free_page_report_status == FREE_PAGE_REPORT_S_START &&
>> +                !dev->poison_val) {
> poison generally disables everything? Add a TODO to handle
> it in the future pls.


OK, will add TODO in the commit.



@@ -368,6 +499,7 @@ static void virtio_balloon_set_config(VirtIODevice *vdev,
                          ((ram_addr_t) dev->actual << VIRTIO_BALLOON_PFN_SHIFT),
                          &error_abort);
      }
+    dev->poison_val = le32_to_cpu(config.poison_val);

> We probably should not assume this value is correct if guest didn't
> ack the appropriate feature.


OK, I'll add a comment to explain that.


@@ -475,11 +632,37 @@ static void virtio_balloon_set_status(VirtIODevice *vdev, uint8_t status)
  {
      VirtIOBalloon *s = VIRTIO_BALLOON(vdev);
  
-    if (!s->stats_vq_elem && vdev->vm_running &&
-        (status & VIRTIO_CONFIG_S_DRIVER_OK) && virtqueue_rewind(s->svq, 1)) {
-        /* poll stats queue for the element we have discarded when the VM
-         * was stopped */
-        virtio_balloon_receive_stats(vdev, s->svq);
+    if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
+        if (!s->stats_vq_elem && vdev->vm_running &&
+            virtqueue_rewind(s->svq, 1)) {
+            /*
+             * Poll stats queue for the element we have discarded when the VM
+             * was stopped.
+             */
+            virtio_balloon_receive_stats(vdev, s->svq);
+        }
+
+        if (virtio_balloon_free_page_support(s)) {
+            qemu_add_balloon_handler(virtio_balloon_to_target,
+                                     virtio_balloon_stat,
+                                     virtio_balloon_free_page_support,
+                                     virtio_balloon_free_page_start,
+                                     virtio_balloon_free_page_stop,
+                                     s);
+            /*
+             * This handles the case that the guest is being stopped (e.g. by
+             * qmp commands) while the driver is still reporting hints. When
+             * the guest is woken up, it will continue to report hints, which
+             * are not needed. So when the wakeup notifier invokes the
+             * set_status callback here, we get the chance to make sure that
+             * the free page optimization thread is exited via
+             * virtio_balloon_free_page_stop.
+             */
+            virtio_balloon_free_page_stop(s);

> I don't understand the logic here at all.
> So if status is set to DRIVER_OK, then we stop reporting?
>

I thought about this more. It's not necessary to call the stop() 
function here, because we have already made the optimization thread exit 
when the guest is stopped. When the guest is woken up, it will continue 
to report, and there is no consumer of the vq (the optimization thread 
has exited). There is no functional issue here, since the driver will 
just drop the hint when the vq is full. From the optimization point of 
view, I think we can consider to replace the above sop() function with 
virtio_notify_config(vdev), which will stop the guest's unnecessary 
reporting.


Best,
Wei
Michael S. Tsirkin March 19, 2018, 4:24 a.m. UTC | #3
On Sun, Mar 18, 2018 at 06:36:20PM +0800, Wei Wang wrote:
> On 03/16/2018 11:16 PM, Michael S. Tsirkin wrote:
> > On Fri, Mar 16, 2018 at 06:48:28PM +0800, Wei Wang wrote:
> > > The new feature enables the virtio-balloon device to receive hints of
> > > guest free pages from the free page vq.
> > > 
> > > balloon_free_page_start - start guest free page hint reporting.
> > > balloon_free_page_stop - stop guest free page hint reporting.
> > > 
> > > Note: balloon will report pages which were free at the time
> > > of this call. As the reporting happens asynchronously, dirty bit logging
> > > must be enabled before this call is made. Guest reporting must be
> > > disabled before the migration dirty bitmap is synchronized.
> > > 
> > > Signed-off-by: Wei Wang <wei.w.wang@intel.com>
> > > Signed-off-by: Liang Li <liang.z.li@intel.com>
> > > CC: Michael S. Tsirkin <mst@redhat.com>
> > > CC: Dr. David Alan Gilbert <dgilbert@redhat.com>
> > > CC: Juan Quintela <quintela@redhat.com>
> > > ---
> > >   balloon.c                                       |  58 +++++--
> > >   hw/virtio/virtio-balloon.c                      | 217 ++++++++++++++++++++++--
> > >   include/hw/virtio/virtio-balloon.h              |  20 ++-
> > >   include/standard-headers/linux/virtio_balloon.h |   7 +
> > >   include/sysemu/balloon.h                        |  15 +-
> > >   5 files changed, 288 insertions(+), 29 deletions(-)
> > > 
> > > diff --git a/balloon.c b/balloon.c
> > > index 6bf0a96..87a0410 100644
> > > --- a/balloon.c
> > > +++ b/balloon.c
> > > @@ -36,6 +36,9 @@
> > > 
> > > +static void *virtio_balloon_poll_free_page_hints(void *opaque)
> > > +{
> > > +    VirtQueueElement *elem;
> > > +    VirtIOBalloon *dev = opaque;
> > > +    VirtQueue *vq = dev->free_page_vq;
> > > +    uint32_t id;
> > > +    size_t size;
> > > +
> > > +    /* The optimization thread runs only when the guest is running. */
> > > +    while (runstate_is_running()) {
> > Note that this check is not guaranteed to be correct
> > when checked like this outside BQL.
> > 
> > I think you are better off relying on status
> > callback to synchronise with the backend thread.
> > 
> 
> It's actually OK here, I think we don't need the guarantee. The device is
> just the consumer of the vq, essentially it doesn't have a dependency (i.e.
> won't block or cause errors) on the guest state.
> For example:
> 1) when the device executes "while (runstate_is_running())" and finds that
> the guest is running, it proceeds;
> 2) the guest is stopped immediately right after the "while
> (runstate_is_running())" check;
> 3) the device side execution reaches virtqueue_pop(), and finds
> "elem==NULL", since the driver (provider) stops adding hints. Then it
> continues by going back to "while (runstate_is_running())", and now it finds
> the guest is stopped, and then exits.


OTOH it seems that if the thread stops, nothing will wake it up
when the vm is restarted. Such behaviour change across vmstop/vmstart
is unexpected.

> Essentially, this runstate check is just an optimization to the case that
> the driver is stopped to provide hints while the device side optimization
> thread is still polling the empty vq (i.e. effort in vain). Probably, it
> would be better to check the runstate under "elem==NULL":
> 
> while (1) {
>     ...
>     elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
>     if (!elem) {
>             qemu_spin_unlock(&dev->free_page_lock);
>             if (runstate_is_running())
>                 continue;
>             else
>                 break;
>     }
>     ...
> }
> 
> 
> > > +                dev->free_page_report_status = FREE_PAGE_REPORT_S_START;
> > > +            } else if (dev->free_page_report_status ==
> > > +                       FREE_PAGE_REPORT_S_START) {
> > > +                /*
> > > +                 * Stop the optimization only when it has started. This avoids
> > > +                 * a stale stop sign for the previous command.
> > > +                 */
> > > +                dev->free_page_report_status = FREE_PAGE_REPORT_S_STOP;
> > > +                qemu_spin_unlock(&dev->free_page_lock);
> > > +                break;
> > > +            }
> > And else? What if it does not match and status is not start?
> > Don't you need to skip in elem decoding?
> 
> No, actually we don't need "else". Please see the code inside if
> (elem->in_num) below. If the status isn't set to START,
> qemu_guest_free_page_hint will not be called to decode the elem.
> 
> 
> > 
> > > +        }
> > > +
> > > +        if (elem->in_num) {
> > > +            if (dev->free_page_report_status == FREE_PAGE_REPORT_S_START &&
> > > +                !dev->poison_val) {
> > poison generally disables everything? Add a TODO to handle
> > it in the future pls.
> 
> 
> OK, will add TODO in the commit.
> 
> 
> 
> @@ -368,6 +499,7 @@ static void virtio_balloon_set_config(VirtIODevice *vdev,
>                          ((ram_addr_t) dev->actual << VIRTIO_BALLOON_PFN_SHIFT),
>                          &error_abort);
>      }
> +    dev->poison_val = le32_to_cpu(config.poison_val);
> 
> > We probably should not assume this value is correct if guest didn't
> > ack the appropriate feature.
> 
> 
> OK, I'll add a comment to explain that.
> 
> 
> @@ -475,11 +632,37 @@ static void virtio_balloon_set_status(VirtIODevice *vdev, uint8_t status)
>  {
>      VirtIOBalloon *s = VIRTIO_BALLOON(vdev);
> -    if (!s->stats_vq_elem && vdev->vm_running &&
> -        (status & VIRTIO_CONFIG_S_DRIVER_OK) && virtqueue_rewind(s->svq, 1)) {
> -        /* poll stats queue for the element we have discarded when the VM
> -         * was stopped */
> -        virtio_balloon_receive_stats(vdev, s->svq);
> +    if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
> +        if (!s->stats_vq_elem && vdev->vm_running &&
> +            virtqueue_rewind(s->svq, 1)) {
> +            /*
> +             * Poll stats queue for the element we have discarded when the VM
> +             * was stopped.
> +             */
> +            virtio_balloon_receive_stats(vdev, s->svq);
> +        }
> +
> +        if (virtio_balloon_free_page_support(s)) {
> +            qemu_add_balloon_handler(virtio_balloon_to_target,
> +                                     virtio_balloon_stat,
> +                                     virtio_balloon_free_page_support,
> +                                     virtio_balloon_free_page_start,
> +                                     virtio_balloon_free_page_stop,
> +                                     s);
> +            /*
> +             * This handles the case that the guest is being stopped (e.g. by
> +             * qmp commands) while the driver is still reporting hints. When
> +             * the guest is woken up, it will continue to report hints, which
> +             * are not needed. So when the wakeup notifier invokes the
> +             * set_status callback here, we get the chance to make sure that
> +             * the free page optimization thread is exited via
> +             * virtio_balloon_free_page_stop.
> +             */
> +            virtio_balloon_free_page_stop(s);
> 
> > I don't understand the logic here at all.
> > So if status is set to DRIVER_OK, then we stop reporting?
> > 
> 
> I thought about this more. It's not necessary to call the stop() function
> here, because we have already made the optimization thread exit when the
> guest is stopped. When the guest is woken up, it will continue to report,
> and there is no consumer of the vq (the optimization thread has exited).
> There is no functional issue here, since the driver will just drop the hint
> when the vq is full. From the optimization point of view, I think we can
> consider replacing the above stop() function with
> virtio_notify_config(vdev), which will stop the guest's unnecessary
> reporting.

I do not understand why we want to increment the counter
on vm stop though. It does make sense to stop the thread
but why not resume where we left off when vm is resumed?


> 
> Best,
> Wei

In short the code will be easier to reason about if there's some
symmetry in how we start/stop things.  If we need the thread to run
together with the VCPU then starting/stopping it from status seems
like a reasonable choice.
Wei Wang March 19, 2018, 9:01 a.m. UTC | #4
On 03/19/2018 12:24 PM, Michael S. Tsirkin wrote:
> On Sun, Mar 18, 2018 at 06:36:20PM +0800, Wei Wang wrote:
>> On 03/16/2018 11:16 PM, Michael S. Tsirkin wrote:
>>> On Fri, Mar 16, 2018 at 06:48:28PM +0800, Wei Wang wrote:
>
> OTOH it seems that if thread stops nothing will wake it up
> when vm is restarted. Such behaviour change across vmstop/vmstart
> is unexpected.
> I do not understand why we want to increment the counter
> on vm stop though. It does make sense to stop the thread
> but why not resume where we left off when vm is resumed?
>

I'm not sure which counter we incremented. But it would be clear if we 
have a high level view of how it works (it is symmetric actually). 
Basically, we start the optimization when each round starts and stop it 
at the end of each round (i.e. before we do the bitmap sync), as shown 
below:

1) 1st Round starts --> free_page_start
2) 1st Round in progress..
3) 1st Round ends  --> free_page_stop
4) 2nd Round starts --> free_page_start
5) 2nd Round in progress..
6) 2nd Round ends --> free_page_stop
......

For example, in 2), the VM is stopped. 
virtio_balloon_poll_free_page_hints finds the vq is empty (i.e. elem == 
NULL) and the runstate is stopped, so the optimization thread exits 
immediately. That is, this optimization thread is gone forever (the 
optimization we can do for this round is done). We won't know when the 
VM would be woken up:
A) If the VM is woken up very soon when the migration thread is still in 
progress of 2), then in 4) a new optimization thread (not the same one 
for the first round) will be created and start the optimization for the 
2nd round as usual (If you have questions about 3) in this case, that 
free_page_stop will do nothing but return, since the optimization 
thread has exited);
B) If the VM is woken up after the whole migration has ended, there is 
still no point in resuming the optimization.

I think this would be the simple design for the first release of this 
optimization. There are possibilities to improve case A) above by 
continuing optimization for the 1st Round as it is still in progress, 
but I think adding that complexity for this rare case wouldn't be 
worthwhile (at least for now). What do you think?


Best,
Wei
Michael S. Tsirkin March 19, 2018, 10:55 p.m. UTC | #5
On Mon, Mar 19, 2018 at 05:01:38PM +0800, Wei Wang wrote:
> On 03/19/2018 12:24 PM, Michael S. Tsirkin wrote:
> > On Sun, Mar 18, 2018 at 06:36:20PM +0800, Wei Wang wrote:
> > > On 03/16/2018 11:16 PM, Michael S. Tsirkin wrote:
> > > > On Fri, Mar 16, 2018 at 06:48:28PM +0800, Wei Wang wrote:
> > 
> > OTOH it seems that if thread stops nothing will wake it up
> > when vm is restarted. Such behaviour change across vmstop/vmstart
> > is unexpected.
> > I do not understand why we want to increment the counter
> > on vm stop though. It does make sense to stop the thread
> > but why not resume where we left off when vm is resumed?
> > 
> 
> I'm not sure which counter we incremented. But it would be clear if we have
> a high level view of how it works (it is symmetric actually). Basically, we
> start the optimization when each round starts and stop it at the end of each
> round (i.e. before we do the bitmap sync), as shown below:
> 
> 1) 1st Round starts --> free_page_start
> 2) 1st Round in progress..
> 3) 1st Round ends  --> free_page_stop
> 4) 2nd Round starts --> free_page_start
> 5) 2nd Round in progress..
> 6) 2nd Round ends --> free_page_stop
> ......
> 
> For example, in 2), the VM is stopped. virtio_balloon_poll_free_page_hints
> finds the vq is empty (i.e. elem == NULL) and the runstate is stopped, the
> optimization thread exits immediately. That is, this optimization thread is
> gone forever (the optimization we can do for this round is done). We won't
> know when would the VM be woken up:
> A) If the VM is woken up very soon when the migration thread is still in
> progress of 2), then in 4) a new optimization thread (not the same one for
> the first round) will be created and start the optimization for the 2nd
> round as usual (If you have questions about 3) in this case, that
> free_page_stop will do nothing than just return, since the optimization
> thread has exited) ;
> B) If the VM is woken up after the whole migration has ended, there is still
> no point in resuming the optimization.
> 
> I think this would be the simple design for the first release of this
> optimization. There are possibilities to improve case A) above by continuing
> optimization for the 1st Round as it is still in progress, but I think
> adding that complexity for this rare case wouldn't be worthwhile (at least
> for now). What would you think?
> 
> 
> Best,
> Wei

In my opinion this just makes the patch very messy.

E.g. attempts to attach a debugger to the guest will call vmstop and
then behaviour changes. This is a recipe for heisenbugs which are then
extremely painful to debug.

It is not really hard to make things symmetrical:
e.g. if you stop on vmstop then you should start on vmstart, etc.
And stopping the thread should not involve a bunch of state
changes, just stop it and that's it.
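
One symmetric arrangement, as a sketch (the handler name is made up for
illustration), would be a VM change state handler:

    static void virtio_balloon_vm_state_change(void *opaque, int running,
                                               RunState state)
    {
        VirtIOBalloon *s = opaque;

        if (running) {
            virtio_balloon_free_page_start(s);  /* start on vmstart */
        } else {
            virtio_balloon_free_page_stop(s);   /* stop on vmstop */
        }
    }

    /* registered once, e.g. at realize time */
    qemu_add_vm_change_state_handler(virtio_balloon_vm_state_change, s);
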
Wei Wang March 20, 2018, 2:16 a.m. UTC | #6
On 03/20/2018 06:55 AM, Michael S. Tsirkin wrote:
> On Mon, Mar 19, 2018 at 05:01:38PM +0800, Wei Wang wrote:
>> On 03/19/2018 12:24 PM, Michael S. Tsirkin wrote:
>>> On Sun, Mar 18, 2018 at 06:36:20PM +0800, Wei Wang wrote:
>>>> On 03/16/2018 11:16 PM, Michael S. Tsirkin wrote:
>>>>> On Fri, Mar 16, 2018 at 06:48:28PM +0800, Wei Wang wrote:
>>> OTOH it seems that if thread stops nothing will wake it up
>>> when vm is restarted. Such behaviour change across vmstop/vmstart
>>> is unexpected.
>>> I do not understand why we want to increment the counter
>>> on vm stop though. It does make sense to stop the thread
>>> but why not resume where we left off when vm is resumed?
>>>
>> I'm not sure which counter we incremented. But it would be clear if we have
>> a high level view of how it works (it is symmetric actually). Basically, we
>> start the optimization when each round starts and stop it at the end of each
>> round (i.e. before we do the bitmap sync), as shown below:
>>
>> 1) 1st Round starts --> free_page_start
>> 2) 1st Round in progress..
>> 3) 1st Round ends  --> free_page_stop
>> 4) 2nd Round starts --> free_page_start
>> 5) 2nd Round in progress..
>> 6) 2nd Round ends --> free_page_stop
>> ......
>>
>> For example, in 2), the VM is stopped. virtio_balloon_poll_free_page_hints
>> finds the vq is empty (i.e. elem == NULL) and the runstate is stopped, the
>> optimization thread exits immediately. That is, this optimization thread is
>> gone forever (the optimization we can do for this round is done). We won't
>> know when would the VM be woken up:
>> A) If the VM is woken up very soon when the migration thread is still in
>> progress of 2), then in 4) a new optimization thread (not the same one for
>> the first round) will be created and start the optimization for the 2nd
>> round as usual (If you have questions about 3) in this case, that
>> free_page_stop will do nothing than just return, since the optimization
>> thread has exited) ;
>> B) If the VM is woken up after the whole migration has ended, there is still
>> no point in resuming the optimization.
>>
>> I think this would be the simple design for the first release of this
>> optimization. There are possibilities to improve case A) above by continuing
>> optimization for the 1st Round as it is still in progress, but I think
>> adding that complexity for this rare case wouldn't be worthwhile (at least
>> for now). What would you think?
>>
>>
>> Best,
>> Wei
> In my opinion this just makes the patch very messy.
>
> E.g. attempts to attach a debugger to the guest will call vmstop and
> then behaviour changes. This is a recipe for heisenbugs which are then
> extremely painful to debug.
>
> It is not really hard to make things symmetrical:
> e.g. if you stop on vmstop then you should start on vmstart, etc.
> And stopping thread should not involve a bunch of state
> changes, just stop it and that's it.
>

"stop it" - do you mean to
1) make the thread exit (i.e. make virtio_balloon_poll_free_page_hints 
exit the while loop and return NULL); or
2) keep the thread in the while loop but have it yield (e.g. 
sleep(1) or block on a mutex)? (or please let me know if you meant a 
different way of stopping the thread)

If we go with 1), then we would not be able to resume the thread.
If we go with 2), then there is no guarantee that the thread will 
continue to run immediately (there will be a scheduling delay which is 
not predictable) when the vm is woken up. If the thread cannot run 
immediately when the VM is woken up, it will be effectively the same 
as 1).

In terms of heisenbugs, I think we can recommend that developers (of 
this feature) use methods that don't stop the VM (e.g. print).

Best,
Wei
Michael S. Tsirkin March 20, 2018, 2:59 a.m. UTC | #7
On Tue, Mar 20, 2018 at 10:16:09AM +0800, Wei Wang wrote:
> On 03/20/2018 06:55 AM, Michael S. Tsirkin wrote:
> > On Mon, Mar 19, 2018 at 05:01:38PM +0800, Wei Wang wrote:
> > > On 03/19/2018 12:24 PM, Michael S. Tsirkin wrote:
> > > > On Sun, Mar 18, 2018 at 06:36:20PM +0800, Wei Wang wrote:
> > > > > On 03/16/2018 11:16 PM, Michael S. Tsirkin wrote:
> > > > > > On Fri, Mar 16, 2018 at 06:48:28PM +0800, Wei Wang wrote:
> > > > OTOH it seems that if thread stops nothing will wake it up
> > > > when vm is restarted. Such behaviour change across vmstop/vmstart
> > > > is unexpected.
> > > > I do not understand why we want to increment the counter
> > > > on vm stop though. It does make sense to stop the thread
> > > > but why not resume where we left off when vm is resumed?
> > > > 
> > > I'm not sure which counter we incremented. But it would be clear if we have
> > > a high level view of how it works (it is symmetric actually). Basically, we
> > > start the optimization when each round starts and stop it at the end of each
> > > round (i.e. before we do the bitmap sync), as shown below:
> > > 
> > > 1) 1st Round starts --> free_page_start
> > > 2) 1st Round in progress..
> > > 3) 1st Round ends  --> free_page_stop
> > > 4) 2nd Round starts --> free_page_start
> > > 5) 2nd Round in progress..
> > > 6) 2nd Round ends --> free_page_stop
> > > ......
> > > 
> > > For example, in 2), the VM is stopped. virtio_balloon_poll_free_page_hints
> > > finds the vq is empty (i.e. elem == NULL) and the runstate is stopped, the
> > > optimization thread exits immediately. That is, this optimization thread is
> > > gone forever (the optimization we can do for this round is done). We won't
> > > know when would the VM be woken up:
> > > A) If the VM is woken up very soon when the migration thread is still in
> > > progress of 2), then in 4) a new optimization thread (not the same one for
> > > the first round) will be created and start the optimization for the 2nd
> > > round as usual (If you have questions about 3) in this case, that
> > > free_page_stop will do nothing than just return, since the optimization
> > > thread has exited) ;
> > > B) If the VM is woken up after the whole migration has ended, there is still
> > > no point in resuming the optimization.
> > > 
> > > I think this would be the simple design for the first release of this
> > > optimization. There are possibilities to improve case A) above by continuing
> > > optimization for the 1st Round as it is still in progress, but I think
> > > adding that complexity for this rare case wouldn't be worthwhile (at least
> > > for now). What would you think?
> > > 
> > > 
> > > Best,
> > > Wei
> > In my opinion this just makes the patch very messy.
> > 
> > E.g. attempts to attach a debugger to the guest will call vmstop and
> > then behaviour changes. This is a recipe for heisenbugs which are then
> > extremely painful to debug.
> > 
> > It is not really hard to make things symmetrical:
> > e.g. if you stop on vmstop then you should start on vmstart, etc.
> > And stopping thread should not involve a bunch of state
> > changes, just stop it and that's it.
> > 
> 
> "stop it" - do you mean to
> 1) make the thread exit (i.e.make virtio_balloon_poll_free_page_hints exit
> the while loop and return NULL); or
> 2) keep the thread staying in the while loop but yield running (e.g.
> sleep(1) or block on a mutex)? (or please let me know if you suggested a
> different implementation about stopping the thread)

I would say it makes more sense to make it block on something.
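
For instance, a sketch with a QemuCond so the loop parks instead of
exiting (the free_page_run/free_page_cond/free_page_mutex fields are
made up for illustration):

    qemu_mutex_lock(&dev->free_page_mutex);
    while (!dev->free_page_run) {
        /* parked on vmstop; vmstart signals the cond to resume */
        qemu_cond_wait(&dev->free_page_cond, &dev->free_page_mutex);
    }
    qemu_mutex_unlock(&dev->free_page_mutex);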

BTW I still think you are engaging in premature optimization here.
What you are doing here is a "data plane for balloon".
I would make the feature work first by processing this in a BH.
Creating threads immediately opens up questions of isolation,
cgroups etc.
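
As a sketch of the BH direction: rather than spawning a thread, the
hints could be drained by the virtqueue callback, which already runs in
the main loop (BH) context (virtio_balloon_handle_free_page_hints is a
handler that would have to be written):

    s->free_page_vq = virtio_add_queue(vdev, VIRTQUEUE_MAX_SIZE,
                                       virtio_balloon_handle_free_page_hints);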

> If we go with 1), then we would not be able to resume the thread.
> If we go with 2), then there is no guarantee to make the thread continue to
> run immediately (there will be a scheduling delay which is not predictable)
> when the vm is woken up. If the thread cannot run immediately when the
> VM is woken up, it will be effectively the same as 1).
> 
> In terms of heisenbugs, I think we can recommend developers (of this
> feature) to use methods that don't stop the VM (e.g. print).
> 
> Best,
> Wei

Sorry, this does not make sense to me.  The reality is that for most
people migration works just fine.  These patches implement a niche
optimization. #1 priority should be to break no existing flows.
Wei Wang March 20, 2018, 3:18 a.m. UTC | #8
On 03/20/2018 10:59 AM, Michael S. Tsirkin wrote:
> On Tue, Mar 20, 2018 at 10:16:09AM +0800, Wei Wang wrote:
>> On 03/20/2018 06:55 AM, Michael S. Tsirkin wrote:
>>> On Mon, Mar 19, 2018 at 05:01:38PM +0800, Wei Wang wrote:
>>>> On 03/19/2018 12:24 PM, Michael S. Tsirkin wrote:
>>>>> On Sun, Mar 18, 2018 at 06:36:20PM +0800, Wei Wang wrote:
>>>>>> On 03/16/2018 11:16 PM, Michael S. Tsirkin wrote:
>>>>>>> On Fri, Mar 16, 2018 at 06:48:28PM +0800, Wei Wang wrote:
>>>>> OTOH it seems that if thread stops nothing will wake it up
>>>>> when vm is restarted. Such behaviour change across vmstop/vmstart
>>>>> is unexpected.
>>>>> I do not understand why we want to increment the counter
>>>>> on vm stop though. It does make sense to stop the thread
>>>>> but why not resume where we left off when vm is resumed?
>>>>>
>>>> I'm not sure which counter we incremented. But it would be clear if we have
>>>> a high level view of how it works (it is symmetric actually). Basically, we
>>>> start the optimization when each round starts and stop it at the end of each
>>>> round (i.e. before we do the bitmap sync), as shown below:
>>>>
>>>> 1) 1st Round starts --> free_page_start
>>>> 2) 1st Round in progress..
>>>> 3) 1st Round ends  --> free_page_stop
>>>> 4) 2nd Round starts --> free_page_start
>>>> 5) 2nd Round in progress..
>>>> 6) 2nd Round ends --> free_page_stop
>>>> ......
>>>>
>>>> For example, in 2), the VM is stopped. virtio_balloon_poll_free_page_hints
>>>> finds the vq is empty (i.e. elem == NULL) and the runstate is stopped, the
>>>> optimization thread exits immediately. That is, this optimization thread is
>>>> gone forever (the optimization we can do for this round is done). We won't
>>>> know when would the VM be woken up:
>>>> A) If the VM is woken up very soon when the migration thread is still in
>>>> progress of 2), then in 4) a new optimization thread (not the same one for
>>>> the first round) will be created and start the optimization for the 2nd
>>>> round as usual (If you have questions about 3) in this case, that
>>>> free_page_stop will do nothing than just return, since the optimization
>>>> thread has exited) ;
>>>> B) If the VM is woken up after the whole migration has ended, there is still
>>>> no point in resuming the optimization.
>>>>
>>>> I think this would be the simple design for the first release of this
>>>> optimization. There are possibilities to improve case A) above by continuing
>>>> optimization for the 1st Round as it is still in progress, but I think
>>>> adding that complexity for this rare case wouldn't be worthwhile (at least
>>>> for now). What would you think?
>>>>
>>>>
>>>> Best,
>>>> Wei
>>> In my opinion this just makes the patch very messy.
>>>
>>> E.g. attempts to attach a debugger to the guest will call vmstop and
>>> then behaviour changes. This is a recipe for heisenbugs which are then
>>> extremely painful to debug.
>>>
>>> It is not really hard to make things symmetrical:
>>> e.g. if you stop on vmstop then you should start on vmstart, etc.
>>> And stopping thread should not involve a bunch of state
>>> changes, just stop it and that's it.
>>>
>> "stop it" - do you mean to
>> 1) make the thread exit (i.e.make virtio_balloon_poll_free_page_hints exit
>> the while loop and return NULL); or
>> 2) keep the thread staying in the while loop but yield running (e.g.
>> sleep(1) or block on a mutex)? (or please let me know if you suggested a
>> different implementation about stopping the thread)
> I would say it makes more sense to make it block on something.
>
> BTW I still think you are engaging in premature optimization here.
> What you are doing here is a "data plane for balloon".
> I would make the feature work first by processing this in a BH.
> Creating threads immediately opens up questions of isolation,
> cgroups etc.
>

Could you please share more about how creating threads affects isolation 
and cgroups? And how does a BH solve it? Thanks.

Best,
Wei
Michael S. Tsirkin March 20, 2018, 3:24 a.m. UTC | #9
On Tue, Mar 20, 2018 at 11:18:23AM +0800, Wei Wang wrote:
> On 03/20/2018 10:59 AM, Michael S. Tsirkin wrote:
> > On Tue, Mar 20, 2018 at 10:16:09AM +0800, Wei Wang wrote:
> > > On 03/20/2018 06:55 AM, Michael S. Tsirkin wrote:
> > > > On Mon, Mar 19, 2018 at 05:01:38PM +0800, Wei Wang wrote:
> > > > > On 03/19/2018 12:24 PM, Michael S. Tsirkin wrote:
> > > > > > On Sun, Mar 18, 2018 at 06:36:20PM +0800, Wei Wang wrote:
> > > > > > > On 03/16/2018 11:16 PM, Michael S. Tsirkin wrote:
> > > > > > > > On Fri, Mar 16, 2018 at 06:48:28PM +0800, Wei Wang wrote:
> > > > > > OTOH it seems that if thread stops nothing will wake it up
> > > > > > when vm is restarted. Such behaviour change across vmstop/vmstart
> > > > > > is unexpected.
> > > > > > I do not understand why we want to increment the counter
> > > > > > on vm stop though. It does make sense to stop the thread
> > > > > > but why not resume where we left off when vm is resumed?
> > > > > > 
> > > > > I'm not sure which counter we incremented. But it would be clear if we have
> > > > > a high level view of how it works (it is symmetric actually). Basically, we
> > > > > start the optimization when each round starts and stop it at the end of each
> > > > > round (i.e. before we do the bitmap sync), as shown below:
> > > > > 
> > > > > 1) 1st Round starts --> free_page_start
> > > > > 2) 1st Round in progress..
> > > > > 3) 1st Round ends  --> free_page_stop
> > > > > 4) 2nd Round starts --> free_page_start
> > > > > 5) 2nd Round in progress..
> > > > > 6) 2nd Round ends --> free_page_stop
> > > > > ......
> > > > > 
> > > > > For example, in 2), the VM is stopped. virtio_balloon_poll_free_page_hints
> > > > > finds the vq is empty (i.e. elem == NULL) and the runstate is stopped, the
> > > > > optimization thread exits immediately. That is, this optimization thread is
> > > > > gone forever (the optimization we can do for this round is done). We won't
> > > > > know when the VM would be woken up:
> > > > > A) If the VM is woken up very soon when the migration thread is still in
> > > > > progress of 2), then in 4) a new optimization thread (not the same one for
> > > > > the first round) will be created and start the optimization for the 2nd
> > > > > round as usual (If you have questions about 3) in this case, that
> > > > > free_page_stop will do nothing but return, since the optimization
> > > > > thread has exited);
> > > > > B) If the VM is woken up after the whole migration has ended, there is still
> > > > > no point in resuming the optimization.
> > > > > 
> > > > > I think this would be the simple design for the first release of this
> > > > > optimization. There are possibilities to improve case A) above by continuing
> > > > > optimization for the 1st Round as it is still in progress, but I think
> > > > > adding that complexity for this rare case wouldn't be worthwhile (at least
> > > > > for now). What would you think?
> > > > > 
> > > > > 
> > > > > Best,
> > > > > Wei
> > > > In my opinion this just makes the patch very messy.
> > > > 
> > > > E.g. attempts to attach a debugger to the guest will call vmstop and
> > > > then behaviour changes. This is a recipe for heisenbugs which are then
> > > > extremely painful to debug.
> > > > 
> > > > It is not really hard to make things symmetrical:
> > > > e.g. if you stop on vmstop then you should start on vmstart, etc.
> > > > And stopping thread should not involve a bunch of state
> > > > changes, just stop it and that's it.
> > > > 
> > > "stop it" - do you mean to
> > > 1) make the thread exit (i.e. make virtio_balloon_poll_free_page_hints exit
> > > the while loop and return NULL); or
> > > 2) keep the thread staying in the while loop but yield running (e.g.
> > > sleep(1) or block on a mutex)? (or please let me know if you suggested a
> > > different implementation about stopping the thread)
> > I would say it makes more sense to make it block on something.
> > 
> > BTW I still think you are engaging in premature optimization here.
> > What you are doing here is a "data plane for balloon".
> > I would make the feature work first by processing this in a BH.
> > Creating threads immediately opens up questions of isolation,
> > cgroups etc.
> > 
> 
> Could you please share more about how creating threads affects isolation and
> > cgroups?

When threads are coming and going, they are hard to control.

Consider the rich API libvirt exposes for controlling the io threads:

https://libvirt.org/formatdomain.html#elementsIOThreadsAllocation


> > And how does a BH solve it? Thanks.
> Best,
> Wei

It's late at night so I don't remember whether it's the emulator
or the io thread that runs the BH, but the point is it's
already controlled by libvirt.
Wei Wang March 22, 2018, 3:13 a.m. UTC | #10
On 03/20/2018 11:24 AM, Michael S. Tsirkin wrote:
> On Tue, Mar 20, 2018 at 11:18:23AM +0800, Wei Wang wrote:
>> On 03/20/2018 10:59 AM, Michael S. Tsirkin wrote:
>>> On Tue, Mar 20, 2018 at 10:16:09AM +0800, Wei Wang wrote:
>>>> On 03/20/2018 06:55 AM, Michael S. Tsirkin wrote:
>>>>> On Mon, Mar 19, 2018 at 05:01:38PM +0800, Wei Wang wrote:
>>>>>> On 03/19/2018 12:24 PM, Michael S. Tsirkin wrote:
>>>>>>> On Sun, Mar 18, 2018 at 06:36:20PM +0800, Wei Wang wrote:
>>>>>>>> On 03/16/2018 11:16 PM, Michael S. Tsirkin wrote:
>>>>>>>>> On Fri, Mar 16, 2018 at 06:48:28PM +0800, Wei Wang wrote:
>>>>>>> OTOH it seems that if thread stops nothing will wake it up
>>>>>>> when vm is restarted. Such behaviour change across vmstop/vmstart
>>>>>>> is unexpected.
>>>>>>> I do not understand why we want to increment the counter
>>>>>>> on vm stop though. It does make sense to stop the thread
>>>>>>> but why not resume where we left off when vm is resumed?
>>>>>>>
>>>>>> I'm not sure which counter we incremented. But it would be clear if we have
>>>>>> a high level view of how it works (it is symmetric actually). Basically, we
>>>>>> start the optimization when each round starts and stop it at the end of each
>>>>>> round (i.e. before we do the bitmap sync), as shown below:
>>>>>>
>>>>>> 1) 1st Round starts --> free_page_start
>>>>>> 2) 1st Round in progress..
>>>>>> 3) 1st Round ends  --> free_page_stop
>>>>>> 4) 2nd Round starts --> free_page_start
>>>>>> 5) 2nd Round in progress..
>>>>>> 6) 2nd Round ends --> free_page_stop
>>>>>> ......
>>>>>>
>>>>>> For example, in 2), the VM is stopped. virtio_balloon_poll_free_page_hints
>>>>>> finds the vq is empty (i.e. elem == NULL) and the runstate is stopped, the
>>>>>> optimization thread exits immediately. That is, this optimization thread is
>>>>>> gone forever (the optimization we can do for this round is done). We won't
>>>>>> know when the VM would be woken up:
>>>>>> A) If the VM is woken up very soon when the migration thread is still in
>>>>>> progress of 2), then in 4) a new optimization thread (not the same one for
>>>>>> the first round) will be created and start the optimization for the 2nd
>>>>>> round as usual (If you have questions about 3) in this case, that
>>>>>> free_page_stop will do nothing but return, since the optimization
>>>>>> thread has exited);
>>>>>> B) If the VM is woken up after the whole migration has ended, there is still
>>>>>> no point in resuming the optimization.
>>>>>>
>>>>>> I think this would be the simple design for the first release of this
>>>>>> optimization. There are possibilities to improve case A) above by continuing
>>>>>> optimization for the 1st Round as it is still in progress, but I think
>>>>>> adding that complexity for this rare case wouldn't be worthwhile (at least
>>>>>> for now). What would you think?
>>>>>>
>>>>>>
>>>>>> Best,
>>>>>> Wei
>>>>> In my opinion this just makes the patch very messy.
>>>>>
>>>>> E.g. attempts to attach a debugger to the guest will call vmstop and
>>>>> then behaviour changes. This is a recipe for heisenbugs which are then
>>>>> extremely painful to debug.
>>>>>
>>>>> It is not really hard to make things symmetrical:
>>>>> e.g. if you stop on vmstop then you should start on vmstart, etc.
>>>>> And stopping thread should not involve a bunch of state
>>>>> changes, just stop it and that's it.
>>>>>
>>>> "stop it" - do you mean to
>>>> 1) make the thread exit (i.e. make virtio_balloon_poll_free_page_hints exit
>>>> the while loop and return NULL); or
>>>> 2) keep the thread staying in the while loop but yield running (e.g.
>>>> sleep(1) or block on a mutex)? (or please let me know if you suggested a
>>>> different implementation about stopping the thread)
>>> I would say it makes more sense to make it block on something.
>>>
>>> BTW I still think you are engaging in premature optimization here.
>>> What you are doing here is a "data plane for balloon".
>>> I would make the feature work first by processing this in a BH.
>>> Creating threads immediately opens up questions of isolation,
>>> cgroups etc.
>>>
>> Could you please share more about how creating threads affects isolation and
>> cgroups?
> When threads are coming and going, they are hard to control.
>
> Consider the rich API libvirt exposes for controlling the io threads:
>
> https://libvirt.org/formatdomain.html#elementsIOThreadsAllocation
>
>
>> And how does a BH solve it? Thanks.
>> Best,
>> Wei
> It's late at night so I don't remember whether it's the emulator
> or the io thread that runs the BH, but the point is it's
> already controlled by libvirt.
>

OK. I've tried to implement it this way: create an iothread via the qemu 
cmdline option: --device 
virtio-balloon,free-page-hint=true,iothread=iothread10, and schedule a 
BH to run in the iothread context when free_page_start() is called.

I think the drawback of this method is that the iothread exists during 
the whole QEMU lifetime, which wastes CPU cycles (e.g. when live 
migration isn't in progress). The method in v5 is like the migration thread, 
which comes only when the migration request is received and goes when 
migration is done. Any thoughts?

Best,
Wei
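
A rough sketch of the iothread variant Wei describes above; the s->iothread
link property and the wrapper function are assumptions for illustration,
not code from v5:

#include "qemu/osdep.h"
#include "block/aio.h"          /* aio_bh_schedule_oneshot() */
#include "sysemu/iothread.h"    /* iothread_get_aio_context() */

/* Adapt the existing polling function to the QEMUBHFunc signature. */
static void free_page_hint_bh(void *opaque)
{
    virtio_balloon_poll_free_page_hints(opaque);
}

static void virtio_balloon_free_page_start_iothread(VirtIOBalloon *s)
{
    /*
     * Run the draining work in the iothread's AioContext. The named
     * iothread is a long-lived object, so libvirt can pin and tune it
     * like any other I/O thread.
     */
    aio_bh_schedule_oneshot(iothread_get_aio_context(s->iothread),
                            free_page_hint_bh, s);
}
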
Wei Wang March 26, 2018, 10:58 a.m. UTC | #11
On 03/22/2018 11:13 AM, Wei Wang wrote:
>
> OK. I've tried to implement it this way: create an iothread via the 
> qemu cmdline option: --device 
> virtio-balloon,free-page-hint=true,iothread=iothread10, and schedule a 
> BH to run in the iothread context when free_page_start() is called.
>
> I think the drawback of this method is that the iothread exists during 
> the whole QEMU lifetime, which wastes CPU cycles (e.g. when live 
> migration isn't in progress). The method in v5 is like the migration 
> thread, which comes only when the migration request is received and 
> goes when migration is done. Any thoughts?
>

Hi Michael,

Would it be acceptable to go with the thread creation method for now? I 
would prefer that method for the above reasons. If you are very 
confident about the iothread method, I can also send out the patches 
with that implementation. I hope we can finish this feature soon. Thanks.

Best,
Wei
Daniel P. Berrangé March 26, 2018, 11:09 a.m. UTC | #12
On Tue, Mar 20, 2018 at 05:24:39AM +0200, Michael S. Tsirkin wrote:
> On Tue, Mar 20, 2018 at 11:18:23AM +0800, Wei Wang wrote:
> > On 03/20/2018 10:59 AM, Michael S. Tsirkin wrote:
> > > On Tue, Mar 20, 2018 at 10:16:09AM +0800, Wei Wang wrote:
> > > > On 03/20/2018 06:55 AM, Michael S. Tsirkin wrote:
> > > > > On Mon, Mar 19, 2018 at 05:01:38PM +0800, Wei Wang wrote:
> > > > > > On 03/19/2018 12:24 PM, Michael S. Tsirkin wrote:
> > > > > > > On Sun, Mar 18, 2018 at 06:36:20PM +0800, Wei Wang wrote:
> > > > > > > > On 03/16/2018 11:16 PM, Michael S. Tsirkin wrote:
> > > > > > > > > On Fri, Mar 16, 2018 at 06:48:28PM +0800, Wei Wang wrote:
> > > > > > > OTOH it seems that if thread stops nothing will wake it up
> > > > > > > when vm is restarted. Such behaviour change across vmstop/vmstart
> > > > > > > is unexpected.
> > > > > > > I do not understand why we want to increment the counter
> > > > > > > on vm stop though. It does make sense to stop the thread
> > > > > > > but why not resume where we left off when vm is resumed?
> > > > > > > 
> > > > > > I'm not sure which counter we incremented. But it would be clear if we have
> > > > > > a high level view of how it works (it is symmetric actually). Basically, we
> > > > > > start the optimization when each round starts and stop it at the end of each
> > > > > > round (i.e. before we do the bitmap sync), as shown below:
> > > > > > 
> > > > > > 1) 1st Round starts --> free_page_start
> > > > > > 2) 1st Round in progress..
> > > > > > 3) 1st Round ends  --> free_page_stop
> > > > > > 4) 2nd Round starts --> free_page_start
> > > > > > 5) 2nd Round in progress..
> > > > > > 6) 2nd Round ends --> free_page_stop
> > > > > > ......
> > > > > > 
> > > > > > For example, in 2), the VM is stopped. virtio_balloon_poll_free_page_hints
> > > > > > finds the vq is empty (i.e. elem == NULL) and the runstate is stopped, the
> > > > > > optimization thread exits immediately. That is, this optimization thread is
> > > > > > gone forever (the optimization we can do for this round is done). We won't
> > > > > > know when the VM would be woken up:
> > > > > > A) If the VM is woken up very soon when the migration thread is still in
> > > > > > progress of 2), then in 4) a new optimization thread (not the same one for
> > > > > > the first round) will be created and start the optimization for the 2nd
> > > > > > round as usual (If you have questions about 3) in this case, that
> > > > > > free_page_stop will do nothing but return, since the optimization
> > > > > > thread has exited);
> > > > > > B) If the VM is woken up after the whole migration has ended, there is still
> > > > > > no point in resuming the optimization.
> > > > > > 
> > > > > > I think this would be the simple design for the first release of this
> > > > > > optimization. There are possibilities to improve case A) above by continuing
> > > > > > optimization for the 1st Round as it is still in progress, but I think
> > > > > > adding that complexity for this rare case wouldn't be worthwhile (at least
> > > > > > for now). What would you think?
> > > > > > 
> > > > > > 
> > > > > > Best,
> > > > > > Wei
> > > > > In my opinion this just makes the patch very messy.
> > > > > 
> > > > > E.g. attempts to attach a debugger to the guest will call vmstop and
> > > > > then behaviour changes. This is a recipe for heisenbugs which are then
> > > > > extremely painful to debug.
> > > > > 
> > > > > It is not really hard to make things symmetrical:
> > > > > e.g. if you stop on vmstop then you should start on vmstart, etc.
> > > > > And stopping thread should not involve a bunch of state
> > > > > changes, just stop it and that's it.
> > > > > 
> > > > "stop it" - do you mean to
> > > > 1) make the thread exit (i.e. make virtio_balloon_poll_free_page_hints exit
> > > > the while loop and return NULL); or
> > > > 2) keep the thread staying in the while loop but yield running (e.g.
> > > > sleep(1) or block on a mutex)? (or please let me know if you suggested a
> > > > different implementation about stopping the thread)
> > > I would say it makes more sense to make it block on something.
> > > 
> > > BTW I still think you are engaging in premature optimization here.
> > > What you are doing here is a "data plane for balloon".
> > > I would make the feature work first by processing this in a BH.
> > > Creating threads immediately opens up questions of isolation,
> > > cgroups etc.
> > > 
> > 
> > Could you please share more about how creating threads affects isolation and
> > cgroups?
> 
> When threads are coming and going, they are hard to control.
> 
> Consider the rich API libvirt exposes for controlling the io threads:
> 
> https://libvirt.org/formatdomain.html#elementsIOThreadsAllocation
> 
> 
> > And how does a BH solve it? Thanks.
> > Best,
> > Wei
> 
> It's late at night so I don't remember whether it's the emulator
> or the io thread that runs the BH, but the point is it's
> already controlled by libvirt.

As far as libvirt is concerned there are three sets of threads it provides
control over

 - vCPUs - each VCPU in KVM has a thread. Libvirt provides per-thread
   tunable control

 - IOThreads - each named I/O thread can be associated with one or more
   devices. Libvirt provides per-thread tunable control.

 - Emulator - any other QEMU thread which isn't a vCPU thread or IO thread
   gets called an emulator thread by libvirt. There is no per-thread
   tunable control - we can set tunables for the entire set of emulator threads
   at once.

So, if this balloon driver thread needs to support tuning controls
separately from other general purpose QEMU threads, then it would
ideally use iothread infrastructure.

I don't particularly understand what this code is doing, but please consider
whether NUMA has any impact on the work done in this thread. Specifically when
the guest has multiple virtual NUMA nodes, each associated with a specific
host NUMA node. If there is any memory intensive work being done here, then
it might need to be executed on the correct host NUMA node according to the
memory region being touched.

Regards,
Daniel
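
For reference, the per-thread tunable control Daniel mentions is expressed
in libvirt domain XML roughly as follows (abridged from the URL above;
values are illustrative):

<domain>
  <iothreads>1</iothreads>
  <cputune>
    <iothreadpin iothread='1' cpuset='2-3'/>
  </cputune>
  ...
</domain>
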
Wei Wang March 26, 2018, 2:54 p.m. UTC | #13
On Monday, March 26, 2018 7:09 PM, Daniel P. Berrangé wrote:
> As far as libvirt is concerned there are three sets of threads it provides
> control over
> 
>  - vCPUs - each VCPU in KVM has a thread. Libvirt provides per-thread
>    tunable control
> 
>  - IOThreads - each named I/O thread can be associated with one or more
>    devices. Libvirt provides per-thread tunable control.
> 
>  - Emulator - any other QEMU thread which isn't a vCPU thread or IO thread
>    gets called an emulator thread by libvirt. There is no per-thread
>    tunable control - we can set tunables for the entire set of emulator
>    threads at once.

Hi Daniel,
Thanks for sharing the details, they are very helpful. I still have a question:

There is no fundamental difference between an iothread and our optimization
thread (it is similar to the migration thread, which is created when
migration begins and terminated when migration is done) - both of them are
pthreads and each has a name. Could we also add similar per-thread tunable
control in libvirt for such threads?

For example, in QEMU we can add a new migration qmp command,
migrate_enable_free_page_optimization (just like other commands, e.g.
migrate_set_speed 10G); this command will create the optimization thread.
In this way, creation of the thread is under libvirt's control, and libvirt
can then support tuning the thread (e.g. pin it to any pCPU), right?

> So, if this balloon driver thread needs to support tuning controls separately
> from other general purpose QEMU threads, then it would ideally use
> iothread infrastructure.
> 
> I don't particularly understand what this code is doing, but please consider
> whether NUMA has any impact on the work done in this thread. Specifically
> when the guest has multiple virtual NUMA nodes, each associated with a
> specific host NUMA node. If there is any memory intensive work being done
> here, then it might need to be executed on the correct host NUMA node
> according to the memory region being touched.

I think it would not be significantly impacted by NUMA, because this
optimization thread doesn't access guest memory much except for the
virtqueue (even with an iothread, we may still not know which pCPU to pin
to in order to match the virtqueue in the vNUMA case). Essentially, it gets
the free page address and length, then clears bits from the migration dirty
bitmap, which is allocated by QEMU itself.
So, I think adding the tunable support would be nicer, but I'm not sure it
is required.

Best,
Wei
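
To make "clears bits from the migration dirty bitmap" concrete, here is a
simplified sketch of the hint consumer on the migration side (the real
qemu_guest_free_page_hint() in migration/ram.c also takes the bitmap mutex,
walks RAMBlock boundaries, and only subtracts pages that were actually
dirty):

void qemu_guest_free_page_hint(void *addr, size_t len)
{
    RAMBlock *block;
    ram_addr_t offset;
    size_t start, npages;

    block = qemu_ram_block_from_host(addr, false, &offset);
    start = offset >> TARGET_PAGE_BITS;
    npages = len >> TARGET_PAGE_BITS;

    /* Pages the guest reports as free need not be sent: drop them from
     * the dirty bitmap so the next migration pass skips them. */
    bitmap_clear(block->bmap, start, npages);
    ram_state->migration_dirty_pages -= npages;
}
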
Daniel P. Berrangé March 26, 2018, 3:04 p.m. UTC | #14
On Mon, Mar 26, 2018 at 02:54:45PM +0000, Wang, Wei W wrote:
> On Monday, March 26, 2018 7:09 PM, Daniel P. Berrangé wrote:
> > 
> > As far as libvirt is concerned there are three sets of threads it provides
> > control over
> > 
> >  - vCPUs - each VCPU in KVM has a thread. Libvirt provides per-thread
> >    tunable control
> > 
> >  - IOThreads - each named I/O thread can be associated with one or more
> >    devices. Libvirt provides per-thread tunable control.
> > 
> >  - Emulator - any other QEMU thread which isn't a vCPU thread or IO thread
> >    gets called an emulator thread by libvirt. There is no per-thread
> >    tunable control - we can set tunables for the entire set of emulator threads
> >    at once.
> > 
> 
> 
> Hi Daniel,
> Thanks for sharing the details, they are very helpful. I still have a question:
> 
> There is no fundamental difference between an iothread and our optimization
> thread (it is similar to the migration thread, which is created when
> migration begins and terminated when migration is done) - both of them are
> pthreads and each has a name. Could we also add similar per-thread
> tunable control in libvirt for such threads?
> 
> For example, in QEMU we can add a new migration qmp command,
> migrate_enable_free_page_optimization (just like other commands, e.g.
> migrate_set_speed 10G); this command will create the optimization thread.
> In this way, creation of the thread is under libvirt's control, and libvirt
> can then support tuning the thread (e.g. pin it to any pCPU), right?

Anything is possible at a technical level, but from a design POV we would
rather not start exposing tunables for arbitrary device-type-specific
threads, as they are inherently non-portable and may not even exist in QEMU
long term, since they are just an artifact of the current QEMU implementation.

Regards,
Daniel
Wei Wang March 26, 2018, 3:35 p.m. UTC | #15
On Monday, March 26, 2018 11:04 PM, Daniel P. Berrangé wrote:
> On Mon, Mar 26, 2018 at 02:54:45PM +0000, Wang, Wei W wrote:
> > On Monday, March 26, 2018 7:09 PM, Daniel P. Berrangé wrote:
> > > As far as libvirt is concerned there are three sets of threads it
> > > provides control over
> > >
> > >  - vCPUs - each VCPU in KVM has a thread. Libvirt provides per-thread
> > >    tunable control
> > >
> > >  - IOThreads - each named I/O thread can be associated with one or more
> > >    devices. Libvirt provides per-thread tunable control.
> > >
> > >  - Emulator - any other QEMU thread which isn't a vCPU thread or IO
> > >    thread gets called an emulator thread by libvirt. There is no
> > >    per-thread tunable control - we can set tunables for the entire set
> > >    of emulator threads at once.
> >
> > Hi Daniel,
> > Thanks for sharing the details, they are very helpful. I still have a
> > question:
> >
> > There is no fundamental difference between an iothread and our
> > optimization thread (it is similar to the migration thread, which is
> > created when migration begins and terminated when migration is done) -
> > both of them are pthreads and each has a name. Could we also add similar
> > per-thread tunable control in libvirt for such threads?
> >
> > For example, in QEMU we can add a new migration qmp command,
> > migrate_enable_free_page_optimization (just like other commands, e.g.
> > migrate_set_speed 10G); this command will create the optimization thread.
> > In this way, creation of the thread is under libvirt's control, and
> > libvirt can then support tuning the thread (e.g. pin it to any pCPU),
> > right?
> 
> Anything is possible at a technical level, but from a design POV we would
> rather not start exposing tunables for arbitrary device-type-specific
> threads, as they are inherently non-portable and may not even exist in QEMU
> long term, since they are just an artifact of the current QEMU
> implementation.

OK. My actual concern with the iothread is that it exists during the whole
QEMU lifecycle. A block device using it is fine, since block device accesses
happen frequently during the QEMU lifetime. For usages like live migration,
which may happen once per day, keeping this iothread running is a waste of
CPU cycles.

Best,
Wei
Wei Wang March 30, 2018, 9:52 a.m. UTC | #16
On 03/20/2018 11:24 AM, Michael S. Tsirkin wrote:
> On Tue, Mar 20, 2018 at 11:18:23AM +0800, Wei Wang wrote:
>> On 03/20/2018 10:59 AM, Michael S. Tsirkin wrote:
>>> On Tue, Mar 20, 2018 at 10:16:09AM +0800, Wei Wang wrote:
>>>> On 03/20/2018 06:55 AM, Michael S. Tsirkin wrote:
>>>>> On Mon, Mar 19, 2018 at 05:01:38PM +0800, Wei Wang wrote:
>>>>>> On 03/19/2018 12:24 PM, Michael S. Tsirkin wrote:
>>>>>>> On Sun, Mar 18, 2018 at 06:36:20PM +0800, Wei Wang wrote:
>>>>>>>> On 03/16/2018 11:16 PM, Michael S. Tsirkin wrote:
>>>>>>>>> On Fri, Mar 16, 2018 at 06:48:28PM +0800, Wei Wang wrote:
>>>>>>> OTOH it seems that if thread stops nothing will wake it up
>>>>>>> when vm is restarted. Such behaviour change across vmstop/vmstart
>>>>>>> is unexpected.
>>>>>>> I do not understand why we want to increment the counter
>>>>>>> on vm stop though. It does make sense to stop the thread
>>>>>>> but why not resume where we left off when vm is resumed?
>>>>>>>
>>>>>> I'm not sure which counter we incremented. But it would be clear if we have
>>>>>> a high level view of how it works (it is symmetric actually). Basically, we
>>>>>> start the optimization when each round starts and stop it at the end of each
>>>>>> round (i.e. before we do the bitmap sync), as shown below:
>>>>>>
>>>>>> 1) 1st Round starts --> free_page_start
>>>>>> 2) 1st Round in progress..
>>>>>> 3) 1st Round ends  --> free_page_stop
>>>>>> 4) 2nd Round starts --> free_page_start
>>>>>> 5) 2nd Round in progress..
>>>>>> 6) 2nd Round ends --> free_page_stop
>>>>>> ......
>>>>>>
>>>>>> For example, in 2), the VM is stopped. virtio_balloon_poll_free_page_hints
>>>>>> finds the vq is empty (i.e. elem == NULL) and the runstate is stopped, the
>>>>>> optimization thread exits immediately. That is, this optimization thread is
>>>>>> gone forever (the optimization we can do for this round is done). We won't
>>>>>> know when the VM would be woken up:
>>>>>> A) If the VM is woken up very soon when the migration thread is still in
>>>>>> progress of 2), then in 4) a new optimization thread (not the same one for
>>>>>> the first round) will be created and start the optimization for the 2nd
>>>>>> round as usual (If you have questions about 3) in this case, that
>>>>>> free_page_stop will do nothing but return, since the optimization
>>>>>> thread has exited);
>>>>>> B) If the VM is woken up after the whole migration has ended, there is still
>>>>>> no point in resuming the optimization.
>>>>>>
>>>>>> I think this would be the simple design for the first release of this
>>>>>> optimization. There are possibilities to improve case A) above by continuing
>>>>>> optimization for the 1st Round as it is still in progress, but I think
>>>>>> adding that complexity for this rare case wouldn't be worthwhile (at least
>>>>>> for now). What would you think?
>>>>>>
>>>>>>
>>>>>> Best,
>>>>>> Wei
>>>>> In my opinion this just makes the patch very messy.
>>>>>
>>>>> E.g. attempts to attach a debugger to the guest will call vmstop and
>>>>> then behaviour changes. This is a recipe for heisenbugs which are then
>>>>> extremely painful to debug.
>>>>>
>>>>> It is not really hard to make things symmetrical:
>>>>> e.g. if you stop on vmstop then you should start on vmstart, etc.
>>>>> And stopping thread should not involve a bunch of state
>>>>> changes, just stop it and that's it.
>>>>>
>>>> "stop it" - do you mean to
>>>> 1) make the thread exit (i.e. make virtio_balloon_poll_free_page_hints exit
>>>> the while loop and return NULL); or
>>>> 2) keep the thread staying in the while loop but yield running (e.g.
>>>> sleep(1) or block on a mutex)? (or please let me know if you suggested a
>>>> different implementation about stopping the thread)
>>> I would say it makes more sense to make it block on something.
>>>
>>> BTW I still think you are engaging in premature optimization here.
>>> What you are doing here is a "data plane for balloon".
>>> I would make the feature work first by processing this in a BH.
>>> Creating threads immediately opens up questions of isolation,
>>> cgroups etc.
>>>
>> Could you please share more about how creating threads affects isolation and
>> cgroups?
> When threads are coming and going, they are hard to control.
>
> Consider the rich API libvirt exposes for controlling the io threads:
>
> https://libvirt.org/formatdomain.html#elementsIOThreadsAllocation
>
>
>> And how does a BH solve it? Thanks.
>> Best,
>> Wei
> It's late at night so I don't remember whether it's the emulator
> or the io thread that runs the BH, but the point is it's
> already controlled by libvirt.
>

Thanks all for reviewing the patches. I've changed the new version to use
an iothread. Please check whether that would be good enough.

Best,
Wei
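
For reference, the feature added by the patch below is off by default;
enabling it would look something like this (a sketch - the exact device
name depends on the virtio transport in use):

qemu-system-x86_64 ... -device virtio-balloon-pci,free-page-hint=on
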
diff mbox

Patch

diff --git a/balloon.c b/balloon.c
index 6bf0a96..87a0410 100644
--- a/balloon.c
+++ b/balloon.c
@@ -36,6 +36,9 @@ 
 
 static QEMUBalloonEvent *balloon_event_fn;
 static QEMUBalloonStatus *balloon_stat_fn;
+static QEMUBalloonFreePageSupport *balloon_free_page_support_fn;
+static QEMUBalloonFreePageStart *balloon_free_page_start_fn;
+static QEMUBalloonFreePageStop *balloon_free_page_stop_fn;
 static void *balloon_opaque;
 static bool balloon_inhibited;
 
@@ -64,19 +67,51 @@  static bool have_balloon(Error **errp)
     return true;
 }
 
-int qemu_add_balloon_handler(QEMUBalloonEvent *event_func,
-                             QEMUBalloonStatus *stat_func, void *opaque)
+bool balloon_free_page_support(void)
 {
-    if (balloon_event_fn || balloon_stat_fn || balloon_opaque) {
-        /* We're already registered one balloon handler.  How many can
-         * a guest really have?
-         */
-        return -1;
+    return balloon_free_page_support_fn &&
+           balloon_free_page_support_fn(balloon_opaque);
+}
+
+/*
+ * Balloon will report pages which were free at the time of this call. As the
+ * reporting happens asynchronously, dirty bit logging must be enabled before
+ * this call is made.
+ */
+void balloon_free_page_start(void)
+{
+    balloon_free_page_start_fn(balloon_opaque);
+}
+
+/*
+ * Guest reporting must be disabled before the migration dirty bitmap is
+ * synchronized.
+ */
+void balloon_free_page_stop(void)
+{
+    balloon_free_page_stop_fn(balloon_opaque);
+}
+
+void qemu_add_balloon_handler(QEMUBalloonEvent *event_fn,
+                              QEMUBalloonStatus *stat_fn,
+                              QEMUBalloonFreePageSupport *free_page_support_fn,
+                              QEMUBalloonFreePageStart *free_page_start_fn,
+                              QEMUBalloonFreePageStop *free_page_stop_fn,
+                              void *opaque)
+{
+    if (balloon_event_fn || balloon_stat_fn || balloon_free_page_support_fn ||
+        balloon_free_page_start_fn || balloon_free_page_stop_fn ||
+        balloon_opaque) {
+        /* We already registered one balloon handler. */
+        return;
     }
-    balloon_event_fn = event_func;
-    balloon_stat_fn = stat_func;
+
+    balloon_event_fn = event_fn;
+    balloon_stat_fn = stat_fn;
+    balloon_free_page_support_fn = free_page_support_fn;
+    balloon_free_page_start_fn = free_page_start_fn;
+    balloon_free_page_stop_fn = free_page_stop_fn;
     balloon_opaque = opaque;
-    return 0;
 }
 
 void qemu_remove_balloon_handler(void *opaque)
@@ -86,6 +121,9 @@  void qemu_remove_balloon_handler(void *opaque)
     }
     balloon_event_fn = NULL;
     balloon_stat_fn = NULL;
+    balloon_free_page_support_fn = NULL;
+    balloon_free_page_start_fn = NULL;
+    balloon_free_page_stop_fn = NULL;
     balloon_opaque = NULL;
 }
 
diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index f456cea..30c7504 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -31,6 +31,7 @@ 
 
 #include "hw/virtio/virtio-bus.h"
 #include "hw/virtio/virtio-access.h"
+#include "migration/misc.h"
 
 #define BALLOON_PAGE_SIZE  (1 << VIRTIO_BALLOON_PFN_SHIFT)
 
@@ -308,6 +309,127 @@  out:
     }
 }
 
+static void *virtio_balloon_poll_free_page_hints(void *opaque)
+{
+    VirtQueueElement *elem;
+    VirtIOBalloon *dev = opaque;
+    VirtQueue *vq = dev->free_page_vq;
+    uint32_t id;
+    size_t size;
+
+    /* The optimization thread runs only when the guest is running. */
+    while (runstate_is_running()) {
+        qemu_spin_lock(&dev->free_page_lock);
+        /*
+         * If the migration thread actively stops the reporting, exit
+         * immediately.
+         */
+        if (dev->free_page_report_status >= FREE_PAGE_REPORT_S_STOP) {
+            qemu_spin_unlock(&dev->free_page_lock);
+            break;
+        }
+
+        elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
+        if (!elem) {
+            qemu_spin_unlock(&dev->free_page_lock);
+            continue;
+        }
+
+        if (elem->out_num) {
+            size = iov_to_buf(elem->out_sg, elem->out_num, 0, &id, sizeof(id));
+            virtqueue_push(vq, elem, size);
+            g_free(elem);
+            if (unlikely(size != sizeof(id))) {
+                warn_report("%s: received an incorrect cmd id", __func__);
+                qemu_spin_unlock(&dev->free_page_lock);
+                break;
+            }
+            if (id == dev->free_page_report_cmd_id) {
+                dev->free_page_report_status = FREE_PAGE_REPORT_S_START;
+            } else if (dev->free_page_report_status ==
+                       FREE_PAGE_REPORT_S_START) {
+                /*
+                 * Stop the optimization only when it has started. This avoids
+                 * a stale stop sign for the previous command.
+                 */
+                dev->free_page_report_status = FREE_PAGE_REPORT_S_STOP;
+                qemu_spin_unlock(&dev->free_page_lock);
+                break;
+            }
+        }
+
+        if (elem->in_num) {
+            if (dev->free_page_report_status == FREE_PAGE_REPORT_S_START &&
+                !dev->poison_val) {
+                qemu_guest_free_page_hint(elem->in_sg[0].iov_base,
+                                          elem->in_sg[0].iov_len);
+            }
+            virtqueue_push(vq, elem, 0);
+            g_free(elem);
+        }
+        qemu_spin_unlock(&dev->free_page_lock);
+    }
+    return NULL;
+}
+
+static bool virtio_balloon_free_page_support(void *opaque)
+{
+    VirtIOBalloon *s = opaque;
+    VirtIODevice *vdev = VIRTIO_DEVICE(s);
+
+    return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT);
+}
+
+static void virtio_balloon_free_page_start(void *opaque)
+{
+    VirtIOBalloon *s = opaque;
+    VirtIODevice *vdev = VIRTIO_DEVICE(s);
+
+    if (!runstate_is_running()) {
+        return;
+    }
+
+    if (unlikely(s->free_page_report_cmd_id == UINT_MAX)) {
+        s->free_page_report_cmd_id =
+                       VIRTIO_BALLOON_FREE_PAGE_REPORT_CMD_ID_MIN;
+    } else {
+        s->free_page_report_cmd_id++;
+    }
+
+    s->free_page_report_status = FREE_PAGE_REPORT_S_REQUESTED;
+    virtio_notify_config(vdev);
+    qemu_thread_create(&s->free_page_thread, "balloon_fpo",
+                       virtio_balloon_poll_free_page_hints, s,
+                       QEMU_THREAD_JOINABLE);
+}
+
+static void virtio_balloon_free_page_stop(void *opaque)
+{
+    VirtIOBalloon *s = opaque;
+    VirtIODevice *vdev = VIRTIO_DEVICE(s);
+
+    qemu_spin_lock(&s->free_page_lock);
+    switch (s->free_page_report_status) {
+    case FREE_PAGE_REPORT_S_REQUESTED:
+    case FREE_PAGE_REPORT_S_START:
+        /*
+         * The guest hasn't done the reporting, so the host sends a notification
+         * to the guest to actively stop the reporting before joining the
+         * optimization thread.
+         */
+        s->free_page_report_status = FREE_PAGE_REPORT_S_STOP;
+        virtio_notify_config(vdev);
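+        /* fall through */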
+    case FREE_PAGE_REPORT_S_STOP:
+        /* The guest has stopped the reporting. Join the optimization thread */
+        qemu_thread_join(&s->free_page_thread);
+        s->free_page_report_status = FREE_PAGE_REPORT_S_EXIT;
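+        /* fall through */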
+    case FREE_PAGE_REPORT_S_EXIT:
+        /* The optimization thread has gone. No further actions needed. */
+        break;
+    }
+    qemu_spin_unlock(&s->free_page_lock);
+}
+
 static void virtio_balloon_get_config(VirtIODevice *vdev, uint8_t *config_data)
 {
     VirtIOBalloon *dev = VIRTIO_BALLOON(vdev);
@@ -315,6 +437,15 @@  static void virtio_balloon_get_config(VirtIODevice *vdev, uint8_t *config_data)
 
     config.num_pages = cpu_to_le32(dev->num_pages);
     config.actual = cpu_to_le32(dev->actual);
+    config.poison_val = cpu_to_le32(dev->poison_val);
+
+    if (dev->free_page_report_status >= FREE_PAGE_REPORT_S_STOP) {
+        config.free_page_report_cmd_id =
+                       cpu_to_le32(VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID);
+    } else {
+        config.free_page_report_cmd_id =
+                       cpu_to_le32(dev->free_page_report_cmd_id);
+    }
 
     trace_virtio_balloon_get_config(config.num_pages, config.actual);
     memcpy(config_data, &config, sizeof(struct virtio_balloon_config));
@@ -368,6 +499,7 @@  static void virtio_balloon_set_config(VirtIODevice *vdev,
                         ((ram_addr_t) dev->actual << VIRTIO_BALLOON_PFN_SHIFT),
                         &error_abort);
     }
+    dev->poison_val = le32_to_cpu(config.poison_val);
     trace_virtio_balloon_set_config(dev->actual, oldactual);
 }
 
@@ -377,6 +509,11 @@  static uint64_t virtio_balloon_get_features(VirtIODevice *vdev, uint64_t f,
     VirtIOBalloon *dev = VIRTIO_BALLOON(vdev);
     f |= dev->host_features;
     virtio_add_feature(&f, VIRTIO_BALLOON_F_STATS_VQ);
+
+    if (dev->host_features & 1ULL << VIRTIO_BALLOON_F_FREE_PAGE_HINT) {
+        virtio_add_feature(&f, VIRTIO_BALLOON_F_PAGE_POISON);
+    }
+
     return f;
 }
 
@@ -413,6 +550,18 @@  static int virtio_balloon_post_load_device(void *opaque, int version_id)
     return 0;
 }
 
+static const VMStateDescription vmstate_virtio_balloon_free_page_report = {
+    .name = "virtio-balloon-device/free-page-report",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = virtio_balloon_free_page_support,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT32(free_page_report_cmd_id, VirtIOBalloon),
+        VMSTATE_UINT32(poison_val, VirtIOBalloon),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
 static const VMStateDescription vmstate_virtio_balloon_device = {
     .name = "virtio-balloon-device",
     .version_id = 1,
@@ -423,30 +572,31 @@  static const VMStateDescription vmstate_virtio_balloon_device = {
         VMSTATE_UINT32(actual, VirtIOBalloon),
         VMSTATE_END_OF_LIST()
     },
+    .subsections = (const VMStateDescription * []) {
+        &vmstate_virtio_balloon_free_page_report,
+        NULL
+    }
 };
 
 static void virtio_balloon_device_realize(DeviceState *dev, Error **errp)
 {
     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
     VirtIOBalloon *s = VIRTIO_BALLOON(dev);
-    int ret;
 
     virtio_init(vdev, "virtio-balloon", VIRTIO_ID_BALLOON,
                 sizeof(struct virtio_balloon_config));
 
-    ret = qemu_add_balloon_handler(virtio_balloon_to_target,
-                                   virtio_balloon_stat, s);
-
-    if (ret < 0) {
-        error_setg(errp, "Only one balloon device is supported");
-        virtio_cleanup(vdev);
-        return;
-    }
-
     s->ivq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output);
     s->dvq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output);
     s->svq = virtio_add_queue(vdev, 128, virtio_balloon_receive_stats);
-
+    if (virtio_has_feature(s->host_features,
+                           VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
+        s->free_page_vq = virtio_add_queue(vdev, VIRTQUEUE_MAX_SIZE, NULL);
+        s->free_page_report_status = FREE_PAGE_REPORT_S_EXIT;
+        s->free_page_report_cmd_id =
+                           VIRTIO_BALLOON_FREE_PAGE_REPORT_CMD_ID_MIN - 1;
+        qemu_spin_init(&s->free_page_lock);
+    }
     reset_stats(s);
 }
 
@@ -455,6 +605,9 @@  static void virtio_balloon_device_unrealize(DeviceState *dev, Error **errp)
     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
     VirtIOBalloon *s = VIRTIO_BALLOON(dev);
 
+    if (virtio_balloon_free_page_support(s)) {
+        virtio_balloon_free_page_stop(s);
+    }
     balloon_stats_destroy_timer(s);
     qemu_remove_balloon_handler(s);
     virtio_cleanup(vdev);
@@ -464,6 +617,10 @@  static void virtio_balloon_device_reset(VirtIODevice *vdev)
 {
     VirtIOBalloon *s = VIRTIO_BALLOON(vdev);
 
+    if (virtio_balloon_free_page_support(s)) {
+        virtio_balloon_free_page_stop(s);
+    }
+
     if (s->stats_vq_elem != NULL) {
         virtqueue_unpop(s->svq, s->stats_vq_elem, 0);
         g_free(s->stats_vq_elem);
@@ -475,11 +632,37 @@  static void virtio_balloon_set_status(VirtIODevice *vdev, uint8_t status)
 {
     VirtIOBalloon *s = VIRTIO_BALLOON(vdev);
 
-    if (!s->stats_vq_elem && vdev->vm_running &&
-        (status & VIRTIO_CONFIG_S_DRIVER_OK) && virtqueue_rewind(s->svq, 1)) {
-        /* poll stats queue for the element we have discarded when the VM
-         * was stopped */
-        virtio_balloon_receive_stats(vdev, s->svq);
+    if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
+        if (!s->stats_vq_elem && vdev->vm_running &&
+            virtqueue_rewind(s->svq, 1)) {
+            /*
+             * Poll stats queue for the element we have discarded when the VM
+             * was stopped.
+             */
+            virtio_balloon_receive_stats(vdev, s->svq);
+        }
+
+        if (virtio_balloon_free_page_support(s)) {
+            qemu_add_balloon_handler(virtio_balloon_to_target,
+                                     virtio_balloon_stat,
+                                     virtio_balloon_free_page_support,
+                                     virtio_balloon_free_page_start,
+                                     virtio_balloon_free_page_stop,
+                                     s);
+            /*
+             * This handles the case that the guest is being stopped (e.g. by
+             * qmp commands) while the driver is still reporting hints. When
+             * the guest is woken up, it will continue to report hints, which
+             * are not needed. So when the wakeup notifier invokes the
+             * set_status callback here, we get the chance to make sure that
+             * the free page optimization thread is exited via
+             * virtio_balloon_free_page_stop.
+             */
+            virtio_balloon_free_page_stop(s);
+        } else {
+            qemu_add_balloon_handler(virtio_balloon_to_target,
+                                     virtio_balloon_stat, NULL, NULL, NULL, s);
+        }
     }
 }
 
@@ -509,6 +692,8 @@  static const VMStateDescription vmstate_virtio_balloon = {
 static Property virtio_balloon_properties[] = {
     DEFINE_PROP_BIT("deflate-on-oom", VirtIOBalloon, host_features,
                     VIRTIO_BALLOON_F_DEFLATE_ON_OOM, false),
+    DEFINE_PROP_BIT("free-page-hint", VirtIOBalloon, host_features,
+                    VIRTIO_BALLOON_F_FREE_PAGE_HINT, false),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/include/hw/virtio/virtio-balloon.h b/include/hw/virtio/virtio-balloon.h
index 1ea13bd..cfdba37 100644
--- a/include/hw/virtio/virtio-balloon.h
+++ b/include/hw/virtio/virtio-balloon.h
@@ -23,6 +23,8 @@ 
 #define VIRTIO_BALLOON(obj) \
         OBJECT_CHECK(VirtIOBalloon, (obj), TYPE_VIRTIO_BALLOON)
 
+#define VIRTIO_BALLOON_FREE_PAGE_REPORT_CMD_ID_MIN 0x80000000
+
 typedef struct virtio_balloon_stat VirtIOBalloonStat;
 
 typedef struct virtio_balloon_stat_modern {
@@ -31,15 +33,31 @@  typedef struct virtio_balloon_stat_modern {
        uint64_t val;
 } VirtIOBalloonStatModern;
 
+enum virtio_balloon_free_page_report_status {
+    FREE_PAGE_REPORT_S_REQUESTED,
+    FREE_PAGE_REPORT_S_START,
+    FREE_PAGE_REPORT_S_STOP,
+    FREE_PAGE_REPORT_S_EXIT,
+};
+
 typedef struct VirtIOBalloon {
     VirtIODevice parent_obj;
-    VirtQueue *ivq, *dvq, *svq;
+    VirtQueue *ivq, *dvq, *svq, *free_page_vq;
+    uint32_t free_page_report_status;
     uint32_t num_pages;
     uint32_t actual;
+    uint32_t free_page_report_cmd_id;
+    uint32_t poison_val;
     uint64_t stats[VIRTIO_BALLOON_S_NR];
     VirtQueueElement *stats_vq_elem;
     size_t stats_vq_offset;
     QEMUTimer *stats_timer;
+    QemuThread free_page_thread;
+    /*
+     * Lock to synchronize threads to access the free page reporting related
+     * fields (e.g. free_page_report_status).
+     */
+    QemuSpin free_page_lock;
     int64_t stats_last_update;
     int64_t stats_poll_interval;
     uint32_t host_features;
diff --git a/include/standard-headers/linux/virtio_balloon.h b/include/standard-headers/linux/virtio_balloon.h
index 7b0a41b..f89e80f 100644
--- a/include/standard-headers/linux/virtio_balloon.h
+++ b/include/standard-headers/linux/virtio_balloon.h
@@ -34,15 +34,22 @@ 
 #define VIRTIO_BALLOON_F_MUST_TELL_HOST	0 /* Tell before reclaiming pages */
 #define VIRTIO_BALLOON_F_STATS_VQ	1 /* Memory Stats virtqueue */
 #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM	2 /* Deflate balloon on OOM */
+#define VIRTIO_BALLOON_F_FREE_PAGE_HINT 3 /* VQ to report free pages */
+#define VIRTIO_BALLOON_F_PAGE_POISON	4 /* Guest is using page poisoning */
 
 /* Size of a PFN in the balloon interface. */
 #define VIRTIO_BALLOON_PFN_SHIFT 12
 
+#define VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID 0
 struct virtio_balloon_config {
 	/* Number of pages host wants Guest to give up. */
 	uint32_t num_pages;
 	/* Number of pages we've actually got in balloon. */
 	uint32_t actual;
+	/* Free page report command id, readonly by guest */
+	uint32_t free_page_report_cmd_id;
+	/* Stores PAGE_POISON if page poisoning is in use */
+	uint32_t poison_val;
 };
 
 #define VIRTIO_BALLOON_S_SWAP_IN  0   /* Amount of memory swapped in */
diff --git a/include/sysemu/balloon.h b/include/sysemu/balloon.h
index 66543ae..6561a08 100644
--- a/include/sysemu/balloon.h
+++ b/include/sysemu/balloon.h
@@ -18,11 +18,22 @@ 
 
 typedef void (QEMUBalloonEvent)(void *opaque, ram_addr_t target);
 typedef void (QEMUBalloonStatus)(void *opaque, BalloonInfo *info);
+typedef bool (QEMUBalloonFreePageSupport)(void *opaque);
+typedef void (QEMUBalloonFreePageStart)(void *opaque);
+typedef void (QEMUBalloonFreePageStop)(void *opaque);
 
-int qemu_add_balloon_handler(QEMUBalloonEvent *event_func,
-			     QEMUBalloonStatus *stat_func, void *opaque);
 void qemu_remove_balloon_handler(void *opaque);
 bool qemu_balloon_is_inhibited(void);
 void qemu_balloon_inhibit(bool state);
+bool balloon_free_page_support(void);
+void balloon_free_page_start(void);
+void balloon_free_page_stop(void);
+
+void qemu_add_balloon_handler(QEMUBalloonEvent *event_fn,
+                              QEMUBalloonStatus *stat_fn,
+                              QEMUBalloonFreePageSupport *free_page_support_fn,
+                              QEMUBalloonFreePageStart *free_page_start_fn,
+                              QEMUBalloonFreePageStop *free_page_stop_fn,
+                              void *opaque);
 
 #endif