diff mbox

[v34,2/4] virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_HINT

Message ID 1529928312-30500-3-git-send-email-wei.w.wang@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Wang, Wei W June 25, 2018, 12:05 p.m. UTC
Negotiation of the VIRTIO_BALLOON_F_FREE_PAGE_HINT feature indicates the
support of reporting hints of guest free pages to host via virtio-balloon.

Host requests the guest to report free page hints by sending a new cmd id
to the guest via the free_page_report_cmd_id configuration register.

As the first step here, virtio-balloon only reports free page hints from
the max order (i.e. 10) free page list to host. This has generated similar
good results as reporting all free page hints during our tests.

When the guest starts to report, it first sends a start cmd to host via
the free page vq, which acks to host the cmd id received, and tells it the
hint size (e.g. 4MB each on x86). When the guest finishes the reporting,
a stop cmd is sent to host via the vq.

TODO:
- support reporting free page hints from smaller order free page lists
  when there is a need/request from users.

Signed-off-by: Wei Wang <wei.w.wang@intel.com>
Signed-off-by: Liang Li <liang.z.li@intel.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/virtio/virtio_balloon.c     | 347 ++++++++++++++++++++++++++++++++----
 include/uapi/linux/virtio_balloon.h |  11 ++
 2 files changed, 322 insertions(+), 36 deletions(-)

Comments

Michael S. Tsirkin June 26, 2018, 1:37 a.m. UTC | #1
On Mon, Jun 25, 2018 at 08:05:10PM +0800, Wei Wang wrote:
> Negotiation of the VIRTIO_BALLOON_F_FREE_PAGE_HINT feature indicates the
> support of reporting hints of guest free pages to host via virtio-balloon.
> 
> Host requests the guest to report free page hints by sending a new cmd id
> to the guest via the free_page_report_cmd_id configuration register.
> 
> As the first step here, virtio-balloon only reports free page hints from
> the max order (i.e. 10) free page list to host. This has generated similar
> good results as reporting all free page hints during our tests.
> 
> When the guest starts to report, it first sends a start cmd to host via
> the free page vq, which acks to host the cmd id received, and tells it the
> hint size (e.g. 4MB each on x86). When the guest finishes the reporting,
> a stop cmd is sent to host via the vq.
> 
> TODO:
> - support reporting free page hints from smaller order free page lists
>   when there is a need/request from users.
> 
> Signed-off-by: Wei Wang <wei.w.wang@intel.com>
> Signed-off-by: Liang Li <liang.z.li@intel.com>
> Cc: Michael S. Tsirkin <mst@redhat.com>
> Cc: Michal Hocko <mhocko@kernel.org>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> ---
>  drivers/virtio/virtio_balloon.c     | 347 ++++++++++++++++++++++++++++++++----
>  include/uapi/linux/virtio_balloon.h |  11 ++
>  2 files changed, 322 insertions(+), 36 deletions(-)
> 
> diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
> index 6b237e3..d05f0ba 100644
> --- a/drivers/virtio/virtio_balloon.c
> +++ b/drivers/virtio/virtio_balloon.c
> @@ -43,6 +43,11 @@
>  #define OOM_VBALLOON_DEFAULT_PAGES 256
>  #define VIRTBALLOON_OOM_NOTIFY_PRIORITY 80
>  
> +/* The order used to allocate an array to load free page hints */
> +#define ARRAY_ALLOC_ORDER (MAX_ORDER - 1)
> +/* The size of an array in bytes */
> +#define ARRAY_ALLOC_SIZE ((1 << ARRAY_ALLOC_ORDER) << PAGE_SHIFT)
> +


Please prefix these macros so we can tell that they are local ones.

>  static int oom_pages = OOM_VBALLOON_DEFAULT_PAGES;
>  module_param(oom_pages, int, S_IRUSR | S_IWUSR);
>  MODULE_PARM_DESC(oom_pages, "pages to free on OOM");
> @@ -51,9 +56,22 @@ MODULE_PARM_DESC(oom_pages, "pages to free on OOM");
>  static struct vfsmount *balloon_mnt;
>  #endif
>  
> +enum virtio_balloon_vq {
> +	VIRTIO_BALLOON_VQ_INFLATE,
> +	VIRTIO_BALLOON_VQ_DEFLATE,
> +	VIRTIO_BALLOON_VQ_STATS,
> +	VIRTIO_BALLOON_VQ_FREE_PAGE,
> +	VIRTIO_BALLOON_VQ_MAX
> +};
> +
>  struct virtio_balloon {
>  	struct virtio_device *vdev;
> -	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq;
> +	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq;
> +
> +	/* Balloon's own wq for cpu-intensive work items */
> +	struct workqueue_struct *balloon_wq;
> +	/* The free page reporting work item submitted to the balloon wq */
> +	struct work_struct report_free_page_work;
>  
>  	/* The balloon servicing is delegated to a freezable workqueue. */
>  	struct work_struct update_balloon_stats_work;
> @@ -63,6 +81,15 @@ struct virtio_balloon {
>  	spinlock_t stop_update_lock;
>  	bool stop_update;
>  
> +	/* Command buffers to start and stop the reporting of hints to host */
> +	struct virtio_balloon_free_page_hints_cmd cmd_start;
> +	struct virtio_balloon_free_page_hints_cmd cmd_stop;
> +
> +	/* The cmd id received from host */
> +	uint32_t cmd_id_received;
> +	/* The cmd id that is actively in use */
> +	uint32_t cmd_id_active;
> +
>  	/* Waiting for host to ack the pages we released. */
>  	wait_queue_head_t acked;
>  

You want u32 types.

> @@ -326,17 +353,6 @@ static void stats_handle_request(struct virtio_balloon *vb)
>  	virtqueue_kick(vq);
>  }
>  
> -static void virtballoon_changed(struct virtio_device *vdev)
> -{
> -	struct virtio_balloon *vb = vdev->priv;
> -	unsigned long flags;
> -
> -	spin_lock_irqsave(&vb->stop_update_lock, flags);
> -	if (!vb->stop_update)
> -		queue_work(system_freezable_wq, &vb->update_balloon_size_work);
> -	spin_unlock_irqrestore(&vb->stop_update_lock, flags);
> -}
> -
>  static inline s64 towards_target(struct virtio_balloon *vb)
>  {
>  	s64 target;
> @@ -353,6 +369,35 @@ static inline s64 towards_target(struct virtio_balloon *vb)
>  	return target - vb->num_pages;
>  }
>  
> +static void virtballoon_changed(struct virtio_device *vdev)
> +{
> +	struct virtio_balloon *vb = vdev->priv;
> +	unsigned long flags;
> +	s64 diff = towards_target(vb);
> +
> +	if (diff) {
> +		spin_lock_irqsave(&vb->stop_update_lock, flags);
> +		if (!vb->stop_update)
> +			queue_work(system_freezable_wq,
> +				   &vb->update_balloon_size_work);
> +		spin_unlock_irqrestore(&vb->stop_update_lock, flags);
> +	}
> +
> +	if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
> +		virtio_cread(vdev, struct virtio_balloon_config,
> +			     free_page_report_cmd_id, &vb->cmd_id_received);
> +		if (vb->cmd_id_received !=
> +		    VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID &&
> +		    vb->cmd_id_received != vb->cmd_id_active) {
> +			spin_lock_irqsave(&vb->stop_update_lock, flags);
> +			if (!vb->stop_update)
> +				queue_work(vb->balloon_wq,
> +					   &vb->report_free_page_work);
> +			spin_unlock_irqrestore(&vb->stop_update_lock, flags);
> +		}
> +	}
> +}
> +
>  static void update_balloon_size(struct virtio_balloon *vb)
>  {
>  	u32 actual = vb->num_pages;
> @@ -425,44 +470,253 @@ static void update_balloon_size_func(struct work_struct *work)
>  		queue_work(system_freezable_wq, work);
>  }
>  
> +static void free_page_vq_cb(struct virtqueue *vq)
> +{
> +	unsigned int len;
> +	void *buf;
> +	struct virtio_balloon *vb = vq->vdev->priv;
> +
> +	while (1) {
> +		buf = virtqueue_get_buf(vq, &len);
> +
> +		if (!buf || buf == &vb->cmd_start || buf == &vb->cmd_stop)
> +			break;

If there's any buffer after this one we might never get another
callback.

> +		free_pages((unsigned long)buf, ARRAY_ALLOC_ORDER);
> +	}
> +}
> +
>  static int init_vqs(struct virtio_balloon *vb)
>  {
> -	struct virtqueue *vqs[3];
> -	vq_callback_t *callbacks[] = { balloon_ack, balloon_ack, stats_request };
> -	static const char * const names[] = { "inflate", "deflate", "stats" };
> -	int err, nvqs;
> +	struct virtqueue *vqs[VIRTIO_BALLOON_VQ_MAX];
> +	vq_callback_t *callbacks[VIRTIO_BALLOON_VQ_MAX];
> +	const char *names[VIRTIO_BALLOON_VQ_MAX];
> +	struct scatterlist sg;
> +	int ret;
>  
>  	/*
> -	 * We expect two virtqueues: inflate and deflate, and
> -	 * optionally stat.
> +	 * Inflateq and deflateq are used unconditionally. The names[]
> +	 * will be NULL if the related feature is not enabled, which will
> +	 * cause no allocation for the corresponding virtqueue in find_vqs.
>  	 */
> -	nvqs = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ) ? 3 : 2;
> -	err = virtio_find_vqs(vb->vdev, nvqs, vqs, callbacks, names, NULL);
> -	if (err)
> -		return err;
> +	callbacks[VIRTIO_BALLOON_VQ_INFLATE] = balloon_ack;
> +	names[VIRTIO_BALLOON_VQ_INFLATE] = "inflate";
> +	callbacks[VIRTIO_BALLOON_VQ_DEFLATE] = balloon_ack;
> +	names[VIRTIO_BALLOON_VQ_DEFLATE] = "deflate";
> +	names[VIRTIO_BALLOON_VQ_STATS] = NULL;
> +	names[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
>  
> -	vb->inflate_vq = vqs[0];
> -	vb->deflate_vq = vqs[1];
>  	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
> -		struct scatterlist sg;
> -		unsigned int num_stats;
> -		vb->stats_vq = vqs[2];
> +		names[VIRTIO_BALLOON_VQ_STATS] = "stats";
> +		callbacks[VIRTIO_BALLOON_VQ_STATS] = stats_request;
> +	}
>  
> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
> +		names[VIRTIO_BALLOON_VQ_FREE_PAGE] = "free_page_vq";
> +		callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = free_page_vq_cb;
> +	}
> +
> +	ret = vb->vdev->config->find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX,
> +					 vqs, callbacks, names, NULL, NULL);
> +	if (ret)
> +		return ret;
> +
> +	vb->inflate_vq = vqs[VIRTIO_BALLOON_VQ_INFLATE];
> +	vb->deflate_vq = vqs[VIRTIO_BALLOON_VQ_DEFLATE];
> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
> +		vb->stats_vq = vqs[VIRTIO_BALLOON_VQ_STATS];
>  		/*
>  		 * Prime this virtqueue with one buffer so the hypervisor can
>  		 * use it to signal us later (it can't be broken yet!).
>  		 */
> -		num_stats = update_balloon_stats(vb);
> -
> -		sg_init_one(&sg, vb->stats, sizeof(vb->stats[0]) * num_stats);
> -		if (virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb, GFP_KERNEL)
> -		    < 0)
> -			BUG();
> +		sg_init_one(&sg, vb->stats, sizeof(vb->stats));
> +		ret = virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb,
> +					   GFP_KERNEL);
> +		if (ret) {
> +			dev_warn(&vb->vdev->dev, "%s: add stat_vq failed\n",
> +				 __func__);
> +			return ret;
> +		}

Why the change? Is it more likely to happen now?

>  		virtqueue_kick(vb->stats_vq);
>  	}
> +
> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT))
> +		vb->free_page_vq = vqs[VIRTIO_BALLOON_VQ_FREE_PAGE];
> +
>  	return 0;
>  }
>  
> +static int send_start_cmd_id(struct virtio_balloon *vb)
> +{
> +	struct scatterlist sg;
> +	struct virtqueue *vq = vb->free_page_vq;
> +
> +	vb->cmd_start.id = cpu_to_virtio32(vb->vdev, vb->cmd_id_active);
> +	vb->cmd_start.size = cpu_to_virtio32(vb->vdev,
> +					     MAX_ORDER_NR_PAGES * PAGE_SIZE);
> +	sg_init_one(&sg, &vb->cmd_start,
> +		    sizeof(struct virtio_balloon_free_page_hints_cmd));
> +	return virtqueue_add_outbuf(vq, &sg, 1, &vb->cmd_start, GFP_KERNEL);
> +}
> +
> +static int send_stop_cmd_id(struct virtio_balloon *vb)
> +{
> +	struct scatterlist sg;
> +	struct virtqueue *vq = vb->free_page_vq;
> +
> +	vb->cmd_stop.id = cpu_to_virtio32(vb->vdev,
> +				VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID);
> +	vb->cmd_stop.size = 0;
> +	sg_init_one(&sg, &vb->cmd_stop,
> +		    sizeof(struct virtio_balloon_free_page_hints_cmd));
> +	return virtqueue_add_outbuf(vq, &sg, 1, &vb->cmd_stop, GFP_KERNEL);
> +}
> +
> +/*
> + * virtio_balloon_send_hints - send arrays of hints to host
> + * @vb: the virtio_balloon struct
> + * @arrays: the arrays of hints
> + * @array_num: the number of arrays give by the caller
> + * @last_array_hints: the number of hints in the last array
> + *
> + * Send hints to host array by array. This begins by sending a start cmd,
> + * which contains a cmd id received from host and the free page block size in
> + * bytes of each hint. At the end, a stop cmd is sent to host to indicate the
> + * end of this reporting. If host actively requests to stop the reporting, free
> + * the arrays that have not been sent.
> + */
> +static void virtio_balloon_send_hints(struct virtio_balloon *vb,
> +				      __le64 **arrays,
> +				      uint32_t array_num,
> +				      uint32_t last_array_hints)
> +{
> +	int err, i = 0;
> +	struct scatterlist sg;
> +	struct virtqueue *vq = vb->free_page_vq;
> +
> +	/* Start by sending the received cmd id to host with an outbuf. */
> +	err = send_start_cmd_id(vb);
> +	if (unlikely(err))
> +		goto out_err;
> +	/* Kick host to start taking entries from the vq. */
> +	virtqueue_kick(vq);
> +
> +	for (i = 0; i < array_num; i++) {
> +		/*
> +		 * If a stop id or a new cmd id was just received from host,
> +		 * stop the reporting, and free the remaining arrays that
> +		 * haven't been sent to host.
> +		 */
> +		if (vb->cmd_id_received != vb->cmd_id_active)
> +			goto out_free;
> +
> +		if (i + 1 == array_num)
> +			sg_init_one(&sg, (void *)arrays[i],
> +				    last_array_hints * sizeof(__le64));
> +		else
> +			sg_init_one(&sg, (void *)arrays[i], ARRAY_ALLOC_SIZE);
> +		err = virtqueue_add_inbuf(vq, &sg, 1, (void *)arrays[i],
> +					  GFP_KERNEL);
> +		if (unlikely(err))
> +			goto out_err;
> +	}
> +
> +	/* End by sending a stop id to host with an outbuf. */
> +	err = send_stop_cmd_id(vb);
> +	if (unlikely(err))
> +		goto out_err;

Don't we need to kick here?

> +	return;
> +
> +out_err:
> +	dev_err(&vb->vdev->dev, "%s: err = %d\n", __func__, err);
> +out_free:
> +	while (i < array_num)
> +		free_pages((unsigned long)arrays[i++], ARRAY_ALLOC_ORDER);
> +}
> +
> +/*
> + * virtio_balloon_load_hints - load free page hints into arrays
> + * @vb: the virtio_balloon struct
> + * @array_num: the number of arrays allocated
> + * @last_array_hints: the number of hints loaded into the last array
> + *
> + * Only free pages blocks of MAX_ORDER - 1 are loaded into the arrays.
> + * Each array size is MAX_ORDER_NR_PAGES * PAGE_SIZE (e.g. 4MB on x86). Failing
> + * to allocate such an array essentially implies that no such free page blocks
> + * could be reported. Alloacte the number of arrays according to the free page
> + * blocks of MAX_ORDER - 1 that the system may have, and free the unused ones
> + * after loading the free page hints. The last array may be partially loaded,
> + * and @last_array_hints tells the caller about the number of hints there.
> + *
> + * Return the pointer to the memory that holds the addresses of the allocated
> + * arrays, or NULL if no arrays are allocated.
> + */
> +static  __le64 **virtio_balloon_load_hints(struct virtio_balloon *vb,
> +					   uint32_t *array_num,
> +					   uint32_t *last_array_hints)
> +{
> +	__le64 **arrays;
> +	uint32_t max_entries, entries_per_page, entries_per_array,
> +		 max_array_num, loaded_hints;

All above likely should be int.

> +	int i;
> +
> +	max_entries = max_free_page_blocks(ARRAY_ALLOC_ORDER);
> +	entries_per_page = PAGE_SIZE / sizeof(__le64);
> +	entries_per_array = entries_per_page * (1 << ARRAY_ALLOC_ORDER);
> +	max_array_num = max_entries / entries_per_array +
> +			!!(max_entries % entries_per_array);
> +	arrays = kmalloc_array(max_array_num, sizeof(__le64 *), GFP_KERNEL);

Instead of all this mess, how about get_free_pages here as well?

Also why do we need GFP_KERNEL for this?


> +	if (!arrays)
> +		return NULL;
> +
> +	for (i = 0; i < max_array_num; i++) {

So we are getting a ton of memory here just to free it up a bit later.
Why doesn't get_from_free_page_list get the pages from free list for us?
We could also avoid the 1st allocation then - just build a list
of these.


> +		arrays[i] =
> +		(__le64 *)__get_free_pages(__GFP_ATOMIC | __GFP_NOMEMALLOC,
> +					   ARRAY_ALLOC_ORDER);

Coding style says:

Descendants are always substantially shorter than the parent and
are placed substantially to the right. 

> +		if (!arrays[i]) {
Also, if it does fail (small guest), shall we try with fewer arrays?
> +			/*
> +			 * If any one of the arrays fails to be allocated, it
> +			 * implies that the free list that we are interested
> +			 * in is empty, and there is no need to continue the
> +			 * reporting. So just free what's allocated and return
> +			 * NULL.
> +			 */
> +			while (i > 0)
> +				free_pages((unsigned long)arrays[i--],
> +					   ARRAY_ALLOC_ORDER);
> +			kfree(arrays);
> +			return NULL;
> +		}
> +	}
> +	loaded_hints = get_from_free_page_list(ARRAY_ALLOC_ORDER,
> +					       max_array_num, arrays,
> +					       entries_per_array);
> +	*array_num = loaded_hints / entries_per_array +
> +		     !!(max_entries % entries_per_array);
> +	*last_array_hints = loaded_hints -
> +			    (*array_num - 1) * entries_per_array;
> +	for (i = *array_num; i < max_array_num; i++)
> +		free_pages((unsigned long)arrays[i], ARRAY_ALLOC_ORDER);
> +
> +	return arrays;
> +}
> +
> +static void report_free_page_func(struct work_struct *work)
> +{
> +	struct virtio_balloon *vb;
> +	uint32_t array_num = 0, last_array_hints = 0;
> +	__le64 **arrays;
> +
> +	vb = container_of(work, struct virtio_balloon, report_free_page_work);
> +	vb->cmd_id_active = vb->cmd_id_received;
> +
> +	arrays = virtio_balloon_load_hints(vb, &array_num, &last_array_hints);
> +	if (arrays) {
> +		virtio_balloon_send_hints(vb, arrays, array_num,
> +					  last_array_hints);
> +		kfree(arrays);
> +	}
> +}
> +
>  #ifdef CONFIG_BALLOON_COMPACTION
>  /*
>   * virtballoon_migratepage - perform the balloon page migration on behalf of
> @@ -576,18 +830,30 @@ static int virtballoon_probe(struct virtio_device *vdev)
>  	if (err)
>  		goto out_free_vb;
>  
> +	if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
> +		vb->balloon_wq = alloc_workqueue("balloon-wq",
> +					WQ_FREEZABLE | WQ_CPU_INTENSIVE, 0);
> +		if (!vb->balloon_wq) {
> +			err = -ENOMEM;
> +			goto out_del_vqs;
> +		}
> +		INIT_WORK(&vb->report_free_page_work, report_free_page_func);
> +		vb->cmd_id_received = VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID;
> +		vb->cmd_id_active = VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID;
> +	}
> +
>  	vb->nb.notifier_call = virtballoon_oom_notify;
>  	vb->nb.priority = VIRTBALLOON_OOM_NOTIFY_PRIORITY;
>  	err = register_oom_notifier(&vb->nb);
>  	if (err < 0)
> -		goto out_del_vqs;
> +		goto out_del_balloon_wq;
>  
>  #ifdef CONFIG_BALLOON_COMPACTION
>  	balloon_mnt = kern_mount(&balloon_fs);
>  	if (IS_ERR(balloon_mnt)) {
>  		err = PTR_ERR(balloon_mnt);
>  		unregister_oom_notifier(&vb->nb);
> -		goto out_del_vqs;
> +		goto out_del_balloon_wq;
>  	}
>  
>  	vb->vb_dev_info.migratepage = virtballoon_migratepage;
> @@ -597,7 +863,7 @@ static int virtballoon_probe(struct virtio_device *vdev)
>  		kern_unmount(balloon_mnt);
>  		unregister_oom_notifier(&vb->nb);
>  		vb->vb_dev_info.inode = NULL;
> -		goto out_del_vqs;
> +		goto out_del_balloon_wq;
>  	}
>  	vb->vb_dev_info.inode->i_mapping->a_ops = &balloon_aops;
>  #endif
> @@ -608,6 +874,9 @@ static int virtballoon_probe(struct virtio_device *vdev)
>  		virtballoon_changed(vdev);
>  	return 0;
>  
> +out_del_balloon_wq:
> +	if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT))
> +		destroy_workqueue(vb->balloon_wq);
>  out_del_vqs:
>  	vdev->config->del_vqs(vdev);
>  out_free_vb:
> @@ -641,6 +910,11 @@ static void virtballoon_remove(struct virtio_device *vdev)
>  	cancel_work_sync(&vb->update_balloon_size_work);
>  	cancel_work_sync(&vb->update_balloon_stats_work);
>  
> +	if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
> +		cancel_work_sync(&vb->report_free_page_work);
> +		destroy_workqueue(vb->balloon_wq);
> +	}
> +
>  	remove_common(vb);
>  #ifdef CONFIG_BALLOON_COMPACTION
>  	if (vb->vb_dev_info.inode)
> @@ -692,6 +966,7 @@ static unsigned int features[] = {
>  	VIRTIO_BALLOON_F_MUST_TELL_HOST,
>  	VIRTIO_BALLOON_F_STATS_VQ,
>  	VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
> +	VIRTIO_BALLOON_F_FREE_PAGE_HINT,
>  };
>  
>  static struct virtio_driver virtio_balloon_driver = {
> diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
> index 13b8cb5..860456f 100644
> --- a/include/uapi/linux/virtio_balloon.h
> +++ b/include/uapi/linux/virtio_balloon.h
> @@ -34,15 +34,26 @@
>  #define VIRTIO_BALLOON_F_MUST_TELL_HOST	0 /* Tell before reclaiming pages */
>  #define VIRTIO_BALLOON_F_STATS_VQ	1 /* Memory Stats virtqueue */
>  #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM	2 /* Deflate balloon on OOM */
> +#define VIRTIO_BALLOON_F_FREE_PAGE_HINT	3 /* VQ to report free pages */
>  
>  /* Size of a PFN in the balloon interface. */
>  #define VIRTIO_BALLOON_PFN_SHIFT 12
>  
> +#define VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID	0
>  struct virtio_balloon_config {
>  	/* Number of pages host wants Guest to give up. */
>  	__u32 num_pages;
>  	/* Number of pages we've actually got in balloon. */
>  	__u32 actual;
> +	/* Free page report command id, readonly by guest */
> +	__u32 free_page_report_cmd_id;
> +};
> +
> +struct virtio_balloon_free_page_hints_cmd {
> +	/* The command id received from host */
> +	__le32 id;
> +	/* The free page block size in bytes */
> +	__le32 size;
>  };
>  
>  #define VIRTIO_BALLOON_S_SWAP_IN  0   /* Amount of memory swapped in */
> -- 
> 2.7.4
Wang, Wei W June 26, 2018, 3:46 a.m. UTC | #2
On 06/26/2018 09:37 AM, Michael S. Tsirkin wrote:
> On Mon, Jun 25, 2018 at 08:05:10PM +0800, Wei Wang wrote:
>
>> @@ -326,17 +353,6 @@ static void stats_handle_request(struct virtio_balloon *vb)
>>   	virtqueue_kick(vq);
>>   }
>>   
>> -static void virtballoon_changed(struct virtio_device *vdev)
>> -{
>> -	struct virtio_balloon *vb = vdev->priv;
>> -	unsigned long flags;
>> -
>> -	spin_lock_irqsave(&vb->stop_update_lock, flags);
>> -	if (!vb->stop_update)
>> -		queue_work(system_freezable_wq, &vb->update_balloon_size_work);
>> -	spin_unlock_irqrestore(&vb->stop_update_lock, flags);
>> -}
>> -
>>   static inline s64 towards_target(struct virtio_balloon *vb)
>>   {
>>   	s64 target;
>> @@ -353,6 +369,35 @@ static inline s64 towards_target(struct virtio_balloon *vb)
>>   	return target - vb->num_pages;
>>   }
>>   
>> +static void virtballoon_changed(struct virtio_device *vdev)
>> +{
>> +	struct virtio_balloon *vb = vdev->priv;
>> +	unsigned long flags;
>> +	s64 diff = towards_target(vb);
>> +
>> +	if (diff) {
>> +		spin_lock_irqsave(&vb->stop_update_lock, flags);
>> +		if (!vb->stop_update)
>> +			queue_work(system_freezable_wq,
>> +				   &vb->update_balloon_size_work);
>> +		spin_unlock_irqrestore(&vb->stop_update_lock, flags);
>> +	}
>> +
>> +	if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
>> +		virtio_cread(vdev, struct virtio_balloon_config,
>> +			     free_page_report_cmd_id, &vb->cmd_id_received);
>> +		if (vb->cmd_id_received !=
>> +		    VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID &&
>> +		    vb->cmd_id_received != vb->cmd_id_active) {
>> +			spin_lock_irqsave(&vb->stop_update_lock, flags);
>> +			if (!vb->stop_update)
>> +				queue_work(vb->balloon_wq,
>> +					   &vb->report_free_page_work);
>> +			spin_unlock_irqrestore(&vb->stop_update_lock, flags);
>> +		}
>> +	}
>> +}
>> +
>>   static void update_balloon_size(struct virtio_balloon *vb)
>>   {
>>   	u32 actual = vb->num_pages;
>> @@ -425,44 +470,253 @@ static void update_balloon_size_func(struct work_struct *work)
>>   		queue_work(system_freezable_wq, work);
>>   }
>>   
>> +static void free_page_vq_cb(struct virtqueue *vq)
>> +{
>> +	unsigned int len;
>> +	void *buf;
>> +	struct virtio_balloon *vb = vq->vdev->priv;
>> +
>> +	while (1) {
>> +		buf = virtqueue_get_buf(vq, &len);
>> +
>> +		if (!buf || buf == &vb->cmd_start || buf == &vb->cmd_stop)
>> +			break;
> If there's any buffer after this one we might never get another
> callback.

I think every used buffer will get a callback, because the host takes the
arrays one by one, and puts each one back with a vq notify.



>> +		free_pages((unsigned long)buf, ARRAY_ALLOC_ORDER);
>> +	}
>> +}
>> +
>>   static int init_vqs(struct virtio_balloon *vb)
>>   {
>> -	struct virtqueue *vqs[3];
>> -	vq_callback_t *callbacks[] = { balloon_ack, balloon_ack, stats_request };
>> -	static const char * const names[] = { "inflate", "deflate", "stats" };
>> -	int err, nvqs;
>> +	struct virtqueue *vqs[VIRTIO_BALLOON_VQ_MAX];
>> +	vq_callback_t *callbacks[VIRTIO_BALLOON_VQ_MAX];
>> +	const char *names[VIRTIO_BALLOON_VQ_MAX];
>> +	struct scatterlist sg;
>> +	int ret;
>>   
>>   	/*
>> -	 * We expect two virtqueues: inflate and deflate, and
>> -	 * optionally stat.
>> +	 * Inflateq and deflateq are used unconditionally. The names[]
>> +	 * will be NULL if the related feature is not enabled, which will
>> +	 * cause no allocation for the corresponding virtqueue in find_vqs.
>>   	 */
>> -	nvqs = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ) ? 3 : 2;
>> -	err = virtio_find_vqs(vb->vdev, nvqs, vqs, callbacks, names, NULL);
>> -	if (err)
>> -		return err;
>> +	callbacks[VIRTIO_BALLOON_VQ_INFLATE] = balloon_ack;
>> +	names[VIRTIO_BALLOON_VQ_INFLATE] = "inflate";
>> +	callbacks[VIRTIO_BALLOON_VQ_DEFLATE] = balloon_ack;
>> +	names[VIRTIO_BALLOON_VQ_DEFLATE] = "deflate";
>> +	names[VIRTIO_BALLOON_VQ_STATS] = NULL;
>> +	names[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
>>   
>> -	vb->inflate_vq = vqs[0];
>> -	vb->deflate_vq = vqs[1];
>>   	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
>> -		struct scatterlist sg;
>> -		unsigned int num_stats;
>> -		vb->stats_vq = vqs[2];
>> +		names[VIRTIO_BALLOON_VQ_STATS] = "stats";
>> +		callbacks[VIRTIO_BALLOON_VQ_STATS] = stats_request;
>> +	}
>>   
>> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
>> +		names[VIRTIO_BALLOON_VQ_FREE_PAGE] = "free_page_vq";
>> +		callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = free_page_vq_cb;
>> +	}
>> +
>> +	ret = vb->vdev->config->find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX,
>> +					 vqs, callbacks, names, NULL, NULL);
>> +	if (ret)
>> +		return ret;
>> +
>> +	vb->inflate_vq = vqs[VIRTIO_BALLOON_VQ_INFLATE];
>> +	vb->deflate_vq = vqs[VIRTIO_BALLOON_VQ_DEFLATE];
>> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
>> +		vb->stats_vq = vqs[VIRTIO_BALLOON_VQ_STATS];
>>   		/*
>>   		 * Prime this virtqueue with one buffer so the hypervisor can
>>   		 * use it to signal us later (it can't be broken yet!).
>>   		 */
>> -		num_stats = update_balloon_stats(vb);
>> -
>> -		sg_init_one(&sg, vb->stats, sizeof(vb->stats[0]) * num_stats);
>> -		if (virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb, GFP_KERNEL)
>> -		    < 0)
>> -			BUG();
>> +		sg_init_one(&sg, vb->stats, sizeof(vb->stats));
>> +		ret = virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb,
>> +					   GFP_KERNEL);
>> +		if (ret) {
>> +			dev_warn(&vb->vdev->dev, "%s: add stat_vq failed\n",
>> +				 __func__);
>> +			return ret;
>> +		}
> Why the change? Is it more likely to happen now?

Actually this part remains the same as in the previous versions (e.g. v32).
It was changed because we agreed that using BUG() isn't necessary here,
and that it is better to bail out nicely.



>
> +/*
> + * virtio_balloon_send_hints - send arrays of hints to host
> + * @vb: the virtio_balloon struct
> + * @arrays: the arrays of hints
> + * @array_num: the number of arrays give by the caller
> + * @last_array_hints: the number of hints in the last array
> + *
> + * Send hints to host array by array. This begins by sending a start cmd,
> + * which contains a cmd id received from host and the free page block size in
> + * bytes of each hint. At the end, a stop cmd is sent to host to indicate the
> + * end of this reporting. If host actively requests to stop the reporting, free
> + * the arrays that have not been sent.
> + */
> +static void virtio_balloon_send_hints(struct virtio_balloon *vb,
> +				      __le64 **arrays,
> +				      uint32_t array_num,
> +				      uint32_t last_array_hints)
> +{
> +	int err, i = 0;
> +	struct scatterlist sg;
> +	struct virtqueue *vq = vb->free_page_vq;
> +
> +	/* Start by sending the received cmd id to host with an outbuf. */
> +	err = send_start_cmd_id(vb);
> +	if (unlikely(err))
> +		goto out_err;
> +	/* Kick host to start taking entries from the vq. */
> +	virtqueue_kick(vq);
> +
> +	for (i = 0; i < array_num; i++) {
> +		/*
> +		 * If a stop id or a new cmd id was just received from host,
> +		 * stop the reporting, and free the remaining arrays that
> +		 * haven't been sent to host.
> +		 */
> +		if (vb->cmd_id_received != vb->cmd_id_active)
> +			goto out_free;
> +
> +		if (i + 1 == array_num)
> +			sg_init_one(&sg, (void *)arrays[i],
> +				    last_array_hints * sizeof(__le64));
> +		else
> +			sg_init_one(&sg, (void *)arrays[i], ARRAY_ALLOC_SIZE);
> +		err = virtqueue_add_inbuf(vq, &sg, 1, (void *)arrays[i],
> +					  GFP_KERNEL);
> +		if (unlikely(err))
> +			goto out_err;
> +	}
> +
> +	/* End by sending a stop id to host with an outbuf. */
> +	err = send_stop_cmd_id(vb);
> +	if (unlikely(err))
> +		goto out_err;
> Don't we need to kick here?

I think it is not needed, because we have already kicked the host about
starting the report, and the host-side optimization won't exit unless it
receives this stop sign or the migration thread asks it to exit.

>
>> +	int i;
>> +
>> +	max_entries = max_free_page_blocks(ARRAY_ALLOC_ORDER);
>> +	entries_per_page = PAGE_SIZE / sizeof(__le64);
>> +	entries_per_array = entries_per_page * (1 << ARRAY_ALLOC_ORDER);
>> +	max_array_num = max_entries / entries_per_array +
>> +			!!(max_entries % entries_per_array);
>> +	arrays = kmalloc_array(max_array_num, sizeof(__le64 *), GFP_KERNEL);
> Instead of all this mess, how about get_free_pages here as well?

Sounds good, will replace kmalloc_array with __get_free_pages(), but 
still need the above calculation to get max_array_num.

>
> Also why do we need GFP_KERNEL for this?

I guess it is better to use "__GFP_ATOMIC | __GFP_NOMEMALLOC", thanks.

>
>
>> +	if (!arrays)
>> +		return NULL;
>> +
>> +	for (i = 0; i < max_array_num; i++) {
> So we are getting a ton of memory here just to free it up a bit later.
> Why doesn't get_from_free_page_list get the pages from free list for us?
> We could also avoid the 1st allocation then - just build a list
> of these.

That wouldn't be a good choice for us. If we check how the regular
allocation works, there are many things we need to consider when
pages are allocated to users. For example, we need to take care of the
nr_free counter, and we need to check the watermark and perform the related
actions. Also, the folks working on arch_alloc_page to monitor page
allocation activities would get a surprise if page allocation were
allowed to work in this way.





>
>> +		arrays[i] =
>> +		(__le64 *)__get_free_pages(__GFP_ATOMIC | __GFP_NOMEMALLOC,
>> +					   ARRAY_ALLOC_ORDER);
> Coding style says:
>
> Descendants are always substantially shorter than the parent and
> are placed substantially to the right.

Thanks, will rearrange it:

arrays[i] = (__le64 *)__get_free_pages(__GFP_ATOMIC |
				__GFP_NOMEMALLOC, ARRAY_ALLOC_ORDER);



>
>> +		if (!arrays[i]) {
> Also if it does fail (small guest), shall we try with less arrays?

I think it's not needed. If the free list is empty, then no matter whether
it is a huge guest or a small guest, get_from_free_page_list() will load
nothing even if we pass a small array to it.


Best,
Wei
Michael S. Tsirkin June 26, 2018, 3:56 a.m. UTC | #3
On Tue, Jun 26, 2018 at 11:46:35AM +0800, Wei Wang wrote:
> On 06/26/2018 09:37 AM, Michael S. Tsirkin wrote:
> > On Mon, Jun 25, 2018 at 08:05:10PM +0800, Wei Wang wrote:
> > 
> > > @@ -326,17 +353,6 @@ static void stats_handle_request(struct virtio_balloon *vb)
> > >   	virtqueue_kick(vq);
> > >   }
> > > -static void virtballoon_changed(struct virtio_device *vdev)
> > > -{
> > > -	struct virtio_balloon *vb = vdev->priv;
> > > -	unsigned long flags;
> > > -
> > > -	spin_lock_irqsave(&vb->stop_update_lock, flags);
> > > -	if (!vb->stop_update)
> > > -		queue_work(system_freezable_wq, &vb->update_balloon_size_work);
> > > -	spin_unlock_irqrestore(&vb->stop_update_lock, flags);
> > > -}
> > > -
> > >   static inline s64 towards_target(struct virtio_balloon *vb)
> > >   {
> > >   	s64 target;
> > > @@ -353,6 +369,35 @@ static inline s64 towards_target(struct virtio_balloon *vb)
> > >   	return target - vb->num_pages;
> > >   }
> > > +static void virtballoon_changed(struct virtio_device *vdev)
> > > +{
> > > +	struct virtio_balloon *vb = vdev->priv;
> > > +	unsigned long flags;
> > > +	s64 diff = towards_target(vb);
> > > +
> > > +	if (diff) {
> > > +		spin_lock_irqsave(&vb->stop_update_lock, flags);
> > > +		if (!vb->stop_update)
> > > +			queue_work(system_freezable_wq,
> > > +				   &vb->update_balloon_size_work);
> > > +		spin_unlock_irqrestore(&vb->stop_update_lock, flags);
> > > +	}
> > > +
> > > +	if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
> > > +		virtio_cread(vdev, struct virtio_balloon_config,
> > > +			     free_page_report_cmd_id, &vb->cmd_id_received);
> > > +		if (vb->cmd_id_received !=
> > > +		    VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID &&
> > > +		    vb->cmd_id_received != vb->cmd_id_active) {
> > > +			spin_lock_irqsave(&vb->stop_update_lock, flags);
> > > +			if (!vb->stop_update)
> > > +				queue_work(vb->balloon_wq,
> > > +					   &vb->report_free_page_work);
> > > +			spin_unlock_irqrestore(&vb->stop_update_lock, flags);
> > > +		}
> > > +	}
> > > +}
> > > +
> > >   static void update_balloon_size(struct virtio_balloon *vb)
> > >   {
> > >   	u32 actual = vb->num_pages;
> > > @@ -425,44 +470,253 @@ static void update_balloon_size_func(struct work_struct *work)
> > >   		queue_work(system_freezable_wq, work);
> > >   }
> > > +static void free_page_vq_cb(struct virtqueue *vq)
> > > +{
> > > +	unsigned int len;
> > > +	void *buf;
> > > +	struct virtio_balloon *vb = vq->vdev->priv;
> > > +
> > > +	while (1) {
> > > +		buf = virtqueue_get_buf(vq, &len);
> > > +
> > > +		if (!buf || buf == &vb->cmd_start || buf == &vb->cmd_stop)
> > > +			break;
> > If there's any buffer after this one we might never get another
> > callback.
> 
> I think every used buffer can get the callback, because host takes from the
> arrays one by one, and puts back each with a vq notify.

It's probably racy even in this case. Besides, host is free to do it in
any way that's legal in spec.

> 
> 
> > > +		free_pages((unsigned long)buf, ARRAY_ALLOC_ORDER);
> > > +	}
> > > +}
> > > +
> > >   static int init_vqs(struct virtio_balloon *vb)
> > >   {
> > > -	struct virtqueue *vqs[3];
> > > -	vq_callback_t *callbacks[] = { balloon_ack, balloon_ack, stats_request };
> > > -	static const char * const names[] = { "inflate", "deflate", "stats" };
> > > -	int err, nvqs;
> > > +	struct virtqueue *vqs[VIRTIO_BALLOON_VQ_MAX];
> > > +	vq_callback_t *callbacks[VIRTIO_BALLOON_VQ_MAX];
> > > +	const char *names[VIRTIO_BALLOON_VQ_MAX];
> > > +	struct scatterlist sg;
> > > +	int ret;
> > >   	/*
> > > -	 * We expect two virtqueues: inflate and deflate, and
> > > -	 * optionally stat.
> > > +	 * Inflateq and deflateq are used unconditionally. The names[]
> > > +	 * will be NULL if the related feature is not enabled, which will
> > > +	 * cause no allocation for the corresponding virtqueue in find_vqs.
> > >   	 */
> > > -	nvqs = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ) ? 3 : 2;
> > > -	err = virtio_find_vqs(vb->vdev, nvqs, vqs, callbacks, names, NULL);
> > > -	if (err)
> > > -		return err;
> > > +	callbacks[VIRTIO_BALLOON_VQ_INFLATE] = balloon_ack;
> > > +	names[VIRTIO_BALLOON_VQ_INFLATE] = "inflate";
> > > +	callbacks[VIRTIO_BALLOON_VQ_DEFLATE] = balloon_ack;
> > > +	names[VIRTIO_BALLOON_VQ_DEFLATE] = "deflate";
> > > +	names[VIRTIO_BALLOON_VQ_STATS] = NULL;
> > > +	names[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
> > > -	vb->inflate_vq = vqs[0];
> > > -	vb->deflate_vq = vqs[1];
> > >   	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
> > > -		struct scatterlist sg;
> > > -		unsigned int num_stats;
> > > -		vb->stats_vq = vqs[2];
> > > +		names[VIRTIO_BALLOON_VQ_STATS] = "stats";
> > > +		callbacks[VIRTIO_BALLOON_VQ_STATS] = stats_request;
> > > +	}
> > > +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
> > > +		names[VIRTIO_BALLOON_VQ_FREE_PAGE] = "free_page_vq";
> > > +		callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = free_page_vq_cb;
> > > +	}
> > > +
> > > +	ret = vb->vdev->config->find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX,
> > > +					 vqs, callbacks, names, NULL, NULL);
> > > +	if (ret)
> > > +		return ret;
> > > +
> > > +	vb->inflate_vq = vqs[VIRTIO_BALLOON_VQ_INFLATE];
> > > +	vb->deflate_vq = vqs[VIRTIO_BALLOON_VQ_DEFLATE];
> > > +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
> > > +		vb->stats_vq = vqs[VIRTIO_BALLOON_VQ_STATS];
> > >   		/*
> > >   		 * Prime this virtqueue with one buffer so the hypervisor can
> > >   		 * use it to signal us later (it can't be broken yet!).
> > >   		 */
> > > -		num_stats = update_balloon_stats(vb);
> > > -
> > > -		sg_init_one(&sg, vb->stats, sizeof(vb->stats[0]) * num_stats);
> > > -		if (virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb, GFP_KERNEL)
> > > -		    < 0)
> > > -			BUG();
> > > +		sg_init_one(&sg, vb->stats, sizeof(vb->stats));
> > > +		ret = virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb,
> > > +					   GFP_KERNEL);
> > > +		if (ret) {
> > > +			dev_warn(&vb->vdev->dev, "%s: add stat_vq failed\n",
> > > +				 __func__);
> > > +			return ret;
> > > +		}
> > Why the change? Is it more likely to happen now?
> 
> Actually this part remains the same as the previous versions (e.g. v32). It
> is changed because we agreed that using BUG() isn't necessary here, and
> better to bail out nicely.

Why is this part of the hinting patch though? I'd rather have
a separate one.

> 
> 
> > 
> > +/*
> > + * virtio_balloon_send_hints - send arrays of hints to host
> > + * @vb: the virtio_balloon struct
> > + * @arrays: the arrays of hints
> > + * @array_num: the number of arrays give by the caller
> > + * @last_array_hints: the number of hints in the last array
> > + *
> > + * Send hints to host array by array. This begins by sending a start cmd,
> > + * which contains a cmd id received from host and the free page block size in
> > + * bytes of each hint. At the end, a stop cmd is sent to host to indicate the
> > + * end of this reporting. If host actively requests to stop the reporting, free
> > + * the arrays that have not been sent.
> > + */
> > +static void virtio_balloon_send_hints(struct virtio_balloon *vb,
> > +				      __le64 **arrays,
> > +				      uint32_t array_num,
> > +				      uint32_t last_array_hints)
> > +{
> > +	int err, i = 0;
> > +	struct scatterlist sg;
> > +	struct virtqueue *vq = vb->free_page_vq;
> > +
> > +	/* Start by sending the received cmd id to host with an outbuf. */
> > +	err = send_start_cmd_id(vb);
> > +	if (unlikely(err))
> > +		goto out_err;
> > +	/* Kick host to start taking entries from the vq. */
> > +	virtqueue_kick(vq);
> > +
> > +	for (i = 0; i < array_num; i++) {
> > +		/*
> > +		 * If a stop id or a new cmd id was just received from host,
> > +		 * stop the reporting, and free the remaining arrays that
> > +		 * haven't been sent to host.
> > +		 */
> > +		if (vb->cmd_id_received != vb->cmd_id_active)
> > +			goto out_free;
> > +
> > +		if (i + 1 == array_num)
> > +			sg_init_one(&sg, (void *)arrays[i],
> > +				    last_array_hints * sizeof(__le64));
> > +		else
> > +			sg_init_one(&sg, (void *)arrays[i], ARRAY_ALLOC_SIZE);
> > +		err = virtqueue_add_inbuf(vq, &sg, 1, (void *)arrays[i],
> > +					  GFP_KERNEL);
> > +		if (unlikely(err))
> > +			goto out_err;
> > +	}
> > +
> > +	/* End by sending a stop id to host with an outbuf. */
> > +	err = send_stop_cmd_id(vb);
> > +	if (unlikely(err))
> > +		goto out_err;
> > Don't we need to kick here?
> 
> I think not needed, because we have kicked host about starting the report,
> and the host side optimization won't exit unless receiving this stop sign or
> the migration thread asks to exit.

You can't assume that. Host might want to sleep.
If it doesn't then it will disable notifications and kick will be free.

> > 
> > > +	int i;
> > > +
> > > +	max_entries = max_free_page_blocks(ARRAY_ALLOC_ORDER);
> > > +	entries_per_page = PAGE_SIZE / sizeof(__le64);
> > > +	entries_per_array = entries_per_page * (1 << ARRAY_ALLOC_ORDER);
> > > +	max_array_num = max_entries / entries_per_array +
> > > +			!!(max_entries % entries_per_array);
> > > +	arrays = kmalloc_array(max_array_num, sizeof(__le64 *), GFP_KERNEL);
> > Instead of all this mess, how about get_free_pages here as well?
> 
> Sounds good, will replace kmalloc_array with __get_free_pages(),

Or alloc_pages, __ APIs are better avoided if possible.

> but still
> need the above calculation to get max_array_num.

Maybe alloc_pages?

> > 
> > Also why do we need GFP_KERNEL for this?
> 
> I guess it is better to use "__GFP_ATOMIC | __GFP_NOMEMALLOC", thanks.
> 
> > 
> > 
> > > +	if (!arrays)
> > > +		return NULL;
> > > +
> > > +	for (i = 0; i < max_array_num; i++) {
> > So we are getting a ton of memory here just to free it up a bit later.
> > Why doesn't get_from_free_page_list get the pages from free list for us?
> > We could also avoid the 1st allocation then - just build a list
> > of these.
> 
> That wouldn't be a good choice for us. If we check how the regular
> allocation works, there are many many things we need to consider when pages
> are allocated to users.
> For example, we need to take care of the nr_free
> counter, we need to check the watermark and perform the related actions.
> Also the folks working on arch_alloc_page to monitor page allocation
> activities would get a surprise..if page allocation is allowed to work in
> this way.
> 

mm/ code is well positioned to handle all this correctly.


> 
> 
> 
> > 
> > > +		arrays[i] =
> > > +		(__le64 *)__get_free_pages(__GFP_ATOMIC | __GFP_NOMEMALLOC,
> > > +					   ARRAY_ALLOC_ORDER);
> > Coding style says:
> > 
> > Descendants are always substantially shorter than the parent and
> > are placed substantially to the right.
> 
> Thanks, will rearrange it:
> 
> arrays[i] = (__le64 *)__get_free_pages(__GFP_ATOMIC |
> 				__GFP_NOMEMALLOC, ARRAY_ALLOC_ORDER);
> 
> 
> 
> > 
> > > +		if (!arrays[i]) {
> > Also if it does fail (small guest), shall we try with less arrays?
> 
> I think it's not needed. If the free list is empty, no matter it is a huge
> guest or a small guest, get_from_free_page_list() will load nothing even we
> pass a small array to it.
> 
> 
> Best,
> Wei

Yes but the reason it's empty is maybe because we used a ton of
memory for all of the arrays. Why allocate a top level array at all?
Can't we pass in a list?
Wang, Wei W June 26, 2018, 12:27 p.m. UTC | #4
On 06/26/2018 11:56 AM, Michael S. Tsirkin wrote:
> On Tue, Jun 26, 2018 at 11:46:35AM +0800, Wei Wang wrote:
>

>>
>>>
>>>> +	if (!arrays)
>>>> +		return NULL;
>>>> +
>>>> +	for (i = 0; i < max_array_num; i++) {
>>> So we are getting a ton of memory here just to free it up a bit later.
>>> Why doesn't get_from_free_page_list get the pages from free list for us?
>>> We could also avoid the 1st allocation then - just build a list
>>> of these.
>> That wouldn't be a good choice for us. If we check how the regular
>> allocation works, there are many many things we need to consider when pages
>> are allocated to users.
>> For example, we need to take care of the nr_free
>> counter, we need to check the watermark and perform the related actions.
>> Also the folks working on arch_alloc_page to monitor page allocation
>> activities would get a surprise..if page allocation is allowed to work in
>> this way.
>>
> mm/ code is well positioned to handle all this correctly.

I'm afraid that would be a re-implementation of the alloc functions, and 
that would be much more complex than what we have. I think your idea of 
passing a list of pages is better.

Best,
Wei
Michael S. Tsirkin June 26, 2018, 1:34 p.m. UTC | #5
On Tue, Jun 26, 2018 at 08:27:44PM +0800, Wei Wang wrote:
> On 06/26/2018 11:56 AM, Michael S. Tsirkin wrote:
> > On Tue, Jun 26, 2018 at 11:46:35AM +0800, Wei Wang wrote:
> > 
> 
> > > 
> > > > 
> > > > > +	if (!arrays)
> > > > > +		return NULL;
> > > > > +
> > > > > +	for (i = 0; i < max_array_num; i++) {
> > > > So we are getting a ton of memory here just to free it up a bit later.
> > > > Why doesn't get_from_free_page_list get the pages from free list for us?
> > > > We could also avoid the 1st allocation then - just build a list
> > > > of these.
> > > That wouldn't be a good choice for us. If we check how the regular
> > > allocation works, there are many many things we need to consider when pages
> > > are allocated to users.
> > > For example, we need to take care of the nr_free
> > > counter, we need to check the watermark and perform the related actions.
> > > Also the folks working on arch_alloc_page to monitor page allocation
> > > activities would get a surprise..if page allocation is allowed to work in
> > > this way.
> > > 
> > mm/ code is well positioned to handle all this correctly.
> 
> I'm afraid that would be a re-implementation of the alloc functions,

A re-factoring - you can share code. The main difference is locking.

> and
> that would be much more complex than what we have. I think your idea of
> passing a list of pages is better.
> 
> Best,
> Wei

How much memory is this allocating anyway?
Wang, Wei W June 27, 2018, 1:24 a.m. UTC | #6
On 06/26/2018 09:34 PM, Michael S. Tsirkin wrote:
> On Tue, Jun 26, 2018 at 08:27:44PM +0800, Wei Wang wrote:
>> On 06/26/2018 11:56 AM, Michael S. Tsirkin wrote:
>>> On Tue, Jun 26, 2018 at 11:46:35AM +0800, Wei Wang wrote:
>>>
>>>>>> +	if (!arrays)
>>>>>> +		return NULL;
>>>>>> +
>>>>>> +	for (i = 0; i < max_array_num; i++) {
>>>>> So we are getting a ton of memory here just to free it up a bit later.
>>>>> Why doesn't get_from_free_page_list get the pages from free list for us?
>>>>> We could also avoid the 1st allocation then - just build a list
>>>>> of these.
>>>> That wouldn't be a good choice for us. If we check how the regular
>>>> allocation works, there are many many things we need to consider when pages
>>>> are allocated to users.
>>>> For example, we need to take care of the nr_free
>>>> counter, we need to check the watermark and perform the related actions.
>>>> Also the folks working on arch_alloc_page to monitor page allocation
>>>> activities would get a surprise..if page allocation is allowed to work in
>>>> this way.
>>>>
>>> mm/ code is well positioned to handle all this correctly.
>> I'm afraid that would be a re-implementation of the alloc functions,
> A re-factoring - you can share code. The main difference is locking.
>
>> and
>> that would be much more complex than what we have. I think your idea of
>> passing a list of pages is better.
>>
>> Best,
>> Wei
> How much memory is this allocating anyway?
>

For every 2TB of memory that the guest has, we allocate 4MB. This is the
same for both cases.
For today's guests, usually only one 4MB buffer will be allocated and
passed to get_from_free_page_list.

Best,
Wei
Michael S. Tsirkin June 27, 2018, 2:41 a.m. UTC | #7
On Wed, Jun 27, 2018 at 09:24:18AM +0800, Wei Wang wrote:
> On 06/26/2018 09:34 PM, Michael S. Tsirkin wrote:
> > On Tue, Jun 26, 2018 at 08:27:44PM +0800, Wei Wang wrote:
> > > On 06/26/2018 11:56 AM, Michael S. Tsirkin wrote:
> > > > On Tue, Jun 26, 2018 at 11:46:35AM +0800, Wei Wang wrote:
> > > > 
> > > > > > > +	if (!arrays)
> > > > > > > +		return NULL;
> > > > > > > +
> > > > > > > +	for (i = 0; i < max_array_num; i++) {
> > > > > > So we are getting a ton of memory here just to free it up a bit later.
> > > > > > Why doesn't get_from_free_page_list get the pages from free list for us?
> > > > > > We could also avoid the 1st allocation then - just build a list
> > > > > > of these.
> > > > > That wouldn't be a good choice for us. If we check how the regular
> > > > > allocation works, there are many many things we need to consider when pages
> > > > > are allocated to users.
> > > > > For example, we need to take care of the nr_free
> > > > > counter, we need to check the watermark and perform the related actions.
> > > > > Also the folks working on arch_alloc_page to monitor page allocation
> > > > > activities would get a surprise..if page allocation is allowed to work in
> > > > > this way.
> > > > > 
> > > > mm/ code is well positioned to handle all this correctly.
> > > I'm afraid that would be a re-implementation of the alloc functions,
> > A re-factoring - you can share code. The main difference is locking.
> > 
> > > and
> > > that would be much more complex than what we have. I think your idea of
> > > passing a list of pages is better.
> > > 
> > > Best,
> > > Wei
> > How much memory is this allocating anyway?
> > 
> 
> For every 2TB memory that the guest has, we allocate 4MB.

Hmm I guess I'm missing something, I don't see it:


+       max_entries = max_free_page_blocks(ARRAY_ALLOC_ORDER);
+       entries_per_page = PAGE_SIZE / sizeof(__le64);
+       entries_per_array = entries_per_page * (1 << ARRAY_ALLOC_ORDER);
+       max_array_num = max_entries / entries_per_array +
+                       !!(max_entries % entries_per_array);

Looks like you always allocate the max number?


> This is the same
> for both cases.
> For today's guests, usually there will be only one 4MB allocated and passed
> to get_from_free_page_list.
> 
> Best,
> Wei
> 
>
Wang, Wei W June 27, 2018, 3 a.m. UTC | #8
On 06/27/2018 10:41 AM, Michael S. Tsirkin wrote:
> On Wed, Jun 27, 2018 at 09:24:18AM +0800, Wei Wang wrote:
>> On 06/26/2018 09:34 PM, Michael S. Tsirkin wrote:
>>> On Tue, Jun 26, 2018 at 08:27:44PM +0800, Wei Wang wrote:
>>>> On 06/26/2018 11:56 AM, Michael S. Tsirkin wrote:
>>>>> On Tue, Jun 26, 2018 at 11:46:35AM +0800, Wei Wang wrote:
>>>>>
>>>>>>>> +	if (!arrays)
>>>>>>>> +		return NULL;
>>>>>>>> +
>>>>>>>> +	for (i = 0; i < max_array_num; i++) {
>>>>>>> So we are getting a ton of memory here just to free it up a bit later.
>>>>>>> Why doesn't get_from_free_page_list get the pages from free list for us?
>>>>>>> We could also avoid the 1st allocation then - just build a list
>>>>>>> of these.
>>>>>> That wouldn't be a good choice for us. If we check how the regular
>>>>>> allocation works, there are many many things we need to consider when pages
>>>>>> are allocated to users.
>>>>>> For example, we need to take care of the nr_free
>>>>>> counter, we need to check the watermark and perform the related actions.
>>>>>> Also the folks working on arch_alloc_page to monitor page allocation
>>>>>> activities would get a surprise..if page allocation is allowed to work in
>>>>>> this way.
>>>>>>
>>>>> mm/ code is well positioned to handle all this correctly.
>>>> I'm afraid that would be a re-implementation of the alloc functions,
>>> A re-factoring - you can share code. The main difference is locking.
>>>
>>>> and
>>>> that would be much more complex than what we have. I think your idea of
>>>> passing a list of pages is better.
>>>>
>>>> Best,
>>>> Wei
>>> How much memory is this allocating anyway?
>>>
>> For every 2TB memory that the guest has, we allocate 4MB.
> Hmm I guess I'm missing something, I don't see it:
>
>
> +       max_entries = max_free_page_blocks(ARRAY_ALLOC_ORDER);
> +       entries_per_page = PAGE_SIZE / sizeof(__le64);
> +       entries_per_array = entries_per_page * (1 << ARRAY_ALLOC_ORDER);
> +       max_array_num = max_entries / entries_per_array +
> +                       !!(max_entries % entries_per_array);
>
> Looks like you always allocate the max number?

Yes. We allocate the max number and then free what's not used.
For example, for a 16TB guest, we allocate four 4MB buffers and pass the 4
buffers to get_from_free_page_list. If it uses 3, then the remaining 1
"4MB buffer" will end up being freed.

For today's guests, max_array_num is usually 1.

Best,
Wei
Michael S. Tsirkin June 27, 2018, 3:58 a.m. UTC | #9
On Wed, Jun 27, 2018 at 11:00:05AM +0800, Wei Wang wrote:
> On 06/27/2018 10:41 AM, Michael S. Tsirkin wrote:
> > On Wed, Jun 27, 2018 at 09:24:18AM +0800, Wei Wang wrote:
> > > On 06/26/2018 09:34 PM, Michael S. Tsirkin wrote:
> > > > On Tue, Jun 26, 2018 at 08:27:44PM +0800, Wei Wang wrote:
> > > > > On 06/26/2018 11:56 AM, Michael S. Tsirkin wrote:
> > > > > > On Tue, Jun 26, 2018 at 11:46:35AM +0800, Wei Wang wrote:
> > > > > > 
> > > > > > > > > +	if (!arrays)
> > > > > > > > > +		return NULL;
> > > > > > > > > +
> > > > > > > > > +	for (i = 0; i < max_array_num; i++) {
> > > > > > > > So we are getting a ton of memory here just to free it up a bit later.
> > > > > > > > Why doesn't get_from_free_page_list get the pages from free list for us?
> > > > > > > > We could also avoid the 1st allocation then - just build a list
> > > > > > > > of these.
> > > > > > > That wouldn't be a good choice for us. If we check how the regular
> > > > > > > allocation works, there are many many things we need to consider when pages
> > > > > > > are allocated to users.
> > > > > > > For example, we need to take care of the nr_free
> > > > > > > counter, we need to check the watermark and perform the related actions.
> > > > > > > Also the folks working on arch_alloc_page to monitor page allocation
> > > > > > > activities would get a surprise..if page allocation is allowed to work in
> > > > > > > this way.
> > > > > > > 
> > > > > > mm/ code is well positioned to handle all this correctly.
> > > > > I'm afraid that would be a re-implementation of the alloc functions,
> > > > A re-factoring - you can share code. The main difference is locking.
> > > > 
> > > > > and
> > > > > that would be much more complex than what we have. I think your idea of
> > > > > passing a list of pages is better.
> > > > > 
> > > > > Best,
> > > > > Wei
> > > > How much memory is this allocating anyway?
> > > > 
> > > For every 2TB memory that the guest has, we allocate 4MB.
> > Hmm I guess I'm missing something, I don't see it:
> > 
> > 
> > +       max_entries = max_free_page_blocks(ARRAY_ALLOC_ORDER);
> > +       entries_per_page = PAGE_SIZE / sizeof(__le64);
> > +       entries_per_array = entries_per_page * (1 << ARRAY_ALLOC_ORDER);
> > +       max_array_num = max_entries / entries_per_array +
> > +                       !!(max_entries % entries_per_array);
> > 
> > Looks like you always allocate the max number?
> 
> Yes. We allocated the max number and then free what's not used.
> For example, a 16TB guest, we allocate Four 4MB buffers and pass the 4
> buffers to get_from_free_page_list. If it uses 3, then the remaining 1 "4MB
> buffer" will end up being freed.
> 
> For today's guests, max_array_num is usually 1.
> 
> Best,
> Wei

I see, it's based on total ram pages. It's reasonable but might
get out of sync if memory is onlined quickly. So you want to
detect that there's more free memory than can fit and
retry the reporting.

> 
> 
>
Wang, Wei W June 27, 2018, 5:27 a.m. UTC | #10
On 06/27/2018 11:58 AM, Michael S. Tsirkin wrote:
> On Wed, Jun 27, 2018 at 11:00:05AM +0800, Wei Wang wrote:
>> On 06/27/2018 10:41 AM, Michael S. Tsirkin wrote:
>>> On Wed, Jun 27, 2018 at 09:24:18AM +0800, Wei Wang wrote:
>>>> On 06/26/2018 09:34 PM, Michael S. Tsirkin wrote:
>>>>> On Tue, Jun 26, 2018 at 08:27:44PM +0800, Wei Wang wrote:
>>>>>> On 06/26/2018 11:56 AM, Michael S. Tsirkin wrote:
>>>>>>> On Tue, Jun 26, 2018 at 11:46:35AM +0800, Wei Wang wrote:
>>>>>>>
>>>>>>>>>> +	if (!arrays)
>>>>>>>>>> +		return NULL;
>>>>>>>>>> +
>>>>>>>>>> +	for (i = 0; i < max_array_num; i++) {
>>>>>>>>> So we are getting a ton of memory here just to free it up a bit later.
>>>>>>>>> Why doesn't get_from_free_page_list get the pages from free list for us?
>>>>>>>>> We could also avoid the 1st allocation then - just build a list
>>>>>>>>> of these.
>>>>>>>> That wouldn't be a good choice for us. If we check how the regular
>>>>>>>> allocation works, there are many many things we need to consider when pages
>>>>>>>> are allocated to users.
>>>>>>>> For example, we need to take care of the nr_free
>>>>>>>> counter, we need to check the watermark and perform the related actions.
>>>>>>>> Also the folks working on arch_alloc_page to monitor page allocation
>>>>>>>> activities would get a surprise..if page allocation is allowed to work in
>>>>>>>> this way.
>>>>>>>>
>>>>>>> mm/ code is well positioned to handle all this correctly.
>>>>>> I'm afraid that would be a re-implementation of the alloc functions,
>>>>> A re-factoring - you can share code. The main difference is locking.
>>>>>
>>>>>> and
>>>>>> that would be much more complex than what we have. I think your idea of
>>>>>> passing a list of pages is better.
>>>>>>
>>>>>> Best,
>>>>>> Wei
>>>>> How much memory is this allocating anyway?
>>>>>
>>>> For every 2TB memory that the guest has, we allocate 4MB.
>>> Hmm I guess I'm missing something, I don't see it:
>>>
>>>
>>> +       max_entries = max_free_page_blocks(ARRAY_ALLOC_ORDER);
>>> +       entries_per_page = PAGE_SIZE / sizeof(__le64);
>>> +       entries_per_array = entries_per_page * (1 << ARRAY_ALLOC_ORDER);
>>> +       max_array_num = max_entries / entries_per_array +
>>> +                       !!(max_entries % entries_per_array);
>>>
>>> Looks like you always allocate the max number?
>> Yes. We allocated the max number and then free what's not used.
>> For example, a 16TB guest, we allocate Four 4MB buffers and pass the 4
>> buffers to get_from_free_page_list. If it uses 3, then the remaining 1 "4MB
>> buffer" will end up being freed.
>>
>> For today's guests, max_array_num is usually 1.
>>
>> Best,
>> Wei
> I see, it's based on total ram pages. It's reasonable but might
> get out of sync if memory is onlined quickly. So you want to
> detect that there's more free memory than can fit and
> retry the reporting.
>


- AFAIK, memory hotplug isn't expected to happen during live migration 
today. Hypervisors (e.g. QEMU) explicitly forbid this.

- Allocating buffers based on total ram pages already gives some 
headroom for newly plugged memory if that could happen in any case. 
Also, we can think about why people plug in more memory - usually 
because the existing memory isn't enough, which implies that the free 
page list is very likely to be close to empty.

- This method could be easily scaled if people really need more headroom
for hot-plugged memory. For example, the calculation could be based on
"X * total_ram_pages", where X is a number passed from the hypervisor.

- This is an optimization feature, and reporting less free memory in 
that rare case doesn't hurt anything.

So I think it is good to start from a fundamental implementation, which 
doesn't confuse people, and complexities can be added when there is a 
real need in the future.

Best,
Wei
Michael S. Tsirkin June 27, 2018, 4:53 p.m. UTC | #11
On Wed, Jun 27, 2018 at 01:27:55PM +0800, Wei Wang wrote:
> On 06/27/2018 11:58 AM, Michael S. Tsirkin wrote:
> > On Wed, Jun 27, 2018 at 11:00:05AM +0800, Wei Wang wrote:
> > > On 06/27/2018 10:41 AM, Michael S. Tsirkin wrote:
> > > > On Wed, Jun 27, 2018 at 09:24:18AM +0800, Wei Wang wrote:
> > > > > On 06/26/2018 09:34 PM, Michael S. Tsirkin wrote:
> > > > > > On Tue, Jun 26, 2018 at 08:27:44PM +0800, Wei Wang wrote:
> > > > > > > On 06/26/2018 11:56 AM, Michael S. Tsirkin wrote:
> > > > > > > > On Tue, Jun 26, 2018 at 11:46:35AM +0800, Wei Wang wrote:
> > > > > > > > 
> > > > > > > > > > > +	if (!arrays)
> > > > > > > > > > > +		return NULL;
> > > > > > > > > > > +
> > > > > > > > > > > +	for (i = 0; i < max_array_num; i++) {
> > > > > > > > > > So we are getting a ton of memory here just to free it up a bit later.
> > > > > > > > > > Why doesn't get_from_free_page_list get the pages from free list for us?
> > > > > > > > > > We could also avoid the 1st allocation then - just build a list
> > > > > > > > > > of these.
> > > > > > > > > That wouldn't be a good choice for us. If we check how the regular
> > > > > > > > > allocation works, there are many many things we need to consider when pages
> > > > > > > > > are allocated to users.
> > > > > > > > > For example, we need to take care of the nr_free
> > > > > > > > > counter, we need to check the watermark and perform the related actions.
> > > > > > > > > Also the folks working on arch_alloc_page to monitor page allocation
> > > > > > > > > activities would get a surprise..if page allocation is allowed to work in
> > > > > > > > > this way.
> > > > > > > > > 
> > > > > > > > mm/ code is well positioned to handle all this correctly.
> > > > > > > I'm afraid that would be a re-implementation of the alloc functions,
> > > > > > A re-factoring - you can share code. The main difference is locking.
> > > > > > 
> > > > > > > and
> > > > > > > that would be much more complex than what we have. I think your idea of
> > > > > > > passing a list of pages is better.
> > > > > > > 
> > > > > > > Best,
> > > > > > > Wei
> > > > > > How much memory is this allocating anyway?
> > > > > > 
> > > > > For every 2TB memory that the guest has, we allocate 4MB.
> > > > Hmm I guess I'm missing something, I don't see it:
> > > > 
> > > > 
> > > > +       max_entries = max_free_page_blocks(ARRAY_ALLOC_ORDER);
> > > > +       entries_per_page = PAGE_SIZE / sizeof(__le64);
> > > > +       entries_per_array = entries_per_page * (1 << ARRAY_ALLOC_ORDER);
> > > > +       max_array_num = max_entries / entries_per_array +
> > > > +                       !!(max_entries % entries_per_array);
> > > > 
> > > > Looks like you always allocate the max number?
> > > Yes. We allocated the max number and then free what's not used.
> > > For example, a 16TB guest, we allocate Four 4MB buffers and pass the 4
> > > buffers to get_from_free_page_list. If it uses 3, then the remaining 1 "4MB
> > > buffer" will end up being freed.
> > > 
> > > For today's guests, max_array_num is usually 1.
> > > 
> > > Best,
> > > Wei
> > I see, it's based on total ram pages. It's reasonable but might
> > get out of sync if memory is onlined quickly. So you want to
> > detect that there's more free memory than can fit and
> > retry the reporting.
> > 
> 
> 
> - AFAIK, memory hotplug isn't expected to happen during live migration
> today. Hypervisors (e.g. QEMU) explicitly forbid this.

That's a temporary limitation.

> - Allocating buffers based on total ram pages already gives some headroom
> for newly plugged memory if that could happen in any case. Also, we can
> think about why people plug in more memory - usually because the existing
> memory isn't enough, which implies that the free page list is very likely to
> be close to empty.

Or maybe because guest is expected to use more memory.

> - This method could be easily scaled if people really need more headroom for
> hot-plugged memory. For example, calculation based on "X * total_ram_pages",
> X could be a number passed from the hypervisor.

All this in place of a simple retry loop within guest?

> - This is an optimization feature, and reporting less free memory in that
> rare case doesn't hurt anything.

People working on memory hotplug can't be expected to worry about
balloon. And maintainers have other things to do than debug hard to
trigger failure reports from the field.

> 
> So I think it is good to start from a fundamental implementation, which
> doesn't confuse people, and complexities can be added when there is a real
> need in the future.
> 
> Best,
> Wei

The usefulness of the whole patchset hasn't been proven in the field yet.
The more uncovered corner cases there are, the higher the chance that
it will turn out not to be useful after all.

> 
> 
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: virtio-dev-unsubscribe@lists.oasis-open.org
> For additional commands, e-mail: virtio-dev-help@lists.oasis-open.org
diff mbox

Patch

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 6b237e3..d05f0ba 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -43,6 +43,11 @@ 
 #define OOM_VBALLOON_DEFAULT_PAGES 256
 #define VIRTBALLOON_OOM_NOTIFY_PRIORITY 80
 
+/* The order used to allocate an array to load free page hints */
+#define ARRAY_ALLOC_ORDER (MAX_ORDER - 1)
+/* The size of an array in bytes */
+#define ARRAY_ALLOC_SIZE ((1 << ARRAY_ALLOC_ORDER) << PAGE_SHIFT)
+
 static int oom_pages = OOM_VBALLOON_DEFAULT_PAGES;
 module_param(oom_pages, int, S_IRUSR | S_IWUSR);
 MODULE_PARM_DESC(oom_pages, "pages to free on OOM");
@@ -51,9 +56,22 @@  MODULE_PARM_DESC(oom_pages, "pages to free on OOM");
 static struct vfsmount *balloon_mnt;
 #endif
 
+enum virtio_balloon_vq {
+	VIRTIO_BALLOON_VQ_INFLATE,
+	VIRTIO_BALLOON_VQ_DEFLATE,
+	VIRTIO_BALLOON_VQ_STATS,
+	VIRTIO_BALLOON_VQ_FREE_PAGE,
+	VIRTIO_BALLOON_VQ_MAX
+};
+
 struct virtio_balloon {
 	struct virtio_device *vdev;
-	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq;
+	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq;
+
+	/* Balloon's own wq for cpu-intensive work items */
+	struct workqueue_struct *balloon_wq;
+	/* The free page reporting work item submitted to the balloon wq */
+	struct work_struct report_free_page_work;
 
 	/* The balloon servicing is delegated to a freezable workqueue. */
 	struct work_struct update_balloon_stats_work;
@@ -63,6 +81,15 @@  struct virtio_balloon {
 	spinlock_t stop_update_lock;
 	bool stop_update;
 
+	/* Command buffers to start and stop the reporting of hints to host */
+	struct virtio_balloon_free_page_hints_cmd cmd_start;
+	struct virtio_balloon_free_page_hints_cmd cmd_stop;
+
+	/* The cmd id received from host */
+	uint32_t cmd_id_received;
+	/* The cmd id that is actively in use */
+	uint32_t cmd_id_active;
+
 	/* Waiting for host to ack the pages we released. */
 	wait_queue_head_t acked;
 
@@ -326,17 +353,6 @@  static void stats_handle_request(struct virtio_balloon *vb)
 	virtqueue_kick(vq);
 }
 
-static void virtballoon_changed(struct virtio_device *vdev)
-{
-	struct virtio_balloon *vb = vdev->priv;
-	unsigned long flags;
-
-	spin_lock_irqsave(&vb->stop_update_lock, flags);
-	if (!vb->stop_update)
-		queue_work(system_freezable_wq, &vb->update_balloon_size_work);
-	spin_unlock_irqrestore(&vb->stop_update_lock, flags);
-}
-
 static inline s64 towards_target(struct virtio_balloon *vb)
 {
 	s64 target;
@@ -353,6 +369,35 @@  static inline s64 towards_target(struct virtio_balloon *vb)
 	return target - vb->num_pages;
 }
 
+static void virtballoon_changed(struct virtio_device *vdev)
+{
+	struct virtio_balloon *vb = vdev->priv;
+	unsigned long flags;
+	s64 diff = towards_target(vb);
+
+	if (diff) {
+		spin_lock_irqsave(&vb->stop_update_lock, flags);
+		if (!vb->stop_update)
+			queue_work(system_freezable_wq,
+				   &vb->update_balloon_size_work);
+		spin_unlock_irqrestore(&vb->stop_update_lock, flags);
+	}
+
+	if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
+		virtio_cread(vdev, struct virtio_balloon_config,
+			     free_page_report_cmd_id, &vb->cmd_id_received);
+		if (vb->cmd_id_received !=
+		    VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID &&
+		    vb->cmd_id_received != vb->cmd_id_active) {
+			spin_lock_irqsave(&vb->stop_update_lock, flags);
+			if (!vb->stop_update)
+				queue_work(vb->balloon_wq,
+					   &vb->report_free_page_work);
+			spin_unlock_irqrestore(&vb->stop_update_lock, flags);
+		}
+	}
+}
+
 static void update_balloon_size(struct virtio_balloon *vb)
 {
 	u32 actual = vb->num_pages;
@@ -425,44 +470,253 @@  static void update_balloon_size_func(struct work_struct *work)
 		queue_work(system_freezable_wq, work);
 }
 
+static void free_page_vq_cb(struct virtqueue *vq)
+{
+	unsigned int len;
+	void *buf;
+	struct virtio_balloon *vb = vq->vdev->priv;
+
+	while (1) {
+		buf = virtqueue_get_buf(vq, &len);
+
+		if (!buf || buf == &vb->cmd_start || buf == &vb->cmd_stop)
+			break;
+		free_pages((unsigned long)buf, ARRAY_ALLOC_ORDER);
+	}
+}
+
 static int init_vqs(struct virtio_balloon *vb)
 {
-	struct virtqueue *vqs[3];
-	vq_callback_t *callbacks[] = { balloon_ack, balloon_ack, stats_request };
-	static const char * const names[] = { "inflate", "deflate", "stats" };
-	int err, nvqs;
+	struct virtqueue *vqs[VIRTIO_BALLOON_VQ_MAX];
+	vq_callback_t *callbacks[VIRTIO_BALLOON_VQ_MAX];
+	const char *names[VIRTIO_BALLOON_VQ_MAX];
+	struct scatterlist sg;
+	int ret;
 
 	/*
-	 * We expect two virtqueues: inflate and deflate, and
-	 * optionally stat.
+	 * Inflateq and deflateq are used unconditionally. The names[]
+	 * will be NULL if the related feature is not enabled, which will
+	 * cause no allocation for the corresponding virtqueue in find_vqs.
 	 */
-	nvqs = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ) ? 3 : 2;
-	err = virtio_find_vqs(vb->vdev, nvqs, vqs, callbacks, names, NULL);
-	if (err)
-		return err;
+	callbacks[VIRTIO_BALLOON_VQ_INFLATE] = balloon_ack;
+	names[VIRTIO_BALLOON_VQ_INFLATE] = "inflate";
+	callbacks[VIRTIO_BALLOON_VQ_DEFLATE] = balloon_ack;
+	names[VIRTIO_BALLOON_VQ_DEFLATE] = "deflate";
+	names[VIRTIO_BALLOON_VQ_STATS] = NULL;
+	names[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
 
-	vb->inflate_vq = vqs[0];
-	vb->deflate_vq = vqs[1];
 	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
-		struct scatterlist sg;
-		unsigned int num_stats;
-		vb->stats_vq = vqs[2];
+		names[VIRTIO_BALLOON_VQ_STATS] = "stats";
+		callbacks[VIRTIO_BALLOON_VQ_STATS] = stats_request;
+	}
 
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
+		names[VIRTIO_BALLOON_VQ_FREE_PAGE] = "free_page_vq";
+		callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = free_page_vq_cb;
+	}
+
+	ret = vb->vdev->config->find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX,
+					 vqs, callbacks, names, NULL, NULL);
+	if (ret)
+		return ret;
+
+	vb->inflate_vq = vqs[VIRTIO_BALLOON_VQ_INFLATE];
+	vb->deflate_vq = vqs[VIRTIO_BALLOON_VQ_DEFLATE];
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
+		vb->stats_vq = vqs[VIRTIO_BALLOON_VQ_STATS];
 		/*
 		 * Prime this virtqueue with one buffer so the hypervisor can
 		 * use it to signal us later (it can't be broken yet!).
 		 */
-		num_stats = update_balloon_stats(vb);
-
-		sg_init_one(&sg, vb->stats, sizeof(vb->stats[0]) * num_stats);
-		if (virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb, GFP_KERNEL)
-		    < 0)
-			BUG();
+		sg_init_one(&sg, vb->stats, sizeof(vb->stats));
+		ret = virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb,
+					   GFP_KERNEL);
+		if (ret) {
+			dev_warn(&vb->vdev->dev, "%s: add stat_vq failed\n",
+				 __func__);
+			return ret;
+		}
 		virtqueue_kick(vb->stats_vq);
 	}
+
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT))
+		vb->free_page_vq = vqs[VIRTIO_BALLOON_VQ_FREE_PAGE];
+
 	return 0;
 }
 
+static int send_start_cmd_id(struct virtio_balloon *vb)
+{
+	struct scatterlist sg;
+	struct virtqueue *vq = vb->free_page_vq;
+
+	vb->cmd_start.id = cpu_to_virtio32(vb->vdev, vb->cmd_id_active);
+	vb->cmd_start.size = cpu_to_virtio32(vb->vdev,
+					     MAX_ORDER_NR_PAGES * PAGE_SIZE);
+	sg_init_one(&sg, &vb->cmd_start,
+		    sizeof(struct virtio_balloon_free_page_hints_cmd));
+	return virtqueue_add_outbuf(vq, &sg, 1, &vb->cmd_start, GFP_KERNEL);
+}
+
+static int send_stop_cmd_id(struct virtio_balloon *vb)
+{
+	struct scatterlist sg;
+	struct virtqueue *vq = vb->free_page_vq;
+
+	vb->cmd_stop.id = cpu_to_virtio32(vb->vdev,
+				VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID);
+	vb->cmd_stop.size = 0;
+	sg_init_one(&sg, &vb->cmd_stop,
+		    sizeof(struct virtio_balloon_free_page_hints_cmd));
+	return virtqueue_add_outbuf(vq, &sg, 1, &vb->cmd_stop, GFP_KERNEL);
+}
+
+/*
+ * virtio_balloon_send_hints - send arrays of hints to host
+ * @vb: the virtio_balloon struct
+ * @arrays: the arrays of hints
+ * @array_num: the number of arrays given by the caller
+ * @last_array_hints: the number of hints in the last array
+ *
+ * Send hints to host array by array. This begins by sending a start cmd,
+ * which contains a cmd id received from host and the free page block size in
+ * bytes of each hint. At the end, a stop cmd is sent to host to indicate the
+ * end of this reporting. If host actively requests to stop the reporting, free
+ * the arrays that have not been sent.
+ */
+static void virtio_balloon_send_hints(struct virtio_balloon *vb,
+				      __le64 **arrays,
+				      uint32_t array_num,
+				      uint32_t last_array_hints)
+{
+	int err, i = 0;
+	struct scatterlist sg;
+	struct virtqueue *vq = vb->free_page_vq;
+
+	/* Start by sending the received cmd id to host with an outbuf. */
+	err = send_start_cmd_id(vb);
+	if (unlikely(err))
+		goto out_err;
+	/* Kick host to start taking entries from the vq. */
+	virtqueue_kick(vq);
+
+	for (i = 0; i < array_num; i++) {
+		/*
+		 * If a stop id or a new cmd id was just received from host,
+		 * stop the reporting, and free the remaining arrays that
+		 * haven't been sent to host.
+		 */
+		if (vb->cmd_id_received != vb->cmd_id_active)
+			goto out_free;
+
+		if (i + 1 == array_num)
+			sg_init_one(&sg, (void *)arrays[i],
+				    last_array_hints * sizeof(__le64));
+		else
+			sg_init_one(&sg, (void *)arrays[i], ARRAY_ALLOC_SIZE);
+		err = virtqueue_add_inbuf(vq, &sg, 1, (void *)arrays[i],
+					  GFP_KERNEL);
+		if (unlikely(err))
+			goto out_err;
+	}
+
+	/* End by sending a stop id to host with an outbuf. */
+	err = send_stop_cmd_id(vb);
+	if (unlikely(err))
+		goto out_err;
+	return;
+
+out_err:
+	dev_err(&vb->vdev->dev, "%s: err = %d\n", __func__, err);
+out_free:
+	while (i < array_num)
+		free_pages((unsigned long)arrays[i++], ARRAY_ALLOC_ORDER);
+}
+
+/*
+ * virtio_balloon_load_hints - load free page hints into arrays
+ * @vb: the virtio_balloon struct
+ * @array_num: the number of arrays allocated
+ * @last_array_hints: the number of hints loaded into the last array
+ *
+ * Only free page blocks of order MAX_ORDER - 1 are loaded into the arrays.
+ * Each array size is MAX_ORDER_NR_PAGES * PAGE_SIZE (e.g. 4MB on x86). Failing
+ * to allocate such an array essentially implies that no such free page blocks
+ * could be reported. Allocate the number of arrays according to the free page
+ * blocks of MAX_ORDER - 1 that the system may have, and free the unused ones
+ * after loading the free page hints. The last array may be partially loaded,
+ * and @last_array_hints tells the caller about the number of hints there.
+ *
+ * Return the pointer to the memory that holds the addresses of the allocated
+ * arrays, or NULL if no arrays are allocated.
+ */
+static  __le64 **virtio_balloon_load_hints(struct virtio_balloon *vb,
+					   uint32_t *array_num,
+					   uint32_t *last_array_hints)
+{
+	__le64 **arrays;
+	uint32_t max_entries, entries_per_page, entries_per_array,
+		 max_array_num, loaded_hints;
+	int i;
+
+	max_entries = max_free_page_blocks(ARRAY_ALLOC_ORDER);
+	entries_per_page = PAGE_SIZE / sizeof(__le64);
+	entries_per_array = entries_per_page * (1 << ARRAY_ALLOC_ORDER);
+	max_array_num = max_entries / entries_per_array +
+			!!(max_entries % entries_per_array);
+	arrays = kmalloc_array(max_array_num, sizeof(__le64 *), GFP_KERNEL);
+	if (!arrays)
+		return NULL;
+
+	for (i = 0; i < max_array_num; i++) {
+		arrays[i] =
+		(__le64 *)__get_free_pages(__GFP_ATOMIC | __GFP_NOMEMALLOC,
+					   ARRAY_ALLOC_ORDER);
+		if (!arrays[i]) {
+			/*
+			 * If any one of the arrays fails to be allocated, it
+			 * implies that the free list that we are interested
+			 * in is empty, and there is no need to continue the
+			 * reporting. So just free what's allocated and return
+			 * NULL.
+			 */
+			while (i > 0)
+				free_pages((unsigned long)arrays[i--],
+					   ARRAY_ALLOC_ORDER);
+			kfree(arrays);
+			return NULL;
+		}
+	}
+	loaded_hints = get_from_free_page_list(ARRAY_ALLOC_ORDER,
+					       max_array_num, arrays,
+					       entries_per_array);
+	*array_num = loaded_hints / entries_per_array +
+		     !!(max_entries % entries_per_array);
+	*last_array_hints = loaded_hints -
+			    (*array_num - 1) * entries_per_array;
+	for (i = *array_num; i < max_array_num; i++)
+		free_pages((unsigned long)arrays[i], ARRAY_ALLOC_ORDER);
+
+	return arrays;
+}
+
+static void report_free_page_func(struct work_struct *work)
+{
+	struct virtio_balloon *vb;
+	uint32_t array_num = 0, last_array_hints = 0;
+	__le64 **arrays;
+
+	vb = container_of(work, struct virtio_balloon, report_free_page_work);
+	vb->cmd_id_active = vb->cmd_id_received;
+
+	arrays = virtio_balloon_load_hints(vb, &array_num, &last_array_hints);
+	if (arrays) {
+		virtio_balloon_send_hints(vb, arrays, array_num,
+					  last_array_hints);
+		kfree(arrays);
+	}
+}
+
 #ifdef CONFIG_BALLOON_COMPACTION
 /*
  * virtballoon_migratepage - perform the balloon page migration on behalf of
@@ -576,18 +830,30 @@  static int virtballoon_probe(struct virtio_device *vdev)
 	if (err)
 		goto out_free_vb;
 
+	if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
+		vb->balloon_wq = alloc_workqueue("balloon-wq",
+					WQ_FREEZABLE | WQ_CPU_INTENSIVE, 0);
+		if (!vb->balloon_wq) {
+			err = -ENOMEM;
+			goto out_del_vqs;
+		}
+		INIT_WORK(&vb->report_free_page_work, report_free_page_func);
+		vb->cmd_id_received = VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID;
+		vb->cmd_id_active = VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID;
+	}
+
 	vb->nb.notifier_call = virtballoon_oom_notify;
 	vb->nb.priority = VIRTBALLOON_OOM_NOTIFY_PRIORITY;
 	err = register_oom_notifier(&vb->nb);
 	if (err < 0)
-		goto out_del_vqs;
+		goto out_del_balloon_wq;
 
 #ifdef CONFIG_BALLOON_COMPACTION
 	balloon_mnt = kern_mount(&balloon_fs);
 	if (IS_ERR(balloon_mnt)) {
 		err = PTR_ERR(balloon_mnt);
 		unregister_oom_notifier(&vb->nb);
-		goto out_del_vqs;
+		goto out_del_balloon_wq;
 	}
 
 	vb->vb_dev_info.migratepage = virtballoon_migratepage;
@@ -597,7 +863,7 @@  static int virtballoon_probe(struct virtio_device *vdev)
 		kern_unmount(balloon_mnt);
 		unregister_oom_notifier(&vb->nb);
 		vb->vb_dev_info.inode = NULL;
-		goto out_del_vqs;
+		goto out_del_balloon_wq;
 	}
 	vb->vb_dev_info.inode->i_mapping->a_ops = &balloon_aops;
 #endif
@@ -608,6 +874,9 @@  static int virtballoon_probe(struct virtio_device *vdev)
 		virtballoon_changed(vdev);
 	return 0;
 
+out_del_balloon_wq:
+	if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT))
+		destroy_workqueue(vb->balloon_wq);
 out_del_vqs:
 	vdev->config->del_vqs(vdev);
 out_free_vb:
@@ -641,6 +910,11 @@  static void virtballoon_remove(struct virtio_device *vdev)
 	cancel_work_sync(&vb->update_balloon_size_work);
 	cancel_work_sync(&vb->update_balloon_stats_work);
 
+	if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
+		cancel_work_sync(&vb->report_free_page_work);
+		destroy_workqueue(vb->balloon_wq);
+	}
+
 	remove_common(vb);
 #ifdef CONFIG_BALLOON_COMPACTION
 	if (vb->vb_dev_info.inode)
@@ -692,6 +966,7 @@  static unsigned int features[] = {
 	VIRTIO_BALLOON_F_MUST_TELL_HOST,
 	VIRTIO_BALLOON_F_STATS_VQ,
 	VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
+	VIRTIO_BALLOON_F_FREE_PAGE_HINT,
 };
 
 static struct virtio_driver virtio_balloon_driver = {
diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
index 13b8cb5..860456f 100644
--- a/include/uapi/linux/virtio_balloon.h
+++ b/include/uapi/linux/virtio_balloon.h
@@ -34,15 +34,26 @@ 
 #define VIRTIO_BALLOON_F_MUST_TELL_HOST	0 /* Tell before reclaiming pages */
 #define VIRTIO_BALLOON_F_STATS_VQ	1 /* Memory Stats virtqueue */
 #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM	2 /* Deflate balloon on OOM */
+#define VIRTIO_BALLOON_F_FREE_PAGE_HINT	3 /* VQ to report free pages */
 
 /* Size of a PFN in the balloon interface. */
 #define VIRTIO_BALLOON_PFN_SHIFT 12
 
+#define VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID	0
 struct virtio_balloon_config {
 	/* Number of pages host wants Guest to give up. */
 	__u32 num_pages;
 	/* Number of pages we've actually got in balloon. */
 	__u32 actual;
+	/* Free page report command id, readonly by guest */
+	__u32 free_page_report_cmd_id;
+};
+
+struct virtio_balloon_free_page_hints_cmd {
+	/* The command id received from host */
+	__le32 id;
+	/* The free page block size in bytes */
+	__le32 size;
 };
 
 #define VIRTIO_BALLOON_S_SWAP_IN  0   /* Amount of memory swapped in */