Message ID | 1511963726-34070-8-git-send-email-wei.w.wang@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Wei Wang wrote: > +static inline int xb_set_page(struct virtio_balloon *vb, > + struct page *page, > + unsigned long *pfn_min, > + unsigned long *pfn_max) > +{ > + unsigned long pfn = page_to_pfn(page); > + int ret; > + > + *pfn_min = min(pfn, *pfn_min); > + *pfn_max = max(pfn, *pfn_max); > + > + do { > + ret = xb_preload_and_set_bit(&vb->page_xb, pfn, > + GFP_NOWAIT | __GFP_NOWARN); It is a bit of pity that __GFP_NOWARN here is applied to only xb_preload(). Memory allocation by xb_set_bit() will after all emit warnings. Maybe xb_init(&vb->page_xb); vb->page_xb.gfp_mask |= __GFP_NOWARN; is tolerable? Or, unconditionally apply __GFP_NOWARN at xb_init()? static inline void xb_init(struct xb *xb) { INIT_RADIX_TREE(&xb->xbrt, IDR_RT_MARKER | GFP_NOWAIT); } > + } while (unlikely(ret == -EAGAIN)); > + > + return ret; > +} > + > @@ -172,11 +283,18 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num) > vb->num_pfns = 0; > > while ((page = balloon_page_pop(&pages))) { > + if (use_sg) { > + if (xb_set_page(vb, page, &pfn_min, &pfn_max) < 0) { > + __free_page(page); > + break; You cannot "break;" without consuming all pages in "pages". > + } > + } else { > + set_page_pfns(vb, vb->pfns + vb->num_pfns, page); > + } > + > balloon_page_enqueue(&vb->vb_dev_info, page); > > vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE; > - > - set_page_pfns(vb, vb->pfns + vb->num_pfns, page); > vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE; > if (!virtio_has_feature(vb->vdev, > VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) > @@ -212,9 +334,12 @@ static unsigned leak_balloon(struct virtio_balloon *vb, size_t num) > struct page *page; > struct balloon_dev_info *vb_dev_info = &vb->vb_dev_info; > LIST_HEAD(pages); > + bool use_sg = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_SG); You can pass use_sg as an argument to leak_balloon(). Then, you won't need to define leak_balloon_sg_oom(). Since xbitmap allocation does not use __GFP_DIRECT_RECLAIM, it is safe to reuse leak_balloon() for OOM path. Just be sure to pass use_sg == false because memory allocation for use_sg == true likely fails when called from OOM path. (But trying use_sg == true for OOM path and then fallback to use_sg == false is not bad?) > + unsigned long pfn_max = 0, pfn_min = ULONG_MAX; > > - /* We can only do one array worth at a time. */ > - num = min(num, ARRAY_SIZE(vb->pfns)); > + /* Traditionally, we can only do one array worth at a time. */ > + if (!use_sg) > + num = min(num, ARRAY_SIZE(vb->pfns)); > > mutex_lock(&vb->balloon_lock); > /* We can't release more pages than taken */ > diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h > index 343d7dd..37780a7 100644 > --- a/include/uapi/linux/virtio_balloon.h > +++ b/include/uapi/linux/virtio_balloon.h > @@ -34,6 +34,7 @@ > #define VIRTIO_BALLOON_F_MUST_TELL_HOST 0 /* Tell before reclaiming pages */ > #define VIRTIO_BALLOON_F_STATS_VQ 1 /* Memory Stats virtqueue */ > #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM 2 /* Deflate balloon on OOM */ > +#define VIRTIO_BALLOON_F_SG 3 /* Use sg instead of PFN lists */ Want more explicit comment that PFN lists will be used on OOM and therefore the host side must be prepared for both sg and PFN lists even if negotiated?
On Thursday, November 30, 2017 6:36 PM, Tetsuo Handa wrote: > Wei Wang wrote: > > +static inline int xb_set_page(struct virtio_balloon *vb, > > + struct page *page, > > + unsigned long *pfn_min, > > + unsigned long *pfn_max) > > +{ > > + unsigned long pfn = page_to_pfn(page); > > + int ret; > > + > > + *pfn_min = min(pfn, *pfn_min); > > + *pfn_max = max(pfn, *pfn_max); > > + > > + do { > > + ret = xb_preload_and_set_bit(&vb->page_xb, pfn, > > + GFP_NOWAIT | __GFP_NOWARN); > > It is a bit of pity that __GFP_NOWARN here is applied to only xb_preload(). > Memory allocation by xb_set_bit() will after all emit warnings. Maybe > > xb_init(&vb->page_xb); > vb->page_xb.gfp_mask |= __GFP_NOWARN; > > is tolerable? Or, unconditionally apply __GFP_NOWARN at xb_init()? > Please have a check this one: radix_tree_node_alloc() In our case, I think the code path goes to if (!gfpflags_allow_blocking(gfp_mask) && !in_interrupt()) { ... ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask | __GFP_NOWARN); ... goto out; } So I think the __GFP_NOWARN is already there. > static inline void xb_init(struct xb *xb) > { > INIT_RADIX_TREE(&xb->xbrt, IDR_RT_MARKER | GFP_NOWAIT); > } > > > + } while (unlikely(ret == -EAGAIN)); > > + > > + return ret; > > +} > > + > > > > > @@ -172,11 +283,18 @@ static unsigned fill_balloon(struct virtio_balloon > *vb, size_t num) > > vb->num_pfns = 0; > > > > while ((page = balloon_page_pop(&pages))) { > > + if (use_sg) { > > + if (xb_set_page(vb, page, &pfn_min, &pfn_max) < 0) { > > + __free_page(page); > > + break; > > You cannot "break;" without consuming all pages in "pages". Right, I think it should be "continue" here. Thanks. > > > + } > > + } else { > > + set_page_pfns(vb, vb->pfns + vb->num_pfns, page); > > + } > > + > > balloon_page_enqueue(&vb->vb_dev_info, page); > > > > vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE; > > - > > - set_page_pfns(vb, vb->pfns + vb->num_pfns, page); > > vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE; > > if (!virtio_has_feature(vb->vdev, > > > VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) > > > > > @@ -212,9 +334,12 @@ static unsigned leak_balloon(struct virtio_balloon > *vb, size_t num) > > struct page *page; > > struct balloon_dev_info *vb_dev_info = &vb->vb_dev_info; > > LIST_HEAD(pages); > > + bool use_sg = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_SG); > > You can pass use_sg as an argument to leak_balloon(). Then, you won't need > to define leak_balloon_sg_oom(). Since xbitmap allocation does not use > __GFP_DIRECT_RECLAIM, it is safe to reuse leak_balloon() for OOM path. > Just be sure to pass use_sg == false because memory allocation for use_sg == > true likely fails when called from OOM path. (But trying use_sg == true for > OOM path and then fallback to use_sg == false is not bad?) > But once the SG mechanism is in use, we cannot use the non-SG mechanism, because the interface between the guest and host is not the same for SG and non-SG. Methods to make the host support both mechanisms at the same time would complicate the interface and implementation. I also think the old non-SG mechanism should be deprecated to use since its implementation isn't perfect in some sense, e.g. it balloons pages using outbuf which implies that no changes are allowed to the balloon pages and this isn't true for ballooning. The new mechanism (SG) has changed it to use inbuf. So I think using leak_balloon_sg_oom() would be better. Is there any reason that we couldn't use it? Best, Wei
On Wed, Nov 29, 2017 at 09:55:23PM +0800, Wei Wang wrote: > Add a new feature, VIRTIO_BALLOON_F_SG, which enables the transfer of > balloon (i.e. inflated/deflated) pages using scatter-gather lists to the > host. A scatter-gather list is described by a vring desc. > > The implementation of the previous virtio-balloon is not very efficient, > because the balloon pages are transferred to the host by one array each > time. Here is the breakdown of the time in percentage spent on each step > of the balloon inflating process (inflating 7GB of an 8GB idle guest). > > 1) allocating pages (6.5%) > 2) sending PFNs to host (68.3%) > 3) address translation (6.1%) > 4) madvise (19%) > > It takes about 4126ms for the inflating process to complete. The above > profiling shows that the bottlenecks are stage 2) and stage 4). > > This patch optimizes step 2) by transferring pages to host in sgs. An sg > describes a chunk of guest physically continuous pages. With this > mechanism, step 4) can also be optimized by doing address translation and > madvise() in chunks rather than page by page. > > With this new feature, the above ballooning process takes ~440ms resulting > in an improvement of ~89%. > > TODO: optimize stage 1) by allocating/freeing a chunk of pages instead of > a single page each time. > > Signed-off-by: Wei Wang <wei.w.wang@intel.com> > Signed-off-by: Liang Li <liang.z.li@intel.com> > Suggested-by: Michael S. Tsirkin <mst@redhat.com> > Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> > --- > drivers/virtio/virtio_balloon.c | 230 +++++++++++++++++++++++++++++++++--- > include/uapi/linux/virtio_balloon.h | 1 + > 2 files changed, 212 insertions(+), 19 deletions(-) > > diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c > index 7960746..2c21c5a 100644 > --- a/drivers/virtio/virtio_balloon.c > +++ b/drivers/virtio/virtio_balloon.c > @@ -32,6 +32,8 @@ > #include <linux/mm.h> > #include <linux/mount.h> > #include <linux/magic.h> > +#include <linux/xbitmap.h> > +#include <asm/page.h> > > /* > * Balloon device works in 4K page units. So each page is pointed to by > @@ -79,6 +81,9 @@ struct virtio_balloon { > /* Synchronize access/update to this struct virtio_balloon elements */ > struct mutex balloon_lock; > > + /* The xbitmap used to record balloon pages */ > + struct xb page_xb; > + > /* The array of pfns we tell the Host about. */ > unsigned int num_pfns; > __virtio32 pfns[VIRTIO_BALLOON_ARRAY_PFNS_MAX]; > @@ -141,15 +146,121 @@ static void set_page_pfns(struct virtio_balloon *vb, > page_to_balloon_pfn(page) + i); > } > > +static void kick_and_wait(struct virtqueue *vq, wait_queue_head_t wq_head) > +{ > + unsigned int len; > + > + virtqueue_kick(vq); > + wait_event(wq_head, virtqueue_get_buf(vq, &len)); > +} > + > +static void send_one_desc(struct virtio_balloon *vb, > + struct virtqueue *vq, > + uint64_t addr, > + uint32_t len, > + bool inbuf, > + bool batch) > +{ > + int err; > + unsigned int size; > + > + /* Detach all the used buffers from the vq */ > + while (virtqueue_get_buf(vq, &size)) > + ; > + > + err = virtqueue_add_one_desc(vq, addr, len, inbuf, vq); > + /* > + * This is expected to never fail: there is always at least 1 entry > + * available on the vq, because when the vq is full the worker thread > + * that adds the desc will be put into sleep until at least 1 entry is > + * available to use. > + */ > + BUG_ON(err); > + > + /* If batching is requested, we batch till the vq is full */ > + if (!batch || !vq->num_free) > + kick_and_wait(vq, vb->acked); > +} > + This internal kick complicates callers. I suggest that instead, you move this to callers, just return a "kick required" boolean. This way callers do not need to play with num_free at all. > +/* > + * Send balloon pages in sgs to host. The balloon pages are recorded in the > + * page xbitmap. Each bit in the bitmap corresponds to a page of PAGE_SIZE. > + * The page xbitmap is searched for continuous "1" bits, which correspond > + * to continuous pages, to chunk into sgs. > + * > + * @page_xb_start and @page_xb_end form the range of bits in the xbitmap that > + * need to be searched. > + */ > +static void tell_host_sgs(struct virtio_balloon *vb, > + struct virtqueue *vq, > + unsigned long page_xb_start, > + unsigned long page_xb_end) > +{ > + unsigned long pfn_start, pfn_end; > + uint64_t addr; > + uint32_t len, max_len = round_down(UINT_MAX, PAGE_SIZE); > + > + pfn_start = page_xb_start; > + while (pfn_start < page_xb_end) { > + pfn_start = xb_find_next_set_bit(&vb->page_xb, pfn_start, > + page_xb_end); > + if (pfn_start == page_xb_end + 1) > + break; > + pfn_end = xb_find_next_zero_bit(&vb->page_xb, > + pfn_start + 1, > + page_xb_end); > + addr = pfn_start << PAGE_SHIFT; > + len = (pfn_end - pfn_start) << PAGE_SHIFT; This assugnment can overflow. Next line compares with UINT_MAX but by that time it is too late. I think you should do all math in 64 bit to avoid surprises, then truncate to max_len and then it's safe to assign to sg. > + while (len > max_len) { > + send_one_desc(vb, vq, addr, max_len, true, true); > + addr += max_len; > + len -= max_len; > + } > + send_one_desc(vb, vq, addr, len, true, true); > + pfn_start = pfn_end + 1; > + } > + > + /* > + * The last few desc-s may not reach the batch size, but need a kick to > + * notify the device to handle them. > + */ > + if (vq->num_free != virtqueue_get_vring_size(vq)) > + kick_and_wait(vq, vb->acked); the actual trigger for kick is if we did not kick after the last send_one_desc. If kick is moved out of send_one_desc, you can test for that explicitly. > + > + xb_clear_bit_range(&vb->page_xb, page_xb_start, page_xb_end); > +} > + > +static inline int xb_set_page(struct virtio_balloon *vb, > + struct page *page, > + unsigned long *pfn_min, > + unsigned long *pfn_max) > +{ > + unsigned long pfn = page_to_pfn(page); > + int ret; > + > + *pfn_min = min(pfn, *pfn_min); > + *pfn_max = max(pfn, *pfn_max); > + > + do { > + ret = xb_preload_and_set_bit(&vb->page_xb, pfn, > + GFP_NOWAIT | __GFP_NOWARN); > + } while (unlikely(ret == -EAGAIN)); what exactly does this loop do? Does this wait forever until there is some free memory? why GFP_NOWAIT? > + > + return ret; > +} > + > static unsigned fill_balloon(struct virtio_balloon *vb, size_t num) > { > unsigned num_allocated_pages; > unsigned num_pfns; > struct page *page; > LIST_HEAD(pages); > + bool use_sg = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_SG); > + unsigned long pfn_max = 0, pfn_min = ULONG_MAX; > > /* We can only do one array worth at a time. */ > - num = min(num, ARRAY_SIZE(vb->pfns)); > + if (!use_sg) > + num = min(num, ARRAY_SIZE(vb->pfns)); > > for (num_pfns = 0; num_pfns < num; > num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) { > @@ -172,11 +283,18 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num) > vb->num_pfns = 0; > > while ((page = balloon_page_pop(&pages))) { > + if (use_sg) { > + if (xb_set_page(vb, page, &pfn_min, &pfn_max) < 0) { > + __free_page(page); > + break; > + } > + } else { > + set_page_pfns(vb, vb->pfns + vb->num_pfns, page); > + } > + > balloon_page_enqueue(&vb->vb_dev_info, page); > > vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE; > - > - set_page_pfns(vb, vb->pfns + vb->num_pfns, page); > vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE; > if (!virtio_has_feature(vb->vdev, > VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) > @@ -185,8 +303,12 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num) > > num_allocated_pages = vb->num_pfns; > /* Did we get any? */ > - if (vb->num_pfns != 0) > - tell_host(vb, vb->inflate_vq); > + if (vb->num_pfns) { > + if (use_sg) > + tell_host_sgs(vb, vb->inflate_vq, pfn_min, pfn_max); > + else > + tell_host(vb, vb->inflate_vq); > + } > mutex_unlock(&vb->balloon_lock); > > return num_allocated_pages; > @@ -212,9 +334,12 @@ static unsigned leak_balloon(struct virtio_balloon *vb, size_t num) > struct page *page; > struct balloon_dev_info *vb_dev_info = &vb->vb_dev_info; > LIST_HEAD(pages); > + bool use_sg = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_SG); > + unsigned long pfn_max = 0, pfn_min = ULONG_MAX; > > - /* We can only do one array worth at a time. */ > - num = min(num, ARRAY_SIZE(vb->pfns)); > + /* Traditionally, we can only do one array worth at a time. */ > + if (!use_sg) > + num = min(num, ARRAY_SIZE(vb->pfns)); > > mutex_lock(&vb->balloon_lock); > /* We can't release more pages than taken */ > @@ -224,7 +349,14 @@ static unsigned leak_balloon(struct virtio_balloon *vb, size_t num) > page = balloon_page_dequeue(vb_dev_info); > if (!page) > break; > - set_page_pfns(vb, vb->pfns + vb->num_pfns, page); > + if (use_sg) { > + if (xb_set_page(vb, page, &pfn_min, &pfn_max) < 0) { > + balloon_page_enqueue(&vb->vb_dev_info, page); > + break; > + } > + } else { > + set_page_pfns(vb, vb->pfns + vb->num_pfns, page); > + } > list_add(&page->lru, &pages); > vb->num_pages -= VIRTIO_BALLOON_PAGES_PER_PAGE; > } > @@ -235,13 +367,55 @@ static unsigned leak_balloon(struct virtio_balloon *vb, size_t num) > * virtio_has_feature(vdev, VIRTIO_BALLOON_F_MUST_TELL_HOST); > * is true, we *have* to do it in this order > */ > - if (vb->num_pfns != 0) > - tell_host(vb, vb->deflate_vq); > + if (vb->num_pfns) { > + if (use_sg) > + tell_host_sgs(vb, vb->deflate_vq, pfn_min, pfn_max); > + else > + tell_host(vb, vb->deflate_vq); > + } > release_pages_balloon(vb, &pages); > mutex_unlock(&vb->balloon_lock); > return num_freed_pages; > } > > +/* > + * The regular leak_balloon() with VIRTIO_BALLOON_F_SG needs memory allocation > + * for xbitmap, which is not suitable for the oom case. This function does not > + * use xbitmap to chunk pages, so it can be used by oom notifier to deflate > + * pages when VIRTIO_BALLOON_F_SG is negotiated. > + */ I guess we can live with this for now. Two things to consider - adding support for pre-allocating indirect buffers - sorting the internal page queue (how?) > +static unsigned int leak_balloon_sg_oom(struct virtio_balloon *vb) > +{ > + unsigned int n; > + struct page *page; > + struct balloon_dev_info *vb_dev_info = &vb->vb_dev_info; > + struct virtqueue *vq = vb->deflate_vq; > + LIST_HEAD(pages); > + > + mutex_lock(&vb->balloon_lock); > + for (n = 0; n < oom_pages; n++) { > + page = balloon_page_dequeue(vb_dev_info); > + if (!page) > + break; > + > + list_add(&page->lru, &pages); > + vb->num_pages -= VIRTIO_BALLOON_PAGES_PER_PAGE; > + send_one_desc(vb, vq, virt_to_phys(page_address(page)), > + PAGE_SIZE, true, true); > + release_pages_balloon(vb, &pages); > + } > + > + /* > + * The last few sgs may not reach the batch size, but need a kick to > + * notify the device to handle them. > + */ > + if (vq->num_free != virtqueue_get_vring_size(vq)) > + kick_and_wait(vq, vb->acked); > + mutex_unlock(&vb->balloon_lock); > + > + return n; > +} > + > static inline void update_stat(struct virtio_balloon *vb, int idx, > u16 tag, u64 val) > { > @@ -381,7 +555,10 @@ static int virtballoon_oom_notify(struct notifier_block *self, > return NOTIFY_OK; > > freed = parm; > - num_freed_pages = leak_balloon(vb, oom_pages); > + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_SG)) > + num_freed_pages = leak_balloon_sg_oom(vb); > + else > + num_freed_pages = leak_balloon(vb, oom_pages); > update_balloon_size(vb); > *freed += num_freed_pages; > > @@ -478,6 +655,7 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info, > { > struct virtio_balloon *vb = container_of(vb_dev_info, > struct virtio_balloon, vb_dev_info); > + bool use_sg = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_SG); > unsigned long flags; > > /* > @@ -499,16 +677,26 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info, > vb_dev_info->isolated_pages--; > __count_vm_event(BALLOON_MIGRATE); > spin_unlock_irqrestore(&vb_dev_info->pages_lock, flags); > - vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE; > - set_page_pfns(vb, vb->pfns, newpage); > - tell_host(vb, vb->inflate_vq); > - > + if (use_sg) { > + send_one_desc(vb, vb->inflate_vq, > + virt_to_phys(page_address(newpage)), > + PAGE_SIZE, true, false); > + } else { > + vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE; > + set_page_pfns(vb, vb->pfns, newpage); > + tell_host(vb, vb->inflate_vq); > + } > /* balloon's page migration 2nd step -- deflate "page" */ > balloon_page_delete(page); > - vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE; > - set_page_pfns(vb, vb->pfns, page); > - tell_host(vb, vb->deflate_vq); > - > + if (use_sg) { > + send_one_desc(vb, vb->inflate_vq, > + virt_to_phys(page_address(page)), > + PAGE_SIZE, true, false); > + } else { > + vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE; > + set_page_pfns(vb, vb->pfns, page); > + tell_host(vb, vb->deflate_vq); > + } > mutex_unlock(&vb->balloon_lock); > > put_page(page); /* balloon reference */ > @@ -567,6 +755,9 @@ static int virtballoon_probe(struct virtio_device *vdev) > if (err) > goto out_free_vb; > > + if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_SG)) > + xb_init(&vb->page_xb); > + > vb->nb.notifier_call = virtballoon_oom_notify; > vb->nb.priority = VIRTBALLOON_OOM_NOTIFY_PRIORITY; > err = register_oom_notifier(&vb->nb); > @@ -683,6 +874,7 @@ static unsigned int features[] = { > VIRTIO_BALLOON_F_MUST_TELL_HOST, > VIRTIO_BALLOON_F_STATS_VQ, > VIRTIO_BALLOON_F_DEFLATE_ON_OOM, > + VIRTIO_BALLOON_F_SG, > }; > > static struct virtio_driver virtio_balloon_driver = { > diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h > index 343d7dd..37780a7 100644 > --- a/include/uapi/linux/virtio_balloon.h > +++ b/include/uapi/linux/virtio_balloon.h > @@ -34,6 +34,7 @@ > #define VIRTIO_BALLOON_F_MUST_TELL_HOST 0 /* Tell before reclaiming pages */ > #define VIRTIO_BALLOON_F_STATS_VQ 1 /* Memory Stats virtqueue */ > #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM 2 /* Deflate balloon on OOM */ > +#define VIRTIO_BALLOON_F_SG 3 /* Use sg instead of PFN lists */ > > /* Size of a PFN in the balloon interface. */ > #define VIRTIO_BALLOON_PFN_SHIFT 12 > -- > 2.7.4
On 12/01/2017 11:38 PM, Michael S. Tsirkin wrote: > On Wed, Nov 29, 2017 at 09:55:23PM +0800, Wei Wang wrote: >> +static void send_one_desc(struct virtio_balloon *vb, >> + struct virtqueue *vq, >> + uint64_t addr, >> + uint32_t len, >> + bool inbuf, >> + bool batch) >> +{ >> + int err; >> + unsigned int size; >> + >> + /* Detach all the used buffers from the vq */ >> + while (virtqueue_get_buf(vq, &size)) >> + ; >> + >> + err = virtqueue_add_one_desc(vq, addr, len, inbuf, vq); >> + /* >> + * This is expected to never fail: there is always at least 1 entry >> + * available on the vq, because when the vq is full the worker thread >> + * that adds the desc will be put into sleep until at least 1 entry is >> + * available to use. >> + */ >> + BUG_ON(err); >> + >> + /* If batching is requested, we batch till the vq is full */ >> + if (!batch || !vq->num_free) >> + kick_and_wait(vq, vb->acked); >> +} >> + > This internal kick complicates callers. I suggest that instead, > you move this to callers, just return a "kick required" boolean. > This way callers do not need to play with num_free at all. Then in what situation would the function return true of "kick required"? I think this wouldn't make a difference fundamentally. For example, we have 257 sgs (batching size=256) to send to host: while (i < 257) { kick_required = send_sgs(); if (kick_required) kick(); // After the 256 sgs have been added, the caller performs a kick(). } Do we still need a kick here for the 257th sg as before? Only the caller knows if the last added sgs need a kick (when the send_sgs receives one sg, it doesn't know if there are more to come). There is another approach to checking if the last added sgs haven't been sync-ed to the host: expose "vring_virtqueue->num_added" to the caller via a virtio_ring API: unsigned int virtqueue_num_added(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); return vq->num_added; } >> +/* >> + * Send balloon pages in sgs to host. The balloon pages are recorded in the >> + * page xbitmap. Each bit in the bitmap corresponds to a page of PAGE_SIZE. >> + * The page xbitmap is searched for continuous "1" bits, which correspond >> + * to continuous pages, to chunk into sgs. >> + * >> + * @page_xb_start and @page_xb_end form the range of bits in the xbitmap that >> + * need to be searched. >> + */ >> +static void tell_host_sgs(struct virtio_balloon *vb, >> + struct virtqueue *vq, >> + unsigned long page_xb_start, >> + unsigned long page_xb_end) >> +{ >> + unsigned long pfn_start, pfn_end; >> + uint64_t addr; >> + uint32_t len, max_len = round_down(UINT_MAX, PAGE_SIZE); >> + >> + pfn_start = page_xb_start; >> + while (pfn_start < page_xb_end) { >> + pfn_start = xb_find_next_set_bit(&vb->page_xb, pfn_start, >> + page_xb_end); >> + if (pfn_start == page_xb_end + 1) >> + break; >> + pfn_end = xb_find_next_zero_bit(&vb->page_xb, >> + pfn_start + 1, >> + page_xb_end); >> + addr = pfn_start << PAGE_SHIFT; >> + len = (pfn_end - pfn_start) << PAGE_SHIFT; > This assugnment can overflow. Next line compares with UINT_MAX but by > that time it is too late. I think you should do all math in 64 bit to > avoid surprises, then truncate to max_len and then it's safe to assign > to sg. Sounds reasonable, thanks. >> + >> + xb_clear_bit_range(&vb->page_xb, page_xb_start, page_xb_end); >> +} >> + >> +static inline int xb_set_page(struct virtio_balloon *vb, >> + struct page *page, >> + unsigned long *pfn_min, >> + unsigned long *pfn_max) >> +{ >> + unsigned long pfn = page_to_pfn(page); >> + int ret; >> + >> + *pfn_min = min(pfn, *pfn_min); >> + *pfn_max = max(pfn, *pfn_max); >> + >> + do { >> + ret = xb_preload_and_set_bit(&vb->page_xb, pfn, >> + GFP_NOWAIT | __GFP_NOWARN); >> + } while (unlikely(ret == -EAGAIN)); > what exactly does this loop do? Does this wait > forever until there is some free memory? why GFP_NOWAIT? Basically, "-EAGAIN" is returned from xb_set_bit() in the case when the pre-allocated per-cpu ida_bitmap is NULL. In that case, the caller re-invokes xb_preload_and_set_bit(), which re-invokes xb_preload to allocate ida_bitmap. So "-EAGAIN" actually does not indicate a status about memory allocation. "-ENOMEM" is the one to indicate the failure of memory allocation, but the loop doesn't re-try on "-ENOMEM". GFP_NOWAIT is used to avoid memory reclaiming, which could cause the deadlock issue we discussed before. > return num_freed_pages; > } > > +/* > + * The regular leak_balloon() with VIRTIO_BALLOON_F_SG needs memory allocation > + * for xbitmap, which is not suitable for the oom case. This function does not > + * use xbitmap to chunk pages, so it can be used by oom notifier to deflate > + * pages when VIRTIO_BALLOON_F_SG is negotiated. > + */ > I guess we can live with this for now. Agree, the patchset has been big. We can get the basic implementation in first, and leave the following as future work. I can add it in the commit log. > Two things to consider > - adding support for pre-allocating indirect buffers > - sorting the internal page queue (how?) Best, Wei
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 7960746..2c21c5a 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -32,6 +32,8 @@ #include <linux/mm.h> #include <linux/mount.h> #include <linux/magic.h> +#include <linux/xbitmap.h> +#include <asm/page.h> /* * Balloon device works in 4K page units. So each page is pointed to by @@ -79,6 +81,9 @@ struct virtio_balloon { /* Synchronize access/update to this struct virtio_balloon elements */ struct mutex balloon_lock; + /* The xbitmap used to record balloon pages */ + struct xb page_xb; + /* The array of pfns we tell the Host about. */ unsigned int num_pfns; __virtio32 pfns[VIRTIO_BALLOON_ARRAY_PFNS_MAX]; @@ -141,15 +146,121 @@ static void set_page_pfns(struct virtio_balloon *vb, page_to_balloon_pfn(page) + i); } +static void kick_and_wait(struct virtqueue *vq, wait_queue_head_t wq_head) +{ + unsigned int len; + + virtqueue_kick(vq); + wait_event(wq_head, virtqueue_get_buf(vq, &len)); +} + +static void send_one_desc(struct virtio_balloon *vb, + struct virtqueue *vq, + uint64_t addr, + uint32_t len, + bool inbuf, + bool batch) +{ + int err; + unsigned int size; + + /* Detach all the used buffers from the vq */ + while (virtqueue_get_buf(vq, &size)) + ; + + err = virtqueue_add_one_desc(vq, addr, len, inbuf, vq); + /* + * This is expected to never fail: there is always at least 1 entry + * available on the vq, because when the vq is full the worker thread + * that adds the desc will be put into sleep until at least 1 entry is + * available to use. + */ + BUG_ON(err); + + /* If batching is requested, we batch till the vq is full */ + if (!batch || !vq->num_free) + kick_and_wait(vq, vb->acked); +} + +/* + * Send balloon pages in sgs to host. The balloon pages are recorded in the + * page xbitmap. Each bit in the bitmap corresponds to a page of PAGE_SIZE. + * The page xbitmap is searched for continuous "1" bits, which correspond + * to continuous pages, to chunk into sgs. + * + * @page_xb_start and @page_xb_end form the range of bits in the xbitmap that + * need to be searched. + */ +static void tell_host_sgs(struct virtio_balloon *vb, + struct virtqueue *vq, + unsigned long page_xb_start, + unsigned long page_xb_end) +{ + unsigned long pfn_start, pfn_end; + uint64_t addr; + uint32_t len, max_len = round_down(UINT_MAX, PAGE_SIZE); + + pfn_start = page_xb_start; + while (pfn_start < page_xb_end) { + pfn_start = xb_find_next_set_bit(&vb->page_xb, pfn_start, + page_xb_end); + if (pfn_start == page_xb_end + 1) + break; + pfn_end = xb_find_next_zero_bit(&vb->page_xb, + pfn_start + 1, + page_xb_end); + addr = pfn_start << PAGE_SHIFT; + len = (pfn_end - pfn_start) << PAGE_SHIFT; + while (len > max_len) { + send_one_desc(vb, vq, addr, max_len, true, true); + addr += max_len; + len -= max_len; + } + send_one_desc(vb, vq, addr, len, true, true); + pfn_start = pfn_end + 1; + } + + /* + * The last few desc-s may not reach the batch size, but need a kick to + * notify the device to handle them. + */ + if (vq->num_free != virtqueue_get_vring_size(vq)) + kick_and_wait(vq, vb->acked); + + xb_clear_bit_range(&vb->page_xb, page_xb_start, page_xb_end); +} + +static inline int xb_set_page(struct virtio_balloon *vb, + struct page *page, + unsigned long *pfn_min, + unsigned long *pfn_max) +{ + unsigned long pfn = page_to_pfn(page); + int ret; + + *pfn_min = min(pfn, *pfn_min); + *pfn_max = max(pfn, *pfn_max); + + do { + ret = xb_preload_and_set_bit(&vb->page_xb, pfn, + GFP_NOWAIT | __GFP_NOWARN); + } while (unlikely(ret == -EAGAIN)); + + return ret; +} + static unsigned fill_balloon(struct virtio_balloon *vb, size_t num) { unsigned num_allocated_pages; unsigned num_pfns; struct page *page; LIST_HEAD(pages); + bool use_sg = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_SG); + unsigned long pfn_max = 0, pfn_min = ULONG_MAX; /* We can only do one array worth at a time. */ - num = min(num, ARRAY_SIZE(vb->pfns)); + if (!use_sg) + num = min(num, ARRAY_SIZE(vb->pfns)); for (num_pfns = 0; num_pfns < num; num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) { @@ -172,11 +283,18 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num) vb->num_pfns = 0; while ((page = balloon_page_pop(&pages))) { + if (use_sg) { + if (xb_set_page(vb, page, &pfn_min, &pfn_max) < 0) { + __free_page(page); + break; + } + } else { + set_page_pfns(vb, vb->pfns + vb->num_pfns, page); + } + balloon_page_enqueue(&vb->vb_dev_info, page); vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE; - - set_page_pfns(vb, vb->pfns + vb->num_pfns, page); vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE; if (!virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) @@ -185,8 +303,12 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num) num_allocated_pages = vb->num_pfns; /* Did we get any? */ - if (vb->num_pfns != 0) - tell_host(vb, vb->inflate_vq); + if (vb->num_pfns) { + if (use_sg) + tell_host_sgs(vb, vb->inflate_vq, pfn_min, pfn_max); + else + tell_host(vb, vb->inflate_vq); + } mutex_unlock(&vb->balloon_lock); return num_allocated_pages; @@ -212,9 +334,12 @@ static unsigned leak_balloon(struct virtio_balloon *vb, size_t num) struct page *page; struct balloon_dev_info *vb_dev_info = &vb->vb_dev_info; LIST_HEAD(pages); + bool use_sg = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_SG); + unsigned long pfn_max = 0, pfn_min = ULONG_MAX; - /* We can only do one array worth at a time. */ - num = min(num, ARRAY_SIZE(vb->pfns)); + /* Traditionally, we can only do one array worth at a time. */ + if (!use_sg) + num = min(num, ARRAY_SIZE(vb->pfns)); mutex_lock(&vb->balloon_lock); /* We can't release more pages than taken */ @@ -224,7 +349,14 @@ static unsigned leak_balloon(struct virtio_balloon *vb, size_t num) page = balloon_page_dequeue(vb_dev_info); if (!page) break; - set_page_pfns(vb, vb->pfns + vb->num_pfns, page); + if (use_sg) { + if (xb_set_page(vb, page, &pfn_min, &pfn_max) < 0) { + balloon_page_enqueue(&vb->vb_dev_info, page); + break; + } + } else { + set_page_pfns(vb, vb->pfns + vb->num_pfns, page); + } list_add(&page->lru, &pages); vb->num_pages -= VIRTIO_BALLOON_PAGES_PER_PAGE; } @@ -235,13 +367,55 @@ static unsigned leak_balloon(struct virtio_balloon *vb, size_t num) * virtio_has_feature(vdev, VIRTIO_BALLOON_F_MUST_TELL_HOST); * is true, we *have* to do it in this order */ - if (vb->num_pfns != 0) - tell_host(vb, vb->deflate_vq); + if (vb->num_pfns) { + if (use_sg) + tell_host_sgs(vb, vb->deflate_vq, pfn_min, pfn_max); + else + tell_host(vb, vb->deflate_vq); + } release_pages_balloon(vb, &pages); mutex_unlock(&vb->balloon_lock); return num_freed_pages; } +/* + * The regular leak_balloon() with VIRTIO_BALLOON_F_SG needs memory allocation + * for xbitmap, which is not suitable for the oom case. This function does not + * use xbitmap to chunk pages, so it can be used by oom notifier to deflate + * pages when VIRTIO_BALLOON_F_SG is negotiated. + */ +static unsigned int leak_balloon_sg_oom(struct virtio_balloon *vb) +{ + unsigned int n; + struct page *page; + struct balloon_dev_info *vb_dev_info = &vb->vb_dev_info; + struct virtqueue *vq = vb->deflate_vq; + LIST_HEAD(pages); + + mutex_lock(&vb->balloon_lock); + for (n = 0; n < oom_pages; n++) { + page = balloon_page_dequeue(vb_dev_info); + if (!page) + break; + + list_add(&page->lru, &pages); + vb->num_pages -= VIRTIO_BALLOON_PAGES_PER_PAGE; + send_one_desc(vb, vq, virt_to_phys(page_address(page)), + PAGE_SIZE, true, true); + release_pages_balloon(vb, &pages); + } + + /* + * The last few sgs may not reach the batch size, but need a kick to + * notify the device to handle them. + */ + if (vq->num_free != virtqueue_get_vring_size(vq)) + kick_and_wait(vq, vb->acked); + mutex_unlock(&vb->balloon_lock); + + return n; +} + static inline void update_stat(struct virtio_balloon *vb, int idx, u16 tag, u64 val) { @@ -381,7 +555,10 @@ static int virtballoon_oom_notify(struct notifier_block *self, return NOTIFY_OK; freed = parm; - num_freed_pages = leak_balloon(vb, oom_pages); + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_SG)) + num_freed_pages = leak_balloon_sg_oom(vb); + else + num_freed_pages = leak_balloon(vb, oom_pages); update_balloon_size(vb); *freed += num_freed_pages; @@ -478,6 +655,7 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info, { struct virtio_balloon *vb = container_of(vb_dev_info, struct virtio_balloon, vb_dev_info); + bool use_sg = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_SG); unsigned long flags; /* @@ -499,16 +677,26 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info, vb_dev_info->isolated_pages--; __count_vm_event(BALLOON_MIGRATE); spin_unlock_irqrestore(&vb_dev_info->pages_lock, flags); - vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE; - set_page_pfns(vb, vb->pfns, newpage); - tell_host(vb, vb->inflate_vq); - + if (use_sg) { + send_one_desc(vb, vb->inflate_vq, + virt_to_phys(page_address(newpage)), + PAGE_SIZE, true, false); + } else { + vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE; + set_page_pfns(vb, vb->pfns, newpage); + tell_host(vb, vb->inflate_vq); + } /* balloon's page migration 2nd step -- deflate "page" */ balloon_page_delete(page); - vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE; - set_page_pfns(vb, vb->pfns, page); - tell_host(vb, vb->deflate_vq); - + if (use_sg) { + send_one_desc(vb, vb->inflate_vq, + virt_to_phys(page_address(page)), + PAGE_SIZE, true, false); + } else { + vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE; + set_page_pfns(vb, vb->pfns, page); + tell_host(vb, vb->deflate_vq); + } mutex_unlock(&vb->balloon_lock); put_page(page); /* balloon reference */ @@ -567,6 +755,9 @@ static int virtballoon_probe(struct virtio_device *vdev) if (err) goto out_free_vb; + if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_SG)) + xb_init(&vb->page_xb); + vb->nb.notifier_call = virtballoon_oom_notify; vb->nb.priority = VIRTBALLOON_OOM_NOTIFY_PRIORITY; err = register_oom_notifier(&vb->nb); @@ -683,6 +874,7 @@ static unsigned int features[] = { VIRTIO_BALLOON_F_MUST_TELL_HOST, VIRTIO_BALLOON_F_STATS_VQ, VIRTIO_BALLOON_F_DEFLATE_ON_OOM, + VIRTIO_BALLOON_F_SG, }; static struct virtio_driver virtio_balloon_driver = { diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h index 343d7dd..37780a7 100644 --- a/include/uapi/linux/virtio_balloon.h +++ b/include/uapi/linux/virtio_balloon.h @@ -34,6 +34,7 @@ #define VIRTIO_BALLOON_F_MUST_TELL_HOST 0 /* Tell before reclaiming pages */ #define VIRTIO_BALLOON_F_STATS_VQ 1 /* Memory Stats virtqueue */ #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM 2 /* Deflate balloon on OOM */ +#define VIRTIO_BALLOON_F_SG 3 /* Use sg instead of PFN lists */ /* Size of a PFN in the balloon interface. */ #define VIRTIO_BALLOON_PFN_SHIFT 12