diff mbox series

[v16.1,6/9] virtio-balloon: Add support for providing free page reports to host

Message ID 20200122174347.6142.92803.stgit@localhost.localdomain (mailing list archive)
State New, archived
Headers show
Series mm / virtio: Provide support for free page reporting | expand

Commit Message

Alexander H Duyck Jan. 22, 2020, 5:43 p.m. UTC
From: Alexander Duyck <alexander.h.duyck@linux.intel.com>

Add support for the page reporting feature provided by virtio-balloon.
Reporting differs from the regular balloon functionality in that is is
much less durable than a standard memory balloon. Instead of creating a
list of pages that cannot be accessed the pages are only inaccessible
while they are being indicated to the virtio interface. Once the
interface has acknowledged them they are placed back into their respective
free lists and are once again accessible by the guest system.

Unlike a standard balloon we don't inflate and deflate the pages. Instead
we perform the reporting, and once the reporting is completed it is
assumed that the page has been dropped from the guest and will be faulted
back in the next time the page is accessed.

Acked-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
---
 drivers/virtio/Kconfig              |    1 +
 drivers/virtio/virtio_balloon.c     |   64 +++++++++++++++++++++++++++++++++++
 include/uapi/linux/virtio_balloon.h |    1 +
 3 files changed, 66 insertions(+)

Comments

David Hildenbrand Feb. 11, 2020, 11:03 a.m. UTC | #1
On 22.01.20 18:43, Alexander Duyck wrote:
> From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> 
> Add support for the page reporting feature provided by virtio-balloon.
> Reporting differs from the regular balloon functionality in that is is
> much less durable than a standard memory balloon. Instead of creating a
> list of pages that cannot be accessed the pages are only inaccessible
> while they are being indicated to the virtio interface. Once the
> interface has acknowledged them they are placed back into their respective
> free lists and are once again accessible by the guest system.
> 
> Unlike a standard balloon we don't inflate and deflate the pages. Instead
> we perform the reporting, and once the reporting is completed it is
> assumed that the page has been dropped from the guest and will be faulted
> back in the next time the page is accessed.
> 
> Acked-by: Michael S. Tsirkin <mst@redhat.com>
> Reviewed-by: David Hildenbrand <david@redhat.com>
> Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> ---
>  drivers/virtio/Kconfig              |    1 +
>  drivers/virtio/virtio_balloon.c     |   64 +++++++++++++++++++++++++++++++++++
>  include/uapi/linux/virtio_balloon.h |    1 +
>  3 files changed, 66 insertions(+)
> 
> diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
> index 078615cf2afc..4b2dd8259ff5 100644
> --- a/drivers/virtio/Kconfig
> +++ b/drivers/virtio/Kconfig
> @@ -58,6 +58,7 @@ config VIRTIO_BALLOON
>  	tristate "Virtio balloon driver"
>  	depends on VIRTIO
>  	select MEMORY_BALLOON
> +	select PAGE_REPORTING
>  	---help---
>  	 This driver supports increasing and decreasing the amount
>  	 of memory within a KVM guest.
> diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
> index 40bb7693e3de..a07b9e18a292 100644
> --- a/drivers/virtio/virtio_balloon.c
> +++ b/drivers/virtio/virtio_balloon.c
> @@ -19,6 +19,7 @@
>  #include <linux/mount.h>
>  #include <linux/magic.h>
>  #include <linux/pseudo_fs.h>
> +#include <linux/page_reporting.h>
>  
>  /*
>   * Balloon device works in 4K page units.  So each page is pointed to by
> @@ -47,6 +48,7 @@ enum virtio_balloon_vq {
>  	VIRTIO_BALLOON_VQ_DEFLATE,
>  	VIRTIO_BALLOON_VQ_STATS,
>  	VIRTIO_BALLOON_VQ_FREE_PAGE,
> +	VIRTIO_BALLOON_VQ_REPORTING,
>  	VIRTIO_BALLOON_VQ_MAX
>  };
>  
> @@ -114,6 +116,10 @@ struct virtio_balloon {
>  
>  	/* To register a shrinker to shrink memory upon memory pressure */
>  	struct shrinker shrinker;
> +
> +	/* Free page reporting device */
> +	struct virtqueue *reporting_vq;
> +	struct page_reporting_dev_info pr_dev_info;
>  };
>  
>  static struct virtio_device_id id_table[] = {
> @@ -153,6 +159,33 @@ static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq)
>  
>  }
>  
> +int virtballoon_free_page_report(struct page_reporting_dev_info *pr_dev_info,
> +				   struct scatterlist *sg, unsigned int nents)
> +{
> +	struct virtio_balloon *vb =
> +		container_of(pr_dev_info, struct virtio_balloon, pr_dev_info);
> +	struct virtqueue *vq = vb->reporting_vq;
> +	unsigned int unused, err;
> +
> +	/* We should always be able to add these buffers to an empty queue. */
> +	err = virtqueue_add_inbuf(vq, sg, nents, vb, GFP_NOWAIT | __GFP_NOWARN);
> +
> +	/*
> +	 * In the extremely unlikely case that something has occurred and we
> +	 * are able to trigger an error we will simply display a warning
> +	 * and exit without actually processing the pages.
> +	 */
> +	if (WARN_ON_ONCE(err))
> +		return err;
> +
> +	virtqueue_kick(vq);
> +
> +	/* When host has read buffer, this completes via balloon_ack */
> +	wait_event(vb->acked, virtqueue_get_buf(vq, &unused));
> +
> +	return 0;
> +}


Did you see the discussion regarding unifying handling of
inflate/deflate/free_page_hinting_free_page_reporting, requested by
Michael? I think free page reporting is special and shall be left alone.

VIRTIO_BALLOON_F_REPORTING is nothing but a more advanced inflate, right
(sg, inflate based on size - not "virtio pages")? And you rely on
deflates not being required before reusing an inflated page.

I suggest the following:

/* New interface (+ 2 virtqueues) to inflate/deflate using a SG */
VIRTIO_BALLOON_F_SG
/*
 * No need to deflate when reusing pages (once the inflate request was
 * processed). Applies to all inflate queues.
 */
VIRTIO_BALLOON_F_OPTIONAL_DEFLATE

And two new virtqueues

VIRTIO_BALLOON_VQ_INFLATE_SG
VIRTIO_BALLOON_VQ_DEFLATE_SG


Your feature would depend on VIRTIO_BALLOON_F_SG &&
VIRTIO_BALLOON_F_OPTIONAL_DEFLATE. VIRTIO_BALLOON_F_OPTIONAL_DEFLATE
could be reused to avoid deflating on certain events (e.g., from
OOM/shrinker).

Thoughts?
Michael S. Tsirkin Feb. 11, 2020, 11:47 a.m. UTC | #2
On Tue, Feb 11, 2020 at 12:03:57PM +0100, David Hildenbrand wrote:
> On 22.01.20 18:43, Alexander Duyck wrote:
> > From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> > 
> > Add support for the page reporting feature provided by virtio-balloon.
> > Reporting differs from the regular balloon functionality in that is is
> > much less durable than a standard memory balloon. Instead of creating a
> > list of pages that cannot be accessed the pages are only inaccessible
> > while they are being indicated to the virtio interface. Once the
> > interface has acknowledged them they are placed back into their respective
> > free lists and are once again accessible by the guest system.
> > 
> > Unlike a standard balloon we don't inflate and deflate the pages. Instead
> > we perform the reporting, and once the reporting is completed it is
> > assumed that the page has been dropped from the guest and will be faulted
> > back in the next time the page is accessed.
> > 
> > Acked-by: Michael S. Tsirkin <mst@redhat.com>
> > Reviewed-by: David Hildenbrand <david@redhat.com>
> > Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> > ---
> >  drivers/virtio/Kconfig              |    1 +
> >  drivers/virtio/virtio_balloon.c     |   64 +++++++++++++++++++++++++++++++++++
> >  include/uapi/linux/virtio_balloon.h |    1 +
> >  3 files changed, 66 insertions(+)
> > 
> > diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
> > index 078615cf2afc..4b2dd8259ff5 100644
> > --- a/drivers/virtio/Kconfig
> > +++ b/drivers/virtio/Kconfig
> > @@ -58,6 +58,7 @@ config VIRTIO_BALLOON
> >  	tristate "Virtio balloon driver"
> >  	depends on VIRTIO
> >  	select MEMORY_BALLOON
> > +	select PAGE_REPORTING
> >  	---help---
> >  	 This driver supports increasing and decreasing the amount
> >  	 of memory within a KVM guest.
> > diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
> > index 40bb7693e3de..a07b9e18a292 100644
> > --- a/drivers/virtio/virtio_balloon.c
> > +++ b/drivers/virtio/virtio_balloon.c
> > @@ -19,6 +19,7 @@
> >  #include <linux/mount.h>
> >  #include <linux/magic.h>
> >  #include <linux/pseudo_fs.h>
> > +#include <linux/page_reporting.h>
> >  
> >  /*
> >   * Balloon device works in 4K page units.  So each page is pointed to by
> > @@ -47,6 +48,7 @@ enum virtio_balloon_vq {
> >  	VIRTIO_BALLOON_VQ_DEFLATE,
> >  	VIRTIO_BALLOON_VQ_STATS,
> >  	VIRTIO_BALLOON_VQ_FREE_PAGE,
> > +	VIRTIO_BALLOON_VQ_REPORTING,
> >  	VIRTIO_BALLOON_VQ_MAX
> >  };
> >  
> > @@ -114,6 +116,10 @@ struct virtio_balloon {
> >  
> >  	/* To register a shrinker to shrink memory upon memory pressure */
> >  	struct shrinker shrinker;
> > +
> > +	/* Free page reporting device */
> > +	struct virtqueue *reporting_vq;
> > +	struct page_reporting_dev_info pr_dev_info;
> >  };
> >  
> >  static struct virtio_device_id id_table[] = {
> > @@ -153,6 +159,33 @@ static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq)
> >  
> >  }
> >  
> > +int virtballoon_free_page_report(struct page_reporting_dev_info *pr_dev_info,
> > +				   struct scatterlist *sg, unsigned int nents)
> > +{
> > +	struct virtio_balloon *vb =
> > +		container_of(pr_dev_info, struct virtio_balloon, pr_dev_info);
> > +	struct virtqueue *vq = vb->reporting_vq;
> > +	unsigned int unused, err;
> > +
> > +	/* We should always be able to add these buffers to an empty queue. */
> > +	err = virtqueue_add_inbuf(vq, sg, nents, vb, GFP_NOWAIT | __GFP_NOWARN);
> > +
> > +	/*
> > +	 * In the extremely unlikely case that something has occurred and we
> > +	 * are able to trigger an error we will simply display a warning
> > +	 * and exit without actually processing the pages.
> > +	 */
> > +	if (WARN_ON_ONCE(err))
> > +		return err;
> > +
> > +	virtqueue_kick(vq);
> > +
> > +	/* When host has read buffer, this completes via balloon_ack */
> > +	wait_event(vb->acked, virtqueue_get_buf(vq, &unused));
> > +
> > +	return 0;
> > +}
> 
> 
> Did you see the discussion regarding unifying handling of
> inflate/deflate/free_page_hinting_free_page_reporting, requested by
> Michael? I think free page reporting is special and shall be left alone.

Not sure what do you mean by "left alone here". Could you clarify?

> VIRTIO_BALLOON_F_REPORTING is nothing but a more advanced inflate, right
> (sg, inflate based on size - not "virtio pages")?


Not exactly - it's also initiated by guest as opposed to host, and
not guided by the ballon size request set by the host.
And uses a dedicated queue to avoid blocking other functionality ...

I really think this is more like an inflate immediately followed by deflate.



> And you rely on
> deflates not being required before reusing an inflated page.
> 
> I suggest the following:
> 
> /* New interface (+ 2 virtqueues) to inflate/deflate using a SG */
> VIRTIO_BALLOON_F_SG
> /*
>  * No need to deflate when reusing pages (once the inflate request was
>  * processed). Applies to all inflate queues.
>  */
> VIRTIO_BALLOON_F_OPTIONAL_DEFLATE
> 
> And two new virtqueues
> 
> VIRTIO_BALLOON_VQ_INFLATE_SG
> VIRTIO_BALLOON_VQ_DEFLATE_SG
> 
> 
> Your feature would depend on VIRTIO_BALLOON_F_SG &&
> VIRTIO_BALLOON_F_OPTIONAL_DEFLATE. VIRTIO_BALLOON_F_OPTIONAL_DEFLATE
> could be reused to avoid deflating on certain events (e.g., from
> OOM/shrinker).
> 
> Thoughts?

I'd rather wait until we have a usecase and preferably a POC
showing it helps before we add optional deflate ...
For now I personally am fine with just making this go ahead as is,
and imply SG and OPTIONAL_DEFLATE just for this VQ.

Do you feel strongly we need to bring this up to a TC vote?
It means spec patch needs to be written, but it
does not have to be a big patch ...


> -- 
> Thanks,
> 
> David / dhildenb
David Hildenbrand Feb. 11, 2020, 12:19 p.m. UTC | #3
>>
>> Did you see the discussion regarding unifying handling of
>> inflate/deflate/free_page_hinting_free_page_reporting, requested by
>> Michael? I think free page reporting is special and shall be left alone.
> 
> Not sure what do you mean by "left alone here". Could you clarify?

Don't try to unify handling like I proposed below, because it's
semantics are special.

> 
>> VIRTIO_BALLOON_F_REPORTING is nothing but a more advanced inflate, right
>> (sg, inflate based on size - not "virtio pages")?
> 
> 
> Not exactly - it's also initiated by guest as opposed to host, and
> not guided by the ballon size request set by the host.

True, but AFAIKS you could use existing INFLATE/DEFLATE in a similar
way. There is no way for the hypervisor to nack a request. The balloon
size is not glued to inflate/deflate requests. The guests manually
updates it.

> And uses a dedicated queue to avoid blocking other functionality ...

True, but the other queues also don't allow for an easy extension
AFAIKS, so that's another reason.

> 
> I really think this is more like an inflate immediately followed by deflate.

Depends on how you look at it. As inflate/deflate is not glued to the
balloon size (the guest updates the size manually), it's not obvious.

E.g., in QEMU, a deflate is just a performance improvement
("MADV_WILLNEED") - in that regard, it's more like an optional deflation.

[...]

> 
> I'd rather wait until we have a usecase and preferably a POC
> showing it helps before we add optional deflate ...
> For now I personally am fine with just making this go ahead as is,
> and imply SG and OPTIONAL_DEFLATE just for this VQ.

Also fine with me, you asked about if we can abstract any of this if I
am not wrong :) So this was my take.

> 
> Do you feel strongly we need to bring this up to a TC vote?

Not really. People have been asking about how to inflate/deflate huge
pages a long time ago (comes with different challenges - e.g., balloon
compaction). looked like this interface could have been reused for this
as well.

But yeah, I am not a fan of virtio-balloon and the whole inflate/deflate
thingy. So at least I don't see a need to extend the inflate/deflate
capability.

Free page reporting is a different story (and the semantics require no
inflate/deflate/balloon size) - it could have been moved to
virtio-whatever without any issues. So I am fine with this.
Michael S. Tsirkin Feb. 11, 2020, 2:07 p.m. UTC | #4
On Tue, Feb 11, 2020 at 01:19:31PM +0100, David Hildenbrand wrote:
> >>
> >> Did you see the discussion regarding unifying handling of
> >> inflate/deflate/free_page_hinting_free_page_reporting, requested by
> >> Michael? I think free page reporting is special and shall be left alone.
> > 
> > Not sure what do you mean by "left alone here". Could you clarify?
> 
> Don't try to unify handling like I proposed below, because it's
> semantics are special.
> 
> > 
> >> VIRTIO_BALLOON_F_REPORTING is nothing but a more advanced inflate, right
> >> (sg, inflate based on size - not "virtio pages")?
> > 
> > 
> > Not exactly - it's also initiated by guest as opposed to host, and
> > not guided by the ballon size request set by the host.
> 
> True, but AFAIKS you could use existing INFLATE/DEFLATE in a similar
> way. There is no way for the hypervisor to nack a request. The balloon
> size is not glued to inflate/deflate requests. The guests manually
> updates it.

Hmm how isn't it? num_pages is the only way to inflate/deflate.

Spec also says:
The device is driven either by the receipt of a configuration change notification, or by changing guest memory
needs, such as performing memory compaction or responding to out of memory conditions.

so ignoring compaction/oom (later is under-specified, not a good example
to follow) yes inflate/deflate are tied to host specified configuration.


> > And uses a dedicated queue to avoid blocking other functionality ...
> 
> True, but the other queues also don't allow for an easy extension
> AFAIKS, so that's another reason.
> 
> > 
> > I really think this is more like an inflate immediately followed by deflate.
> 
> Depends on how you look at it. As inflate/deflate is not glued to the
> balloon size (the guest updates the size manually), it's not obvious.
> 
> E.g., in QEMU, a deflate is just a performance improvement
> ("MADV_WILLNEED") - in that regard, it's more like an optional deflation.
> 
> [...]
> 
> > 
> > I'd rather wait until we have a usecase and preferably a POC
> > showing it helps before we add optional deflate ...
> > For now I personally am fine with just making this go ahead as is,
> > and imply SG and OPTIONAL_DEFLATE just for this VQ.
> 
> Also fine with me, you asked about if we can abstract any of this if I
> am not wrong :) So this was my take.
> 
> > 
> > Do you feel strongly we need to bring this up to a TC vote?
> 
> Not really. People have been asking about how to inflate/deflate huge
> pages a long time ago (comes with different challenges - e.g., balloon
> compaction). looked like this interface could have been reused for this
> as well.
> 
> But yeah, I am not a fan of virtio-balloon and the whole inflate/deflate
> thingy. So at least I don't see a need to extend the inflate/deflate
> capability.
> 
> Free page reporting is a different story (and the semantics require no
> inflate/deflate/balloon size) - it could have been moved to
> virtio-whatever without any issues. So I am fine with this.
> 
> -- 
> Thanks,
> 
> David / dhildenb
David Hildenbrand Feb. 11, 2020, 2:31 p.m. UTC | #5
On 11.02.20 15:07, Michael S. Tsirkin wrote:
> On Tue, Feb 11, 2020 at 01:19:31PM +0100, David Hildenbrand wrote:
>>>>
>>>> Did you see the discussion regarding unifying handling of
>>>> inflate/deflate/free_page_hinting_free_page_reporting, requested by
>>>> Michael? I think free page reporting is special and shall be left alone.
>>>
>>> Not sure what do you mean by "left alone here". Could you clarify?
>>
>> Don't try to unify handling like I proposed below, because it's
>> semantics are special.
>>
>>>
>>>> VIRTIO_BALLOON_F_REPORTING is nothing but a more advanced inflate, right
>>>> (sg, inflate based on size - not "virtio pages")?
>>>
>>>
>>> Not exactly - it's also initiated by guest as opposed to host, and
>>> not guided by the ballon size request set by the host.
>>
>> True, but AFAIKS you could use existing INFLATE/DEFLATE in a similar
>> way. There is no way for the hypervisor to nack a request. The balloon
>> size is not glued to inflate/deflate requests. The guests manually
>> updates it.
> 
> Hmm how isn't it? num_pages is the only way to inflate/deflate.

Usually, guests are nice and respond to num_pages changes in an
appropriate way, except:
- Triggering deflate: Unload the driver. Suspend/hibernate. OOM.
  (+ Reboot, although that's special)
- Triggering inflate + deflate: Simple balloon compaction / page
  migration.

But that's not what I meant.

"actual" is updated by the guest, not by the host. So the "actual
balloon size" is set by the guest. It's not glued to inflation/deflation
requests. "num_pages" is the host request.

AFAIKs, the guest could inflate/deflate (esp. temporarily) and
communicate via "actual" the actual balloon size as he sees it.

> Spec also says:
> The device is driven either by the receipt of a configuration change notification, or by changing guest memory
> needs, such as performing memory compaction or responding to out of memory conditions.
> 
> so ignoring compaction/oom (later is under-specified, not a good example
> to follow) yes inflate/deflate are tied to host specified configuration
Yes, "num_pages" is the host request. But I'd say the statement (esp.
"the device is driven by") in the spec is rather weak. It does not
explicitly state when inflation/deflation is allowed IMHO.
Michael S. Tsirkin Feb. 11, 2020, 2:48 p.m. UTC | #6
On Tue, Feb 11, 2020 at 03:31:18PM +0100, David Hildenbrand wrote:
> On 11.02.20 15:07, Michael S. Tsirkin wrote:
> > On Tue, Feb 11, 2020 at 01:19:31PM +0100, David Hildenbrand wrote:
> >>>>
> >>>> Did you see the discussion regarding unifying handling of
> >>>> inflate/deflate/free_page_hinting_free_page_reporting, requested by
> >>>> Michael? I think free page reporting is special and shall be left alone.
> >>>
> >>> Not sure what do you mean by "left alone here". Could you clarify?
> >>
> >> Don't try to unify handling like I proposed below, because it's
> >> semantics are special.
> >>
> >>>
> >>>> VIRTIO_BALLOON_F_REPORTING is nothing but a more advanced inflate, right
> >>>> (sg, inflate based on size - not "virtio pages")?
> >>>
> >>>
> >>> Not exactly - it's also initiated by guest as opposed to host, and
> >>> not guided by the ballon size request set by the host.
> >>
> >> True, but AFAIKS you could use existing INFLATE/DEFLATE in a similar
> >> way. There is no way for the hypervisor to nack a request. The balloon
> >> size is not glued to inflate/deflate requests. The guests manually
> >> updates it.
> > 
> > Hmm how isn't it? num_pages is the only way to inflate/deflate.
> 
> Usually, guests are nice and respond to num_pages changes in an
> appropriate way, except:
> - Triggering deflate: Unload the driver. Suspend/hibernate. OOM.
>   (+ Reboot, although that's special)
> - Triggering inflate + deflate: Simple balloon compaction / page
>   migration.


These are all real situations but balloon always has been best effort.


> But that's not what I meant.
> 
> "actual" is updated by the guest, not by the host. So the "actual
> balloon size" is set by the guest. It's not glued to inflation/deflation
> requests. "num_pages" is the host request.

Well the expectation is that as long as guest has ample
available memory, when num_pages changes then
guest starts sending inflate/deflate requests,
until actual matches num_pages.

If it does not match, and we wait and it still doesn't,
then something unusual happened. People do depend on that
behaviour.

> AFAIKs, the guest could inflate/deflate (esp. temporarily) and
> communicate via "actual" the actual balloon size as he sees it.

OK so you want hinted but unused pages counted, and reported
in "actual"? That's a vmexit before each page use ...



> > Spec also says:
> > The device is driven either by the receipt of a configuration change notification, or by changing guest memory
> > needs, such as performing memory compaction or responding to out of memory conditions.
> > 
> > so ignoring compaction/oom (later is under-specified, not a good example
> > to follow) yes inflate/deflate are tied to host specified configuration
> Yes, "num_pages" is the host request. But I'd say the statement (esp.
> "the device is driven by") in the spec is rather weak. It does not
> explicitly state when inflation/deflation is allowed IMHO.

Right since it's all best effort anyway.


> -- 
> Thanks,
> 
> David / dhildenb
David Hildenbrand Feb. 11, 2020, 3:13 p.m. UTC | #7
>> AFAIKs, the guest could inflate/deflate (esp. temporarily) and
>> communicate via "actual" the actual balloon size as he sees it.
> 
> OK so you want hinted but unused pages counted, and reported
> in "actual"? That's a vmexit before each page use ...

No, not at all. I rather meant, that it is unclear how
inflation/deflation requests and "actual" *could* interact. Especially
if we would consider free page reporting as some way of inflation
(+immediate deflation) triggered by the guest. IMHO, we would not touch
"actual" in that case.

But as I said, I am totally fine with keeping it as is in this patch.
IOW not glue free page reporting to inflation/deflation but let it act
like something different with its own semantics (and document these
properly).
Alexander Duyck Feb. 11, 2020, 4:33 p.m. UTC | #8
On Tue, 2020-02-11 at 16:13 +0100, David Hildenbrand wrote:
>  >> AFAIKs, the guest could inflate/deflate (esp. temporarily) and
> > > communicate via "actual" the actual balloon size as he sees it.
> > 
> > OK so you want hinted but unused pages counted, and reported
> > in "actual"? That's a vmexit before each page use ...
> 
> No, not at all. I rather meant, that it is unclear how
> inflation/deflation requests and "actual" *could* interact. Especially
> if we would consider free page reporting as some way of inflation
> (+immediate deflation) triggered by the guest. IMHO, we would not touch
> "actual" in that case.
> 
> But as I said, I am totally fine with keeping it as is in this patch.
> IOW not glue free page reporting to inflation/deflation but let it act
> like something different with its own semantics (and document these
> properly).
> 

Okay, so before I post v17 am I leaving the virtio-balloon changes as they
were then?

For what it is worth I agree with Michael that there is more to this than
just a scatter-gather queue. For now I am trying to keep the overall
impact on QEMU on the smaller side, and if we do end up supporting the
MADV_FREE instead of MADV_DONTNEED that would also have an impact on
things as it would be yet another difference between ballooning and
hinting.

Thanks.

- Alex
David Hildenbrand Feb. 11, 2020, 5:04 p.m. UTC | #9
> Am 11.02.2020 um 17:33 schrieb Alexander Duyck <alexander.h.duyck@linux.intel.com>:
> 
> On Tue, 2020-02-11 at 16:13 +0100, David Hildenbrand wrote:
>>>> AFAIKs, the guest could inflate/deflate (esp. temporarily) and
>>>> communicate via "actual" the actual balloon size as he sees it.
>>> 
>>> OK so you want hinted but unused pages counted, and reported
>>> in "actual"? That's a vmexit before each page use ...
>> 
>> No, not at all. I rather meant, that it is unclear how
>> inflation/deflation requests and "actual" *could* interact. Especially
>> if we would consider free page reporting as some way of inflation
>> (+immediate deflation) triggered by the guest. IMHO, we would not touch
>> "actual" in that case.
>> 
>> But as I said, I am totally fine with keeping it as is in this patch.
>> IOW not glue free page reporting to inflation/deflation but let it act
>> like something different with its own semantics (and document these
>> properly).
>> 
> 
> Okay, so before I post v17 am I leaving the virtio-balloon changes as they
> were then?

I‘d say yes :)
diff mbox series

Patch

diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
index 078615cf2afc..4b2dd8259ff5 100644
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -58,6 +58,7 @@  config VIRTIO_BALLOON
 	tristate "Virtio balloon driver"
 	depends on VIRTIO
 	select MEMORY_BALLOON
+	select PAGE_REPORTING
 	---help---
 	 This driver supports increasing and decreasing the amount
 	 of memory within a KVM guest.
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 40bb7693e3de..a07b9e18a292 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -19,6 +19,7 @@ 
 #include <linux/mount.h>
 #include <linux/magic.h>
 #include <linux/pseudo_fs.h>
+#include <linux/page_reporting.h>
 
 /*
  * Balloon device works in 4K page units.  So each page is pointed to by
@@ -47,6 +48,7 @@  enum virtio_balloon_vq {
 	VIRTIO_BALLOON_VQ_DEFLATE,
 	VIRTIO_BALLOON_VQ_STATS,
 	VIRTIO_BALLOON_VQ_FREE_PAGE,
+	VIRTIO_BALLOON_VQ_REPORTING,
 	VIRTIO_BALLOON_VQ_MAX
 };
 
@@ -114,6 +116,10 @@  struct virtio_balloon {
 
 	/* To register a shrinker to shrink memory upon memory pressure */
 	struct shrinker shrinker;
+
+	/* Free page reporting device */
+	struct virtqueue *reporting_vq;
+	struct page_reporting_dev_info pr_dev_info;
 };
 
 static struct virtio_device_id id_table[] = {
@@ -153,6 +159,33 @@  static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq)
 
 }
 
+int virtballoon_free_page_report(struct page_reporting_dev_info *pr_dev_info,
+				   struct scatterlist *sg, unsigned int nents)
+{
+	struct virtio_balloon *vb =
+		container_of(pr_dev_info, struct virtio_balloon, pr_dev_info);
+	struct virtqueue *vq = vb->reporting_vq;
+	unsigned int unused, err;
+
+	/* We should always be able to add these buffers to an empty queue. */
+	err = virtqueue_add_inbuf(vq, sg, nents, vb, GFP_NOWAIT | __GFP_NOWARN);
+
+	/*
+	 * In the extremely unlikely case that something has occurred and we
+	 * are able to trigger an error we will simply display a warning
+	 * and exit without actually processing the pages.
+	 */
+	if (WARN_ON_ONCE(err))
+		return err;
+
+	virtqueue_kick(vq);
+
+	/* When host has read buffer, this completes via balloon_ack */
+	wait_event(vb->acked, virtqueue_get_buf(vq, &unused));
+
+	return 0;
+}
+
 static void set_page_pfns(struct virtio_balloon *vb,
 			  __virtio32 pfns[], struct page *page)
 {
@@ -479,6 +512,7 @@  static int init_vqs(struct virtio_balloon *vb)
 	names[VIRTIO_BALLOON_VQ_STATS] = NULL;
 	callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
 	names[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
+	names[VIRTIO_BALLOON_VQ_REPORTING] = NULL;
 
 	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
 		names[VIRTIO_BALLOON_VQ_STATS] = "stats";
@@ -490,6 +524,11 @@  static int init_vqs(struct virtio_balloon *vb)
 		callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
 	}
 
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) {
+		names[VIRTIO_BALLOON_VQ_REPORTING] = "reporting_vq";
+		callbacks[VIRTIO_BALLOON_VQ_REPORTING] = balloon_ack;
+	}
+
 	err = vb->vdev->config->find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX,
 					 vqs, callbacks, names, NULL, NULL);
 	if (err)
@@ -522,6 +561,9 @@  static int init_vqs(struct virtio_balloon *vb)
 	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT))
 		vb->free_page_vq = vqs[VIRTIO_BALLOON_VQ_FREE_PAGE];
 
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING))
+		vb->reporting_vq = vqs[VIRTIO_BALLOON_VQ_REPORTING];
+
 	return 0;
 }
 
@@ -952,12 +994,31 @@  static int virtballoon_probe(struct virtio_device *vdev)
 		if (err)
 			goto out_del_balloon_wq;
 	}
+
+	vb->pr_dev_info.report = virtballoon_free_page_report;
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) {
+		unsigned int capacity;
+
+		capacity = virtqueue_get_vring_size(vb->reporting_vq);
+		if (capacity < PAGE_REPORTING_CAPACITY) {
+			err = -ENOSPC;
+			goto out_unregister_shrinker;
+		}
+
+		err = page_reporting_register(&vb->pr_dev_info);
+		if (err)
+			goto out_unregister_shrinker;
+	}
+
 	virtio_device_ready(vdev);
 
 	if (towards_target(vb))
 		virtballoon_changed(vdev);
 	return 0;
 
+out_unregister_shrinker:
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
+		virtio_balloon_unregister_shrinker(vb);
 out_del_balloon_wq:
 	if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT))
 		destroy_workqueue(vb->balloon_wq);
@@ -986,6 +1047,8 @@  static void virtballoon_remove(struct virtio_device *vdev)
 {
 	struct virtio_balloon *vb = vdev->priv;
 
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING))
+		page_reporting_unregister(&vb->pr_dev_info);
 	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
 		virtio_balloon_unregister_shrinker(vb);
 	spin_lock_irq(&vb->stop_update_lock);
@@ -1058,6 +1121,7 @@  static int virtballoon_validate(struct virtio_device *vdev)
 	VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
 	VIRTIO_BALLOON_F_FREE_PAGE_HINT,
 	VIRTIO_BALLOON_F_PAGE_POISON,
+	VIRTIO_BALLOON_F_REPORTING,
 };
 
 static struct virtio_driver virtio_balloon_driver = {
diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
index a1966cd7b677..19974392d324 100644
--- a/include/uapi/linux/virtio_balloon.h
+++ b/include/uapi/linux/virtio_balloon.h
@@ -36,6 +36,7 @@ 
 #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM	2 /* Deflate balloon on OOM */
 #define VIRTIO_BALLOON_F_FREE_PAGE_HINT	3 /* VQ to report free pages */
 #define VIRTIO_BALLOON_F_PAGE_POISON	4 /* Guest is using page poisoning */
+#define VIRTIO_BALLOON_F_REPORTING	5 /* Page reporting virtqueue */
 
 /* Size of a PFN in the balloon interface. */
 #define VIRTIO_BALLOON_PFN_SHIFT 12