diff mbox

[kernel,v8,3/4] mm: add inerface to offer info about unused pages

Message ID 1489648127-37282-4-git-send-email-wei.w.wang@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Wang, Wei W March 16, 2017, 7:08 a.m. UTC
From: Liang Li <liang.z.li@intel.com>

This patch adds a function to provides a snapshot of the present system
unused pages. An important usage of this function is to provide the
unsused pages to the Live migration thread, which skips the transfer of
thoses unused pages. Newly used pages can be re-tracked by the dirty
page logging mechanisms.

Signed-off-by: Liang Li <liang.z.li@intel.com>
Signed-off-by: Wei Wang <wei.w.wang@intel.com>
---
 include/linux/mm.h |   3 ++
 mm/page_alloc.c    | 114 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 117 insertions(+)

Comments

Andrew Morton March 16, 2017, 9:28 p.m. UTC | #1
On Thu, 16 Mar 2017 15:08:46 +0800 Wei Wang <wei.w.wang@intel.com> wrote:

> From: Liang Li <liang.z.li@intel.com>
> 
> This patch adds a function to provides a snapshot of the present system
> unused pages. An important usage of this function is to provide the
> unsused pages to the Live migration thread, which skips the transfer of
> thoses unused pages. Newly used pages can be re-tracked by the dirty
> page logging mechanisms.

I don't think this will be useful for anything other than
virtio-balloon.  I guess it would be better to keep this code in the
virtio-balloon driver if possible, even though that's rather a layering
violation :( What would have to be done to make that possible?  Perhaps
we can put some *small* helpers into page_alloc.c to prevent things
from becoming too ugly.

> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -4498,6 +4498,120 @@ void show_free_areas(unsigned int filter)
>  	show_swap_cache_info();
>  }
>  
> +static int __record_unused_pages(struct zone *zone, int order,
> +				 __le64 *buf, unsigned int size,
> +				 unsigned int *offset, bool part_fill)
> +{
> +	unsigned long pfn, flags;
> +	int t, ret = 0;
> +	struct list_head *curr;
> +	__le64 *chunk;
> +
> +	if (zone_is_empty(zone))
> +		return 0;
> +
> +	spin_lock_irqsave(&zone->lock, flags);
> +
> +	if (*offset + zone->free_area[order].nr_free > size && !part_fill) {
> +		ret = -ENOSPC;
> +		goto out;
> +	}
> +	for (t = 0; t < MIGRATE_TYPES; t++) {
> +		list_for_each(curr, &zone->free_area[order].free_list[t]) {
> +			pfn = page_to_pfn(list_entry(curr, struct page, lru));
> +			chunk = buf + *offset;
> +			if (*offset + 2 > size) {
> +				ret = -ENOSPC;
> +				goto out;
> +			}
> +			/* Align to the chunk format used in virtio-balloon */
> +			*chunk = cpu_to_le64(pfn << 12);
> +			*(chunk + 1) = cpu_to_le64((1 << order) << 12);
> +			*offset += 2;
> +		}
> +	}
> +
> +out:
> +	spin_unlock_irqrestore(&zone->lock, flags);
> +
> +	return ret;
> +}

This looks like it could disable interrupts for a long time.  Too long?

> +/*
> + * The record_unused_pages() function is used to record the system unused
> + * pages. The unused pages can be skipped to transfer during live migration.
> + * Though the unused pages are dynamically changing, dirty page logging
> + * mechanisms are able to capture the newly used pages though they were
> + * recorded as unused pages via this function.
> + *
> + * This function scans the free page list of the specified order to record
> + * the unused pages, and chunks those continuous pages following the chunk
> + * format below:
> + * --------------------------------------
> + * |	Base (52-bit)	| Rsvd (12-bit) |
> + * --------------------------------------
> + * --------------------------------------
> + * |	Size (52-bit)	| Rsvd (12-bit) |
> + * --------------------------------------
> + *
> + * @start_zone: zone to start the record operation.
> + * @order: order of the free page list to record.
> + * @buf: buffer to record the unused page info in chunks.
> + * @size: size of the buffer in __le64 to record
> + * @offset: offset in the buffer to record.
> + * @part_fill: indicate if partial fill is used.
> + *
> + * return -EINVAL if parameter is invalid
> + * return -ENOSPC when the buffer is too small to record all the unsed pages
> + * return 0 when sccess
> + */

It's a strange thing - it returns information which will instantly
become incorrect.

> +int record_unused_pages(struct zone **start_zone, int order,
> +			__le64 *buf, unsigned int size,
> +			unsigned int *offset, bool part_fill)
> +{
> +	struct zone *zone;
> +	int ret = 0;
> +	bool skip_check = false;
> +
> +	/* Make sure all the parameters are valid */
> +	if (buf == NULL || offset == NULL || order >= MAX_ORDER)
> +		return -EINVAL;
> +
> +	if (*start_zone != NULL) {
> +		bool found = false;
> +
> +		for_each_populated_zone(zone) {
> +			if (zone != *start_zone)
> +				continue;
> +			found = true;
> +			break;
> +		}
> +		if (!found)
> +			return -EINVAL;
> +	} else
> +		skip_check = true;
> +
> +	for_each_populated_zone(zone) {
> +		/* Start from *start_zone if it's not NULL */
> +		if (!skip_check) {
> +			if (*start_zone != zone)
> +				continue;
> +			else
> +				skip_check = true;
> +		}
> +		ret = __record_unused_pages(zone, order, buf, size,
> +					    offset, part_fill);
> +		if (ret < 0) {
> +			/* record the failed zone */
> +			*start_zone = zone;
> +			break;
> +		}
> +	}
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL(record_unused_pages);
Michael S. Tsirkin March 17, 2017, 1:21 a.m. UTC | #2
On Thu, Mar 16, 2017 at 03:08:46PM +0800, Wei Wang wrote:
> +/*
> + * The record_unused_pages() function is used to record the system unused
> + * pages. The unused pages can be skipped to transfer during live migration.
> + * Though the unused pages are dynamically changing, dirty page logging
> + * mechanisms are able to capture the newly used pages though they were
> + * recorded as unused pages via this function.

You will keep confusing people as long as you keep using
this terminology which only makes sense in a very specific
use and a very specific implementation.

How does guest developer know this does the right thing wrt locking etc?
Look at hypervisor spec and try to figure it all out together?

So stop saying what caller should do, describe what does the *API* does.

You want something like this:

	Get a list of pages in the system that are unused at some point between
	record_unused_pages is called and before it returns, implying that any
	data that was present in these pages before record_unused_pages was
	called is safe to discard. Pages can be used immediately after
	this point and any data written after this point is not safe to discard,
	it is caller's responsibility to either prevent the use or
	detect such pages.
Wang, Wei W March 17, 2017, 6:55 a.m. UTC | #3
On 03/17/2017 05:28 AM, Andrew Morton wrote:
> On Thu, 16 Mar 2017 15:08:46 +0800 Wei Wang <wei.w.wang@intel.com> wrote:
>
>> From: Liang Li <liang.z.li@intel.com>
>>
>> This patch adds a function to provides a snapshot of the present system
>> unused pages. An important usage of this function is to provide the
>> unsused pages to the Live migration thread, which skips the transfer of
>> thoses unused pages. Newly used pages can be re-tracked by the dirty
>> page logging mechanisms.
> I don't think this will be useful for anything other than
> virtio-balloon.  I guess it would be better to keep this code in the
> virtio-balloon driver if possible, even though that's rather a layering
> violation :( What would have to be done to make that possible?  Perhaps
> we can put some *small* helpers into page_alloc.c to prevent things
> from becoming too ugly.

The patch description was too narrowed and may have caused some
confusion, sorry about that. This function is aimed to be generic. I
agree with the description suggested by Michael.

Since the main body of the function is related to operating on the
free_list. I think it is better to have them located here.
Small helpers may be less efficient and thereby causing some
performance loss as well.
I think one improvement we can make is to remove the "chunk format"
related things from this function. The function can generally offer the
base pfn to the caller's recording buffer. Then it will be the caller's
responsibility to format the pfn if they need.

>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -4498,6 +4498,120 @@ void show_free_areas(unsigned int filter)
>>   	show_swap_cache_info();
>>   }
>>   
>> +static int __record_unused_pages(struct zone *zone, int order,
>> +				 __le64 *buf, unsigned int size,
>> +				 unsigned int *offset, bool part_fill)
>> +{
>> +	unsigned long pfn, flags;
>> +	int t, ret = 0;
>> +	struct list_head *curr;
>> +	__le64 *chunk;
>> +
>> +	if (zone_is_empty(zone))
>> +		return 0;
>> +
>> +	spin_lock_irqsave(&zone->lock, flags);
>> +
>> +	if (*offset + zone->free_area[order].nr_free > size && !part_fill) {
>> +		ret = -ENOSPC;
>> +		goto out;
>> +	}
>> +	for (t = 0; t < MIGRATE_TYPES; t++) {
>> +		list_for_each(curr, &zone->free_area[order].free_list[t]) {
>> +			pfn = page_to_pfn(list_entry(curr, struct page, lru));
>> +			chunk = buf + *offset;
>> +			if (*offset + 2 > size) {
>> +				ret = -ENOSPC;
>> +				goto out;
>> +			}
>> +			/* Align to the chunk format used in virtio-balloon */
>> +			*chunk = cpu_to_le64(pfn << 12);
>> +			*(chunk + 1) = cpu_to_le64((1 << order) << 12);
>> +			*offset += 2;
>> +		}
>> +	}
>> +
>> +out:
>> +	spin_unlock_irqrestore(&zone->lock, flags);
>> +
>> +	return ret;
>> +}
> This looks like it could disable interrupts for a long time.  Too long?

What do you think if we give "budgets" to the above function?
For example, budget=1000, and there are 2000 nodes on the list.
record() returns with "incomplete" status in the first round, along with the
status info, "*continue_node".

*continue_node: pointer to the starting node of the leftover. If 
*continue_node
has been used at the time of the second call (i.e. continue_node->next 
== NULL),
which implies that the previous 1000 nodes have been used, then the record()
function can simply start from the head of the list.

It is up to the caller whether it needs to continue the second round
when getting "incomplete".

>
>> +/*
>> + * The record_unused_pages() function is used to record the system unused
>> + * pages. The unused pages can be skipped to transfer during live migration.
>> + * Though the unused pages are dynamically changing, dirty page logging
>> + * mechanisms are able to capture the newly used pages though they were
>> + * recorded as unused pages via this function.
>> + *
>> + * This function scans the free page list of the specified order to record
>> + * the unused pages, and chunks those continuous pages following the chunk
>> + * format below:
>> + * --------------------------------------
>> + * |	Base (52-bit)	| Rsvd (12-bit) |
>> + * --------------------------------------
>> + * --------------------------------------
>> + * |	Size (52-bit)	| Rsvd (12-bit) |
>> + * --------------------------------------
>> + *
>> + * @start_zone: zone to start the record operation.
>> + * @order: order of the free page list to record.
>> + * @buf: buffer to record the unused page info in chunks.
>> + * @size: size of the buffer in __le64 to record
>> + * @offset: offset in the buffer to record.
>> + * @part_fill: indicate if partial fill is used.
>> + *
>> + * return -EINVAL if parameter is invalid
>> + * return -ENOSPC when the buffer is too small to record all the unsed pages
>> + * return 0 when sccess
>> + */
> It's a strange thing - it returns information which will instantly
> become incorrect.

I didn't get the point, could you please explain more? Thanks.

Best,
Wei
Wang, Wei W March 22, 2017, 10:52 a.m. UTC | #4
Hi Andrew, 

Do you have any comments on my thoughts? Thanks.

> On 03/17/2017 05:28 AM, Andrew Morton wrote:
> > On Thu, 16 Mar 2017 15:08:46 +0800 Wei Wang <wei.w.wang@intel.com>
> wrote:
> >
> >> From: Liang Li <liang.z.li@intel.com>
> >>
> >> This patch adds a function to provides a snapshot of the present
> >> system unused pages. An important usage of this function is to
> >> provide the unsused pages to the Live migration thread, which skips
> >> the transfer of thoses unused pages. Newly used pages can be
> >> re-tracked by the dirty page logging mechanisms.
> > I don't think this will be useful for anything other than
> > virtio-balloon.  I guess it would be better to keep this code in the
> > virtio-balloon driver if possible, even though that's rather a
> > layering violation :( What would have to be done to make that
> > possible?  Perhaps we can put some *small* helpers into page_alloc.c
> > to prevent things from becoming too ugly.
> 
> The patch description was too narrowed and may have caused some confusion,
> sorry about that. This function is aimed to be generic. I agree with the
> description suggested by Michael.
> 
> Since the main body of the function is related to operating on the free_list. I
> think it is better to have them located here.
> Small helpers may be less efficient and thereby causing some performance loss
> as well.
> I think one improvement we can make is to remove the "chunk format"
> related things from this function. The function can generally offer the base pfn
> to the caller's recording buffer. Then it will be the caller's responsibility to
> format the pfn if they need.
> 
> >> --- a/mm/page_alloc.c
> >> +++ b/mm/page_alloc.c
> >> @@ -4498,6 +4498,120 @@ void show_free_areas(unsigned int filter)
> >>   	show_swap_cache_info();
> >>   }
> >>
> >> +static int __record_unused_pages(struct zone *zone, int order,
> >> +				 __le64 *buf, unsigned int size,
> >> +				 unsigned int *offset, bool part_fill) {
> >> +	unsigned long pfn, flags;
> >> +	int t, ret = 0;
> >> +	struct list_head *curr;
> >> +	__le64 *chunk;
> >> +
> >> +	if (zone_is_empty(zone))
> >> +		return 0;
> >> +
> >> +	spin_lock_irqsave(&zone->lock, flags);
> >> +
> >> +	if (*offset + zone->free_area[order].nr_free > size && !part_fill) {
> >> +		ret = -ENOSPC;
> >> +		goto out;
> >> +	}
> >> +	for (t = 0; t < MIGRATE_TYPES; t++) {
> >> +		list_for_each(curr, &zone->free_area[order].free_list[t]) {
> >> +			pfn = page_to_pfn(list_entry(curr, struct page, lru));
> >> +			chunk = buf + *offset;
> >> +			if (*offset + 2 > size) {
> >> +				ret = -ENOSPC;
> >> +				goto out;
> >> +			}
> >> +			/* Align to the chunk format used in virtio-balloon */
> >> +			*chunk = cpu_to_le64(pfn << 12);
> >> +			*(chunk + 1) = cpu_to_le64((1 << order) << 12);
> >> +			*offset += 2;
> >> +		}
> >> +	}
> >> +
> >> +out:
> >> +	spin_unlock_irqrestore(&zone->lock, flags);
> >> +
> >> +	return ret;
> >> +}
> > This looks like it could disable interrupts for a long time.  Too long?
> 
> What do you think if we give "budgets" to the above function?
> For example, budget=1000, and there are 2000 nodes on the list.
> record() returns with "incomplete" status in the first round, along with the status
> info, "*continue_node".
> 
> *continue_node: pointer to the starting node of the leftover. If *continue_node
> has been used at the time of the second call (i.e. continue_node->next == NULL),
> which implies that the previous 1000 nodes have been used, then the record()
> function can simply start from the head of the list.
> 
> It is up to the caller whether it needs to continue the second round when getting
> "incomplete".
> 
> >
> >> +/*
> >> + * The record_unused_pages() function is used to record the system
> >> +unused
> >> + * pages. The unused pages can be skipped to transfer during live migration.
> >> + * Though the unused pages are dynamically changing, dirty page
> >> +logging
> >> + * mechanisms are able to capture the newly used pages though they
> >> +were
> >> + * recorded as unused pages via this function.
> >> + *
> >> + * This function scans the free page list of the specified order to
> >> +record
> >> + * the unused pages, and chunks those continuous pages following the
> >> +chunk
> >> + * format below:
> >> + * --------------------------------------
> >> + * |	Base (52-bit)	| Rsvd (12-bit) |
> >> + * --------------------------------------
> >> + * --------------------------------------
> >> + * |	Size (52-bit)	| Rsvd (12-bit) |
> >> + * --------------------------------------
> >> + *
> >> + * @start_zone: zone to start the record operation.
> >> + * @order: order of the free page list to record.
> >> + * @buf: buffer to record the unused page info in chunks.
> >> + * @size: size of the buffer in __le64 to record
> >> + * @offset: offset in the buffer to record.
> >> + * @part_fill: indicate if partial fill is used.
> >> + *
> >> + * return -EINVAL if parameter is invalid
> >> + * return -ENOSPC when the buffer is too small to record all the
> >> +unsed pages
> >> + * return 0 when sccess
> >> + */
> > It's a strange thing - it returns information which will instantly
> > become incorrect.
> 
> I didn't get the point, could you please explain more? Thanks.

Best,
Wei
Michael S. Tsirkin March 29, 2017, 5:48 p.m. UTC | #5
On Fri, Mar 17, 2017 at 02:55:33PM +0800, Wei Wang wrote:
> On 03/17/2017 05:28 AM, Andrew Morton wrote:
> > On Thu, 16 Mar 2017 15:08:46 +0800 Wei Wang <wei.w.wang@intel.com> wrote:
> > 
> > > From: Liang Li <liang.z.li@intel.com>
> > > 
> > > This patch adds a function to provides a snapshot of the present system
> > > unused pages. An important usage of this function is to provide the
> > > unsused pages to the Live migration thread, which skips the transfer of
> > > thoses unused pages. Newly used pages can be re-tracked by the dirty
> > > page logging mechanisms.
> > I don't think this will be useful for anything other than
> > virtio-balloon.  I guess it would be better to keep this code in the
> > virtio-balloon driver if possible, even though that's rather a layering
> > violation :( What would have to be done to make that possible?  Perhaps
> > we can put some *small* helpers into page_alloc.c to prevent things
> > from becoming too ugly.
> 
> The patch description was too narrowed and may have caused some
> confusion, sorry about that. This function is aimed to be generic. I
> agree with the description suggested by Michael.
> 
> Since the main body of the function is related to operating on the
> free_list. I think it is better to have them located here.
> Small helpers may be less efficient and thereby causing some
> performance loss as well.
> I think one improvement we can make is to remove the "chunk format"
> related things from this function. The function can generally offer the
> base pfn to the caller's recording buffer. Then it will be the caller's
> responsibility to format the pfn if they need.

Sounds good at a high level, but we'd have to see the implementation
to judge it properly.

> > > --- a/mm/page_alloc.c
> > > +++ b/mm/page_alloc.c
> > > @@ -4498,6 +4498,120 @@ void show_free_areas(unsigned int filter)
> > >   	show_swap_cache_info();
> > >   }
> > > +static int __record_unused_pages(struct zone *zone, int order,
> > > +				 __le64 *buf, unsigned int size,
> > > +				 unsigned int *offset, bool part_fill)
> > > +{
> > > +	unsigned long pfn, flags;
> > > +	int t, ret = 0;
> > > +	struct list_head *curr;
> > > +	__le64 *chunk;
> > > +
> > > +	if (zone_is_empty(zone))
> > > +		return 0;
> > > +
> > > +	spin_lock_irqsave(&zone->lock, flags);
> > > +
> > > +	if (*offset + zone->free_area[order].nr_free > size && !part_fill) {
> > > +		ret = -ENOSPC;
> > > +		goto out;
> > > +	}
> > > +	for (t = 0; t < MIGRATE_TYPES; t++) {
> > > +		list_for_each(curr, &zone->free_area[order].free_list[t]) {
> > > +			pfn = page_to_pfn(list_entry(curr, struct page, lru));
> > > +			chunk = buf + *offset;
> > > +			if (*offset + 2 > size) {
> > > +				ret = -ENOSPC;
> > > +				goto out;
> > > +			}
> > > +			/* Align to the chunk format used in virtio-balloon */
> > > +			*chunk = cpu_to_le64(pfn << 12);
> > > +			*(chunk + 1) = cpu_to_le64((1 << order) << 12);
> > > +			*offset += 2;
> > > +		}
> > > +	}
> > > +
> > > +out:
> > > +	spin_unlock_irqrestore(&zone->lock, flags);
> > > +
> > > +	return ret;
> > > +}
> > This looks like it could disable interrupts for a long time.  Too long?
> 
> What do you think if we give "budgets" to the above function?
> For example, budget=1000, and there are 2000 nodes on the list.
> record() returns with "incomplete" status in the first round, along with the
> status info, "*continue_node".
> 
> *continue_node: pointer to the starting node of the leftover. If
> *continue_node
> has been used at the time of the second call (i.e. continue_node->next ==
> NULL),
> which implies that the previous 1000 nodes have been used, then the record()
> function can simply start from the head of the list.
> 
> It is up to the caller whether it needs to continue the second round
> when getting "incomplete".

It might be cleaner to add APIs to
	- start iteration
	- do one step
	- end iteration

caller can then iterate without too many issues

> > 
> > > +/*
> > > + * The record_unused_pages() function is used to record the system unused
> > > + * pages. The unused pages can be skipped to transfer during live migration.
> > > + * Though the unused pages are dynamically changing, dirty page logging
> > > + * mechanisms are able to capture the newly used pages though they were
> > > + * recorded as unused pages via this function.
> > > + *
> > > + * This function scans the free page list of the specified order to record
> > > + * the unused pages, and chunks those continuous pages following the chunk
> > > + * format below:
> > > + * --------------------------------------
> > > + * |	Base (52-bit)	| Rsvd (12-bit) |
> > > + * --------------------------------------
> > > + * --------------------------------------
> > > + * |	Size (52-bit)	| Rsvd (12-bit) |
> > > + * --------------------------------------
> > > + *
> > > + * @start_zone: zone to start the record operation.
> > > + * @order: order of the free page list to record.
> > > + * @buf: buffer to record the unused page info in chunks.
> > > + * @size: size of the buffer in __le64 to record
> > > + * @offset: offset in the buffer to record.
> > > + * @part_fill: indicate if partial fill is used.
> > > + *
> > > + * return -EINVAL if parameter is invalid
> > > + * return -ENOSPC when the buffer is too small to record all the unsed pages
> > > + * return 0 when sccess
> > > + */
> > It's a strange thing - it returns information which will instantly
> > become incorrect.
> 
> I didn't get the point, could you please explain more? Thanks.
> 
> Best,
> Wei

I tried to explain it in my reply.  Once this function drops the lock,
the pages can immediately be used so the returned value is wrong.
balloon uses tricks to recover from this but this needs to be explicit
at the API level.
Wang, Wei W March 31, 2017, 9:53 a.m. UTC | #6
On 03/30/2017 01:48 AM, Michael S. Tsirkin wrote:
> On Fri, Mar 17, 2017 at 02:55:33PM +0800, Wei Wang wrote:
>> On 03/17/2017 05:28 AM, Andrew Morton wrote:
>>> On Thu, 16 Mar 2017 15:08:46 +0800 Wei Wang <wei.w.wang@intel.com> wrote:
>>>
>>>> From: Liang Li <liang.z.li@intel.com>
>>>>
>>>> This patch adds a function to provides a snapshot of the present system
>>>> unused pages. An important usage of this function is to provide the
>>>> unsused pages to the Live migration thread, which skips the transfer of
>>>> thoses unused pages. Newly used pages can be re-tracked by the dirty
>>>> page logging mechanisms.
>>> I don't think this will be useful for anything other than
>>> virtio-balloon.  I guess it would be better to keep this code in the
>>> virtio-balloon driver if possible, even though that's rather a layering
>>> violation :( What would have to be done to make that possible?  Perhaps
>>> we can put some *small* helpers into page_alloc.c to prevent things
>>> from becoming too ugly.
>> The patch description was too narrowed and may have caused some
>> confusion, sorry about that. This function is aimed to be generic. I
>> agree with the description suggested by Michael.
>>
>> Since the main body of the function is related to operating on the
>> free_list. I think it is better to have them located here.
>> Small helpers may be less efficient and thereby causing some
>> performance loss as well.
>> I think one improvement we can make is to remove the "chunk format"
>> related things from this function. The function can generally offer the
>> base pfn to the caller's recording buffer. Then it will be the caller's
>> responsibility to format the pfn if they need.
> Sounds good at a high level, but we'd have to see the implementation
> to judge it properly.
>
>>>> --- a/mm/page_alloc.c
>>>> +++ b/mm/page_alloc.c
>>>> @@ -4498,6 +4498,120 @@ void show_free_areas(unsigned int filter)
>>>>    	show_swap_cache_info();
>>>>    }
>>>> +static int __record_unused_pages(struct zone *zone, int order,
>>>> +				 __le64 *buf, unsigned int size,
>>>> +				 unsigned int *offset, bool part_fill)
>>>> +{
>>>> +	unsigned long pfn, flags;
>>>> +	int t, ret = 0;
>>>> +	struct list_head *curr;
>>>> +	__le64 *chunk;
>>>> +
>>>> +	if (zone_is_empty(zone))
>>>> +		return 0;
>>>> +
>>>> +	spin_lock_irqsave(&zone->lock, flags);
>>>> +
>>>> +	if (*offset + zone->free_area[order].nr_free > size && !part_fill) {
>>>> +		ret = -ENOSPC;
>>>> +		goto out;
>>>> +	}
>>>> +	for (t = 0; t < MIGRATE_TYPES; t++) {
>>>> +		list_for_each(curr, &zone->free_area[order].free_list[t]) {
>>>> +			pfn = page_to_pfn(list_entry(curr, struct page, lru));
>>>> +			chunk = buf + *offset;
>>>> +			if (*offset + 2 > size) {
>>>> +				ret = -ENOSPC;
>>>> +				goto out;
>>>> +			}
>>>> +			/* Align to the chunk format used in virtio-balloon */
>>>> +			*chunk = cpu_to_le64(pfn << 12);
>>>> +			*(chunk + 1) = cpu_to_le64((1 << order) << 12);
>>>> +			*offset += 2;
>>>> +		}
>>>> +	}
>>>> +
>>>> +out:
>>>> +	spin_unlock_irqrestore(&zone->lock, flags);
>>>> +
>>>> +	return ret;
>>>> +}
>>> This looks like it could disable interrupts for a long time.  Too long?
>> What do you think if we give "budgets" to the above function?
>> For example, budget=1000, and there are 2000 nodes on the list.
>> record() returns with "incomplete" status in the first round, along with the
>> status info, "*continue_node".
>>
>> *continue_node: pointer to the starting node of the leftover. If
>> *continue_node
>> has been used at the time of the second call (i.e. continue_node->next ==
>> NULL),
>> which implies that the previous 1000 nodes have been used, then the record()
>> function can simply start from the head of the list.
>>
>> It is up to the caller whether it needs to continue the second round
>> when getting "incomplete".
> It might be cleaner to add APIs to
> 	- start iteration
> 	- do one step
> 	- end iteration
>
> caller can then iterate without too many issues
>

OK. I will re-implement it with this simple one - get only one node(page 
block) from the list in each call, and check if the time would increase 
a lot in comparison to v8.

Best,
Wei
Michael S. Tsirkin March 31, 2017, 4:25 p.m. UTC | #7
On Fri, Mar 31, 2017 at 05:53:00PM +0800, Wei Wang wrote:
> On 03/30/2017 01:48 AM, Michael S. Tsirkin wrote:
> > On Fri, Mar 17, 2017 at 02:55:33PM +0800, Wei Wang wrote:
> > > On 03/17/2017 05:28 AM, Andrew Morton wrote:
> > > > On Thu, 16 Mar 2017 15:08:46 +0800 Wei Wang <wei.w.wang@intel.com> wrote:
> > > > 
> > > > > From: Liang Li <liang.z.li@intel.com>
> > > > > 
> > > > > This patch adds a function to provides a snapshot of the present system
> > > > > unused pages. An important usage of this function is to provide the
> > > > > unsused pages to the Live migration thread, which skips the transfer of
> > > > > thoses unused pages. Newly used pages can be re-tracked by the dirty
> > > > > page logging mechanisms.
> > > > I don't think this will be useful for anything other than
> > > > virtio-balloon.  I guess it would be better to keep this code in the
> > > > virtio-balloon driver if possible, even though that's rather a layering
> > > > violation :( What would have to be done to make that possible?  Perhaps
> > > > we can put some *small* helpers into page_alloc.c to prevent things
> > > > from becoming too ugly.
> > > The patch description was too narrowed and may have caused some
> > > confusion, sorry about that. This function is aimed to be generic. I
> > > agree with the description suggested by Michael.
> > > 
> > > Since the main body of the function is related to operating on the
> > > free_list. I think it is better to have them located here.
> > > Small helpers may be less efficient and thereby causing some
> > > performance loss as well.
> > > I think one improvement we can make is to remove the "chunk format"
> > > related things from this function. The function can generally offer the
> > > base pfn to the caller's recording buffer. Then it will be the caller's
> > > responsibility to format the pfn if they need.
> > Sounds good at a high level, but we'd have to see the implementation
> > to judge it properly.
> > 
> > > > > --- a/mm/page_alloc.c
> > > > > +++ b/mm/page_alloc.c
> > > > > @@ -4498,6 +4498,120 @@ void show_free_areas(unsigned int filter)
> > > > >    	show_swap_cache_info();
> > > > >    }
> > > > > +static int __record_unused_pages(struct zone *zone, int order,
> > > > > +				 __le64 *buf, unsigned int size,
> > > > > +				 unsigned int *offset, bool part_fill)
> > > > > +{
> > > > > +	unsigned long pfn, flags;
> > > > > +	int t, ret = 0;
> > > > > +	struct list_head *curr;
> > > > > +	__le64 *chunk;
> > > > > +
> > > > > +	if (zone_is_empty(zone))
> > > > > +		return 0;
> > > > > +
> > > > > +	spin_lock_irqsave(&zone->lock, flags);
> > > > > +
> > > > > +	if (*offset + zone->free_area[order].nr_free > size && !part_fill) {
> > > > > +		ret = -ENOSPC;
> > > > > +		goto out;
> > > > > +	}
> > > > > +	for (t = 0; t < MIGRATE_TYPES; t++) {
> > > > > +		list_for_each(curr, &zone->free_area[order].free_list[t]) {
> > > > > +			pfn = page_to_pfn(list_entry(curr, struct page, lru));
> > > > > +			chunk = buf + *offset;
> > > > > +			if (*offset + 2 > size) {
> > > > > +				ret = -ENOSPC;
> > > > > +				goto out;
> > > > > +			}
> > > > > +			/* Align to the chunk format used in virtio-balloon */
> > > > > +			*chunk = cpu_to_le64(pfn << 12);
> > > > > +			*(chunk + 1) = cpu_to_le64((1 << order) << 12);
> > > > > +			*offset += 2;
> > > > > +		}
> > > > > +	}
> > > > > +
> > > > > +out:
> > > > > +	spin_unlock_irqrestore(&zone->lock, flags);
> > > > > +
> > > > > +	return ret;
> > > > > +}
> > > > This looks like it could disable interrupts for a long time.  Too long?
> > > What do you think if we give "budgets" to the above function?
> > > For example, budget=1000, and there are 2000 nodes on the list.
> > > record() returns with "incomplete" status in the first round, along with the
> > > status info, "*continue_node".
> > > 
> > > *continue_node: pointer to the starting node of the leftover. If
> > > *continue_node
> > > has been used at the time of the second call (i.e. continue_node->next ==
> > > NULL),
> > > which implies that the previous 1000 nodes have been used, then the record()
> > > function can simply start from the head of the list.
> > > 
> > > It is up to the caller whether it needs to continue the second round
> > > when getting "incomplete".
> > It might be cleaner to add APIs to
> > 	- start iteration
> > 	- do one step
> > 	- end iteration
> > 
> > caller can then iterate without too many issues
> > 
> 
> OK. I will re-implement it with this simple one - get only one node(page
> block) from the list in each call, and check if the time would increase a
> lot in comparison to v8.
> 
> Best,
> Wei

Might work though this isn't what was suggested - just an iterator based
approach that allows user to drop the lock periodically.
Wang, Wei W April 13, 2017, 11:07 a.m. UTC | #8
On 03/17/2017 05:28 AM, Andrew Morton wrote:
> On Thu, 16 Mar 2017 15:08:46 +0800 Wei Wang <wei.w.wang@intel.com> wrote:
>
>> From: Liang Li <liang.z.li@intel.com>
>>
>> This patch adds a function to provides a snapshot of the present system
>> unused pages. An important usage of this function is to provide the
>> unsused pages to the Live migration thread, which skips the transfer of
>> thoses unused pages. Newly used pages can be re-tracked by the dirty
>> page logging mechanisms.
> I don't think this will be useful for anything other than
> virtio-balloon.  I guess it would be better to keep this code in the
> virtio-balloon driver if possible, even though that's rather a layering
> violation :( What would have to be done to make that possible?  Perhaps
> we can put some *small* helpers into page_alloc.c to prevent things
> from becoming too ugly.
>
>
Thanks for the suggestion. Small helpers do look more elegant. The nice 
thing is that I also didn't see any performance loss.
To make that possible, we need to enable for_each_polulated_zone() to be 
callable by a kernel module. Please have a check the v9 patches that I 
just posted out.

Best,
Wei
diff mbox

Patch

diff --git a/include/linux/mm.h b/include/linux/mm.h
index b84615b..869749d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1764,6 +1764,9 @@  extern void free_area_init(unsigned long * zones_size);
 extern void free_area_init_node(int nid, unsigned long * zones_size,
 		unsigned long zone_start_pfn, unsigned long *zholes_size);
 extern void free_initmem(void);
+extern int record_unused_pages(struct zone **start_zone, int order,
+			       __le64 *pages, unsigned int size,
+			       unsigned int *pos, bool part_fill);
 
 /*
  * Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f3e0c69..b72a7ac 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4498,6 +4498,120 @@  void show_free_areas(unsigned int filter)
 	show_swap_cache_info();
 }
 
+static int __record_unused_pages(struct zone *zone, int order,
+				 __le64 *buf, unsigned int size,
+				 unsigned int *offset, bool part_fill)
+{
+	unsigned long pfn, flags;
+	int t, ret = 0;
+	struct list_head *curr;
+	__le64 *chunk;
+
+	if (zone_is_empty(zone))
+		return 0;
+
+	spin_lock_irqsave(&zone->lock, flags);
+
+	if (*offset + zone->free_area[order].nr_free > size && !part_fill) {
+		ret = -ENOSPC;
+		goto out;
+	}
+	for (t = 0; t < MIGRATE_TYPES; t++) {
+		list_for_each(curr, &zone->free_area[order].free_list[t]) {
+			pfn = page_to_pfn(list_entry(curr, struct page, lru));
+			chunk = buf + *offset;
+			if (*offset + 2 > size) {
+				ret = -ENOSPC;
+				goto out;
+			}
+			/* Align to the chunk format used in virtio-balloon */
+			*chunk = cpu_to_le64(pfn << 12);
+			*(chunk + 1) = cpu_to_le64((1 << order) << 12);
+			*offset += 2;
+		}
+	}
+
+out:
+	spin_unlock_irqrestore(&zone->lock, flags);
+
+	return ret;
+}
+
+/*
+ * The record_unused_pages() function is used to record the system unused
+ * pages. The unused pages can be skipped to transfer during live migration.
+ * Though the unused pages are dynamically changing, dirty page logging
+ * mechanisms are able to capture the newly used pages though they were
+ * recorded as unused pages via this function.
+ *
+ * This function scans the free page list of the specified order to record
+ * the unused pages, and chunks those continuous pages following the chunk
+ * format below:
+ * --------------------------------------
+ * |	Base (52-bit)	| Rsvd (12-bit) |
+ * --------------------------------------
+ * --------------------------------------
+ * |	Size (52-bit)	| Rsvd (12-bit) |
+ * --------------------------------------
+ *
+ * @start_zone: zone to start the record operation.
+ * @order: order of the free page list to record.
+ * @buf: buffer to record the unused page info in chunks.
+ * @size: size of the buffer in __le64 to record
+ * @offset: offset in the buffer to record.
+ * @part_fill: indicate if partial fill is used.
+ *
+ * return -EINVAL if parameter is invalid
+ * return -ENOSPC when the buffer is too small to record all the unsed pages
+ * return 0 when sccess
+ */
+int record_unused_pages(struct zone **start_zone, int order,
+			__le64 *buf, unsigned int size,
+			unsigned int *offset, bool part_fill)
+{
+	struct zone *zone;
+	int ret = 0;
+	bool skip_check = false;
+
+	/* Make sure all the parameters are valid */
+	if (buf == NULL || offset == NULL || order >= MAX_ORDER)
+		return -EINVAL;
+
+	if (*start_zone != NULL) {
+		bool found = false;
+
+		for_each_populated_zone(zone) {
+			if (zone != *start_zone)
+				continue;
+			found = true;
+			break;
+		}
+		if (!found)
+			return -EINVAL;
+	} else
+		skip_check = true;
+
+	for_each_populated_zone(zone) {
+		/* Start from *start_zone if it's not NULL */
+		if (!skip_check) {
+			if (*start_zone != zone)
+				continue;
+			else
+				skip_check = true;
+		}
+		ret = __record_unused_pages(zone, order, buf, size,
+					    offset, part_fill);
+		if (ret < 0) {
+			/* record the failed zone */
+			*start_zone = zone;
+			break;
+		}
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(record_unused_pages);
+
 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
 {
 	zoneref->zone = zone;