
[RFC,18/18] mm: Add pfn_range_put()

Message ID: 20230830185041.3427464-4-willy@infradead.org
State: New
Series: Rearrange batched folio freeing

Commit Message

Matthew Wilcox Aug. 30, 2023, 6:50 p.m. UTC
This function will be used imminently.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 include/linux/mm.h |  7 ++++++
 mm/swap.c          | 56 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 63 insertions(+)

Comments

Ryan Roberts Aug. 31, 2023, 7:03 p.m. UTC | #1
On 30/08/2023 19:50, Matthew Wilcox (Oracle) wrote:
> This function will be used imminently.
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> ---
>  include/linux/mm.h |  7 ++++++
>  mm/swap.c          | 56 ++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 63 insertions(+)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index a517ef8d2386..4f9e2cfb372e 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1515,6 +1515,13 @@ typedef union {
>  void release_pages(release_pages_arg, int nr);
>  void folios_put(struct folio_batch *folios);
>  
> +struct pfn_range {
> +	unsigned long start;
> +	unsigned long end;
> +};

As mentioned in the other thread, I think we need to better convey that a
pfn_range must be fully contained within a single folio. Perhaps it's better to
call it `struct folio_region`?

> +
> +void pfn_range_put(struct pfn_range *range, unsigned int nr);

If going with `struct folio_region`, we could call this folios_put_refs()?

Or if you hate that idea and want to stick with pfn, then at least call it
pfn_ranges_put() (with s on ranges).
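
To make the first option concrete, roughly this shape (names purely
illustrative, fields as in your pfn_range):

	struct folio_region {
		/* Both pfns must lie within the same folio. */
		unsigned long start;	/* first pfn of the region */
		unsigned long end;	/* one past the last pfn */
	};

	void folios_put_refs(struct folio_region *regions, unsigned int nr);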

> +
>  static inline void put_page(struct page *page)
>  {
>  	struct folio *folio = page_folio(page);
> diff --git a/mm/swap.c b/mm/swap.c
> index 8bd15402cd8f..218d2cc4c6f4 100644
> --- a/mm/swap.c
> +++ b/mm/swap.c
> @@ -1027,6 +1027,62 @@ void folios_put(struct folio_batch *folios)
>  }
>  EXPORT_SYMBOL(folios_put);
>  
> +void pfn_range_put(struct pfn_range *range, unsigned int nr)
> +{
> +	struct folio_batch folios;
> +	unsigned int i;
> +	struct lruvec *lruvec = NULL;
> +	unsigned long flags = 0;
> +
> +	folio_batch_init(&folios);
> +	for (i = 0; i < nr; i++) {
> +		struct folio *folio = pfn_folio(range[i].start);
> +		unsigned int refs = range[i].end - range[i].start;

Don't you need to check for huge zero page like in folios_put()?

		if (is_huge_zero_page(&folio->page))
			continue;

> +
> +		if (folio_is_zone_device(folio)) {
> +			if (lruvec) {
> +				unlock_page_lruvec_irqrestore(lruvec, flags);
> +				lruvec = NULL;
> +			}
> +			if (put_devmap_managed_page_refs(&folio->page, refs))
> +				continue;
> +			if (folio_ref_sub_and_test(folio, refs))
> +				free_zone_device_page(&folio->page);
> +			continue;
> +		}
> +
> +		if (!folio_ref_sub_and_test(folio, refs))
> +			continue;
> +
> +		/* hugetlb has its own memcg */
> +		if (folio_test_hugetlb(folio)) {
> +			if (lruvec) {
> +				unlock_page_lruvec_irqrestore(lruvec, flags);
> +				lruvec = NULL;
> +			}
> +			free_huge_folio(folio);
> +			continue;
> +		}
> +
> +		__page_cache_release(folio, &lruvec, &flags);
> +		if (folio_batch_add(&folios, folio) == 0) {
> +			if (lruvec) {
> +				unlock_page_lruvec_irqrestore(lruvec, flags);
> +				lruvec = NULL;
> +			}
> +			mem_cgroup_uncharge_folios(&folios);
> +			free_unref_folios(&folios);
> +		}
> +	}
> +	if (lruvec)
> +		unlock_page_lruvec_irqrestore(lruvec, flags);
> +
> +	if (folios.nr) {
> +		mem_cgroup_uncharge_folios(&folios);
> +		free_unref_folios(&folios);
> +	}
> +}

This still duplicates a lot of the logic in folios_put(), but I see you have an
idea in the other thread for improving this situation - I'll reply in the
context of that thread.

But overall this looks great - thanks for taking the time to put this together -
it will integrate nicely with my mmugather series.

Thanks,
Ryan


> +
>  /**
>   * release_pages - batched put_page()
>   * @arg: array of pages to release
Matthew Wilcox Sept. 1, 2023, 4:27 a.m. UTC | #2
On Thu, Aug 31, 2023 at 08:03:14PM +0100, Ryan Roberts wrote:
> > +++ b/include/linux/mm.h
> > @@ -1515,6 +1515,13 @@ typedef union {
> >  void release_pages(release_pages_arg, int nr);
> >  void folios_put(struct folio_batch *folios);
> >  
> > +struct pfn_range {
> > +	unsigned long start;
> > +	unsigned long end;
> > +};
> 
> As mentioned in the other thread, I think we need to better convey that a
> pfn_range must be fully contained within a single folio. Perhaps it's better to
> call it `struct folio_region`?

Yep, I'm not taking a strong position on that.  We can call it whatever
you like, subject to the usual nitpicking later.

> > +
> > +void pfn_range_put(struct pfn_range *range, unsigned int nr);
> 
> If going with `struct folio_region`, we could call this folios_put_refs()?
> 
> Or if you hate that idea and want to stick with pfn, then at least call it
> pfn_ranges_put() (with s on ranges).

folio_regions_put()?  I don't know at this point; nothing's speaking
to me.

> > +void pfn_range_put(struct pfn_range *range, unsigned int nr)
> > +{
> > +	struct folio_batch folios;
> > +	unsigned int i;
> > +	struct lruvec *lruvec = NULL;
> > +	unsigned long flags = 0;
> > +
> > +	folio_batch_init(&folios);
> > +	for (i = 0; i < nr; i++) {
> > +		struct folio *folio = pfn_folio(range[i].start);
> > +		unsigned int refs = range[i].end - range[i].start;
> 
> Don't you need to check for huge zero page like in folios_put()?
> 
> 		if (is_huge_zero_page(&folio->page))
> 			continue;

Maybe?  Can we put the huge zero page in here, or would we filter it
out earlier?

> > +	if (lruvec)
> > +		unlock_page_lruvec_irqrestore(lruvec, flags);
> > +
> > +	if (folios.nr) {
> > +		mem_cgroup_uncharge_folios(&folios);
> > +		free_unref_folios(&folios);
> > +	}
> > +}
> 
> This still duplicates a lot of the logic in folios_put(), but I see you have an
> idea in the other thread for improving this situation - I'll reply in the
> context of that thread.
> 
> But overall this looks great - thanks for taking the time to put this together -
> it will integrate nicely with my mmugather series.
> 

Thanks!  One thing I was wondering; intended to look at today, but didn't
have time for it -- can we use mmugather to solve how vmscan wants to
handle batched TLB IPIs for mapped pages, instead of having its own thing?
72b252aed506 is the commit to look at.
Ryan Roberts Sept. 1, 2023, 7:59 a.m. UTC | #3
On 01/09/2023 05:27, Matthew Wilcox wrote:
> On Thu, Aug 31, 2023 at 08:03:14PM +0100, Ryan Roberts wrote:
>>> +++ b/include/linux/mm.h
>>> @@ -1515,6 +1515,13 @@ typedef union {
>>>  void release_pages(release_pages_arg, int nr);
>>>  void folios_put(struct folio_batch *folios);
>>>  
>>> +struct pfn_range {
>>> +	unsigned long start;
>>> +	unsigned long end;
>>> +};
>>
>> As mentioned in the other thread, I think we need to better convey that a
>> pfn_range must be fully contained within a single folio. Perhaps it's better to
>> call it `struct folio_region`?
> 
> Yep, I'm not taking a strong position on that.  We can call it whatever
> you like, subject to the usual nitpicking later.
> 
>>> +
>>> +void pfn_range_put(struct pfn_range *range, unsigned int nr);
>>
>> If going with `struct folio_region`, we could call this folios_put_refs()?
>>
>> Or if you hate that idea and want to stick with pfn, then at least call it
>> pfn_ranges_put() (with s on ranges).
> 
> folio_regions_put()?  I don't know at this point; nothing's speaking
> to me.

I'm inclined to call it `struct folio_region` and the function
free_folio_regions_and_swap_cache(), which then replaces
free_pages_and_swap_cache() (as per the suggestion against your patch set).
For now, I'll implement free_folio_regions_and_swap_cache() along the lines of
release_pages() + free_swap_cache(), so your patch set just has to reimplement
it using the folio_batch approach; a rough sketch is below. Shout if that
sounds problematic. Otherwise I'll post a new version of my patch set later
today.
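
Untested, and only to show the shape I mean -- it mirrors today's
free_pages_and_swap_cache(), and folios_put_refs() is the hypothetical rename
of pfn_range_put() discussed above:

	void free_folio_regions_and_swap_cache(struct folio_region *regions,
					       unsigned int nr)
	{
		unsigned int i;

		lru_add_drain();
		/* Drop each region's swap cache entry before the refs. */
		for (i = 0; i < nr; i++)
			free_swap_cache(pfn_to_page(regions[i].start));
		folios_put_refs(regions, nr);
	}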

> 
>>> +void pfn_range_put(struct pfn_range *range, unsigned int nr)
>>> +{
>>> +	struct folio_batch folios;
>>> +	unsigned int i;
>>> +	struct lruvec *lruvec = NULL;
>>> +	unsigned long flags = 0;
>>> +
>>> +	folio_batch_init(&folios);
>>> +	for (i = 0; i < nr; i++) {
>>> +		struct folio *folio = pfn_folio(range[i].start);
>>> +		unsigned int refs = range[i].end - range[i].start;
>>
>> Don't you need to check for huge zero page like in folios_put()?
>>
>> 		if (is_huge_zero_page(&folio->page))
>> 			continue;
> 
> Maybe?  Can we put the huge zero page in here, or would we filter it
> out earlier?

I'm not sure there is any earlier opportunity. The next earliest would be to
filter it out of the mmugather batch, and that feels like it confuses
responsibilities too much. So my vote is to put it here.
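
Concretely, it would slot in at the top of the loop, before any refcount
manipulation (sketch only, restating the loop from the patch):

	for (i = 0; i < nr; i++) {
		struct folio *folio = pfn_folio(range[i].start);
		unsigned int refs = range[i].end - range[i].start;

		/* Skip the huge zero folio, as folios_put() does. */
		if (is_huge_zero_page(&folio->page))
			continue;
		/* ... rest of the loop unchanged ... */
	}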


> 
>>> +	if (lruvec)
>>> +		unlock_page_lruvec_irqrestore(lruvec, flags);
>>> +
>>> +	if (folios.nr) {
>>> +		mem_cgroup_uncharge_folios(&folios);
>>> +		free_unref_folios(&folios);
>>> +	}
>>> +}
>>
>> This still duplicates a lot of the logic in folios_put(), but I see you have an
>> idea in the other thread for improving this situation - I'll reply in the
>> context of that thread.
>>
>> But overall this looks great - thanks for taking the time to put this together -
>> it will integrate nicely with my mmugather series.
>>
> 
> Thanks!  One thing I was wondering; intended to look at today, but didn't
> have time for it -- can we use mmugather to solve how vmscan wants to
> handle batched TLB IPIs for mapped pages, instead of having its own thing?
> 72b252aed506 is the commit to look at.

Yeah, I thought about that too. mmugather works well for TLBIing a contiguous
range of VAs (it just maintains a range that gets expanded to cover the lowest
and highest entries). That fits things like munmap and mmap_exit, where a
virtually contiguous block is being zapped. But I'm guessing that for vmscan
the pages are usually not contiguous? If so, then I think mmugather would
over-TLBI and likely harm perf.

Patch

diff --git a/include/linux/mm.h b/include/linux/mm.h
index a517ef8d2386..4f9e2cfb372e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1515,6 +1515,13 @@ typedef union {
 void release_pages(release_pages_arg, int nr);
 void folios_put(struct folio_batch *folios);
 
+struct pfn_range {
+	unsigned long start;
+	unsigned long end;
+};
+
+void pfn_range_put(struct pfn_range *range, unsigned int nr);
+
 static inline void put_page(struct page *page)
 {
 	struct folio *folio = page_folio(page);
diff --git a/mm/swap.c b/mm/swap.c
index 8bd15402cd8f..218d2cc4c6f4 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -1027,6 +1027,62 @@ void folios_put(struct folio_batch *folios)
 }
 EXPORT_SYMBOL(folios_put);
 
+void pfn_range_put(struct pfn_range *range, unsigned int nr)
+{
+	struct folio_batch folios;
+	unsigned int i;
+	struct lruvec *lruvec = NULL;
+	unsigned long flags = 0;
+
+	folio_batch_init(&folios);
+	for (i = 0; i < nr; i++) {
+		struct folio *folio = pfn_folio(range[i].start);
+		unsigned int refs = range[i].end - range[i].start;
+
+		if (folio_is_zone_device(folio)) {
+			if (lruvec) {
+				unlock_page_lruvec_irqrestore(lruvec, flags);
+				lruvec = NULL;
+			}
+			if (put_devmap_managed_page_refs(&folio->page, refs))
+				continue;
+			if (folio_ref_sub_and_test(folio, refs))
+				free_zone_device_page(&folio->page);
+			continue;
+		}
+
+		if (!folio_ref_sub_and_test(folio, refs))
+			continue;
+
+		/* hugetlb has its own memcg */
+		if (folio_test_hugetlb(folio)) {
+			if (lruvec) {
+				unlock_page_lruvec_irqrestore(lruvec, flags);
+				lruvec = NULL;
+			}
+			free_huge_folio(folio);
+			continue;
+		}
+
+		__page_cache_release(folio, &lruvec, &flags);
+		if (folio_batch_add(&folios, folio) == 0) {
+			if (lruvec) {
+				unlock_page_lruvec_irqrestore(lruvec, flags);
+				lruvec = NULL;
+			}
+			mem_cgroup_uncharge_folios(&folios);
+			free_unref_folios(&folios);
+		}
+	}
+	if (lruvec)
+		unlock_page_lruvec_irqrestore(lruvec, flags);
+
+	if (folios.nr) {
+		mem_cgroup_uncharge_folios(&folios);
+		free_unref_folios(&folios);
+	}
+}
+
 /**
  * release_pages - batched put_page()
  * @arg: array of pages to release