mm: make device private reference counts zero based

Message ID 20201008172544.29905-1-rcampbell@nvidia.com (mailing list archive)
State New, archived
Series: mm: make device private reference counts zero based

Commit Message

Ralph Campbell Oct. 8, 2020, 5:25 p.m. UTC
ZONE_DEVICE struct pages have an extra reference count that complicates the
code for put_page() and several places in the kernel that need to check the
reference count to see that a page is not being used (gup, compaction,
migration, etc.). Clean up the code so the reference count doesn't need to
be treated specially for device private pages, leaving DAX as still being
a special case.
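
For example, a "free" device private page still holds its extra
reference, so fixups such as this one in expected_page_refs() (removed
by the mm/migrate.c hunk below) were needed:

	/*
	 * Device private pages have an extra refcount as they are
	 * ZONE_DEVICE pages.
	 */
	expected_count += is_device_private_page(page);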

Signed-off-by: Ralph Campbell <rcampbell@nvidia.com>
---

I'm sending this as a separate patch since I think it is ready to
merge. Originally, this was part of an RFC:
https://lore.kernel.org/linux-mm/20201001181715.17416-1-rcampbell@nvidia.com
and has been changed to make only device private struct page reference
counts zero based, since DAX needs to detect when a page is no longer
referenced by GUP, not when it has no references at all.

It applies cleanly to linux-5.9.0-rc8-mm1 plus my patch
("ext4/xfs: add page refcount helper")
https://lore.kernel.org/linux-mm/20201007214925.11181-1-rcampbell@nvidia.com
but doesn't really depend on it; without that patch there is just a
simple merge conflict.
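
For reference, the DAX idle-page test stays one based; the helper from
that patch is presumably along these lines (sketch only, not verified
against the posted version):

	/* DAX: idle when only the extra ZONE_DEVICE reference remains */
	static inline bool dax_page_unused(struct page *page)
	{
		return page_ref_count(page) == 1;
	}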

This is for Andrew Morton after the 5.10.0-rc1 merge window closes.

 arch/powerpc/kvm/book3s_hv_uvmem.c     | 13 +++++++-
 drivers/gpu/drm/nouveau/nouveau_dmem.c |  2 +-
 include/linux/memremap.h               |  6 ++--
 include/linux/mm.h                     |  9 +-----
 lib/test_hmm.c                         |  7 ++++-
 mm/internal.h                          |  8 +++++
 mm/memremap.c                          | 42 ++++++++++----------------
 mm/migrate.c                           |  5 ---
 mm/swap.c                              |  6 ++--
 9 files changed, 51 insertions(+), 47 deletions(-)

Comments

Ira Weiny Oct. 9, 2020, 4:53 p.m. UTC | #1
On Thu, Oct 08, 2020 at 10:25:44AM -0700, Ralph Campbell wrote:
> ZONE_DEVICE struct pages have an extra reference count that complicates the
> code for put_page() and several places in the kernel that need to check the
> reference count to see that a page is not being used (gup, compaction,
> migration, etc.). Clean up the code so the reference count doesn't need to
> be treated specially for device private pages, leaving DAX as still being
> a special case.

What about the check in mc_handle_swap_pte()?

mm/memcontrol.c:

5513                 /*
5514                  * MEMORY_DEVICE_PRIVATE means ZONE_DEVICE page and which have
5515                  * a refcount of 1 when free (unlike normal page)
5516                  */
5517                 if (!page_ref_add_unless(page, 1, 1))
5518                         return NULL;

... does that need to change?  Perhaps just the comment?

> 
> Signed-off-by: Ralph Campbell <rcampbell@nvidia.com>
> ---
> 

[snip]

>  
>  void put_devmap_managed_page(struct page *page);
> diff --git a/lib/test_hmm.c b/lib/test_hmm.c
> index e151a7f10519..bf92a261fa6f 100644
> --- a/lib/test_hmm.c
> +++ b/lib/test_hmm.c
> @@ -509,10 +509,15 @@ static bool dmirror_allocate_chunk(struct dmirror_device *mdevice,
>  		mdevice->devmem_count * (DEVMEM_CHUNK_SIZE / (1024 * 1024)),
>  		pfn_first, pfn_last);
>  
> +	/*
> +	 * Pages are created with an initial reference count of one but should
> +	 * have a reference count of zero while in the free state.
> +	 */
>  	spin_lock(&mdevice->lock);
>  	for (pfn = pfn_first; pfn < pfn_last; pfn++) {
>  		struct page *page = pfn_to_page(pfn);
>  
> +		set_page_count(page, 0);

This confuses me.  How does this and init_page_count() not confuse the buddy
allocator?  Don't you have to reset the refcount somewhere after the test?

>  		page->zone_device_data = mdevice->free_pages;
>  		mdevice->free_pages = page;
>  	}
> @@ -561,7 +566,7 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
>  	}
>  
>  	dpage->zone_device_data = rpage;
> -	get_page(dpage);
> +	init_page_count(dpage);
>  	lock_page(dpage);
>  	return dpage;
>  
> diff --git a/mm/internal.h b/mm/internal.h
> index c43ccdddb0f6..e1443b73aa9b 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
>  

[snip]

> diff --git a/mm/swap.c b/mm/swap.c
> index 0eb057141a04..93d880c6f73c 100644
> --- a/mm/swap.c
> +++ b/mm/swap.c
> @@ -116,12 +116,11 @@ static void __put_compound_page(struct page *page)
>  void __put_page(struct page *page)
>  {
>  	if (is_zone_device_page(page)) {
> -		put_dev_pagemap(page->pgmap);
> -
>  		/*
>  		 * The page belongs to the device that created pgmap. Do
>  		 * not return it to page allocator.
>  		 */
> +		free_zone_device_page(page);

I really like this.

Ira
Ralph Campbell Oct. 9, 2020, 5:23 p.m. UTC | #2
On 10/9/20 9:53 AM, Ira Weiny wrote:
> On Thu, Oct 08, 2020 at 10:25:44AM -0700, Ralph Campbell wrote:
>> ZONE_DEVICE struct pages have an extra reference count that complicates the
>> code for put_page() and several places in the kernel that need to check the
>> reference count to see that a page is not being used (gup, compaction,
>> migration, etc.). Clean up the code so the reference count doesn't need to
>> be treated specially for device private pages, leaving DAX as still being
>> a special case.
> 
> What about the check in mc_handle_swap_pte()?
> 
> mm/memcontrol.c:
> 
> 5513                 /*
> 5514                  * MEMORY_DEVICE_PRIVATE means ZONE_DEVICE page and which have
> 5515                  * a refcount of 1 when free (unlike normal page)
> 5516                  */
> 5517                 if (!page_ref_add_unless(page, 1, 1))
> 5518                         return NULL;
> 
> ... does that need to change?  Perhaps just the comment?

Thanks for spotting this.

Actually, this whole bit of code is never reached because the earlier

    if (non_swap_entry(ent))
            return NULL;

check covers device private pages and returns early.

The device private pages are accounted for in mem_cgroup, so this
probably needs fixing. I plan to move the code before the
non_swap_entry() check and change the refcount increment to
page_ref_add_unless(page, 1, 0), as sketched below.
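
Something like this untested sketch, moved ahead of the
non_swap_entry() check:

	if (is_device_private_entry(ent)) {
		page = device_private_entry_to_page(ent);
		/*
		 * A free device private page now has a refcount of
		 * zero, so only take a reference if the page is still
		 * in use.
		 */
		if (!page_ref_add_unless(page, 1, 0))
			return NULL;
		return page;
	}

	if (non_swap_entry(ent))
		return NULL;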

>>
>> Signed-off-by: Ralph Campbell <rcampbell@nvidia.com>
>> ---
>>
> 
> [snip]
> 
>>   
>>   void put_devmap_managed_page(struct page *page);
>> diff --git a/lib/test_hmm.c b/lib/test_hmm.c
>> index e151a7f10519..bf92a261fa6f 100644
>> --- a/lib/test_hmm.c
>> +++ b/lib/test_hmm.c
>> @@ -509,10 +509,15 @@ static bool dmirror_allocate_chunk(struct dmirror_device *mdevice,
>>   		mdevice->devmem_count * (DEVMEM_CHUNK_SIZE / (1024 * 1024)),
>>   		pfn_first, pfn_last);
>>   
>> +	/*
>> +	 * Pages are created with an initial reference count of one but should
>> +	 * have a reference count of zero while in the free state.
>> +	 */
>>   	spin_lock(&mdevice->lock);
>>   	for (pfn = pfn_first; pfn < pfn_last; pfn++) {
>>   		struct page *page = pfn_to_page(pfn);
>>   
>> +		set_page_count(page, 0);
> 
> This confuses me.  How does this and init_page_count() not confuse the buddy
> allocator?  Don't you have to reset the refcount somewhere after the test?

Device private struct pages are created with memremap_pages() and
destroyed with memunmap_pages(). They are never put on the LRU and
never freed to the page allocator. When put_page() drops the refcount
to zero, it calls pgmap->page_free() instead, so only the driver
handles the free pages it creates.
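
As a concrete example, a minimal page_free() callback in the style of
test_hmm just parks the page back on the driver's free list (simplified
sketch; the real dmirror_devmem_free() also frees the backing system
memory page first):

	static void dmirror_devmem_free(struct page *page)
	{
		struct dmirror_device *mdevice =
			dmirror_page_to_device(page);

		/* The refcount is zero; the driver owns the page again. */
		spin_lock(&mdevice->lock);
		page->zone_device_data = mdevice->free_pages;
		mdevice->free_pages = page;
		spin_unlock(&mdevice->lock);
	}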


>>   		page->zone_device_data = mdevice->free_pages;
>>   		mdevice->free_pages = page;
>>   	}
>> @@ -561,7 +566,7 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
>>   	}
>>   
>>   	dpage->zone_device_data = rpage;
>> -	get_page(dpage);
>> +	init_page_count(dpage);
>>   	lock_page(dpage);
>>   	return dpage;
>>   
>> diff --git a/mm/internal.h b/mm/internal.h
>> index c43ccdddb0f6..e1443b73aa9b 100644
>> --- a/mm/internal.h
>> +++ b/mm/internal.h
>>   
> 
> [snip]
> 
>> diff --git a/mm/swap.c b/mm/swap.c
>> index 0eb057141a04..93d880c6f73c 100644
>> --- a/mm/swap.c
>> +++ b/mm/swap.c
>> @@ -116,12 +116,11 @@ static void __put_compound_page(struct page *page)
>>   void __put_page(struct page *page)
>>   {
>>   	if (is_zone_device_page(page)) {
>> -		put_dev_pagemap(page->pgmap);
>> -
>>   		/*
>>   		 * The page belongs to the device that created pgmap. Do
>>   		 * not return it to page allocator.
>>   		 */
>> +		free_zone_device_page(page);
> 
> I really like this.
> 
> Ira
>

Patch

diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index 84e5a2dc8be5..a0d08b1d8c1e 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -711,7 +711,7 @@  static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)
 
 	dpage = pfn_to_page(uvmem_pfn);
 	dpage->zone_device_data = pvt;
-	get_page(dpage);
+	init_page_count(dpage);
 	lock_page(dpage);
 	return dpage;
 out_clear:
@@ -1151,6 +1151,7 @@  int kvmppc_uvmem_init(void)
 	struct resource *res;
 	void *addr;
 	unsigned long pfn_last, pfn_first;
+	unsigned long pfn;
 
 	size = kvmppc_get_secmem_size();
 	if (!size) {
@@ -1191,6 +1192,16 @@  int kvmppc_uvmem_init(void)
 		goto out_unmap;
 	}
 
+	/*
+	 * Pages are created with an initial reference count of one but should
+	 * have a reference count of zero while in the free state.
+	 */
+	for (pfn = pfn_first; pfn < pfn_last; pfn++) {
+		struct page *dpage = pfn_to_page(pfn);
+
+		set_page_count(dpage, 0);
+	}
+
 	pr_info("KVMPPC-UVMEM: Secure Memory size 0x%lx\n", size);
 	return ret;
 out_unmap:
diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index 92987daa5e17..8bc7120e1216 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -324,7 +324,7 @@  nouveau_dmem_page_alloc_locked(struct nouveau_drm *drm)
 			return NULL;
 	}
 
-	get_page(page);
+	init_page_count(page);
 	lock_page(page);
 	return page;
 }
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 86c6c368ce9b..43860870bc51 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -66,9 +66,9 @@  enum memory_type {
 
 struct dev_pagemap_ops {
 	/*
-	 * Called once the page refcount reaches 1.  (ZONE_DEVICE pages never
-	 * reach 0 refcount unless there is a refcount bug. This allows the
-	 * device driver to implement its own memory management.)
+	 * Called once the page refcount reaches 0. The reference count should
+	 * be reset to one with init_page_count(page) before reusing the page.
+	 * This allows device drivers to implement their own memory management.
 	 */
 	void (*page_free)(struct page *page);
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index d6b8e30dce2e..ac848eeb2a1d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1110,14 +1110,7 @@  static inline bool page_is_devmap_managed(struct page *page)
 		return false;
 	if (!is_zone_device_page(page))
 		return false;
-	switch (page->pgmap->type) {
-	case MEMORY_DEVICE_PRIVATE:
-	case MEMORY_DEVICE_FS_DAX:
-		return true;
-	default:
-		break;
-	}
-	return false;
+	return page->pgmap->type == MEMORY_DEVICE_FS_DAX;
 }
 
 void put_devmap_managed_page(struct page *page);
diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index e151a7f10519..bf92a261fa6f 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -509,10 +509,15 @@  static bool dmirror_allocate_chunk(struct dmirror_device *mdevice,
 		mdevice->devmem_count * (DEVMEM_CHUNK_SIZE / (1024 * 1024)),
 		pfn_first, pfn_last);
 
+	/*
+	 * Pages are created with an initial reference count of one but should
+	 * have a reference count of zero while in the free state.
+	 */
 	spin_lock(&mdevice->lock);
 	for (pfn = pfn_first; pfn < pfn_last; pfn++) {
 		struct page *page = pfn_to_page(pfn);
 
+		set_page_count(page, 0);
 		page->zone_device_data = mdevice->free_pages;
 		mdevice->free_pages = page;
 	}
@@ -561,7 +566,7 @@  static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
 	}
 
 	dpage->zone_device_data = rpage;
-	get_page(dpage);
+	init_page_count(dpage);
 	lock_page(dpage);
 	return dpage;
 
diff --git a/mm/internal.h b/mm/internal.h
index c43ccdddb0f6..e1443b73aa9b 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -618,4 +618,12 @@  struct migration_target_control {
 	gfp_t gfp_mask;
 };
 
+#ifdef CONFIG_DEV_PAGEMAP_OPS
+void free_zone_device_page(struct page *page);
+#else
+static inline void free_zone_device_page(struct page *page)
+{
+}
+#endif
+
 #endif	/* __MM_INTERNAL_H */
diff --git a/mm/memremap.c b/mm/memremap.c
index 504a10ff2edf..a163a9e36e56 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -49,12 +49,6 @@  static void devmap_managed_enable_put(void)
 
 static int devmap_managed_enable_get(struct dev_pagemap *pgmap)
 {
-	if (pgmap->type == MEMORY_DEVICE_PRIVATE &&
-	    (!pgmap->ops || !pgmap->ops->page_free)) {
-		WARN(1, "Missing page_free method\n");
-		return -EINVAL;
-	}
-
 	static_branch_inc(&devmap_managed_key);
 	return 0;
 }
@@ -92,13 +86,6 @@  static unsigned long pfn_end(struct dev_pagemap *pgmap, int range_id)
 	return (range->start + range_len(range)) >> PAGE_SHIFT;
 }
 
-static unsigned long pfn_next(unsigned long pfn)
-{
-	if (pfn % 1024 == 0)
-		cond_resched();
-	return pfn + 1;
-}
-
 /*
  * This returns true if the page is reserved by ZONE_DEVICE driver.
  */
@@ -119,9 +106,6 @@  bool pfn_zone_device_reserved(unsigned long pfn)
 	return ret;
 }
 
-#define for_each_device_pfn(pfn, map, i) \
-	for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); pfn = pfn_next(pfn))
-
 static void dev_pagemap_kill(struct dev_pagemap *pgmap)
 {
 	if (pgmap->ops && pgmap->ops->kill)
@@ -177,20 +161,20 @@  static void pageunmap_range(struct dev_pagemap *pgmap, int range_id)
 
 void memunmap_pages(struct dev_pagemap *pgmap)
 {
-	unsigned long pfn;
 	int i;
 
 	dev_pagemap_kill(pgmap);
 	for (i = 0; i < pgmap->nr_range; i++)
-		for_each_device_pfn(pfn, pgmap, i)
-			put_page(pfn_to_page(pfn));
+		percpu_ref_put_many(pgmap->ref, pfn_end(pgmap, i) -
+						pfn_first(pgmap, i));
 	dev_pagemap_cleanup(pgmap);
 
 	for (i = 0; i < pgmap->nr_range; i++)
 		pageunmap_range(pgmap, i);
 
 	WARN_ONCE(pgmap->altmap.alloc, "failed to free all reserved pages\n");
-	devmap_managed_enable_put();
+	if (pgmap->type == MEMORY_DEVICE_FS_DAX)
+		devmap_managed_enable_put();
 }
 EXPORT_SYMBOL_GPL(memunmap_pages);
 
@@ -328,7 +312,7 @@  void *memremap_pages(struct dev_pagemap *pgmap, int nid)
 		.pgprot = PAGE_KERNEL,
 	};
 	const int nr_range = pgmap->nr_range;
-	bool need_devmap_managed = true;
+	bool need_devmap_managed = false;
 	int error, i;
 
 	if (WARN_ONCE(!nr_range, "nr_range must be specified\n"))
@@ -344,6 +328,10 @@  void *memremap_pages(struct dev_pagemap *pgmap, int nid)
 			WARN(1, "Missing migrate_to_ram method\n");
 			return ERR_PTR(-EINVAL);
 		}
+		if (!pgmap->ops->page_free) {
+			WARN(1, "Missing page_free method\n");
+			return ERR_PTR(-EINVAL);
+		}
 		if (!pgmap->owner) {
 			WARN(1, "Missing owner\n");
 			return ERR_PTR(-EINVAL);
@@ -355,13 +343,12 @@  void *memremap_pages(struct dev_pagemap *pgmap, int nid)
 			WARN(1, "File system DAX not supported\n");
 			return ERR_PTR(-EINVAL);
 		}
+		need_devmap_managed = true;
 		break;
 	case MEMORY_DEVICE_GENERIC:
-		need_devmap_managed = false;
 		break;
 	case MEMORY_DEVICE_PCI_P2PDMA:
 		params.pgprot = pgprot_noncached(params.pgprot);
-		need_devmap_managed = false;
 		break;
 	default:
 		WARN(1, "Invalid pgmap type %d\n", pgmap->type);
@@ -508,10 +495,13 @@  EXPORT_SYMBOL_GPL(get_dev_pagemap);
 void free_devmap_managed_page(struct page *page)
 {
 	/* notify page idle for dax */
-	if (!is_device_private_page(page)) {
-		dax_wakeup_page(page);
+	dax_wakeup_page(page);
+}
+
+void free_zone_device_page(struct page *page)
+{
+	if (!is_device_private_page(page))
 		return;
-	}
 
 	__ClearPageWaiters(page);
 
diff --git a/mm/migrate.c b/mm/migrate.c
index 5ca5842df5db..ee09334b46d8 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -380,11 +380,6 @@  static int expected_page_refs(struct address_space *mapping, struct page *page)
 {
 	int expected_count = 1;
 
-	/*
-	 * Device private pages have an extra refcount as they are
-	 * ZONE_DEVICE pages.
-	 */
-	expected_count += is_device_private_page(page);
 	if (mapping)
 		expected_count += thp_nr_pages(page) + page_has_private(page);
 
diff --git a/mm/swap.c b/mm/swap.c
index 0eb057141a04..93d880c6f73c 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -116,12 +116,11 @@  static void __put_compound_page(struct page *page)
 void __put_page(struct page *page)
 {
 	if (is_zone_device_page(page)) {
-		put_dev_pagemap(page->pgmap);
-
 		/*
 		 * The page belongs to the device that created pgmap. Do
 		 * not return it to page allocator.
 		 */
+		free_zone_device_page(page);
 		return;
 	}
 
@@ -907,6 +906,9 @@  void release_pages(struct page **pages, int nr)
 				put_devmap_managed_page(page);
 				continue;
 			}
+			if (put_page_testzero(page))
+				__put_page(page);
+			continue;
 		}
 
 		if (!put_page_testzero(page))