diff mbox series

[v3,10/25] pci/p2pdma: Don't initialise page refcount to one

Message ID 27381b50b65a218da99a2448023b774dd75540df.1732239628.git-series.apopple@nvidia.com (mailing list archive)
State New
Headers show
Series fs/dax: Fix ZONE_DEVICE page reference counts | expand

Commit Message

Alistair Popple Nov. 22, 2024, 1:40 a.m. UTC
The reference counts for ZONE_DEVICE private pages should be
initialised by the driver when the page is actually allocated by the
driver allocator, not when they are first created. This is currently
the case for MEMORY_DEVICE_PRIVATE and MEMORY_DEVICE_COHERENT pages
but not MEMORY_DEVICE_PCI_P2PDMA pages so fix that up.

Signed-off-by: Alistair Popple <apopple@nvidia.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>

---

Changes since v2:

 - Initialise the page refcount for all pages covered by the kaddr
---
 drivers/pci/p2pdma.c | 13 +++++++++++--
 mm/memremap.c        | 17 +++++++++++++----
 mm/mm_init.c         | 22 ++++++++++++++++++----
 3 files changed, 42 insertions(+), 10 deletions(-)

Comments

Bjorn Helgaas Nov. 22, 2024, 6:20 p.m. UTC | #1
On Fri, Nov 22, 2024 at 12:40:31PM +1100, Alistair Popple wrote:
> The reference counts for ZONE_DEVICE private pages should be
> initialised by the driver when the page is actually allocated by the
> driver allocator, not when they are first created. This is currently
> the case for MEMORY_DEVICE_PRIVATE and MEMORY_DEVICE_COHERENT pages
> but not MEMORY_DEVICE_PCI_P2PDMA pages so fix that up.
> 
> Signed-off-by: Alistair Popple <apopple@nvidia.com>
> Reviewed-by: Dan Williams <dan.j.williams@intel.com>

Previously suggested tweaks to subject line prefix and content:
https://lore.kernel.org/all/20240629212851.GA1484889@bhelgaas/
https://lore.kernel.org/all/20240910134745.GA577955@bhelgaas/

I had the impression that you agreed there was the potential for some
confusion here, but it doesn't look like it was addressed.

So again, a PCI patch labeled "don't init refcount to one" where the
content initializes the refcount to one in p2pdma.c is still confusing
since (IIUC) the subject line refers to the NON-PCI code.

Maybe some sort of "move refcount init from X to p2pdma" or addition
of *who* is no longer initializing refcount to one would clear this
up.

> ---
> 
> Changes since v2:
> 
>  - Initialise the page refcount for all pages covered by the kaddr
> ---
>  drivers/pci/p2pdma.c | 13 +++++++++++--
>  mm/memremap.c        | 17 +++++++++++++----
>  mm/mm_init.c         | 22 ++++++++++++++++++----
>  3 files changed, 42 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
> index 4f47a13..2c5ac4a 100644
> --- a/drivers/pci/p2pdma.c
> +++ b/drivers/pci/p2pdma.c
> @@ -140,13 +140,22 @@ static int p2pmem_alloc_mmap(struct file *filp, struct kobject *kobj,
>  	rcu_read_unlock();
>  
>  	for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) {
> -		ret = vm_insert_page(vma, vaddr, virt_to_page(kaddr));
> +		struct page *page = virt_to_page(kaddr);
> +
> +		/*
> +		 * Initialise the refcount for the freshly allocated page. As
> +		 * we have just allocated the page no one else should be
> +		 * using it.
> +		 */
> +		VM_WARN_ON_ONCE_PAGE(!page_ref_count(page), page);
> +		set_page_count(page, 1);
> +		ret = vm_insert_page(vma, vaddr, page);
>  		if (ret) {
>  			gen_pool_free(p2pdma->pool, (uintptr_t)kaddr, len);
>  			return ret;
>  		}
>  		percpu_ref_get(ref);
> -		put_page(virt_to_page(kaddr));
> +		put_page(page);
>  		kaddr += PAGE_SIZE;
>  		len -= PAGE_SIZE;
>  	}
> diff --git a/mm/memremap.c b/mm/memremap.c
> index 40d4547..07bbe0e 100644
> --- a/mm/memremap.c
> +++ b/mm/memremap.c
> @@ -488,15 +488,24 @@ void free_zone_device_folio(struct folio *folio)
>  	folio->mapping = NULL;
>  	folio->page.pgmap->ops->page_free(folio_page(folio, 0));
>  
> -	if (folio->page.pgmap->type != MEMORY_DEVICE_PRIVATE &&
> -	    folio->page.pgmap->type != MEMORY_DEVICE_COHERENT)
> +	switch (folio->page.pgmap->type) {
> +	case MEMORY_DEVICE_PRIVATE:
> +	case MEMORY_DEVICE_COHERENT:
> +		put_dev_pagemap(folio->page.pgmap);
> +		break;
> +
> +	case MEMORY_DEVICE_FS_DAX:
> +	case MEMORY_DEVICE_GENERIC:
>  		/*
>  		 * Reset the refcount to 1 to prepare for handing out the page
>  		 * again.
>  		 */
>  		folio_set_count(folio, 1);
> -	else
> -		put_dev_pagemap(folio->page.pgmap);
> +		break;
> +
> +	case MEMORY_DEVICE_PCI_P2PDMA:
> +		break;
> +	}
>  }
>  
>  void zone_device_page_init(struct page *page)
> diff --git a/mm/mm_init.c b/mm/mm_init.c
> index 4ba5607..0489820 100644
> --- a/mm/mm_init.c
> +++ b/mm/mm_init.c
> @@ -1015,12 +1015,26 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
>  	}
>  
>  	/*
> -	 * ZONE_DEVICE pages are released directly to the driver page allocator
> -	 * which will set the page count to 1 when allocating the page.
> +	 * ZONE_DEVICE pages other than MEMORY_TYPE_GENERIC and
> +	 * MEMORY_TYPE_FS_DAX pages are released directly to the driver page
> +	 * allocator which will set the page count to 1 when allocating the
> +	 * page.
> +	 *
> +	 * MEMORY_TYPE_GENERIC and MEMORY_TYPE_FS_DAX pages automatically have
> +	 * their refcount reset to one whenever they are freed (ie. after
> +	 * their refcount drops to 0).
>  	 */
> -	if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
> -	    pgmap->type == MEMORY_DEVICE_COHERENT)
> +	switch (pgmap->type) {
> +	case MEMORY_DEVICE_PRIVATE:
> +	case MEMORY_DEVICE_COHERENT:
> +	case MEMORY_DEVICE_PCI_P2PDMA:
>  		set_page_count(page, 0);
> +		break;
> +
> +	case MEMORY_DEVICE_FS_DAX:
> +	case MEMORY_DEVICE_GENERIC:
> +		break;
> +	}
>  }
>  
>  /*
> -- 
> git-series 0.9.1
Alistair Popple Nov. 24, 2024, 10:39 p.m. UTC | #2
Bjorn Helgaas <helgaas@kernel.org> writes:

> On Fri, Nov 22, 2024 at 12:40:31PM +1100, Alistair Popple wrote:
>> The reference counts for ZONE_DEVICE private pages should be
>> initialised by the driver when the page is actually allocated by the
>> driver allocator, not when they are first created. This is currently
>> the case for MEMORY_DEVICE_PRIVATE and MEMORY_DEVICE_COHERENT pages
>> but not MEMORY_DEVICE_PCI_P2PDMA pages so fix that up.
>> 
>> Signed-off-by: Alistair Popple <apopple@nvidia.com>
>> Reviewed-by: Dan Williams <dan.j.williams@intel.com>
>
> Previously suggested tweaks to subject line prefix and content:
> https://lore.kernel.org/all/20240629212851.GA1484889@bhelgaas/
> https://lore.kernel.org/all/20240910134745.GA577955@bhelgaas/

Apologies Bjorn, I was so focussed on the DAX bits of this series which
had grown significantly and forgot to update the P2PDMA stuff. It's
largely independent of the DAX changes anyway, so unless you disagree I
think it would be best if I split it out into a separate series. Thanks
for taking a look.

> I had the impression that you agreed there was the potential for some
> confusion here, but it doesn't look like it was addressed.
>
> So again, a PCI patch labeled "don't init refcount to one" where the
> content initializes the refcount to one in p2pdma.c is still confusing
> since (IIUC) the subject line refers to the NON-PCI code.
>
> Maybe some sort of "move refcount init from X to p2pdma" or addition
> of *who* is no longer initializing refcount to one would clear this
> up.

To be clear I fully agree with the above, I just failed to do it.

 - Alistair

>> ---
>> 
>> Changes since v2:
>> 
>>  - Initialise the page refcount for all pages covered by the kaddr
>> ---
>>  drivers/pci/p2pdma.c | 13 +++++++++++--
>>  mm/memremap.c        | 17 +++++++++++++----
>>  mm/mm_init.c         | 22 ++++++++++++++++++----
>>  3 files changed, 42 insertions(+), 10 deletions(-)
>> 
>> diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
>> index 4f47a13..2c5ac4a 100644
>> --- a/drivers/pci/p2pdma.c
>> +++ b/drivers/pci/p2pdma.c
>> @@ -140,13 +140,22 @@ static int p2pmem_alloc_mmap(struct file *filp, struct kobject *kobj,
>>  	rcu_read_unlock();
>>  
>>  	for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) {
>> -		ret = vm_insert_page(vma, vaddr, virt_to_page(kaddr));
>> +		struct page *page = virt_to_page(kaddr);
>> +
>> +		/*
>> +		 * Initialise the refcount for the freshly allocated page. As
>> +		 * we have just allocated the page no one else should be
>> +		 * using it.
>> +		 */
>> +		VM_WARN_ON_ONCE_PAGE(!page_ref_count(page), page);
>> +		set_page_count(page, 1);
>> +		ret = vm_insert_page(vma, vaddr, page);
>>  		if (ret) {
>>  			gen_pool_free(p2pdma->pool, (uintptr_t)kaddr, len);
>>  			return ret;
>>  		}
>>  		percpu_ref_get(ref);
>> -		put_page(virt_to_page(kaddr));
>> +		put_page(page);
>>  		kaddr += PAGE_SIZE;
>>  		len -= PAGE_SIZE;
>>  	}
>> diff --git a/mm/memremap.c b/mm/memremap.c
>> index 40d4547..07bbe0e 100644
>> --- a/mm/memremap.c
>> +++ b/mm/memremap.c
>> @@ -488,15 +488,24 @@ void free_zone_device_folio(struct folio *folio)
>>  	folio->mapping = NULL;
>>  	folio->page.pgmap->ops->page_free(folio_page(folio, 0));
>>  
>> -	if (folio->page.pgmap->type != MEMORY_DEVICE_PRIVATE &&
>> -	    folio->page.pgmap->type != MEMORY_DEVICE_COHERENT)
>> +	switch (folio->page.pgmap->type) {
>> +	case MEMORY_DEVICE_PRIVATE:
>> +	case MEMORY_DEVICE_COHERENT:
>> +		put_dev_pagemap(folio->page.pgmap);
>> +		break;
>> +
>> +	case MEMORY_DEVICE_FS_DAX:
>> +	case MEMORY_DEVICE_GENERIC:
>>  		/*
>>  		 * Reset the refcount to 1 to prepare for handing out the page
>>  		 * again.
>>  		 */
>>  		folio_set_count(folio, 1);
>> -	else
>> -		put_dev_pagemap(folio->page.pgmap);
>> +		break;
>> +
>> +	case MEMORY_DEVICE_PCI_P2PDMA:
>> +		break;
>> +	}
>>  }
>>  
>>  void zone_device_page_init(struct page *page)
>> diff --git a/mm/mm_init.c b/mm/mm_init.c
>> index 4ba5607..0489820 100644
>> --- a/mm/mm_init.c
>> +++ b/mm/mm_init.c
>> @@ -1015,12 +1015,26 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
>>  	}
>>  
>>  	/*
>> -	 * ZONE_DEVICE pages are released directly to the driver page allocator
>> -	 * which will set the page count to 1 when allocating the page.
>> +	 * ZONE_DEVICE pages other than MEMORY_TYPE_GENERIC and
>> +	 * MEMORY_TYPE_FS_DAX pages are released directly to the driver page
>> +	 * allocator which will set the page count to 1 when allocating the
>> +	 * page.
>> +	 *
>> +	 * MEMORY_TYPE_GENERIC and MEMORY_TYPE_FS_DAX pages automatically have
>> +	 * their refcount reset to one whenever they are freed (ie. after
>> +	 * their refcount drops to 0).
>>  	 */
>> -	if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
>> -	    pgmap->type == MEMORY_DEVICE_COHERENT)
>> +	switch (pgmap->type) {
>> +	case MEMORY_DEVICE_PRIVATE:
>> +	case MEMORY_DEVICE_COHERENT:
>> +	case MEMORY_DEVICE_PCI_P2PDMA:
>>  		set_page_count(page, 0);
>> +		break;
>> +
>> +	case MEMORY_DEVICE_FS_DAX:
>> +	case MEMORY_DEVICE_GENERIC:
>> +		break;
>> +	}
>>  }
>>  
>>  /*
>> -- 
>> git-series 0.9.1
diff mbox series

Patch

diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index 4f47a13..2c5ac4a 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -140,13 +140,22 @@  static int p2pmem_alloc_mmap(struct file *filp, struct kobject *kobj,
 	rcu_read_unlock();
 
 	for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) {
-		ret = vm_insert_page(vma, vaddr, virt_to_page(kaddr));
+		struct page *page = virt_to_page(kaddr);
+
+		/*
+		 * Initialise the refcount for the freshly allocated page. As
+		 * we have just allocated the page no one else should be
+		 * using it.
+		 */
+		VM_WARN_ON_ONCE_PAGE(!page_ref_count(page), page);
+		set_page_count(page, 1);
+		ret = vm_insert_page(vma, vaddr, page);
 		if (ret) {
 			gen_pool_free(p2pdma->pool, (uintptr_t)kaddr, len);
 			return ret;
 		}
 		percpu_ref_get(ref);
-		put_page(virt_to_page(kaddr));
+		put_page(page);
 		kaddr += PAGE_SIZE;
 		len -= PAGE_SIZE;
 	}
diff --git a/mm/memremap.c b/mm/memremap.c
index 40d4547..07bbe0e 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -488,15 +488,24 @@  void free_zone_device_folio(struct folio *folio)
 	folio->mapping = NULL;
 	folio->page.pgmap->ops->page_free(folio_page(folio, 0));
 
-	if (folio->page.pgmap->type != MEMORY_DEVICE_PRIVATE &&
-	    folio->page.pgmap->type != MEMORY_DEVICE_COHERENT)
+	switch (folio->page.pgmap->type) {
+	case MEMORY_DEVICE_PRIVATE:
+	case MEMORY_DEVICE_COHERENT:
+		put_dev_pagemap(folio->page.pgmap);
+		break;
+
+	case MEMORY_DEVICE_FS_DAX:
+	case MEMORY_DEVICE_GENERIC:
 		/*
 		 * Reset the refcount to 1 to prepare for handing out the page
 		 * again.
 		 */
 		folio_set_count(folio, 1);
-	else
-		put_dev_pagemap(folio->page.pgmap);
+		break;
+
+	case MEMORY_DEVICE_PCI_P2PDMA:
+		break;
+	}
 }
 
 void zone_device_page_init(struct page *page)
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 4ba5607..0489820 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1015,12 +1015,26 @@  static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
 	}
 
 	/*
-	 * ZONE_DEVICE pages are released directly to the driver page allocator
-	 * which will set the page count to 1 when allocating the page.
+	 * ZONE_DEVICE pages other than MEMORY_TYPE_GENERIC and
+	 * MEMORY_TYPE_FS_DAX pages are released directly to the driver page
+	 * allocator which will set the page count to 1 when allocating the
+	 * page.
+	 *
+	 * MEMORY_TYPE_GENERIC and MEMORY_TYPE_FS_DAX pages automatically have
+	 * their refcount reset to one whenever they are freed (ie. after
+	 * their refcount drops to 0).
 	 */
-	if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
-	    pgmap->type == MEMORY_DEVICE_COHERENT)
+	switch (pgmap->type) {
+	case MEMORY_DEVICE_PRIVATE:
+	case MEMORY_DEVICE_COHERENT:
+	case MEMORY_DEVICE_PCI_P2PDMA:
 		set_page_count(page, 0);
+		break;
+
+	case MEMORY_DEVICE_FS_DAX:
+	case MEMORY_DEVICE_GENERIC:
+		break;
+	}
 }
 
 /*