[v1,08/12] mm/rmap: handle device-exclusive entries correctly in try_to_unmap_one()

Message ID 20250129115411.2077152-9-david@redhat.com
State New
Series mm: fixes for device-exclusive entries (hmm)

Commit Message

David Hildenbrand Jan. 29, 2025, 11:54 a.m. UTC
Ever since commit b756a3b5e7ea ("mm: device exclusive memory access")
we can return with a device-exclusive entry from page_vma_mapped_walk().

try_to_unmap_one() is not prepared for that, so teach it about these
non-present nonswap PTEs.

Before that, could we also have triggered this case with device-private
entries? Unlikely.

Note that we could currently only run into this case with
device-exclusive entries on THPs. For order-0 folios, we still adjust
the mapcount on conversion to device-exclusive, making the rmap walk
abort early (folio_mapcount() == 0 and breaking swapout). We'll fix
that next, now that try_to_unmap_one() can handle it.

Further note that try_to_unmap() calls MMU notifiers and holds the
folio lock, so any device-exclusive users should be properly prepared
for this device-exclusive PTE to "vanish".

Fixes: b756a3b5e7ea ("mm: device exclusive memory access")
Signed-off-by: David Hildenbrand <david@redhat.com>
---
 mm/rmap.c | 53 ++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 40 insertions(+), 13 deletions(-)
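
As a note on the commit message's last paragraph (MMU notifiers and the folio
lock): a device-exclusive user is expected to notice the PTE "vanishing"
through its MMU interval notifier and to revoke the device's atomic access
before the unmap proceeds. Below is a minimal sketch of such a callback,
loosely modeled on lib/test_hmm.c and nouveau; the my_* names are hypothetical,
only the mmu_interval_notifier interface itself is the in-tree API.

static bool my_exclusive_invalidate(struct mmu_interval_notifier *mni,
				    const struct mmu_notifier_range *range,
				    unsigned long cur_seq)
{
	struct my_exclusive_ctx *ctx =
		container_of(mni, struct my_exclusive_ctx, notifier);

	/* Skip the invalidation raised by our own make_device_exclusive_range(). */
	if (range->event == MMU_NOTIFY_EXCLUSIVE && range->owner == ctx->owner)
		return true;

	if (!mmu_notifier_range_blockable(range))
		return false;

	mutex_lock(&ctx->lock);
	/* Bump the sequence so in-flight users of the exclusive mapping retry. */
	mmu_interval_set_seq(mni, cur_seq);
	/* Hypothetical helper: make the device drop its atomic mapping. */
	my_device_revoke_atomic_access(ctx, range->start, range->end);
	mutex_unlock(&ctx->lock);
	return true;
}

static const struct mmu_interval_notifier_ops my_exclusive_notifier_ops = {
	.invalidate = my_exclusive_invalidate,
};

Because try_to_unmap() goes through exactly this notifier while holding the
folio lock, a driver structured along these lines sees the device-exclusive
entry disappear in a controlled way, which is what the note above relies on.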

Comments

Simona Vetter Jan. 30, 2025, 10:10 a.m. UTC | #1
On Wed, Jan 29, 2025 at 12:54:06PM +0100, David Hildenbrand wrote:
> Ever since commit b756a3b5e7ea ("mm: device exclusive memory access")
> we can return with a device-exclusive entry from page_vma_mapped_walk().
> 
> try_to_unmap_one() is not prepared for that, so teach it about these
> non-present nonswap PTEs.
> 
> Before that, could we also have triggered this case with device-private
> entries? Unlikely.

Just quick comment on this, I'm still pondering all the other aspects.

device-private memory is entirely owned by the driver, the core mm isn't
supposed to touch these beyond migrating it back to system memory in
do_swap_page. Plus using rmap when the driver asks for invalidating
mappings as needed.

So no lru, thp, migration or anything initiated by core mm should ever
happen on these device private pages. If it does, it'd be a bug.

device-exclusive is entirely different ofc since that's just normal system
memory managed by core mm/.
-Sima
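
To make the contrast concrete: the migrate-back path mentioned above is
essentially the only point where the core mm hands a device-private entry back
to the driver, while a device-exclusive entry is converted back to a present
PTE by the core mm itself. A heavily trimmed paraphrase of the corresponding
branches in do_swap_page() (mm/memory.c), with locking, refcounting and error
handling omitted:

	entry = pte_to_swp_entry(vmf->orig_pte);
	if (is_device_private_entry(entry)) {
		/* Device-private: ask the owning driver to migrate the data back to RAM. */
		vmf->page = pfn_swap_entry_to_page(entry);
		ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
	} else if (is_device_exclusive_entry(entry)) {
		/* Device-exclusive: ordinary system RAM, restore a present PTE. */
		vmf->page = pfn_swap_entry_to_page(entry);
		ret = remove_device_exclusive_entry(vmf);
	}
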
David Hildenbrand Jan. 30, 2025, 11:08 a.m. UTC | #2
On 30.01.25 11:10, Simona Vetter wrote:
> On Wed, Jan 29, 2025 at 12:54:06PM +0100, David Hildenbrand wrote:
>> Ever since commit b756a3b5e7ea ("mm: device exclusive memory access")
>> we can return with a device-exclusive entry from page_vma_mapped_walk().
>>
>> try_to_unmap_one() is not prepared for that, so teach it about these
>> non-present nonswap PTEs.
>>
>> Before that, could we also have triggered this case with device-private
>> entries? Unlikely.
> 
> Just quick comment on this, I'm still pondering all the other aspects.
> 
> device-private memory is entirely owned by the driver, the core mm isn't
> supposed to touch these beyond migrating it back to system memory in
> do_swap_page. Plus using rmap when the driver asks for invalidating
> mappings as needed.
> 
> So no lru, thp, migration or anything initiated by core mm should ever
> happen on these device private pages. If it does, it'd be a bug.

I was not 100% sure about HWPoison handling, that's why I added that 
comment. In other regards I agree: reclaim etc. does not apply.
Simona Vetter Jan. 30, 2025, 1:06 p.m. UTC | #3
On Thu, Jan 30, 2025 at 12:08:42PM +0100, David Hildenbrand wrote:
> On 30.01.25 11:10, Simona Vetter wrote:
> > On Wed, Jan 29, 2025 at 12:54:06PM +0100, David Hildenbrand wrote:
> > > Ever since commit b756a3b5e7ea ("mm: device exclusive memory access")
> > > we can return with a device-exclusive entry from page_vma_mapped_walk().
> > > 
> > > try_to_unmap_one() is not prepared for that, so teach it about these
> > > non-present nonswap PTEs.
> > > 
> > > Before that, could we also have triggered this case with device-private
> > > entries? Unlikely.
> > 
> > Just quick comment on this, I'm still pondering all the other aspects.
> > 
> > device-private memory is entirely owned by the driver, the core mm isn't
> > supposed to touch these beyond migrating it back to system memory in
> > do_swap_page. Plus using rmap when the driver asks for invalidating
> > mappings as needed.
> > 
> > So no lru, thp, migration or anything initiated by core mm should ever
> > happen on these device private pages. If it does, it'd be a bug.
> 
> I was not 100% sure about HWPoison handling, that's why I added that
> comment. In other regards I agree: reclaim etc. does not apply.

So maybe I'm just entirely lost, but unless you have a coherent
interconnect I don't think hwpoison should get involved with device
private memory? And for a coherent interconnect it's just device memory,
which isn't treated very special.

Also to clarify, I meant this as a general comment for all subsequent
patches that have the same paragraph.
-Sima
Jason Gunthorpe Jan. 30, 2025, 2:08 p.m. UTC | #4
On Thu, Jan 30, 2025 at 02:06:12PM +0100, Simona Vetter wrote:
> On Thu, Jan 30, 2025 at 12:08:42PM +0100, David Hildenbrand wrote:
> > On 30.01.25 11:10, Simona Vetter wrote:
> > > On Wed, Jan 29, 2025 at 12:54:06PM +0100, David Hildenbrand wrote:
> > > > Ever since commit b756a3b5e7ea ("mm: device exclusive memory access")
> > > > we can return with a device-exclusive entry from page_vma_mapped_walk().
> > > > 
> > > > try_to_unmap_one() is not prepared for that, so teach it about these
> > > > non-present nonswap PTEs.
> > > > 
> > > > Before that, could we also have triggered this case with device-private
> > > > entries? Unlikely.
> > > 
> > > Just quick comment on this, I'm still pondering all the other aspects.
> > > 
> > > device-private memory is entirely owned by the driver, the core mm isn't
> > > supposed to touch these beyond migrating it back to system memory in
> > > do_swap_page. Plus using rmap when the driver asks for invalidating
> > > mappings as needed.
> > > 
> > > So no lru, thp, migration or anything initiated by core mm should ever
> > > happen on these device private pages. If it does, it'd be a bug.
> > 
> > I was not 100% sure about HWPoison handling, that's why I added that
> > comment. In other regards I agree: reclaim etc. does not apply.
> 
> So maybe I'm just entirely lost, but unless you have a coherent
> interconnect I don't think hwpoison should get involved with device
> private memory? And for a coherent interconnect it's just device memory,
> which isn't treated very special.

I'm not sure it is meaningful, but in principle a driver could keep
track of the poisoned private memory using that struct page
bit. Perhaps in that sense it is more of a driver private flag than
something the core MM would touch.

If you have a coherent interconnect then you should not be using
device private.

Jason
David Hildenbrand Jan. 30, 2025, 3:52 p.m. UTC | #5
On 30.01.25 14:06, Simona Vetter wrote:
> On Thu, Jan 30, 2025 at 12:08:42PM +0100, David Hildenbrand wrote:
>> On 30.01.25 11:10, Simona Vetter wrote:
>>> On Wed, Jan 29, 2025 at 12:54:06PM +0100, David Hildenbrand wrote:
>>>> Ever since commit b756a3b5e7ea ("mm: device exclusive memory access")
>>>> we can return with a device-exclusive entry from page_vma_mapped_walk().
>>>>
>>>> try_to_unmap_one() is not prepared for that, so teach it about these
>>>> non-present nonswap PTEs.
>>>>
>>>> Before that, could we also have triggered this case with device-private
>>>> entries? Unlikely.
>>>
>>> Just quick comment on this, I'm still pondering all the other aspects.
>>>
>>> device-private memory is entirely owned by the driver, the core mm isn't
>>> supposed to touch these beyond migrating it back to system memory in
>>> do_swap_page. Plus using rmap when the driver asks for invalidating
>>> mappings as needed.
>>>
>>> So no lru, thp, migration or anything initiated by core mm should ever
>>> happen on these device private pages. If it does, it'd be a bug.
>>
>> I was not 100% sure about HWPoison handling, that's why I added that
>> comment. In other regards I agree: reclaim etc. does not apply.
> 
> So maybe I'm just entirely lost, but unless you have a coherent
> interconnect I don't think hwpoison should get involved with device
> private memory? And for a coherent interconnect it's just device memory,
> which isn't treated very special.

I would have thought that in a scenario like the one Jason describes, you 
would still want to zap the page from the page table (try_to_unmap()) and 
install a hwpoison entry instead.

But yes, right now this should never ever happen: memory_failure() does 
some ZONE_DEVICE specific things, but likely doesn't call try_to_unmap() 
on these folios.
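
For reference, "zap the page table entry and install a hwpoison entry" is what 
the existing hwpoison branch in try_to_unmap_one() already does for regular 
pages; a trimmed paraphrase (hugetlb handling and TLB-flush details omitted):

	if (PageHWPoison(subpage) && (flags & TTU_HWPOISON)) {
		/* Leave a hwpoison marker so later faults report the error
		 * instead of touching the bad page again. */
		pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
		dec_mm_counter(mm, mm_counter(folio));
		set_pte_at(mm, address, pvmw.pte, pteval);
	}

Whether memory_failure() should ever route device-private folios into this 
path is exactly the open question here.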

> 
> Also to clarify, I meant this as a general comment for all subsequent
> patches that have the same paragraph.

Yeah, I'll rephrase that to "We'll never hit that case for special 
device-private pages."
Simona Vetter Jan. 30, 2025, 4:10 p.m. UTC | #6
On Thu, Jan 30, 2025 at 10:08:32AM -0400, Jason Gunthorpe wrote:
> On Thu, Jan 30, 2025 at 02:06:12PM +0100, Simona Vetter wrote:
> > On Thu, Jan 30, 2025 at 12:08:42PM +0100, David Hildenbrand wrote:
> > > On 30.01.25 11:10, Simona Vetter wrote:
> > > > On Wed, Jan 29, 2025 at 12:54:06PM +0100, David Hildenbrand wrote:
> > > > > Ever since commit b756a3b5e7ea ("mm: device exclusive memory access")
> > > > > we can return with a device-exclusive entry from page_vma_mapped_walk().
> > > > > 
> > > > > try_to_unmap_one() is not prepared for that, so teach it about these
> > > > > non-present nonswap PTEs.
> > > > > 
> > > > > Before that, could we also have triggered this case with device-private
> > > > > entries? Unlikely.
> > > > 
> > > > Just quick comment on this, I'm still pondering all the other aspects.
> > > > 
> > > > device-private memory is entirely owned by the driver, the core mm isn't
> > > > supposed to touch these beyond migrating it back to system memory in
> > > > do_swap_page. Plus using rmap when the driver asks for invalidating
> > > > mappings as needed.
> > > > 
> > > > So no lru, thp, migration or anything initiated by core mm should ever
> > > > happen on these device private pages. If it does, it'd be a bug.
> > > 
> > > I was not 100% sure about HWPoison handling, that's why I added that
> > > comment. In other regards I agree: reclaim etc. does not apply.
> > 
> > So maybe I'm just entirely lost, but unless you have a coherent
> > interconnect I don't think hwpoison should get involved with device
> > private memory? And for a coherent interconnect it's just device memory,
> > which isn't treated very special.
> 
> I'm not sure it is meaningful, but in principle a driver could keep
> track of the poisoned private memory using that struct page
> bit. Perhaps in that sense it is more of a driver private flag than
> something the core MM would touch.
> 
> If you have a coherent interconnect then you should not be using
> device private.

Yes on both, that's what I meant.
-Sima

Patch

diff --git a/mm/rmap.c b/mm/rmap.c
index 65d9bbea16d0..12900f367a2a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1648,9 +1648,9 @@  static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 {
 	struct mm_struct *mm = vma->vm_mm;
 	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
+	bool anon_exclusive, ret = true;
 	pte_t pteval;
 	struct page *subpage;
-	bool anon_exclusive, ret = true;
 	struct mmu_notifier_range range;
 	enum ttu_flags flags = (enum ttu_flags)(long)arg;
 	unsigned long pfn;
@@ -1722,7 +1722,19 @@  static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 		/* Unexpected PMD-mapped THP? */
 		VM_BUG_ON_FOLIO(!pvmw.pte, folio);
 
-		pfn = pte_pfn(ptep_get(pvmw.pte));
+		/*
+		 * We can end up here with selected non-swap entries that
+		 * actually map pages similar to PROT_NONE; see
+		 * page_vma_mapped_walk()->check_pte().
+		 */
+		pteval = ptep_get(pvmw.pte);
+		if (likely(pte_present(pteval))) {
+			pfn = pte_pfn(pteval);
+		} else {
+			pfn = swp_offset_pfn(pte_to_swp_entry(pteval));
+			VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
+		}
+
 		subpage = folio_page(folio, pfn - folio_pfn(folio));
 		address = pvmw.address;
 		anon_exclusive = folio_test_anon(folio) &&
@@ -1778,7 +1790,9 @@  static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 				hugetlb_vma_unlock_write(vma);
 			}
 			pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
-		} else {
+			if (pte_dirty(pteval))
+				folio_mark_dirty(folio);
+		} else if (likely(pte_present(pteval))) {
 			flush_cache_page(vma, address, pfn);
 			/* Nuke the page table entry. */
 			if (should_defer_flush(mm, flags)) {
@@ -1796,6 +1810,10 @@  static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 			} else {
 				pteval = ptep_clear_flush(vma, address, pvmw.pte);
 			}
+			if (pte_dirty(pteval))
+				folio_mark_dirty(folio);
+		} else {
+			pte_clear(mm, address, pvmw.pte);
 		}
 
 		/*
@@ -1805,10 +1823,6 @@  static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 		 */
 		pte_install_uffd_wp_if_needed(vma, address, pvmw.pte, pteval);
 
-		/* Set the dirty flag on the folio now the pte is gone. */
-		if (pte_dirty(pteval))
-			folio_mark_dirty(folio);
-
 		/* Update high watermark before we lower rss */
 		update_hiwater_rss(mm);
 
@@ -1822,8 +1836,8 @@  static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 				dec_mm_counter(mm, mm_counter(folio));
 				set_pte_at(mm, address, pvmw.pte, pteval);
 			}
-
-		} else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
+		} else if (likely(pte_present(pteval)) && pte_unused(pteval) &&
+			   !userfaultfd_armed(vma)) {
 			/*
 			 * The guest indicated that the page content is of no
 			 * interest anymore. Simply discard the pte, vmscan
@@ -1902,6 +1916,12 @@  static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 				set_pte_at(mm, address, pvmw.pte, pteval);
 				goto walk_abort;
 			}
+
+			/*
+			 * arch_unmap_one() is expected to be a NOP on
+			 * architectures where we could have non-swp entries
+			 * here, so we'll not check/care.
+			 */
 			if (arch_unmap_one(mm, vma, address, pteval) < 0) {
 				swap_free(entry);
 				set_pte_at(mm, address, pvmw.pte, pteval);
@@ -1926,10 +1946,17 @@  static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 			swp_pte = swp_entry_to_pte(entry);
 			if (anon_exclusive)
 				swp_pte = pte_swp_mkexclusive(swp_pte);
-			if (pte_soft_dirty(pteval))
-				swp_pte = pte_swp_mksoft_dirty(swp_pte);
-			if (pte_uffd_wp(pteval))
-				swp_pte = pte_swp_mkuffd_wp(swp_pte);
+			if (likely(pte_present(pteval))) {
+				if (pte_soft_dirty(pteval))
+					swp_pte = pte_swp_mksoft_dirty(swp_pte);
+				if (pte_uffd_wp(pteval))
+					swp_pte = pte_swp_mkuffd_wp(swp_pte);
+			} else {
+				if (pte_swp_soft_dirty(pteval))
+					swp_pte = pte_swp_mksoft_dirty(swp_pte);
+				if (pte_swp_uffd_wp(pteval))
+					swp_pte = pte_swp_mkuffd_wp(swp_pte);
+			}
 			set_pte_at(mm, address, pvmw.pte, swp_pte);
 		} else {
 			/*