diff mbox series

[v5,03/32] mm/migrate: Trylock device page in do_swap_page

Message ID 20250213021112.1228481-4-matthew.brost@intel.com (mailing list archive)
State New
Headers show
Series Introduce GPU SVM and Xe SVM implementation | expand

Commit Message

Matthew Brost Feb. 13, 2025, 2:10 a.m. UTC
Avoid multiple CPU page faults to the same device page racing by trying
to lock the page in do_swap_page before taking an extra reference to the
page. This prevents scenarios where multiple CPU page faults each take
an extra reference to a device page, which could abort migration in
folio_migrate_mapping. With the device page being locked in
do_swap_page, the migrate_vma_* functions need to be updated to avoid
locking the fault_page argument.

Prior to this change, a livelock scenario could occur in Xe's (Intel GPU
DRM driver) SVM implementation if enough threads faulted the same device
page.

v3:
 - Put page after unlocking page (Alistair)
 - Warn on splitting a THP which is the fault page (Alistair)
 - Warn on dst page == fault page (Alistair)

Cc: Alistair Popple <apopple@nvidia.com>
Cc: Philip Yang <Philip.Yang@amd.com>
Cc: Felix Kuehling <felix.kuehling@amd.com>
Cc: Christian König <christian.koenig@amd.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Suggested-by: Simona Vetter <simona.vetter@ffwll.ch>
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
 mm/memory.c         | 13 ++++++---
 mm/migrate_device.c | 64 ++++++++++++++++++++++++++++++++-------------
 2 files changed, 55 insertions(+), 22 deletions(-)

Comments

Alistair Popple Feb. 19, 2025, 5:36 a.m. UTC | #1
On Wed, Feb 12, 2025 at 06:10:43PM -0800, Matthew Brost wrote:
> Avoid multiple CPU page faults to the same device page racing by trying
> to lock the page in do_swap_page before taking an extra reference to the
> page. This prevents scenarios where multiple CPU page faults each take
> an extra reference to a device page, which could abort migration in
> folio_migrate_mapping. With the device page being locked in
> do_swap_page, the migrate_vma_* functions need to be updated to avoid
> locking the fault_page argument.
> 
> Prior to this change, a livelock scenario could occur in Xe's (Intel GPU
> DRM driver) SVM implementation if enough threads faulted the same device
> page.
> 
> v3:
>  - Put page after unlocking page (Alistair)
>  - Warn on splitting a THP which is the fault page (Alistair)
>  - Warn on dst page == fault page (Alistair)
> 
> Cc: Alistair Popple <apopple@nvidia.com>
> Cc: Philip Yang <Philip.Yang@amd.com>
> Cc: Felix Kuehling <felix.kuehling@amd.com>
> Cc: Christian König <christian.koenig@amd.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Suggested-by: Simona Vetter <simona.vetter@ffwll.ch>
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> ---
>  mm/memory.c         | 13 ++++++---
>  mm/migrate_device.c | 64 ++++++++++++++++++++++++++++++++-------------
>  2 files changed, 55 insertions(+), 22 deletions(-)
> 
> diff --git a/mm/memory.c b/mm/memory.c
> index 539c0f7c6d54..1e010c5d67bc 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4337,10 +4337,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  			 * Get a page reference while we know the page can't be
>  			 * freed.
>  			 */
> -			get_page(vmf->page);
> -			pte_unmap_unlock(vmf->pte, vmf->ptl);
> -			ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
> -			put_page(vmf->page);
> +			if (trylock_page(vmf->page)) {
> +				get_page(vmf->page);
> +				pte_unmap_unlock(vmf->pte, vmf->ptl);
> +				ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
> +				unlock_page(vmf->page);
> +				put_page(vmf->page);
> +			} else {
> +				pte_unmap_unlock(vmf->pte, vmf->ptl);
> +			}
>  		} else if (is_hwpoison_entry(entry)) {
>  			ret = VM_FAULT_HWPOISON;
>  		} else if (is_pte_marker_entry(entry)) {
> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
> index 19960743f927..3470357d9bae 100644
> --- a/mm/migrate_device.c
> +++ b/mm/migrate_device.c
> @@ -60,6 +60,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>  				   struct mm_walk *walk)
>  {
>  	struct migrate_vma *migrate = walk->private;
> +	struct folio *fault_folio = migrate->fault_page ?
> +		page_folio(migrate->fault_page) : NULL;
>  	struct vm_area_struct *vma = walk->vma;
>  	struct mm_struct *mm = vma->vm_mm;
>  	unsigned long addr = start, unmapped = 0;
> @@ -88,11 +90,16 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>  
>  			folio_get(folio);
>  			spin_unlock(ptl);
> +			/* FIXME support THP */
> +			if (WARN_ON_ONCE(fault_folio == folio))

This threw me until I realised this was the THP path because we'd expect to find
the fault_folio during migrate_vma_collect_pmd(). Of course we don't currently
have DEVICE_PRIVATE THP so faulting won't ever get here which makes sense, but a
slightly more verbose comment along those lines would be nice.

Otherwise it looks good and passed my tests so please add:

Reviewed-by: Alistair Popple <apopple@nvidia.com>
Tested-by: Alistair Popple <apopple@nvidia.com>

> +				return migrate_vma_collect_skip(start, end,
> +								walk);
>  			if (unlikely(!folio_trylock(folio)))
>  				return migrate_vma_collect_skip(start, end,
>  								walk);
>  			ret = split_folio(folio);
> -			folio_unlock(folio);
> +			if (fault_folio != folio)
> +				folio_unlock(folio);
>  			folio_put(folio);
>  			if (ret)
>  				return migrate_vma_collect_skip(start, end,
> @@ -192,7 +199,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>  		 * optimisation to avoid walking the rmap later with
>  		 * try_to_migrate().
>  		 */
> -		if (folio_trylock(folio)) {
> +		if (fault_folio == folio || folio_trylock(folio)) {
>  			bool anon_exclusive;
>  			pte_t swp_pte;
>  
> @@ -204,7 +211,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>  
>  				if (folio_try_share_anon_rmap_pte(folio, page)) {
>  					set_pte_at(mm, addr, ptep, pte);
> -					folio_unlock(folio);
> +					if (fault_folio != folio)
> +						folio_unlock(folio);
>  					folio_put(folio);
>  					mpfn = 0;
>  					goto next;
> @@ -363,6 +371,8 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
>  					  unsigned long npages,
>  					  struct page *fault_page)
>  {
> +	struct folio *fault_folio = fault_page ?
> +		page_folio(fault_page) : NULL;
>  	unsigned long i, restore = 0;
>  	bool allow_drain = true;
>  	unsigned long unmapped = 0;
> @@ -427,7 +437,8 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
>  		remove_migration_ptes(folio, folio, 0);
>  
>  		src_pfns[i] = 0;
> -		folio_unlock(folio);
> +		if (fault_folio != folio)
> +			folio_unlock(folio);
>  		folio_put(folio);
>  		restore--;
>  	}
> @@ -536,6 +547,8 @@ int migrate_vma_setup(struct migrate_vma *args)
>  		return -EINVAL;
>  	if (args->fault_page && !is_device_private_page(args->fault_page))
>  		return -EINVAL;
> +	if (args->fault_page && !PageLocked(args->fault_page))
> +		return -EINVAL;
>  
>  	memset(args->src, 0, sizeof(*args->src) * nr_pages);
>  	args->cpages = 0;
> @@ -799,19 +812,13 @@ void migrate_vma_pages(struct migrate_vma *migrate)
>  }
>  EXPORT_SYMBOL(migrate_vma_pages);
>  
> -/*
> - * migrate_device_finalize() - complete page migration
> - * @src_pfns: src_pfns returned from migrate_device_range()
> - * @dst_pfns: array of pfns allocated by the driver to migrate memory to
> - * @npages: number of pages in the range
> - *
> - * Completes migration of the page by removing special migration entries.
> - * Drivers must ensure copying of page data is complete and visible to the CPU
> - * before calling this.
> - */
> -void migrate_device_finalize(unsigned long *src_pfns,
> -			unsigned long *dst_pfns, unsigned long npages)
> +static void __migrate_device_finalize(unsigned long *src_pfns,
> +				      unsigned long *dst_pfns,
> +				      unsigned long npages,
> +				      struct page *fault_page)
>  {
> +	struct folio *fault_folio = fault_page ?
> +		page_folio(fault_page) : NULL;
>  	unsigned long i;
>  
>  	for (i = 0; i < npages; i++) {
> @@ -824,6 +831,7 @@ void migrate_device_finalize(unsigned long *src_pfns,
>  
>  		if (!page) {
>  			if (dst) {
> +				WARN_ON_ONCE(fault_folio == dst);
>  				folio_unlock(dst);
>  				folio_put(dst);
>  			}
> @@ -834,6 +842,7 @@ void migrate_device_finalize(unsigned long *src_pfns,
>  
>  		if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE) || !dst) {
>  			if (dst) {
> +				WARN_ON_ONCE(fault_folio == dst);
>  				folio_unlock(dst);
>  				folio_put(dst);
>  			}
> @@ -841,7 +850,8 @@ void migrate_device_finalize(unsigned long *src_pfns,
>  		}
>  
>  		remove_migration_ptes(src, dst, 0);
> -		folio_unlock(src);
> +		if (fault_folio != src)
> +			folio_unlock(src);
>  
>  		if (folio_is_zone_device(src))
>  			folio_put(src);
> @@ -849,6 +859,7 @@ void migrate_device_finalize(unsigned long *src_pfns,
>  			folio_putback_lru(src);
>  
>  		if (dst != src) {
> +			WARN_ON_ONCE(fault_folio == dst);
>  			folio_unlock(dst);
>  			if (folio_is_zone_device(dst))
>  				folio_put(dst);
> @@ -857,6 +868,22 @@ void migrate_device_finalize(unsigned long *src_pfns,
>  		}
>  	}
>  }
> +
> +/*
> + * migrate_device_finalize() - complete page migration
> + * @src_pfns: src_pfns returned from migrate_device_range()
> + * @dst_pfns: array of pfns allocated by the driver to migrate memory to
> + * @npages: number of pages in the range
> + *
> + * Completes migration of the page by removing special migration entries.
> + * Drivers must ensure copying of page data is complete and visible to the CPU
> + * before calling this.
> + */
> +void migrate_device_finalize(unsigned long *src_pfns,
> +			unsigned long *dst_pfns, unsigned long npages)
> +{
> +	return __migrate_device_finalize(src_pfns, dst_pfns, npages, NULL);
> +}
>  EXPORT_SYMBOL(migrate_device_finalize);
>  
>  /**
> @@ -872,7 +899,8 @@ EXPORT_SYMBOL(migrate_device_finalize);
>   */
>  void migrate_vma_finalize(struct migrate_vma *migrate)
>  {
> -	migrate_device_finalize(migrate->src, migrate->dst, migrate->npages);
> +	__migrate_device_finalize(migrate->src, migrate->dst, migrate->npages,
> +				  migrate->fault_page);
>  }
>  EXPORT_SYMBOL(migrate_vma_finalize);
>  
> -- 
> 2.34.1
>
Matthew Brost Feb. 19, 2025, 6:08 a.m. UTC | #2
On Wed, Feb 19, 2025 at 04:36:54PM +1100, Alistair Popple wrote:
> On Wed, Feb 12, 2025 at 06:10:43PM -0800, Matthew Brost wrote:
> > Avoid multiple CPU page faults to the same device page racing by trying
> > to lock the page in do_swap_page before taking an extra reference to the
> > page. This prevents scenarios where multiple CPU page faults each take
> > an extra reference to a device page, which could abort migration in
> > folio_migrate_mapping. With the device page being locked in
> > do_swap_page, the migrate_vma_* functions need to be updated to avoid
> > locking the fault_page argument.
> > 
> > Prior to this change, a livelock scenario could occur in Xe's (Intel GPU
> > DRM driver) SVM implementation if enough threads faulted the same device
> > page.
> > 
> > v3:
> >  - Put page after unlocking page (Alistair)
> >  - Warn on splitting a THP which is the fault page (Alistair)
> >  - Warn on dst page == fault page (Alistair)
> > 
> > Cc: Alistair Popple <apopple@nvidia.com>
> > Cc: Philip Yang <Philip.Yang@amd.com>
> > Cc: Felix Kuehling <felix.kuehling@amd.com>
> > Cc: Christian König <christian.koenig@amd.com>
> > Cc: Andrew Morton <akpm@linux-foundation.org>
> > Suggested-by: Simona Vetter <simona.vetter@ffwll.ch>
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > ---
> >  mm/memory.c         | 13 ++++++---
> >  mm/migrate_device.c | 64 ++++++++++++++++++++++++++++++++-------------
> >  2 files changed, 55 insertions(+), 22 deletions(-)
> > 
> > diff --git a/mm/memory.c b/mm/memory.c
> > index 539c0f7c6d54..1e010c5d67bc 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -4337,10 +4337,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> >  			 * Get a page reference while we know the page can't be
> >  			 * freed.
> >  			 */
> > -			get_page(vmf->page);
> > -			pte_unmap_unlock(vmf->pte, vmf->ptl);
> > -			ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
> > -			put_page(vmf->page);
> > +			if (trylock_page(vmf->page)) {
> > +				get_page(vmf->page);
> > +				pte_unmap_unlock(vmf->pte, vmf->ptl);
> > +				ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
> > +				unlock_page(vmf->page);
> > +				put_page(vmf->page);
> > +			} else {
> > +				pte_unmap_unlock(vmf->pte, vmf->ptl);
> > +			}
> >  		} else if (is_hwpoison_entry(entry)) {
> >  			ret = VM_FAULT_HWPOISON;
> >  		} else if (is_pte_marker_entry(entry)) {
> > diff --git a/mm/migrate_device.c b/mm/migrate_device.c
> > index 19960743f927..3470357d9bae 100644
> > --- a/mm/migrate_device.c
> > +++ b/mm/migrate_device.c
> > @@ -60,6 +60,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> >  				   struct mm_walk *walk)
> >  {
> >  	struct migrate_vma *migrate = walk->private;
> > +	struct folio *fault_folio = migrate->fault_page ?
> > +		page_folio(migrate->fault_page) : NULL;
> >  	struct vm_area_struct *vma = walk->vma;
> >  	struct mm_struct *mm = vma->vm_mm;
> >  	unsigned long addr = start, unmapped = 0;
> > @@ -88,11 +90,16 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> >  
> >  			folio_get(folio);
> >  			spin_unlock(ptl);
> > +			/* FIXME support THP */
> > +			if (WARN_ON_ONCE(fault_folio == folio))
> 
> This threw me until I realised this was the THP path because we'd expect to find
> the fault_folio during migrate_vma_collect_pmd(). Of course we don't currently
> have DEVICE_PRIVATE THP so faulting won't ever get here which makes sense, but a
> slightly more verbose comment along those lines would be nice.
> 

Can make the comment a bit more verbose in the next rev.

> Otherwise it looks good and passed my tests so please add:
> 
> Reviewed-by: Alistair Popple <apopple@nvidia.com>
> Tested-by: Alistair Popple <apopple@nvidia.com>
> 

Thanks!

If Nvidia / Nova might make use of GPU SVM, Sima requested an external
ack outside of Intel, as prerequisite to merging this series [1], on the
documentation patch [2] detailing the design principles, current status,
and future plans. If you think reviewing it is appropriate, any input
would be appreciated.

Matt

[1] https://patchwork.freedesktop.org/series/137870/
[2] https://patchwork.freedesktop.org/patch/636838/?series=137870&rev=5

> > +				return migrate_vma_collect_skip(start, end,
> > +								walk);
> >  			if (unlikely(!folio_trylock(folio)))
> >  				return migrate_vma_collect_skip(start, end,
> >  								walk);
> >  			ret = split_folio(folio);
> > -			folio_unlock(folio);
> > +			if (fault_folio != folio)
> > +				folio_unlock(folio);
> >  			folio_put(folio);
> >  			if (ret)
> >  				return migrate_vma_collect_skip(start, end,
> > @@ -192,7 +199,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> >  		 * optimisation to avoid walking the rmap later with
> >  		 * try_to_migrate().
> >  		 */
> > -		if (folio_trylock(folio)) {
> > +		if (fault_folio == folio || folio_trylock(folio)) {
> >  			bool anon_exclusive;
> >  			pte_t swp_pte;
> >  
> > @@ -204,7 +211,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> >  
> >  				if (folio_try_share_anon_rmap_pte(folio, page)) {
> >  					set_pte_at(mm, addr, ptep, pte);
> > -					folio_unlock(folio);
> > +					if (fault_folio != folio)
> > +						folio_unlock(folio);
> >  					folio_put(folio);
> >  					mpfn = 0;
> >  					goto next;
> > @@ -363,6 +371,8 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
> >  					  unsigned long npages,
> >  					  struct page *fault_page)
> >  {
> > +	struct folio *fault_folio = fault_page ?
> > +		page_folio(fault_page) : NULL;
> >  	unsigned long i, restore = 0;
> >  	bool allow_drain = true;
> >  	unsigned long unmapped = 0;
> > @@ -427,7 +437,8 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
> >  		remove_migration_ptes(folio, folio, 0);
> >  
> >  		src_pfns[i] = 0;
> > -		folio_unlock(folio);
> > +		if (fault_folio != folio)
> > +			folio_unlock(folio);
> >  		folio_put(folio);
> >  		restore--;
> >  	}
> > @@ -536,6 +547,8 @@ int migrate_vma_setup(struct migrate_vma *args)
> >  		return -EINVAL;
> >  	if (args->fault_page && !is_device_private_page(args->fault_page))
> >  		return -EINVAL;
> > +	if (args->fault_page && !PageLocked(args->fault_page))
> > +		return -EINVAL;
> >  
> >  	memset(args->src, 0, sizeof(*args->src) * nr_pages);
> >  	args->cpages = 0;
> > @@ -799,19 +812,13 @@ void migrate_vma_pages(struct migrate_vma *migrate)
> >  }
> >  EXPORT_SYMBOL(migrate_vma_pages);
> >  
> > -/*
> > - * migrate_device_finalize() - complete page migration
> > - * @src_pfns: src_pfns returned from migrate_device_range()
> > - * @dst_pfns: array of pfns allocated by the driver to migrate memory to
> > - * @npages: number of pages in the range
> > - *
> > - * Completes migration of the page by removing special migration entries.
> > - * Drivers must ensure copying of page data is complete and visible to the CPU
> > - * before calling this.
> > - */
> > -void migrate_device_finalize(unsigned long *src_pfns,
> > -			unsigned long *dst_pfns, unsigned long npages)
> > +static void __migrate_device_finalize(unsigned long *src_pfns,
> > +				      unsigned long *dst_pfns,
> > +				      unsigned long npages,
> > +				      struct page *fault_page)
> >  {
> > +	struct folio *fault_folio = fault_page ?
> > +		page_folio(fault_page) : NULL;
> >  	unsigned long i;
> >  
> >  	for (i = 0; i < npages; i++) {
> > @@ -824,6 +831,7 @@ void migrate_device_finalize(unsigned long *src_pfns,
> >  
> >  		if (!page) {
> >  			if (dst) {
> > +				WARN_ON_ONCE(fault_folio == dst);
> >  				folio_unlock(dst);
> >  				folio_put(dst);
> >  			}
> > @@ -834,6 +842,7 @@ void migrate_device_finalize(unsigned long *src_pfns,
> >  
> >  		if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE) || !dst) {
> >  			if (dst) {
> > +				WARN_ON_ONCE(fault_folio == dst);
> >  				folio_unlock(dst);
> >  				folio_put(dst);
> >  			}
> > @@ -841,7 +850,8 @@ void migrate_device_finalize(unsigned long *src_pfns,
> >  		}
> >  
> >  		remove_migration_ptes(src, dst, 0);
> > -		folio_unlock(src);
> > +		if (fault_folio != src)
> > +			folio_unlock(src);
> >  
> >  		if (folio_is_zone_device(src))
> >  			folio_put(src);
> > @@ -849,6 +859,7 @@ void migrate_device_finalize(unsigned long *src_pfns,
> >  			folio_putback_lru(src);
> >  
> >  		if (dst != src) {
> > +			WARN_ON_ONCE(fault_folio == dst);
> >  			folio_unlock(dst);
> >  			if (folio_is_zone_device(dst))
> >  				folio_put(dst);
> > @@ -857,6 +868,22 @@ void migrate_device_finalize(unsigned long *src_pfns,
> >  		}
> >  	}
> >  }
> > +
> > +/*
> > + * migrate_device_finalize() - complete page migration
> > + * @src_pfns: src_pfns returned from migrate_device_range()
> > + * @dst_pfns: array of pfns allocated by the driver to migrate memory to
> > + * @npages: number of pages in the range
> > + *
> > + * Completes migration of the page by removing special migration entries.
> > + * Drivers must ensure copying of page data is complete and visible to the CPU
> > + * before calling this.
> > + */
> > +void migrate_device_finalize(unsigned long *src_pfns,
> > +			unsigned long *dst_pfns, unsigned long npages)
> > +{
> > +	return __migrate_device_finalize(src_pfns, dst_pfns, npages, NULL);
> > +}
> >  EXPORT_SYMBOL(migrate_device_finalize);
> >  
> >  /**
> > @@ -872,7 +899,8 @@ EXPORT_SYMBOL(migrate_device_finalize);
> >   */
> >  void migrate_vma_finalize(struct migrate_vma *migrate)
> >  {
> > -	migrate_device_finalize(migrate->src, migrate->dst, migrate->npages);
> > +	__migrate_device_finalize(migrate->src, migrate->dst, migrate->npages,
> > +				  migrate->fault_page);
> >  }
> >  EXPORT_SYMBOL(migrate_vma_finalize);
> >  
> > -- 
> > 2.34.1
> >
Alistair Popple Feb. 19, 2025, 6:25 a.m. UTC | #3
On Tue, Feb 18, 2025 at 10:08:31PM -0800, Matthew Brost wrote:
> On Wed, Feb 19, 2025 at 04:36:54PM +1100, Alistair Popple wrote:
> > On Wed, Feb 12, 2025 at 06:10:43PM -0800, Matthew Brost wrote:
> > > Avoid multiple CPU page faults to the same device page racing by trying
> > > to lock the page in do_swap_page before taking an extra reference to the
> > > page. This prevents scenarios where multiple CPU page faults each take
> > > an extra reference to a device page, which could abort migration in
> > > folio_migrate_mapping. With the device page being locked in
> > > do_swap_page, the migrate_vma_* functions need to be updated to avoid
> > > locking the fault_page argument.
> > > 
> > > Prior to this change, a livelock scenario could occur in Xe's (Intel GPU
> > > DRM driver) SVM implementation if enough threads faulted the same device
> > > page.
> > > 
> > > v3:
> > >  - Put page after unlocking page (Alistair)
> > >  - Warn on splitting a THP which is the fault page (Alistair)
> > >  - Warn on dst page == fault page (Alistair)
> > > 
> > > Cc: Alistair Popple <apopple@nvidia.com>
> > > Cc: Philip Yang <Philip.Yang@amd.com>
> > > Cc: Felix Kuehling <felix.kuehling@amd.com>
> > > Cc: Christian König <christian.koenig@amd.com>
> > > Cc: Andrew Morton <akpm@linux-foundation.org>
> > > Suggested-by: Simona Vetter <simona.vetter@ffwll.ch>
> > > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > > ---
> > >  mm/memory.c         | 13 ++++++---
> > >  mm/migrate_device.c | 64 ++++++++++++++++++++++++++++++++-------------
> > >  2 files changed, 55 insertions(+), 22 deletions(-)
> > > 
> > > diff --git a/mm/memory.c b/mm/memory.c
> > > index 539c0f7c6d54..1e010c5d67bc 100644
> > > --- a/mm/memory.c
> > > +++ b/mm/memory.c
> > > @@ -4337,10 +4337,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> > >  			 * Get a page reference while we know the page can't be
> > >  			 * freed.
> > >  			 */
> > > -			get_page(vmf->page);
> > > -			pte_unmap_unlock(vmf->pte, vmf->ptl);
> > > -			ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
> > > -			put_page(vmf->page);
> > > +			if (trylock_page(vmf->page)) {
> > > +				get_page(vmf->page);
> > > +				pte_unmap_unlock(vmf->pte, vmf->ptl);
> > > +				ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
> > > +				unlock_page(vmf->page);
> > > +				put_page(vmf->page);
> > > +			} else {
> > > +				pte_unmap_unlock(vmf->pte, vmf->ptl);
> > > +			}
> > >  		} else if (is_hwpoison_entry(entry)) {
> > >  			ret = VM_FAULT_HWPOISON;
> > >  		} else if (is_pte_marker_entry(entry)) {
> > > diff --git a/mm/migrate_device.c b/mm/migrate_device.c
> > > index 19960743f927..3470357d9bae 100644
> > > --- a/mm/migrate_device.c
> > > +++ b/mm/migrate_device.c
> > > @@ -60,6 +60,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> > >  				   struct mm_walk *walk)
> > >  {
> > >  	struct migrate_vma *migrate = walk->private;
> > > +	struct folio *fault_folio = migrate->fault_page ?
> > > +		page_folio(migrate->fault_page) : NULL;
> > >  	struct vm_area_struct *vma = walk->vma;
> > >  	struct mm_struct *mm = vma->vm_mm;
> > >  	unsigned long addr = start, unmapped = 0;
> > > @@ -88,11 +90,16 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> > >  
> > >  			folio_get(folio);
> > >  			spin_unlock(ptl);
> > > +			/* FIXME support THP */
> > > +			if (WARN_ON_ONCE(fault_folio == folio))
> > 
> > This threw me until I realised this was the THP path because we'd expect to find
> > the fault_folio during migrate_vma_collect_pmd(). Of course we don't currently
> > have DEVICE_PRIVATE THP so faulting won't ever get here which makes sense, but a
> > slightly more verbose comment along those lines would be nice.
> > 
> 
> Can make the comment a bit more verbose in the next rev.
> 
> > Otherwise it looks good and passed my tests so please add:
> > 
> > Reviewed-by: Alistair Popple <apopple@nvidia.com>
> > Tested-by: Alistair Popple <apopple@nvidia.com>
> > 
> 
> Thanks!
> 
> If Nvidia / Nova might make use of GPU SVM, Sima requested an external
> ack outside of Intel, as prerequisite to merging this series [1], on the
> documentation patch [2] detailing the design principles, current status,
> and future plans. If you think reviewing it is appropriate, any input
> would be appreciated.

Oh good idea. Both Nvidia and Nouveau drivers currently make use of GPU SVM and
I assume Nova will too (I'm currently getting up to speed on that) so will take
a look.

 - Alistair

> Matt
> 
> [1] https://patchwork.freedesktop.org/series/137870/
> [2] https://patchwork.freedesktop.org/patch/636838/?series=137870&rev=5
> 
> > > +				return migrate_vma_collect_skip(start, end,
> > > +								walk);
> > >  			if (unlikely(!folio_trylock(folio)))
> > >  				return migrate_vma_collect_skip(start, end,
> > >  								walk);
> > >  			ret = split_folio(folio);
> > > -			folio_unlock(folio);
> > > +			if (fault_folio != folio)
> > > +				folio_unlock(folio);
> > >  			folio_put(folio);
> > >  			if (ret)
> > >  				return migrate_vma_collect_skip(start, end,
> > > @@ -192,7 +199,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> > >  		 * optimisation to avoid walking the rmap later with
> > >  		 * try_to_migrate().
> > >  		 */
> > > -		if (folio_trylock(folio)) {
> > > +		if (fault_folio == folio || folio_trylock(folio)) {
> > >  			bool anon_exclusive;
> > >  			pte_t swp_pte;
> > >  
> > > @@ -204,7 +211,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> > >  
> > >  				if (folio_try_share_anon_rmap_pte(folio, page)) {
> > >  					set_pte_at(mm, addr, ptep, pte);
> > > -					folio_unlock(folio);
> > > +					if (fault_folio != folio)
> > > +						folio_unlock(folio);
> > >  					folio_put(folio);
> > >  					mpfn = 0;
> > >  					goto next;
> > > @@ -363,6 +371,8 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
> > >  					  unsigned long npages,
> > >  					  struct page *fault_page)
> > >  {
> > > +	struct folio *fault_folio = fault_page ?
> > > +		page_folio(fault_page) : NULL;
> > >  	unsigned long i, restore = 0;
> > >  	bool allow_drain = true;
> > >  	unsigned long unmapped = 0;
> > > @@ -427,7 +437,8 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
> > >  		remove_migration_ptes(folio, folio, 0);
> > >  
> > >  		src_pfns[i] = 0;
> > > -		folio_unlock(folio);
> > > +		if (fault_folio != folio)
> > > +			folio_unlock(folio);
> > >  		folio_put(folio);
> > >  		restore--;
> > >  	}
> > > @@ -536,6 +547,8 @@ int migrate_vma_setup(struct migrate_vma *args)
> > >  		return -EINVAL;
> > >  	if (args->fault_page && !is_device_private_page(args->fault_page))
> > >  		return -EINVAL;
> > > +	if (args->fault_page && !PageLocked(args->fault_page))
> > > +		return -EINVAL;
> > >  
> > >  	memset(args->src, 0, sizeof(*args->src) * nr_pages);
> > >  	args->cpages = 0;
> > > @@ -799,19 +812,13 @@ void migrate_vma_pages(struct migrate_vma *migrate)
> > >  }
> > >  EXPORT_SYMBOL(migrate_vma_pages);
> > >  
> > > -/*
> > > - * migrate_device_finalize() - complete page migration
> > > - * @src_pfns: src_pfns returned from migrate_device_range()
> > > - * @dst_pfns: array of pfns allocated by the driver to migrate memory to
> > > - * @npages: number of pages in the range
> > > - *
> > > - * Completes migration of the page by removing special migration entries.
> > > - * Drivers must ensure copying of page data is complete and visible to the CPU
> > > - * before calling this.
> > > - */
> > > -void migrate_device_finalize(unsigned long *src_pfns,
> > > -			unsigned long *dst_pfns, unsigned long npages)
> > > +static void __migrate_device_finalize(unsigned long *src_pfns,
> > > +				      unsigned long *dst_pfns,
> > > +				      unsigned long npages,
> > > +				      struct page *fault_page)
> > >  {
> > > +	struct folio *fault_folio = fault_page ?
> > > +		page_folio(fault_page) : NULL;
> > >  	unsigned long i;
> > >  
> > >  	for (i = 0; i < npages; i++) {
> > > @@ -824,6 +831,7 @@ void migrate_device_finalize(unsigned long *src_pfns,
> > >  
> > >  		if (!page) {
> > >  			if (dst) {
> > > +				WARN_ON_ONCE(fault_folio == dst);
> > >  				folio_unlock(dst);
> > >  				folio_put(dst);
> > >  			}
> > > @@ -834,6 +842,7 @@ void migrate_device_finalize(unsigned long *src_pfns,
> > >  
> > >  		if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE) || !dst) {
> > >  			if (dst) {
> > > +				WARN_ON_ONCE(fault_folio == dst);
> > >  				folio_unlock(dst);
> > >  				folio_put(dst);
> > >  			}
> > > @@ -841,7 +850,8 @@ void migrate_device_finalize(unsigned long *src_pfns,
> > >  		}
> > >  
> > >  		remove_migration_ptes(src, dst, 0);
> > > -		folio_unlock(src);
> > > +		if (fault_folio != src)
> > > +			folio_unlock(src);
> > >  
> > >  		if (folio_is_zone_device(src))
> > >  			folio_put(src);
> > > @@ -849,6 +859,7 @@ void migrate_device_finalize(unsigned long *src_pfns,
> > >  			folio_putback_lru(src);
> > >  
> > >  		if (dst != src) {
> > > +			WARN_ON_ONCE(fault_folio == dst);
> > >  			folio_unlock(dst);
> > >  			if (folio_is_zone_device(dst))
> > >  				folio_put(dst);
> > > @@ -857,6 +868,22 @@ void migrate_device_finalize(unsigned long *src_pfns,
> > >  		}
> > >  	}
> > >  }
> > > +
> > > +/*
> > > + * migrate_device_finalize() - complete page migration
> > > + * @src_pfns: src_pfns returned from migrate_device_range()
> > > + * @dst_pfns: array of pfns allocated by the driver to migrate memory to
> > > + * @npages: number of pages in the range
> > > + *
> > > + * Completes migration of the page by removing special migration entries.
> > > + * Drivers must ensure copying of page data is complete and visible to the CPU
> > > + * before calling this.
> > > + */
> > > +void migrate_device_finalize(unsigned long *src_pfns,
> > > +			unsigned long *dst_pfns, unsigned long npages)
> > > +{
> > > +	return __migrate_device_finalize(src_pfns, dst_pfns, npages, NULL);
> > > +}
> > >  EXPORT_SYMBOL(migrate_device_finalize);
> > >  
> > >  /**
> > > @@ -872,7 +899,8 @@ EXPORT_SYMBOL(migrate_device_finalize);
> > >   */
> > >  void migrate_vma_finalize(struct migrate_vma *migrate)
> > >  {
> > > -	migrate_device_finalize(migrate->src, migrate->dst, migrate->npages);
> > > +	__migrate_device_finalize(migrate->src, migrate->dst, migrate->npages,
> > > +				  migrate->fault_page);
> > >  }
> > >  EXPORT_SYMBOL(migrate_vma_finalize);
> > >  
> > > -- 
> > > 2.34.1
> > >
diff mbox series

Patch

diff --git a/mm/memory.c b/mm/memory.c
index 539c0f7c6d54..1e010c5d67bc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4337,10 +4337,15 @@  vm_fault_t do_swap_page(struct vm_fault *vmf)
 			 * Get a page reference while we know the page can't be
 			 * freed.
 			 */
-			get_page(vmf->page);
-			pte_unmap_unlock(vmf->pte, vmf->ptl);
-			ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
-			put_page(vmf->page);
+			if (trylock_page(vmf->page)) {
+				get_page(vmf->page);
+				pte_unmap_unlock(vmf->pte, vmf->ptl);
+				ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
+				unlock_page(vmf->page);
+				put_page(vmf->page);
+			} else {
+				pte_unmap_unlock(vmf->pte, vmf->ptl);
+			}
 		} else if (is_hwpoison_entry(entry)) {
 			ret = VM_FAULT_HWPOISON;
 		} else if (is_pte_marker_entry(entry)) {
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 19960743f927..3470357d9bae 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -60,6 +60,8 @@  static int migrate_vma_collect_pmd(pmd_t *pmdp,
 				   struct mm_walk *walk)
 {
 	struct migrate_vma *migrate = walk->private;
+	struct folio *fault_folio = migrate->fault_page ?
+		page_folio(migrate->fault_page) : NULL;
 	struct vm_area_struct *vma = walk->vma;
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long addr = start, unmapped = 0;
@@ -88,11 +90,16 @@  static int migrate_vma_collect_pmd(pmd_t *pmdp,
 
 			folio_get(folio);
 			spin_unlock(ptl);
+			/* FIXME support THP */
+			if (WARN_ON_ONCE(fault_folio == folio))
+				return migrate_vma_collect_skip(start, end,
+								walk);
 			if (unlikely(!folio_trylock(folio)))
 				return migrate_vma_collect_skip(start, end,
 								walk);
 			ret = split_folio(folio);
-			folio_unlock(folio);
+			if (fault_folio != folio)
+				folio_unlock(folio);
 			folio_put(folio);
 			if (ret)
 				return migrate_vma_collect_skip(start, end,
@@ -192,7 +199,7 @@  static int migrate_vma_collect_pmd(pmd_t *pmdp,
 		 * optimisation to avoid walking the rmap later with
 		 * try_to_migrate().
 		 */
-		if (folio_trylock(folio)) {
+		if (fault_folio == folio || folio_trylock(folio)) {
 			bool anon_exclusive;
 			pte_t swp_pte;
 
@@ -204,7 +211,8 @@  static int migrate_vma_collect_pmd(pmd_t *pmdp,
 
 				if (folio_try_share_anon_rmap_pte(folio, page)) {
 					set_pte_at(mm, addr, ptep, pte);
-					folio_unlock(folio);
+					if (fault_folio != folio)
+						folio_unlock(folio);
 					folio_put(folio);
 					mpfn = 0;
 					goto next;
@@ -363,6 +371,8 @@  static unsigned long migrate_device_unmap(unsigned long *src_pfns,
 					  unsigned long npages,
 					  struct page *fault_page)
 {
+	struct folio *fault_folio = fault_page ?
+		page_folio(fault_page) : NULL;
 	unsigned long i, restore = 0;
 	bool allow_drain = true;
 	unsigned long unmapped = 0;
@@ -427,7 +437,8 @@  static unsigned long migrate_device_unmap(unsigned long *src_pfns,
 		remove_migration_ptes(folio, folio, 0);
 
 		src_pfns[i] = 0;
-		folio_unlock(folio);
+		if (fault_folio != folio)
+			folio_unlock(folio);
 		folio_put(folio);
 		restore--;
 	}
@@ -536,6 +547,8 @@  int migrate_vma_setup(struct migrate_vma *args)
 		return -EINVAL;
 	if (args->fault_page && !is_device_private_page(args->fault_page))
 		return -EINVAL;
+	if (args->fault_page && !PageLocked(args->fault_page))
+		return -EINVAL;
 
 	memset(args->src, 0, sizeof(*args->src) * nr_pages);
 	args->cpages = 0;
@@ -799,19 +812,13 @@  void migrate_vma_pages(struct migrate_vma *migrate)
 }
 EXPORT_SYMBOL(migrate_vma_pages);
 
-/*
- * migrate_device_finalize() - complete page migration
- * @src_pfns: src_pfns returned from migrate_device_range()
- * @dst_pfns: array of pfns allocated by the driver to migrate memory to
- * @npages: number of pages in the range
- *
- * Completes migration of the page by removing special migration entries.
- * Drivers must ensure copying of page data is complete and visible to the CPU
- * before calling this.
- */
-void migrate_device_finalize(unsigned long *src_pfns,
-			unsigned long *dst_pfns, unsigned long npages)
+static void __migrate_device_finalize(unsigned long *src_pfns,
+				      unsigned long *dst_pfns,
+				      unsigned long npages,
+				      struct page *fault_page)
 {
+	struct folio *fault_folio = fault_page ?
+		page_folio(fault_page) : NULL;
 	unsigned long i;
 
 	for (i = 0; i < npages; i++) {
@@ -824,6 +831,7 @@  void migrate_device_finalize(unsigned long *src_pfns,
 
 		if (!page) {
 			if (dst) {
+				WARN_ON_ONCE(fault_folio == dst);
 				folio_unlock(dst);
 				folio_put(dst);
 			}
@@ -834,6 +842,7 @@  void migrate_device_finalize(unsigned long *src_pfns,
 
 		if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE) || !dst) {
 			if (dst) {
+				WARN_ON_ONCE(fault_folio == dst);
 				folio_unlock(dst);
 				folio_put(dst);
 			}
@@ -841,7 +850,8 @@  void migrate_device_finalize(unsigned long *src_pfns,
 		}
 
 		remove_migration_ptes(src, dst, 0);
-		folio_unlock(src);
+		if (fault_folio != src)
+			folio_unlock(src);
 
 		if (folio_is_zone_device(src))
 			folio_put(src);
@@ -849,6 +859,7 @@  void migrate_device_finalize(unsigned long *src_pfns,
 			folio_putback_lru(src);
 
 		if (dst != src) {
+			WARN_ON_ONCE(fault_folio == dst);
 			folio_unlock(dst);
 			if (folio_is_zone_device(dst))
 				folio_put(dst);
@@ -857,6 +868,22 @@  void migrate_device_finalize(unsigned long *src_pfns,
 		}
 	}
 }
+
+/*
+ * migrate_device_finalize() - complete page migration
+ * @src_pfns: src_pfns returned from migrate_device_range()
+ * @dst_pfns: array of pfns allocated by the driver to migrate memory to
+ * @npages: number of pages in the range
+ *
+ * Completes migration of the page by removing special migration entries.
+ * Drivers must ensure copying of page data is complete and visible to the CPU
+ * before calling this.
+ */
+void migrate_device_finalize(unsigned long *src_pfns,
+			unsigned long *dst_pfns, unsigned long npages)
+{
+	__migrate_device_finalize(src_pfns, dst_pfns, npages, NULL);
+}
 EXPORT_SYMBOL(migrate_device_finalize);
 
 /**
@@ -872,7 +899,8 @@  EXPORT_SYMBOL(migrate_device_finalize);
  */
 void migrate_vma_finalize(struct migrate_vma *migrate)
 {
-	migrate_device_finalize(migrate->src, migrate->dst, migrate->npages);
+	__migrate_device_finalize(migrate->src, migrate->dst, migrate->npages,
+				  migrate->fault_page);
 }
 EXPORT_SYMBOL(migrate_vma_finalize);