[v4,04/23] mm: devmap: refactor 1-based refcounting for ZONE_DEVICE pages

Message ID: 20191113042710.3997854-5-jhubbard@nvidia.com
State: New
Series: mm/gup: track dma-pinned pages: FOLL_PIN, FOLL_LONGTERM

Commit Message

John Hubbard Nov. 13, 2019, 4:26 a.m. UTC
An upcoming patch changes and complicates the refcounting and
especially the "put page" aspects of it. In order to keep
everything clean, refactor the devmap page release routines:

* Rename put_devmap_managed_page() to page_is_devmap_managed(),
  and limit the functionality to "read only": return a bool,
  with no side effects.

* Add a new routine, put_devmap_managed_page(), to handle checking
  what kind of page it is, and what kind of refcount handling it
  requires.

* Rename __put_devmap_managed_page() to free_devmap_managed_page(),
  and limit the functionality to unconditionally freeing a devmap
  page.

This is originally based on a separate patch by Ira Weiny, which
applied to an early version of the put_user_page() experiments.
Since then, Jérôme Glisse suggested the refactoring described above.

Suggested-by: Jérôme Glisse <jglisse@redhat.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: John Hubbard <jhubbard@nvidia.com>
---
 include/linux/mm.h | 27 ++++++++++++++++---
 mm/memremap.c      | 67 ++++++++++++++++++++--------------------------
 2 files changed, 53 insertions(+), 41 deletions(-)
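
For context (not part of the patch): the main consumer of this helper is
put_page(), which tries the devmap path first and falls back to the normal
0-based refcounting only when the page is not devmap-managed. A minimal
sketch of the caller, based on the include/linux/mm.h of this era (comment
paraphrased):

static inline void put_page(struct page *page)
{
	page = compound_head(page);

	/*
	 * For devmap-managed pages, the 2 -> 1 refcount transition must be
	 * caught, because refcount == 1 means "free" for these pages and
	 * the driver has to be notified. put_devmap_managed_page() handles
	 * that and returns true; everything else takes the normal path.
	 */
	if (put_devmap_managed_page(page))
		return;

	if (put_page_testzero(page))
		__put_page(page);
}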

Comments

Jan Kara Nov. 13, 2019, 10:49 a.m. UTC | #1
On Tue 12-11-19 20:26:51, John Hubbard wrote:
> An upcoming patch changes and complicates the refcounting and
> especially the "put page" aspects of it. In order to keep
> everything clean, refactor the devmap page release routines:
> 
> * Rename put_devmap_managed_page() to page_is_devmap_managed(),
>   and limit the functionality to "read only": return a bool,
>   with no side effects.
> 
> * Add a new routine, put_devmap_managed_page(), to handle checking
>   what kind of page it is, and what kind of refcount handling it
>   requires.
> 
> * Rename __put_devmap_managed_page() to free_devmap_managed_page(),
>   and limit the functionality to unconditionally freeing a devmap
>   page.
> 
> This is originally based on a separate patch by Ira Weiny, which
> applied to an early version of the put_user_page() experiments.
> Since then, Jérôme Glisse suggested the refactoring described above.
> 
> Suggested-by: Jérôme Glisse <jglisse@redhat.com>
> Signed-off-by: Ira Weiny <ira.weiny@intel.com>
> Signed-off-by: John Hubbard <jhubbard@nvidia.com>

Looks good to me. You can add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

Jerome Glisse Nov. 13, 2019, 7:09 p.m. UTC | #2
On Tue, Nov 12, 2019 at 08:26:51PM -0800, John Hubbard wrote:
> An upcoming patch changes and complicates the refcounting and
> especially the "put page" aspects of it. In order to keep
> everything clean, refactor the devmap page release routines:
> 
> * Rename put_devmap_managed_page() to page_is_devmap_managed(),
>   and limit the functionality to "read only": return a bool,
>   with no side effects.
> 
> * Add a new routine, put_devmap_managed_page(), to handle checking
>   what kind of page it is, and what kind of refcount handling it
>   requires.
> 
> * Rename __put_devmap_managed_page() to free_devmap_managed_page(),
>   and limit the functionality to unconditionally freeing a devmap
>   page.
> 
> This is originally based on a separate patch by Ira Weiny, which
> applied to an early version of the put_user_page() experiments.
> Since then, Jérôme Glisse suggested the refactoring described above.
> 
> Suggested-by: Jérôme Glisse <jglisse@redhat.com>
> Signed-off-by: Ira Weiny <ira.weiny@intel.com>
> Signed-off-by: John Hubbard <jhubbard@nvidia.com>

Reviewed-by: Jérôme Glisse <jglisse@redhat.com>

Dan Williams Nov. 13, 2019, 7:23 p.m. UTC | #3
On Tue, Nov 12, 2019 at 8:27 PM John Hubbard <jhubbard@nvidia.com> wrote:
>
> [... commit message and include/linux/mm.h hunk trimmed ...]
>
> diff --git a/mm/memremap.c b/mm/memremap.c
> index 03ccbdfeb697..bc7e2a27d025 100644
> --- a/mm/memremap.c
> +++ b/mm/memremap.c
> @@ -410,48 +410,39 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
>  EXPORT_SYMBOL_GPL(get_dev_pagemap);
>
>  #ifdef CONFIG_DEV_PAGEMAP_OPS
> -void __put_devmap_managed_page(struct page *page)
> +void free_devmap_managed_page(struct page *page)
>  {
> -       int count = page_ref_dec_return(page);
> +       /* Clear Active bit in case of parallel mark_page_accessed */
> +       __ClearPageActive(page);
> +       __ClearPageWaiters(page);
> +
> +       mem_cgroup_uncharge(page);

Ugh, when did all this HMM specific manipulation sneak into the
generic ZONE_DEVICE path? It used to be gated by pgmap type with its
own put_zone_device_private_page(). For example it's certainly
unnecessary and might be broken (would need to check) to call
mem_cgroup_uncharge() on a DAX page. ZONE_DEVICE users are not a
monolith and the HMM use case leaks pages into code paths that DAX
explicitly avoids.
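
For reference, the pgmap-type gate Dan is describing already exists as a
helper, and the cleanup proposed below keys off it. A sketch of
is_device_private_page() as it looked in include/linux/mm.h around this
time (reproduced from memory, so treat the details as approximate):

static inline bool is_device_private_page(const struct page *page)
{
	/* True only for ZONE_DEVICE pages backing device-private memory */
	return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
		IS_ENABLED(CONFIG_DEVICE_PRIVATE) &&
		is_zone_device_page(page) &&
		page->pgmap->type == MEMORY_DEVICE_PRIVATE;
}

Gating __ClearPageActive(), __ClearPageWaiters() and mem_cgroup_uncharge()
on this test keeps the HMM-only work out of the fsdax path.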
Dan Williams Nov. 13, 2019, 10 p.m. UTC | #4
On Wed, Nov 13, 2019 at 11:23 AM Dan Williams <dan.j.williams@intel.com> wrote:
>
> On Tue, Nov 12, 2019 at 8:27 PM John Hubbard <jhubbard@nvidia.com> wrote:
> > [...]
> >
> >  #ifdef CONFIG_DEV_PAGEMAP_OPS
> > -void __put_devmap_managed_page(struct page *page)
> > +void free_devmap_managed_page(struct page *page)
> >  {
> > -       int count = page_ref_dec_return(page);
> > +       /* Clear Active bit in case of parallel mark_page_accessed */
> > +       __ClearPageActive(page);
> > +       __ClearPageWaiters(page);
> > +
> > +       mem_cgroup_uncharge(page);
>
> Ugh, when did all this HMM specific manipulation sneak into the
> generic ZONE_DEVICE path? It used to be gated by pgmap type with its
> own put_zone_device_private_page(). For example it's certainly
> unnecessary and might be broken (would need to check) to call
> mem_cgroup_uncharge() on a DAX page. ZONE_DEVICE users are not a
> monolith and the HMM use case leaks pages into code paths that DAX
> explicitly avoids.

It's been this way for a while and I did not react previously,
apologies for that. I think __ClearPageActive, __ClearPageWaiters, and
mem_cgroup_uncharge, belong behind a device-private conditional. The
history here is:

Move some, but not all HMM specifics to hmm_devmem_free():
    2fa147bdbf67 mm, dev_pagemap: Do not clear ->mapping on final put

Remove the clearing of mapping since no upstream consumers needed it:
    b7a523109fb5 mm: don't clear ->mapping in hmm_devmem_free

Add it back in once an upstream consumer arrived:
    7ab0ad0e74f8 mm/hmm: fix ZONE_DEVICE anon page mapping reuse

We're now almost entirely free of ->page_free callbacks except for
that weird nouveau case, can that FIXME in nouveau_dmem_page_free()
also result in killing the ->page_free() callback altogether? In the
meantime I'm proposing a cleanup like this:

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index ad8e4df1282b..4eae441f86c9 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -337,13 +337,7 @@ static void pmem_release_disk(void *__pmem)
        put_disk(pmem->disk);
 }

-static void pmem_pagemap_page_free(struct page *page)
-{
-       wake_up_var(&page->_refcount);
-}
-
 static const struct dev_pagemap_ops fsdax_pagemap_ops = {
-       .page_free              = pmem_pagemap_page_free,
        .kill                   = pmem_pagemap_kill,
        .cleanup                = pmem_pagemap_cleanup,
 };
diff --git a/mm/memremap.c b/mm/memremap.c
index 03ccbdfeb697..157edb8f7cf8 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -419,12 +419,6 @@ void __put_devmap_managed_page(struct page *page)
         * holds a reference on the page.
         */
        if (count == 1) {
-               /* Clear Active bit in case of parallel mark_page_accessed */
-               __ClearPageActive(page);
-               __ClearPageWaiters(page);
-
-               mem_cgroup_uncharge(page);
-
                /*
                 * When a device_private page is freed, the page->mapping field
                 * may still contain a (stale) mapping value. For example, the
@@ -446,10 +440,17 @@ void __put_devmap_managed_page(struct page *page)
                 * handled differently or not done at all, so there is no need
                 * to clear page->mapping.
                 */
-               if (is_device_private_page(page))
-                       page->mapping = NULL;
+               if (is_device_private_page(page)) {
+                       /* Clear Active bit in case of parallel
mark_page_accessed */
+                       __ClearPageActive(page);
+                       __ClearPageWaiters(page);

-               page->pgmap->ops->page_free(page);
+                       mem_cgroup_uncharge(page);
+
+                       page->mapping = NULL;
+                       page->pgmap->ops->page_free(page);
+               } else
+                       wake_up_var(&page->_refcount);
        } else if (!count)
                __put_page(page);
 }
John Hubbard Nov. 13, 2019, 10:46 p.m. UTC | #5
On 11/13/19 2:00 PM, Dan Williams wrote:
...
>> Ugh, when did all this HMM specific manipulation sneak into the
>> generic ZONE_DEVICE path? It used to be gated by pgmap type with its
>> own put_zone_device_private_page(). For example it's certainly
>> unnecessary and might be broken (would need to check) to call
>> mem_cgroup_uncharge() on a DAX page. ZONE_DEVICE users are not a
>> monolith and the HMM use case leaks pages into code paths that DAX
>> explicitly avoids.
> 
> It's been this way for a while and I did not react previously,
> apologies for that. I think __ClearPageActive, __ClearPageWaiters, and
> mem_cgroup_uncharge, belong behind a device-private conditional. The
> history here is:
> 
> Move some, but not all HMM specifics to hmm_devmem_free():
>      2fa147bdbf67 mm, dev_pagemap: Do not clear ->mapping on final put
> 
> Remove the clearing of mapping since no upstream consumers needed it:
>      b7a523109fb5 mm: don't clear ->mapping in hmm_devmem_free
> 
> Add it back in once an upstream consumer arrived:
>      7ab0ad0e74f8 mm/hmm: fix ZONE_DEVICE anon page mapping reuse
> 
> We're now almost entirely free of ->page_free callbacks except for
> that weird nouveau case, can that FIXME in nouveau_dmem_page_free()
> also result in killing the ->page_free() callback altogether? In the
> meantime I'm proposing a cleanup like this:


OK, assuming this is acceptable (no obvious problems jump out at me,
and we can also test it with HMM), then how would you like to proceed, as
far as patches go: add such a patch as part of this series here, or as a
stand-alone patch either before or after this series? Or something else?
And did you plan on sending it out as such?

Also, the diffs didn't quite make it through intact to my "git apply", so
I'm re-posting the diff in hopes that this time it survives:

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index f9f76f6ba07b..21db1ce8c0ae 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -338,13 +338,7 @@ static void pmem_release_disk(void *__pmem)
  	put_disk(pmem->disk);
  }
  
-static void pmem_pagemap_page_free(struct page *page)
-{
-	wake_up_var(&page->_refcount);
-}
-
  static const struct dev_pagemap_ops fsdax_pagemap_ops = {
-	.page_free		= pmem_pagemap_page_free,
  	.kill			= pmem_pagemap_kill,
  	.cleanup		= pmem_pagemap_cleanup,
  };
diff --git a/mm/memremap.c b/mm/memremap.c
index 03ccbdfeb697..157edb8f7cf8 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -419,12 +419,6 @@ void __put_devmap_managed_page(struct page *page)
  	 * holds a reference on the page.
  	 */
  	if (count == 1) {
-		/* Clear Active bit in case of parallel mark_page_accessed */
-		__ClearPageActive(page);
-		__ClearPageWaiters(page);
-
-		mem_cgroup_uncharge(page);
-
  		/*
  		 * When a device_private page is freed, the page->mapping field
  		 * may still contain a (stale) mapping value. For example, the
@@ -446,10 +440,17 @@ void __put_devmap_managed_page(struct page *page)
  		 * handled differently or not done at all, so there is no need
  		 * to clear page->mapping.
  		 */
-		if (is_device_private_page(page))
-			page->mapping = NULL;
+		if (is_device_private_page(page)) {
+			/* Clear Active bit in case of parallel mark_page_accessed */
+			__ClearPageActive(page);
+			__ClearPageWaiters(page);
  
-		page->pgmap->ops->page_free(page);
+			mem_cgroup_uncharge(page);
+
+			page->mapping = NULL;
+			page->pgmap->ops->page_free(page);
+		} else
+			wake_up_var(&page->_refcount);
  	} else if (!count)
  		__put_page(page);
  }
Dan Williams Nov. 13, 2019, 10:55 p.m. UTC | #6
On Wed, Nov 13, 2019 at 2:49 PM John Hubbard <jhubbard@nvidia.com> wrote:
>
> On 11/13/19 2:00 PM, Dan Williams wrote:
> ...
> >> Ugh, when did all this HMM specific manipulation sneak into the
> >> generic ZONE_DEVICE path? It used to be gated by pgmap type with its
> >> own put_zone_device_private_page(). For example it's certainly
> >> unnecessary and might be broken (would need to check) to call
> >> mem_cgroup_uncharge() on a DAX page. ZONE_DEVICE users are not a
> >> monolith and the HMM use case leaks pages into code paths that DAX
> >> explicitly avoids.
> >
> > It's been this way for a while and I did not react previously,
> > apologies for that. I think __ClearPageActive, __ClearPageWaiters, and
> > mem_cgroup_uncharge, belong behind a device-private conditional. The
> > history here is:
> >
> > Move some, but not all HMM specifics to hmm_devmem_free():
> >      2fa147bdbf67 mm, dev_pagemap: Do not clear ->mapping on final put
> >
> > Remove the clearing of mapping since no upstream consumers needed it:
> >      b7a523109fb5 mm: don't clear ->mapping in hmm_devmem_free
> >
> > Add it back in once an upstream consumer arrived:
> >      7ab0ad0e74f8 mm/hmm: fix ZONE_DEVICE anon page mapping reuse
> >
> > We're now almost entirely free of ->page_free callbacks except for
> > that weird nouveau case, can that FIXME in nouveau_dmem_page_free()
> > also result in killing the ->page_free() callback altogether? In the
> > meantime I'm proposing a cleanup like this:
>
>
> OK, assuming this is acceptable (no obvious problems jump out at me,
> and we can also test it with HMM), then how would you like to proceed, as
> far as patches go: add such a patch as part of this series here, or as a
> stand-alone patch either before or after this series? Or something else?
> And did you plan on sending it out as such?

I think it makes sense to include it in your series since you're
looking to refactor the implementation. I can send you one based on
current linux-next as a lead-in cleanup before the refactor. Does that
work for you?

>
> Also, the diffs didn't quite make it through intact to my "git apply", so
> I'm re-posting the diff in hopes that this time it survives:

Apologies for that. For quick "how about this" patch examples, I just
copy and paste into gmail and it sometimes clobbers it.

John Hubbard Nov. 13, 2019, 10:56 p.m. UTC | #7
On 11/13/19 2:55 PM, Dan Williams wrote:
> On Wed, Nov 13, 2019 at 2:49 PM John Hubbard <jhubbard@nvidia.com> wrote:
>>
>> On 11/13/19 2:00 PM, Dan Williams wrote:
>> ...
>>>> Ugh, when did all this HMM specific manipulation sneak into the
>>>> generic ZONE_DEVICE path? It used to be gated by pgmap type with its
>>>> own put_zone_device_private_page(). For example it's certainly
>>>> unnecessary and might be broken (would need to check) to call
>>>> mem_cgroup_uncharge() on a DAX page. ZONE_DEVICE users are not a
>>>> monolith and the HMM use case leaks pages into code paths that DAX
>>>> explicitly avoids.
>>>
>>> It's been this way for a while and I did not react previously,
>>> apologies for that. I think __ClearPageActive, __ClearPageWaiters, and
>>> mem_cgroup_uncharge, belong behind a device-private conditional. The
>>> history here is:
>>>
>>> Move some, but not all HMM specifics to hmm_devmem_free():
>>>       2fa147bdbf67 mm, dev_pagemap: Do not clear ->mapping on final put
>>>
>>> Remove the clearing of mapping since no upstream consumers needed it:
>>>       b7a523109fb5 mm: don't clear ->mapping in hmm_devmem_free
>>>
>>> Add it back in once an upstream consumer arrived:
>>>       7ab0ad0e74f8 mm/hmm: fix ZONE_DEVICE anon page mapping reuse
>>>
>>> We're now almost entirely free of ->page_free callbacks except for
>>> that weird nouveau case, can that FIXME in nouveau_dmem_page_free()
>>> also result in killing the ->page_free() callback altogether? In the
>>> meantime I'm proposing a cleanup like this:
>>
>>
>> OK, assuming this is acceptable (no obvious problems jump out at me,
>> and we can also test it with HMM), then how would you like to proceed, as
>> far as patches go: add such a patch as part of this series here, or as a
>> stand-alone patch either before or after this series? Or something else?
>> And did you plan on sending it out as such?
> 
> I think it makes sense to include it in your series since you're
> looking to refactor the implementation. I can send you one based on
> current linux-next as a lead-in cleanup before the refactor. Does that
> work for you?
> 

That would be perfect!

>>
>> Also, the diffs didn't quite make it through intact to my "git apply", so
>> I'm re-posting the diff in hopes that this time it survives:
> 
> Apologies for that. For quick "how about this" patch examples, I just
> copy and paste into gmail and it sometimes clobbers it.
> 

No problem at all, I do the same thing and *usually* it works. ha. And
as you say, it's good enough for discussions.


thanks,
Jerome Glisse Nov. 13, 2019, 11:03 p.m. UTC | #8
On Wed, Nov 13, 2019 at 02:00:06PM -0800, Dan Williams wrote:
> On Wed, Nov 13, 2019 at 11:23 AM Dan Williams <dan.j.williams@intel.com> wrote:
> >
> > On Tue, Nov 12, 2019 at 8:27 PM John Hubbard <jhubbard@nvidia.com> wrote:
> > > [...]
> > >
> > >  #ifdef CONFIG_DEV_PAGEMAP_OPS
> > > -void __put_devmap_managed_page(struct page *page)
> > > +void free_devmap_managed_page(struct page *page)
> > >  {
> > > -       int count = page_ref_dec_return(page);
> > > +       /* Clear Active bit in case of parallel mark_page_accessed */
> > > +       __ClearPageActive(page);
> > > +       __ClearPageWaiters(page);
> > > +
> > > +       mem_cgroup_uncharge(page);
> >
> > Ugh, when did all this HMM specific manipulation sneak into the
> > generic ZONE_DEVICE path? It used to be gated by pgmap type with its
> > own put_zone_device_private_page(). For example it's certainly
> > unnecessary and might be broken (would need to check) to call
> > mem_cgroup_uncharge() on a DAX page. ZONE_DEVICE users are not a
> > monolith and the HMM use case leaks pages into code paths that DAX
> > explicitly avoids.
> 
> It's been this way for a while and I did not react previously,
> apologies for that. I think __ClearPageActive, __ClearPageWaiters, and
> mem_cgroup_uncharge, belong behind a device-private conditional. The
> history here is:
> 
> Move some, but not all HMM specifics to hmm_devmem_free():
>     2fa147bdbf67 mm, dev_pagemap: Do not clear ->mapping on final put
> 
> Remove the clearing of mapping since no upstream consumers needed it:
>     b7a523109fb5 mm: don't clear ->mapping in hmm_devmem_free
> 
> Add it back in once an upstream consumer arrived:
>     7ab0ad0e74f8 mm/hmm: fix ZONE_DEVICE anon page mapping reuse
> 
> We're now almost entirely free of ->page_free callbacks except for
> that weird nouveau case, can that FIXME in nouveau_dmem_page_free()
> also result in killing the ->page_free() callback altogether? In the
> meantime I'm proposing a cleanup like this:

No, we need the callback; the cleanup looks good.

Patch

diff --git a/include/linux/mm.h b/include/linux/mm.h
index a2adf95b3f9c..96228376139c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -967,9 +967,10 @@  static inline bool is_zone_device_page(const struct page *page)
 #endif
 
 #ifdef CONFIG_DEV_PAGEMAP_OPS
-void __put_devmap_managed_page(struct page *page);
+void free_devmap_managed_page(struct page *page);
 DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
-static inline bool put_devmap_managed_page(struct page *page)
+
+static inline bool page_is_devmap_managed(struct page *page)
 {
 	if (!static_branch_unlikely(&devmap_managed_key))
 		return false;
@@ -978,7 +979,6 @@  static inline bool put_devmap_managed_page(struct page *page)
 	switch (page->pgmap->type) {
 	case MEMORY_DEVICE_PRIVATE:
 	case MEMORY_DEVICE_FS_DAX:
-		__put_devmap_managed_page(page);
 		return true;
 	default:
 		break;
@@ -986,6 +986,27 @@  static inline bool put_devmap_managed_page(struct page *page)
 	return false;
 }
 
+static inline bool put_devmap_managed_page(struct page *page)
+{
+	bool is_devmap = page_is_devmap_managed(page);
+
+	if (is_devmap) {
+		int count = page_ref_dec_return(page);
+
+		/*
+		 * devmap page refcounts are 1-based, rather than 0-based: if
+		 * refcount is 1, then the page is free and the refcount is
+		 * stable because nobody holds a reference on the page.
+		 */
+		if (count == 1)
+			free_devmap_managed_page(page);
+		else if (!count)
+			__put_page(page);
+	}
+
+	return is_devmap;
+}
+
 #else /* CONFIG_DEV_PAGEMAP_OPS */
 static inline bool put_devmap_managed_page(struct page *page)
 {
diff --git a/mm/memremap.c b/mm/memremap.c
index 03ccbdfeb697..bc7e2a27d025 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -410,48 +410,39 @@  struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
 EXPORT_SYMBOL_GPL(get_dev_pagemap);
 
 #ifdef CONFIG_DEV_PAGEMAP_OPS
-void __put_devmap_managed_page(struct page *page)
+void free_devmap_managed_page(struct page *page)
 {
-	int count = page_ref_dec_return(page);
+	/* Clear Active bit in case of parallel mark_page_accessed */
+	__ClearPageActive(page);
+	__ClearPageWaiters(page);
+
+	mem_cgroup_uncharge(page);
 
 	/*
-	 * If refcount is 1 then page is freed and refcount is stable as nobody
-	 * holds a reference on the page.
+	 * When a device_private page is freed, the page->mapping field
+	 * may still contain a (stale) mapping value. For example, the
+	 * lower bits of page->mapping may still identify the page as
+	 * an anonymous page. Ultimately, this entire field is just
+	 * stale and wrong, and it will cause errors if not cleared.
+	 * One example is:
+	 *
+	 *  migrate_vma_pages()
+	 *    migrate_vma_insert_page()
+	 *      page_add_new_anon_rmap()
+	 *        __page_set_anon_rmap()
+	 *          ...checks page->mapping, via PageAnon(page) call,
+	 *            and incorrectly concludes that the page is an
+	 *            anonymous page. Therefore, it incorrectly,
+	 *            silently fails to set up the new anon rmap.
+	 *
+	 * For other types of ZONE_DEVICE pages, migration is either
+	 * handled differently or not done at all, so there is no need
+	 * to clear page->mapping.
 	 */
-	if (count == 1) {
-		/* Clear Active bit in case of parallel mark_page_accessed */
-		__ClearPageActive(page);
-		__ClearPageWaiters(page);
-
-		mem_cgroup_uncharge(page);
-
-		/*
-		 * When a device_private page is freed, the page->mapping field
-		 * may still contain a (stale) mapping value. For example, the
-		 * lower bits of page->mapping may still identify the page as
-		 * an anonymous page. Ultimately, this entire field is just
-		 * stale and wrong, and it will cause errors if not cleared.
-		 * One example is:
-		 *
-		 *  migrate_vma_pages()
-		 *    migrate_vma_insert_page()
-		 *      page_add_new_anon_rmap()
-		 *        __page_set_anon_rmap()
-		 *          ...checks page->mapping, via PageAnon(page) call,
-		 *            and incorrectly concludes that the page is an
-		 *            anonymous page. Therefore, it incorrectly,
-		 *            silently fails to set up the new anon rmap.
-		 *
-		 * For other types of ZONE_DEVICE pages, migration is either
-		 * handled differently or not done at all, so there is no need
-		 * to clear page->mapping.
-		 */
-		if (is_device_private_page(page))
-			page->mapping = NULL;
+	if (is_device_private_page(page))
+		page->mapping = NULL;
 
-		page->pgmap->ops->page_free(page);
-	} else if (!count)
-		__put_page(page);
+	page->pgmap->ops->page_free(page);
 }
-EXPORT_SYMBOL(__put_devmap_managed_page);
+EXPORT_SYMBOL(free_devmap_managed_page);
 #endif /* CONFIG_DEV_PAGEMAP_OPS */
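
To make the 1-based scheme concrete, a short editorial walkthrough (not
part of the patch) of a devmap page's lifetime after this refactor:

/*
 * refcount == 1                      idle baseline: the page is free
 * get_page(page);                    1 -> 2, the page now has one user
 * put_devmap_managed_page(page);     2 -> 1, page_ref_dec_return()
 *                                    returns 1, so the page is idle
 *                                    again and free_devmap_managed_page()
 *                                    notifies the driver / wakes waiters
 */

The upcoming FOLL_PIN work mentioned in the commit message builds on this
split between checking (page_is_devmap_managed), putting
(put_devmap_managed_page) and freeing (free_devmap_managed_page).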