diff mbox

[v2,5/6] mm: track gup pages with page->dma_pinned_* fields

Message ID 20180702005654.20369-6-jhubbard@nvidia.com
State New, archived
Headers show

Commit Message

john.hubbard@gmail.com July 2, 2018, 12:56 a.m. UTC
From: John Hubbard <jhubbard@nvidia.com>

This patch sets and restores the new page->dma_pinned_flags and
page->dma_pinned_count fields, but does not actually use them for
anything yet.

In order to use these fields at all, the page must be removed from
any LRU list that it's on. The patch also adds some precautions that
prevent the page from getting moved back onto an LRU, once it is
in this state.

This is in preparation to fix some problems that came up when using
devices (NICs, GPUs, for example) that set up direct access to a chunk
of system (CPU) memory, so that they can DMA to/from that memory.

CC: Matthew Wilcox <willy@infradead.org>
CC: Jan Kara <jack@suse.cz>
CC: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: John Hubbard <jhubbard@nvidia.com>
---
 include/linux/mm.h | 15 +++++++++++++
 mm/gup.c           | 56 ++++++++++++++++++++++++++++++++++++++++++++--
 mm/memcontrol.c    |  7 ++++++
 mm/swap.c          | 48 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 124 insertions(+), 2 deletions(-)

Comments

kernel test robot July 2, 2018, 2:11 a.m. UTC | #1
Hi John,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v4.18-rc3]
[cannot apply to next-20180629]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/john-hubbard-gmail-com/mm-fs-gup-don-t-unmap-or-drop-filesystem-buffers/20180702-090125
config: i386-randconfig-x074-201826 (attached as .config)
compiler: gcc-7 (Debian 7.3.0-16) 7.3.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

All error/warnings (new ones prefixed by >>):

   In file included from include/asm-generic/atomic-instrumented.h:16:0,
                    from arch/x86/include/asm/atomic.h:283,
                    from include/linux/atomic.h:5,
                    from include/linux/page_counter.h:5,
                    from mm/memcontrol.c:34:
   mm/memcontrol.c: In function 'unlock_page_lru':
>> mm/memcontrol.c:2087:32: error: 'page_tail' undeclared (first use in this function); did you mean 'page_pool'?
      VM_BUG_ON_PAGE(PageDmaPinned(page_tail), page);
                                   ^
   include/linux/build_bug.h:36:63: note: in definition of macro 'BUILD_BUG_ON_INVALID'
    #define BUILD_BUG_ON_INVALID(e) ((void)(sizeof((__force long)(e))))
                                                                  ^
>> include/linux/mmdebug.h:46:36: note: in expansion of macro 'VM_BUG_ON'
    #define VM_BUG_ON_PAGE(cond, page) VM_BUG_ON(cond)
                                       ^~~~~~~~~
>> mm/memcontrol.c:2087:3: note: in expansion of macro 'VM_BUG_ON_PAGE'
      VM_BUG_ON_PAGE(PageDmaPinned(page_tail), page);
      ^~~~~~~~~~~~~~
   mm/memcontrol.c:2087:32: note: each undeclared identifier is reported only once for each function it appears in
      VM_BUG_ON_PAGE(PageDmaPinned(page_tail), page);
                                   ^
   include/linux/build_bug.h:36:63: note: in definition of macro 'BUILD_BUG_ON_INVALID'
    #define BUILD_BUG_ON_INVALID(e) ((void)(sizeof((__force long)(e))))
                                                                  ^
>> include/linux/mmdebug.h:46:36: note: in expansion of macro 'VM_BUG_ON'
    #define VM_BUG_ON_PAGE(cond, page) VM_BUG_ON(cond)
                                       ^~~~~~~~~
>> mm/memcontrol.c:2087:3: note: in expansion of macro 'VM_BUG_ON_PAGE'
      VM_BUG_ON_PAGE(PageDmaPinned(page_tail), page);
      ^~~~~~~~~~~~~~

vim +2087 mm/memcontrol.c

  2077	
  2078	static void unlock_page_lru(struct page *page, int isolated)
  2079	{
  2080		struct zone *zone = page_zone(page);
  2081	
  2082		if (isolated) {
  2083			struct lruvec *lruvec;
  2084	
  2085			lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
  2086			VM_BUG_ON_PAGE(PageLRU(page), page);
> 2087			VM_BUG_ON_PAGE(PageDmaPinned(page_tail), page);
  2088	
  2089			SetPageLRU(page);
  2090			add_page_to_lru_list(page, lruvec, page_lru(page));
  2091		}
  2092		spin_unlock_irq(zone_lru_lock(zone));
  2093	}
  2094	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
kernel test robot July 2, 2018, 2:58 a.m. UTC | #2
Hi John,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on linus/master]
[also build test WARNING on v4.18-rc3]
[cannot apply to next-20180629]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/john-hubbard-gmail-com/mm-fs-gup-don-t-unmap-or-drop-filesystem-buffers/20180702-090125
config: x86_64-randconfig-x010-201826 (attached as .config)
compiler: gcc-7 (Debian 7.3.0-16) 7.3.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=x86_64 

All warnings (new ones prefixed by >>):

   In file included from arch/x86/include/asm/atomic.h:5:0,
                    from include/linux/atomic.h:5,
                    from include/linux/page_counter.h:5,
                    from mm/memcontrol.c:34:
   mm/memcontrol.c: In function 'unlock_page_lru':
   mm/memcontrol.c:2087:32: error: 'page_tail' undeclared (first use in this function); did you mean 'page_pool'?
      VM_BUG_ON_PAGE(PageDmaPinned(page_tail), page);
                                   ^
   include/linux/compiler.h:58:30: note: in definition of macro '__trace_if'
     if (__builtin_constant_p(!!(cond)) ? !!(cond) :   \
                                 ^~~~
>> include/linux/mmdebug.h:21:3: note: in expansion of macro 'if'
      if (unlikely(cond)) {     \
      ^~
   include/linux/compiler.h:48:24: note: in expansion of macro '__branch_check__'
    #  define unlikely(x) (__branch_check__(x, 0, __builtin_constant_p(x)))
                           ^~~~~~~~~~~~~~~~
>> include/linux/mmdebug.h:21:7: note: in expansion of macro 'unlikely'
      if (unlikely(cond)) {     \
          ^~~~~~~~
   mm/memcontrol.c:2087:3: note: in expansion of macro 'VM_BUG_ON_PAGE'
      VM_BUG_ON_PAGE(PageDmaPinned(page_tail), page);
      ^~~~~~~~~~~~~~
   mm/memcontrol.c:2087:32: note: each undeclared identifier is reported only once for each function it appears in
      VM_BUG_ON_PAGE(PageDmaPinned(page_tail), page);
                                   ^
   include/linux/compiler.h:58:30: note: in definition of macro '__trace_if'
     if (__builtin_constant_p(!!(cond)) ? !!(cond) :   \
                                 ^~~~
>> include/linux/mmdebug.h:21:3: note: in expansion of macro 'if'
      if (unlikely(cond)) {     \
      ^~
   include/linux/compiler.h:48:24: note: in expansion of macro '__branch_check__'
    #  define unlikely(x) (__branch_check__(x, 0, __builtin_constant_p(x)))
                           ^~~~~~~~~~~~~~~~
>> include/linux/mmdebug.h:21:7: note: in expansion of macro 'unlikely'
      if (unlikely(cond)) {     \
          ^~~~~~~~
   mm/memcontrol.c:2087:3: note: in expansion of macro 'VM_BUG_ON_PAGE'
      VM_BUG_ON_PAGE(PageDmaPinned(page_tail), page);
      ^~~~~~~~~~~~~~

vim +/if +21 include/linux/mmdebug.h

309381feae Sasha Levin           2014-01-23  16  
59ea746337 Jiri Slaby            2008-06-12  17  #ifdef CONFIG_DEBUG_VM
59ea746337 Jiri Slaby            2008-06-12  18  #define VM_BUG_ON(cond) BUG_ON(cond)
309381feae Sasha Levin           2014-01-23  19  #define VM_BUG_ON_PAGE(cond, page)					\
e4f674229c Dave Hansen           2014-06-04  20  	do {								\
e4f674229c Dave Hansen           2014-06-04 @21  		if (unlikely(cond)) {					\
e4f674229c Dave Hansen           2014-06-04  22  			dump_page(page, "VM_BUG_ON_PAGE(" __stringify(cond)")");\
e4f674229c Dave Hansen           2014-06-04  23  			BUG();						\
e4f674229c Dave Hansen           2014-06-04  24  		}							\
e4f674229c Dave Hansen           2014-06-04  25  	} while (0)
fa3759ccd5 Sasha Levin           2014-10-09  26  #define VM_BUG_ON_VMA(cond, vma)					\
fa3759ccd5 Sasha Levin           2014-10-09  27  	do {								\
fa3759ccd5 Sasha Levin           2014-10-09  28  		if (unlikely(cond)) {					\
fa3759ccd5 Sasha Levin           2014-10-09  29  			dump_vma(vma);					\
fa3759ccd5 Sasha Levin           2014-10-09  30  			BUG();						\
fa3759ccd5 Sasha Levin           2014-10-09  31  		}							\
fa3759ccd5 Sasha Levin           2014-10-09  32  	} while (0)
31c9afa6db Sasha Levin           2014-10-09  33  #define VM_BUG_ON_MM(cond, mm)						\
31c9afa6db Sasha Levin           2014-10-09  34  	do {								\
31c9afa6db Sasha Levin           2014-10-09  35  		if (unlikely(cond)) {					\
31c9afa6db Sasha Levin           2014-10-09  36  			dump_mm(mm);					\
31c9afa6db Sasha Levin           2014-10-09  37  			BUG();						\
31c9afa6db Sasha Levin           2014-10-09  38  		}							\
31c9afa6db Sasha Levin           2014-10-09  39  	} while (0)
91241681c6 Michal Hocko          2018-04-05  40  #define VM_WARN_ON(cond) (void)WARN_ON(cond)
91241681c6 Michal Hocko          2018-04-05  41  #define VM_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond)
91241681c6 Michal Hocko          2018-04-05  42  #define VM_WARN_ONCE(cond, format...) (void)WARN_ONCE(cond, format)
91241681c6 Michal Hocko          2018-04-05  43  #define VM_WARN(cond, format...) (void)WARN(cond, format)
59ea746337 Jiri Slaby            2008-06-12  44  #else
02602a18c3 Konstantin Khlebnikov 2012-05-29  45  #define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond)
309381feae Sasha Levin           2014-01-23  46  #define VM_BUG_ON_PAGE(cond, page) VM_BUG_ON(cond)
fa3759ccd5 Sasha Levin           2014-10-09  47  #define VM_BUG_ON_VMA(cond, vma) VM_BUG_ON(cond)
31c9afa6db Sasha Levin           2014-10-09  48  #define VM_BUG_ON_MM(cond, mm) VM_BUG_ON(cond)
02a8efeda8 Andrew Morton         2014-06-04  49  #define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond)
02a8efeda8 Andrew Morton         2014-06-04  50  #define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond)
ef6b571fb8 Andrew Morton         2014-08-06  51  #define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond)
a54f9aebaa Aneesh Kumar K.V      2016-07-26  52  #define VM_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond)
59ea746337 Jiri Slaby            2008-06-12  53  #endif
59ea746337 Jiri Slaby            2008-06-12  54  

:::::: The code at line 21 was first introduced by commit
:::::: e4f674229ce63dac60be0c4ddfb5ef8d1225d30d mm: pass VM_BUG_ON() reason to dump_page()

:::::: TO: Dave Hansen <dave.hansen@linux.intel.com>
:::::: CC: Linus Torvalds <torvalds@linux-foundation.org>

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
John Hubbard July 2, 2018, 5:05 a.m. UTC | #3
On 07/01/2018 07:58 PM, kbuild test robot wrote:
> Hi John,
> 
> Thank you for the patch! Perhaps something to improve:
> 
> [auto build test WARNING on linus/master]
> [also build test WARNING on v4.18-rc3]
> [cannot apply to next-20180629]
> [if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
> 
> url:    https://github.com/0day-ci/linux/commits/john-hubbard-gmail-com/mm-fs-gup-don-t-unmap-or-drop-filesystem-buffers/20180702-090125
> config: x86_64-randconfig-x010-201826 (attached as .config)
> compiler: gcc-7 (Debian 7.3.0-16) 7.3.0
> reproduce:
>         # save the attached .config to linux build tree
>         make ARCH=x86_64 
> 
> All warnings (new ones prefixed by >>):
> 
>    In file included from arch/x86/include/asm/atomic.h:5:0,
>                     from include/linux/atomic.h:5,
>                     from include/linux/page_counter.h:5,
>                     from mm/memcontrol.c:34:
>    mm/memcontrol.c: In function 'unlock_page_lru':
>    mm/memcontrol.c:2087:32: error: 'page_tail' undeclared (first use in this function); did you mean 'page_pool'?
>       VM_BUG_ON_PAGE(PageDmaPinned(page_tail), page);
>                                    ^
Yes, that should have been:

        VM_BUG_ON_PAGE(PageDmaPinned(page), page);

Fixed locally...maybe I'll post a v3 right now, as there were half a dozen ridiculous typos that
snuck in.



thanks,
Jan Kara July 2, 2018, 9:53 a.m. UTC | #4
On Sun 01-07-18 17:56:53, john.hubbard@gmail.com wrote:
> From: John Hubbard <jhubbard@nvidia.com>
> 
> This patch sets and restores the new page->dma_pinned_flags and
> page->dma_pinned_count fields, but does not actually use them for
> anything yet.
> 
> In order to use these fields at all, the page must be removed from
> any LRU list that it's on. The patch also adds some precautions that
> prevent the page from getting moved back onto an LRU, once it is
> in this state.
> 
> This is in preparation to fix some problems that came up when using
> devices (NICs, GPUs, for example) that set up direct access to a chunk
> of system (CPU) memory, so that they can DMA to/from that memory.
> 
> CC: Matthew Wilcox <willy@infradead.org>
> CC: Jan Kara <jack@suse.cz>
> CC: Dan Williams <dan.j.williams@intel.com>
> Signed-off-by: John Hubbard <jhubbard@nvidia.com>

...

> @@ -904,12 +907,24 @@ static inline void get_page(struct page *page)
>  	 */
>  	VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page);
>  	page_ref_inc(page);
> +
> +	if (unlikely(PageDmaPinned(page)))
> +		__get_page_for_pinned_dma(page);
>  }
>  
>  static inline void put_page(struct page *page)
>  {
>  	page = compound_head(page);
>  
> +	/* Because the page->dma_pinned_* fields are unioned with
> +	 * page->lru, there is no way to do classical refcount-style
> +	 * decrement-and-test-for-zero. Instead, PageDmaPinned(page) must
> +	 * be checked, in order to safely check if we are allowed to decrement
> +	 * page->dma_pinned_count at all.
> +	 */
> +	if (unlikely(PageDmaPinned(page)))
> +		__put_page_for_pinned_dma(page);
> +

These two are just wrong. You cannot make any page reference for
PageDmaPinned() account against a pin count. First, it is just conceptually
wrong as these references need not be long term pins, second, you can
easily race like:

Pinner				Random process
				get_page(page)
pin_page_for_dma()
				put_page(page)
				 -> oops, page gets unpinned too early

So you really have to create counterpart to get_user_pages() - like
put_user_page() or whatever... It is inconvenient to have to modify all GUP
users but I don't see a way around that.

								Honza

>  	/*
>  	 * For devmap managed pages we need to catch refcount transition from
>  	 * 2 to 1, when refcount reach one it means the page is free and we
> diff --git a/mm/gup.c b/mm/gup.c
> index 73f0b3316fa7..e5c0104fd234 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -20,6 +20,51 @@
>  
>  #include "internal.h"
>  
> +static int pin_page_for_dma(struct page *page)
> +{
> +	int ret = 0;
> +	struct zone *zone;
> +
> +	page = compound_head(page);
> +	zone = page_zone(page);
> +
> +	spin_lock(zone_gup_lock(zone));
> +
> +	if (PageDmaPinned(page)) {
> +		/* Page was not on an LRU list, because it was DMA-pinned. */
> +		VM_BUG_ON_PAGE(PageLRU(page), page);
> +
> +		atomic_inc(&page->dma_pinned_count);
> +		goto unlock_out;
> +	}
> +
> +	/*
> +	 * Note that page->dma_pinned_flags is unioned with page->lru.
> +	 * Therefore, the rules are: checking if any of the
> +	 * PAGE_DMA_PINNED_FLAGS bits are set may be done while page->lru
> +	 * is in use. However, setting those flags requires that
> +	 * the page is both locked, and also, removed from the LRU.
> +	 */
> +	ret = isolate_lru_page(page);
> +
> +	if (ret == 0) {
> +		/* Avoid problems later, when freeing the page: */
> +		ClearPageActive(page);
> +		ClearPageUnevictable(page);
> +
> +		/* counteract isolate_lru_page's effects: */
> +		put_page(page);
> +
> +		atomic_set(&page->dma_pinned_count, 1);
> +		SetPageDmaPinned(page);
> +	}
> +
> +unlock_out:
> +	spin_unlock(zone_gup_lock(zone));
> +
> +	return ret;
> +}
> +
>  static struct page *no_page_table(struct vm_area_struct *vma,
>  		unsigned int flags)
>  {
> @@ -659,7 +704,7 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
>  		unsigned int gup_flags, struct page **pages,
>  		struct vm_area_struct **vmas, int *nonblocking)
>  {
> -	long i = 0;
> +	long i = 0, j;
>  	int err = 0;
>  	unsigned int page_mask;
>  	struct vm_area_struct *vma = NULL;
> @@ -764,6 +809,10 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
>  	} while (nr_pages);
>  
>  out:
> +	if (pages)
> +		for (j = 0; j < i; j++)
> +			pin_page_for_dma(pages[j]);
> +
>  	return i ? i : err;
>  }
>  
> @@ -1843,7 +1892,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
>  			struct page **pages)
>  {
>  	unsigned long addr, len, end;
> -	int nr = 0, ret = 0;
> +	int nr = 0, ret = 0, i;
>  
>  	start &= PAGE_MASK;
>  	addr = start;
> @@ -1864,6 +1913,9 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
>  		ret = nr;
>  	}
>  
> +	for (i = 0; i < nr; i++)
> +		pin_page_for_dma(pages[i]);
> +
>  	if (nr < nr_pages) {
>  		/* Try to get the remaining pages with get_user_pages */
>  		start += nr << PAGE_SHIFT;
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index e6f0d5ef320a..510d442647c2 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2062,6 +2062,11 @@ static void lock_page_lru(struct page *page, int *isolated)
>  	if (PageLRU(page)) {
>  		struct lruvec *lruvec;
>  
> +		/* LRU and PageDmaPinned are mutually exclusive: they use the
> +		 * same fields in struct page, but for different purposes.
> +		 */
> +		VM_BUG_ON_PAGE(PageDmaPinned(page), page);
> +
>  		lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
>  		ClearPageLRU(page);
>  		del_page_from_lru_list(page, lruvec, page_lru(page));
> @@ -2079,6 +2084,8 @@ static void unlock_page_lru(struct page *page, int isolated)
>  
>  		lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
>  		VM_BUG_ON_PAGE(PageLRU(page), page);
> +		VM_BUG_ON_PAGE(PageDmaPinned(page_tail), page);
> +
>  		SetPageLRU(page);
>  		add_page_to_lru_list(page, lruvec, page_lru(page));
>  	}
> diff --git a/mm/swap.c b/mm/swap.c
> index 26fc9b5f1b6c..09ba61300d06 100644
> --- a/mm/swap.c
> +++ b/mm/swap.c
> @@ -52,6 +52,43 @@ static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs);
>  static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
>  #endif
>  
> +void __get_page_for_pinned_dma(struct page *page)
> +{
> +	struct zone *zone = page_zone(page);
> +
> +	spin_lock(zone_gup_lock(zone));
> +
> +	if (PageDmaPinned(page))
> +		atomic_inc(&page->dma_pinned_count);
> +
> +	spin_unlock(zone_gup_lock(zone));
> +}
> +EXPORT_SYMBOL(__get_page_for_pinned_dma);
> +
> +void __put_page_for_pinned_dma(struct page *page)
> +{
> +	struct zone *zone = page_zone(page);
> +
> +	if (atomic_dec_and_test(&page->dma_pinned_count)) {
> +		spin_lock(zone_gup_lock(zone));
> +
> +		VM_BUG_ON_PAGE(PageLRU(page), page);
> +
> +		/* Re-check while holding the lock, because
> +		 * pin_page_for_dma() or get_page() may have snuck in right
> +		 * after the atomic_dec_and_test, and raised the count
> +		 * above zero again. If so, just leave the flag set. And
> +		 * because the atomic_dec_and_test above already got the
> +		 * accounting correct, no other action is required.
> +		 */
> +		if (atomic_read(&page->dma_pinned_count) == 0)
> +			ClearPageDmaPinned(page);
> +
> +		spin_unlock(zone_gup_lock(zone));
> +	}
> +}
> +EXPORT_SYMBOL(__put_page_for_pinned_dma);
> +
>  /*
>   * This path almost never happens for VM activity - pages are normally
>   * freed via pagevecs.  But it gets used by networking.
> @@ -824,6 +861,11 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
>  	VM_BUG_ON_PAGE(!PageHead(page), page);
>  	VM_BUG_ON_PAGE(PageCompound(page_tail), page);
>  	VM_BUG_ON_PAGE(PageLRU(page_tail), page);
> +
> +	/* LRU and PageDmaPinned are mutually exclusive: they use the
> +	 * same fields in struct page, but for different purposes.
> +	 */
> +	VM_BUG_ON_PAGE(PageDmaPinned(page_tail), page);
>  	VM_BUG_ON(NR_CPUS != 1 &&
>  		  !spin_is_locked(&lruvec_pgdat(lruvec)->lru_lock));
>  
> @@ -863,6 +905,12 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
>  
>  	VM_BUG_ON_PAGE(PageLRU(page), page);
>  
> +	/* LRU and PageDmaPinned are mutually exclusive: they use the
> +	 * same fields in struct page, but for different purposes.
> +	 */
> +	if (PageDmaPinned(page))
> +		return;
> +
>  	SetPageLRU(page);
>  	/*
>  	 * Page becomes evictable in two ways:
> -- 
> 2.18.0
>
John Hubbard July 2, 2018, 8:43 p.m. UTC | #5
On 07/02/2018 02:53 AM, Jan Kara wrote:
> On Sun 01-07-18 17:56:53, john.hubbard@gmail.com wrote:
>> From: John Hubbard <jhubbard@nvidia.com>
>>
> ...
> 
>> @@ -904,12 +907,24 @@ static inline void get_page(struct page *page)
>>  	 */
>>  	VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page);
>>  	page_ref_inc(page);
>> +
>> +	if (unlikely(PageDmaPinned(page)))
>> +		__get_page_for_pinned_dma(page);
>>  }
>>  
>>  static inline void put_page(struct page *page)
>>  {
>>  	page = compound_head(page);
>>  
>> +	/* Because the page->dma_pinned_* fields are unioned with
>> +	 * page->lru, there is no way to do classical refcount-style
>> +	 * decrement-and-test-for-zero. Instead, PageDmaPinned(page) must
>> +	 * be checked, in order to safely check if we are allowed to decrement
>> +	 * page->dma_pinned_count at all.
>> +	 */
>> +	if (unlikely(PageDmaPinned(page)))
>> +		__put_page_for_pinned_dma(page);
>> +
> 
> These two are just wrong. You cannot make any page reference for
> PageDmaPinned() account against a pin count. First, it is just conceptually
> wrong as these references need not be long term pins, second, you can
> easily race like:
> 
> Pinner				Random process
> 				get_page(page)
> pin_page_for_dma()
> 				put_page(page)
> 				 -> oops, page gets unpinned too early
> 

I'll drop this approach, without mentioning any of the locking that is hiding in
there, since that was probably breaking other rules anyway. :) Thanks for your
patience in reviewing this.

> So you really have to create counterpart to get_user_pages() - like
> put_user_page() or whatever... It is inconvenient to have to modify all GUP
> users but I don't see a way around that. 

OK, there will be a long-ish pause, while I go visit all the gup sites. I count about
88 callers, which is not nearly as crazy as my first casual grep showed, but still
quite a chunk, since I have to track down where each one does its put_page call(s).

It's definitely worth the effort, though. These pins just plain need some special
handling in order to get everything correct.


thanks,
Christopher Lameter July 3, 2018, 12:08 a.m. UTC | #6
On Mon, 2 Jul 2018, John Hubbard wrote:

> >
> > These two are just wrong. You cannot make any page reference for
> > PageDmaPinned() account against a pin count. First, it is just conceptually
> > wrong as these references need not be long term pins, second, you can
> > easily race like:
> >
> > Pinner				Random process
> > 				get_page(page)
> > pin_page_for_dma()
> > 				put_page(page)
> > 				 -> oops, page gets unpinned too early
> >
>
> I'll drop this approach, without mentioning any of the locking that is hiding in
> there, since that was probably breaking other rules anyway. :) Thanks for your
> patience in reviewing this.

Mayb the following would work:

If you establish a reference to a page then increase the page count. If
the reference is a dma pin action also then increase the pinned count.

That way you know how many of the references to the page are dma
pins and you can correctly manage the state of the page if the dma pins go
away.
John Hubbard July 3, 2018, 4:30 a.m. UTC | #7
On 07/02/2018 05:08 PM, Christopher Lameter wrote:
> On Mon, 2 Jul 2018, John Hubbard wrote:
> 
>>>
>>> These two are just wrong. You cannot make any page reference for
>>> PageDmaPinned() account against a pin count. First, it is just conceptually
>>> wrong as these references need not be long term pins, second, you can
>>> easily race like:
>>>
>>> Pinner				Random process
>>> 				get_page(page)
>>> pin_page_for_dma()
>>> 				put_page(page)
>>> 				 -> oops, page gets unpinned too early
>>>
>>
>> I'll drop this approach, without mentioning any of the locking that is hiding in
>> there, since that was probably breaking other rules anyway. :) Thanks for your
>> patience in reviewing this.
> 
> Mayb the following would work:
> 
> If you establish a reference to a page then increase the page count. If
> the reference is a dma pin action also then increase the pinned count.
> 
> That way you know how many of the references to the page are dma
> pins and you can correctly manage the state of the page if the dma pins go
> away.
> 

I think this sounds like what this patch already does, right? See:
__put_page_for_pinned_dma(), __get_page_for_pinned_dma(), and 
pin_page_for_dma(). The locking seems correct to me, but I suspect it's 
too heavyweight for such a hot path. But without adding a new put_user_page()
call, that was the best I could come up with.

What I'm hearing now from Jan and Michal is that the desired end result is
a separate API call, put_user_pages(), so that we can explicitly manage
these pinned pages.

thanks,
Christopher Lameter July 3, 2018, 5:08 p.m. UTC | #8
On Mon, 2 Jul 2018, John Hubbard wrote:

> > If you establish a reference to a page then increase the page count. If
> > the reference is a dma pin action also then increase the pinned count.
> >
> > That way you know how many of the references to the page are dma
> > pins and you can correctly manage the state of the page if the dma pins go
> > away.
> >
>
> I think this sounds like what this patch already does, right? See:
> __put_page_for_pinned_dma(), __get_page_for_pinned_dma(), and
> pin_page_for_dma(). The locking seems correct to me, but I suspect it's
> too heavyweight for such a hot path. But without adding a new put_user_page()
> call, that was the best I could come up with.

When I saw the patch it looked like you were avoiding to increment the
page->count field.

> What I'm hearing now from Jan and Michal is that the desired end result is
> a separate API call, put_user_pages(), so that we can explicitly manage
> these pinned pages.

Certainly a good approach.
John Hubbard July 3, 2018, 5:36 p.m. UTC | #9
On 07/03/2018 10:08 AM, Christopher Lameter wrote:
> On Mon, 2 Jul 2018, John Hubbard wrote:
> 
>>> If you establish a reference to a page then increase the page count. If
>>> the reference is a dma pin action also then increase the pinned count.
>>>
>>> That way you know how many of the references to the page are dma
>>> pins and you can correctly manage the state of the page if the dma pins go
>>> away.
>>>
>>
>> I think this sounds like what this patch already does, right? See:
>> __put_page_for_pinned_dma(), __get_page_for_pinned_dma(), and
>> pin_page_for_dma(). The locking seems correct to me, but I suspect it's
>> too heavyweight for such a hot path. But without adding a new put_user_page()
>> call, that was the best I could come up with.
> 
> When I saw the patch it looked like you were avoiding to increment the
> page->count field.

Looking at it again, this patch is definitely susceptible to Jan's "page gets
dma-unpinnned too soon" problem.  That leaves a window in which the original
problem can occur.

The page->_refcount field is used normally, in addition to the dma_pinned_count.
But the problem is that, unless the caller knows what kind of page it is,
the page->dma_pinned_count cannot be looked at, because it is unioned with
page->lru.prev.  page->dma_pinned_flags, at least starting at bit 1, are 
safe to look at due to pointer alignment, but now you cannot atomically 
count...

So this seems unsolvable without having the caller specify that it knows the
page type, and that it is therefore safe to decrement page->dma_pinned_count.
I was hoping I'd found a way, but clearly I haven't. :)


thanks,
Christopher Lameter July 3, 2018, 5:48 p.m. UTC | #10
On Tue, 3 Jul 2018, John Hubbard wrote:

> The page->_refcount field is used normally, in addition to the dma_pinned_count.
> But the problem is that, unless the caller knows what kind of page it is,
> the page->dma_pinned_count cannot be looked at, because it is unioned with
> page->lru.prev.  page->dma_pinned_flags, at least starting at bit 1, are
> safe to look at due to pointer alignment, but now you cannot atomically
> count...
>
> So this seems unsolvable without having the caller specify that it knows the
> page type, and that it is therefore safe to decrement page->dma_pinned_count.
> I was hoping I'd found a way, but clearly I haven't. :)

Try to find some way to indicate that the page is pinned by using some of
the existing page flags? There is already an MLOCK flag. Maybe some
creativity with that can lead to something (but then the MLOCKed pages are
on the unevictable LRU....). cgroups used to have something called struct
page_ext. Oh its there in linux/mm/page_ext.c.
John Hubbard July 3, 2018, 6:48 p.m. UTC | #11
On 07/03/2018 10:48 AM, Christopher Lameter wrote:
> On Tue, 3 Jul 2018, John Hubbard wrote:
> 
>> The page->_refcount field is used normally, in addition to the dma_pinned_count.
>> But the problem is that, unless the caller knows what kind of page it is,
>> the page->dma_pinned_count cannot be looked at, because it is unioned with
>> page->lru.prev.  page->dma_pinned_flags, at least starting at bit 1, are
>> safe to look at due to pointer alignment, but now you cannot atomically
>> count...
>>
>> So this seems unsolvable without having the caller specify that it knows the
>> page type, and that it is therefore safe to decrement page->dma_pinned_count.
>> I was hoping I'd found a way, but clearly I haven't. :)
> 
> Try to find some way to indicate that the page is pinned by using some of
> the existing page flags? There is already an MLOCK flag. Maybe some
> creativity with that can lead to something (but then the MLOCKed pages are
> on the unevictable LRU....). cgroups used to have something called struct
> page_ext. Oh its there in linux/mm/page_ext.c.
> 

Yes, that would provide just a touch more cabability: we could both read and
write a dma-pinned page(_ext) flag safely, instead of only being able to just 
read.  I'm doubt that that's enough additional information, though. The general
problem of allowing random put_page() calls to decrement the dma-pinned count (see
Jan's diagram at the beginning of this thread) cannot be solved by anything less
than some sort of "who (or which special type of caller, at least) owns this page"
approach, as far as I can see. The put_user_pages() provides arguably the simplest 
version of that kind of solution.

Also, even just using a single bit from page extensions would cost some extra memory, 
because for example on 64-bit systems many configurations do not need the additional 
flags that page_ext.h provides, so they return "false" from the page_ext_operations.need() 
callback. Changing get_user_pages to require page extensions would lead to
*every* configuration requiring page extensions, so 64-bit users would lose some memory
for sure. On the other hand, it avoids the "take page off of the LRU" complexity that 
I've got here. But again, I don't think a single flag, or even a count, would actually 
solve the problem.
Jan Kara July 4, 2018, 10:43 a.m. UTC | #12
On Tue 03-07-18 10:36:05, John Hubbard wrote:
> On 07/03/2018 10:08 AM, Christopher Lameter wrote:
> > On Mon, 2 Jul 2018, John Hubbard wrote:
> > 
> >>> If you establish a reference to a page then increase the page count. If
> >>> the reference is a dma pin action also then increase the pinned count.
> >>>
> >>> That way you know how many of the references to the page are dma
> >>> pins and you can correctly manage the state of the page if the dma pins go
> >>> away.
> >>>
> >>
> >> I think this sounds like what this patch already does, right? See:
> >> __put_page_for_pinned_dma(), __get_page_for_pinned_dma(), and
> >> pin_page_for_dma(). The locking seems correct to me, but I suspect it's
> >> too heavyweight for such a hot path. But without adding a new put_user_page()
> >> call, that was the best I could come up with.
> > 
> > When I saw the patch it looked like you were avoiding to increment the
> > page->count field.
> 
> Looking at it again, this patch is definitely susceptible to Jan's "page gets
> dma-unpinnned too soon" problem.  That leaves a window in which the original
> problem can occur.
> 
> The page->_refcount field is used normally, in addition to the dma_pinned_count.
> But the problem is that, unless the caller knows what kind of page it is,
> the page->dma_pinned_count cannot be looked at, because it is unioned with
> page->lru.prev.  page->dma_pinned_flags, at least starting at bit 1, are 
> safe to look at due to pointer alignment, but now you cannot atomically 
> count...
> 
> So this seems unsolvable without having the caller specify that it knows the
> page type, and that it is therefore safe to decrement page->dma_pinned_count.
> I was hoping I'd found a way, but clearly I haven't. :)

Well, I think the misconception is that "pinned" is a fundamental property
of a page. It is not. "pinned" is a property of a page reference (i.e., a
kind of reference that can be used for DMA access) and page gets into
"pinned" state if it has any reference of "pinned" type. And when you
realize this, it is obvious that you just have to have a special api for
getting and dropping references of this "pinned" type. For getting we
already have get_user_pages(), for putting we have to create the api...

								Honza
Christopher Lameter July 5, 2018, 2:17 p.m. UTC | #13
On Wed, 4 Jul 2018, Jan Kara wrote:

> > So this seems unsolvable without having the caller specify that it knows the
> > page type, and that it is therefore safe to decrement page->dma_pinned_count.
> > I was hoping I'd found a way, but clearly I haven't. :)
>
> Well, I think the misconception is that "pinned" is a fundamental property
> of a page. It is not. "pinned" is a property of a page reference (i.e., a
> kind of reference that can be used for DMA access) and page gets into
> "pinned" state if it has any reference of "pinned" type. And when you
> realize this, it is obvious that you just have to have a special api for
> getting and dropping references of this "pinned" type. For getting we
> already have get_user_pages(), for putting we have to create the api...

Maybe we can do something by creating a special "pinned" bit in the pte?
If it is a RDMA reference then set that pinned bit there.

Thus any of the references could cause a pin. Since the page struct does
not contain that information we therefore have to scan through the ptes to
figure out if a page is pinned?

If so then we would not need a special function for dropping the
reference.

References to a page can also be created from devices mmu. Maybe we could
at some point start to manage them in a similar way to the page tables of
the processor? The mmu notifiers are a bit awkward if we need closer mm
integration.
Jan Kara July 9, 2018, 1:49 p.m. UTC | #14
On Thu 05-07-18 14:17:19, Christopher Lameter wrote:
> On Wed, 4 Jul 2018, Jan Kara wrote:
> 
> > > So this seems unsolvable without having the caller specify that it knows the
> > > page type, and that it is therefore safe to decrement page->dma_pinned_count.
> > > I was hoping I'd found a way, but clearly I haven't. :)
> >
> > Well, I think the misconception is that "pinned" is a fundamental property
> > of a page. It is not. "pinned" is a property of a page reference (i.e., a
> > kind of reference that can be used for DMA access) and page gets into
> > "pinned" state if it has any reference of "pinned" type. And when you
> > realize this, it is obvious that you just have to have a special api for
> > getting and dropping references of this "pinned" type. For getting we
> > already have get_user_pages(), for putting we have to create the api...
> 
> Maybe we can do something by creating a special "pinned" bit in the pte?
> If it is a RDMA reference then set that pinned bit there.
> 
> Thus any of the references could cause a pin. Since the page struct does
> not contain that information we therefore have to scan through the ptes to
> figure out if a page is pinned?
> 
> If so then we would not need a special function for dropping the
> reference.

I don't really see how a PTE bit would help in getting rid of the special
function for dropping "pinned" reference. You still need to distinguish
preexisting page references (and corresponding page ref drops which must
not unpin the page) from the references acquired after transitioning PTE to
the pinned state...

								Honza
diff mbox

Patch

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3094500f5cff..aeba3a13a2e3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -895,6 +895,9 @@  static inline bool is_device_public_page(const struct page *page)
 }
 #endif /* CONFIG_DEV_PAGEMAP_OPS */
 
+void __put_page_for_pinned_dma(struct page *page);
+void __get_page_for_pinned_dma(struct page *page);
+
 static inline void get_page(struct page *page)
 {
 	page = compound_head(page);
@@ -904,12 +907,24 @@  static inline void get_page(struct page *page)
 	 */
 	VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page);
 	page_ref_inc(page);
+
+	if (unlikely(PageDmaPinned(page)))
+		__get_page_for_pinned_dma(page);
 }
 
 static inline void put_page(struct page *page)
 {
 	page = compound_head(page);
 
+	/* Because the page->dma_pinned_* fields are unioned with
+	 * page->lru, there is no way to do classical refcount-style
+	 * decrement-and-test-for-zero. Instead, PageDmaPinned(page) must
+	 * be checked, in order to safely check if we are allowed to decrement
+	 * page->dma_pinned_count at all.
+	 */
+	if (unlikely(PageDmaPinned(page)))
+		__put_page_for_pinned_dma(page);
+
 	/*
 	 * For devmap managed pages we need to catch refcount transition from
 	 * 2 to 1, when refcount reach one it means the page is free and we
diff --git a/mm/gup.c b/mm/gup.c
index 73f0b3316fa7..e5c0104fd234 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -20,6 +20,51 @@ 
 
 #include "internal.h"
 
+static int pin_page_for_dma(struct page *page)
+{
+	int ret = 0;
+	struct zone *zone;
+
+	page = compound_head(page);
+	zone = page_zone(page);
+
+	spin_lock(zone_gup_lock(zone));
+
+	if (PageDmaPinned(page)) {
+		/* Page was not on an LRU list, because it was DMA-pinned. */
+		VM_BUG_ON_PAGE(PageLRU(page), page);
+
+		atomic_inc(&page->dma_pinned_count);
+		goto unlock_out;
+	}
+
+	/*
+	 * Note that page->dma_pinned_flags is unioned with page->lru.
+	 * Therefore, the rules are: checking if any of the
+	 * PAGE_DMA_PINNED_FLAGS bits are set may be done while page->lru
+	 * is in use. However, setting those flags requires that
+	 * the page is both locked, and also, removed from the LRU.
+	 */
+	ret = isolate_lru_page(page);
+
+	if (ret == 0) {
+		/* Avoid problems later, when freeing the page: */
+		ClearPageActive(page);
+		ClearPageUnevictable(page);
+
+		/* counteract isolate_lru_page's effects: */
+		put_page(page);
+
+		atomic_set(&page->dma_pinned_count, 1);
+		SetPageDmaPinned(page);
+	}
+
+unlock_out:
+	spin_unlock(zone_gup_lock(zone));
+
+	return ret;
+}
+
 static struct page *no_page_table(struct vm_area_struct *vma,
 		unsigned int flags)
 {
@@ -659,7 +704,7 @@  static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		unsigned int gup_flags, struct page **pages,
 		struct vm_area_struct **vmas, int *nonblocking)
 {
-	long i = 0;
+	long i = 0, j;
 	int err = 0;
 	unsigned int page_mask;
 	struct vm_area_struct *vma = NULL;
@@ -764,6 +809,10 @@  static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 	} while (nr_pages);
 
 out:
+	if (pages)
+		for (j = 0; j < i; j++)
+			pin_page_for_dma(pages[j]);
+
 	return i ? i : err;
 }
 
@@ -1843,7 +1892,7 @@  int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 			struct page **pages)
 {
 	unsigned long addr, len, end;
-	int nr = 0, ret = 0;
+	int nr = 0, ret = 0, i;
 
 	start &= PAGE_MASK;
 	addr = start;
@@ -1864,6 +1913,9 @@  int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 		ret = nr;
 	}
 
+	for (i = 0; i < nr; i++)
+		pin_page_for_dma(pages[i]);
+
 	if (nr < nr_pages) {
 		/* Try to get the remaining pages with get_user_pages */
 		start += nr << PAGE_SHIFT;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e6f0d5ef320a..510d442647c2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2062,6 +2062,11 @@  static void lock_page_lru(struct page *page, int *isolated)
 	if (PageLRU(page)) {
 		struct lruvec *lruvec;
 
+		/* LRU and PageDmaPinned are mutually exclusive: they use the
+		 * same fields in struct page, but for different purposes.
+		 */
+		VM_BUG_ON_PAGE(PageDmaPinned(page), page);
+
 		lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
 		ClearPageLRU(page);
 		del_page_from_lru_list(page, lruvec, page_lru(page));
@@ -2079,6 +2084,8 @@  static void unlock_page_lru(struct page *page, int isolated)
 
 		lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
 		VM_BUG_ON_PAGE(PageLRU(page), page);
+		VM_BUG_ON_PAGE(PageDmaPinned(page_tail), page);
+
 		SetPageLRU(page);
 		add_page_to_lru_list(page, lruvec, page_lru(page));
 	}
diff --git a/mm/swap.c b/mm/swap.c
index 26fc9b5f1b6c..09ba61300d06 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -52,6 +52,43 @@  static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs);
 static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
 #endif
 
+void __get_page_for_pinned_dma(struct page *page)
+{
+	struct zone *zone = page_zone(page);
+
+	spin_lock(zone_gup_lock(zone));
+
+	if (PageDmaPinned(page))
+		atomic_inc(&page->dma_pinned_count);
+
+	spin_unlock(zone_gup_lock(zone));
+}
+EXPORT_SYMBOL(__get_page_for_pinned_dma);
+
+void __put_page_for_pinned_dma(struct page *page)
+{
+	struct zone *zone = page_zone(page);
+
+	if (atomic_dec_and_test(&page->dma_pinned_count)) {
+		spin_lock(zone_gup_lock(zone));
+
+		VM_BUG_ON_PAGE(PageLRU(page), page);
+
+		/* Re-check while holding the lock, because
+		 * pin_page_for_dma() or get_page() may have snuck in right
+		 * after the atomic_dec_and_test, and raised the count
+		 * above zero again. If so, just leave the flag set. And
+		 * because the atomic_dec_and_test above already got the
+		 * accounting correct, no other action is required.
+		 */
+		if (atomic_read(&page->dma_pinned_count) == 0)
+			ClearPageDmaPinned(page);
+
+		spin_unlock(zone_gup_lock(zone));
+	}
+}
+EXPORT_SYMBOL(__put_page_for_pinned_dma);
+
 /*
  * This path almost never happens for VM activity - pages are normally
  * freed via pagevecs.  But it gets used by networking.
@@ -824,6 +861,11 @@  void lru_add_page_tail(struct page *page, struct page *page_tail,
 	VM_BUG_ON_PAGE(!PageHead(page), page);
 	VM_BUG_ON_PAGE(PageCompound(page_tail), page);
 	VM_BUG_ON_PAGE(PageLRU(page_tail), page);
+
+	/* LRU and PageDmaPinned are mutually exclusive: they use the
+	 * same fields in struct page, but for different purposes.
+	 */
+	VM_BUG_ON_PAGE(PageDmaPinned(page_tail), page);
 	VM_BUG_ON(NR_CPUS != 1 &&
 		  !spin_is_locked(&lruvec_pgdat(lruvec)->lru_lock));
 
@@ -863,6 +905,12 @@  static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
 
 	VM_BUG_ON_PAGE(PageLRU(page), page);
 
+	/* LRU and PageDmaPinned are mutually exclusive: they use the
+	 * same fields in struct page, but for different purposes.
+	 */
+	if (PageDmaPinned(page))
+		return;
+
 	SetPageLRU(page);
 	/*
 	 * Page becomes evictable in two ways: