Message ID | 20180702005654.20369-6-jhubbard@nvidia.com (mailing list archive) |
---|---|
State | New, archived |
Hi John,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v4.18-rc3]
[cannot apply to next-20180629]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/john-hubbard-gmail-com/mm-fs-gup-don-t-unmap-or-drop-filesystem-buffers/20180702-090125
config: i386-randconfig-x074-201826 (attached as .config)
compiler: gcc-7 (Debian 7.3.0-16) 7.3.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386

All error/warnings (new ones prefixed by >>):

   In file included from include/asm-generic/atomic-instrumented.h:16:0,
                    from arch/x86/include/asm/atomic.h:283,
                    from include/linux/atomic.h:5,
                    from include/linux/page_counter.h:5,
                    from mm/memcontrol.c:34:
   mm/memcontrol.c: In function 'unlock_page_lru':
>> mm/memcontrol.c:2087:32: error: 'page_tail' undeclared (first use in this function); did you mean 'page_pool'?
      VM_BUG_ON_PAGE(PageDmaPinned(page_tail), page);
                                   ^
   include/linux/build_bug.h:36:63: note: in definition of macro 'BUILD_BUG_ON_INVALID'
    #define BUILD_BUG_ON_INVALID(e) ((void)(sizeof((__force long)(e))))
                                                                  ^
>> include/linux/mmdebug.h:46:36: note: in expansion of macro 'VM_BUG_ON'
    #define VM_BUG_ON_PAGE(cond, page) VM_BUG_ON(cond)
                                       ^~~~~~~~~
>> mm/memcontrol.c:2087:3: note: in expansion of macro 'VM_BUG_ON_PAGE'
      VM_BUG_ON_PAGE(PageDmaPinned(page_tail), page);
      ^~~~~~~~~~~~~~
   mm/memcontrol.c:2087:32: note: each undeclared identifier is reported only once for each function it appears in
      VM_BUG_ON_PAGE(PageDmaPinned(page_tail), page);
                                   ^
   include/linux/build_bug.h:36:63: note: in definition of macro 'BUILD_BUG_ON_INVALID'
    #define BUILD_BUG_ON_INVALID(e) ((void)(sizeof((__force long)(e))))
                                                                  ^
>> include/linux/mmdebug.h:46:36: note: in expansion of macro 'VM_BUG_ON'
    #define VM_BUG_ON_PAGE(cond, page) VM_BUG_ON(cond)
                                       ^~~~~~~~~
>> mm/memcontrol.c:2087:3: note: in expansion of macro 'VM_BUG_ON_PAGE'
      VM_BUG_ON_PAGE(PageDmaPinned(page_tail), page);
      ^~~~~~~~~~~~~~

vim +2087 mm/memcontrol.c

  2077
  2078  static void unlock_page_lru(struct page *page, int isolated)
  2079  {
  2080          struct zone *zone = page_zone(page);
  2081
  2082          if (isolated) {
  2083                  struct lruvec *lruvec;
  2084
  2085                  lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
  2086                  VM_BUG_ON_PAGE(PageLRU(page), page);
> 2087                  VM_BUG_ON_PAGE(PageDmaPinned(page_tail), page);
  2088
  2089                  SetPageLRU(page);
  2090                  add_page_to_lru_list(page, lruvec, page_lru(page));
  2091          }
  2092          spin_unlock_irq(zone_lru_lock(zone));
  2093  }
  2094

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
Hi John,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on linus/master]
[also build test WARNING on v4.18-rc3]
[cannot apply to next-20180629]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/john-hubbard-gmail-com/mm-fs-gup-don-t-unmap-or-drop-filesystem-buffers/20180702-090125
config: x86_64-randconfig-x010-201826 (attached as .config)
compiler: gcc-7 (Debian 7.3.0-16) 7.3.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=x86_64

All warnings (new ones prefixed by >>):

   In file included from arch/x86/include/asm/atomic.h:5:0,
                    from include/linux/atomic.h:5,
                    from include/linux/page_counter.h:5,
                    from mm/memcontrol.c:34:
   mm/memcontrol.c: In function 'unlock_page_lru':
   mm/memcontrol.c:2087:32: error: 'page_tail' undeclared (first use in this function); did you mean 'page_pool'?
      VM_BUG_ON_PAGE(PageDmaPinned(page_tail), page);
                                   ^
   include/linux/compiler.h:58:30: note: in definition of macro '__trace_if'
     if (__builtin_constant_p(!!(cond)) ? !!(cond) :                        \
                                 ^~~~
>> include/linux/mmdebug.h:21:3: note: in expansion of macro 'if'
      if (unlikely(cond)) {                                   \
      ^~
   include/linux/compiler.h:48:24: note: in expansion of macro '__branch_check__'
    # define unlikely(x) (__branch_check__(x, 0, __builtin_constant_p(x)))
                          ^~~~~~~~~~~~~~~~
>> include/linux/mmdebug.h:21:7: note: in expansion of macro 'unlikely'
      if (unlikely(cond)) {                                   \
          ^~~~~~~~
   mm/memcontrol.c:2087:3: note: in expansion of macro 'VM_BUG_ON_PAGE'
      VM_BUG_ON_PAGE(PageDmaPinned(page_tail), page);
      ^~~~~~~~~~~~~~
   mm/memcontrol.c:2087:32: note: each undeclared identifier is reported only once for each function it appears in
      VM_BUG_ON_PAGE(PageDmaPinned(page_tail), page);
                                   ^
   include/linux/compiler.h:58:30: note: in definition of macro '__trace_if'
     if (__builtin_constant_p(!!(cond)) ? !!(cond) :                        \
                                 ^~~~
>> include/linux/mmdebug.h:21:3: note: in expansion of macro 'if'
      if (unlikely(cond)) {                                   \
      ^~
   include/linux/compiler.h:48:24: note: in expansion of macro '__branch_check__'
    # define unlikely(x) (__branch_check__(x, 0, __builtin_constant_p(x)))
                          ^~~~~~~~~~~~~~~~
>> include/linux/mmdebug.h:21:7: note: in expansion of macro 'unlikely'
      if (unlikely(cond)) {                                   \
          ^~~~~~~~
   mm/memcontrol.c:2087:3: note: in expansion of macro 'VM_BUG_ON_PAGE'
      VM_BUG_ON_PAGE(PageDmaPinned(page_tail), page);
      ^~~~~~~~~~~~~~

vim +/if +21 include/linux/mmdebug.h

309381feae Sasha Levin            2014-01-23  16
59ea746337 Jiri Slaby             2008-06-12  17  #ifdef CONFIG_DEBUG_VM
59ea746337 Jiri Slaby             2008-06-12  18  #define VM_BUG_ON(cond) BUG_ON(cond)
309381feae Sasha Levin            2014-01-23  19  #define VM_BUG_ON_PAGE(cond, page)                               \
e4f674229c Dave Hansen            2014-06-04  20          do {                                                     \
e4f674229c Dave Hansen            2014-06-04 @21                  if (unlikely(cond)) {                            \
e4f674229c Dave Hansen            2014-06-04  22                          dump_page(page, "VM_BUG_ON_PAGE(" __stringify(cond)")");\
e4f674229c Dave Hansen            2014-06-04  23                          BUG();                                   \
e4f674229c Dave Hansen            2014-06-04  24                  }                                                \
e4f674229c Dave Hansen            2014-06-04  25          } while (0)
fa3759ccd5 Sasha Levin            2014-10-09  26  #define VM_BUG_ON_VMA(cond, vma)                                 \
fa3759ccd5 Sasha Levin            2014-10-09  27          do {                                                     \
fa3759ccd5 Sasha Levin            2014-10-09  28                  if (unlikely(cond)) {                            \
fa3759ccd5 Sasha Levin            2014-10-09  29                          dump_vma(vma);                           \
fa3759ccd5 Sasha Levin            2014-10-09  30                          BUG();                                   \
fa3759ccd5 Sasha Levin            2014-10-09  31                  }                                                \
fa3759ccd5 Sasha Levin            2014-10-09  32          } while (0)
31c9afa6db Sasha Levin            2014-10-09  33  #define VM_BUG_ON_MM(cond, mm)                                   \
31c9afa6db Sasha Levin            2014-10-09  34          do {                                                     \
31c9afa6db Sasha Levin            2014-10-09  35                  if (unlikely(cond)) {                            \
31c9afa6db Sasha Levin            2014-10-09  36                          dump_mm(mm);                             \
31c9afa6db Sasha Levin            2014-10-09  37                          BUG();                                   \
31c9afa6db Sasha Levin            2014-10-09  38                  }                                                \
31c9afa6db Sasha Levin            2014-10-09  39          } while (0)
91241681c6 Michal Hocko           2018-04-05  40  #define VM_WARN_ON(cond) (void)WARN_ON(cond)
91241681c6 Michal Hocko           2018-04-05  41  #define VM_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond)
91241681c6 Michal Hocko           2018-04-05  42  #define VM_WARN_ONCE(cond, format...) (void)WARN_ONCE(cond, format)
91241681c6 Michal Hocko           2018-04-05  43  #define VM_WARN(cond, format...) (void)WARN(cond, format)
59ea746337 Jiri Slaby             2008-06-12  44  #else
02602a18c3 Konstantin Khlebnikov  2012-05-29  45  #define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond)
309381feae Sasha Levin            2014-01-23  46  #define VM_BUG_ON_PAGE(cond, page) VM_BUG_ON(cond)
fa3759ccd5 Sasha Levin            2014-10-09  47  #define VM_BUG_ON_VMA(cond, vma) VM_BUG_ON(cond)
31c9afa6db Sasha Levin            2014-10-09  48  #define VM_BUG_ON_MM(cond, mm) VM_BUG_ON(cond)
02a8efeda8 Andrew Morton          2014-06-04  49  #define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond)
02a8efeda8 Andrew Morton          2014-06-04  50  #define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond)
ef6b571fb8 Andrew Morton          2014-08-06  51  #define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond)
a54f9aebaa Aneesh Kumar K.V       2016-07-26  52  #define VM_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond)
59ea746337 Jiri Slaby             2008-06-12  53  #endif
59ea746337 Jiri Slaby             2008-06-12  54

:::::: The code at line 21 was first introduced by commit
:::::: e4f674229ce63dac60be0c4ddfb5ef8d1225d30d mm: pass VM_BUG_ON() reason to dump_page()

:::::: TO: Dave Hansen <dave.hansen@linux.intel.com>
:::::: CC: Linus Torvalds <torvalds@linux-foundation.org>

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
On 07/01/2018 07:58 PM, kbuild test robot wrote:
> Hi John,
>
> Thank you for the patch! Perhaps something to improve:
>
> [auto build test WARNING on linus/master]
> [also build test WARNING on v4.18-rc3]
> [cannot apply to next-20180629]
> [if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
>
> url:    https://github.com/0day-ci/linux/commits/john-hubbard-gmail-com/mm-fs-gup-don-t-unmap-or-drop-filesystem-buffers/20180702-090125
> config: x86_64-randconfig-x010-201826 (attached as .config)
> compiler: gcc-7 (Debian 7.3.0-16) 7.3.0
> reproduce:
>         # save the attached .config to linux build tree
>         make ARCH=x86_64
>
> All warnings (new ones prefixed by >>):
>
>    In file included from arch/x86/include/asm/atomic.h:5:0,
>                     from include/linux/atomic.h:5,
>                     from include/linux/page_counter.h:5,
>                     from mm/memcontrol.c:34:
>    mm/memcontrol.c: In function 'unlock_page_lru':
>    mm/memcontrol.c:2087:32: error: 'page_tail' undeclared (first use in this function); did you mean 'page_pool'?
>       VM_BUG_ON_PAGE(PageDmaPinned(page_tail), page);
>                                    ^

Yes, that should have been:

        VM_BUG_ON_PAGE(PageDmaPinned(page), page);

Fixed locally...maybe I'll post a v3 right now, as there were half a dozen
ridiculous typos that snuck in.

thanks,
On Sun 01-07-18 17:56:53, john.hubbard@gmail.com wrote:
> From: John Hubbard <jhubbard@nvidia.com>
>
> This patch sets and restores the new page->dma_pinned_flags and
> page->dma_pinned_count fields, but does not actually use them for
> anything yet.
>
> In order to use these fields at all, the page must be removed from
> any LRU list that it's on. The patch also adds some precautions that
> prevent the page from getting moved back onto an LRU, once it is
> in this state.
>
> This is in preparation to fix some problems that came up when using
> devices (NICs, GPUs, for example) that set up direct access to a chunk
> of system (CPU) memory, so that they can DMA to/from that memory.
>
> CC: Matthew Wilcox <willy@infradead.org>
> CC: Jan Kara <jack@suse.cz>
> CC: Dan Williams <dan.j.williams@intel.com>
> Signed-off-by: John Hubbard <jhubbard@nvidia.com>

...

> @@ -904,12 +907,24 @@ static inline void get_page(struct page *page)
>  	 */
>  	VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page);
>  	page_ref_inc(page);
> +
> +	if (unlikely(PageDmaPinned(page)))
> +		__get_page_for_pinned_dma(page);
>  }
>
>  static inline void put_page(struct page *page)
>  {
>  	page = compound_head(page);
>
> +	/* Because the page->dma_pinned_* fields are unioned with
> +	 * page->lru, there is no way to do classical refcount-style
> +	 * decrement-and-test-for-zero. Instead, PageDmaPinned(page) must
> +	 * be checked, in order to safely check if we are allowed to decrement
> +	 * page->dma_pinned_count at all.
> +	 */
> +	if (unlikely(PageDmaPinned(page)))
> +		__put_page_for_pinned_dma(page);
> +

These two are just wrong. You cannot make any page reference for
PageDmaPinned() account against a pin count. First, it is just conceptually
wrong as these references need not be long term pins, second, you can
easily race like:

Pinner				Random process
				get_page(page)
pin_page_for_dma()
				put_page(page)
				  -> oops, page gets unpinned too early

So you really have to create counterpart to get_user_pages() - like
put_user_page() or whatever... It is inconvenient to have to modify all GUP
users but I don't see a way around that.

								Honza

>  	/*
>  	 * For devmap managed pages we need to catch refcount transition from
>  	 * 2 to 1, when refcount reach one it means the page is free and we
> diff --git a/mm/gup.c b/mm/gup.c
> index 73f0b3316fa7..e5c0104fd234 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -20,6 +20,51 @@
>
>  #include "internal.h"
>
> +static int pin_page_for_dma(struct page *page)
> +{
> +	int ret = 0;
> +	struct zone *zone;
> +
> +	page = compound_head(page);
> +	zone = page_zone(page);
> +
> +	spin_lock(zone_gup_lock(zone));
> +
> +	if (PageDmaPinned(page)) {
> +		/* Page was not on an LRU list, because it was DMA-pinned. */
> +		VM_BUG_ON_PAGE(PageLRU(page), page);
> +
> +		atomic_inc(&page->dma_pinned_count);
> +		goto unlock_out;
> +	}
> +
> +	/*
> +	 * Note that page->dma_pinned_flags is unioned with page->lru.
> +	 * Therefore, the rules are: checking if any of the
> +	 * PAGE_DMA_PINNED_FLAGS bits are set may be done while page->lru
> +	 * is in use. However, setting those flags requires that
> +	 * the page is both locked, and also, removed from the LRU.
> +	 */
> +	ret = isolate_lru_page(page);
> +
> +	if (ret == 0) {
> +		/* Avoid problems later, when freeing the page: */
> +		ClearPageActive(page);
> +		ClearPageUnevictable(page);
> +
> +		/* counteract isolate_lru_page's effects: */
> +		put_page(page);
> +
> +		atomic_set(&page->dma_pinned_count, 1);
> +		SetPageDmaPinned(page);
> +	}
> +
> +unlock_out:
> +	spin_unlock(zone_gup_lock(zone));
> +
> +	return ret;
> +}
> +
>  static struct page *no_page_table(struct vm_area_struct *vma,
>  		unsigned int flags)
>  {
> @@ -659,7 +704,7 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
>  		unsigned int gup_flags, struct page **pages,
>  		struct vm_area_struct **vmas, int *nonblocking)
>  {
> -	long i = 0;
> +	long i = 0, j;
>  	int err = 0;
>  	unsigned int page_mask;
>  	struct vm_area_struct *vma = NULL;
> @@ -764,6 +809,10 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
>  	} while (nr_pages);
>
>  out:
> +	if (pages)
> +		for (j = 0; j < i; j++)
> +			pin_page_for_dma(pages[j]);
> +
>  	return i ? i : err;
>  }
>
> @@ -1843,7 +1892,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
>  			struct page **pages)
>  {
>  	unsigned long addr, len, end;
> -	int nr = 0, ret = 0;
> +	int nr = 0, ret = 0, i;
>
>  	start &= PAGE_MASK;
>  	addr = start;
> @@ -1864,6 +1913,9 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
>  		ret = nr;
>  	}
>
> +	for (i = 0; i < nr; i++)
> +		pin_page_for_dma(pages[i]);
> +
>  	if (nr < nr_pages) {
>  		/* Try to get the remaining pages with get_user_pages */
>  		start += nr << PAGE_SHIFT;
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index e6f0d5ef320a..510d442647c2 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2062,6 +2062,11 @@ static void lock_page_lru(struct page *page, int *isolated)
>  	if (PageLRU(page)) {
>  		struct lruvec *lruvec;
>
> +		/* LRU and PageDmaPinned are mutually exclusive: they use the
> +		 * same fields in struct page, but for different purposes.
> +		 */
> +		VM_BUG_ON_PAGE(PageDmaPinned(page), page);
> +
>  		lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
>  		ClearPageLRU(page);
>  		del_page_from_lru_list(page, lruvec, page_lru(page));
> @@ -2079,6 +2084,8 @@ static void unlock_page_lru(struct page *page, int isolated)
>
>  		lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
>  		VM_BUG_ON_PAGE(PageLRU(page), page);
> +		VM_BUG_ON_PAGE(PageDmaPinned(page_tail), page);
> +
>  		SetPageLRU(page);
>  		add_page_to_lru_list(page, lruvec, page_lru(page));
>  	}
> diff --git a/mm/swap.c b/mm/swap.c
> index 26fc9b5f1b6c..09ba61300d06 100644
> --- a/mm/swap.c
> +++ b/mm/swap.c
> @@ -52,6 +52,43 @@ static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs);
>  static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
>  #endif
>
> +void __get_page_for_pinned_dma(struct page *page)
> +{
> +	struct zone *zone = page_zone(page);
> +
> +	spin_lock(zone_gup_lock(zone));
> +
> +	if (PageDmaPinned(page))
> +		atomic_inc(&page->dma_pinned_count);
> +
> +	spin_unlock(zone_gup_lock(zone));
> +}
> +EXPORT_SYMBOL(__get_page_for_pinned_dma);
> +
> +void __put_page_for_pinned_dma(struct page *page)
> +{
> +	struct zone *zone = page_zone(page);
> +
> +	if (atomic_dec_and_test(&page->dma_pinned_count)) {
> +		spin_lock(zone_gup_lock(zone));
> +
> +		VM_BUG_ON_PAGE(PageLRU(page), page);
> +
> +		/* Re-check while holding the lock, because
> +		 * pin_page_for_dma() or get_page() may have snuck in right
> +		 * after the atomic_dec_and_test, and raised the count
> +		 * above zero again. If so, just leave the flag set. And
> +		 * because the atomic_dec_and_test above already got the
> +		 * accounting correct, no other action is required.
> +		 */
> +		if (atomic_read(&page->dma_pinned_count) == 0)
> +			ClearPageDmaPinned(page);
> +
> +		spin_unlock(zone_gup_lock(zone));
> +	}
> +}
> +EXPORT_SYMBOL(__put_page_for_pinned_dma);
> +
>  /*
>   * This path almost never happens for VM activity - pages are normally
>   * freed via pagevecs. But it gets used by networking.
> @@ -824,6 +861,11 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
>  	VM_BUG_ON_PAGE(!PageHead(page), page);
>  	VM_BUG_ON_PAGE(PageCompound(page_tail), page);
>  	VM_BUG_ON_PAGE(PageLRU(page_tail), page);
> +
> +	/* LRU and PageDmaPinned are mutually exclusive: they use the
> +	 * same fields in struct page, but for different purposes.
> +	 */
> +	VM_BUG_ON_PAGE(PageDmaPinned(page_tail), page);
>  	VM_BUG_ON(NR_CPUS != 1 &&
>  			!spin_is_locked(&lruvec_pgdat(lruvec)->lru_lock));
>
> @@ -863,6 +905,12 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
>
>  	VM_BUG_ON_PAGE(PageLRU(page), page);
>
> +	/* LRU and PageDmaPinned are mutually exclusive: they use the
> +	 * same fields in struct page, but for different purposes.
> +	 */
> +	if (PageDmaPinned(page))
> +		return;
> +
>  	SetPageLRU(page);
>  	/*
>  	 * Page becomes evictable in two ways:
> --
> 2.18.0
>
On 07/02/2018 02:53 AM, Jan Kara wrote:
> On Sun 01-07-18 17:56:53, john.hubbard@gmail.com wrote:
>> From: John Hubbard <jhubbard@nvidia.com>
>>
> ...
>
>> @@ -904,12 +907,24 @@ static inline void get_page(struct page *page)
>>  	 */
>>  	VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page);
>>  	page_ref_inc(page);
>> +
>> +	if (unlikely(PageDmaPinned(page)))
>> +		__get_page_for_pinned_dma(page);
>>  }
>>
>>  static inline void put_page(struct page *page)
>>  {
>>  	page = compound_head(page);
>>
>> +	/* Because the page->dma_pinned_* fields are unioned with
>> +	 * page->lru, there is no way to do classical refcount-style
>> +	 * decrement-and-test-for-zero. Instead, PageDmaPinned(page) must
>> +	 * be checked, in order to safely check if we are allowed to decrement
>> +	 * page->dma_pinned_count at all.
>> +	 */
>> +	if (unlikely(PageDmaPinned(page)))
>> +		__put_page_for_pinned_dma(page);
>> +
>
> These two are just wrong. You cannot make any page reference for
> PageDmaPinned() account against a pin count. First, it is just conceptually
> wrong as these references need not be long term pins, second, you can
> easily race like:
>
> Pinner				Random process
> 				get_page(page)
> pin_page_for_dma()
> 				put_page(page)
> 				  -> oops, page gets unpinned too early
>

I'll drop this approach, without mentioning any of the locking that is hiding in
there, since that was probably breaking other rules anyway. :) Thanks for your
patience in reviewing this.

> So you really have to create counterpart to get_user_pages() - like
> put_user_page() or whatever... It is inconvenient to have to modify all GUP
> users but I don't see a way around that.

OK, there will be a long-ish pause, while I go visit all the gup sites. I count
about 88 callers, which is not nearly as crazy as my first casual grep showed,
but still quite a chunk, since I have to track down where each one does its
put_page call(s).

It's definitely worth the effort, though. These pins just plain need some
special handling in order to get everything correct.

thanks,
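To make the scope of that conversion concrete, here is a minimal sketch of what
one converted GUP call site could look like. put_user_page() is a hypothetical
name used purely for illustration; no such helper exists in the tree at this
point in the thread, and the eventual API could well look different.

        /*
         * Illustrative sketch only.  put_user_page() is an assumed (not yet
         * existing) helper that drops a reference obtained via GUP; the rest
         * is the existing GUP API.
         */
        #include <linux/mm.h>

        static int dma_to_user_buffer_example(unsigned long start, int nr_pages,
                                              struct page **pages)
        {
                int i, got;

                got = get_user_pages_fast(start, nr_pages, 1, pages);
                if (got <= 0)
                        return got;

                /* ... program the device and let it DMA into pages[0..got-1] ... */

                /*
                 * Today every such caller ends with put_page(pages[i]).  The
                 * proposal is that GUP callers would instead release their
                 * "pinned" references through a dedicated, GUP-aware helper,
                 * so the pin accounting stays correct.
                 */
                for (i = 0; i < got; i++)
                        put_user_page(pages[i]);        /* hypothetical API */

                return got;
        }

The only point of the sketch is the shape of the release step: a reference
obtained through get_user_pages() would be dropped through a pin-aware
counterpart rather than through the generic put_page().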
On Mon, 2 Jul 2018, John Hubbard wrote:

> >
> > These two are just wrong. You cannot make any page reference for
> > PageDmaPinned() account against a pin count. First, it is just conceptually
> > wrong as these references need not be long term pins, second, you can
> > easily race like:
> >
> > Pinner				Random process
> > 				get_page(page)
> > pin_page_for_dma()
> > 				put_page(page)
> > 				  -> oops, page gets unpinned too early
> >
>
> I'll drop this approach, without mentioning any of the locking that is hiding in
> there, since that was probably breaking other rules anyway. :) Thanks for your
> patience in reviewing this.

Maybe the following would work:

If you establish a reference to a page then increase the page count. If
the reference is a dma pin action also then increase the pinned count.

That way you know how many of the references to the page are dma
pins and you can correctly manage the state of the page if the dma pins go
away.
On 07/02/2018 05:08 PM, Christopher Lameter wrote:
> On Mon, 2 Jul 2018, John Hubbard wrote:
>
>>>
>>> These two are just wrong. You cannot make any page reference for
>>> PageDmaPinned() account against a pin count. First, it is just conceptually
>>> wrong as these references need not be long term pins, second, you can
>>> easily race like:
>>>
>>> Pinner				Random process
>>> 				get_page(page)
>>> pin_page_for_dma()
>>> 				put_page(page)
>>> 				  -> oops, page gets unpinned too early
>>>
>>
>> I'll drop this approach, without mentioning any of the locking that is hiding in
>> there, since that was probably breaking other rules anyway. :) Thanks for your
>> patience in reviewing this.
>
> Maybe the following would work:
>
> If you establish a reference to a page then increase the page count. If
> the reference is a dma pin action also then increase the pinned count.
>
> That way you know how many of the references to the page are dma
> pins and you can correctly manage the state of the page if the dma pins go
> away.
>

I think this sounds like what this patch already does, right? See:
__put_page_for_pinned_dma(), __get_page_for_pinned_dma(), and
pin_page_for_dma(). The locking seems correct to me, but I suspect it's
too heavyweight for such a hot path. But without adding a new put_user_page()
call, that was the best I could come up with.

What I'm hearing now from Jan and Michal is that the desired end result is
a separate API call, put_user_pages(), so that we can explicitly manage
these pinned pages.

thanks,
On Mon, 2 Jul 2018, John Hubbard wrote:

> > If you establish a reference to a page then increase the page count. If
> > the reference is a dma pin action also then increase the pinned count.
> >
> > That way you know how many of the references to the page are dma
> > pins and you can correctly manage the state of the page if the dma pins go
> > away.
> >
>
> I think this sounds like what this patch already does, right? See:
> __put_page_for_pinned_dma(), __get_page_for_pinned_dma(), and
> pin_page_for_dma(). The locking seems correct to me, but I suspect it's
> too heavyweight for such a hot path. But without adding a new put_user_page()
> call, that was the best I could come up with.

When I saw the patch it looked like you were avoiding incrementing the
page->count field.

> What I'm hearing now from Jan and Michal is that the desired end result is
> a separate API call, put_user_pages(), so that we can explicitly manage
> these pinned pages.

Certainly a good approach.
On 07/03/2018 10:08 AM, Christopher Lameter wrote:
> On Mon, 2 Jul 2018, John Hubbard wrote:
>
>>> If you establish a reference to a page then increase the page count. If
>>> the reference is a dma pin action also then increase the pinned count.
>>>
>>> That way you know how many of the references to the page are dma
>>> pins and you can correctly manage the state of the page if the dma pins go
>>> away.
>>>
>>
>> I think this sounds like what this patch already does, right? See:
>> __put_page_for_pinned_dma(), __get_page_for_pinned_dma(), and
>> pin_page_for_dma(). The locking seems correct to me, but I suspect it's
>> too heavyweight for such a hot path. But without adding a new put_user_page()
>> call, that was the best I could come up with.
>
> When I saw the patch it looked like you were avoiding incrementing the
> page->count field.

Looking at it again, this patch is definitely susceptible to Jan's "page gets
dma-unpinned too soon" problem. That leaves a window in which the original
problem can occur.

The page->_refcount field is used normally, in addition to the dma_pinned_count.
But the problem is that, unless the caller knows what kind of page it is,
the page->dma_pinned_count cannot be looked at, because it is unioned with
page->lru.prev. page->dma_pinned_flags, at least starting at bit 1, are
safe to look at due to pointer alignment, but now you cannot atomically
count...

So this seems unsolvable without having the caller specify that it knows the
page type, and that it is therefore safe to decrement page->dma_pinned_count.
I was hoping I'd found a way, but clearly I haven't. :)

thanks,
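For readers following the layout argument, here is an illustrative sketch of
the field overlap being described. The union below is an assumption made for
illustration only; the real definition lives in an earlier patch of this series
and may differ in detail.

        #include <linux/types.h>
        #include <linux/bitops.h>

        /* Assumed layout, for illustration only (not the series' actual code). */
        struct pinned_union_example {
                union {
                        struct list_head lru;                   /* lru.next / lru.prev */
                        struct {
                                unsigned long dma_pinned_flags; /* overlays lru.next   */
                                atomic_t dma_pinned_count;      /* overlays lru.prev   */
                        };
                };
        };

        /*
         * While the page may still be on an LRU, lru.next holds a pointer,
         * which is at least 4-byte aligned, so bits 1 and up read as zero
         * unless they were deliberately set: testing such a bit is safe.
         */
        static inline bool example_dma_pinned(const struct pinned_union_example *p)
        {
                return test_bit(1, &p->dma_pinned_flags);
        }

        /*
         * dma_pinned_count, by contrast, overlays lru.prev: while the page is
         * on an LRU the "count" is really pointer bits, so it cannot be read
         * or atomically modified unless the caller already knows the page has
         * been switched into the DMA-pinned state.
         */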
On Tue, 3 Jul 2018, John Hubbard wrote:

> The page->_refcount field is used normally, in addition to the dma_pinned_count.
> But the problem is that, unless the caller knows what kind of page it is,
> the page->dma_pinned_count cannot be looked at, because it is unioned with
> page->lru.prev. page->dma_pinned_flags, at least starting at bit 1, are
> safe to look at due to pointer alignment, but now you cannot atomically
> count...
>
> So this seems unsolvable without having the caller specify that it knows the
> page type, and that it is therefore safe to decrement page->dma_pinned_count.
> I was hoping I'd found a way, but clearly I haven't. :)

Try to find some way to indicate that the page is pinned by using some of
the existing page flags? There is already an MLOCK flag. Maybe some
creativity with that can lead to something (but then the MLOCKed pages are
on the unevictable LRU....). cgroups used to have something called struct
page_ext. Oh, it's there in linux/mm/page_ext.c.
On 07/03/2018 10:48 AM, Christopher Lameter wrote:
> On Tue, 3 Jul 2018, John Hubbard wrote:
>
>> The page->_refcount field is used normally, in addition to the dma_pinned_count.
>> But the problem is that, unless the caller knows what kind of page it is,
>> the page->dma_pinned_count cannot be looked at, because it is unioned with
>> page->lru.prev. page->dma_pinned_flags, at least starting at bit 1, are
>> safe to look at due to pointer alignment, but now you cannot atomically
>> count...
>>
>> So this seems unsolvable without having the caller specify that it knows the
>> page type, and that it is therefore safe to decrement page->dma_pinned_count.
>> I was hoping I'd found a way, but clearly I haven't. :)
>
> Try to find some way to indicate that the page is pinned by using some of
> the existing page flags? There is already an MLOCK flag. Maybe some
> creativity with that can lead to something (but then the MLOCKed pages are
> on the unevictable LRU....). cgroups used to have something called struct
> page_ext. Oh, it's there in linux/mm/page_ext.c.

Yes, that would provide just a touch more capability: we could both read and
write a dma-pinned page(_ext) flag safely, instead of only being able to read.
I doubt that that's enough additional information, though. The general problem
of allowing random put_page() calls to decrement the dma-pinned count (see
Jan's diagram at the beginning of this thread) cannot be solved by anything
less than some sort of "who (or which special type of caller, at least) owns
this page" approach, as far as I can see. The put_user_pages() approach
provides arguably the simplest version of that kind of solution.

Also, even just using a single bit from page extensions would cost some extra
memory, because for example on 64-bit systems many configurations do not need
the additional flags that page_ext.h provides, so they return "false" from the
page_ext_operations.need() callback. Changing get_user_pages() to require page
extensions would lead to *every* configuration requiring page extensions, so
64-bit users would lose some memory for sure. On the other hand, it avoids the
"take the page off of the LRU" complexity that I've got here.

But again, I don't think a single flag, or even a count, would actually solve
the problem.
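As a rough sketch of the page_ext variant being discussed, purely hypothetical:
PAGE_EXT_DMA_PINNED and the operations entry below do not exist, and wiring the
entry up would additionally mean adding it to the static page_ext_ops[] table
in mm/page_ext.c. The memory cost John points out shows up in the need()
callback.

        #include <linux/mm.h>
        #include <linux/page_ext.h>

        /* Hypothetical: the flag would be wanted on every configuration, so
         * page extensions could no longer be skipped.  That is the cost. */
        static bool need_dma_pinned_page_ext(void)
        {
                return true;
        }

        struct page_ext_operations dma_pinned_page_ext_ops = {
                .need = need_dma_pinned_page_ext,
        };

        /* Read the hypothetical pin flag out of the extension area instead of
         * out of the lru-unioned field in struct page. */
        static bool page_dma_pinned_ext(struct page *page)
        {
                struct page_ext *page_ext = lookup_page_ext(page);

                if (unlikely(!page_ext))
                        return false;

                return test_bit(PAGE_EXT_DMA_PINNED, &page_ext->flags);
        }

Even assuming all of that, the sketch only moves where the bit lives; it does
not tell a random put_page() caller whether its reference was a pin, which is
the ownership problem discussed above.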
On Tue 03-07-18 10:36:05, John Hubbard wrote:
> On 07/03/2018 10:08 AM, Christopher Lameter wrote:
> > On Mon, 2 Jul 2018, John Hubbard wrote:
> >
> >>> If you establish a reference to a page then increase the page count. If
> >>> the reference is a dma pin action also then increase the pinned count.
> >>>
> >>> That way you know how many of the references to the page are dma
> >>> pins and you can correctly manage the state of the page if the dma pins go
> >>> away.
> >>>
> >>
> >> I think this sounds like what this patch already does, right? See:
> >> __put_page_for_pinned_dma(), __get_page_for_pinned_dma(), and
> >> pin_page_for_dma(). The locking seems correct to me, but I suspect it's
> >> too heavyweight for such a hot path. But without adding a new put_user_page()
> >> call, that was the best I could come up with.
> >
> > When I saw the patch it looked like you were avoiding incrementing the
> > page->count field.
>
> Looking at it again, this patch is definitely susceptible to Jan's "page gets
> dma-unpinned too soon" problem. That leaves a window in which the original
> problem can occur.
>
> The page->_refcount field is used normally, in addition to the dma_pinned_count.
> But the problem is that, unless the caller knows what kind of page it is,
> the page->dma_pinned_count cannot be looked at, because it is unioned with
> page->lru.prev. page->dma_pinned_flags, at least starting at bit 1, are
> safe to look at due to pointer alignment, but now you cannot atomically
> count...
>
> So this seems unsolvable without having the caller specify that it knows the
> page type, and that it is therefore safe to decrement page->dma_pinned_count.
> I was hoping I'd found a way, but clearly I haven't. :)

Well, I think the misconception is that "pinned" is a fundamental property
of a page. It is not. "pinned" is a property of a page reference (i.e., a
kind of reference that can be used for DMA access), and a page gets into the
"pinned" state if it has any reference of the "pinned" type. And when you
realize this, it is obvious that you just have to have a special api for
getting and dropping references of this "pinned" type. For getting we
already have get_user_pages(), for putting we have to create the api...

								Honza
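A minimal sketch of the missing "put" side Jan is describing follows. It reuses
this patch's field names (dma_pinned_count, PageDmaPinned) purely for
illustration, assumes the "get" side has already taken both an ordinary page
reference and a pin reference, and omits the locking against a racing pinner
that a real implementation would need. No such helper exists at this point in
the thread.

        #include <linux/mm.h>

        /* Hypothetical counterpart to get_user_pages(): only callers that know
         * they hold a pin-type reference call this, so the pin count can be
         * decremented unconditionally. */
        static inline void put_user_page(struct page *page)
        {
                page = compound_head(page);

                /* Drop the pin accounting first (racing pinners ignored here). */
                if (atomic_dec_and_test(&page->dma_pinned_count))
                        ClearPageDmaPinned(page);

                /* Then drop the ordinary reference taken by get_user_pages(). */
                put_page(page);
        }

        /* Convenience form for the common "array of pinned pages" case. */
        static inline void put_user_pages(struct page **pages, unsigned long npages)
        {
                unsigned long i;

                for (i = 0; i < npages; i++)
                        put_user_page(pages[i]);
        }

The key design point is exactly what Jan states: the caller, not the page,
knows that its reference is of the pinned kind, so only these helpers touch
the pin accounting, and random put_page() calls can no longer unpin the page
early.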
On Wed, 4 Jul 2018, Jan Kara wrote:

> > So this seems unsolvable without having the caller specify that it knows the
> > page type, and that it is therefore safe to decrement page->dma_pinned_count.
> > I was hoping I'd found a way, but clearly I haven't. :)
>
> Well, I think the misconception is that "pinned" is a fundamental property
> of a page. It is not. "pinned" is a property of a page reference (i.e., a
> kind of reference that can be used for DMA access), and a page gets into the
> "pinned" state if it has any reference of the "pinned" type. And when you
> realize this, it is obvious that you just have to have a special api for
> getting and dropping references of this "pinned" type. For getting we
> already have get_user_pages(), for putting we have to create the api...

Maybe we can do something by creating a special "pinned" bit in the pte?
If it is an RDMA reference then set that pinned bit there.

Thus any of the references could cause a pin. Since the page struct does
not contain that information we therefore have to scan through the ptes to
figure out if a page is pinned?

If so then we would not need a special function for dropping the
reference.

References to a page can also be created from a device's MMU. Maybe we could
at some point start to manage them in a similar way to the page tables of
the processor? The mmu notifiers are a bit awkward if we need closer mm
integration.
On Thu 05-07-18 14:17:19, Christopher Lameter wrote:
> On Wed, 4 Jul 2018, Jan Kara wrote:
>
> > > So this seems unsolvable without having the caller specify that it knows the
> > > page type, and that it is therefore safe to decrement page->dma_pinned_count.
> > > I was hoping I'd found a way, but clearly I haven't. :)
> >
> > Well, I think the misconception is that "pinned" is a fundamental property
> > of a page. It is not. "pinned" is a property of a page reference (i.e., a
> > kind of reference that can be used for DMA access), and a page gets into the
> > "pinned" state if it has any reference of the "pinned" type. And when you
> > realize this, it is obvious that you just have to have a special api for
> > getting and dropping references of this "pinned" type. For getting we
> > already have get_user_pages(), for putting we have to create the api...
>
> Maybe we can do something by creating a special "pinned" bit in the pte?
> If it is an RDMA reference then set that pinned bit there.
>
> Thus any of the references could cause a pin. Since the page struct does
> not contain that information we therefore have to scan through the ptes to
> figure out if a page is pinned?
>
> If so then we would not need a special function for dropping the
> reference.

I don't really see how a PTE bit would help in getting rid of the special
function for dropping a "pinned" reference. You still need to distinguish
preexisting page references (and the corresponding page ref drops, which must
not unpin the page) from the references acquired after transitioning the PTE
to the pinned state...

								Honza
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3094500f5cff..aeba3a13a2e3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -895,6 +895,9 @@ static inline bool is_device_public_page(const struct page *page)
 }
 #endif /* CONFIG_DEV_PAGEMAP_OPS */
 
+void __put_page_for_pinned_dma(struct page *page);
+void __get_page_for_pinned_dma(struct page *page);
+
 static inline void get_page(struct page *page)
 {
 	page = compound_head(page);
@@ -904,12 +907,24 @@ static inline void get_page(struct page *page)
 	 */
 	VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page);
 	page_ref_inc(page);
+
+	if (unlikely(PageDmaPinned(page)))
+		__get_page_for_pinned_dma(page);
 }
 
 static inline void put_page(struct page *page)
 {
 	page = compound_head(page);
 
+	/* Because the page->dma_pinned_* fields are unioned with
+	 * page->lru, there is no way to do classical refcount-style
+	 * decrement-and-test-for-zero. Instead, PageDmaPinned(page) must
+	 * be checked, in order to safely check if we are allowed to decrement
+	 * page->dma_pinned_count at all.
+	 */
+	if (unlikely(PageDmaPinned(page)))
+		__put_page_for_pinned_dma(page);
+
 	/*
 	 * For devmap managed pages we need to catch refcount transition from
 	 * 2 to 1, when refcount reach one it means the page is free and we
diff --git a/mm/gup.c b/mm/gup.c
index 73f0b3316fa7..e5c0104fd234 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -20,6 +20,51 @@
 
 #include "internal.h"
 
+static int pin_page_for_dma(struct page *page)
+{
+	int ret = 0;
+	struct zone *zone;
+
+	page = compound_head(page);
+	zone = page_zone(page);
+
+	spin_lock(zone_gup_lock(zone));
+
+	if (PageDmaPinned(page)) {
+		/* Page was not on an LRU list, because it was DMA-pinned. */
+		VM_BUG_ON_PAGE(PageLRU(page), page);
+
+		atomic_inc(&page->dma_pinned_count);
+		goto unlock_out;
+	}
+
+	/*
+	 * Note that page->dma_pinned_flags is unioned with page->lru.
+	 * Therefore, the rules are: checking if any of the
+	 * PAGE_DMA_PINNED_FLAGS bits are set may be done while page->lru
+	 * is in use. However, setting those flags requires that
+	 * the page is both locked, and also, removed from the LRU.
+	 */
+	ret = isolate_lru_page(page);
+
+	if (ret == 0) {
+		/* Avoid problems later, when freeing the page: */
+		ClearPageActive(page);
+		ClearPageUnevictable(page);
+
+		/* counteract isolate_lru_page's effects: */
+		put_page(page);
+
+		atomic_set(&page->dma_pinned_count, 1);
+		SetPageDmaPinned(page);
+	}
+
+unlock_out:
+	spin_unlock(zone_gup_lock(zone));
+
+	return ret;
+}
+
 static struct page *no_page_table(struct vm_area_struct *vma,
 		unsigned int flags)
 {
@@ -659,7 +704,7 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		unsigned int gup_flags, struct page **pages,
 		struct vm_area_struct **vmas, int *nonblocking)
 {
-	long i = 0;
+	long i = 0, j;
 	int err = 0;
 	unsigned int page_mask;
 	struct vm_area_struct *vma = NULL;
@@ -764,6 +809,10 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 	} while (nr_pages);
 
 out:
+	if (pages)
+		for (j = 0; j < i; j++)
+			pin_page_for_dma(pages[j]);
+
 	return i ? i : err;
 }
 
@@ -1843,7 +1892,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 			struct page **pages)
 {
 	unsigned long addr, len, end;
-	int nr = 0, ret = 0;
+	int nr = 0, ret = 0, i;
 
 	start &= PAGE_MASK;
 	addr = start;
@@ -1864,6 +1913,9 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 		ret = nr;
 	}
 
+	for (i = 0; i < nr; i++)
+		pin_page_for_dma(pages[i]);
+
 	if (nr < nr_pages) {
 		/* Try to get the remaining pages with get_user_pages */
 		start += nr << PAGE_SHIFT;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e6f0d5ef320a..510d442647c2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2062,6 +2062,11 @@ static void lock_page_lru(struct page *page, int *isolated)
 	if (PageLRU(page)) {
 		struct lruvec *lruvec;
 
+		/* LRU and PageDmaPinned are mutually exclusive: they use the
+		 * same fields in struct page, but for different purposes.
+		 */
+		VM_BUG_ON_PAGE(PageDmaPinned(page), page);
+
 		lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
 		ClearPageLRU(page);
 		del_page_from_lru_list(page, lruvec, page_lru(page));
@@ -2079,6 +2084,8 @@ static void unlock_page_lru(struct page *page, int isolated)
 
 		lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
 		VM_BUG_ON_PAGE(PageLRU(page), page);
+		VM_BUG_ON_PAGE(PageDmaPinned(page_tail), page);
+
 		SetPageLRU(page);
 		add_page_to_lru_list(page, lruvec, page_lru(page));
 	}
diff --git a/mm/swap.c b/mm/swap.c
index 26fc9b5f1b6c..09ba61300d06 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -52,6 +52,43 @@ static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs);
 static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
 #endif
 
+void __get_page_for_pinned_dma(struct page *page)
+{
+	struct zone *zone = page_zone(page);
+
+	spin_lock(zone_gup_lock(zone));
+
+	if (PageDmaPinned(page))
+		atomic_inc(&page->dma_pinned_count);
+
+	spin_unlock(zone_gup_lock(zone));
+}
+EXPORT_SYMBOL(__get_page_for_pinned_dma);
+
+void __put_page_for_pinned_dma(struct page *page)
+{
+	struct zone *zone = page_zone(page);
+
+	if (atomic_dec_and_test(&page->dma_pinned_count)) {
+		spin_lock(zone_gup_lock(zone));
+
+		VM_BUG_ON_PAGE(PageLRU(page), page);
+
+		/* Re-check while holding the lock, because
+		 * pin_page_for_dma() or get_page() may have snuck in right
+		 * after the atomic_dec_and_test, and raised the count
+		 * above zero again. If so, just leave the flag set. And
+		 * because the atomic_dec_and_test above already got the
+		 * accounting correct, no other action is required.
+		 */
+		if (atomic_read(&page->dma_pinned_count) == 0)
+			ClearPageDmaPinned(page);
+
+		spin_unlock(zone_gup_lock(zone));
+	}
+}
+EXPORT_SYMBOL(__put_page_for_pinned_dma);
+
 /*
  * This path almost never happens for VM activity - pages are normally
  * freed via pagevecs. But it gets used by networking.
@@ -824,6 +861,11 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
 	VM_BUG_ON_PAGE(!PageHead(page), page);
 	VM_BUG_ON_PAGE(PageCompound(page_tail), page);
 	VM_BUG_ON_PAGE(PageLRU(page_tail), page);
+
+	/* LRU and PageDmaPinned are mutually exclusive: they use the
+	 * same fields in struct page, but for different purposes.
+	 */
+	VM_BUG_ON_PAGE(PageDmaPinned(page_tail), page);
 	VM_BUG_ON(NR_CPUS != 1 &&
 			!spin_is_locked(&lruvec_pgdat(lruvec)->lru_lock));
 
@@ -863,6 +905,12 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
 
 	VM_BUG_ON_PAGE(PageLRU(page), page);
 
+	/* LRU and PageDmaPinned are mutually exclusive: they use the
+	 * same fields in struct page, but for different purposes.
+	 */
+	if (PageDmaPinned(page))
+		return;
+
 	SetPageLRU(page);
 	/*
 	 * Page becomes evictable in two ways: