Message ID | 1625903002-31619-4-git-send-email-linyunsheng@huawei.com (mailing list archive) |
---|---|
State | RFC |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | add elevated refcnt support for page pool | expand |
Context | Check | Description |
---|---|---|
netdev/cover_letter | success | Link |
netdev/fixes_present | success | Link |
netdev/patch_count | success | Link |
netdev/tree_selection | success | Guessed tree name to be net-next |
netdev/subject_prefix | warning | Target tree name not specified in the subject |
netdev/cc_maintainers | warning | 5 maintainers not CCed: yhs@fb.com kpsingh@kernel.org andrii@kernel.org kafai@fb.com songliubraving@fb.com |
netdev/source_inline | success | Was 0 now: 0 |
netdev/verify_signedoff | success | Link |
netdev/module_param | success | Was 0 now: 0 |
netdev/build_32bit | success | Errors and warnings before: 6118 this patch: 6118 |
netdev/kdoc | success | Errors and warnings before: 0 this patch: 0 |
netdev/verify_fixes | success | Link |
netdev/checkpatch | success | total: 0 errors, 0 warnings, 0 checks, 208 lines checked |
netdev/build_allmodconfig_warn | success | Errors and warnings before: 6175 this patch: 6175 |
netdev/header_inline | success | Link |
On Sat, Jul 10, 2021 at 12:44 AM Yunsheng Lin <linyunsheng@huawei.com> wrote: > > Currently page pool only support page recycling only when > refcnt of page is one, which means it can not support the > split page recycling implemented in the most driver. > > The expectation of page recycling based on elevated refcnt > is that we only do the recycling or freeing of page when the > last user has dropped the refcnt that has given to it. > > The above expectation is based on that the last user will > always call page_pool_put_full_page() in order to do the > recycling or do the resource cleanup(dma unmaping..etc) and > freeing. > > Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com> > --- > include/net/page_pool.h | 5 ++- > net/core/page_pool.c | 106 ++++++++++++++++++++++++++++++++++++------------ > 2 files changed, 84 insertions(+), 27 deletions(-) > > diff --git a/include/net/page_pool.h b/include/net/page_pool.h > index 5746f17..f0e708d 100644 > --- a/include/net/page_pool.h > +++ b/include/net/page_pool.h > @@ -45,7 +45,10 @@ > * Please note DMA-sync-for-CPU is still > * device driver responsibility > */ > -#define PP_FLAG_ALL (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV) > +#define PP_FLAG_PAGECNT_BIAS BIT(2) /* For elevated refcnt feature */ > +#define PP_FLAG_ALL (PP_FLAG_DMA_MAP |\ > + PP_FLAG_DMA_SYNC_DEV |\ > + PP_FLAG_PAGECNT_BIAS) > > /* > * Fast allocation side cache array/stack > diff --git a/net/core/page_pool.c b/net/core/page_pool.c > index 78838c6..a87cbe1 100644 > --- a/net/core/page_pool.c > +++ b/net/core/page_pool.c > @@ -24,6 +24,8 @@ > #define DEFER_TIME (msecs_to_jiffies(1000)) > #define DEFER_WARN_INTERVAL (60 * HZ) > > +#define BIAS_MAX (PAGE_SIZE - 1) > + > static int page_pool_init(struct page_pool *pool, > const struct page_pool_params *params) > { > @@ -209,14 +211,24 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page) > static void page_pool_set_pp_info(struct page_pool *pool, > struct page *page) > { > + if (pool->p.flags & PP_FLAG_PAGECNT_BIAS) { > + page_ref_add(page, BIAS_MAX); > + page_pool_set_pagecnt_bias(page, BIAS_MAX); > + } > + > page->pp = pool; > page->pp_magic |= PP_SIGNATURE; > } I think this piece makes more sense as a part of __page_pool_alloc_page_order. Basically have it run parallel to the DMA mapping setup. > -static void page_pool_clear_pp_info(struct page *page) > +static int page_pool_clear_pp_info(struct page *page) > { > + int bias = page_pool_get_pagecnt_bias(page); > + > page->pp_magic = 0; > page->pp = NULL; > + page_pool_set_pagecnt_bias(page, 0); > + > + return bias; > } > > static struct page *__page_pool_alloc_page_order(struct page_pool *pool, > @@ -298,6 +310,23 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, > return page; > } > > +static void page_pool_sub_bias(struct page_pool *pool, > + struct page *page, int nr) > +{ > + int bias; > + > + if (!(pool->p.flags & PP_FLAG_PAGECNT_BIAS)) > + return; > + > + bias = page_pool_get_pagecnt_bias(page); > + if (unlikely(bias <= nr)) { > + page_ref_add(page, BIAS_MAX - bias); > + bias = BIAS_MAX; > + } > + > + page_pool_set_pagecnt_bias(page, bias - nr); > +} > + > /* For using page_pool replace: alloc_pages() API calls, but provide > * synchronization guarantee for allocation side. > */ > @@ -307,11 +336,16 @@ struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp) > > /* Fast-path: Get a page from cache */ > page = __page_pool_get_cached(pool); > - if (page) > + if (page) { > + page_pool_sub_bias(pool, page, 1); > return page; > + } > > /* Slow-path: cache empty, do real allocation */ > page = __page_pool_alloc_pages_slow(pool, gfp); > + if (likely(page)) > + page_pool_sub_bias(pool, page, 1); > + > return page; I would probably just reorder this a bit. Do something like: page = __page_pool_get_cached() if (!page) { page = __page_pool_alloc_pages_slow() if (!page) return NULL; } page_pool_sub_bias() return page; > } > EXPORT_SYMBOL(page_pool_alloc_pages); > @@ -340,10 +374,11 @@ static s32 page_pool_inflight(struct page_pool *pool) > * a regular page (that will eventually be returned to the normal > * page-allocator via put_page). > */ > -void page_pool_release_page(struct page_pool *pool, struct page *page) > +static int __page_pool_release_page(struct page_pool *pool, > + struct page *page) > { > + int bias, count; > dma_addr_t dma; > - int count; > > if (!(pool->p.flags & PP_FLAG_DMA_MAP)) > /* Always account for inflight pages, even if we didn't > @@ -359,22 +394,30 @@ void page_pool_release_page(struct page_pool *pool, struct page *page) > DMA_ATTR_SKIP_CPU_SYNC); > page_pool_set_dma_addr(page, 0); > skip_dma_unmap: > - page_pool_clear_pp_info(page); > + bias = page_pool_clear_pp_info(page); > > /* This may be the last page returned, releasing the pool, so > * it is not safe to reference pool afterwards. > */ > count = atomic_inc_return(&pool->pages_state_release_cnt); > trace_page_pool_state_release(pool, page, count); > + > + return bias; > +} > + > +void page_pool_release_page(struct page_pool *pool, struct page *page) > +{ > + int bias = __page_pool_release_page(pool, page); > + > + WARN_ONCE(bias, "%s is called from driver with elevated refcnt\n", > + __func__); > } I'm not sure it makes sense to have a warning about this. There are multiple scenarios where you will still have to release the page early and deduct the pagecnt_bias. > EXPORT_SYMBOL(page_pool_release_page); > > /* Return a page to the page allocator, cleaning up our state */ > static void page_pool_return_page(struct page_pool *pool, struct page *page) > { > - page_pool_release_page(pool, page); > - > - put_page(page); > + __page_frag_cache_drain(page, __page_pool_release_page(pool, page) + 1); > /* An optimization would be to call __free_pages(page, pool->p.order) > * knowing page is not part of page-cache (thus avoiding a > * __page_cache_release() call). > @@ -409,6 +452,15 @@ static bool page_pool_recycle_in_cache(struct page *page, > return true; > } > > +static bool page_pool_bias_page_recyclable(struct page *page, int bias) > +{ > + int ref = page_ref_dec_return(page); > + > + WARN_ON(ref <= bias); > + > + return ref == bias + 1; > +} > + > /* If the page refcnt == 1, this will try to recycle the page. > * if PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for > * the configured size min(dma_sync_size, pool->max_len). > @@ -419,6 +471,20 @@ static __always_inline struct page * > __page_pool_put_page(struct page_pool *pool, struct page *page, > unsigned int dma_sync_size, bool allow_direct) > { > + int bias = page_pool_get_pagecnt_bias(page); > + > + /* Handle the elevated refcnt case first */ > + if (bias) { > + /* It is not the last user yet */ > + if (!page_pool_bias_page_recyclable(page, bias)) > + return NULL; > + > + if (likely(!page_is_pfmemalloc(page))) > + goto recyclable; > + else > + goto unrecyclable; > + } > + > /* This allocator is optimized for the XDP mode that uses > * one-frame-per-page, but have fallbacks that act like the > * regular page allocator APIs. > @@ -430,7 +496,7 @@ __page_pool_put_page(struct page_pool *pool, struct page *page, > */ > if (likely(page_ref_count(page) == 1 && !page_is_pfmemalloc(page))) { > /* Read barrier done in page_ref_count / READ_ONCE */ > - > +recyclable: > if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) > page_pool_dma_sync_for_device(pool, page, > dma_sync_size); > @@ -442,22 +508,9 @@ __page_pool_put_page(struct page_pool *pool, struct page *page, > /* Page found as candidate for recycling */ > return page; > } > - /* Fallback/non-XDP mode: API user have elevated refcnt. > - * > - * Many drivers split up the page into fragments, and some > - * want to keep doing this to save memory and do refcnt based > - * recycling. Support this use case too, to ease drivers > - * switching between XDP/non-XDP. > - * > - * In-case page_pool maintains the DMA mapping, API user must > - * call page_pool_put_page once. In this elevated refcnt > - * case, the DMA is unmapped/released, as driver is likely > - * doing refcnt based recycle tricks, meaning another process > - * will be invoking put_page. > - */ > - /* Do not replace this with page_pool_return_page() */ > - page_pool_release_page(pool, page); > - put_page(page); > + > +unrecyclable: > + page_pool_return_page(pool, page); > > return NULL; > } > @@ -518,7 +571,8 @@ static void page_pool_empty_ring(struct page_pool *pool) > /* Empty recycle ring */ > while ((page = ptr_ring_consume_bh(&pool->ring))) { > /* Verify the refcnt invariant of cached pages */ > - if (!(page_ref_count(page) == 1)) > + if (!(page_ref_count(page) == > + (page_pool_get_pagecnt_bias(page) + 1))) > pr_crit("%s() page_pool refcnt %d violation\n", > __func__, page_ref_count(page)); > So I am not sure this is entirely useful since there are cases where the mm subsystem will take references on pages like I mentioned before. Also we should probably be caching the output from page_ref_count instead of calling it twice as it would guarantee that we will see the actual value the test saw versus performing the read twice which might indicate two different values if somebody else is messing with the page.
On Sat, Jul 10, 2021 at 12:44 AM Yunsheng Lin <linyunsheng@huawei.com> wrote: <snip> > @@ -419,6 +471,20 @@ static __always_inline struct page * > __page_pool_put_page(struct page_pool *pool, struct page *page, > unsigned int dma_sync_size, bool allow_direct) > { > + int bias = page_pool_get_pagecnt_bias(page); > + > + /* Handle the elevated refcnt case first */ > + if (bias) { > + /* It is not the last user yet */ > + if (!page_pool_bias_page_recyclable(page, bias)) > + return NULL; > + > + if (likely(!page_is_pfmemalloc(page))) > + goto recyclable; > + else > + goto unrecyclable; > + } > + So this part is still broken. Anything that takes a reference to the page and holds it while this is called will cause it to break. For example with the recent fixes we put in place all it would take is a skb_clone followed by pskb_expand_head and this starts leaking memory. One of the key bits in order for pagecnt_bias to work is that you have to deduct the bias once there are no more parties using it. Otherwise you leave the reference count artificially inflated and the page will never be freed. It works fine for the single producer single consumer case but once you introduce multiple consumers this is going to fall apart.
On 2021/7/11 1:31, Alexander Duyck wrote: > On Sat, Jul 10, 2021 at 12:44 AM Yunsheng Lin <linyunsheng@huawei.com> wrote: > <snip> >> @@ -419,6 +471,20 @@ static __always_inline struct page * >> __page_pool_put_page(struct page_pool *pool, struct page *page, >> unsigned int dma_sync_size, bool allow_direct) >> { >> + int bias = page_pool_get_pagecnt_bias(page); >> + >> + /* Handle the elevated refcnt case first */ >> + if (bias) { >> + /* It is not the last user yet */ >> + if (!page_pool_bias_page_recyclable(page, bias)) >> + return NULL; >> + >> + if (likely(!page_is_pfmemalloc(page))) >> + goto recyclable; >> + else >> + goto unrecyclable; >> + } >> + > > So this part is still broken. Anything that takes a reference to the > page and holds it while this is called will cause it to break. For > example with the recent fixes we put in place all it would take is a > skb_clone followed by pskb_expand_head and this starts leaking memory. Ok, it seems the fix is confilcting with the expectation this patch is based, which is "the last user will always call page_pool_put_full_page() in order to do the recycling or do the resource cleanup(dma unmaping..etc) and freeing.". As the user of the new skb after skb_clone() and pskb_expand_head() is not aware of that their frag page may still be in the page pool after the fix? > > One of the key bits in order for pagecnt_bias to work is that you have > to deduct the bias once there are no more parties using it. Otherwise > you leave the reference count artificially inflated and the page will > never be freed. It works fine for the single producer single consumer > case but once you introduce multiple consumers this is going to fall > apart. It seems we have diffferent understanding about consumer, I treat the above user of new skb after skb_clone() and pskb_expand_head() as the consumer of the page pool too, so that new skb should keep the recycle bit in order for that to happen. If the semantic is "the new user of a page should not be handled by page pool if page pool is not aware of the new user(the new user is added by calling page allocator API instead of calling the page pool API, like the skb_clone() and pskb_expand_head() above) ", I suppose I am ok with that semantic too as long as the above semantic is aligned with the people involved. Also, it seems _refcount and dma_addr in "struct page" is in the same cache line, which means there is already cache line bouncing already between _refcount and dma_addr updating, so it may makes senses to only use bias to indicate number of the page pool user for a page, instead of using "bias - page_ref_count", as the page_ref_count is not reliable if somebody is using the page allocator API directly. And the trick part seems to be how to make the bias atomic for allocating and freeing. Any better idea? > . >
On Sun, Jul 11, 2021 at 7:06 PM Yunsheng Lin <linyunsheng@huawei.com> wrote: > > On 2021/7/11 1:31, Alexander Duyck wrote: > > On Sat, Jul 10, 2021 at 12:44 AM Yunsheng Lin <linyunsheng@huawei.com> wrote: > > <snip> > >> @@ -419,6 +471,20 @@ static __always_inline struct page * > >> __page_pool_put_page(struct page_pool *pool, struct page *page, > >> unsigned int dma_sync_size, bool allow_direct) > >> { > >> + int bias = page_pool_get_pagecnt_bias(page); > >> + > >> + /* Handle the elevated refcnt case first */ > >> + if (bias) { > >> + /* It is not the last user yet */ > >> + if (!page_pool_bias_page_recyclable(page, bias)) > >> + return NULL; > >> + > >> + if (likely(!page_is_pfmemalloc(page))) > >> + goto recyclable; > >> + else > >> + goto unrecyclable; > >> + } > >> + > > > > So this part is still broken. Anything that takes a reference to the > > page and holds it while this is called will cause it to break. For > > example with the recent fixes we put in place all it would take is a > > skb_clone followed by pskb_expand_head and this starts leaking memory. > > Ok, it seems the fix is confilcting with the expectation this patch is > based, which is "the last user will always call page_pool_put_full_page() > in order to do the recycling or do the resource cleanup(dma unmaping..etc) > and freeing.". > > As the user of the new skb after skb_clone() and pskb_expand_head() is > not aware of that their frag page may still be in the page pool after > the fix? No it isn't the fix that is conflicting. It is the fundamental assumption that is flawed. We cannot guarantee that some other entity will not take a reference on the page. In order for this to work you have to guarantee that no other entity will use get_page/put_page on this page while you are using it. This is the reason why all the other implementations that do pagecnt_bias always remove the leftover count once they are done with the page. What was throwing me off before is that I was assuming you were doing that somewhere and you weren't. Instead this patch effectively turned the page count into a ticket lock of sorts and the problem is this approach only works as long as no other entities can take a reference on the page. > > > > One of the key bits in order for pagecnt_bias to work is that you have > > to deduct the bias once there are no more parties using it. Otherwise > > you leave the reference count artificially inflated and the page will > > never be freed. It works fine for the single producer single consumer > > case but once you introduce multiple consumers this is going to fall > > apart. > > It seems we have diffferent understanding about consumer, I treat the > above user of new skb after skb_clone() and pskb_expand_head() as the > consumer of the page pool too, so that new skb should keep the recycle > bit in order for that to happen. The problem is updating pskb_expand_head to call page_pool_return_skb_page still wouldn't resolve the issue. The fundamental assumption is flawed that the thread holding the skb will always be the last one to free the page. > If the semantic is "the new user of a page should not be handled by page > pool if page pool is not aware of the new user(the new user is added by > calling page allocator API instead of calling the page pool API, like the > skb_clone() and pskb_expand_head() above) ", I suppose I am ok with that > semantic too as long as the above semantic is aligned with the people > involved. The bigger issue is that this use of the page_ref_count violates how the page struct is meant to be used. The count is meant to be atomic and capable of getting to 0. The fact that we are now leaving the bias floating is going to introduce a number of issues. > Also, it seems _refcount and dma_addr in "struct page" is in the same cache > line, which means there is already cache line bouncing already between _refcount > and dma_addr updating, so it may makes senses to only use bias to indicate > number of the page pool user for a page, instead of using "bias - page_ref_count", > as the page_ref_count is not reliable if somebody is using the page allocator API > directly. > > And the trick part seems to be how to make the bias atomic for allocating and > freeing. What we end up essentially needing to do is duplicate that page_ref_count as a page_frag_count and just leave the original page_ref_count at 1. If we do that and then just decrement that new count instead it would allow us to defer the unmapping/recycling and we just fall back into the original path when we finally hit 0. The only limitation then is the fact that we would be looking at potentially 2 atomic operations in the worst case if we release the page as you would need one to get the frag_count to 0, and then another to decrement the ref_count. For the standard case that would work about the same as what you already had though, as you were already having to perform an atomic_dec_and_test on the page_ref_count anyway.
On 2021/7/11 1:24, Alexander Duyck wrote: > On Sat, Jul 10, 2021 at 12:44 AM Yunsheng Lin <linyunsheng@huawei.com> wrote: >> >> Currently page pool only support page recycling only when >> refcnt of page is one, which means it can not support the >> split page recycling implemented in the most driver. >> >> The expectation of page recycling based on elevated refcnt >> is that we only do the recycling or freeing of page when the >> last user has dropped the refcnt that has given to it. >> >> The above expectation is based on that the last user will >> always call page_pool_put_full_page() in order to do the >> recycling or do the resource cleanup(dma unmaping..etc) and >> freeing. >> >> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com> >> --- >> include/net/page_pool.h | 5 ++- >> net/core/page_pool.c | 106 ++++++++++++++++++++++++++++++++++++------------ >> 2 files changed, 84 insertions(+), 27 deletions(-) >> >> diff --git a/include/net/page_pool.h b/include/net/page_pool.h >> index 5746f17..f0e708d 100644 >> --- a/include/net/page_pool.h >> +++ b/include/net/page_pool.h >> @@ -45,7 +45,10 @@ >> * Please note DMA-sync-for-CPU is still >> * device driver responsibility >> */ >> -#define PP_FLAG_ALL (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV) >> +#define PP_FLAG_PAGECNT_BIAS BIT(2) /* For elevated refcnt feature */ >> +#define PP_FLAG_ALL (PP_FLAG_DMA_MAP |\ >> + PP_FLAG_DMA_SYNC_DEV |\ >> + PP_FLAG_PAGECNT_BIAS) >> >> /* >> * Fast allocation side cache array/stack >> diff --git a/net/core/page_pool.c b/net/core/page_pool.c >> index 78838c6..a87cbe1 100644 >> --- a/net/core/page_pool.c >> +++ b/net/core/page_pool.c >> @@ -24,6 +24,8 @@ >> #define DEFER_TIME (msecs_to_jiffies(1000)) >> #define DEFER_WARN_INTERVAL (60 * HZ) >> >> +#define BIAS_MAX (PAGE_SIZE - 1) >> + >> static int page_pool_init(struct page_pool *pool, >> const struct page_pool_params *params) >> { >> @@ -209,14 +211,24 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page) >> static void page_pool_set_pp_info(struct page_pool *pool, >> struct page *page) >> { >> + if (pool->p.flags & PP_FLAG_PAGECNT_BIAS) { >> + page_ref_add(page, BIAS_MAX); >> + page_pool_set_pagecnt_bias(page, BIAS_MAX); >> + } >> + >> page->pp = pool; >> page->pp_magic |= PP_SIGNATURE; >> } > > I think this piece makes more sense as a part of > __page_pool_alloc_page_order. Basically have it run parallel to the > DMA mapping setup. When implementing the semantic of "page recycling only wait for the page pool user instead of all user of a page", I have merged this patch with the next patch, because it seems we do not need the elevated refcnt case if the frag page is not supported. So all of comment in this patch does not exist in new version, at least that is what I checked, thanks for the reviewing. > >> -static void page_pool_clear_pp_info(struct page *page) >> +static int page_pool_clear_pp_info(struct page *page) >> { >> + int bias = page_pool_get_pagecnt_bias(page); >> + >> page->pp_magic = 0; >> page->pp = NULL; >> + page_pool_set_pagecnt_bias(page, 0); >> + >> + return bias; >> } >> >> static struct page *__page_pool_alloc_page_order(struct page_pool *pool, >> @@ -298,6 +310,23 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, >> return page; >> } >> >> +static void page_pool_sub_bias(struct page_pool *pool, >> + struct page *page, int nr) >> +{ >> + int bias; >> + >> + if (!(pool->p.flags & PP_FLAG_PAGECNT_BIAS)) >> + return; >> + >> + bias = page_pool_get_pagecnt_bias(page); >> + if (unlikely(bias <= nr)) { >> + page_ref_add(page, BIAS_MAX - bias); >> + bias = BIAS_MAX; >> + } >> + >> + page_pool_set_pagecnt_bias(page, bias - nr); >> +} >> + >> /* For using page_pool replace: alloc_pages() API calls, but provide >> * synchronization guarantee for allocation side. >> */ >> @@ -307,11 +336,16 @@ struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp) >> >> /* Fast-path: Get a page from cache */ >> page = __page_pool_get_cached(pool); >> - if (page) >> + if (page) { >> + page_pool_sub_bias(pool, page, 1); >> return page; >> + } >> >> /* Slow-path: cache empty, do real allocation */ >> page = __page_pool_alloc_pages_slow(pool, gfp); >> + if (likely(page)) >> + page_pool_sub_bias(pool, page, 1); >> + >> return page; > > I would probably just reorder this a bit. Do something like: > page = __page_pool_get_cached() > > if (!page) { > page = __page_pool_alloc_pages_slow() > if (!page) > return NULL; > } > > page_pool_sub_bias() > return page; > > >> } >> EXPORT_SYMBOL(page_pool_alloc_pages); >> @@ -340,10 +374,11 @@ static s32 page_pool_inflight(struct page_pool *pool) >> * a regular page (that will eventually be returned to the normal >> * page-allocator via put_page). >> */ >> -void page_pool_release_page(struct page_pool *pool, struct page *page) >> +static int __page_pool_release_page(struct page_pool *pool, >> + struct page *page) >> { >> + int bias, count; >> dma_addr_t dma; >> - int count; >> >> if (!(pool->p.flags & PP_FLAG_DMA_MAP)) >> /* Always account for inflight pages, even if we didn't >> @@ -359,22 +394,30 @@ void page_pool_release_page(struct page_pool *pool, struct page *page) >> DMA_ATTR_SKIP_CPU_SYNC); >> page_pool_set_dma_addr(page, 0); >> skip_dma_unmap: >> - page_pool_clear_pp_info(page); >> + bias = page_pool_clear_pp_info(page); >> >> /* This may be the last page returned, releasing the pool, so >> * it is not safe to reference pool afterwards. >> */ >> count = atomic_inc_return(&pool->pages_state_release_cnt); >> trace_page_pool_state_release(pool, page, count); >> + >> + return bias; >> +} >> + >> +void page_pool_release_page(struct page_pool *pool, struct page *page) >> +{ >> + int bias = __page_pool_release_page(pool, page); >> + >> + WARN_ONCE(bias, "%s is called from driver with elevated refcnt\n", >> + __func__); >> } > > I'm not sure it makes sense to have a warning about this. There are > multiple scenarios where you will still have to release the page early > and deduct the pagecnt_bias. > >> EXPORT_SYMBOL(page_pool_release_page); >> >> /* Return a page to the page allocator, cleaning up our state */ >> static void page_pool_return_page(struct page_pool *pool, struct page *page) >> { >> - page_pool_release_page(pool, page); >> - >> - put_page(page); >> + __page_frag_cache_drain(page, __page_pool_release_page(pool, page) + 1); >> /* An optimization would be to call __free_pages(page, pool->p.order) >> * knowing page is not part of page-cache (thus avoiding a >> * __page_cache_release() call). >> @@ -409,6 +452,15 @@ static bool page_pool_recycle_in_cache(struct page *page, >> return true; >> } >> >> +static bool page_pool_bias_page_recyclable(struct page *page, int bias) >> +{ >> + int ref = page_ref_dec_return(page); >> + >> + WARN_ON(ref <= bias); >> + >> + return ref == bias + 1; >> +} >> + >> /* If the page refcnt == 1, this will try to recycle the page. >> * if PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for >> * the configured size min(dma_sync_size, pool->max_len). >> @@ -419,6 +471,20 @@ static __always_inline struct page * >> __page_pool_put_page(struct page_pool *pool, struct page *page, >> unsigned int dma_sync_size, bool allow_direct) >> { >> + int bias = page_pool_get_pagecnt_bias(page); >> + >> + /* Handle the elevated refcnt case first */ >> + if (bias) { >> + /* It is not the last user yet */ >> + if (!page_pool_bias_page_recyclable(page, bias)) >> + return NULL; >> + >> + if (likely(!page_is_pfmemalloc(page))) >> + goto recyclable; >> + else >> + goto unrecyclable; >> + } >> + >> /* This allocator is optimized for the XDP mode that uses >> * one-frame-per-page, but have fallbacks that act like the >> * regular page allocator APIs. >> @@ -430,7 +496,7 @@ __page_pool_put_page(struct page_pool *pool, struct page *page, >> */ >> if (likely(page_ref_count(page) == 1 && !page_is_pfmemalloc(page))) { >> /* Read barrier done in page_ref_count / READ_ONCE */ >> - >> +recyclable: >> if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) >> page_pool_dma_sync_for_device(pool, page, >> dma_sync_size); >> @@ -442,22 +508,9 @@ __page_pool_put_page(struct page_pool *pool, struct page *page, >> /* Page found as candidate for recycling */ >> return page; >> } >> - /* Fallback/non-XDP mode: API user have elevated refcnt. >> - * >> - * Many drivers split up the page into fragments, and some >> - * want to keep doing this to save memory and do refcnt based >> - * recycling. Support this use case too, to ease drivers >> - * switching between XDP/non-XDP. >> - * >> - * In-case page_pool maintains the DMA mapping, API user must >> - * call page_pool_put_page once. In this elevated refcnt >> - * case, the DMA is unmapped/released, as driver is likely >> - * doing refcnt based recycle tricks, meaning another process >> - * will be invoking put_page. >> - */ >> - /* Do not replace this with page_pool_return_page() */ >> - page_pool_release_page(pool, page); >> - put_page(page); >> + >> +unrecyclable: >> + page_pool_return_page(pool, page); >> >> return NULL; >> } >> @@ -518,7 +571,8 @@ static void page_pool_empty_ring(struct page_pool *pool) >> /* Empty recycle ring */ >> while ((page = ptr_ring_consume_bh(&pool->ring))) { >> /* Verify the refcnt invariant of cached pages */ >> - if (!(page_ref_count(page) == 1)) >> + if (!(page_ref_count(page) == >> + (page_pool_get_pagecnt_bias(page) + 1))) >> pr_crit("%s() page_pool refcnt %d violation\n", >> __func__, page_ref_count(page)); >> > > So I am not sure this is entirely useful since there are cases where > the mm subsystem will take references on pages like I mentioned > before. > > Also we should probably be caching the output from page_ref_count > instead of calling it twice as it would guarantee that we will see the > actual value the test saw versus performing the read twice which might > indicate two different values if somebody else is messing with the > page. > . >
diff --git a/include/net/page_pool.h b/include/net/page_pool.h index 5746f17..f0e708d 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -45,7 +45,10 @@ * Please note DMA-sync-for-CPU is still * device driver responsibility */ -#define PP_FLAG_ALL (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV) +#define PP_FLAG_PAGECNT_BIAS BIT(2) /* For elevated refcnt feature */ +#define PP_FLAG_ALL (PP_FLAG_DMA_MAP |\ + PP_FLAG_DMA_SYNC_DEV |\ + PP_FLAG_PAGECNT_BIAS) /* * Fast allocation side cache array/stack diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 78838c6..a87cbe1 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -24,6 +24,8 @@ #define DEFER_TIME (msecs_to_jiffies(1000)) #define DEFER_WARN_INTERVAL (60 * HZ) +#define BIAS_MAX (PAGE_SIZE - 1) + static int page_pool_init(struct page_pool *pool, const struct page_pool_params *params) { @@ -209,14 +211,24 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page) static void page_pool_set_pp_info(struct page_pool *pool, struct page *page) { + if (pool->p.flags & PP_FLAG_PAGECNT_BIAS) { + page_ref_add(page, BIAS_MAX); + page_pool_set_pagecnt_bias(page, BIAS_MAX); + } + page->pp = pool; page->pp_magic |= PP_SIGNATURE; } -static void page_pool_clear_pp_info(struct page *page) +static int page_pool_clear_pp_info(struct page *page) { + int bias = page_pool_get_pagecnt_bias(page); + page->pp_magic = 0; page->pp = NULL; + page_pool_set_pagecnt_bias(page, 0); + + return bias; } static struct page *__page_pool_alloc_page_order(struct page_pool *pool, @@ -298,6 +310,23 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, return page; } +static void page_pool_sub_bias(struct page_pool *pool, + struct page *page, int nr) +{ + int bias; + + if (!(pool->p.flags & PP_FLAG_PAGECNT_BIAS)) + return; + + bias = page_pool_get_pagecnt_bias(page); + if (unlikely(bias <= nr)) { + page_ref_add(page, BIAS_MAX - bias); + bias = BIAS_MAX; + } + + page_pool_set_pagecnt_bias(page, bias - nr); +} + /* For using page_pool replace: alloc_pages() API calls, but provide * synchronization guarantee for allocation side. */ @@ -307,11 +336,16 @@ struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp) /* Fast-path: Get a page from cache */ page = __page_pool_get_cached(pool); - if (page) + if (page) { + page_pool_sub_bias(pool, page, 1); return page; + } /* Slow-path: cache empty, do real allocation */ page = __page_pool_alloc_pages_slow(pool, gfp); + if (likely(page)) + page_pool_sub_bias(pool, page, 1); + return page; } EXPORT_SYMBOL(page_pool_alloc_pages); @@ -340,10 +374,11 @@ static s32 page_pool_inflight(struct page_pool *pool) * a regular page (that will eventually be returned to the normal * page-allocator via put_page). */ -void page_pool_release_page(struct page_pool *pool, struct page *page) +static int __page_pool_release_page(struct page_pool *pool, + struct page *page) { + int bias, count; dma_addr_t dma; - int count; if (!(pool->p.flags & PP_FLAG_DMA_MAP)) /* Always account for inflight pages, even if we didn't @@ -359,22 +394,30 @@ void page_pool_release_page(struct page_pool *pool, struct page *page) DMA_ATTR_SKIP_CPU_SYNC); page_pool_set_dma_addr(page, 0); skip_dma_unmap: - page_pool_clear_pp_info(page); + bias = page_pool_clear_pp_info(page); /* This may be the last page returned, releasing the pool, so * it is not safe to reference pool afterwards. */ count = atomic_inc_return(&pool->pages_state_release_cnt); trace_page_pool_state_release(pool, page, count); + + return bias; +} + +void page_pool_release_page(struct page_pool *pool, struct page *page) +{ + int bias = __page_pool_release_page(pool, page); + + WARN_ONCE(bias, "%s is called from driver with elevated refcnt\n", + __func__); } EXPORT_SYMBOL(page_pool_release_page); /* Return a page to the page allocator, cleaning up our state */ static void page_pool_return_page(struct page_pool *pool, struct page *page) { - page_pool_release_page(pool, page); - - put_page(page); + __page_frag_cache_drain(page, __page_pool_release_page(pool, page) + 1); /* An optimization would be to call __free_pages(page, pool->p.order) * knowing page is not part of page-cache (thus avoiding a * __page_cache_release() call). @@ -409,6 +452,15 @@ static bool page_pool_recycle_in_cache(struct page *page, return true; } +static bool page_pool_bias_page_recyclable(struct page *page, int bias) +{ + int ref = page_ref_dec_return(page); + + WARN_ON(ref <= bias); + + return ref == bias + 1; +} + /* If the page refcnt == 1, this will try to recycle the page. * if PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for * the configured size min(dma_sync_size, pool->max_len). @@ -419,6 +471,20 @@ static __always_inline struct page * __page_pool_put_page(struct page_pool *pool, struct page *page, unsigned int dma_sync_size, bool allow_direct) { + int bias = page_pool_get_pagecnt_bias(page); + + /* Handle the elevated refcnt case first */ + if (bias) { + /* It is not the last user yet */ + if (!page_pool_bias_page_recyclable(page, bias)) + return NULL; + + if (likely(!page_is_pfmemalloc(page))) + goto recyclable; + else + goto unrecyclable; + } + /* This allocator is optimized for the XDP mode that uses * one-frame-per-page, but have fallbacks that act like the * regular page allocator APIs. @@ -430,7 +496,7 @@ __page_pool_put_page(struct page_pool *pool, struct page *page, */ if (likely(page_ref_count(page) == 1 && !page_is_pfmemalloc(page))) { /* Read barrier done in page_ref_count / READ_ONCE */ - +recyclable: if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) page_pool_dma_sync_for_device(pool, page, dma_sync_size); @@ -442,22 +508,9 @@ __page_pool_put_page(struct page_pool *pool, struct page *page, /* Page found as candidate for recycling */ return page; } - /* Fallback/non-XDP mode: API user have elevated refcnt. - * - * Many drivers split up the page into fragments, and some - * want to keep doing this to save memory and do refcnt based - * recycling. Support this use case too, to ease drivers - * switching between XDP/non-XDP. - * - * In-case page_pool maintains the DMA mapping, API user must - * call page_pool_put_page once. In this elevated refcnt - * case, the DMA is unmapped/released, as driver is likely - * doing refcnt based recycle tricks, meaning another process - * will be invoking put_page. - */ - /* Do not replace this with page_pool_return_page() */ - page_pool_release_page(pool, page); - put_page(page); + +unrecyclable: + page_pool_return_page(pool, page); return NULL; } @@ -518,7 +571,8 @@ static void page_pool_empty_ring(struct page_pool *pool) /* Empty recycle ring */ while ((page = ptr_ring_consume_bh(&pool->ring))) { /* Verify the refcnt invariant of cached pages */ - if (!(page_ref_count(page) == 1)) + if (!(page_ref_count(page) == + (page_pool_get_pagecnt_bias(page) + 1))) pr_crit("%s() page_pool refcnt %d violation\n", __func__, page_ref_count(page));
Currently page pool only support page recycling only when refcnt of page is one, which means it can not support the split page recycling implemented in the most driver. The expectation of page recycling based on elevated refcnt is that we only do the recycling or freeing of page when the last user has dropped the refcnt that has given to it. The above expectation is based on that the last user will always call page_pool_put_full_page() in order to do the recycling or do the resource cleanup(dma unmaping..etc) and freeing. Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com> --- include/net/page_pool.h | 5 ++- net/core/page_pool.c | 106 ++++++++++++++++++++++++++++++++++++------------ 2 files changed, 84 insertions(+), 27 deletions(-)