| Message ID | 20240625135216.47007-7-linyunsheng@huawei.com (mailing list archive) |
|---|---|
| State | Deferred |
| Delegated to: | Netdev Maintainers |
| Series | First try to replace page_frag with page_frag_cache |
On Tue, 2024-06-25 at 21:52 +0800, Yunsheng Lin wrote: > Currently there is one 'struct page_frag' for every 'struct > sock' and 'struct task_struct', we are about to replace the > 'struct page_frag' with 'struct page_frag_cache' for them. > Before begin the replacing, we need to ensure the size of > 'struct page_frag_cache' is not bigger than the size of > 'struct page_frag', as there may be tens of thousands of > 'struct sock' and 'struct task_struct' instances in the > system. > > By or'ing the page order & pfmemalloc with lower bits of > 'va' instead of using 'u16' or 'u32' for page size and 'u8' > for pfmemalloc, we are able to avoid 3 or 5 bytes space waste. > And page address & pfmemalloc & order is unchanged for the > same page in the same 'page_frag_cache' instance, it makes > sense to fit them together. > > Also, it is better to replace 'offset' with 'remaining', which > is the remaining size for the cache in a 'page_frag_cache' > instance, we are able to do a single 'fragsz > remaining' > checking for the case of cache not being enough, which should be > the fast path if we ensure size is zoro when 'va' == NULL by > memset'ing 'struct page_frag_cache' in page_frag_cache_init() > and page_frag_cache_drain(). > > After this patch, the size of 'struct page_frag_cache' should be > the same as the size of 'struct page_frag'. > > CC: Alexander Duyck <alexander.duyck@gmail.com> > Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com> > --- > include/linux/page_frag_cache.h | 76 +++++++++++++++++++++++----- > mm/page_frag_cache.c | 90 ++++++++++++++++++++------------- > 2 files changed, 118 insertions(+), 48 deletions(-) > > diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h > index 6ac3a25089d1..b33904d4494f 100644 > --- a/include/linux/page_frag_cache.h > +++ b/include/linux/page_frag_cache.h > @@ -8,29 +8,81 @@ > #define PAGE_FRAG_CACHE_MAX_SIZE __ALIGN_MASK(32768, ~PAGE_MASK) > #define PAGE_FRAG_CACHE_MAX_ORDER get_order(PAGE_FRAG_CACHE_MAX_SIZE) > > -struct page_frag_cache { > - void *va; > +/* > + * struct encoded_va - a nonexistent type marking this pointer > + * > + * An 'encoded_va' pointer is a pointer to a aligned virtual address, which is > + * at least aligned to PAGE_SIZE, that means there are at least 12 lower bits > + * space available for other purposes. > + * > + * Currently we use the lower 8 bits and bit 9 for the order and PFMEMALLOC > + * flag of the page this 'va' is corresponding to. > + * > + * Use the supplied helper functions to endcode/decode the pointer and bits. > + */ > +struct encoded_va; > + Why did you create a struct for this? The way you use it below it is just a pointer. No point in defining a struct that doesn't exist anywhere. 
> +#define PAGE_FRAG_CACHE_ORDER_MASK GENMASK(7, 0) > +#define PAGE_FRAG_CACHE_PFMEMALLOC_BIT BIT(8) > +#define PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT 8 > + > +static inline struct encoded_va *encode_aligned_va(void *va, > + unsigned int order, > + bool pfmemalloc) > +{ > #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) > - __u16 offset; > - __u16 size; > + return (struct encoded_va *)((unsigned long)va | order | > + pfmemalloc << PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT); > #else > - __u32 offset; > + return (struct encoded_va *)((unsigned long)va | > + pfmemalloc << PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT); > +#endif > +} > + > +static inline unsigned long encoded_page_order(struct encoded_va *encoded_va) > +{ > +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) > + return PAGE_FRAG_CACHE_ORDER_MASK & (unsigned long)encoded_va; > +#else > + return 0; > +#endif > +} > + > +static inline bool encoded_page_pfmemalloc(struct encoded_va *encoded_va) > +{ > + return PAGE_FRAG_CACHE_PFMEMALLOC_BIT & (unsigned long)encoded_va; > +} > + My advice is that if you just make encoded_va an unsigned long this just becomes some FIELD_GET and bit operations. > +static inline void *encoded_page_address(struct encoded_va *encoded_va) > +{ > + return (void *)((unsigned long)encoded_va & PAGE_MASK); > +} > + > +struct page_frag_cache { > + struct encoded_va *encoded_va; This should be an unsigned long, not a pointer since you are storing data other than just a pointer in here. The pointer is just one of the things you extract out of it. > + > +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32) > + u16 pagecnt_bias; > + u16 remaining; > +#else > + u32 pagecnt_bias; > + u32 remaining; > #endif > - /* we maintain a pagecount bias, so that we dont dirty cache line > - * containing page->_refcount every time we allocate a fragment. > - */ > - unsigned int pagecnt_bias; > - bool pfmemalloc; > }; > > static inline void page_frag_cache_init(struct page_frag_cache *nc) > { > - nc->va = NULL; > + memset(nc, 0, sizeof(*nc)); Shouldn't need to memset 0 the whole thing. Just setting page and order to 0 should be enough to indicate that there isn't anything there. > } > > static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc) > { > - return !!nc->pfmemalloc; > + return encoded_page_pfmemalloc(nc->encoded_va); > +} > + > +static inline unsigned int page_frag_cache_page_size(struct encoded_va *encoded_va) > +{ > + return PAGE_SIZE << encoded_page_order(encoded_va); > } > > void page_frag_cache_drain(struct page_frag_cache *nc); > diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c > index dd640af5607a..a3316dd50eff 100644 > --- a/mm/page_frag_cache.c > +++ b/mm/page_frag_cache.c > @@ -18,34 +18,61 @@ > #include <linux/page_frag_cache.h> > #include "internal.h" > > +static void *page_frag_cache_current_va(struct page_frag_cache *nc) > +{ > + struct encoded_va *encoded_va = nc->encoded_va; > + > + return (void *)(((unsigned long)encoded_va & PAGE_MASK) | > + (page_frag_cache_page_size(encoded_va) - nc->remaining)); > +} > + Rather than an OR here I would rather see this just use addition. Otherwise this logic becomes overly complicated. 
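For reference, a minimal sketch of what the "just store it as an unsigned long" suggestion could look like, using FIELD_PREP()/FIELD_GET() from <linux/bitfield.h> and plain addition to derive the current position. The helper names are made up for illustration, the PAGE_SIZE >= PAGE_FRAG_CACHE_MAX_SIZE special case is ignored, and this is not code from the series:

```c
#include <linux/bitfield.h>
#include <linux/mm.h>

#define PAGE_FRAG_CACHE_ORDER_MASK	GENMASK(7, 0)
#define PAGE_FRAG_CACHE_PFMEMALLOC_BIT	BIT(8)

/* pack va | order | pfmemalloc into one unsigned long instead of a fake struct */
static inline unsigned long page_frag_encode_va(void *va, unsigned int order,
						bool pfmemalloc)
{
	return (unsigned long)va |
	       FIELD_PREP(PAGE_FRAG_CACHE_ORDER_MASK, order) |
	       (pfmemalloc ? PAGE_FRAG_CACHE_PFMEMALLOC_BIT : 0);
}

static inline unsigned int page_frag_encoded_order(unsigned long encoded_va)
{
	return FIELD_GET(PAGE_FRAG_CACHE_ORDER_MASK, encoded_va);
}

static inline bool page_frag_encoded_pfmemalloc(unsigned long encoded_va)
{
	return encoded_va & PAGE_FRAG_CACHE_PFMEMALLOC_BIT;
}

static inline void *page_frag_encoded_address(unsigned long encoded_va)
{
	return (void *)(encoded_va & PAGE_MASK);
}

/* addition rather than OR to derive the current allocation position */
static inline void *page_frag_current_va(unsigned long encoded_va,
					 unsigned int remaining)
{
	unsigned int size = PAGE_SIZE << page_frag_encoded_order(encoded_va);

	return page_frag_encoded_address(encoded_va) + (size - remaining);
}
```

With the value kept as an unsigned long, the casts stay confined to the encode/decode helpers and there is no pointer type that looks like it could be dereferenced.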
> static struct page *__page_frag_cache_refill(struct page_frag_cache *nc, > gfp_t gfp_mask) > { > struct page *page = NULL; > gfp_t gfp = gfp_mask; > + unsigned int order; > > #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) > gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP | > __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC; > page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, > PAGE_FRAG_CACHE_MAX_ORDER); > - nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE; > #endif > - if (unlikely(!page)) > + if (unlikely(!page)) { > page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); > + if (unlikely(!page)) { > + memset(nc, 0, sizeof(*nc)); > + return NULL; > + } > + > + order = 0; > + nc->remaining = PAGE_SIZE; > + } else { > + order = PAGE_FRAG_CACHE_MAX_ORDER; > + nc->remaining = PAGE_FRAG_CACHE_MAX_SIZE; > + } > > - nc->va = page ? page_address(page) : NULL; > + /* Even if we own the page, we do not use atomic_set(). > + * This would break get_page_unless_zero() users. > + */ > + page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE); > > + /* reset page count bias of new frag */ > + nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; I would rather keep the pagecnt_bias, page reference addition, and resetting of remaining outside of this. The only fields we should be setting are order, the virtual address, and pfmemalloc since those are what is encoded in your unsigned long variable. > + nc->encoded_va = encode_aligned_va(page_address(page), order, > + page_is_pfmemalloc(page)); > return page; > } > > void page_frag_cache_drain(struct page_frag_cache *nc) > { > - if (!nc->va) > + if (!nc->encoded_va) > return; > > - __page_frag_cache_drain(virt_to_head_page(nc->va), nc->pagecnt_bias); > - nc->va = NULL; > + __page_frag_cache_drain(virt_to_head_page(nc->encoded_va), > + nc->pagecnt_bias); > + memset(nc, 0, sizeof(*nc)); Again, no need for memset when "nv->encoded_va = 0" will do. > } > EXPORT_SYMBOL(page_frag_cache_drain); > > @@ -62,51 +89,41 @@ void *__page_frag_alloc_va_align(struct page_frag_cache *nc, > unsigned int fragsz, gfp_t gfp_mask, > unsigned int align_mask) > { > - unsigned int size = PAGE_SIZE; > + struct encoded_va *encoded_va = nc->encoded_va; > struct page *page; > - int offset; > + int remaining; > + void *va; > > - if (unlikely(!nc->va)) { > + if (unlikely(!encoded_va)) { > refill: > - page = __page_frag_cache_refill(nc, gfp_mask); > - if (!page) > + if (unlikely(!__page_frag_cache_refill(nc, gfp_mask))) > return NULL; > > - /* Even if we own the page, we do not use atomic_set(). > - * This would break get_page_unless_zero() users. > - */ > - page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE); > - > - /* reset page count bias and offset to start of new frag */ > - nc->pfmemalloc = page_is_pfmemalloc(page); > - nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; > - nc->offset = 0; > + encoded_va = nc->encoded_va; > } > > -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) > - /* if size can vary use size else just use PAGE_SIZE */ > - size = nc->size; > -#endif > - > - offset = __ALIGN_KERNEL_MASK(nc->offset, ~align_mask); > - if (unlikely(offset + fragsz > size)) { > - page = virt_to_page(nc->va); > - > + remaining = nc->remaining & align_mask; > + remaining -= fragsz; > + if (unlikely(remaining < 0)) { Now this is just getting confusing. You essentially just added an additional addition step and went back to the countdown approach I was using before except for the fact that you are starting at 0 whereas I was actually moving down through the page. 
What I would suggest doing since "remaining" is a negative offset anyway would be to look at just storing it as a signed negative number. At least with that you can keep to your original approach and would only have to change your check to be for "remaining + fragsz <= 0". With that you can still do your math but it becomes an addition instead of a subtraction. > + page = virt_to_page(encoded_va); > if (!page_ref_sub_and_test(page, nc->pagecnt_bias)) > goto refill; > > - if (unlikely(nc->pfmemalloc)) { > - free_unref_page(page, compound_order(page)); > + if (unlikely(encoded_page_pfmemalloc(encoded_va))) { > + VM_BUG_ON(compound_order(page) != > + encoded_page_order(encoded_va)); > + free_unref_page(page, encoded_page_order(encoded_va)); > goto refill; > } > > /* OK, page count is 0, we can safely set it */ > set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1); > > - /* reset page count bias and offset to start of new frag */ > + /* reset page count bias and remaining of new frag */ > nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; > - offset = 0; > - if (unlikely(fragsz > PAGE_SIZE)) { > + nc->remaining = remaining = page_frag_cache_page_size(encoded_va); > + remaining -= fragsz; > + if (unlikely(remaining < 0)) { > /* > * The caller is trying to allocate a fragment > * with fragsz > PAGE_SIZE but the cache isn't big I find it really amusing that you went to all the trouble of flipping the logic just to flip it back to being a countdown setup. If you were going to bother with all that then why not just make the remaining negative instead? You could save yourself a ton of trouble that way and all you would need to do is flip a few signs. > @@ -120,10 +137,11 @@ void *__page_frag_alloc_va_align(struct page_frag_cache *nc, > } > } > > + va = page_frag_cache_current_va(nc); > nc->pagecnt_bias--; > - nc->offset = offset + fragsz; > + nc->remaining = remaining; > > - return nc->va + offset; > + return va; > } > EXPORT_SYMBOL(__page_frag_alloc_va_align); > Not sure I am huge fan of the way the order of operations has to get so creative for this to work. Not that I see a better way to do it, but my concern is that this is going to add technical debt as I can easily see somebody messing up the order of things at some point in the future and generating a bad pointer.
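As a concrete reading of the "store it as a signed negative number" idea, here is a minimal fast-path sketch; the struct layout and function name are hypothetical, alignment is left out, and the refill path is elided, so this is an illustration of the suggestion rather than code from the series:

```c
struct frag_cache_sketch {		/* hypothetical layout, for illustration only */
	unsigned long encoded_va;
	unsigned int pagecnt_bias;
	long remaining;			/* counts up from -size towards 0 */
};

static void *frag_alloc_sketch(struct frag_cache_sketch *nc, unsigned int fragsz)
{
	long remaining = nc->remaining;
	void *page_end;

	/* the "cache is too small" test becomes remaining + fragsz > 0 */
	if (unlikely(remaining + fragsz > 0))
		return NULL;				/* refill path elided */

	/* current position is simply "end of page + remaining" */
	page_end = (void *)(nc->encoded_va & PAGE_MASK) +
		   (PAGE_SIZE << (nc->encoded_va & PAGE_FRAG_CACHE_ORDER_MASK));

	nc->remaining = remaining + fragsz;		/* addition, not subtraction */
	nc->pagecnt_bias--;

	return page_end + remaining;
}
```

After a refill, 'remaining' would start at -PAGE_SIZE (or -PAGE_FRAG_CACHE_MAX_SIZE), so the first fragment lands at the start of the page and the value simply counts up towards zero as the page is consumed.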
On 2024/7/2 8:08, Alexander H Duyck wrote: > On Tue, 2024-06-25 at 21:52 +0800, Yunsheng Lin wrote: >> Currently there is one 'struct page_frag' for every 'struct >> sock' and 'struct task_struct', we are about to replace the >> 'struct page_frag' with 'struct page_frag_cache' for them. >> Before begin the replacing, we need to ensure the size of >> 'struct page_frag_cache' is not bigger than the size of >> 'struct page_frag', as there may be tens of thousands of >> 'struct sock' and 'struct task_struct' instances in the >> system. >> >> By or'ing the page order & pfmemalloc with lower bits of >> 'va' instead of using 'u16' or 'u32' for page size and 'u8' >> for pfmemalloc, we are able to avoid 3 or 5 bytes space waste. >> And page address & pfmemalloc & order is unchanged for the >> same page in the same 'page_frag_cache' instance, it makes >> sense to fit them together. >> >> Also, it is better to replace 'offset' with 'remaining', which >> is the remaining size for the cache in a 'page_frag_cache' >> instance, we are able to do a single 'fragsz > remaining' >> checking for the case of cache not being enough, which should be >> the fast path if we ensure size is zoro when 'va' == NULL by >> memset'ing 'struct page_frag_cache' in page_frag_cache_init() >> and page_frag_cache_drain(). >> >> After this patch, the size of 'struct page_frag_cache' should be >> the same as the size of 'struct page_frag'. >> >> CC: Alexander Duyck <alexander.duyck@gmail.com> >> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com> >> --- >> include/linux/page_frag_cache.h | 76 +++++++++++++++++++++++----- >> mm/page_frag_cache.c | 90 ++++++++++++++++++++------------- >> 2 files changed, 118 insertions(+), 48 deletions(-) >> >> diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h >> index 6ac3a25089d1..b33904d4494f 100644 >> --- a/include/linux/page_frag_cache.h >> +++ b/include/linux/page_frag_cache.h >> @@ -8,29 +8,81 @@ >> #define PAGE_FRAG_CACHE_MAX_SIZE __ALIGN_MASK(32768, ~PAGE_MASK) >> #define PAGE_FRAG_CACHE_MAX_ORDER get_order(PAGE_FRAG_CACHE_MAX_SIZE) >> >> -struct page_frag_cache { >> - void *va; >> +/* >> + * struct encoded_va - a nonexistent type marking this pointer >> + * >> + * An 'encoded_va' pointer is a pointer to a aligned virtual address, which is >> + * at least aligned to PAGE_SIZE, that means there are at least 12 lower bits >> + * space available for other purposes. >> + * >> + * Currently we use the lower 8 bits and bit 9 for the order and PFMEMALLOC >> + * flag of the page this 'va' is corresponding to. >> + * >> + * Use the supplied helper functions to endcode/decode the pointer and bits. >> + */ >> +struct encoded_va; >> + > > Why did you create a struct for this? The way you use it below it is > just a pointer. No point in defining a struct that doesn't exist > anywhere. The encoded_va is mirroring the encoded_page below: https://elixir.bootlin.com/linux/v6.10-rc6/source/include/linux/mm_types.h#L222 https://github.com/torvalds/linux/commit/70fb4fdff5826a48886152fd5c5db04eb6c59a40 "So this introduces a 'struct encoded_page' pointer that cannot be used for anything else than to encode a real page pointer and a couple of extra bits in the low bits. That way the compiler can trivially track the state of the pointer and you just explicitly encode and decode the extra bits." 
It seems to be similar for encoded_va case too, I guess this is more of personal preference for using a struct or unsigned long here, I have no strong preference here and it can be changed if you really insist. > >> +#define PAGE_FRAG_CACHE_ORDER_MASK GENMASK(7, 0) >> +#define PAGE_FRAG_CACHE_PFMEMALLOC_BIT BIT(8) >> +#define PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT 8 >> + >> +static inline struct encoded_va *encode_aligned_va(void *va, >> + unsigned int order, >> + bool pfmemalloc) >> +{ >> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) >> - __u16 offset; >> - __u16 size; >> + return (struct encoded_va *)((unsigned long)va | order | >> + pfmemalloc << PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT); >> #else >> - __u32 offset; >> + return (struct encoded_va *)((unsigned long)va | >> + pfmemalloc << PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT); >> +#endif >> +} >> + >> +static inline unsigned long encoded_page_order(struct encoded_va *encoded_va) >> +{ >> +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) >> + return PAGE_FRAG_CACHE_ORDER_MASK & (unsigned long)encoded_va; >> +#else >> + return 0; >> +#endif >> +} >> + >> +static inline bool encoded_page_pfmemalloc(struct encoded_va *encoded_va) >> +{ >> + return PAGE_FRAG_CACHE_PFMEMALLOC_BIT & (unsigned long)encoded_va; >> +} >> + > > My advice is that if you just make encoded_va an unsigned long this > just becomes some FIELD_GET and bit operations. As above. > >> +static inline void *encoded_page_address(struct encoded_va *encoded_va) >> +{ >> + return (void *)((unsigned long)encoded_va & PAGE_MASK); >> +} >> + >> +struct page_frag_cache { >> + struct encoded_va *encoded_va; > > This should be an unsigned long, not a pointer since you are storing > data other than just a pointer in here. The pointer is just one of the > things you extract out of it. > >> + >> +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32) >> + u16 pagecnt_bias; >> + u16 remaining; >> +#else >> + u32 pagecnt_bias; >> + u32 remaining; >> #endif >> - /* we maintain a pagecount bias, so that we dont dirty cache line >> - * containing page->_refcount every time we allocate a fragment. >> - */ >> - unsigned int pagecnt_bias; >> - bool pfmemalloc; >> }; >> >> static inline void page_frag_cache_init(struct page_frag_cache *nc) >> { >> - nc->va = NULL; >> + memset(nc, 0, sizeof(*nc)); > > Shouldn't need to memset 0 the whole thing. Just setting page and order > to 0 should be enough to indicate that there isn't anything there. As mentioned in the commit log: 'Also, it is better to replace 'offset' with 'remaining', which is the remaining size for the cache in a 'page_frag_cache' instance, we are able to do a single 'fragsz > remaining' checking for the case of cache not being enough, which should be the fast path if we ensure 'remaining' is zero when 'va' == NULL by memset'ing 'struct page_frag_cache' in page_frag_cache_init() and page_frag_cache_drain().' Yes, we are only really depending on nc->remaining being zero when 'va' == NULL untill next patch refactors more codes in __page_frag_alloc_va_align() to __page_frag_cache_refill(). Perhap I should do the memset() thing in next patch. 
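To make the point about the fast path concrete, here is a small sketch of the single-check allocation the memset() is meant to enable, relying on nc->remaining being zero whenever nc->encoded_va is NULL; the function names around the slow path are hypothetical and alignment is omitted:

```c
void *page_frag_alloc_sketch(struct page_frag_cache *nc, unsigned int fragsz,
			     gfp_t gfp_mask)
{
	/* one branch covers both "no page yet" (remaining was memset to 0)
	 * and "page exhausted", which is the intended fast path
	 */
	if (likely(fragsz <= nc->remaining)) {
		void *va = page_frag_cache_current_va(nc);

		nc->remaining -= fragsz;
		nc->pagecnt_bias--;
		return va;
	}

	/* hypothetical slow path: refill the cache and retry, or fail */
	return __page_frag_alloc_slow(nc, fragsz, gfp_mask);
}
```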
> >> } >> >> static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc) >> { >> - return !!nc->pfmemalloc; >> + return encoded_page_pfmemalloc(nc->encoded_va); >> +} >> + >> +static inline unsigned int page_frag_cache_page_size(struct encoded_va *encoded_va) >> +{ >> + return PAGE_SIZE << encoded_page_order(encoded_va); >> } >> >> void page_frag_cache_drain(struct page_frag_cache *nc); >> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c >> index dd640af5607a..a3316dd50eff 100644 >> --- a/mm/page_frag_cache.c >> +++ b/mm/page_frag_cache.c >> @@ -18,34 +18,61 @@ >> #include <linux/page_frag_cache.h> >> #include "internal.h" >> >> +static void *page_frag_cache_current_va(struct page_frag_cache *nc) >> +{ >> + struct encoded_va *encoded_va = nc->encoded_va; >> + >> + return (void *)(((unsigned long)encoded_va & PAGE_MASK) | >> + (page_frag_cache_page_size(encoded_va) - nc->remaining)); >> +} >> + > > Rather than an OR here I would rather see this just use addition. > Otherwise this logic becomes overly complicated. Sure. > >> static struct page *__page_frag_cache_refill(struct page_frag_cache *nc, >> gfp_t gfp_mask) >> { >> struct page *page = NULL; >> gfp_t gfp = gfp_mask; >> + unsigned int order; >> >> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) >> gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP | >> __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC; >> page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, >> PAGE_FRAG_CACHE_MAX_ORDER); >> - nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE; >> #endif >> - if (unlikely(!page)) >> + if (unlikely(!page)) { >> page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); >> + if (unlikely(!page)) { >> + memset(nc, 0, sizeof(*nc)); >> + return NULL; >> + } >> + >> + order = 0; >> + nc->remaining = PAGE_SIZE; >> + } else { >> + order = PAGE_FRAG_CACHE_MAX_ORDER; >> + nc->remaining = PAGE_FRAG_CACHE_MAX_SIZE; >> + } >> >> - nc->va = page ? page_address(page) : NULL; >> + /* Even if we own the page, we do not use atomic_set(). >> + * This would break get_page_unless_zero() users. >> + */ >> + page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE); >> >> + /* reset page count bias of new frag */ >> + nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; > > I would rather keep the pagecnt_bias, page reference addition, and > resetting of remaining outside of this. The only fields we should be > setting are order, the virtual address, and pfmemalloc since those are > what is encoded in your unsigned long variable. Is there any reason why you want to keep them outside of this? For resetting of remaining, it seems we need more check to decide the value of remaining if it is kept outside of this. Also, for the next patch, more common codes are refactored out of __page_frag_alloc_va_align() to __page_frag_cache_refill(), so that the new API can make use of them, so I am not sure it really matter that much. > >> + nc->encoded_va = encode_aligned_va(page_address(page), order, >> + page_is_pfmemalloc(page)); >> return page; >> } >> >> void page_frag_cache_drain(struct page_frag_cache *nc) >> { >> - if (!nc->va) >> + if (!nc->encoded_va) >> return; >> >> - __page_frag_cache_drain(virt_to_head_page(nc->va), nc->pagecnt_bias); >> - nc->va = NULL; >> + __page_frag_cache_drain(virt_to_head_page(nc->encoded_va), >> + nc->pagecnt_bias); >> + memset(nc, 0, sizeof(*nc)); > > Again, no need for memset when "nv->encoded_va = 0" will do. 
> >> } >> EXPORT_SYMBOL(page_frag_cache_drain); >> >> @@ -62,51 +89,41 @@ void *__page_frag_alloc_va_align(struct page_frag_cache *nc, >> unsigned int fragsz, gfp_t gfp_mask, >> unsigned int align_mask) >> { >> - unsigned int size = PAGE_SIZE; >> + struct encoded_va *encoded_va = nc->encoded_va; >> struct page *page; >> - int offset; >> + int remaining; >> + void *va; >> >> - if (unlikely(!nc->va)) { >> + if (unlikely(!encoded_va)) { >> refill: >> - page = __page_frag_cache_refill(nc, gfp_mask); >> - if (!page) >> + if (unlikely(!__page_frag_cache_refill(nc, gfp_mask))) >> return NULL; >> >> - /* Even if we own the page, we do not use atomic_set(). >> - * This would break get_page_unless_zero() users. >> - */ >> - page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE); >> - >> - /* reset page count bias and offset to start of new frag */ >> - nc->pfmemalloc = page_is_pfmemalloc(page); >> - nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; >> - nc->offset = 0; >> + encoded_va = nc->encoded_va; >> } >> >> -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) >> - /* if size can vary use size else just use PAGE_SIZE */ >> - size = nc->size; >> -#endif >> - >> - offset = __ALIGN_KERNEL_MASK(nc->offset, ~align_mask); >> - if (unlikely(offset + fragsz > size)) { >> - page = virt_to_page(nc->va); >> - >> + remaining = nc->remaining & align_mask; >> + remaining -= fragsz; >> + if (unlikely(remaining < 0)) { > > Now this is just getting confusing. You essentially just added an > additional addition step and went back to the countdown approach I was > using before except for the fact that you are starting at 0 whereas I > was actually moving down through the page. Does the 'additional addition step' mean the additional step to calculate the offset using the new 'remaining' field? I guess that is the disadvantage by changing 'offset' to 'remaining', but it also some advantages too: 1. it is better to replace 'offset' with 'remaining', which is the remaining size for the cache in a 'page_frag_cache' instance, we are able to do a single 'fragsz > remaining' checking for the case of cache not being enough, which should be the fast path if we ensure size is zoro when 'va' == NULL by memset'ing 'struct page_frag_cache' in page_frag_cache_init() and page_frag_cache_drain(). 2. It seems more convenient to implement the commit/probe API too when using 'remaining' instead of 'offset' as those API needs the remaining size of the page_frag_cache anyway. So it is really a trade-off between using 'offset' and 'remaining', it is like the similar argument about trade-off between allocating fragment 'countdown' and 'countup' way. About confusing part, as the nc->remaining does mean how much cache is left in the 'nc', and nc->remaining does start from PAGE_FRAG_CACHE_MAX_SIZE/PAGE_SIZE to zero naturally if that was what you meant by 'countdown', but it is different from the 'offset countdown' before this patchset as my understanding. > > What I would suggest doing since "remaining" is a negative offset > anyway would be to look at just storing it as a signed negative number. > At least with that you can keep to your original approach and would > only have to change your check to be for "remaining + fragsz <= 0". Did you mean by using s16/s32 for 'remaining'? And set nc->remaining like below? 
nc->remaining = -PAGE_SIZE or nc->remaining = -PAGE_FRAG_CACHE_MAX_SIZE struct page_frag_cache { struct encoded_va *encoded_va; #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32) u16 pagecnt_bias; s16 remaining; #else u32 pagecnt_bias; s32 remaining; #endif }; If I understand above correctly, it seems we really need a better name than 'remaining' to reflect that. > With that you can still do your math but it becomes an addition instead > of a subtraction. And I am not really sure what is the gain here by using an addition instead of a subtraction here. > >> + page = virt_to_page(encoded_va); >> if (!page_ref_sub_and_test(page, nc->pagecnt_bias)) >> goto refill; >> >> - if (unlikely(nc->pfmemalloc)) { >> - free_unref_page(page, compound_order(page)); >> + if (unlikely(encoded_page_pfmemalloc(encoded_va))) { >> + VM_BUG_ON(compound_order(page) != >> + encoded_page_order(encoded_va)); >> + free_unref_page(page, encoded_page_order(encoded_va)); >> goto refill; >> } >> >> /* OK, page count is 0, we can safely set it */ >> set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1); >> >> - /* reset page count bias and offset to start of new frag */ >> + /* reset page count bias and remaining of new frag */ >> nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; >> - offset = 0; >> - if (unlikely(fragsz > PAGE_SIZE)) { >> + nc->remaining = remaining = page_frag_cache_page_size(encoded_va); >> + remaining -= fragsz; >> + if (unlikely(remaining < 0)) { >> /* >> * The caller is trying to allocate a fragment >> * with fragsz > PAGE_SIZE but the cache isn't big > > I find it really amusing that you went to all the trouble of flipping > the logic just to flip it back to being a countdown setup. If you were > going to bother with all that then why not just make the remaining > negative instead? You could save yourself a ton of trouble that way and > all you would need to do is flip a few signs. I am not sure I understand the 'a ton of trouble' part here, if 'flipping a few signs' does save a ton of trouble here, I would like to avoid 'a ton of trouble' here, but I am not really understand the gain here yet as mentioned above.
On Tue, 2024-07-02 at 20:35 +0800, Yunsheng Lin wrote: > On 2024/7/2 8:08, Alexander H Duyck wrote: > > On Tue, 2024-06-25 at 21:52 +0800, Yunsheng Lin wrote: > > > Currently there is one 'struct page_frag' for every 'struct > > > sock' and 'struct task_struct', we are about to replace the > > > 'struct page_frag' with 'struct page_frag_cache' for them. > > > Before begin the replacing, we need to ensure the size of > > > 'struct page_frag_cache' is not bigger than the size of > > > 'struct page_frag', as there may be tens of thousands of > > > 'struct sock' and 'struct task_struct' instances in the > > > system. > > > > > > By or'ing the page order & pfmemalloc with lower bits of > > > 'va' instead of using 'u16' or 'u32' for page size and 'u8' > > > for pfmemalloc, we are able to avoid 3 or 5 bytes space waste. > > > And page address & pfmemalloc & order is unchanged for the > > > same page in the same 'page_frag_cache' instance, it makes > > > sense to fit them together. > > > > > > Also, it is better to replace 'offset' with 'remaining', which > > > is the remaining size for the cache in a 'page_frag_cache' > > > instance, we are able to do a single 'fragsz > remaining' > > > checking for the case of cache not being enough, which should be > > > the fast path if we ensure size is zoro when 'va' == NULL by > > > memset'ing 'struct page_frag_cache' in page_frag_cache_init() > > > and page_frag_cache_drain(). > > > > > > After this patch, the size of 'struct page_frag_cache' should be > > > the same as the size of 'struct page_frag'. > > > > > > CC: Alexander Duyck <alexander.duyck@gmail.com> > > > Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com> > > > --- > > > include/linux/page_frag_cache.h | 76 +++++++++++++++++++++++----- > > > mm/page_frag_cache.c | 90 ++++++++++++++++++++------------- > > > 2 files changed, 118 insertions(+), 48 deletions(-) > > > > > > diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h > > > index 6ac3a25089d1..b33904d4494f 100644 > > > --- a/include/linux/page_frag_cache.h > > > +++ b/include/linux/page_frag_cache.h > > > @@ -8,29 +8,81 @@ > > > #define PAGE_FRAG_CACHE_MAX_SIZE __ALIGN_MASK(32768, ~PAGE_MASK) > > > #define PAGE_FRAG_CACHE_MAX_ORDER get_order(PAGE_FRAG_CACHE_MAX_SIZE) > > > > > > -struct page_frag_cache { > > > - void *va; > > > +/* > > > + * struct encoded_va - a nonexistent type marking this pointer > > > + * > > > + * An 'encoded_va' pointer is a pointer to a aligned virtual address, which is > > > + * at least aligned to PAGE_SIZE, that means there are at least 12 lower bits > > > + * space available for other purposes. > > > + * > > > + * Currently we use the lower 8 bits and bit 9 for the order and PFMEMALLOC > > > + * flag of the page this 'va' is corresponding to. > > > + * > > > + * Use the supplied helper functions to endcode/decode the pointer and bits. > > > + */ > > > +struct encoded_va; > > > + > > > > Why did you create a struct for this? The way you use it below it is > > just a pointer. No point in defining a struct that doesn't exist > > anywhere. > > The encoded_va is mirroring the encoded_page below: > https://elixir.bootlin.com/linux/v6.10-rc6/source/include/linux/mm_types.h#L222 > https://github.com/torvalds/linux/commit/70fb4fdff5826a48886152fd5c5db04eb6c59a40 > > "So this introduces a 'struct encoded_page' pointer that cannot be used for > anything else than to encode a real page pointer and a couple of extra > bits in the low bits. 
That way the compiler can trivially track the state > of the pointer and you just explicitly encode and decode the extra bits." > > It seems to be similar for encoded_va case too, I guess this is more of personal > preference for using a struct or unsigned long here, I have no strong preference > here and it can be changed if you really insist. > Really this seems like a bad copy with none of the guardrails. I still think you would be better off just storing this as a long rather than a pointer. It doesn't need to be a pointer and it creates a bunch of unnecessary extra overhead. In addition if you store it as a long as I mentioned it will make it much easier to extract the size and pfmemalloc fields. Basically this thing is more bitfield(2 items) than pointer(1 item) which is why I think it would make more sense as an unsigned long. Then you only have to cast to get the pointer in and the pointer out. > > > > > +#define PAGE_FRAG_CACHE_ORDER_MASK GENMASK(7, 0) > > > +#define PAGE_FRAG_CACHE_PFMEMALLOC_BIT BIT(8) > > > +#define PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT 8 > > > + > > > +static inline struct encoded_va *encode_aligned_va(void *va, > > > + unsigned int order, > > > + bool pfmemalloc) > > > +{ > > > #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) > > > - __u16 offset; > > > - __u16 size; > > > + return (struct encoded_va *)((unsigned long)va | order | > > > + pfmemalloc << PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT); > > > #else > > > - __u32 offset; > > > + return (struct encoded_va *)((unsigned long)va | > > > + pfmemalloc << PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT); > > > +#endif > > > +} > > > + This is missing any and all protection of the example you cited. If I want to pass order as a 32b value I can and I can corrupt the virtual address. Same thing with pfmemalloc. Lets only hope you don't hit an architecture where a bool is a u8 in size as otherwise that shift is going to wipe out the value, and if it is a u32 which is usually the case lets hope they stick to the values of 0 and 1. > > > +static inline unsigned long encoded_page_order(struct encoded_va *encoded_va) > > > +{ > > > +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) > > > + return PAGE_FRAG_CACHE_ORDER_MASK & (unsigned long)encoded_va; > > > +#else > > > + return 0; > > > +#endif > > > +} > > > + > > > +static inline bool encoded_page_pfmemalloc(struct encoded_va *encoded_va) > > > +{ > > > + return PAGE_FRAG_CACHE_PFMEMALLOC_BIT & (unsigned long)encoded_va; > > > +} > > > + > > > > My advice is that if you just make encoded_va an unsigned long this > > just becomes some FIELD_GET and bit operations. > > As above. > The code you mentioned had one extra block of bits that was in it and had strict protections on what went into and out of those bits. You don't have any of those protections. I suggest you just use a long and don't bother storing this as a pointer. > > > +static inline void *encoded_page_address(struct encoded_va *encoded_va) > > > +{ > > > + return (void *)((unsigned long)encoded_va & PAGE_MASK); > > > +} > > > + > > > +struct page_frag_cache { > > > + struct encoded_va *encoded_va; > > > > This should be an unsigned long, not a pointer since you are storing > > data other than just a pointer in here. The pointer is just one of the > > things you extract out of it. 
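For comparison, a sketch of what the "guard rails" being asked for might look like if the encoded value were an unsigned long; the particular checks are an assumption about what would be appropriate here, not something taken from the posted patch or from encode_page():

```c
static inline unsigned long encode_aligned_va(void *va, unsigned int order,
					      bool pfmemalloc)
{
	unsigned long encoded_va = (unsigned long)va;

	/* the borrowed low bits must really be free */
	VM_BUG_ON(!PAGE_ALIGNED(va));
	/* order must fit into the bits reserved for it */
	VM_BUG_ON(order & ~PAGE_FRAG_CACHE_ORDER_MASK);

	encoded_va |= order;
	/* avoid shifting a bool: any non-zero pfmemalloc sets exactly one bit */
	if (pfmemalloc)
		encoded_va |= PAGE_FRAG_CACHE_PFMEMALLOC_BIT;

	return encoded_va;
}
```

Whether these become VM_BUG_ON()s, BUILD_BUG_ON()s where the inputs are compile-time constants, or simply a mask before the OR is a policy choice; the point is that a bad input cannot silently corrupt the virtual address bits.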
> > > > > + > > > +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32) > > > + u16 pagecnt_bias; > > > + u16 remaining; > > > +#else > > > + u32 pagecnt_bias; > > > + u32 remaining; > > > #endif > > > - /* we maintain a pagecount bias, so that we dont dirty cache line > > > - * containing page->_refcount every time we allocate a fragment. > > > - */ > > > - unsigned int pagecnt_bias; > > > - bool pfmemalloc; > > > }; > > > > > > static inline void page_frag_cache_init(struct page_frag_cache *nc) > > > { > > > - nc->va = NULL; > > > + memset(nc, 0, sizeof(*nc)); > > > > Shouldn't need to memset 0 the whole thing. Just setting page and order > > to 0 should be enough to indicate that there isn't anything there. > > As mentioned in the commit log: > 'Also, it is better to replace 'offset' with 'remaining', which > is the remaining size for the cache in a 'page_frag_cache' > instance, we are able to do a single 'fragsz > remaining' > checking for the case of cache not being enough, which should be > the fast path if we ensure 'remaining' is zero when 'va' == NULL by > memset'ing 'struct page_frag_cache' in page_frag_cache_init() > and page_frag_cache_drain().' > > Yes, we are only really depending on nc->remaining being zero > when 'va' == NULL untill next patch refactors more codes in > __page_frag_alloc_va_align() to __page_frag_cache_refill(). > Perhap I should do the memset() thing in next patch. > I get that. You reverted the code back to where it was in patch 3 but are retaining the bottom up direction. If you were going to do that you might as well just did the "remaining" change in patch 3 and saved me having to review the same block of logic twice. > > > > > > > static struct page *__page_frag_cache_refill(struct page_frag_cache *nc, > > > gfp_t gfp_mask) > > > { > > > struct page *page = NULL; > > > gfp_t gfp = gfp_mask; > > > + unsigned int order; > > > > > > #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) > > > gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP | > > > __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC; > > > page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, > > > PAGE_FRAG_CACHE_MAX_ORDER); > > > - nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE; > > > #endif > > > - if (unlikely(!page)) > > > + if (unlikely(!page)) { > > > page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); > > > + if (unlikely(!page)) { > > > + memset(nc, 0, sizeof(*nc)); > > > + return NULL; > > > + } > > > + > > > + order = 0; > > > + nc->remaining = PAGE_SIZE; > > > + } else { > > > + order = PAGE_FRAG_CACHE_MAX_ORDER; > > > + nc->remaining = PAGE_FRAG_CACHE_MAX_SIZE; > > > + } > > > > > > - nc->va = page ? page_address(page) : NULL; > > > + /* Even if we own the page, we do not use atomic_set(). > > > + * This would break get_page_unless_zero() users. > > > + */ > > > + page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE); > > > > > > + /* reset page count bias of new frag */ > > > + nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; > > > > I would rather keep the pagecnt_bias, page reference addition, and > > resetting of remaining outside of this. The only fields we should be > > setting are order, the virtual address, and pfmemalloc since those are > > what is encoded in your unsigned long variable. > > Is there any reason why you want to keep them outside of this? Because this is now essentially a function to allocate an encoded page. The output should be your encoded virtual address with the size and pfmemalloc flags built in. 
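Reading that comment literally, a refill that only produces the encoded address might look roughly like the sketch below (assuming encoded_va is stored as an unsigned long as suggested earlier), with the page reference bump and the pagecnt_bias / 'remaining' reset left to the caller; the return convention and details are assumptions, not the posted code:

```c
static unsigned long __page_frag_cache_refill(struct page_frag_cache *nc,
					      gfp_t gfp_mask)
{
	unsigned int order = PAGE_FRAG_CACHE_MAX_ORDER;
	struct page *page = NULL;

#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
	page = alloc_pages_node(NUMA_NO_NODE,
				(gfp_mask & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP |
				__GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC,
				PAGE_FRAG_CACHE_MAX_ORDER);
#endif
	if (unlikely(!page)) {
		page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, 0);
		if (unlikely(!page)) {
			nc->encoded_va = 0;	/* caller treats 0 as "empty" */
			return 0;
		}
		order = 0;
	}

	/* only the fields that live in the encoded word are set here */
	nc->encoded_va = encode_aligned_va(page_address(page), order,
					   page_is_pfmemalloc(page));
	return nc->encoded_va;
}
```

The caller would then do the page_ref_add(), reset pagecnt_bias and derive 'remaining' from the encoded order, keeping the refill step a pure "allocate and encode" operation.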
> For resetting of remaining, it seems we need more check to decide the > value of remaining if it is kept outside of this. > > Also, for the next patch, more common codes are refactored out of > __page_frag_alloc_va_align() to __page_frag_cache_refill(), so that > the new API can make use of them, so I am not sure it really matter > that much. > > > > > > + nc->encoded_va = encode_aligned_va(page_address(page), order, > > > + page_is_pfmemalloc(page)); > > > return page; > > > } > > > > > > void page_frag_cache_drain(struct page_frag_cache *nc) > > > { > > > - if (!nc->va) > > > + if (!nc->encoded_va) > > > return; > > > > > > - __page_frag_cache_drain(virt_to_head_page(nc->va), nc->pagecnt_bias); > > > - nc->va = NULL; > > > + __page_frag_cache_drain(virt_to_head_page(nc->encoded_va), > > > + nc->pagecnt_bias); > > > + memset(nc, 0, sizeof(*nc)); > > > > Again, no need for memset when "nv->encoded_va = 0" will do. > > > > > } > > > EXPORT_SYMBOL(page_frag_cache_drain); > > > > > > @@ -62,51 +89,41 @@ void *__page_frag_alloc_va_align(struct page_frag_cache *nc, > > > unsigned int fragsz, gfp_t gfp_mask, > > > unsigned int align_mask) > > > { > > > - unsigned int size = PAGE_SIZE; > > > + struct encoded_va *encoded_va = nc->encoded_va; > > > struct page *page; > > > - int offset; > > > + int remaining; > > > + void *va; > > > > > > - if (unlikely(!nc->va)) { > > > + if (unlikely(!encoded_va)) { > > > refill: > > > - page = __page_frag_cache_refill(nc, gfp_mask); > > > - if (!page) > > > + if (unlikely(!__page_frag_cache_refill(nc, gfp_mask))) > > > return NULL; > > > > > > - /* Even if we own the page, we do not use atomic_set(). > > > - * This would break get_page_unless_zero() users. > > > - */ > > > - page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE); > > > - > > > - /* reset page count bias and offset to start of new frag */ > > > - nc->pfmemalloc = page_is_pfmemalloc(page); > > > - nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; > > > - nc->offset = 0; > > > + encoded_va = nc->encoded_va; > > > } > > > > > > -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) > > > - /* if size can vary use size else just use PAGE_SIZE */ > > > - size = nc->size; > > > -#endif > > > - > > > - offset = __ALIGN_KERNEL_MASK(nc->offset, ~align_mask); > > > - if (unlikely(offset + fragsz > size)) { > > > - page = virt_to_page(nc->va); > > > - > > > + remaining = nc->remaining & align_mask; > > > + remaining -= fragsz; > > > + if (unlikely(remaining < 0)) { > > > > Now this is just getting confusing. You essentially just added an > > additional addition step and went back to the countdown approach I was > > using before except for the fact that you are starting at 0 whereas I > > was actually moving down through the page. > > Does the 'additional addition step' mean the additional step to calculate > the offset using the new 'remaining' field? I guess that is the disadvantage > by changing 'offset' to 'remaining', but it also some advantages too: > > 1. it is better to replace 'offset' with 'remaining', which > is the remaining size for the cache in a 'page_frag_cache' > instance, we are able to do a single 'fragsz > remaining' > checking for the case of cache not being enough, which should be > the fast path if we ensure size is zoro when 'va' == NULL by > memset'ing 'struct page_frag_cache' in page_frag_cache_init() > and page_frag_cache_drain(). > 2. 
It seems more convenient to implement the commit/probe API too > when using 'remaining' instead of 'offset' as those API needs > the remaining size of the page_frag_cache anyway. > > So it is really a trade-off between using 'offset' and 'remaining', > it is like the similar argument about trade-off between allocating > fragment 'countdown' and 'countup' way. > > About confusing part, as the nc->remaining does mean how much cache > is left in the 'nc', and nc->remaining does start from > PAGE_FRAG_CACHE_MAX_SIZE/PAGE_SIZE to zero naturally if that was what > you meant by 'countdown', but it is different from the 'offset countdown' > before this patchset as my understanding. > > > > > What I would suggest doing since "remaining" is a negative offset > > anyway would be to look at just storing it as a signed negative number. > > At least with that you can keep to your original approach and would > > only have to change your check to be for "remaining + fragsz <= 0". > > Did you mean by using s16/s32 for 'remaining'? And set nc->remaining like > below? > nc->remaining = -PAGE_SIZE or > nc->remaining = -PAGE_FRAG_CACHE_MAX_SIZE Yes. Basically if we used it as a signed value then you could just work your way up and do your aligned math still. > struct page_frag_cache { > struct encoded_va *encoded_va; > > #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32) > u16 pagecnt_bias; > s16 remaining; > #else > u32 pagecnt_bias; > s32 remaining; > #endif > }; > > If I understand above correctly, it seems we really need a better name > than 'remaining' to reflect that. It would effectively work like a countdown. Instead of T - X in this case it is size - X. > > With that you can still do your math but it becomes an addition instead > > of a subtraction. > > And I am not really sure what is the gain here by using an addition > instead of a subtraction here. > The "remaining" as it currently stands is doing the same thing so odds are it isn't too big a deal. It is just that the original code was already somewhat confusing and this is just making it that much more complex. > > > + page = virt_to_page(encoded_va); > > > if (!page_ref_sub_and_test(page, nc->pagecnt_bias)) > > > goto refill; > > > > > > - if (unlikely(nc->pfmemalloc)) { > > > - free_unref_page(page, compound_order(page)); > > > + if (unlikely(encoded_page_pfmemalloc(encoded_va))) { > > > + VM_BUG_ON(compound_order(page) != > > > + encoded_page_order(encoded_va)); > > > + free_unref_page(page, encoded_page_order(encoded_va)); > > > goto refill; > > > } > > > > > > /* OK, page count is 0, we can safely set it */ > > > set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1); > > > > > > - /* reset page count bias and offset to start of new frag */ > > > + /* reset page count bias and remaining of new frag */ > > > nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; > > > - offset = 0; > > > - if (unlikely(fragsz > PAGE_SIZE)) { > > > + nc->remaining = remaining = page_frag_cache_page_size(encoded_va); > > > + remaining -= fragsz; > > > + if (unlikely(remaining < 0)) { > > > /* > > > * The caller is trying to allocate a fragment > > > * with fragsz > PAGE_SIZE but the cache isn't big > > > > I find it really amusing that you went to all the trouble of flipping > > the logic just to flip it back to being a countdown setup. If you were > > going to bother with all that then why not just make the remaining > > negative instead? You could save yourself a ton of trouble that way and > > all you would need to do is flip a few signs. 
> > I am not sure I understand the 'a ton of trouble' part here, if 'flipping > a few signs' does save a ton of trouble here, I would like to avoid 'a > ton of trouble' here, but I am not really understand the gain here yet as > mentioned above. It isn't about flipping the signs. It is more the fact that the logic has now become even more complex then it originally was. With my work backwards approach the advantage was that the offset could be updated and then we just recorded everything and reported it up. Now we have to keep a temporary "remaining" value, generate our virtual address and store that to a temp variable, we can record the new "remaining" value, and then we can report the virtual address. If we get the ordering on the steps 2 and 3 in this it will cause issues as we will be pointing to the wrong values. It is something I can see someone easily messing up.
On 2024/7/2 22:55, Alexander H Duyck wrote: ... > >>> >>>> +#define PAGE_FRAG_CACHE_ORDER_MASK GENMASK(7, 0) >>>> +#define PAGE_FRAG_CACHE_PFMEMALLOC_BIT BIT(8) >>>> +#define PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT 8 >>>> + >>>> +static inline struct encoded_va *encode_aligned_va(void *va, >>>> + unsigned int order, >>>> + bool pfmemalloc) >>>> +{ >>>> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) >>>> - __u16 offset; >>>> - __u16 size; >>>> + return (struct encoded_va *)((unsigned long)va | order | >>>> + pfmemalloc << PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT); >>>> #else >>>> - __u32 offset; >>>> + return (struct encoded_va *)((unsigned long)va | >>>> + pfmemalloc << PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT); >>>> +#endif >>>> +} >>>> + > > This is missing any and all protection of the example you cited. If I > want to pass order as a 32b value I can and I can corrupt the virtual > address. Same thing with pfmemalloc. Lets only hope you don't hit an > architecture where a bool is a u8 in size as otherwise that shift is > going to wipe out the value, and if it is a u32 which is usually the > case lets hope they stick to the values of 0 and 1. I explicitly checked that the protection is not really needed due to performance consideration: 1. For the 'pfmemalloc' part, it does always stick to the values of 0 and 1 as below: https://elixir.bootlin.com/linux/v6.10-rc6/source/Documentation/process/coding-style.rst#L1053 2. For the 'order' part, its range can only be within 0~3. > >>>> +static inline unsigned long encoded_page_order(struct encoded_va *encoded_va) >>>> +{ >>>> +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) >>>> + return PAGE_FRAG_CACHE_ORDER_MASK & (unsigned long)encoded_va; >>>> +#else >>>> + return 0; >>>> +#endif >>>> +} >>>> + >>>> +static inline bool encoded_page_pfmemalloc(struct encoded_va *encoded_va) >>>> +{ >>>> + return PAGE_FRAG_CACHE_PFMEMALLOC_BIT & (unsigned long)encoded_va; >>>> +} >>>> + >>> >>> My advice is that if you just make encoded_va an unsigned long this >>> just becomes some FIELD_GET and bit operations. >> >> As above. >> > > The code you mentioned had one extra block of bits that was in it and > had strict protections on what went into and out of those bits. You > don't have any of those protections. As above, the protection masking/checking is explicitly avoided due to performance consideration and reasons as above for encoded_va. But I guess it doesn't hurt to have a VM_BUG_ON() checking to catch possible future mistake. > > I suggest you just use a long and don't bother storing this as a > pointer. > ... >>>> - >>>> + remaining = nc->remaining & align_mask; >>>> + remaining -= fragsz; >>>> + if (unlikely(remaining < 0)) { >>> >>> Now this is just getting confusing. You essentially just added an >>> additional addition step and went back to the countdown approach I was >>> using before except for the fact that you are starting at 0 whereas I >>> was actually moving down through the page. >> >> Does the 'additional addition step' mean the additional step to calculate >> the offset using the new 'remaining' field? I guess that is the disadvantage >> by changing 'offset' to 'remaining', but it also some advantages too: >> >> 1. 
it is better to replace 'offset' with 'remaining', which >> is the remaining size for the cache in a 'page_frag_cache' >> instance, we are able to do a single 'fragsz > remaining' >> checking for the case of cache not being enough, which should be >> the fast path if we ensure size is zoro when 'va' == NULL by >> memset'ing 'struct page_frag_cache' in page_frag_cache_init() >> and page_frag_cache_drain(). >> 2. It seems more convenient to implement the commit/probe API too >> when using 'remaining' instead of 'offset' as those API needs >> the remaining size of the page_frag_cache anyway. >> >> So it is really a trade-off between using 'offset' and 'remaining', >> it is like the similar argument about trade-off between allocating >> fragment 'countdown' and 'countup' way. >> >> About confusing part, as the nc->remaining does mean how much cache >> is left in the 'nc', and nc->remaining does start from >> PAGE_FRAG_CACHE_MAX_SIZE/PAGE_SIZE to zero naturally if that was what >> you meant by 'countdown', but it is different from the 'offset countdown' >> before this patchset as my understanding. >> >>> >>> What I would suggest doing since "remaining" is a negative offset >>> anyway would be to look at just storing it as a signed negative number. >>> At least with that you can keep to your original approach and would >>> only have to change your check to be for "remaining + fragsz <= 0". >> >> Did you mean by using s16/s32 for 'remaining'? And set nc->remaining like >> below? >> nc->remaining = -PAGE_SIZE or >> nc->remaining = -PAGE_FRAG_CACHE_MAX_SIZE > > Yes. Basically if we used it as a signed value then you could just work > your way up and do your aligned math still. For the aligned math, I am not sure how can 'align_mask' be appiled to a signed value yet. It seems that after masking nc->remaining leans towards -PAGE_SIZE/-PAGE_FRAG_CACHE_MAX_SIZE instead of zero for a unsigned value, for example: nc->remaining = -4094; nc->remaining &= -64; It seems we got -4096 for above, is that what we are expecting? > >> struct page_frag_cache { >> struct encoded_va *encoded_va; >> >> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32) >> u16 pagecnt_bias; >> s16 remaining; >> #else >> u32 pagecnt_bias; >> s32 remaining; >> #endif >> }; >> >> If I understand above correctly, it seems we really need a better name >> than 'remaining' to reflect that. > > It would effectively work like a countdown. Instead of T - X in this > case it is size - X. > >>> With that you can still do your math but it becomes an addition instead >>> of a subtraction. >> >> And I am not really sure what is the gain here by using an addition >> instead of a subtraction here. >> > > The "remaining" as it currently stands is doing the same thing so odds > are it isn't too big a deal. It is just that the original code was > already somewhat confusing and this is just making it that much more > complex. 
> >>>> + page = virt_to_page(encoded_va); >>>> if (!page_ref_sub_and_test(page, nc->pagecnt_bias)) >>>> goto refill; >>>> >>>> - if (unlikely(nc->pfmemalloc)) { >>>> - free_unref_page(page, compound_order(page)); >>>> + if (unlikely(encoded_page_pfmemalloc(encoded_va))) { >>>> + VM_BUG_ON(compound_order(page) != >>>> + encoded_page_order(encoded_va)); >>>> + free_unref_page(page, encoded_page_order(encoded_va)); >>>> goto refill; >>>> } >>>> >>>> /* OK, page count is 0, we can safely set it */ >>>> set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1); >>>> >>>> - /* reset page count bias and offset to start of new frag */ >>>> + /* reset page count bias and remaining of new frag */ >>>> nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; >>>> - offset = 0; >>>> - if (unlikely(fragsz > PAGE_SIZE)) { >>>> + nc->remaining = remaining = page_frag_cache_page_size(encoded_va); >>>> + remaining -= fragsz; >>>> + if (unlikely(remaining < 0)) { >>>> /* >>>> * The caller is trying to allocate a fragment >>>> * with fragsz > PAGE_SIZE but the cache isn't big >>> >>> I find it really amusing that you went to all the trouble of flipping >>> the logic just to flip it back to being a countdown setup. If you were >>> going to bother with all that then why not just make the remaining >>> negative instead? You could save yourself a ton of trouble that way and >>> all you would need to do is flip a few signs. >> >> I am not sure I understand the 'a ton of trouble' part here, if 'flipping >> a few signs' does save a ton of trouble here, I would like to avoid 'a >> ton of trouble' here, but I am not really understand the gain here yet as >> mentioned above. > > It isn't about flipping the signs. It is more the fact that the logic > has now become even more complex then it originally was. With my work > backwards approach the advantage was that the offset could be updated > and then we just recorded everything and reported it up. Now we have to I am supposing the above is referring to 'offset countdown' way before this patchset, right? > keep a temporary "remaining" value, generate our virtual address and > store that to a temp variable, we can record the new "remaining" value, > and then we can report the virtual address. If we get the ordering on Yes, I noticed it when coding too, that is why current virtual address is generated in page_frag_cache_current_va() basing on nc->remaining instead of the local variable 'remaining' before assigning the local variable 'remaining' to nc->remaining. But I am not sure I understand how using a signed negative number for 'remaining' will change the above steps. If not, I still fail to see the gain of using a signed negative number for 'nc->remaining'. > the steps 2 and 3 in this it will cause issues as we will be pointing > to the wrong values. It is something I can see someone easily messing > up. Yes, agreed. It would be good to be more specific about how to avoid the above problem using a signed negative number for 'remaining' as I am not sure how it can be done yet. >
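A quick stand-alone check of the arithmetic being discussed (ordinary userspace C, not kernel code): masking a negative 'remaining' directly does round it away from zero as feared, while an add-then-mask in the style of __ALIGN_KERNEL() rounds the allocation start up as intended. The numbers mirror the -4094 / 64-byte example above:

```c
#include <assert.h>
#include <stdio.h>

int main(void)
{
	long remaining = -4094;	/* 2 bytes already used in a 4096 byte page */
	long align = 64;

	long plain_mask    = remaining & -align;		/* -4096 */
	long add_then_mask = (remaining + align - 1) & -align;	/* -4032 */

	assert(plain_mask == -4096);
	assert(add_then_mask == -4032);

	/* offset of the next fragment from the start of the page */
	printf("plain mask:    offset %ld\n", 4096 + plain_mask);	/* 0, wrong  */
	printf("add then mask: offset %ld\n", 4096 + add_then_mask);	/* 64, right */
	return 0;
}
```

So the plain mask would hand out memory that overlaps what was already allocated (offset 0), which is why the follow-up reply insists on doing the addition before the mask.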
On Wed, 2024-07-03 at 20:33 +0800, Yunsheng Lin wrote: > On 2024/7/2 22:55, Alexander H Duyck wrote: > > ... > > > > > > > > > > > > +#define PAGE_FRAG_CACHE_ORDER_MASK GENMASK(7, 0) > > > > > +#define PAGE_FRAG_CACHE_PFMEMALLOC_BIT BIT(8) > > > > > +#define PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT 8 > > > > > + > > > > > +static inline struct encoded_va *encode_aligned_va(void *va, > > > > > + unsigned int order, > > > > > + bool pfmemalloc) > > > > > +{ > > > > > #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) > > > > > - __u16 offset; > > > > > - __u16 size; > > > > > + return (struct encoded_va *)((unsigned long)va | order | > > > > > + pfmemalloc << PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT); > > > > > #else > > > > > - __u32 offset; > > > > > + return (struct encoded_va *)((unsigned long)va | > > > > > + pfmemalloc << PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT); > > > > > +#endif > > > > > +} > > > > > + > > > > This is missing any and all protection of the example you cited. If I > > want to pass order as a 32b value I can and I can corrupt the virtual > > address. Same thing with pfmemalloc. Lets only hope you don't hit an > > architecture where a bool is a u8 in size as otherwise that shift is > > going to wipe out the value, and if it is a u32 which is usually the > > case lets hope they stick to the values of 0 and 1. > > I explicitly checked that the protection is not really needed due to > performance consideration: > 1. For the 'pfmemalloc' part, it does always stick to the values of 0 > and 1 as below: > https://elixir.bootlin.com/linux/v6.10-rc6/source/Documentation/process/coding-style.rst#L1053 > > 2. For the 'order' part, its range can only be within 0~3. > > > > > > > > +static inline unsigned long encoded_page_order(struct encoded_va *encoded_va) > > > > > +{ > > > > > +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) > > > > > + return PAGE_FRAG_CACHE_ORDER_MASK & (unsigned long)encoded_va; > > > > > +#else > > > > > + return 0; > > > > > +#endif > > > > > +} > > > > > + > > > > > +static inline bool encoded_page_pfmemalloc(struct encoded_va *encoded_va) > > > > > +{ > > > > > + return PAGE_FRAG_CACHE_PFMEMALLOC_BIT & (unsigned long)encoded_va; > > > > > +} > > > > > + > > > > > > > > My advice is that if you just make encoded_va an unsigned long this > > > > just becomes some FIELD_GET and bit operations. > > > > > > As above. > > > > > > > The code you mentioned had one extra block of bits that was in it and > > had strict protections on what went into and out of those bits. You > > don't have any of those protections. > > As above, the protection masking/checking is explicitly avoided due > to performance consideration and reasons as above for encoded_va. > > But I guess it doesn't hurt to have a VM_BUG_ON() checking to catch > possible future mistake. > > > > > I suggest you just use a long and don't bother storing this as a > > pointer. > > > > ... > > > > > > - > > > > > + remaining = nc->remaining & align_mask; > > > > > + remaining -= fragsz; > > > > > + if (unlikely(remaining < 0)) { > > > > > > > > Now this is just getting confusing. You essentially just added an > > > > additional addition step and went back to the countdown approach I was > > > > using before except for the fact that you are starting at 0 whereas I > > > > was actually moving down through the page. > > > > > > Does the 'additional addition step' mean the additional step to calculate > > > the offset using the new 'remaining' field? 
I guess that is the disadvantage > > > by changing 'offset' to 'remaining', but it also some advantages too: > > > > > > 1. it is better to replace 'offset' with 'remaining', which > > > is the remaining size for the cache in a 'page_frag_cache' > > > instance, we are able to do a single 'fragsz > remaining' > > > checking for the case of cache not being enough, which should be > > > the fast path if we ensure size is zoro when 'va' == NULL by > > > memset'ing 'struct page_frag_cache' in page_frag_cache_init() > > > and page_frag_cache_drain(). > > > 2. It seems more convenient to implement the commit/probe API too > > > when using 'remaining' instead of 'offset' as those API needs > > > the remaining size of the page_frag_cache anyway. > > > > > > So it is really a trade-off between using 'offset' and 'remaining', > > > it is like the similar argument about trade-off between allocating > > > fragment 'countdown' and 'countup' way. > > > > > > About confusing part, as the nc->remaining does mean how much cache > > > is left in the 'nc', and nc->remaining does start from > > > PAGE_FRAG_CACHE_MAX_SIZE/PAGE_SIZE to zero naturally if that was what > > > you meant by 'countdown', but it is different from the 'offset countdown' > > > before this patchset as my understanding. > > > > > > > > > > > What I would suggest doing since "remaining" is a negative offset > > > > anyway would be to look at just storing it as a signed negative number. > > > > At least with that you can keep to your original approach and would > > > > only have to change your check to be for "remaining + fragsz <= 0". > > > > > > Did you mean by using s16/s32 for 'remaining'? And set nc->remaining like > > > below? > > > nc->remaining = -PAGE_SIZE or > > > nc->remaining = -PAGE_FRAG_CACHE_MAX_SIZE > > > > Yes. Basically if we used it as a signed value then you could just work > > your way up and do your aligned math still. > > For the aligned math, I am not sure how can 'align_mask' be appiled to > a signed value yet. It seems that after masking nc->remaining leans > towards -PAGE_SIZE/-PAGE_FRAG_CACHE_MAX_SIZE instead of zero for > a unsigned value, for example: > > nc->remaining = -4094; > nc->remaining &= -64; > > It seems we got -4096 for above, is that what we are expecting? No, you have to do an addition and then the mask like you were before using __ALIGN_KERNEL. As it stands I realized your alignment approach in this patch is broken. You should be aligning the remaining at the start of this function and then storing it before you call page_frag_cache_current_va. Instead you do it after so the start of your block may not be aligned to the requested mask if you have multiple callers sharing this function or if you are passing an unaligned size in the request. > > > > > struct page_frag_cache { > > > struct encoded_va *encoded_va; > > > > > > #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32) > > > u16 pagecnt_bias; > > > s16 remaining; > > > #else > > > u32 pagecnt_bias; > > > s32 remaining; > > > #endif > > > }; > > > > > > If I understand above correctly, it seems we really need a better name > > > than 'remaining' to reflect that. > > > > It would effectively work like a countdown. Instead of T - X in this > > case it is size - X. > > > > > > With that you can still do your math but it becomes an addition instead > > > > of a subtraction. > > > > > > And I am not really sure what is the gain here by using an addition > > > instead of a subtraction here. 
> > > > > > > The "remaining" as it currently stands is doing the same thing so odds > > are it isn't too big a deal. It is just that the original code was > > already somewhat confusing and this is just making it that much more > > complex. > > > > > > > + page = virt_to_page(encoded_va); > > > > > if (!page_ref_sub_and_test(page, nc->pagecnt_bias)) > > > > > goto refill; > > > > > > > > > > - if (unlikely(nc->pfmemalloc)) { > > > > > - free_unref_page(page, compound_order(page)); > > > > > + if (unlikely(encoded_page_pfmemalloc(encoded_va))) { > > > > > + VM_BUG_ON(compound_order(page) != > > > > > + encoded_page_order(encoded_va)); > > > > > + free_unref_page(page, encoded_page_order(encoded_va)); > > > > > goto refill; > > > > > } > > > > > > > > > > /* OK, page count is 0, we can safely set it */ > > > > > set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1); > > > > > > > > > > - /* reset page count bias and offset to start of new frag */ > > > > > + /* reset page count bias and remaining of new frag */ > > > > > nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; > > > > > - offset = 0; > > > > > - if (unlikely(fragsz > PAGE_SIZE)) { > > > > > + nc->remaining = remaining = page_frag_cache_page_size(encoded_va); > > > > > + remaining -= fragsz; > > > > > + if (unlikely(remaining < 0)) { > > > > > /* > > > > > * The caller is trying to allocate a fragment > > > > > * with fragsz > PAGE_SIZE but the cache isn't big > > > > > > > > I find it really amusing that you went to all the trouble of flipping > > > > the logic just to flip it back to being a countdown setup. If you were > > > > going to bother with all that then why not just make the remaining > > > > negative instead? You could save yourself a ton of trouble that way and > > > > all you would need to do is flip a few signs. > > > > > > I am not sure I understand the 'a ton of trouble' part here, if 'flipping > > > a few signs' does save a ton of trouble here, I would like to avoid 'a > > > ton of trouble' here, but I am not really understand the gain here yet as > > > mentioned above. > > > > It isn't about flipping the signs. It is more the fact that the logic > > has now become even more complex then it originally was. With my work > > backwards approach the advantage was that the offset could be updated > > and then we just recorded everything and reported it up. Now we have to > > I am supposing the above is referring to 'offset countdown' way > before this patchset, right? > > > keep a temporary "remaining" value, generate our virtual address and > > store that to a temp variable, we can record the new "remaining" value, > > and then we can report the virtual address. If we get the ordering on > > Yes, I noticed it when coding too, that is why current virtual address is > generated in page_frag_cache_current_va() basing on nc->remaining instead > of the local variable 'remaining' before assigning the local variable > 'remaining' to nc->remaining. But I am not sure I understand how using a > signed negative number for 'remaining' will change the above steps. If > not, I still fail to see the gain of using a signed negative number for > 'nc->remaining'. > > > the steps 2 and 3 in this it will cause issues as we will be pointing > > to the wrong values. It is something I can see someone easily messing > > up. > > Yes, agreed. It would be good to be more specific about how to avoid > the above problem using a signed negative number for 'remaining' as > I am not sure how it can be done yet. 
> My advice would be to go back to patch 3 and figure out how to do this re-ordering without changing the alignment behaviour. The old code essentially aligned both the offset and fragsz by combining the two and then doing the alignment. Since you are doing a count up setup you will need to align the remaining, then add fragsz, and then I guess you could store remaining and then subtract fragsz from your final virtual address to get back to where the starting offset is actually located. Basically your "remaining" value isn't a safe number to use for an offset since it isn't aligned to your starting value at any point. As far as the negative value, it is more about making it easier to keep track of what is actually going on. Basically we can use regular pointer math and as such I suspect the compiler is having to do extra instructions to flip your value negative before it can combine the values via something like the LEA (load effective address) assembler call.
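Earlier in this reply the suggestion is to store encoded_va as a plain unsigned long and use bit operations on it. A minimal sketch of what that could look like with FIELD_PREP()/FIELD_GET(), reusing the patch's bit layout (order in bits 0-7, pfmemalloc in bit 8) and assuming 'va' is page aligned; the frag_va_*() names are illustrative, not helpers from the posted series.

#include <linux/bitfield.h>
#include <linux/bits.h>
#include <linux/mm.h>

#define FRAG_CACHE_ORDER_MASK		GENMASK(7, 0)
#define FRAG_CACHE_PFMEMALLOC_BIT	BIT(8)

static inline unsigned long encode_frag_va(void *va, unsigned int order,
					    bool pfmemalloc)
{
	/* va is page aligned, so the low bits are free for the metadata */
	return (unsigned long)va |
	       FIELD_PREP(FRAG_CACHE_ORDER_MASK, order) |
	       FIELD_PREP(FRAG_CACHE_PFMEMALLOC_BIT, pfmemalloc);
}

static inline unsigned int frag_va_order(unsigned long encoded_va)
{
	return FIELD_GET(FRAG_CACHE_ORDER_MASK, encoded_va);
}

static inline bool frag_va_pfmemalloc(unsigned long encoded_va)
{
	return FIELD_GET(FRAG_CACHE_PFMEMALLOC_BIT, encoded_va);
}

static inline void *frag_va_address(unsigned long encoded_va)
{
	return (void *)(encoded_va & PAGE_MASK);
}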
On 2024/7/10 23:28, Alexander H Duyck wrote: ... >>>> >>>>> >>>>> What I would suggest doing since "remaining" is a negative offset >>>>> anyway would be to look at just storing it as a signed negative number. >>>>> At least with that you can keep to your original approach and would >>>>> only have to change your check to be for "remaining + fragsz <= 0". >>>> >>>> Did you mean by using s16/s32 for 'remaining'? And set nc->remaining like >>>> below? >>>> nc->remaining = -PAGE_SIZE or >>>> nc->remaining = -PAGE_FRAG_CACHE_MAX_SIZE >>> >>> Yes. Basically if we used it as a signed value then you could just work >>> your way up and do your aligned math still. >> >> For the aligned math, I am not sure how can 'align_mask' be appiled to >> a signed value yet. It seems that after masking nc->remaining leans >> towards -PAGE_SIZE/-PAGE_FRAG_CACHE_MAX_SIZE instead of zero for >> a unsigned value, for example: >> >> nc->remaining = -4094; >> nc->remaining &= -64; >> >> It seems we got -4096 for above, is that what we are expecting? > > No, you have to do an addition and then the mask like you were before > using __ALIGN_KERNEL. > > As it stands I realized your alignment approach in this patch is > broken. You should be aligning the remaining at the start of this > function and then storing it before you call > page_frag_cache_current_va. Instead you do it after so the start of > your block may not be aligned to the requested mask if you have > multiple callers sharing this function or if you are passing an > unaligned size in the request. Yes, you seems right about it, the intention is to do the below as patch 3 does in v10: aligned_remaining = nc->remaining & align_mask; remaining = aligned_remaining - fragsz; nc->remaining = remaining; return encoded_page_address(nc->encoded_va) + (size - aligned_remaining); > >>> >>>> struct page_frag_cache { >>>> struct encoded_va *encoded_va; >>>> >>>> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32) >>>> u16 pagecnt_bias; >>>> s16 remaining; >>>> #else >>>> u32 pagecnt_bias; >>>> s32 remaining; >>>> #endif >>>> }; >>>> >>>> If I understand above correctly, it seems we really need a better name >>>> than 'remaining' to reflect that. >>> >>> It would effectively work like a countdown. Instead of T - X in this >>> case it is size - X. >>> >>>>> With that you can still do your math but it becomes an addition instead >>>>> of a subtraction. >>>> >>>> And I am not really sure what is the gain here by using an addition >>>> instead of a subtraction here. >>>> >>> >>> The "remaining" as it currently stands is doing the same thing so odds >>> are it isn't too big a deal. It is just that the original code was >>> already somewhat confusing and this is just making it that much more >>> complex. 
>>> >>>>>> + page = virt_to_page(encoded_va); >>>>>> if (!page_ref_sub_and_test(page, nc->pagecnt_bias)) >>>>>> goto refill; >>>>>> >>>>>> - if (unlikely(nc->pfmemalloc)) { >>>>>> - free_unref_page(page, compound_order(page)); >>>>>> + if (unlikely(encoded_page_pfmemalloc(encoded_va))) { >>>>>> + VM_BUG_ON(compound_order(page) != >>>>>> + encoded_page_order(encoded_va)); >>>>>> + free_unref_page(page, encoded_page_order(encoded_va)); >>>>>> goto refill; >>>>>> } >>>>>> >>>>>> /* OK, page count is 0, we can safely set it */ >>>>>> set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1); >>>>>> >>>>>> - /* reset page count bias and offset to start of new frag */ >>>>>> + /* reset page count bias and remaining of new frag */ >>>>>> nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; >>>>>> - offset = 0; >>>>>> - if (unlikely(fragsz > PAGE_SIZE)) { >>>>>> + nc->remaining = remaining = page_frag_cache_page_size(encoded_va); >>>>>> + remaining -= fragsz; >>>>>> + if (unlikely(remaining < 0)) { >>>>>> /* >>>>>> * The caller is trying to allocate a fragment >>>>>> * with fragsz > PAGE_SIZE but the cache isn't big >>>>> >>>>> I find it really amusing that you went to all the trouble of flipping >>>>> the logic just to flip it back to being a countdown setup. If you were >>>>> going to bother with all that then why not just make the remaining >>>>> negative instead? You could save yourself a ton of trouble that way and >>>>> all you would need to do is flip a few signs. >>>> >>>> I am not sure I understand the 'a ton of trouble' part here, if 'flipping >>>> a few signs' does save a ton of trouble here, I would like to avoid 'a >>>> ton of trouble' here, but I am not really understand the gain here yet as >>>> mentioned above. >>> >>> It isn't about flipping the signs. It is more the fact that the logic >>> has now become even more complex then it originally was. With my work >>> backwards approach the advantage was that the offset could be updated >>> and then we just recorded everything and reported it up. Now we have to >> >> I am supposing the above is referring to 'offset countdown' way >> before this patchset, right? >> >>> keep a temporary "remaining" value, generate our virtual address and >>> store that to a temp variable, we can record the new "remaining" value, >>> and then we can report the virtual address. If we get the ordering on >> >> Yes, I noticed it when coding too, that is why current virtual address is >> generated in page_frag_cache_current_va() basing on nc->remaining instead >> of the local variable 'remaining' before assigning the local variable >> 'remaining' to nc->remaining. But I am not sure I understand how using a >> signed negative number for 'remaining' will change the above steps. If >> not, I still fail to see the gain of using a signed negative number for >> 'nc->remaining'. >> >>> the steps 2 and 3 in this it will cause issues as we will be pointing >>> to the wrong values. It is something I can see someone easily messing >>> up. >> >> Yes, agreed. It would be good to be more specific about how to avoid >> the above problem using a signed negative number for 'remaining' as >> I am not sure how it can be done yet. >> > > My advice would be to go back to patch 3 and figure out how to do this > re-ordering without changing the alignment behaviour. The old code > essentially aligned both the offset and fragsz by combining the two and > then doing the alignment. 
Since you are doing a count up setup you will I am not sure I understand 'aligned both the offset and fragsz ' part, as 'fragsz' being aligned or not seems to depend on last caller' align_mask, for the same page_frag_cache instance, suppose offset = 32768 initially for the old code: Step 1: __page_frag_alloc_align() is called with fragsz=7 and align_mask=~0u the offset after this is 32761, the true fragsz is 7 too. Step 2: __page_frag_alloc_align() is called with fragsz=7 and align_mask=-16 the offset after this is 32752, the true fragsz is 9, which does not seems to be aligned. The above is why I added the below paragraph in the doc to make the semantic more explicit: "Depending on different aligning requirement, the page_frag API caller may call page_frag_alloc*_align*() to ensure the returned virtual address or offset of the page is aligned according to the 'align/alignment' parameter. Note the size of the allocated fragment is not aligned, the caller needs to provide an aligned fragsz if there is an alignment requirement for the size of the fragment." And existing callers of page_frag aligned API does seems to follow the above rule last time I checked. Or did I miss something obvious here? > need to align the remaining, then add fragsz, and then I guess you > could store remaining and then subtract fragsz from your final virtual > address to get back to where the starting offset is actually located. remaining = __ALIGN_KERNEL_MASK(nc->remaining, ~align_mask); remaining += fragsz; nc->remaining = remaining; return encoded_page_address(nc->encoded_va) + (size + remaining) - fragsz; If yes, I am not sure what is the point of doing the above yet, it just seem to make thing more complicated and harder to understand. > > Basically your "remaining" value isn't a safe number to use for an > offset since it isn't aligned to your starting value at any point. Does using 'aligned_remaining' local variable to make it more obvious seem reasonable to you? > > As far as the negative value, it is more about making it easier to keep > track of what is actually going on. Basically we can use regular > pointer math and as such I suspect the compiler is having to do extra > instructions to flip your value negative before it can combine the > values via something like the LEA (load effective address) assembler > call. I am not an asm expert here, I am not sure I understand the optimization trick here.
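To pin down the arithmetic being asked about above (illustrative values, 64-byte alignment): a bare mask on a negative 'remaining' rounds away from zero, while the add-then-mask form used by __ALIGN_KERNEL_MASK() rounds toward zero, which is what the signed count-up needs. A hedged compile-time check:

#include <linux/align.h>
#include <linux/build_bug.h>

static_assert((-4094 & -64) == -4096);			/* plain mask: away from zero */
static_assert(__ALIGN_KERNEL_MASK(-4094, 63) == -4032);	/* add-then-mask: toward zero */
static_assert(4096 + __ALIGN_KERNEL_MASK(-4094, 63) == 64);	/* resulting offset stays aligned */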
On Thu, Jul 11, 2024 at 1:16 AM Yunsheng Lin <linyunsheng@huawei.com> wrote: > > On 2024/7/10 23:28, Alexander H Duyck wrote: ... > >> > >> Yes, agreed. It would be good to be more specific about how to avoid > >> the above problem using a signed negative number for 'remaining' as > >> I am not sure how it can be done yet. > >> > > > > My advice would be to go back to patch 3 and figure out how to do this > > re-ordering without changing the alignment behaviour. The old code > > essentially aligned both the offset and fragsz by combining the two and > > then doing the alignment. Since you are doing a count up setup you will > > I am not sure I understand 'aligned both the offset and fragsz ' part, as > 'fragsz' being aligned or not seems to depend on last caller' align_mask, > for the same page_frag_cache instance, suppose offset = 32768 initially for > the old code: > Step 1: __page_frag_alloc_align() is called with fragsz=7 and align_mask=~0u > the offset after this is 32761, the true fragsz is 7 too. > > Step 2: __page_frag_alloc_align() is called with fragsz=7 and align_mask=-16 > the offset after this is 32752, the true fragsz is 9, which does > not seems to be aligned. I was referring to my original code before this patchset. I was doing the subtraction of the fragsz first, and then aligning which would end up padding the end of the frame as it was adding to the total size by pulling the offset down *after* I had already subtracted fragsz. The result was that I was always adding additional room depending on the setting of the fragsz and how it related to the alignment. After changing the code to realign on the start of the next frag the fragsz is at the mercy of the next caller's alignment. In the event that the caller didn't bother to align the fragsz by the align mask before hand they can end up with a scenario that might result in false sharing. > The above is why I added the below paragraph in the doc to make the semantic > more explicit: > "Depending on different aligning requirement, the page_frag API caller may call > page_frag_alloc*_align*() to ensure the returned virtual address or offset of > the page is aligned according to the 'align/alignment' parameter. Note the size > of the allocated fragment is not aligned, the caller needs to provide an aligned > fragsz if there is an alignment requirement for the size of the fragment." > > And existing callers of page_frag aligned API does seems to follow the above > rule last time I checked. > > Or did I miss something obvious here? No you didn't miss anything. It is just that there is now more potential for error than there was before. > > need to align the remaining, then add fragsz, and then I guess you > > could store remaining and then subtract fragsz from your final virtual > > address to get back to where the starting offset is actually located. > > remaining = __ALIGN_KERNEL_MASK(nc->remaining, ~align_mask); > remaining += fragsz; > nc->remaining = remaining; > return encoded_page_address(nc->encoded_va) + (size + remaining) - fragsz; > > If yes, I am not sure what is the point of doing the above yet, it > just seem to make thing more complicated and harder to understand. That isn't right. I am not sure why you are adding size + remaining or what those are supposed to represent. The issue was that the "remaining" ends up being an unaligned value as you were starting by aligning it and then adding fragsz. So by subtracting fragsz you can get back to the aliglined start. 
What this patch was doing before was adding the raw unaligned nc->remaining at the end of the function. > > > > Basically your "remaining" value isn't a safe number to use for an > > offset since it isn't aligned to your starting value at any point. > > Does using 'aligned_remaining' local variable to make it more obvious > seem reasonable to you? No, as the value you are storing above isn't guaranteed to be aligned. If you stored it before adding fragsz then it would be aligned. > > > > As far as the negative value, it is more about making it easier to keep > > track of what is actually going on. Basically we can use regular > > pointer math and as such I suspect the compiler is having to do extra > > instructions to flip your value negative before it can combine the > > values via something like the LEA (load effective address) assembler > > call. > > I am not an asm expert here, I am not sure I understand the optimization > trick here. The LEA instruction takes a base address adds 1/2/4/8 times a multiple and then a fixed offset all in one function and provides an address as an output. The general idea is that you could look at converting things such that you are putting together the page address + remaining*1 + PAGE_SIZE. Basically what I was getting at is that addition works, but it doesn't do negative values for the multiple.
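A minimal sketch of the two address computations being contrasted above. Whether the first form really folds into a single LEA is the suggestion being made here and is not verified; it would need checking against the generated code. 'base' stands in for the decoded page address, and the void pointer arithmetic relies on the kernel's GNU C dialect.

/* signed count-up: the address is base + size + remaining (remaining <= 0),
 * i.e. plain base + index + displacement style math
 */
static void *va_from_signed(void *base, unsigned int size, int remaining)
{
	return base + size + remaining;
}

/* unsigned count-down: size - remaining has to be formed first, then added */
static void *va_from_unsigned(void *base, unsigned int size,
			      unsigned int remaining)
{
	return base + (size - remaining);
}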
On 2024/7/12 0:49, Alexander Duyck wrote: > On Thu, Jul 11, 2024 at 1:16 AM Yunsheng Lin <linyunsheng@huawei.com> wrote: >> >> On 2024/7/10 23:28, Alexander H Duyck wrote: > > ... > >>>> >>>> Yes, agreed. It would be good to be more specific about how to avoid >>>> the above problem using a signed negative number for 'remaining' as >>>> I am not sure how it can be done yet. >>>> >>> >>> My advice would be to go back to patch 3 and figure out how to do this >>> re-ordering without changing the alignment behaviour. The old code >>> essentially aligned both the offset and fragsz by combining the two and >>> then doing the alignment. Since you are doing a count up setup you will >> >> I am not sure I understand 'aligned both the offset and fragsz ' part, as >> 'fragsz' being aligned or not seems to depend on last caller' align_mask, >> for the same page_frag_cache instance, suppose offset = 32768 initially for >> the old code: >> Step 1: __page_frag_alloc_align() is called with fragsz=7 and align_mask=~0u >> the offset after this is 32761, the true fragsz is 7 too. >> >> Step 2: __page_frag_alloc_align() is called with fragsz=7 and align_mask=-16 >> the offset after this is 32752, the true fragsz is 9, which does >> not seems to be aligned. > > I was referring to my original code before this patchset. I was doing > the subtraction of the fragsz first, and then aligning which would end > up padding the end of the frame as it was adding to the total size by > pulling the offset down *after* I had already subtracted fragsz. The > result was that I was always adding additional room depending on the > setting of the fragsz and how it related to the alignment. After > changing the code to realign on the start of the next frag the fragsz > is at the mercy of the next caller's alignment. In the event that the > caller didn't bother to align the fragsz by the align mask before hand > they can end up with a scenario that might result in false sharing. So it is about ensuring the additional room due to alignment requirement being placed at the end of a fragment, in order to avoid false sharing between the last fragment and the current fragment as much as possible, right? I am generally agreed with above if we can also ensure skb coaleasing by doing offset count-up instead of offset countdown. If there is conflict between them, I am assuming that enabling skb frag coaleasing is prefered over avoiding false sharing, right? > >> The above is why I added the below paragraph in the doc to make the semantic >> more explicit: >> "Depending on different aligning requirement, the page_frag API caller may call >> page_frag_alloc*_align*() to ensure the returned virtual address or offset of >> the page is aligned according to the 'align/alignment' parameter. Note the size >> of the allocated fragment is not aligned, the caller needs to provide an aligned >> fragsz if there is an alignment requirement for the size of the fragment." >> >> And existing callers of page_frag aligned API does seems to follow the above >> rule last time I checked. >> >> Or did I miss something obvious here? > > No you didn't miss anything. It is just that there is now more > potential for error than there was before. I guess the 'error' is referred to the 'false sharing' mentioned above, right? If it is indeed an error, are we not supposed to fix it instead of allowing such implicit implication? Allowing implicit implication seems to make the 'error' harder to reproduce and debug. 
> >>> need to align the remaining, then add fragsz, and then I guess you >>> could store remaining and then subtract fragsz from your final virtual >>> address to get back to where the starting offset is actually located. >> >> remaining = __ALIGN_KERNEL_MASK(nc->remaining, ~align_mask); >> remaining += fragsz; >> nc->remaining = remaining; >> return encoded_page_address(nc->encoded_va) + (size + remaining) - fragsz; >> >> If yes, I am not sure what is the point of doing the above yet, it >> just seem to make thing more complicated and harder to understand. > > That isn't right. I am not sure why you are adding size + remaining or > what those are supposed to represent. As the above assumes 'remaining' is a negative value as you suggested, (size + remaining) is supposed to represent the offset of next fragment to ensure we have count-up offset for enabling skb frag coaleasing, and '- fragsz' is used to get the offset of current fragment. > > The issue was that the "remaining" ends up being an unaligned value as > you were starting by aligning it and then adding fragsz. So by > subtracting fragsz you can get back to the aliglined start. What this > patch was doing before was adding the raw unaligned nc->remaining at > the end of the function. > >>> >>> Basically your "remaining" value isn't a safe number to use for an >>> offset since it isn't aligned to your starting value at any point. >> >> Does using 'aligned_remaining' local variable to make it more obvious >> seem reasonable to you? > > No, as the value you are storing above isn't guaranteed to be aligned. > If you stored it before adding fragsz then it would be aligned. I have a feeling that what you are proposing may be conflict with enabling skb frag coaleasing, as doing offset count-up seems to need some room to be reserved at the begin of a allocated fragment due to alignment requirement, and that may mean we need to do both fragsz and offset aligning. Perhaps the 'remaining' changing in this patch does seems to make things harder to discuss. Anyway, it would be more helpful if there is some pseudo code to show the steps of how the above can be done in your mind. > >>> >>> As far as the negative value, it is more about making it easier to keep >>> track of what is actually going on. Basically we can use regular >>> pointer math and as such I suspect the compiler is having to do extra >>> instructions to flip your value negative before it can combine the >>> values via something like the LEA (load effective address) assembler >>> call. >> >> I am not an asm expert here, I am not sure I understand the optimization >> trick here. > > The LEA instruction takes a base address adds 1/2/4/8 times a multiple > and then a fixed offset all in one function and provides an address as > an output. The general idea is that you could look at converting > things such that you are putting together the page address + > remaining*1 + PAGE_SIZE. Basically what I was getting at is that > addition works, but it doesn't do negative values for the multiple.
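On the skb frag coalescing point raised earlier in this reply: the check that decides whether a new fragment can be merged into the previous one (see skb_can_coalesce()) is roughly the condition sketched below, which is only satisfiable when successive allocations come out of the page in increasing-offset order. This is a simplified restatement for illustration, not the exact helper.

#include <linux/skbuff.h>

/* merge is possible only if the new fragment starts in the same page,
 * exactly where the previous fragment ends
 */
static bool frag_extends_prev(const skb_frag_t *prev, struct page *page,
			      unsigned int off)
{
	return skb_frag_page(prev) == page &&
	       skb_frag_off(prev) + skb_frag_size(prev) == off;
}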
On Fri, Jul 12, 2024 at 1:42 AM Yunsheng Lin <linyunsheng@huawei.com> wrote: > > On 2024/7/12 0:49, Alexander Duyck wrote: > > On Thu, Jul 11, 2024 at 1:16 AM Yunsheng Lin <linyunsheng@huawei.com> wrote: > >> > >> On 2024/7/10 23:28, Alexander H Duyck wrote: > > > > ... > > > >>>> > >>>> Yes, agreed. It would be good to be more specific about how to avoid > >>>> the above problem using a signed negative number for 'remaining' as > >>>> I am not sure how it can be done yet. > >>>> > >>> > >>> My advice would be to go back to patch 3 and figure out how to do this > >>> re-ordering without changing the alignment behaviour. The old code > >>> essentially aligned both the offset and fragsz by combining the two and > >>> then doing the alignment. Since you are doing a count up setup you will > >> > >> I am not sure I understand 'aligned both the offset and fragsz ' part, as > >> 'fragsz' being aligned or not seems to depend on last caller' align_mask, > >> for the same page_frag_cache instance, suppose offset = 32768 initially for > >> the old code: > >> Step 1: __page_frag_alloc_align() is called with fragsz=7 and align_mask=~0u > >> the offset after this is 32761, the true fragsz is 7 too. > >> > >> Step 2: __page_frag_alloc_align() is called with fragsz=7 and align_mask=-16 > >> the offset after this is 32752, the true fragsz is 9, which does > >> not seems to be aligned. > > > > I was referring to my original code before this patchset. I was doing > > the subtraction of the fragsz first, and then aligning which would end > > up padding the end of the frame as it was adding to the total size by > > pulling the offset down *after* I had already subtracted fragsz. The > > result was that I was always adding additional room depending on the > > setting of the fragsz and how it related to the alignment. After > > changing the code to realign on the start of the next frag the fragsz > > is at the mercy of the next caller's alignment. In the event that the > > caller didn't bother to align the fragsz by the align mask before hand > > they can end up with a scenario that might result in false sharing. > > So it is about ensuring the additional room due to alignment requirement > being placed at the end of a fragment, in order to avoid false sharing > between the last fragment and the current fragment as much as possible, > right? > > I am generally agreed with above if we can also ensure skb coaleasing by > doing offset count-up instead of offset countdown. > > If there is conflict between them, I am assuming that enabling skb frag > coaleasing is prefered over avoiding false sharing, right? The question I would have is where do we have opportunities for this to result in coalescing? If we are using this to allocate skb->data then there isn't such a chance as the tailroom gets in the way. If this is for a device allocating for an Rx buffer we won't get that chance as we have to preallocate some fixed size not knowing the buffer that is coming, and it needs to usually be DMA aligned in order to avoid causing partial cacheline reads/writes. The only way these would combine well is if you are doing aligned fixed blocks and are the only consumer of the page frag cache. It is essentially just optimizing for jumbo frames in that case. If this is for some software interface why wouldn't it request the coalesced size and do one allocation rather than trying to figure out how to perform a bunch of smaller allocations and then trying to merge them together after the fact. 
> > > >> The above is why I added the below paragraph in the doc to make the semantic > >> more explicit: > >> "Depending on different aligning requirement, the page_frag API caller may call > >> page_frag_alloc*_align*() to ensure the returned virtual address or offset of > >> the page is aligned according to the 'align/alignment' parameter. Note the size > >> of the allocated fragment is not aligned, the caller needs to provide an aligned > >> fragsz if there is an alignment requirement for the size of the fragment." > >> > >> And existing callers of page_frag aligned API does seems to follow the above > >> rule last time I checked. > >> > >> Or did I miss something obvious here? > > > > No you didn't miss anything. It is just that there is now more > > potential for error than there was before. > > I guess the 'error' is referred to the 'false sharing' mentioned above, > right? If it is indeed an error, are we not supposed to fix it instead > of allowing such implicit implication? Allowing implicit implication > seems to make the 'error' harder to reproduce and debug. The concern with the code as it stands is that if I am not mistaken remaining isn't actually aligned. You aligned it, then added fragsz. That becomes the start of the next frame so if fragsz isn't aligned the next requester will be getting an unaligned buffer, or one that is only aligned to the previous caller's alignment. > > > >>> need to align the remaining, then add fragsz, and then I guess you > >>> could store remaining and then subtract fragsz from your final virtual > >>> address to get back to where the starting offset is actually located. > >> > >> remaining = __ALIGN_KERNEL_MASK(nc->remaining, ~align_mask); > >> remaining += fragsz; > >> nc->remaining = remaining; > >> return encoded_page_address(nc->encoded_va) + (size + remaining) - fragsz; > >> > >> If yes, I am not sure what is the point of doing the above yet, it > >> just seem to make thing more complicated and harder to understand. > > > > That isn't right. I am not sure why you are adding size + remaining or > > what those are supposed to represent. > > As the above assumes 'remaining' is a negative value as you suggested, > (size + remaining) is supposed to represent the offset of next fragment > to ensure we have count-up offset for enabling skb frag coaleasing, and > '- fragsz' is used to get the offset of current fragment. > > > > > The issue was that the "remaining" ends up being an unaligned value as > > you were starting by aligning it and then adding fragsz. So by > > subtracting fragsz you can get back to the aliglined start. What this > > patch was doing before was adding the raw unaligned nc->remaining at > > the end of the function. > > > >>> > >>> Basically your "remaining" value isn't a safe number to use for an > >>> offset since it isn't aligned to your starting value at any point. > >> > >> Does using 'aligned_remaining' local variable to make it more obvious > >> seem reasonable to you? > > > > No, as the value you are storing above isn't guaranteed to be aligned. > > If you stored it before adding fragsz then it would be aligned. > > I have a feeling that what you are proposing may be conflict with enabling > skb frag coaleasing, as doing offset count-up seems to need some room to > be reserved at the begin of a allocated fragment due to alignment requirement, > and that may mean we need to do both fragsz and offset aligning. > > Perhaps the 'remaining' changing in this patch does seems to make things > harder to discuss. 
Anyway, it would be more helpful if there is some pseudo code to show the
steps of how the above can be done in your mind.

Basically what you would really need to do for all this is:

remaining = __ALIGN_KERNEL_MASK(nc->remaining, ~align_mask);
nc->remaining = remaining + fragsz;
return encoded_page_address(nc->encoded_va) + size + remaining;

The issue is you cannot be using page_frag_cache_current_va, as that pointer
will always be garbage since it isn't aligned to anything. It isn't like the
code that I had that was working backwards; here the offset cannot be used as
soon as you compute it, since fragsz has already been added to it.

For any countup you have to align the current offset/remaining value, then
that value is used for computing the page address, and you store the
offset/remaining plus the fragsz adjustment. At which point your
offset/remaining value isn't good for the current buffer anymore.
On 7/13/2024 12:55 AM, Alexander Duyck wrote: ... >> >> So it is about ensuring the additional room due to alignment requirement >> being placed at the end of a fragment, in order to avoid false sharing >> between the last fragment and the current fragment as much as possible, >> right? >> >> I am generally agreed with above if we can also ensure skb coaleasing by >> doing offset count-up instead of offset countdown. >> >> If there is conflict between them, I am assuming that enabling skb frag >> coaleasing is prefered over avoiding false sharing, right? > > The question I would have is where do we have opportunities for this > to result in coalescing? If we are using this to allocate skb->data > then there isn't such a chance as the tailroom gets in the way. > > If this is for a device allocating for an Rx buffer we won't get that > chance as we have to preallocate some fixed size not knowing the > buffer that is coming, and it needs to usually be DMA aligned in order > to avoid causing partial cacheline reads/writes. The only way these > would combine well is if you are doing aligned fixed blocks and are > the only consumer of the page frag cache. It is essentially just > optimizing for jumbo frames in that case. And hw-gro or sw-gro. > > If this is for some software interface why wouldn't it request the > coalesced size and do one allocation rather than trying to figure out > how to perform a bunch of smaller allocations and then trying to merge > them together after the fact. I am not sure I understand what 'some software interface' is referring to, I hope you are not suggesting the below optimizations utilizing of skb_can_coalesce() checking is unnecessary:( https://elixir.bootlin.com/linux/v6.10-rc7/C/ident/skb_can_coalesce Most of the usecases do that for the reason below as mentioned in the Documentation/mm/page_frags.rst as my understanding: "There is also a use case that needs minimum memory in order for forward progress, but more performant if more memory is available." > >>> >>>> The above is why I added the below paragraph in the doc to make the semantic >>>> more explicit: >>>> "Depending on different aligning requirement, the page_frag API caller may call >>>> page_frag_alloc*_align*() to ensure the returned virtual address or offset of >>>> the page is aligned according to the 'align/alignment' parameter. Note the size >>>> of the allocated fragment is not aligned, the caller needs to provide an aligned >>>> fragsz if there is an alignment requirement for the size of the fragment." >>>> >>>> And existing callers of page_frag aligned API does seems to follow the above >>>> rule last time I checked. >>>> >>>> Or did I miss something obvious here? >>> >>> No you didn't miss anything. It is just that there is now more >>> potential for error than there was before. >> >> I guess the 'error' is referred to the 'false sharing' mentioned above, >> right? If it is indeed an error, are we not supposed to fix it instead >> of allowing such implicit implication? Allowing implicit implication >> seems to make the 'error' harder to reproduce and debug. > > The concern with the code as it stands is that if I am not mistaken > remaining isn't actually aligned. You aligned it, then added fragsz. > That becomes the start of the next frame so if fragsz isn't aligned > the next requester will be getting an unaligned buffer, or one that is > only aligned to the previous caller's alignment. 
As mentioned below: https://lore.kernel.org/all/3da33d4c-a70e-23a4-8e00-23fe96dd0c1a@huawei.com/ what alignment semantics are we providing here: 1. Ensure alignment for both offset and fragsz. 2. Ensure alignment for offset only. 3. Ensure alignment for fragsz only. As my understanding, the original code before this patchset is only ensuring alignment for offset too. So there may be 'false sharing' both before this patchset and after this patchset. It would be better not to argue about which implementation having more/less potential to avoid the 'false sharing', it is an undefined behavior, the argument would be endless depending on usecase and personal preference. As I said before, I would love to retain the old undefined behavior when there is a reasonable way to support the new usecases. > >>> >>>>> need to align the remaining, then add fragsz, and then I guess you >>>>> could store remaining and then subtract fragsz from your final virtual >>>>> address to get back to where the starting offset is actually located. >>>> >>>> remaining = __ALIGN_KERNEL_MASK(nc->remaining, ~align_mask); >>>> remaining += fragsz; >>>> nc->remaining = remaining; >>>> return encoded_page_address(nc->encoded_va) + (size + remaining) - fragsz; >>>> >>>> If yes, I am not sure what is the point of doing the above yet, it >>>> just seem to make thing more complicated and harder to understand. >>> >>> That isn't right. I am not sure why you are adding size + remaining or >>> what those are supposed to represent. >> >> As the above assumes 'remaining' is a negative value as you suggested, >> (size + remaining) is supposed to represent the offset of next fragment >> to ensure we have count-up offset for enabling skb frag coaleasing, and >> '- fragsz' is used to get the offset of current fragment. >> >>> >>> The issue was that the "remaining" ends up being an unaligned value as >>> you were starting by aligning it and then adding fragsz. So by >>> subtracting fragsz you can get back to the aliglined start. What this >>> patch was doing before was adding the raw unaligned nc->remaining at >>> the end of the function. >>> >>>>> >>>>> Basically your "remaining" value isn't a safe number to use for an >>>>> offset since it isn't aligned to your starting value at any point. >>>> >>>> Does using 'aligned_remaining' local variable to make it more obvious >>>> seem reasonable to you? >>> >>> No, as the value you are storing above isn't guaranteed to be aligned. >>> If you stored it before adding fragsz then it would be aligned. >> >> I have a feeling that what you are proposing may be conflict with enabling >> skb frag coaleasing, as doing offset count-up seems to need some room to >> be reserved at the begin of a allocated fragment due to alignment requirement, >> and that may mean we need to do both fragsz and offset aligning. >> >> Perhaps the 'remaining' changing in this patch does seems to make things >> harder to discuss. Anyway, it would be more helpful if there is some pseudo >> code to show the steps of how the above can be done in your mind. > > Basically what you would really need do for all this is: > remaining = __ALIGN_KERNEL_MASK(nc->remaining, ~align_mask); > nc->remaining = remaining + fragsz; > return encoded_page_address(nc->encoded_va) + size + remaining; I am assuming 'size + remaining' is supposed to represent the offset of current allocted fragment? Are you sure the above is what you wanted? 
suppose remaining = -32768 & size = 32768 initially:
Step 1: __page_frag_alloc_align() is called with fragsz=7 and align_mask=~0u,
        the remaining after this is -32761, the true fragsz is 7, the offset is 0
Step 2: __page_frag_alloc_align() is called with fragsz=7 and align_mask=-16,
        the remaining after this is -32745, the true fragsz is 7, the offset = 16

The semantics of the above implementation seem to be the same as the semantics
of the implementation of patch 3 in v10:
https://lore.kernel.org/all/20240709132741.47751-4-linyunsheng@huawei.com/

aligned_remaining = nc->remaining & align_mask;
remaining = aligned_remaining - fragsz;
nc->remaining = remaining;
return encoded_page_address(encoded_va) + size - aligned_remaining;

The main difference seems to be whether a negative value is used for
nc->remaining or not. If yes, I am not sure what other gain there is in using a
negative value for nc->remaining besides the LEA instruction optimization
trick, as using an unsigned value and an 'aligned_remaining' local variable
does seem to make things easier to understand and avoids adding the three
checks mentioned below.

>
> The issue is you cannot be using page_frag_cache_current_va as that
> pointer will always be garbage as it isn't aligned to anything. It
> isn't like the code that I had that was working backwards as the
> offset cannot be used as soon as you compute it since you add fragsz
> to it.

The above was a mistake in v9, the intent was to do:
nc->remaining &= align_mask;

That was why there were three 'align > PAGE_SIZE' checks in v10, to avoid doing
'nc->remaining &= align_mask' prematurely if the caller passes a large
align_mask value. So the 'aligned_remaining' local variable in v10 avoids doing
'nc->remaining &= align_mask' altogether, and the three 'align > PAGE_SIZE'
checks are then not needed either.

>
> For any countup you have to align the current offset/remaining value,
> then that value is used for computing the page address, and you store
> the offset/remaining plus fragsz adjustment. At which point your
> offset/remaining pointer value isn't good for the current buffer
> anymore.
>
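A hedged illustration of the 'premature masking' mistake described above: masking the stored nc->remaining directly loses the rest of the page for every later caller once an oversized align_mask (or a request that then fails) comes in, while masking a local copy only affects the current request. The frag_reserve_*() names are illustrative and the refill path is omitted.

#include <linux/errno.h>

static int frag_reserve_bad(struct page_frag_cache *nc, unsigned int fragsz,
			    unsigned int align_mask)
{
	nc->remaining &= align_mask;	/* cache space lost even on failure */
	if (fragsz > nc->remaining)
		return -ENOMEM;
	nc->remaining -= fragsz;
	return 0;
}

static int frag_reserve_ok(struct page_frag_cache *nc, unsigned int fragsz,
			   unsigned int align_mask)
{
	unsigned int aligned_remaining = nc->remaining & align_mask;

	if (fragsz > aligned_remaining)
		return -ENOMEM;		/* nc->remaining left untouched */
	nc->remaining = aligned_remaining - fragsz;
	return 0;
}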
On Fri, Jul 12, 2024 at 10:20 PM Yunsheng Lin <yunshenglin0825@gmail.com> wrote: > > On 7/13/2024 12:55 AM, Alexander Duyck wrote: > > ... > > >> > >> So it is about ensuring the additional room due to alignment requirement > >> being placed at the end of a fragment, in order to avoid false sharing > >> between the last fragment and the current fragment as much as possible, > >> right? > >> > >> I am generally agreed with above if we can also ensure skb coaleasing by > >> doing offset count-up instead of offset countdown. > >> > >> If there is conflict between them, I am assuming that enabling skb frag > >> coaleasing is prefered over avoiding false sharing, right? > > > > The question I would have is where do we have opportunities for this > > to result in coalescing? If we are using this to allocate skb->data > > then there isn't such a chance as the tailroom gets in the way. > > > > If this is for a device allocating for an Rx buffer we won't get that > > chance as we have to preallocate some fixed size not knowing the > > buffer that is coming, and it needs to usually be DMA aligned in order > > to avoid causing partial cacheline reads/writes. The only way these > > would combine well is if you are doing aligned fixed blocks and are > > the only consumer of the page frag cache. It is essentially just > > optimizing for jumbo frames in that case. > > And hw-gro or sw-gro. No and no. The problem is for hw-gro in many cases the device will be DMA aligning the start of writes for each new frame that comes in. As such it is possible you won't be able to make use of this unless the device is either queueing up the entire packet before writing it to memory, or taking a double penalty for partial cache line writes which will negatively impact performance. For sw-gro it won't happen since as I mentioned the device is doing DMA aligned writes usually w/ fixed size buffers that will be partially empty. As such coalescing will likely not be possible in either of those scenarios. This is why I was so comfortable using the reverse ordering for the allocations. Trying to aggregate frames in this way will be very difficult and the likelihood of anyone ever needing to do it is incredibly small. > > > > If this is for some software interface why wouldn't it request the > > coalesced size and do one allocation rather than trying to figure out > > how to perform a bunch of smaller allocations and then trying to merge > > them together after the fact. > > I am not sure I understand what 'some software interface' is referring > to, I hope you are not suggesting the below optimizations utilizing of > skb_can_coalesce() checking is unnecessary:( > > https://elixir.bootlin.com/linux/v6.10-rc7/C/ident/skb_can_coalesce > > Most of the usecases do that for the reason below as mentioned in the > Documentation/mm/page_frags.rst as my understanding: > "There is also a use case that needs minimum memory in order for forward > progress, but more performant if more memory is available." No what I am talking about is that it will be expensive to use and have very little benefit to coalesce frames coming from a NIC as I called out above. Most NICs won't use page frags anyway they will be using page pool which is a slightly different beast anyway. 
> > > >>> > >>>> The above is why I added the below paragraph in the doc to make the semantic > >>>> more explicit: > >>>> "Depending on different aligning requirement, the page_frag API caller may call > >>>> page_frag_alloc*_align*() to ensure the returned virtual address or offset of > >>>> the page is aligned according to the 'align/alignment' parameter. Note the size > >>>> of the allocated fragment is not aligned, the caller needs to provide an aligned > >>>> fragsz if there is an alignment requirement for the size of the fragment." > >>>> > >>>> And existing callers of page_frag aligned API does seems to follow the above > >>>> rule last time I checked. > >>>> > >>>> Or did I miss something obvious here? > >>> > >>> No you didn't miss anything. It is just that there is now more > >>> potential for error than there was before. > >> > >> I guess the 'error' is referred to the 'false sharing' mentioned above, > >> right? If it is indeed an error, are we not supposed to fix it instead > >> of allowing such implicit implication? Allowing implicit implication > >> seems to make the 'error' harder to reproduce and debug. > > > > The concern with the code as it stands is that if I am not mistaken > > remaining isn't actually aligned. You aligned it, then added fragsz. > > That becomes the start of the next frame so if fragsz isn't aligned > > the next requester will be getting an unaligned buffer, or one that is > > only aligned to the previous caller's alignment. > > As mentioned below: > https://lore.kernel.org/all/3da33d4c-a70e-23a4-8e00-23fe96dd0c1a@huawei.com/ > > what alignment semantics are we providing here: > 1. Ensure alignment for both offset and fragsz. > 2. Ensure alignment for offset only. > 3. Ensure alignment for fragsz only. > > As my understanding, the original code before this patchset is only > ensuring alignment for offset too. So there may be 'false sharing' > both before this patchset and after this patchset. It would be better > not to argue about which implementation having more/less potential > to avoid the 'false sharing', it is an undefined behavior, the argument > would be endless depending on usecase and personal preference. > > As I said before, I would love to retain the old undefined behavior > when there is a reasonable way to support the new usecases. My main concern is that you were aligning "remaining", then adding fragsz, and storing that. That was then used as the offset for the next buffer. That isn't aligned since fragsz and the previous offset aren't guaranteed to align with the new allocation. So at a minimum the existing code cannot be using nc->remaining when generating the pointer to the page. Instead it has to be using the aligned version of that value that you generated before adding fragsz and verifying there is space. > > > >>> > >>>>> need to align the remaining, then add fragsz, and then I guess you > >>>>> could store remaining and then subtract fragsz from your final virtual > >>>>> address to get back to where the starting offset is actually located. > >>>> > >>>> remaining = __ALIGN_KERNEL_MASK(nc->remaining, ~align_mask); > >>>> remaining += fragsz; > >>>> nc->remaining = remaining; > >>>> return encoded_page_address(nc->encoded_va) + (size + remaining) - fragsz; > >>>> > >>>> If yes, I am not sure what is the point of doing the above yet, it > >>>> just seem to make thing more complicated and harder to understand. > >>> > >>> That isn't right. 
I am not sure why you are adding size + remaining or > >>> what those are supposed to represent. > >> > >> As the above assumes 'remaining' is a negative value as you suggested, > >> (size + remaining) is supposed to represent the offset of next fragment > >> to ensure we have count-up offset for enabling skb frag coaleasing, and > >> '- fragsz' is used to get the offset of current fragment. > >> > >>> > >>> The issue was that the "remaining" ends up being an unaligned value as > >>> you were starting by aligning it and then adding fragsz. So by > >>> subtracting fragsz you can get back to the aliglined start. What this > >>> patch was doing before was adding the raw unaligned nc->remaining at > >>> the end of the function. > >>> > >>>>> > >>>>> Basically your "remaining" value isn't a safe number to use for an > >>>>> offset since it isn't aligned to your starting value at any point. > >>>> > >>>> Does using 'aligned_remaining' local variable to make it more obvious > >>>> seem reasonable to you? > >>> > >>> No, as the value you are storing above isn't guaranteed to be aligned. > >>> If you stored it before adding fragsz then it would be aligned. > >> > >> I have a feeling that what you are proposing may be conflict with enabling > >> skb frag coaleasing, as doing offset count-up seems to need some room to > >> be reserved at the begin of a allocated fragment due to alignment requirement, > >> and that may mean we need to do both fragsz and offset aligning. > >> > >> Perhaps the 'remaining' changing in this patch does seems to make things > >> harder to discuss. Anyway, it would be more helpful if there is some pseudo > >> code to show the steps of how the above can be done in your mind. > > > > Basically what you would really need do for all this is: > > remaining = __ALIGN_KERNEL_MASK(nc->remaining, ~align_mask); > > nc->remaining = remaining + fragsz; > > return encoded_page_address(nc->encoded_va) + size + remaining; > I might have mixed my explanation up a bit. This is assuming remaining is a negative value as I mentioned before. Basically the issue is your current code is using nc->remaining to generate the current address and that is bad as it isn't aligned to anything as fragsz was added to it and no alignment check had been done on that value.
On 7/14/2024 12:55 AM, Alexander Duyck wrote: ... >>>> >>>> Perhaps the 'remaining' changing in this patch does seems to make things >>>> harder to discuss. Anyway, it would be more helpful if there is some pseudo >>>> code to show the steps of how the above can be done in your mind. >>> >>> Basically what you would really need do for all this is: >>> remaining = __ALIGN_KERNEL_MASK(nc->remaining, ~align_mask); >>> nc->remaining = remaining + fragsz; >>> return encoded_page_address(nc->encoded_va) + size + remaining; >> > > I might have mixed my explanation up a bit. This is assuming remaining > is a negative value as I mentioned before. Let's be more specific about the options here, what you meant is below, right? Let's say it is option 1 as below: struct page_frag_cache { /* encoded_va consists of the virtual address, pfmemalloc bit and order * of a page. */ unsigned long encoded_va; #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32) __s16 remaining; __u16 pagecnt_bias; #else __s32 remaining; __u32 pagecnt_bias; #endif }; void *__page_frag_alloc_va_align(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask, unsigned int align_mask) { unsigned int size = page_frag_cache_page_size(nc->encoded_va); int remaining; remaining = __ALIGN_KERNEL_MASK(nc->remaining, ~align_mask); if (unlikely(remaining + (int)fragsz > 0)) { if (!__page_frag_cache_refill(nc, gfp_mask)) return NULL; size = page_frag_cache_page_size(nc->encoded_va); remaining = -size; if (unlikely(remaining + (int)fragsz > 0)) return NULL; } nc->pagecnt_bias--; nc->remaining = remaining + fragsz; return encoded_page_address(nc->encoded_va) + size + remaining; } And let's say what I am proposing in v10 is option 2 as below: struct page_frag_cache { /* encoded_va consists of the virtual address, pfmemalloc bit and order * of a page. */ unsigned long encoded_va; #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32) __u16 remaining; __u16 pagecnt_bias; #else __u32 remaining; __u32 pagecnt_bias; #endif }; void *__page_frag_alloc_va_align(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask, unsigned int align_mask) { unsigned int size = page_frag_cache_page_size(nc->encoded_va); int aligned_remaining = nc->remaining & align_mask; int remaining = aligned_remaining - fragsz; if (unlikely(remaining < 0)) { if (!__page_frag_cache_refill(nc, gfp_mask)) return NULL; size = page_frag_cache_page_size(nc->encoded_va); aligned_remaining = size; remaining = aligned_remaining - fragsz; if (unlikely(remaining < 0)) return NULL; } nc->pagecnt_bias--; nc->remaining = remaining; return encoded_page_address(nc->encoded_va) + (size - aligned_remaining); } If the option 1 is not what you have in mind, it would be better to be more specific about what you have in mind. If the option 1 is what you have in mind, it seems both option 1 and option 2 have the same semantics as my understanding, right? The question here seems to be what is your perfer option and why? I implemented both of them, and the option 1 seems to have a bigger generated asm size as below: ./scripts/bloat-o-meter vmlinux_non_neg vmlinux add/remove: 0/0 grow/shrink: 1/0 up/down: 37/0 (37) Function old new delta __page_frag_alloc_va_align 414 451 +37 > > Basically the issue is your current code is using nc->remaining to > generate the current address and that is bad as it isn't aligned to > anything as fragsz was added to it and no alignment check had been > done on that value.
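For reference, a hedged side-by-side trace of the two options quoted above on the same call sequence (illustrative numbers: a freshly refilled 4096-byte page, 64-byte alignment via align_mask, two fragsz=7 requests), showing they return the same addresses and differ only in the bookkeeping:

option 1 (signed): refill leaves remaining = -4096
  call 1: returns base + 4096 + (-4096) = base + 0,  stores nc->remaining = -4089
  call 2: __ALIGN_KERNEL_MASK(-4089, 63) = -4032,
          returns base + 4096 + (-4032) = base + 64, stores nc->remaining = -4025

option 2 (unsigned): refill leaves aligned_remaining = 4096
  call 1: returns base + (4096 - 4096) = base + 0,   stores nc->remaining = 4089
  call 2: 4089 & ~63 = 4032,
          returns base + (4096 - 4032) = base + 64,  stores nc->remaining = 4025

In both cases the padding needed for alignment lands between the two fragments; the stored value either runs from -size up toward 0 or from size down toward 0.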
On Sat, Jul 13, 2024 at 9:52 PM Yunsheng Lin <yunshenglin0825@gmail.com> wrote:
>
> On 7/14/2024 12:55 AM, Alexander Duyck wrote:
>
> ...
>
> >>>>
> >>>> Perhaps the 'remaining' changing in this patch does seems to make things
> >>>> harder to discuss. Anyway, it would be more helpful if there is some pseudo
> >>>> code to show the steps of how the above can be done in your mind.
> >>>
> >>> Basically what you would really need do for all this is:
> >>>         remaining = __ALIGN_KERNEL_MASK(nc->remaining, ~align_mask);
> >>>         nc->remaining = remaining + fragsz;
> >>>         return encoded_page_address(nc->encoded_va) + size + remaining;
> >>
> >
> > I might have mixed my explanation up a bit. This is assuming remaining
> > is a negative value as I mentioned before.
>
> Let's be more specific about the options here, what you meant is below,
> right? Let's say it is option 1 as below:
> struct page_frag_cache {
>         /* encoded_va consists of the virtual address, pfmemalloc bit
> and order
>          * of a page.
>          */
>         unsigned long encoded_va;
>
> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32)
>         __s16 remaining;
>         __u16 pagecnt_bias;
> #else
>         __s32 remaining;
>         __u32 pagecnt_bias;
> #endif
> };
>
> void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
>                                  unsigned int fragsz, gfp_t gfp_mask,
>                                  unsigned int align_mask)
> {
>         unsigned int size = page_frag_cache_page_size(nc->encoded_va);
>         int remaining;
>
>         remaining = __ALIGN_KERNEL_MASK(nc->remaining, ~align_mask);
>         if (unlikely(remaining + (int)fragsz > 0)) {
>                 if (!__page_frag_cache_refill(nc, gfp_mask))
>                         return NULL;
>
>                 size = page_frag_cache_page_size(nc->encoded_va);
>
>                 remaining = -size;
>                 if (unlikely(remaining + (int)fragsz > 0))
>                         return NULL;
>         }
>
>         nc->pagecnt_bias--;
>         nc->remaining = remaining + fragsz;
>
>         return encoded_page_address(nc->encoded_va) + size + remaining;
> }
>
>
> And let's say what I am proposing in v10 is option 2 as below:
> struct page_frag_cache {
>         /* encoded_va consists of the virtual address, pfmemalloc bit
> and order
>          * of a page.
>          */
>         unsigned long encoded_va;
>
> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32)
>         __u16 remaining;
>         __u16 pagecnt_bias;
> #else
>         __u32 remaining;
>         __u32 pagecnt_bias;
> #endif
> };
>
> void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
>                                  unsigned int fragsz, gfp_t gfp_mask,
>                                  unsigned int align_mask)
> {
>         unsigned int size = page_frag_cache_page_size(nc->encoded_va);
>         int aligned_remaining = nc->remaining & align_mask;
>         int remaining = aligned_remaining - fragsz;
>
>         if (unlikely(remaining < 0)) {
>                 if (!__page_frag_cache_refill(nc, gfp_mask))
>                         return NULL;
>
>                 size = page_frag_cache_page_size(nc->encoded_va);
>
>                 aligned_remaining = size;
>                 remaining = aligned_remaining - fragsz;
>                 if (unlikely(remaining < 0))
>                         return NULL;
>         }
>
>         nc->pagecnt_bias--;
>         nc->remaining = remaining;
>
>         return encoded_page_address(nc->encoded_va) + (size -
> aligned_remaining);
> }
>
> If the option 1 is not what you have in mind, it would be better to be
> more specific about what you have in mind.

Option 1 was more or less what I had in mind.

> If the option 1 is what you have in mind, it seems both option 1 and
> option 2 have the same semantics as my understanding, right? The
> question here seems to be what is your perfer option and why?
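Whether the two variants really hand out the same addresses can be checked
with a small userspace model of the two bookkeeping schemes (made-up page
size and alignment; only the fast-path arithmetic is modelled, not the
refill or reference-count handling):

#include <assert.h>
#include <stdio.h>

#define __ALIGN_KERNEL_MASK(x, mask)    (((x) + (mask)) & ~(mask))

#define SIZE            4096            /* assumed page size of the cache */
#define ALIGN_MASK      (~(64 - 1))     /* 64-byte alignment */

/* option 1: 'remaining' counts up from -SIZE towards 0 */
static int alloc_opt1(int *remaining, int fragsz)
{
        int r = __ALIGN_KERNEL_MASK(*remaining, ~ALIGN_MASK);

        if (r + fragsz > 0)
                return -1;              /* would need a refill */
        *remaining = r + fragsz;
        return SIZE + r;                /* offset of this fragment */
}

/* option 2: 'remaining' counts down from SIZE towards 0 */
static int alloc_opt2(int *remaining, int fragsz)
{
        int aligned = *remaining & ALIGN_MASK;

        if (aligned < fragsz)
                return -1;              /* would need a refill */
        *remaining = aligned - fragsz;
        return SIZE - aligned;          /* offset of this fragment */
}

int main(void)
{
        int fragsz[] = { 100, 12, 700, 64, 1 };
        int rem1 = -SIZE;
        int rem2 = SIZE;

        for (unsigned int i = 0; i < sizeof(fragsz) / sizeof(fragsz[0]); i++) {
                int off1 = alloc_opt1(&rem1, fragsz[i]);
                int off2 = alloc_opt2(&rem2, fragsz[i]);

                printf("fragsz %4d: option1 offset %4d, option2 offset %4d\n",
                       fragsz[i], off1, off2);
                assert(off1 == off2);   /* both schemes return the same offsets */
        }
        return 0;
}

For the fragment sizes above both schemes produce the offsets 0, 128, 192,
896 and 960, which is the count-up behaviour discussed in this thread.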
>
> I implemented both of them, and the option 1 seems to have a
> bigger generated asm size as below:
> ./scripts/bloat-o-meter vmlinux_non_neg vmlinux
> add/remove: 0/0 grow/shrink: 1/0 up/down: 37/0 (37)
> Function                                     old     new   delta
> __page_frag_alloc_va_align                   414     451     +37

My big complaint is that it seems option 2 is harder for people to
understand and more likely to not be done correctly. In some cases if
the performance difference is negligible it is better to go with the
more maintainable solution.
On 2024/7/16 1:55, Alexander Duyck wrote:
> On Sat, Jul 13, 2024 at 9:52 PM Yunsheng Lin <yunshenglin0825@gmail.com> wrote:

...

>>
>> If the option 1 is not what you have in mind, it would be better to be
>> more specific about what you have in mind.
>
> Option 1 was more or less what I had in mind.
>
>> If the option 1 is what you have in mind, it seems both option 1 and
>> option 2 have the same semantics as my understanding, right? The
>> question here seems to be what is your perfer option and why?
>>
>> I implemented both of them, and the option 1 seems to have a
>> bigger generated asm size as below:
>> ./scripts/bloat-o-meter vmlinux_non_neg vmlinux
>> add/remove: 0/0 grow/shrink: 1/0 up/down: 37/0 (37)
>> Function                                     old     new   delta
>> __page_frag_alloc_va_align                   414     451     +37
>
> My big complaint is that it seems option 2 is harder for people to
> understand and more likely to not be done correctly. In some cases if
> the performance difference is negligible it is better to go with the
> more maintainable solution.

Option 1 assuming nc->remaining as a negative value does not seems to
make it a more maintainable solution than option 2. How about something
like below if using a negative value to enable some optimization like LEA
does not have a noticeable performance difference?

struct page_frag_cache {
        /* encoded_va consists of the virtual address, pfmemalloc bit and order
         * of a page.
         */
        unsigned long encoded_va;

#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32)
        __u16 remaining;
        __u16 pagecnt_bias;
#else
        __u32 remaining;
        __u32 pagecnt_bias;
#endif
};

void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
                                 unsigned int fragsz, gfp_t gfp_mask,
                                 unsigned int align_mask)
{
        unsigned int size = page_frag_cache_page_size(nc->encoded_va);
        unsigned int remaining;

        remaining = nc->remaining & align_mask;
        if (unlikely(remaining < fragsz)) {
                if (unlikely(fragsz > PAGE_SIZE)) {
                        /*
                         * The caller is trying to allocate a fragment
                         * with fragsz > PAGE_SIZE but the cache isn't big
                         * enough to satisfy the request, this may
                         * happen in low memory conditions.
                         * We don't release the cache page because
                         * it could make memory pressure worse
                         * so we simply return NULL here.
                         */
                        return NULL;
                }

                if (!__page_frag_cache_refill(nc, gfp_mask))
                        return NULL;

                size = page_frag_cache_page_size(nc->encoded_va);
                remaining = size;
        }

        nc->pagecnt_bias--;
        nc->remaining = remaining - fragsz;

        return encoded_page_address(nc->encoded_va) + (size - remaining);
}
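For readers following the option 3 fast path, the sketch below (plain
userspace C with made-up page size, alignment and fragment size, modelling
only the arithmetic and not the refill or pagecnt_bias handling) shows how
masking 'remaining' before the subtraction yields offsets that only ever
count up:

#include <stdio.h>

#define SIZE            4096            /* assumed cache page size */
#define ALIGN_MASK      (~(16 - 1))     /* assumed 16-byte alignment */

int main(void)
{
        unsigned int nc_remaining = SIZE;       /* cache just refilled */
        unsigned int fragsz = 100;              /* fixed fragment size */

        for (int i = 0; i < 4; i++) {
                unsigned int remaining = nc_remaining & ALIGN_MASK;

                if (remaining < fragsz)
                        break;                  /* slow path: refill would run here */

                /* offset handed back to the caller keeps growing, which is
                 * what the count-up behaviour for skb frag coalescing needs
                 */
                printf("frag %d at offset %u, %u bytes left\n",
                       i, SIZE - remaining, remaining - fragsz);

                nc_remaining = remaining - fragsz;
        }
        return 0;
}

With these numbers the fragments land at offsets 0, 112, 224 and 336, each
rounded up to the next 16-byte boundary past the previous fragment.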
On 2024/7/16 20:58, Yunsheng Lin wrote:

...

>
> Option 1 assuming nc->remaining as a negative value does not seems to
> make it a more maintainable solution than option 2. How about something
> like below if using a negative value to enable some optimization like LEA
> does not have a noticeable performance difference?

Suppose the below as option 3, it seems the option 3 has better
performance than option 2, and option 2 has better performance than
option 1 using the ko introduced in patch 1.

Option 1:
 Performance counter stats for 'insmod ./page_frag_test.ko test_push_cpu=16 test_pop_cpu=17 test_alloc_len=12 nr_test=5120000' (500 runs):

         17.757768      task-clock (msec)         #    0.001 CPUs utilized            ( +-  0.17% )
                 5      context-switches          #    0.288 K/sec                    ( +-  0.28% )
                 0      cpu-migrations            #    0.007 K/sec                    ( +- 12.36% )
                82      page-faults               #    0.005 M/sec                    ( +-  0.06% )
          46128280      cycles                    #    2.598 GHz                      ( +-  0.17% )
          60938595      instructions              #    1.32  insn per cycle           ( +-  0.02% )
          14783794      branches                  #  832.525 M/sec                    ( +-  0.02% )
             20393      branch-misses             #    0.14% of all branches          ( +-  0.13% )

      24.556644680 seconds time elapsed                                          ( +-  0.07% )

Option 2:
 Performance counter stats for 'insmod ./page_frag_test.ko test_push_cpu=16 test_pop_cpu=17 test_alloc_len=12 nr_test=5120000' (500 runs):

         18.443508      task-clock (msec)         #    0.001 CPUs utilized            ( +-  0.61% )
                 6      context-switches          #    0.342 K/sec                    ( +-  0.57% )
                 0      cpu-migrations            #    0.025 K/sec                    ( +-  4.89% )
                82      page-faults               #    0.004 M/sec                    ( +-  0.06% )
          47901207      cycles                    #    2.597 GHz                      ( +-  0.61% )
          60985019      instructions              #    1.27  insn per cycle           ( +-  0.05% )
          14787177      branches                  #  801.755 M/sec                    ( +-  0.05% )
             21099      branch-misses             #    0.14% of all branches          ( +-  0.14% )

      24.413183804 seconds time elapsed                                          ( +-  0.06% )

Option 3:
 Performance counter stats for 'insmod ./page_frag_test.ko test_push_cpu=16 test_pop_cpu=17 test_alloc_len=12 nr_test=5120000' (500 runs):

         17.847031      task-clock (msec)         #    0.001 CPUs utilized            ( +-  0.23% )
                 5      context-switches          #    0.305 K/sec                    ( +-  0.55% )
                 0      cpu-migrations            #    0.017 K/sec                    ( +-  6.86% )
                82      page-faults               #    0.005 M/sec                    ( +-  0.06% )
          46355974      cycles                    #    2.597 GHz                      ( +-  0.23% )
          60848779      instructions              #    1.31  insn per cycle           ( +-  0.03% )
          14758941      branches                  #  826.969 M/sec                    ( +-  0.03% )
             20728      branch-misses             #    0.14% of all branches          ( +-  0.15% )

      24.376161069 seconds time elapsed                                          ( +-  0.06% )

>
> struct page_frag_cache {
>         /* encoded_va consists of the virtual address, pfmemalloc bit and order
>          * of a page.
>          */
>         unsigned long encoded_va;
>
> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32)
>         __u16 remaining;
>         __u16 pagecnt_bias;
> #else
>         __u32 remaining;
>         __u32 pagecnt_bias;
> #endif
> };
>
> void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
>                                  unsigned int fragsz, gfp_t gfp_mask,
>                                  unsigned int align_mask)
> {
>         unsigned int size = page_frag_cache_page_size(nc->encoded_va);
>         unsigned int remaining;
>
>         remaining = nc->remaining & align_mask;
>         if (unlikely(remaining < fragsz)) {
>                 if (unlikely(fragsz > PAGE_SIZE)) {
>                         /*
>                          * The caller is trying to allocate a fragment
>                          * with fragsz > PAGE_SIZE but the cache isn't big
>                          * enough to satisfy the request, this may
>                          * happen in low memory conditions.
>                          * We don't release the cache page because
>                          * it could make memory pressure worse
>                          * so we simply return NULL here.
>                          */
>                         return NULL;
>                 }
>
>                 if (!__page_frag_cache_refill(nc, gfp_mask))
>                         return NULL;
>
>                 size = page_frag_cache_page_size(nc->encoded_va);
>                 remaining = size;
>         }
>
>         nc->pagecnt_bias--;
>         nc->remaining = remaining - fragsz;
>
>         return encoded_page_address(nc->encoded_va) + (size - remaining);
> }
>
>
diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
index 6ac3a25089d1..b33904d4494f 100644
--- a/include/linux/page_frag_cache.h
+++ b/include/linux/page_frag_cache.h
@@ -8,29 +8,81 @@
 #define PAGE_FRAG_CACHE_MAX_SIZE	__ALIGN_MASK(32768, ~PAGE_MASK)
 #define PAGE_FRAG_CACHE_MAX_ORDER	get_order(PAGE_FRAG_CACHE_MAX_SIZE)
 
-struct page_frag_cache {
-	void *va;
+/*
+ * struct encoded_va - a nonexistent type marking this pointer
+ *
+ * An 'encoded_va' pointer is a pointer to a aligned virtual address, which is
+ * at least aligned to PAGE_SIZE, that means there are at least 12 lower bits
+ * space available for other purposes.
+ *
+ * Currently we use the lower 8 bits and bit 9 for the order and PFMEMALLOC
+ * flag of the page this 'va' is corresponding to.
+ *
+ * Use the supplied helper functions to endcode/decode the pointer and bits.
+ */
+struct encoded_va;
+
+#define PAGE_FRAG_CACHE_ORDER_MASK		GENMASK(7, 0)
+#define PAGE_FRAG_CACHE_PFMEMALLOC_BIT		BIT(8)
+#define PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT	8
+
+static inline struct encoded_va *encode_aligned_va(void *va,
+						   unsigned int order,
+						   bool pfmemalloc)
+{
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-	__u16 offset;
-	__u16 size;
+	return (struct encoded_va *)((unsigned long)va | order |
+			pfmemalloc << PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT);
 #else
-	__u32 offset;
+	return (struct encoded_va *)((unsigned long)va |
+			pfmemalloc << PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT);
+#endif
+}
+
+static inline unsigned long encoded_page_order(struct encoded_va *encoded_va)
+{
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+	return PAGE_FRAG_CACHE_ORDER_MASK & (unsigned long)encoded_va;
+#else
+	return 0;
+#endif
+}
+
+static inline bool encoded_page_pfmemalloc(struct encoded_va *encoded_va)
+{
+	return PAGE_FRAG_CACHE_PFMEMALLOC_BIT & (unsigned long)encoded_va;
+}
+
+static inline void *encoded_page_address(struct encoded_va *encoded_va)
+{
+	return (void *)((unsigned long)encoded_va & PAGE_MASK);
+}
+
+struct page_frag_cache {
+	struct encoded_va *encoded_va;
+
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32)
+	u16 pagecnt_bias;
+	u16 remaining;
+#else
+	u32 pagecnt_bias;
+	u32 remaining;
 #endif
-	/* we maintain a pagecount bias, so that we dont dirty cache line
-	 * containing page->_refcount every time we allocate a fragment.
-	 */
-	unsigned int pagecnt_bias;
-	bool pfmemalloc;
 };
 
 static inline void page_frag_cache_init(struct page_frag_cache *nc)
 {
-	nc->va = NULL;
+	memset(nc, 0, sizeof(*nc));
 }
 
 static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
 {
-	return !!nc->pfmemalloc;
+	return encoded_page_pfmemalloc(nc->encoded_va);
+}
+
+static inline unsigned int page_frag_cache_page_size(struct encoded_va *encoded_va)
+{
+	return PAGE_SIZE << encoded_page_order(encoded_va);
 }
 
 void page_frag_cache_drain(struct page_frag_cache *nc);
diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
index dd640af5607a..a3316dd50eff 100644
--- a/mm/page_frag_cache.c
+++ b/mm/page_frag_cache.c
@@ -18,34 +18,61 @@
 #include <linux/page_frag_cache.h>
 #include "internal.h"
 
+static void *page_frag_cache_current_va(struct page_frag_cache *nc)
+{
+	struct encoded_va *encoded_va = nc->encoded_va;
+
+	return (void *)(((unsigned long)encoded_va & PAGE_MASK) |
+		(page_frag_cache_page_size(encoded_va) - nc->remaining));
+}
+
 static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
 					     gfp_t gfp_mask)
 {
 	struct page *page = NULL;
 	gfp_t gfp = gfp_mask;
+	unsigned int order;
 
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
 	gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
 		   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
 	page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
 				PAGE_FRAG_CACHE_MAX_ORDER);
-	nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
 #endif
-	if (unlikely(!page))
+	if (unlikely(!page)) {
 		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
+		if (unlikely(!page)) {
+			memset(nc, 0, sizeof(*nc));
+			return NULL;
+		}
+
+		order = 0;
+		nc->remaining = PAGE_SIZE;
+	} else {
+		order = PAGE_FRAG_CACHE_MAX_ORDER;
+		nc->remaining = PAGE_FRAG_CACHE_MAX_SIZE;
+	}
 
-	nc->va = page ? page_address(page) : NULL;
+	/* Even if we own the page, we do not use atomic_set().
+	 * This would break get_page_unless_zero() users.
+	 */
+	page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
 
+	/* reset page count bias of new frag */
+	nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
+	nc->encoded_va = encode_aligned_va(page_address(page), order,
+					   page_is_pfmemalloc(page));
 
 	return page;
 }
 
 void page_frag_cache_drain(struct page_frag_cache *nc)
 {
-	if (!nc->va)
+	if (!nc->encoded_va)
 		return;
 
-	__page_frag_cache_drain(virt_to_head_page(nc->va), nc->pagecnt_bias);
-	nc->va = NULL;
+	__page_frag_cache_drain(virt_to_head_page(nc->encoded_va),
+				nc->pagecnt_bias);
+	memset(nc, 0, sizeof(*nc));
 }
 EXPORT_SYMBOL(page_frag_cache_drain);
 
@@ -62,51 +89,41 @@ void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
 				 unsigned int fragsz, gfp_t gfp_mask,
 				 unsigned int align_mask)
 {
-	unsigned int size = PAGE_SIZE;
+	struct encoded_va *encoded_va = nc->encoded_va;
 	struct page *page;
-	int offset;
+	int remaining;
+	void *va;
 
-	if (unlikely(!nc->va)) {
+	if (unlikely(!encoded_va)) {
 refill:
-		page = __page_frag_cache_refill(nc, gfp_mask);
-		if (!page)
+		if (unlikely(!__page_frag_cache_refill(nc, gfp_mask)))
 			return NULL;
 
-		/* Even if we own the page, we do not use atomic_set().
-		 * This would break get_page_unless_zero() users.
-		 */
-		page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
-
-		/* reset page count bias and offset to start of new frag */
-		nc->pfmemalloc = page_is_pfmemalloc(page);
-		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
-		nc->offset = 0;
+		encoded_va = nc->encoded_va;
 	}
 
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-	/* if size can vary use size else just use PAGE_SIZE */
-	size = nc->size;
-#endif
-
-	offset = __ALIGN_KERNEL_MASK(nc->offset, ~align_mask);
-	if (unlikely(offset + fragsz > size)) {
-		page = virt_to_page(nc->va);
-
+	remaining = nc->remaining & align_mask;
+	remaining -= fragsz;
+	if (unlikely(remaining < 0)) {
+		page = virt_to_page(encoded_va);
 		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
 			goto refill;
 
-		if (unlikely(nc->pfmemalloc)) {
-			free_unref_page(page, compound_order(page));
+		if (unlikely(encoded_page_pfmemalloc(encoded_va))) {
+			VM_BUG_ON(compound_order(page) !=
+				  encoded_page_order(encoded_va));
+			free_unref_page(page, encoded_page_order(encoded_va));
 			goto refill;
 		}
 
 		/* OK, page count is 0, we can safely set it */
 		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
 
-		/* reset page count bias and offset to start of new frag */
+		/* reset page count bias and remaining of new frag */
 		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
-		offset = 0;
-		if (unlikely(fragsz > PAGE_SIZE)) {
+		nc->remaining = remaining = page_frag_cache_page_size(encoded_va);
+		remaining -= fragsz;
+		if (unlikely(remaining < 0)) {
 			/*
 			 * The caller is trying to allocate a fragment
 			 * with fragsz > PAGE_SIZE but the cache isn't big
@@ -120,10 +137,11 @@ void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
 		}
 	}
 
+	va = page_frag_cache_current_va(nc);
 	nc->pagecnt_bias--;
-	nc->offset = offset + fragsz;
+	nc->remaining = remaining;
 
-	return nc->va + offset;
+	return va;
 }
 EXPORT_SYMBOL(__page_frag_alloc_va_align);
Currently there is one 'struct page_frag' for every 'struct
sock' and 'struct task_struct', we are about to replace the
'struct page_frag' with 'struct page_frag_cache' for them.
Before begin the replacing, we need to ensure the size of
'struct page_frag_cache' is not bigger than the size of
'struct page_frag', as there may be tens of thousands of
'struct sock' and 'struct task_struct' instances in the
system.

By or'ing the page order & pfmemalloc with lower bits of
'va' instead of using 'u16' or 'u32' for page size and 'u8'
for pfmemalloc, we are able to avoid 3 or 5 bytes space waste.
And page address & pfmemalloc & order is unchanged for the
same page in the same 'page_frag_cache' instance, it makes
sense to fit them together.

Also, it is better to replace 'offset' with 'remaining', which
is the remaining size for the cache in a 'page_frag_cache'
instance, we are able to do a single 'fragsz > remaining'
checking for the case of cache not being enough, which should be
the fast path if we ensure size is zoro when 'va' == NULL by
memset'ing 'struct page_frag_cache' in page_frag_cache_init()
and page_frag_cache_drain().

After this patch, the size of 'struct page_frag_cache' should be
the same as the size of 'struct page_frag'.

CC: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 include/linux/page_frag_cache.h | 76 +++++++++++++++++++++++-----
 mm/page_frag_cache.c            | 90 ++++++++++++++++++++-------------
 2 files changed, 118 insertions(+), 48 deletions(-)
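The or'ing of page order and pfmemalloc into the unused low bits of the
page-aligned 'va' can be illustrated with a minimal userspace sketch. It
reuses the same bit layout as the patch (bits 0-7 for the order, bit 8 for
pfmemalloc), but the page-aligned address and the 4K page mask are
artificial values chosen only for the example:

#include <assert.h>
#include <stdio.h>

/* mirror the patch's layout: bits 0-7 hold the order, bit 8 the pfmemalloc flag */
#define ORDER_MASK              0xffUL
#define PFMEMALLOC_BIT          (1UL << 8)
#define PFMEMALLOC_SHIFT        8
#define PAGE_MASK_4K            (~0xfffUL)      /* assumes 4K pages */

static unsigned long encode_va(void *va, unsigned int order, int pfmemalloc)
{
        return (unsigned long)va | order |
               ((unsigned long)pfmemalloc << PFMEMALLOC_SHIFT);
}

int main(void)
{
        /* an artificial, page-aligned "virtual address" for demonstration */
        void *va = (void *)0x7f0000042000UL;
        unsigned long encoded = encode_va(va, 3, 1);

        /* decoding just masks the individual fields back out */
        assert((void *)(encoded & PAGE_MASK_4K) == va);
        assert((encoded & ORDER_MASK) == 3);
        assert(encoded & PFMEMALLOC_BIT);

        printf("encoded %#lx -> va %p, order %lu, pfmemalloc %d\n",
               encoded, (void *)(encoded & PAGE_MASK_4K),
               encoded & ORDER_MASK, !!(encoded & PFMEMALLOC_BIT));
        return 0;
}

Because the address is at least PAGE_SIZE aligned, the low 12 bits are
guaranteed to be zero, so the OR and the masks recover all three fields
without any extra storage, which is how the struct stays the same size as
'struct page_frag'.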