diff mbox series

[RFC,v11,03/14] mm: page_frag: use initial zero offset for page_frag_alloc_align()

Message ID 20240719093338.55117-4-linyunsheng@huawei.com (mailing list archive)
State Superseded
Headers show
Series Replace page_frag with page_frag_cache for sk_page_frag() | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Guessed tree name to be net-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1313 this patch: 1313
netdev/build_tools success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers success CCed 2 of 2 maintainers
netdev/build_clang success Errors and warnings before: 1817 this patch: 1817
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 16107 this patch: 16107
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 128 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Yunsheng Lin July 19, 2024, 9:33 a.m. UTC
We are about to use the page_frag_alloc_*() API not just to
allocate memory for skb->data, but also to do the memory
allocation for skb frags. Currently the
implementation of page_frag in the mm subsystem runs
the offset as a countdown rather than a count-up value.
There may be several advantages to that, as mentioned
in [1], but it may also have some disadvantages; for example,
it may prevent skb frag coalescing and more correct cache
prefetching.

We have a trade-off to make in order to have a unified
implementation and API for page_frag, so use an initial zero
offset in this patch, and the following patch will try to
make some optimizations to avoid the disadvantages as much
as possible.

Rename 'offset' to 'remaining' to retain the 'countdown'
behavior as 'remaining countdown' instead of 'offset
countdown'. Also, the renaming enables us to do a single
'fragsz > remaining' check for the case of the cache not
being enough, which should be the fast path if we ensure
'remaining' is zero when 'va' == NULL by memset'ing
'struct page_frag_cache' in page_frag_cache_init() and
page_frag_cache_drain().

1. https://lore.kernel.org/all/f4abe71b3439b39d17a6fb2d410180f367cadf5c.camel@gmail.com/

CC: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 include/linux/mm_types_task.h |  4 +-
 mm/page_frag_cache.c          | 71 +++++++++++++++++++++--------------
 2 files changed, 44 insertions(+), 31 deletions(-)

Comments

Alexander Duyck July 21, 2024, 6:34 p.m. UTC | #1
On Fri, Jul 19, 2024 at 2:37 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> We are about to use page_frag_alloc_*() API to not just
> allocate memory for skb->data, but also use them to do
> the memory allocation for skb frag too. Currently the
> implementation of page_frag in mm subsystem is running
> the offset as a countdown rather than count-up value,
> there may have several advantages to that as mentioned
> in [1], but it may have some disadvantages, for example,
> it may disable skb frag coaleasing and more correct cache
> prefetching

You misspelled "coalescing".

> We have a trade-off to make in order to have a unified
> implementation and API for page_frag, so use a initial zero
> offset in this patch, and the following patch will try to
> make some optimization to avoid the disadvantages as much
> as possible.
>
> Rename 'offset' to 'remaining' to retain the 'countdown'
> behavior as 'remaining countdown' instead of 'offset
> countdown'. Also, Renaming enable us to do a single
> 'fragsz > remaining' checking for the case of cache not
> being enough, which should be the fast path if we ensure
> 'remaining' is zero when 'va' == NULL by memset'ing
> 'struct page_frag_cache' in page_frag_cache_init() and
> page_frag_cache_drain().
>
> 1. https://lore.kernel.org/all/f4abe71b3439b39d17a6fb2d410180f367cadf5c.camel@gmail.com/
>
> CC: Alexander Duyck <alexander.duyck@gmail.com>
> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
> ---
>  include/linux/mm_types_task.h |  4 +-
>  mm/page_frag_cache.c          | 71 +++++++++++++++++++++--------------
>  2 files changed, 44 insertions(+), 31 deletions(-)
>
> diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
> index cdc1e3696439..b1c54b2b9308 100644
> --- a/include/linux/mm_types_task.h
> +++ b/include/linux/mm_types_task.h
> @@ -52,10 +52,10 @@ struct page_frag {
>  struct page_frag_cache {
>         void *va;
>  #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> -       __u16 offset;
> +       __u16 remaining;
>         __u16 size;
>  #else
> -       __u32 offset;
> +       __u32 remaining;
>  #endif
>         /* we maintain a pagecount bias, so that we dont dirty cache line
>          * containing page->_refcount every time we allocate a fragment.
> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
> index 609a485cd02a..2958fe006fe7 100644
> --- a/mm/page_frag_cache.c
> +++ b/mm/page_frag_cache.c
> @@ -22,6 +22,7 @@
>  static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>                                              gfp_t gfp_mask)
>  {
> +       unsigned int page_size = PAGE_FRAG_CACHE_MAX_SIZE;
>         struct page *page = NULL;
>         gfp_t gfp = gfp_mask;
>
> @@ -30,12 +31,21 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>                    __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
>         page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
>                                 PAGE_FRAG_CACHE_MAX_ORDER);
> -       nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
>  #endif
> -       if (unlikely(!page))
> +       if (unlikely(!page)) {
>                 page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
> +               if (unlikely(!page)) {
> +                       nc->va = NULL;
> +                       return NULL;
> +               }
>
> -       nc->va = page ? page_address(page) : NULL;
> +               page_size = PAGE_SIZE;
> +       }
> +
> +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> +       nc->size = page_size;
> +#endif
> +       nc->va = page_address(page);
>
>         return page;
>  }

Not a huge fan of the changes here. If we are changing the direction
then just do that. I don't see the point of these changes. As far as I
can tell it is just adding noise to the diff and has no effect on the
final code as the outcome is mostly the same except for you don't
update size in the event that you overwrite nc->va to NULL.

> @@ -64,8 +74,8 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>                               unsigned int align_mask)
>  {
>         unsigned int size = PAGE_SIZE;
> +       unsigned int remaining;
>         struct page *page;
> -       int offset;
>
>         if (unlikely(!nc->va)) {
>  refill:
> @@ -82,35 +92,20 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>                  */
>                 page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
>
> -               /* reset page count bias and offset to start of new frag */
> +               /* reset page count bias and remaining to start of new frag */
>                 nc->pfmemalloc = page_is_pfmemalloc(page);
>                 nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> -               nc->offset = size;
> +               nc->remaining = size;
>         }
>
> -       offset = nc->offset - fragsz;
> -       if (unlikely(offset < 0)) {
> -               page = virt_to_page(nc->va);
> -
> -               if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
> -                       goto refill;
> -
> -               if (unlikely(nc->pfmemalloc)) {
> -                       free_unref_page(page, compound_order(page));
> -                       goto refill;
> -               }
> -
>  #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> -               /* if size can vary use size else just use PAGE_SIZE */
> -               size = nc->size;
> +       /* if size can vary use size else just use PAGE_SIZE */
> +       size = nc->size;
>  #endif

Rather than pulling this out and placing it here it might make more
sense at the start of the function. Basically just overwrite size w/
either PAGE_SIZE or nc->size right at the start. Then if we have to
reallocate we overwrite it. That way we can avoid some redundancy and
this will be easier to read.

> -               /* OK, page count is 0, we can safely set it */
> -               set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
>
> -               /* reset page count bias and offset to start of new frag */
> -               nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> -               offset = size - fragsz;
> -               if (unlikely(offset < 0)) {
> +       remaining = nc->remaining & align_mask;
> +       if (unlikely(remaining < fragsz)) {
> +               if (unlikely(fragsz > PAGE_SIZE)) {
>                         /*
>                          * The caller is trying to allocate a fragment
>                          * with fragsz > PAGE_SIZE but the cache isn't big
> @@ -122,13 +117,31 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>                          */
>                         return NULL;
>                 }
> +
> +               page = virt_to_page(nc->va);
> +
> +               if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
> +                       goto refill;
> +
> +               if (unlikely(nc->pfmemalloc)) {
> +                       free_unref_page(page, compound_order(page));
> +                       goto refill;
> +               }
> +
> +               /* OK, page count is 0, we can safely set it */
> +               set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
> +
> +               /* reset page count bias and remaining to start of new frag */
> +               nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> +               nc->remaining = size;

Why are you setting nc->remaining here? You set it a few lines below.
This is redundant.

> +
> +               remaining = size;
>         }
>
>         nc->pagecnt_bias--;
> -       offset &= align_mask;
> -       nc->offset = offset;
> +       nc->remaining = remaining - fragsz;
>
> -       return nc->va + offset;
> +       return nc->va + (size - remaining);
>  }
>  EXPORT_SYMBOL(__page_frag_alloc_align);
Yunsheng Lin July 28, 2024, 2:12 p.m. UTC | #2
On 7/22/2024 2:34 AM, Alexander Duyck wrote:
> On Fri, Jul 19, 2024 at 2:37 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:

...

>> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
>> index 609a485cd02a..2958fe006fe7 100644
>> --- a/mm/page_frag_cache.c
>> +++ b/mm/page_frag_cache.c
>> @@ -22,6 +22,7 @@
>>   static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>>                                               gfp_t gfp_mask)
>>   {
>> +       unsigned int page_size = PAGE_FRAG_CACHE_MAX_SIZE;
>>          struct page *page = NULL;
>>          gfp_t gfp = gfp_mask;
>>
>> @@ -30,12 +31,21 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>>                     __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
>>          page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
>>                                  PAGE_FRAG_CACHE_MAX_ORDER);
>> -       nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
>>   #endif
>> -       if (unlikely(!page))
>> +       if (unlikely(!page)) {
>>                  page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
>> +               if (unlikely(!page)) {
>> +                       nc->va = NULL;
>> +                       return NULL;
>> +               }
>>
>> -       nc->va = page ? page_address(page) : NULL;
>> +               page_size = PAGE_SIZE;
>> +       }
>> +
>> +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>> +       nc->size = page_size;
>> +#endif
>> +       nc->va = page_address(page);
>>
>>          return page;
>>   }
> 
> Not a huge fan of the changes here. If we are changing the direction
> then just do that. I don't see the point of these changes. As far as I
> can tell it is just adding noise to the diff and has no effect on the
> final code as the outcome is mostly the same except for you don't
> update size in the event that you overwrite nc->va to NULL.

While I agree that the above change is not really related to this
patch, it does have some effect on the final code, as it seems
to avoid one extra '!page' check:

  ./scripts/bloat-o-meter vmlinux.org vmlinux
add/remove: 0/0 grow/shrink: 1/0 up/down: 11/0 (11)
Function                                     old     new   delta
__page_frag_alloc_align                      594     605     +11
Total: Before=22083357, After=22083368, chg +0.00%

Let me see if I can move it to a more related patch when refactoring.

> 
>> @@ -64,8 +74,8 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>>                                unsigned int align_mask)
>>   {
>>          unsigned int size = PAGE_SIZE;
>> +       unsigned int remaining;
>>          struct page *page;
>> -       int offset;
>>
>>          if (unlikely(!nc->va)) {
>>   refill:
>> @@ -82,35 +92,20 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>>                   */
>>                  page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
>>
>> -               /* reset page count bias and offset to start of new frag */
>> +               /* reset page count bias and remaining to start of new frag */
>>                  nc->pfmemalloc = page_is_pfmemalloc(page);
>>                  nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
>> -               nc->offset = size;
>> +               nc->remaining = size;
>>          }
>>
>> -       offset = nc->offset - fragsz;
>> -       if (unlikely(offset < 0)) {
>> -               page = virt_to_page(nc->va);
>> -
>> -               if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
>> -                       goto refill;
>> -
>> -               if (unlikely(nc->pfmemalloc)) {
>> -                       free_unref_page(page, compound_order(page));
>> -                       goto refill;
>> -               }
>> -
>>   #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>> -               /* if size can vary use size else just use PAGE_SIZE */
>> -               size = nc->size;
>> +       /* if size can vary use size else just use PAGE_SIZE */
>> +       size = nc->size;
>>   #endif
> 
> Rather than pulling this out and placing it here it might make more
> sense at the start of the function. Basically just overwrite size w/
> either PAGE_SIZE or nc->size right at the start. Then if we have to
> reallocate we overwrite it. That way we can avoid some redundancy and
> this will be easier to read.

You mean something like the below at the start of the function? That does
make more sense.
#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
	unsigned int size = nc->size;
#else
	unsigned int size = PAGE_SIZE;
#endif

> 
>> -               /* OK, page count is 0, we can safely set it */
>> -               set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
>>
>> -               /* reset page count bias and offset to start of new frag */
>> -               nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
>> -               offset = size - fragsz;
>> -               if (unlikely(offset < 0)) {
>> +       remaining = nc->remaining & align_mask;
>> +       if (unlikely(remaining < fragsz)) {
>> +               if (unlikely(fragsz > PAGE_SIZE)) {
>>                          /*
>>                           * The caller is trying to allocate a fragment
>>                           * with fragsz > PAGE_SIZE but the cache isn't big
>> @@ -122,13 +117,31 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>>                           */
>>                          return NULL;
>>                  }
>> +
>> +               page = virt_to_page(nc->va);
>> +
>> +               if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
>> +                       goto refill;
>> +
>> +               if (unlikely(nc->pfmemalloc)) {
>> +                       free_unref_page(page, compound_order(page));
>> +                       goto refill;
>> +               }
>> +
>> +               /* OK, page count is 0, we can safely set it */
>> +               set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
>> +
>> +               /* reset page count bias and remaining to start of new frag */
>> +               nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
>> +               nc->remaining = size;
> 
> Why are you setting nc->remaining here? You set it a few lines below.
> This is redundant.

Yes, it is not needed once the '(fragsz > PAGE_SIZE)' check is
moved upward.

> 
>> +
>> +               remaining = size;
>>          }
>>
>>          nc->pagecnt_bias--;
>> -       offset &= align_mask;
>> -       nc->offset = offset;
>> +       nc->remaining = remaining - fragsz;
>>
>> -       return nc->va + offset;
>> +       return nc->va + (size - remaining);
>>   }
>>   EXPORT_SYMBOL(__page_frag_alloc_align);
>
diff mbox series

Patch

diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
index cdc1e3696439..b1c54b2b9308 100644
--- a/include/linux/mm_types_task.h
+++ b/include/linux/mm_types_task.h
@@ -52,10 +52,10 @@  struct page_frag {
 struct page_frag_cache {
 	void *va;
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-	__u16 offset;
+	__u16 remaining;
 	__u16 size;
 #else
-	__u32 offset;
+	__u32 remaining;
 #endif
 	/* we maintain a pagecount bias, so that we dont dirty cache line
 	 * containing page->_refcount every time we allocate a fragment.
diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
index 609a485cd02a..2958fe006fe7 100644
--- a/mm/page_frag_cache.c
+++ b/mm/page_frag_cache.c
@@ -22,6 +22,7 @@ 
 static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
 					     gfp_t gfp_mask)
 {
+	unsigned int page_size = PAGE_FRAG_CACHE_MAX_SIZE;
 	struct page *page = NULL;
 	gfp_t gfp = gfp_mask;
 
@@ -30,12 +31,21 @@  static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
 		   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
 	page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
 				PAGE_FRAG_CACHE_MAX_ORDER);
-	nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
 #endif
-	if (unlikely(!page))
+	if (unlikely(!page)) {
 		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
+		if (unlikely(!page)) {
+			nc->va = NULL;
+			return NULL;
+		}
 
-	nc->va = page ? page_address(page) : NULL;
+		page_size = PAGE_SIZE;
+	}
+
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+	nc->size = page_size;
+#endif
+	nc->va = page_address(page);
 
 	return page;
 }
@@ -64,8 +74,8 @@  void *__page_frag_alloc_align(struct page_frag_cache *nc,
 			      unsigned int align_mask)
 {
 	unsigned int size = PAGE_SIZE;
+	unsigned int remaining;
 	struct page *page;
-	int offset;
 
 	if (unlikely(!nc->va)) {
 refill:
@@ -82,35 +92,20 @@  void *__page_frag_alloc_align(struct page_frag_cache *nc,
 		 */
 		page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
 
-		/* reset page count bias and offset to start of new frag */
+		/* reset page count bias and remaining to start of new frag */
 		nc->pfmemalloc = page_is_pfmemalloc(page);
 		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
-		nc->offset = size;
+		nc->remaining = size;
 	}
 
-	offset = nc->offset - fragsz;
-	if (unlikely(offset < 0)) {
-		page = virt_to_page(nc->va);
-
-		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
-			goto refill;
-
-		if (unlikely(nc->pfmemalloc)) {
-			free_unref_page(page, compound_order(page));
-			goto refill;
-		}
-
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-		/* if size can vary use size else just use PAGE_SIZE */
-		size = nc->size;
+	/* if size can vary use size else just use PAGE_SIZE */
+	size = nc->size;
 #endif
-		/* OK, page count is 0, we can safely set it */
-		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
 
-		/* reset page count bias and offset to start of new frag */
-		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
-		offset = size - fragsz;
-		if (unlikely(offset < 0)) {
+	remaining = nc->remaining & align_mask;
+	if (unlikely(remaining < fragsz)) {
+		if (unlikely(fragsz > PAGE_SIZE)) {
 			/*
 			 * The caller is trying to allocate a fragment
 			 * with fragsz > PAGE_SIZE but the cache isn't big
@@ -122,13 +117,31 @@  void *__page_frag_alloc_align(struct page_frag_cache *nc,
 			 */
 			return NULL;
 		}
+
+		page = virt_to_page(nc->va);
+
+		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
+			goto refill;
+
+		if (unlikely(nc->pfmemalloc)) {
+			free_unref_page(page, compound_order(page));
+			goto refill;
+		}
+
+		/* OK, page count is 0, we can safely set it */
+		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
+
+		/* reset page count bias and remaining to start of new frag */
+		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
+		nc->remaining = size;
+
+		remaining = size;
 	}
 
 	nc->pagecnt_bias--;
-	offset &= align_mask;
-	nc->offset = offset;
+	nc->remaining = remaining - fragsz;
 
-	return nc->va + offset;
+	return nc->va + (size - remaining);
 }
 EXPORT_SYMBOL(__page_frag_alloc_align);