diff mbox series

[v5,2/3] block: add folio awareness instead of looping through pages

Message ID 20240619023420.34527-3-kundan.kumar@samsung.com (mailing list archive)
State New
Headers show
Series block: add larger order folio instead of pages | expand

Commit Message

Kundan Kumar June 19, 2024, 2:34 a.m. UTC
Add a larger contiguous length from a folio to the bio in one step and skip
the per-page merge processing.

Fetch the offset of the page within its folio. Depending on the size of the
folio and folio_offset, fetch a larger length. This length may span
multiple contiguous pages if the folio is multi-order.

Using this length, calculate the number of pages which will be added to the
bio and increment the loop counter to skip those pages.

Use a helper function to check that the pages are contiguous and belong to
the same folio. This is needed because a COW may happen and break the
contiguous mapping of the pages of a folio.

This technique avoids the overhead of merging pages which belong to the
same large-order folio.

Also convert the functions bio_iov_add_page() and
bio_iov_add_zone_append_page() to take folios.

Signed-off-by: Kundan Kumar <kundan.kumar@samsung.com>
---
 block/bio.c | 72 ++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 58 insertions(+), 14 deletions(-)

--
2.25.1

Comments

Hannes Reinecke June 19, 2024, 7:47 a.m. UTC | #1
On 6/19/24 04:34, Kundan Kumar wrote:
> Add a bigger size from folio to bio and skip merge processing for pages.
> 
> Fetch the offset of page within a folio. Depending on the size of folio
> and folio_offset, fetch a larger length. This length may consist of
> multiple contiguous pages if folio is multiorder.
> 
> Using the length calculate number of pages which will be added to bio and
> increment the loop counter to skip those pages.
> 
> Using a helper function check if pages are contiguous and belong to same
> folio, this is done as a COW may happen and change contiguous mapping of
> pages of folio.
> 
> This technique helps to avoid overhead of merging pages which belong to
> same large order folio.
> 
> Also folio-lize the functions bio_iov_add_page() and
> bio_iov_add_zone_append_page()
> 
> Signed-off-by: Kundan Kumar <kundan.kumar@samsung.com>
> ---
>   block/bio.c | 72 ++++++++++++++++++++++++++++++++++++++++++-----------
>   1 file changed, 58 insertions(+), 14 deletions(-)
> 
> diff --git a/block/bio.c b/block/bio.c
> index c8914febb16e..3e75b5b0eb6e 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -1224,7 +1224,7 @@ void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
>          bio_set_flag(bio, BIO_CLONED);
>   }
> 
> -static int bio_iov_add_page(struct bio *bio, struct page *page,
> +static int bio_iov_add_folio(struct bio *bio, struct folio *folio,
>                  unsigned int len, unsigned int offset)
>   {
>          bool same_page = false;
> @@ -1234,30 +1234,60 @@ static int bio_iov_add_page(struct bio *bio, struct page *page,
> 
>          if (bio->bi_vcnt > 0 &&
>              bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1],
> -                               page, len, offset, &same_page)) {
> +                               folio_page(folio, 0), len, offset,
> +                               &same_page)) {
>                  bio->bi_iter.bi_size += len;
>                  if (same_page)
> -                       bio_release_page(bio, page);
> +                       bio_release_page(bio, folio_page(folio, 0));
>                  return 0;
>          }
> -       __bio_add_page(bio, page, len, offset);
> +       bio_add_folio_nofail(bio, folio, len, offset);
>          return 0;
>   }
> 
> -static int bio_iov_add_zone_append_page(struct bio *bio, struct page *page,
> +static int bio_iov_add_zone_append_folio(struct bio *bio, struct folio *folio,
>                  unsigned int len, unsigned int offset)
>   {
>          struct request_queue *q = bdev_get_queue(bio->bi_bdev);
>          bool same_page = false;
> 
> -       if (bio_add_hw_page(q, bio, page, len, offset,
> +       if (bio_add_hw_folio(q, bio, folio, len, offset,
>                          queue_max_zone_append_sectors(q), &same_page) != len)
>                  return -EINVAL;
>          if (same_page)
> -               bio_release_page(bio, page);
> +               bio_release_page(bio, folio_page(folio, 0));
>          return 0;
>   }
> 
> +static unsigned int get_contig_folio_len(int *num_pages, struct page **pages,
> +                                        int i, struct folio *folio,
> +                                        ssize_t left, size_t offset)
> +{
> +       ssize_t bytes = left;
> +       size_t contig_sz = min_t(size_t,  PAGE_SIZE - offset, bytes);
> +       unsigned int j;
> +
> +       /*
> +        * We might COW a single page in the middle of
> +        * a large folio, so we have to check that all
> +        * pages belong to the same folio.
> +        */
> +       bytes -= contig_sz;
> +       for (j = i + 1; j < i + *num_pages; j++) {
> +               size_t next = min_t(size_t, PAGE_SIZE, bytes);
> +
> +               if (page_folio(pages[j]) != folio ||
> +                   pages[j] != pages[j - 1] + 1) {
> +                       break;
> +               }
> +               contig_sz += next;
> +               bytes -= next;
> +       }
> +       *num_pages = j - i;
> +
> +       return contig_sz;
> +}
> +
>   #define PAGE_PTRS_PER_BVEC     (sizeof(struct bio_vec) / sizeof(struct page *))
> 
>   /**
> @@ -1277,9 +1307,9 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
>          unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
>          struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
>          struct page **pages = (struct page **)bv;
> -       ssize_t size, left;
> -       unsigned len, i = 0;
> -       size_t offset;
> +       ssize_t size, left, len;
> +       unsigned int i = 0, num_pages;
> +       size_t offset, folio_offset;
>          int ret = 0;
> 
>          /*
> @@ -1321,15 +1351,29 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
> 
>          for (left = size, i = 0; left > 0; left -= len, i++) {
>                  struct page *page = pages[i];
> +               struct folio *folio = page_folio(page);
> +
> +               folio_offset = (folio_page_idx(folio, page) << PAGE_SHIFT) +
> +                               offset;
> +
> +               len = min_t(size_t, (folio_size(folio) - folio_offset), left);
> +
> +               num_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
> +
> +               if (num_pages > 1)
> +                       len = get_contig_folio_len(&num_pages, pages, i,
> +                                                  folio, left, offset);
> 
> -               len = min_t(size_t, PAGE_SIZE - offset, left);
>                  if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
> -                       ret = bio_iov_add_zone_append_page(bio, page, len,
> -                                       offset);
> +                       ret = bio_iov_add_zone_append_folio(bio, folio, len,
> +                                       folio_offset);
>                          if (ret)
>                                  break;
>                  } else
> -                       bio_iov_add_page(bio, page, len, offset);
> +                       bio_iov_add_folio(bio, folio, len, folio_offset);
> +
> +               /* Skip the pages which got added */
> +               i = i + (num_pages - 1);
> 
>                  offset = 0;
>          }
> --
> 2.25.1
> 
> 

Well. The issue here is that bvecs really only use the 'struct page' 
entry as an address to the data; the page size itself is completely
immaterial. So from that perspective it doesn't really matter whether
we use 'struct page' or 'struct folio' to get to that address.
However, what matters is whether we _iterate_ over pages or folios.
The current workflow is to first allocate an array of pages,
call one of the _get_pages() variants, and then iterate over all
pages.
What we should be doing is to add _get_folios() variants, working
on folio batches and not pre-allocated arrays.
Then we could iterate over all folios in the batch, and can modify
the 'XXX_get_pages()' variants to extract pages from the folio batch.
And then gradually move over all callers to work on folio batches.

Tall order, but I fear this is the best way going forward.
Matthew? Christoph? Is that what you had in mind?

Cheers,

Hannes
Kundan Kumar June 20, 2024, 4:48 a.m. UTC | #2
On 19/06/24 09:47AM, Hannes Reinecke wrote:
>On 6/19/24 04:34, Kundan Kumar wrote:
>>Add a bigger size from folio to bio and skip merge processing for pages.
>>
>>Fetch the offset of page within a folio. Depending on the size of folio
>>and folio_offset, fetch a larger length. This length may consist of
>>multiple contiguous pages if folio is multiorder.
>>
>>Using the length calculate number of pages which will be added to bio and
>>increment the loop counter to skip those pages.
>>
>>Using a helper function check if pages are contiguous and belong to same
>>folio, this is done as a COW may happen and change contiguous mapping of
>>pages of folio.
>>
>>This technique helps to avoid overhead of merging pages which belong to
>>same large order folio.
>>
>>Also folio-lize the functions bio_iov_add_page() and
>>bio_iov_add_zone_append_page()
>>
>>Signed-off-by: Kundan Kumar <kundan.kumar@samsung.com>
>>---
>>  block/bio.c | 72 ++++++++++++++++++++++++++++++++++++++++++-----------
>>  1 file changed, 58 insertions(+), 14 deletions(-)
>>
>>diff --git a/block/bio.c b/block/bio.c
>>index c8914febb16e..3e75b5b0eb6e 100644
>>--- a/block/bio.c
>>+++ b/block/bio.c
>>@@ -1224,7 +1224,7 @@ void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
>>         bio_set_flag(bio, BIO_CLONED);
>>  }
>>
>>-static int bio_iov_add_page(struct bio *bio, struct page *page,
>>+static int bio_iov_add_folio(struct bio *bio, struct folio *folio,
>>                 unsigned int len, unsigned int offset)
>>  {
>>         bool same_page = false;
>>@@ -1234,30 +1234,60 @@ static int bio_iov_add_page(struct bio *bio, struct page *page,
>>
>>         if (bio->bi_vcnt > 0 &&
>>             bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1],
>>-                               page, len, offset, &same_page)) {
>>+                               folio_page(folio, 0), len, offset,
>>+                               &same_page)) {
>>                 bio->bi_iter.bi_size += len;
>>                 if (same_page)
>>-                       bio_release_page(bio, page);
>>+                       bio_release_page(bio, folio_page(folio, 0));
>>                 return 0;
>>         }
>>-       __bio_add_page(bio, page, len, offset);
>>+       bio_add_folio_nofail(bio, folio, len, offset);
>>         return 0;
>>  }
>>
>>-static int bio_iov_add_zone_append_page(struct bio *bio, struct page *page,
>>+static int bio_iov_add_zone_append_folio(struct bio *bio, struct folio *folio,
>>                 unsigned int len, unsigned int offset)
>>  {
>>         struct request_queue *q = bdev_get_queue(bio->bi_bdev);
>>         bool same_page = false;
>>
>>-       if (bio_add_hw_page(q, bio, page, len, offset,
>>+       if (bio_add_hw_folio(q, bio, folio, len, offset,
>>                         queue_max_zone_append_sectors(q), &same_page) != len)
>>                 return -EINVAL;
>>         if (same_page)
>>-               bio_release_page(bio, page);
>>+               bio_release_page(bio, folio_page(folio, 0));
>>         return 0;
>>  }
>>
>>+static unsigned int get_contig_folio_len(int *num_pages, struct page **pages,
>>+                                        int i, struct folio *folio,
>>+                                        ssize_t left, size_t offset)
>>+{
>>+       ssize_t bytes = left;
>>+       size_t contig_sz = min_t(size_t,  PAGE_SIZE - offset, bytes);
>>+       unsigned int j;
>>+
>>+       /*
>>+        * We might COW a single page in the middle of
>>+        * a large folio, so we have to check that all
>>+        * pages belong to the same folio.
>>+        */
>>+       bytes -= contig_sz;
>>+       for (j = i + 1; j < i + *num_pages; j++) {
>>+               size_t next = min_t(size_t, PAGE_SIZE, bytes);
>>+
>>+               if (page_folio(pages[j]) != folio ||
>>+                   pages[j] != pages[j - 1] + 1) {
>>+                       break;
>>+               }
>>+               contig_sz += next;
>>+               bytes -= next;
>>+       }
>>+       *num_pages = j - i;
>>+
>>+       return contig_sz;
>>+}
>>+
>>  #define PAGE_PTRS_PER_BVEC     (sizeof(struct bio_vec) / sizeof(struct page *))
>>
>>  /**
>>@@ -1277,9 +1307,9 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
>>         unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
>>         struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
>>         struct page **pages = (struct page **)bv;
>>-       ssize_t size, left;
>>-       unsigned len, i = 0;
>>-       size_t offset;
>>+       ssize_t size, left, len;
>>+       unsigned int i = 0, num_pages;
>>+       size_t offset, folio_offset;
>>         int ret = 0;
>>
>>         /*
>>@@ -1321,15 +1351,29 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
>>
>>         for (left = size, i = 0; left > 0; left -= len, i++) {
>>                 struct page *page = pages[i];
>>+               struct folio *folio = page_folio(page);
>>+
>>+               folio_offset = (folio_page_idx(folio, page) << PAGE_SHIFT) +
>>+                               offset;
>>+
>>+               len = min_t(size_t, (folio_size(folio) - folio_offset), left);
>>+
>>+               num_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
>>+
>>+               if (num_pages > 1)
>>+                       len = get_contig_folio_len(&num_pages, pages, i,
>>+                                                  folio, left, offset);
>>
>>-               len = min_t(size_t, PAGE_SIZE - offset, left);
>>                 if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
>>-                       ret = bio_iov_add_zone_append_page(bio, page, len,
>>-                                       offset);
>>+                       ret = bio_iov_add_zone_append_folio(bio, folio, len,
>>+                                       folio_offset);
>>                         if (ret)
>>                                 break;
>>                 } else
>>-                       bio_iov_add_page(bio, page, len, offset);
>>+                       bio_iov_add_folio(bio, folio, len, folio_offset);
>>+
>>+               /* Skip the pages which got added */
>>+               i = i + (num_pages - 1);
>>
>>                 offset = 0;
>>         }
>>--
>>2.25.1
>>
>>
>
>Well. The issue here is that bvecs really only use the 'struct page' 
>entry as an address to the data; the page size itself is completely
>immaterial. So from that perspective it doesn't really matter whether
>we use 'struct page' or 'struct folio' to get to that address.
>However, what matters is whether we _iterate_ over pages or folios.
>The current workflow is to first allocate an array of pages,
>call one of the _get_pages() variants, and the iterate over all
>pages.
>What we should be doing is to add _get_folios() variants, working
>on folio batches and not pre-allocated arrays.

The XXX_get_pages() functions do a page table walk and fill the pages array
with the pages corresponding to a user space address. The _get_folios()
variants shall return a folio_vec, rather than a folio array, as every folio
entry will need a folio_offset and a len per folio. If we convert to
_get_folios() variants, the number of folios may be smaller than the number
of pages. But we will still need to allocate a folio_vec array as a
replacement for the pages array.

Am I missing something?

Down in the page table walk (gup_fast_pte_range), we fill the pages array via
addr -> pte -> page. This would have to be modified to fill a folio_vec array.
The page table walk also deals with huge pages, and it looks like the huge page
functions would also have to be modified to fill the folio_vec array. The gup
slow path would likewise need to be modified to fill the folio_vec array.
--
Kundan

>Then we could iterate over all folios in the batch, and can modify
>the 'XXX_get_pages()' variants to extract pages from the folio batch.
>And then gradually move over all callers to work on folio batches.
>
>Tall order, but I fear this is the best way going forward.
>Matthew? Christoph? Is that what you had in mind?
>
>Cheers,
>
>Hannes
>-- 
>Dr. Hannes Reinecke                  Kernel Storage Architect
>hare@suse.de                                +49 911 74053 688
>SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
>HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
>
Hannes Reinecke June 20, 2024, 7:21 a.m. UTC | #3
On 6/20/24 06:48, Kundan Kumar wrote:
> On 19/06/24 09:47AM, Hannes Reinecke wrote:
[ .. ]
>>
>> Well. The issue here is that bvecs really only use the 'struct page' 
>> entry as an address to the data; the page size itself is completely
>> immaterial. So from that perspective it doesn't really matter whether
>> we use 'struct page' or 'struct folio' to get to that address.
>> However, what matters is whether we _iterate_ over pages or folios.
>> The current workflow is to first allocate an array of pages,
>> call one of the _get_pages() variants, and the iterate over all
>> pages.
>> What we should be doing is to add _get_folios() variants, working
>> on folio batches and not pre-allocated arrays.
> 
> The XXX_get_pages() functions do page table walk and fill the pages
> corresponding to a user space addr in pages array. The _get_folios()
> variants shall return a folio_vec, rather than folio array, as every folio
> entry will need a folio_offset and len per folio. If we convert to
> _get_folios() variants, number of folios may be lesser than number of
> pages. But we will need allocation of folio_vec array as a replacement
> of pages array.
> 
Well, actually I was thinking of using 'struct folio_batch' instead of
an array. There is an entire set of helpers here (pagevec.h) for 
precisely this purpose.

But yes, we will end up with fewer folios than pages (which was kinda the
idea).

> Am I missing something?
> 
> Down in the page table walk (gup_fast_pte_range), we fill the pages array
> addr -> pte -> page. This shall be modified to fill a folio_vec array. The
> page table walk also deals with huge pages, and looks like huge page
> functions shall also be modified to fill folio_vec array. Also handling
> the gup slow path will need a modification to fill the folio_vec array.

Yes. I did say it's a tall order. But it would have the advantage of
assembling large folios right at the start, so lower levels (i.e. those
consuming the folio batch) would not need to worry about painstakingly
re-assembling folios from a list of pages.

Cheers,

Hannes
diff mbox series

Patch

diff --git a/block/bio.c b/block/bio.c
index c8914febb16e..3e75b5b0eb6e 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1224,7 +1224,7 @@  void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
        bio_set_flag(bio, BIO_CLONED);
 }

-static int bio_iov_add_page(struct bio *bio, struct page *page,
+static int bio_iov_add_folio(struct bio *bio, struct folio *folio,
                unsigned int len, unsigned int offset)
 {
        bool same_page = false;
@@ -1234,30 +1234,60 @@  static int bio_iov_add_page(struct bio *bio, struct page *page,

        if (bio->bi_vcnt > 0 &&
            bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1],
-                               page, len, offset, &same_page)) {
+                               folio_page(folio, 0), len, offset,
+                               &same_page)) {
                bio->bi_iter.bi_size += len;
                if (same_page)
-                       bio_release_page(bio, page);
+                       bio_release_page(bio, folio_page(folio, 0));
                return 0;
        }
-       __bio_add_page(bio, page, len, offset);
+       bio_add_folio_nofail(bio, folio, len, offset);
        return 0;
 }

-static int bio_iov_add_zone_append_page(struct bio *bio, struct page *page,
+static int bio_iov_add_zone_append_folio(struct bio *bio, struct folio *folio,
                unsigned int len, unsigned int offset)
 {
        struct request_queue *q = bdev_get_queue(bio->bi_bdev);
        bool same_page = false;

-       if (bio_add_hw_page(q, bio, page, len, offset,
+       if (bio_add_hw_folio(q, bio, folio, len, offset,
                        queue_max_zone_append_sectors(q), &same_page) != len)
                return -EINVAL;
        if (same_page)
-               bio_release_page(bio, page);
+               bio_release_page(bio, folio_page(folio, 0));
        return 0;
 }

+static unsigned int get_contig_folio_len(int *num_pages, struct page **pages,
+                                        int i, struct folio *folio,
+                                        ssize_t left, size_t offset)
+{
+       ssize_t bytes = left;
+       size_t contig_sz = min_t(size_t,  PAGE_SIZE - offset, bytes);
+       unsigned int j;
+
+       /*
+        * We might COW a single page in the middle of
+        * a large folio, so we have to check that all
+        * pages belong to the same folio.
+        */
+       bytes -= contig_sz;
+       for (j = i + 1; j < i + *num_pages; j++) {
+               size_t next = min_t(size_t, PAGE_SIZE, bytes);
+
+               if (page_folio(pages[j]) != folio ||
+                   pages[j] != pages[j - 1] + 1) {
+                       break;
+               }
+               contig_sz += next;
+               bytes -= next;
+       }
+       *num_pages = j - i;
+
+       return contig_sz;
+}
+
 #define PAGE_PTRS_PER_BVEC     (sizeof(struct bio_vec) / sizeof(struct page *))

 /**
@@ -1277,9 +1307,9 @@  static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
        unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
        struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
        struct page **pages = (struct page **)bv;
-       ssize_t size, left;
-       unsigned len, i = 0;
-       size_t offset;
+       ssize_t size, left, len;
+       unsigned int i = 0, num_pages;
+       size_t offset, folio_offset;
        int ret = 0;

        /*
@@ -1321,15 +1351,29 @@  static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)

        for (left = size, i = 0; left > 0; left -= len, i++) {
                struct page *page = pages[i];
+               struct folio *folio = page_folio(page);
+
+               folio_offset = (folio_page_idx(folio, page) << PAGE_SHIFT) +
+                               offset;
+
+               len = min_t(size_t, (folio_size(folio) - folio_offset), left);
+
+               num_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
+
+               if (num_pages > 1)
+                       len = get_contig_folio_len(&num_pages, pages, i,
+                                                  folio, left, offset);

-               len = min_t(size_t, PAGE_SIZE - offset, left);
                if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
-                       ret = bio_iov_add_zone_append_page(bio, page, len,
-                                       offset);
+                       ret = bio_iov_add_zone_append_folio(bio, folio, len,
+                                       folio_offset);
                        if (ret)
                                break;
                } else
-                       bio_iov_add_page(bio, page, len, offset);
+                       bio_iov_add_folio(bio, folio, len, folio_offset);
+
+               /* Skip the pages which got added */
+               i = i + (num_pages - 1);

                offset = 0;
        }