diff mbox series

[01/11] libceph: support prefix and suffix in bio_iter

Message ID 1543841435-13652-2-git-send-email-dongsheng.yang@easystack.cn (mailing list archive)
State New, archived
Headers show
Series [01/11] libceph: support prefix and suffix in bio_iter | expand

Commit Message

Dongsheng Yang Dec. 3, 2018, 12:50 p.m. UTC
When we want to add some prefix to the bio data, such as journal header,
we don't want to copy it from bio into a new larger memory. Instead, we need
a prefix page and suffix page to store the header and footer in bio_iter.

Signed-off-by: Dongsheng Yang <dongsheng.yang@easystack.cn>
---
 include/linux/ceph/messenger.h |  9 ++++
 net/ceph/messenger.c           | 96 ++++++++++++++++++++++++++++++++----------
 2 files changed, 83 insertions(+), 22 deletions(-)

Comments

Ilya Dryomov Dec. 4, 2018, 2:28 p.m. UTC | #1
On Mon, Dec 3, 2018 at 1:51 PM Dongsheng Yang
<dongsheng.yang@easystack.cn> wrote:
>
> When we want to add some prefix to the bio data, such as journal header,
> we don't want to copy it from bio into a new larger memory. Instead, we need
> a prefix page and suffix page to store the header and footer in bio_iter.
>
> Signed-off-by: Dongsheng Yang <dongsheng.yang@easystack.cn>
> ---
>  include/linux/ceph/messenger.h |  9 ++++
>  net/ceph/messenger.c           | 96 ++++++++++++++++++++++++++++++++----------
>  2 files changed, 83 insertions(+), 22 deletions(-)
>
> diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
> index 800a212..0da6a4b 100644
> --- a/include/linux/ceph/messenger.h
> +++ b/include/linux/ceph/messenger.h
> @@ -87,6 +87,15 @@ enum ceph_msg_data_type {
>  struct ceph_bio_iter {
>         struct bio *bio;
>         struct bvec_iter iter;
> +       size_t bio_len;
> +
> +       struct page *prefix_page;
> +       unsigned int prefix_offset;
> +       unsigned int prefix_len;
> +
> +       struct page *suffix_page;
> +       unsigned int suffix_offset;
> +       unsigned int suffix_len;
>  };
>
>  #define __ceph_bio_iter_advance_step(it, n, STEP) do {                       \
> diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
> index 57fcc6b..64c70c5 100644
> --- a/net/ceph/messenger.c
> +++ b/net/ceph/messenger.c
> @@ -828,23 +828,46 @@ static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data_cursor *cursor,
>
>         cursor->resid = min_t(size_t, length, data->bio_length);
>         *it = data->bio_pos;
> -       if (cursor->resid < it->iter.bi_size)
> -               it->iter.bi_size = cursor->resid;
>
> -       BUG_ON(cursor->resid < bio_iter_len(it->bio, it->iter));
> -       cursor->last_piece = cursor->resid == bio_iter_len(it->bio, it->iter);
> +       if (cursor->resid < it->iter.bi_size + it->prefix_len + it->suffix_len)
> +               it->iter.bi_size = cursor->resid - it->prefix_len - it->suffix_len;
> +
> +       it->bio_len = cursor->resid - it->prefix_len - it->suffix_len;
> +       if (it->suffix_len) {
> +               cursor->last_piece = cursor->resid == it->suffix_len;
> +       } else if (it->bio_len) {
> +               cursor->last_piece = cursor->resid == bio_iter_len(it->bio, it->iter);
> +       } else if (it->prefix_len) {
> +               cursor->last_piece = cursor->resid == it->prefix_len;
> +       } else {
> +               BUG();
> +       }
>  }
>
>  static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor,
>                                                 size_t *page_offset,
>                                                 size_t *length)
>  {
> -       struct bio_vec bv = bio_iter_iovec(cursor->bio_iter.bio,
> -                                          cursor->bio_iter.iter);
> +       struct ceph_bio_iter *it = &cursor->bio_iter;
>
> -       *page_offset = bv.bv_offset;
> -       *length = bv.bv_len;
> -       return bv.bv_page;
> +       if (it->prefix_len) {
> +               *page_offset = it->prefix_offset;
> +               *length = it->prefix_len;
> +               return it->prefix_page;
> +       } else if (it->bio_len) {
> +               struct bio_vec bv = bio_iter_iovec(cursor->bio_iter.bio,
> +                                                  cursor->bio_iter.iter);
> +
> +               *page_offset = bv.bv_offset;
> +               *length = bv.bv_len;
> +               return bv.bv_page;
> +       } else {
> +               BUG_ON(!it->suffix_len);
> +
> +               *page_offset = it->suffix_offset;
> +               *length = it->suffix_len;
> +               return it->suffix_page;
> +       }
>  }
>
>  static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor,
> @@ -852,29 +875,58 @@ static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor,
>  {
>         struct ceph_bio_iter *it = &cursor->bio_iter;
>
> +       if (!bytes)
> +               return false;
> +
>         BUG_ON(bytes > cursor->resid);
> -       BUG_ON(bytes > bio_iter_len(it->bio, it->iter));
>         cursor->resid -= bytes;
> -       bio_advance_iter(it->bio, &it->iter, bytes);
> +       if (it->prefix_len) {
> +               BUG_ON(bytes > it->prefix_len);
> +               it->prefix_offset += bytes;
> +               it->prefix_len -= bytes;
> +               if (it->prefix_len)
> +                       return false;
> +       } else if (it->bio_len) {
> +               BUG_ON(bytes > bio_iter_len(it->bio, it->iter));
> +               bio_advance_iter(it->bio, &it->iter, bytes);
> +               it->bio_len -= bytes;
> +               if (!cursor->resid) {
> +                       BUG_ON(!cursor->last_piece);
> +                       return false;   /* no more data */
> +               }
> +               if (it->iter.bi_size && it->iter.bi_bvec_done)
> +                       return false;   /* more bytes to process in this segment */
> +
> +               if (it->bio_len && !it->iter.bi_size) {
> +                       it->bio = it->bio->bi_next;
> +                       it->iter = it->bio->bi_iter;
> +                       if (cursor->resid < it->iter.bi_size)
> +                               it->iter.bi_size = cursor->resid;
> +               }
> +       } else {
> +               BUG_ON(!it->suffix_len);
> +               it->suffix_offset += bytes;
> +               it->suffix_len -= bytes;
> +               if (it->suffix_len)
> +                       return false;
> +       }
>
>         if (!cursor->resid) {
>                 BUG_ON(!cursor->last_piece);
>                 return false;   /* no more data */
>         }
>
> -       if (!bytes || (it->iter.bi_size && it->iter.bi_bvec_done))
> -               return false;   /* more bytes to process in this segment */
> -
> -       if (!it->iter.bi_size) {
> -               it->bio = it->bio->bi_next;
> -               it->iter = it->bio->bi_iter;
> -               if (cursor->resid < it->iter.bi_size)
> -                       it->iter.bi_size = cursor->resid;
> +       BUG_ON(cursor->last_piece);
> +       if (it->suffix_len) {
> +               cursor->last_piece = cursor->resid == it->suffix_len;
> +       } else if (it->bio_len) {
> +               cursor->last_piece = cursor->resid == bio_iter_len(it->bio, it->iter);
> +       } else if (it->prefix_len) {
> +               cursor->last_piece = cursor->resid == it->prefix_len;
> +       } else {
> +               BUG();
>         }
>
> -       BUG_ON(cursor->last_piece);
> -       BUG_ON(cursor->resid < bio_iter_len(it->bio, it->iter));
> -       cursor->last_piece = cursor->resid == bio_iter_len(it->bio, it->iter);
>         return true;
>  }
>  #endif /* CONFIG_BLOCK */

Have you looked at message data items (ceph_msg_data) for this?
A message can have multiple data items, each of different type (BIO,
PAGES, etc) and with its own iterator.  The messenger will exhaust them
in order, so you can create a message with three data items: prefix
(type PAGES), data (type BIO), suffix (type PAGES).

If the data item framework isn't flexible enough, we should work on
extending it.  Special casing the bio iterator is the wrong thing to
do.

Thanks,

                Ilya
Dongsheng Yang Dec. 5, 2018, 1:16 a.m. UTC | #2
On 12/04/2018 10:28 PM, Ilya Dryomov wrote:
> On Mon, Dec 3, 2018 at 1:51 PM Dongsheng Yang
> <dongsheng.yang@easystack.cn> wrote:
>> When we want to add some prefix to the bio data, such as journal header,
>> we don't want to copy it from bio into a new larger memory. Instead, we need
>> a prefix page and suffix page to store the header and footer in bio_iter.
>>
>> Signed-off-by: Dongsheng Yang <dongsheng.yang@easystack.cn>
...
>>   }
>>   #endif /* CONFIG_BLOCK */
> Have you looked at message data items (ceph_msg_data) for this?
> A message can have multiple data items, each of different type (BIO,
> PAGES, etc) and with its own iterator.  The messenger will exhaust them
> in order, so you can create a message with three data items: prefix
> (type PAGES), data (type BIO), suffix (type PAGES).
Yes, that was my first idea, but I found the framework is not
enough to my case. And as I need to use bio only in my case,
so I chose the easy way to extend the special case for bio.
>
> If the data item framework isn't flexible enough, we should work on
> extending it.  Special casing the bio iterator is the wrong thing to
> do.

Agreed, I will go back to the framework itself and find some way
to make it work in my case.

Thanx
>
> Thanks,
>
>                  Ilya
>
Ilya Dryomov Dec. 5, 2018, 10:18 a.m. UTC | #3
On Wed, Dec 5, 2018 at 2:17 AM Dongsheng Yang
<dongsheng.yang@easystack.cn> wrote:
>
>
>
> On 12/04/2018 10:28 PM, Ilya Dryomov wrote:
> > On Mon, Dec 3, 2018 at 1:51 PM Dongsheng Yang
> > <dongsheng.yang@easystack.cn> wrote:
> >> When we want to add some prefix to the bio data, such as journal header,
> >> we don't want to copy it from bio into a new larger memory. Instead, we need
> >> a prefix page and suffix page to store the header and footer in bio_iter.
> >>
> >> Signed-off-by: Dongsheng Yang <dongsheng.yang@easystack.cn>
> ...
> >>   }
> >>   #endif /* CONFIG_BLOCK */
> > Have you looked at message data items (ceph_msg_data) for this?
> > A message can have multiple data items, each of different type (BIO,
> > PAGES, etc) and with its own iterator.  The messenger will exhaust them
> > in order, so you can create a message with three data items: prefix
> > (type PAGES), data (type BIO), suffix (type PAGES).
> Yes, that was my first idea, but I found the framework is not
> enough to my case. And as I need to use bio only in my case,

What was the problem?

Thanks,

                Ilya
Ilya Dryomov Dec. 21, 2018, 9:35 a.m. UTC | #4
On Tue, Dec 18, 2018 at 11:10 AM Dongsheng Yang
<dongsheng.yang@easystack.cn> wrote:
>
>
>
> On 12/05/2018 06:18 PM, Ilya Dryomov wrote:
>
> On Wed, Dec 5, 2018 at 2:17 AM Dongsheng Yang
> <dongsheng.yang@easystack.cn> wrote:
>
>
> On 12/04/2018 10:28 PM, Ilya Dryomov wrote:
>
> On Mon, Dec 3, 2018 at 1:51 PM Dongsheng Yang
> <dongsheng.yang@easystack.cn> wrote:
>
> When we want to add some prefix to the bio data, such as journal header,
> we don't want to copy it from bio into a new larger memory. Instead, we need
> a prefix page and suffix page to store the header and footer in bio_iter.
>
> Signed-off-by: Dongsheng Yang <dongsheng.yang@easystack.cn>
>
> ...
>
>   }
>   #endif /* CONFIG_BLOCK */
>
> Have you looked at message data items (ceph_msg_data) for this?
> A message can have multiple data items, each of different type (BIO,
> PAGES, etc) and with its own iterator.  The messenger will exhaust them
> in order, so you can create a message with three data items: prefix
> (type PAGES), data (type BIO), suffix (type PAGES).
>
> Yes, that was my first idea, but I found the framework is not
> enough to my case. And as I need to use bio only in my case,
>
> What was the problem?
>
>
> The problem is there is only one type of ceph_osd_data  in
> ceph_osd_req_op.extent.osd_data.
>
> But what I want in journal append case is one ceph_osd_req_op (op
> = CEPH_OSD_OP_APPEND), and extent of this ceph_osd_req_op have tree
> ceph_osd_data (example: PAGES, BIO, PAGES).

I don't see a problem with adding two additional ceph_osd_data members
to extent with a comment explaining that rbd journaling needs a prefix
and a suffix.  READ, WRITE, etc would still use just one, APPEND could
use all three when required.

Thanks,

                Ilya
Dongsheng Yang Dec. 24, 2018, 1:25 a.m. UTC | #5
On 12/21/2018 05:35 PM, Ilya Dryomov wrote:
> On Tue, Dec 18, 2018 at 11:10 AM Dongsheng Yang
> <dongsheng.yang@easystack.cn> wrote:
>>
>>
>> On 12/05/2018 06:18 PM, Ilya Dryomov wrote:
>>
>> On Wed, Dec 5, 2018 at 2:17 AM Dongsheng Yang
>> <dongsheng.yang@easystack.cn> wrote:
>>
>>
>> On 12/04/2018 10:28 PM, Ilya Dryomov wrote:
>>
>> On Mon, Dec 3, 2018 at 1:51 PM Dongsheng Yang
>> <dongsheng.yang@easystack.cn> wrote:
>>
>> When we want to add some prefix to the bio data, such as journal header,
>> we don't want to copy it from bio into a new larger memory. Instead, we need
>> a prefix page and suffix page to store the header and footer in bio_iter.
>>
>> Signed-off-by: Dongsheng Yang <dongsheng.yang@easystack.cn>
>>
>> ...
>>
>>    }
>>    #endif /* CONFIG_BLOCK */
>>
>> Have you looked at message data items (ceph_msg_data) for this?
>> A message can have multiple data items, each of different type (BIO,
>> PAGES, etc) and with its own iterator.  The messenger will exhaust them
>> in order, so you can create a message with three data items: prefix
>> (type PAGES), data (type BIO), suffix (type PAGES).
>>
>> Yes, that was my first idea, but I found the framework is not
>> enough to my case. And as I need to use bio only in my case,
>>
>> What was the problem?
>>
>>
>> The problem is there is only one type of ceph_osd_data  in
>> ceph_osd_req_op.extent.osd_data.
>>
>> But what I want in journal append case is one ceph_osd_req_op (op
>> = CEPH_OSD_OP_APPEND), and extent of this ceph_osd_req_op have tree
>> ceph_osd_data (example: PAGES, BIO, PAGES).
> I don't see a problem with adding two additional ceph_osd_data members
> to extent with a comment explaining that rbd journaling needs a prefix
> and a suffix.  READ, WRITE, etc would still use just one, APPEND could
> use all three when required.

Thanx for your suggestion.
>
> Thanks,
>
>                  Ilya
>
diff mbox series

Patch

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 800a212..0da6a4b 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -87,6 +87,15 @@  enum ceph_msg_data_type {
 struct ceph_bio_iter {
 	struct bio *bio;
 	struct bvec_iter iter;
+	size_t bio_len;
+
+	struct page *prefix_page;
+	unsigned int prefix_offset;
+	unsigned int prefix_len;
+
+	struct page *suffix_page;
+	unsigned int suffix_offset;
+	unsigned int suffix_len;
 };
 
 #define __ceph_bio_iter_advance_step(it, n, STEP) do {			      \
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 57fcc6b..64c70c5 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -828,23 +828,46 @@  static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data_cursor *cursor,
 
 	cursor->resid = min_t(size_t, length, data->bio_length);
 	*it = data->bio_pos;
-	if (cursor->resid < it->iter.bi_size)
-		it->iter.bi_size = cursor->resid;
 
-	BUG_ON(cursor->resid < bio_iter_len(it->bio, it->iter));
-	cursor->last_piece = cursor->resid == bio_iter_len(it->bio, it->iter);
+	if (cursor->resid < it->iter.bi_size + it->prefix_len + it->suffix_len)
+		it->iter.bi_size = cursor->resid - it->prefix_len - it->suffix_len;
+
+	it->bio_len = cursor->resid - it->prefix_len - it->suffix_len;
+	if (it->suffix_len) {
+		cursor->last_piece = cursor->resid == it->suffix_len;
+	} else if (it->bio_len) {
+		cursor->last_piece = cursor->resid == bio_iter_len(it->bio, it->iter);
+	} else if (it->prefix_len) {
+		cursor->last_piece = cursor->resid == it->prefix_len;
+	} else {
+		BUG();
+	}
 }
 
 static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor,
 						size_t *page_offset,
 						size_t *length)
 {
-	struct bio_vec bv = bio_iter_iovec(cursor->bio_iter.bio,
-					   cursor->bio_iter.iter);
+	struct ceph_bio_iter *it = &cursor->bio_iter;
 
-	*page_offset = bv.bv_offset;
-	*length = bv.bv_len;
-	return bv.bv_page;
+	if (it->prefix_len) {
+		*page_offset = it->prefix_offset;
+		*length = it->prefix_len;
+		return it->prefix_page;
+	} else if (it->bio_len) {
+		struct bio_vec bv = bio_iter_iovec(cursor->bio_iter.bio,
+						   cursor->bio_iter.iter);
+
+		*page_offset = bv.bv_offset;
+		*length = bv.bv_len;
+		return bv.bv_page;
+	} else {
+		BUG_ON(!it->suffix_len);
+
+		*page_offset = it->suffix_offset;
+		*length = it->suffix_len;
+		return it->suffix_page;
+	}
 }
 
 static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor,
@@ -852,29 +875,58 @@  static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor,
 {
 	struct ceph_bio_iter *it = &cursor->bio_iter;
 
+	if (!bytes)
+		return false;
+
 	BUG_ON(bytes > cursor->resid);
-	BUG_ON(bytes > bio_iter_len(it->bio, it->iter));
 	cursor->resid -= bytes;
-	bio_advance_iter(it->bio, &it->iter, bytes);
+	if (it->prefix_len) {
+		BUG_ON(bytes > it->prefix_len);
+		it->prefix_offset += bytes;
+		it->prefix_len -= bytes;
+		if (it->prefix_len)
+			return false;
+	} else if (it->bio_len) {
+		BUG_ON(bytes > bio_iter_len(it->bio, it->iter));
+		bio_advance_iter(it->bio, &it->iter, bytes);
+		it->bio_len -= bytes;
+		if (!cursor->resid) {
+			BUG_ON(!cursor->last_piece);
+			return false;   /* no more data */
+		}
+		if (it->iter.bi_size && it->iter.bi_bvec_done)
+			return false;	/* more bytes to process in this segment */
+
+		if (it->bio_len && !it->iter.bi_size) {
+			it->bio = it->bio->bi_next;
+			it->iter = it->bio->bi_iter;
+			if (cursor->resid < it->iter.bi_size)
+				it->iter.bi_size = cursor->resid;
+		}
+	} else {
+		BUG_ON(!it->suffix_len);
+		it->suffix_offset += bytes;
+		it->suffix_len -= bytes;
+		if (it->suffix_len)
+			return false;
+	}
 
 	if (!cursor->resid) {
 		BUG_ON(!cursor->last_piece);
 		return false;   /* no more data */
 	}
 
-	if (!bytes || (it->iter.bi_size && it->iter.bi_bvec_done))
-		return false;	/* more bytes to process in this segment */
-
-	if (!it->iter.bi_size) {
-		it->bio = it->bio->bi_next;
-		it->iter = it->bio->bi_iter;
-		if (cursor->resid < it->iter.bi_size)
-			it->iter.bi_size = cursor->resid;
+	BUG_ON(cursor->last_piece);
+	if (it->suffix_len) {
+		cursor->last_piece = cursor->resid == it->suffix_len;
+	} else if (it->bio_len) {
+		cursor->last_piece = cursor->resid == bio_iter_len(it->bio, it->iter);
+	} else if (it->prefix_len) {
+		cursor->last_piece = cursor->resid == it->prefix_len;
+	} else {
+		BUG();
 	}
 
-	BUG_ON(cursor->last_piece);
-	BUG_ON(cursor->resid < bio_iter_len(it->bio, it->iter));
-	cursor->last_piece = cursor->resid == bio_iter_len(it->bio, it->iter);
 	return true;
 }
 #endif /* CONFIG_BLOCK */