Message ID | 1543841435-13652-2-git-send-email-dongsheng.yang@easystack.cn (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [01/11] libceph: support prefix and suffix in bio_iter | expand |
On Mon, Dec 3, 2018 at 1:51 PM Dongsheng Yang <dongsheng.yang@easystack.cn> wrote: > > When we want to add some prefix to the bio data, such as journal header, > we don't want to copy it from bio into a new larger memory. Instead, we need > a prefix page and suffix page to store the header and footer in bio_iter. > > Signed-off-by: Dongsheng Yang <dongsheng.yang@easystack.cn> > --- > include/linux/ceph/messenger.h | 9 ++++ > net/ceph/messenger.c | 96 ++++++++++++++++++++++++++++++++---------- > 2 files changed, 83 insertions(+), 22 deletions(-) > > diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h > index 800a212..0da6a4b 100644 > --- a/include/linux/ceph/messenger.h > +++ b/include/linux/ceph/messenger.h > @@ -87,6 +87,15 @@ enum ceph_msg_data_type { > struct ceph_bio_iter { > struct bio *bio; > struct bvec_iter iter; > + size_t bio_len; > + > + struct page *prefix_page; > + unsigned int prefix_offset; > + unsigned int prefix_len; > + > + struct page *suffix_page; > + unsigned int suffix_offset; > + unsigned int suffix_len; > }; > > #define __ceph_bio_iter_advance_step(it, n, STEP) do { \ > diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c > index 57fcc6b..64c70c5 100644 > --- a/net/ceph/messenger.c > +++ b/net/ceph/messenger.c > @@ -828,23 +828,46 @@ static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data_cursor *cursor, > > cursor->resid = min_t(size_t, length, data->bio_length); > *it = data->bio_pos; > - if (cursor->resid < it->iter.bi_size) > - it->iter.bi_size = cursor->resid; > > - BUG_ON(cursor->resid < bio_iter_len(it->bio, it->iter)); > - cursor->last_piece = cursor->resid == bio_iter_len(it->bio, it->iter); > + if (cursor->resid < it->iter.bi_size + it->prefix_len + it->suffix_len) > + it->iter.bi_size = cursor->resid - it->prefix_len - it->suffix_len; > + > + it->bio_len = cursor->resid - it->prefix_len - it->suffix_len; > + if (it->suffix_len) { > + cursor->last_piece = cursor->resid == it->suffix_len; > + } else if (it->bio_len) { > + cursor->last_piece = cursor->resid == bio_iter_len(it->bio, it->iter); > + } else if (it->prefix_len) { > + cursor->last_piece = cursor->resid == it->prefix_len; > + } else { > + BUG(); > + } > } > > static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor, > size_t *page_offset, > size_t *length) > { > - struct bio_vec bv = bio_iter_iovec(cursor->bio_iter.bio, > - cursor->bio_iter.iter); > + struct ceph_bio_iter *it = &cursor->bio_iter; > > - *page_offset = bv.bv_offset; > - *length = bv.bv_len; > - return bv.bv_page; > + if (it->prefix_len) { > + *page_offset = it->prefix_offset; > + *length = it->prefix_len; > + return it->prefix_page; > + } else if (it->bio_len) { > + struct bio_vec bv = bio_iter_iovec(cursor->bio_iter.bio, > + cursor->bio_iter.iter); > + > + *page_offset = bv.bv_offset; > + *length = bv.bv_len; > + return bv.bv_page; > + } else { > + BUG_ON(!it->suffix_len); > + > + *page_offset = it->suffix_offset; > + *length = it->suffix_len; > + return it->suffix_page; > + } > } > > static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor, > @@ -852,29 +875,58 @@ static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor, > { > struct ceph_bio_iter *it = &cursor->bio_iter; > > + if (!bytes) > + return false; > + > BUG_ON(bytes > cursor->resid); > - BUG_ON(bytes > bio_iter_len(it->bio, it->iter)); > cursor->resid -= bytes; > - bio_advance_iter(it->bio, &it->iter, bytes); > + if (it->prefix_len) { > + BUG_ON(bytes > it->prefix_len); > + it->prefix_offset += bytes; > + it->prefix_len -= bytes; > + if (it->prefix_len) > + return false; > + } else if (it->bio_len) { > + BUG_ON(bytes > bio_iter_len(it->bio, it->iter)); > + bio_advance_iter(it->bio, &it->iter, bytes); > + it->bio_len -= bytes; > + if (!cursor->resid) { > + BUG_ON(!cursor->last_piece); > + return false; /* no more data */ > + } > + if (it->iter.bi_size && it->iter.bi_bvec_done) > + return false; /* more bytes to process in this segment */ > + > + if (it->bio_len && !it->iter.bi_size) { > + it->bio = it->bio->bi_next; > + it->iter = it->bio->bi_iter; > + if (cursor->resid < it->iter.bi_size) > + it->iter.bi_size = cursor->resid; > + } > + } else { > + BUG_ON(!it->suffix_len); > + it->suffix_offset += bytes; > + it->suffix_len -= bytes; > + if (it->suffix_len) > + return false; > + } > > if (!cursor->resid) { > BUG_ON(!cursor->last_piece); > return false; /* no more data */ > } > > - if (!bytes || (it->iter.bi_size && it->iter.bi_bvec_done)) > - return false; /* more bytes to process in this segment */ > - > - if (!it->iter.bi_size) { > - it->bio = it->bio->bi_next; > - it->iter = it->bio->bi_iter; > - if (cursor->resid < it->iter.bi_size) > - it->iter.bi_size = cursor->resid; > + BUG_ON(cursor->last_piece); > + if (it->suffix_len) { > + cursor->last_piece = cursor->resid == it->suffix_len; > + } else if (it->bio_len) { > + cursor->last_piece = cursor->resid == bio_iter_len(it->bio, it->iter); > + } else if (it->prefix_len) { > + cursor->last_piece = cursor->resid == it->prefix_len; > + } else { > + BUG(); > } > > - BUG_ON(cursor->last_piece); > - BUG_ON(cursor->resid < bio_iter_len(it->bio, it->iter)); > - cursor->last_piece = cursor->resid == bio_iter_len(it->bio, it->iter); > return true; > } > #endif /* CONFIG_BLOCK */ Have you looked at message data items (ceph_msg_data) for this? A message can have multiple data items, each of different type (BIO, PAGES, etc) and with its own iterator. The messenger will exhaust them in order, so you can create a message with three data items: prefix (type PAGES), data (type BIO), suffix (type PAGES). If the data item framework isn't flexible enough, we should work on extending it. Special casing the bio iterator is the wrong thing to do. Thanks, Ilya
On 12/04/2018 10:28 PM, Ilya Dryomov wrote: > On Mon, Dec 3, 2018 at 1:51 PM Dongsheng Yang > <dongsheng.yang@easystack.cn> wrote: >> When we want to add some prefix to the bio data, such as journal header, >> we don't want to copy it from bio into a new larger memory. Instead, we need >> a prefix page and suffix page to store the header and footer in bio_iter. >> >> Signed-off-by: Dongsheng Yang <dongsheng.yang@easystack.cn> ... >> } >> #endif /* CONFIG_BLOCK */ > Have you looked at message data items (ceph_msg_data) for this? > A message can have multiple data items, each of different type (BIO, > PAGES, etc) and with its own iterator. The messenger will exhaust them > in order, so you can create a message with three data items: prefix > (type PAGES), data (type BIO), suffix (type PAGES). Yes, that was my first idea, but I found the framework is not enough to my case. And as I need to use bio only in my case, so I chose the easy way to extend the special case for bio. > > If the data item framework isn't flexible enough, we should work on > extending it. Special casing the bio iterator is the wrong thing to > do. Agreed, I will go back to the framework itself and find some way to make it work in my case. Thanx > > Thanks, > > Ilya >
On Wed, Dec 5, 2018 at 2:17 AM Dongsheng Yang <dongsheng.yang@easystack.cn> wrote: > > > > On 12/04/2018 10:28 PM, Ilya Dryomov wrote: > > On Mon, Dec 3, 2018 at 1:51 PM Dongsheng Yang > > <dongsheng.yang@easystack.cn> wrote: > >> When we want to add some prefix to the bio data, such as journal header, > >> we don't want to copy it from bio into a new larger memory. Instead, we need > >> a prefix page and suffix page to store the header and footer in bio_iter. > >> > >> Signed-off-by: Dongsheng Yang <dongsheng.yang@easystack.cn> > ... > >> } > >> #endif /* CONFIG_BLOCK */ > > Have you looked at message data items (ceph_msg_data) for this? > > A message can have multiple data items, each of different type (BIO, > > PAGES, etc) and with its own iterator. The messenger will exhaust them > > in order, so you can create a message with three data items: prefix > > (type PAGES), data (type BIO), suffix (type PAGES). > Yes, that was my first idea, but I found the framework is not > enough to my case. And as I need to use bio only in my case, What was the problem? Thanks, Ilya
On Tue, Dec 18, 2018 at 11:10 AM Dongsheng Yang <dongsheng.yang@easystack.cn> wrote: > > > > On 12/05/2018 06:18 PM, Ilya Dryomov wrote: > > On Wed, Dec 5, 2018 at 2:17 AM Dongsheng Yang > <dongsheng.yang@easystack.cn> wrote: > > > On 12/04/2018 10:28 PM, Ilya Dryomov wrote: > > On Mon, Dec 3, 2018 at 1:51 PM Dongsheng Yang > <dongsheng.yang@easystack.cn> wrote: > > When we want to add some prefix to the bio data, such as journal header, > we don't want to copy it from bio into a new larger memory. Instead, we need > a prefix page and suffix page to store the header and footer in bio_iter. > > Signed-off-by: Dongsheng Yang <dongsheng.yang@easystack.cn> > > ... > > } > #endif /* CONFIG_BLOCK */ > > Have you looked at message data items (ceph_msg_data) for this? > A message can have multiple data items, each of different type (BIO, > PAGES, etc) and with its own iterator. The messenger will exhaust them > in order, so you can create a message with three data items: prefix > (type PAGES), data (type BIO), suffix (type PAGES). > > Yes, that was my first idea, but I found the framework is not > enough to my case. And as I need to use bio only in my case, > > What was the problem? > > > The problem is there is only one type of ceph_osd_data in > ceph_osd_req_op.extent.osd_data. > > But what I want in journal append case is one ceph_osd_req_op (op > = CEPH_OSD_OP_APPEND), and extent of this ceph_osd_req_op have tree > ceph_osd_data (example: PAGES, BIO, PAGES). I don't see a problem with adding two additional ceph_osd_data members to extent with a comment explaining that rbd journaling needs a prefix and a suffix. READ, WRITE, etc would still use just one, APPEND could use all three when required. Thanks, Ilya
On 12/21/2018 05:35 PM, Ilya Dryomov wrote: > On Tue, Dec 18, 2018 at 11:10 AM Dongsheng Yang > <dongsheng.yang@easystack.cn> wrote: >> >> >> On 12/05/2018 06:18 PM, Ilya Dryomov wrote: >> >> On Wed, Dec 5, 2018 at 2:17 AM Dongsheng Yang >> <dongsheng.yang@easystack.cn> wrote: >> >> >> On 12/04/2018 10:28 PM, Ilya Dryomov wrote: >> >> On Mon, Dec 3, 2018 at 1:51 PM Dongsheng Yang >> <dongsheng.yang@easystack.cn> wrote: >> >> When we want to add some prefix to the bio data, such as journal header, >> we don't want to copy it from bio into a new larger memory. Instead, we need >> a prefix page and suffix page to store the header and footer in bio_iter. >> >> Signed-off-by: Dongsheng Yang <dongsheng.yang@easystack.cn> >> >> ... >> >> } >> #endif /* CONFIG_BLOCK */ >> >> Have you looked at message data items (ceph_msg_data) for this? >> A message can have multiple data items, each of different type (BIO, >> PAGES, etc) and with its own iterator. The messenger will exhaust them >> in order, so you can create a message with three data items: prefix >> (type PAGES), data (type BIO), suffix (type PAGES). >> >> Yes, that was my first idea, but I found the framework is not >> enough to my case. And as I need to use bio only in my case, >> >> What was the problem? >> >> >> The problem is there is only one type of ceph_osd_data in >> ceph_osd_req_op.extent.osd_data. >> >> But what I want in journal append case is one ceph_osd_req_op (op >> = CEPH_OSD_OP_APPEND), and extent of this ceph_osd_req_op have tree >> ceph_osd_data (example: PAGES, BIO, PAGES). > I don't see a problem with adding two additional ceph_osd_data members > to extent with a comment explaining that rbd journaling needs a prefix > and a suffix. READ, WRITE, etc would still use just one, APPEND could > use all three when required. Thanx for your suggestion. > > Thanks, > > Ilya >
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 800a212..0da6a4b 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -87,6 +87,15 @@ enum ceph_msg_data_type { struct ceph_bio_iter { struct bio *bio; struct bvec_iter iter; + size_t bio_len; + + struct page *prefix_page; + unsigned int prefix_offset; + unsigned int prefix_len; + + struct page *suffix_page; + unsigned int suffix_offset; + unsigned int suffix_len; }; #define __ceph_bio_iter_advance_step(it, n, STEP) do { \ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 57fcc6b..64c70c5 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -828,23 +828,46 @@ static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data_cursor *cursor, cursor->resid = min_t(size_t, length, data->bio_length); *it = data->bio_pos; - if (cursor->resid < it->iter.bi_size) - it->iter.bi_size = cursor->resid; - BUG_ON(cursor->resid < bio_iter_len(it->bio, it->iter)); - cursor->last_piece = cursor->resid == bio_iter_len(it->bio, it->iter); + if (cursor->resid < it->iter.bi_size + it->prefix_len + it->suffix_len) + it->iter.bi_size = cursor->resid - it->prefix_len - it->suffix_len; + + it->bio_len = cursor->resid - it->prefix_len - it->suffix_len; + if (it->suffix_len) { + cursor->last_piece = cursor->resid == it->suffix_len; + } else if (it->bio_len) { + cursor->last_piece = cursor->resid == bio_iter_len(it->bio, it->iter); + } else if (it->prefix_len) { + cursor->last_piece = cursor->resid == it->prefix_len; + } else { + BUG(); + } } static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor, size_t *page_offset, size_t *length) { - struct bio_vec bv = bio_iter_iovec(cursor->bio_iter.bio, - cursor->bio_iter.iter); + struct ceph_bio_iter *it = &cursor->bio_iter; - *page_offset = bv.bv_offset; - *length = bv.bv_len; - return bv.bv_page; + if (it->prefix_len) { + *page_offset = it->prefix_offset; + *length = it->prefix_len; + return it->prefix_page; + } else if (it->bio_len) { + struct bio_vec bv = bio_iter_iovec(cursor->bio_iter.bio, + cursor->bio_iter.iter); + + *page_offset = bv.bv_offset; + *length = bv.bv_len; + return bv.bv_page; + } else { + BUG_ON(!it->suffix_len); + + *page_offset = it->suffix_offset; + *length = it->suffix_len; + return it->suffix_page; + } } static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor, @@ -852,29 +875,58 @@ static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor, { struct ceph_bio_iter *it = &cursor->bio_iter; + if (!bytes) + return false; + BUG_ON(bytes > cursor->resid); - BUG_ON(bytes > bio_iter_len(it->bio, it->iter)); cursor->resid -= bytes; - bio_advance_iter(it->bio, &it->iter, bytes); + if (it->prefix_len) { + BUG_ON(bytes > it->prefix_len); + it->prefix_offset += bytes; + it->prefix_len -= bytes; + if (it->prefix_len) + return false; + } else if (it->bio_len) { + BUG_ON(bytes > bio_iter_len(it->bio, it->iter)); + bio_advance_iter(it->bio, &it->iter, bytes); + it->bio_len -= bytes; + if (!cursor->resid) { + BUG_ON(!cursor->last_piece); + return false; /* no more data */ + } + if (it->iter.bi_size && it->iter.bi_bvec_done) + return false; /* more bytes to process in this segment */ + + if (it->bio_len && !it->iter.bi_size) { + it->bio = it->bio->bi_next; + it->iter = it->bio->bi_iter; + if (cursor->resid < it->iter.bi_size) + it->iter.bi_size = cursor->resid; + } + } else { + BUG_ON(!it->suffix_len); + it->suffix_offset += bytes; + it->suffix_len -= bytes; + if (it->suffix_len) + return false; + } if (!cursor->resid) { BUG_ON(!cursor->last_piece); return false; /* no more data */ } - if (!bytes || (it->iter.bi_size && it->iter.bi_bvec_done)) - return false; /* more bytes to process in this segment */ - - if (!it->iter.bi_size) { - it->bio = it->bio->bi_next; - it->iter = it->bio->bi_iter; - if (cursor->resid < it->iter.bi_size) - it->iter.bi_size = cursor->resid; + BUG_ON(cursor->last_piece); + if (it->suffix_len) { + cursor->last_piece = cursor->resid == it->suffix_len; + } else if (it->bio_len) { + cursor->last_piece = cursor->resid == bio_iter_len(it->bio, it->iter); + } else if (it->prefix_len) { + cursor->last_piece = cursor->resid == it->prefix_len; + } else { + BUG(); } - BUG_ON(cursor->last_piece); - BUG_ON(cursor->resid < bio_iter_len(it->bio, it->iter)); - cursor->last_piece = cursor->resid == bio_iter_len(it->bio, it->iter); return true; } #endif /* CONFIG_BLOCK */
When we want to add some prefix to the bio data, such as journal header, we don't want to copy it from bio into a new larger memory. Instead, we need a prefix page and suffix page to store the header and footer in bio_iter. Signed-off-by: Dongsheng Yang <dongsheng.yang@easystack.cn> --- include/linux/ceph/messenger.h | 9 ++++ net/ceph/messenger.c | 96 ++++++++++++++++++++++++++++++++---------- 2 files changed, 83 insertions(+), 22 deletions(-)