diff mbox series

[v3,3/3] libceph: just wait for more data to be available on the socket

Message ID 20231215002034.205780-4-xiubli@redhat.com (mailing list archive)
State New, archived
Headers show
Series libceph: fix sparse-read failure bug | expand

Commit Message

Xiubo Li Dec. 15, 2023, 12:20 a.m. UTC
From: Xiubo Li <xiubli@redhat.com>

The messages from ceph maybe split into multiple socket packages
and we just need to wait for all the data to be availiable on the
sokcet.

This will add 'sr_total_resid' to record the total length for all
data items for sparse-read message and 'sr_resid_elen' to record
the current extent total length.

URL: https://tracker.ceph.com/issues/63586
Signed-off-by: Xiubo Li <xiubli@redhat.com>
---
 include/linux/ceph/messenger.h |  2 ++
 net/ceph/messenger.c           |  1 +
 net/ceph/messenger_v1.c        | 21 ++++++++++++++++-----
 net/ceph/osd_client.c          |  1 +
 4 files changed, 20 insertions(+), 5 deletions(-)

Comments

Ilya Dryomov Jan. 15, 2024, 10:38 p.m. UTC | #1
On Fri, Dec 15, 2023 at 1:22 AM <xiubli@redhat.com> wrote:
>
> From: Xiubo Li <xiubli@redhat.com>
>
> The messages from ceph maybe split into multiple socket packages
> and we just need to wait for all the data to be availiable on the
> sokcet.
>
> This will add 'sr_total_resid' to record the total length for all
> data items for sparse-read message and 'sr_resid_elen' to record
> the current extent total length.
>
> URL: https://tracker.ceph.com/issues/63586
> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
>  include/linux/ceph/messenger.h |  2 ++
>  net/ceph/messenger.c           |  1 +
>  net/ceph/messenger_v1.c        | 21 ++++++++++++++++-----
>  net/ceph/osd_client.c          |  1 +
>  4 files changed, 20 insertions(+), 5 deletions(-)
>
> diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
> index 2eaaabbe98cb..05e9b39a58f8 100644
> --- a/include/linux/ceph/messenger.h
> +++ b/include/linux/ceph/messenger.h
> @@ -231,10 +231,12 @@ struct ceph_msg_data {
>
>  struct ceph_msg_data_cursor {
>         size_t                  total_resid;    /* across all data items */
> +       size_t                  sr_total_resid; /* across all data items for sparse-read */
>
>         struct ceph_msg_data    *data;          /* current data item */
>         size_t                  resid;          /* bytes not yet consumed */
>         int                     sr_resid;       /* residual sparse_read len */
> +       int                     sr_resid_elen;  /* total sparse_read elen */

Hi Xiubo,

Is adding yet another sparse-read field to the cursor really needed?
Would making read_partial_sparse_msg_extent() decrement sr_total_resid
as it goes just like sr_resid is decremented work?

>         bool                    need_crc;       /* crc update needed */
>         union {
>  #ifdef CONFIG_BLOCK
> diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
> index 3c8b78d9c4d1..eafd592af382 100644
> --- a/net/ceph/messenger.c
> +++ b/net/ceph/messenger.c
> @@ -1073,6 +1073,7 @@ void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor,
>         cursor->total_resid = length;
>         cursor->data = msg->data;
>         cursor->sr_resid = 0;
> +       cursor->sr_resid_elen = 0;
>
>         __ceph_msg_data_cursor_init(cursor);
>  }
> diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c
> index 4cb60bacf5f5..7425fa26e4c3 100644
> --- a/net/ceph/messenger_v1.c
> +++ b/net/ceph/messenger_v1.c
> @@ -160,7 +160,9 @@ static size_t sizeof_footer(struct ceph_connection *con)
>  static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
>  {
>         /* Initialize data cursor if it's not a sparse read */
> -       if (!msg->sparse_read)
> +       if (msg->sparse_read)
> +               msg->cursor.sr_total_resid = data_len;
> +       else
>                 ceph_msg_data_cursor_init(&msg->cursor, msg, data_len);
>  }
>
> @@ -1032,18 +1034,25 @@ static int read_partial_sparse_msg_data(struct ceph_connection *con)
>         bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
>         u32 crc = 0;
>         int ret = 1;
> +       int len;
>
>         if (do_datacrc)
>                 crc = con->in_data_crc;
>
> -       do {
> -               if (con->v1.in_sr_kvec.iov_base)
> +       while (cursor->sr_total_resid && ret > 0) {
> +               len = 0;
> +               if (con->v1.in_sr_kvec.iov_base) {
>                         ret = read_partial_message_chunk(con,
>                                                          &con->v1.in_sr_kvec,
>                                                          con->v1.in_sr_len,
>                                                          &crc);
> -               else if (cursor->sr_resid > 0)
> +                       if (ret == 1)
> +                               len = con->v1.in_sr_len;
> +               } else if (cursor->sr_resid > 0) {
>                         ret = read_partial_sparse_msg_extent(con, &crc);
> +                       if (ret == 1)
> +                               len = cursor->sr_resid_elen;
> +               }

I was waiting for Jeff's review since this is his code, but it's been
a while so I'll just comment.

To me, it's a sign of suboptimal structure that you needed to add new
fields to the cursor just to be able tell whether this function is done
with the message, because it's something that is already tracked but
gets lost between the loops here.

Currently this function is structured as:

    do {
        if (set up for kvec)
           read as much as possible into kvec
        else if (set up for pages)
           read as much as possible into pages

        if (short read)
            bail, will be called again later

         invoke con->ops->sparse_read() for processing the thing what
         was read and setting up the next read OR setting up the initial
         read
    } until (con->ops->sparse_read() returns "done")

If it was me, I would pursue refactoring this to:

    do {
        if (set up for kvec)
           read as much as possible into kvec
        else if (set up for pages)
           read as much as possible into pages
        else
           bail

        if (short read)
            bail, will be called again later

         invoke con->ops->sparse_read() for processing the thing that
         was read and setting up the next read
    } until (con->ops->sparse_read() returns "done")

... and invoke con->ops->sparse_read() for the first time elsewhere,
likely in prepare_message_data().  The rationale is that the state
machine inside con->ops->sparse_read() is conceptually a cursor and
prepare_message_data() is where the cursor is initialized for normal
reads.  This way no additional state should be needed.

Thanks,

                Ilya
Ilya Dryomov Jan. 16, 2024, 10 a.m. UTC | #2
On Tue, Jan 16, 2024 at 5:09 AM Xiubo Li <xiubli@redhat.com> wrote:
>
>
> On 1/16/24 06:38, Ilya Dryomov wrote:
>
> On Fri, Dec 15, 2023 at 1:22 AM <xiubli@redhat.com> wrote:
>
> From: Xiubo Li <xiubli@redhat.com>
>
> The messages from ceph maybe split into multiple socket packages
> and we just need to wait for all the data to be availiable on the
> sokcet.
>
> This will add 'sr_total_resid' to record the total length for all
> data items for sparse-read message and 'sr_resid_elen' to record
> the current extent total length.
>
> URL: https://tracker.ceph.com/issues/63586
> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
>  include/linux/ceph/messenger.h |  2 ++
>  net/ceph/messenger.c           |  1 +
>  net/ceph/messenger_v1.c        | 21 ++++++++++++++++-----
>  net/ceph/osd_client.c          |  1 +
>  4 files changed, 20 insertions(+), 5 deletions(-)
>
> diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
> index 2eaaabbe98cb..05e9b39a58f8 100644
> --- a/include/linux/ceph/messenger.h
> +++ b/include/linux/ceph/messenger.h
> @@ -231,10 +231,12 @@ struct ceph_msg_data {
>
>  struct ceph_msg_data_cursor {
>         size_t                  total_resid;    /* across all data items */
> +       size_t                  sr_total_resid; /* across all data items for sparse-read */
>
>         struct ceph_msg_data    *data;          /* current data item */
>         size_t                  resid;          /* bytes not yet consumed */
>         int                     sr_resid;       /* residual sparse_read len */
> +       int                     sr_resid_elen;  /* total sparse_read elen */
>
> Hi Xiubo,
>
> Is adding yet another sparse-read field to the cursor really needed?
> Would making read_partial_sparse_msg_extent() decrement sr_total_resid
> as it goes just like sr_resid is decremented work?
>
> This can be improved by removing it. Let me fix it.
>
>         bool                    need_crc;       /* crc update needed */
>         union {
>  #ifdef CONFIG_BLOCK
> diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
> index 3c8b78d9c4d1..eafd592af382 100644
> --- a/net/ceph/messenger.c
> +++ b/net/ceph/messenger.c
> @@ -1073,6 +1073,7 @@ void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor,
>         cursor->total_resid = length;
>         cursor->data = msg->data;
>         cursor->sr_resid = 0;
> +       cursor->sr_resid_elen = 0;
>
>         __ceph_msg_data_cursor_init(cursor);
>  }
> diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c
> index 4cb60bacf5f5..7425fa26e4c3 100644
> --- a/net/ceph/messenger_v1.c
> +++ b/net/ceph/messenger_v1.c
> @@ -160,7 +160,9 @@ static size_t sizeof_footer(struct ceph_connection *con)
>  static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
>  {
>         /* Initialize data cursor if it's not a sparse read */
> -       if (!msg->sparse_read)
> +       if (msg->sparse_read)
> +               msg->cursor.sr_total_resid = data_len;
> +       else
>                 ceph_msg_data_cursor_init(&msg->cursor, msg, data_len);
>  }
>
> @@ -1032,18 +1034,25 @@ static int read_partial_sparse_msg_data(struct ceph_connection *con)
>         bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
>         u32 crc = 0;
>         int ret = 1;
> +       int len;
>
>         if (do_datacrc)
>                 crc = con->in_data_crc;
>
> -       do {
> -               if (con->v1.in_sr_kvec.iov_base)
> +       while (cursor->sr_total_resid && ret > 0) {
> +               len = 0;
> +               if (con->v1.in_sr_kvec.iov_base) {
>                         ret = read_partial_message_chunk(con,
>                                                          &con->v1.in_sr_kvec,
>                                                          con->v1.in_sr_len,
>                                                          &crc);
> -               else if (cursor->sr_resid > 0)
> +                       if (ret == 1)
> +                               len = con->v1.in_sr_len;
> +               } else if (cursor->sr_resid > 0) {
>                         ret = read_partial_sparse_msg_extent(con, &crc);
> +                       if (ret == 1)
> +                               len = cursor->sr_resid_elen;
> +               }
>
> I was waiting for Jeff's review since this is his code, but it's been
> a while so I'll just comment.
>
> To me, it's a sign of suboptimal structure that you needed to add new
> fields to the cursor just to be able tell whether this function is done
> with the message, because it's something that is already tracked but
> gets lost between the loops here.
>
> Currently this function is structured as:
>
>     do {
>         if (set up for kvec)
>            read as much as possible into kvec
>         else if (set up for pages)
>            read as much as possible into pages
>
>         if (short read)
>             bail, will be called again later
>
>          invoke con->ops->sparse_read() for processing the thing what
>          was read and setting up the next read OR setting up the initial
>          read
>     } until (con->ops->sparse_read() returns "done")
>
> If it was me, I would pursue refactoring this to:
>
>     do {
>         if (set up for kvec)
>            read as much as possible into kvec
>         else if (set up for pages)
>            read as much as possible into pages
>         else
>            bail
>
> Why bail here ? For the new sparse read both the 'kvec' and 'pages' shouldn't be set, so the following '->sparse_read()' will set up them and continue.
>
> Or you just want the 'read_partial_sparse_msg_data()' to read data but not the first time to trigger the '->sparse_read()' ? Before I tried a similar approach and it was not as optional as this one as I do.

Hi Xiubo,

I addressed that in the previous message:

    ... and invoke con->ops->sparse_read() for the first time elsewhere,
    likely in prepare_message_data().  The rationale is that the state
    machine inside con->ops->sparse_read() is conceptually a cursor and
    prepare_message_data() is where the cursor is initialized for normal
    reads.

The benefit is that no additional state would be needed.

> The 'cursor->sr_total_resid', which is similar with 'cursor->total_resid', will just record the total data length for current sparse-read request and will determine whether should we skip parsing the "(page) data" in 'read_partial_message()'.

I understand the intent behind cursor->sr_total_resid, but it would be
nice to do without it because of its inherent redundancy.

Did you try calling con->ops->sparse_read() for the first time exactly
where cursor->sr_total_resid is initialized in your patch?

Thanks,

                Ilya
Ilya Dryomov Jan. 16, 2024, 12:06 p.m. UTC | #3
On Tue, Jan 16, 2024 at 11:45 AM Xiubo Li <xiubli@redhat.com> wrote:
>
>
> On 1/16/24 18:00, Ilya Dryomov wrote:
>
> On Tue, Jan 16, 2024 at 5:09 AM Xiubo Li <xiubli@redhat.com> wrote:
>
> On 1/16/24 06:38, Ilya Dryomov wrote:
>
> On Fri, Dec 15, 2023 at 1:22 AM <xiubli@redhat.com> wrote:
>
> From: Xiubo Li <xiubli@redhat.com>
>
> The messages from ceph maybe split into multiple socket packages
> and we just need to wait for all the data to be availiable on the
> sokcet.
>
> This will add 'sr_total_resid' to record the total length for all
> data items for sparse-read message and 'sr_resid_elen' to record
> the current extent total length.
>
> URL: https://tracker.ceph.com/issues/63586
> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
>  include/linux/ceph/messenger.h |  2 ++
>  net/ceph/messenger.c           |  1 +
>  net/ceph/messenger_v1.c        | 21 ++++++++++++++++-----
>  net/ceph/osd_client.c          |  1 +
>  4 files changed, 20 insertions(+), 5 deletions(-)
>
> diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
> index 2eaaabbe98cb..05e9b39a58f8 100644
> --- a/include/linux/ceph/messenger.h
> +++ b/include/linux/ceph/messenger.h
> @@ -231,10 +231,12 @@ struct ceph_msg_data {
>
>  struct ceph_msg_data_cursor {
>         size_t                  total_resid;    /* across all data items */
> +       size_t                  sr_total_resid; /* across all data items for sparse-read */
>
>         struct ceph_msg_data    *data;          /* current data item */
>         size_t                  resid;          /* bytes not yet consumed */
>         int                     sr_resid;       /* residual sparse_read len */
> +       int                     sr_resid_elen;  /* total sparse_read elen */
>
> Hi Xiubo,
>
> Is adding yet another sparse-read field to the cursor really needed?
> Would making read_partial_sparse_msg_extent() decrement sr_total_resid
> as it goes just like sr_resid is decremented work?
>
> This can be improved by removing it. Let me fix it.
>
>         bool                    need_crc;       /* crc update needed */
>         union {
>  #ifdef CONFIG_BLOCK
> diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
> index 3c8b78d9c4d1..eafd592af382 100644
> --- a/net/ceph/messenger.c
> +++ b/net/ceph/messenger.c
> @@ -1073,6 +1073,7 @@ void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor,
>         cursor->total_resid = length;
>         cursor->data = msg->data;
>         cursor->sr_resid = 0;
> +       cursor->sr_resid_elen = 0;
>
>         __ceph_msg_data_cursor_init(cursor);
>  }
> diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c
> index 4cb60bacf5f5..7425fa26e4c3 100644
> --- a/net/ceph/messenger_v1.c
> +++ b/net/ceph/messenger_v1.c
> @@ -160,7 +160,9 @@ static size_t sizeof_footer(struct ceph_connection *con)
>  static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
>  {
>         /* Initialize data cursor if it's not a sparse read */
> -       if (!msg->sparse_read)
> +       if (msg->sparse_read)
> +               msg->cursor.sr_total_resid = data_len;
> +       else
>                 ceph_msg_data_cursor_init(&msg->cursor, msg, data_len);
>  }
>
> @@ -1032,18 +1034,25 @@ static int read_partial_sparse_msg_data(struct ceph_connection *con)
>         bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
>         u32 crc = 0;
>         int ret = 1;
> +       int len;
>
>         if (do_datacrc)
>                 crc = con->in_data_crc;
>
> -       do {
> -               if (con->v1.in_sr_kvec.iov_base)
> +       while (cursor->sr_total_resid && ret > 0) {
> +               len = 0;
> +               if (con->v1.in_sr_kvec.iov_base) {
>                         ret = read_partial_message_chunk(con,
>                                                          &con->v1.in_sr_kvec,
>                                                          con->v1.in_sr_len,
>                                                          &crc);
> -               else if (cursor->sr_resid > 0)
> +                       if (ret == 1)
> +                               len = con->v1.in_sr_len;
> +               } else if (cursor->sr_resid > 0) {
>                         ret = read_partial_sparse_msg_extent(con, &crc);
> +                       if (ret == 1)
> +                               len = cursor->sr_resid_elen;
> +               }
>
> I was waiting for Jeff's review since this is his code, but it's been
> a while so I'll just comment.
>
> To me, it's a sign of suboptimal structure that you needed to add new
> fields to the cursor just to be able tell whether this function is done
> with the message, because it's something that is already tracked but
> gets lost between the loops here.
>
> Currently this function is structured as:
>
>     do {
>         if (set up for kvec)
>            read as much as possible into kvec
>         else if (set up for pages)
>            read as much as possible into pages
>
>         if (short read)
>             bail, will be called again later
>
>          invoke con->ops->sparse_read() for processing the thing what
>          was read and setting up the next read OR setting up the initial
>          read
>     } until (con->ops->sparse_read() returns "done")
>
> If it was me, I would pursue refactoring this to:
>
>     do {
>         if (set up for kvec)
>            read as much as possible into kvec
>         else if (set up for pages)
>            read as much as possible into pages
>         else
>            bail
>
> Why bail here ? For the new sparse read both the 'kvec' and 'pages' shouldn't be set, so the following '->sparse_read()' will set up them and continue.
>
> Or you just want the 'read_partial_sparse_msg_data()' to read data but not the first time to trigger the '->sparse_read()' ? Before I tried a similar approach and it was not as optional as this one as I do.
>
> Hi Xiubo,
>
> I addressed that in the previous message:
>
>     ... and invoke con->ops->sparse_read() for the first time elsewhere,
>     likely in prepare_message_data().  The rationale is that the state
>     machine inside con->ops->sparse_read() is conceptually a cursor and
>     prepare_message_data() is where the cursor is initialized for normal
>     reads.
>
> The benefit is that no additional state would be needed.
>
> Sorry, which state do you mean here ?  Is the 'sr_total_resid' ?

Yes.

                Ilya
Ilya Dryomov Jan. 22, 2024, 7:13 p.m. UTC | #4
On Wed, Jan 17, 2024 at 2:26 AM Xiubo Li <xiubli@redhat.com> wrote:
>
>
> On 1/16/24 18:00, Ilya Dryomov wrote:
>
> On Tue, Jan 16, 2024 at 5:09 AM Xiubo Li <xiubli@redhat.com> wrote:
>
> [...]
> I was waiting for Jeff's review since this is his code, but it's been
> a while so I'll just comment.
>
> To me, it's a sign of suboptimal structure that you needed to add new
> fields to the cursor just to be able tell whether this function is done
> with the message, because it's something that is already tracked but
> gets lost between the loops here.
>
> Currently this function is structured as:
>
>     do {
>         if (set up for kvec)
>            read as much as possible into kvec
>         else if (set up for pages)
>            read as much as possible into pages
>
>         if (short read)
>             bail, will be called again later
>
>          invoke con->ops->sparse_read() for processing the thing what
>          was read and setting up the next read OR setting up the initial
>          read
>     } until (con->ops->sparse_read() returns "done")
>
> If it was me, I would pursue refactoring this to:
>
>     do {
>         if (set up for kvec)
>            read as much as possible into kvec
>         else if (set up for pages)
>            read as much as possible into pages
>         else
>            bail
>
> Why bail here ? For the new sparse read both the 'kvec' and 'pages' shouldn't be set, so the following '->sparse_read()' will set up them and continue.
>
> Or you just want the 'read_partial_sparse_msg_data()' to read data but not the first time to trigger the '->sparse_read()' ? Before I tried a similar approach and it was not as optional as this one as I do.
>
> Hi Xiubo,
>
> I addressed that in the previous message:
>
>     ... and invoke con->ops->sparse_read() for the first time elsewhere,
>     likely in prepare_message_data().  The rationale is that the state
>     machine inside con->ops->sparse_read() is conceptually a cursor and
>     prepare_message_data() is where the cursor is initialized for normal
>     reads.
>
> The benefit is that no additional state would be needed.
>
> Hi Ilya,
>
> I am afraid this won't work if my understanding is correct.
>
> I think you want to call the first 'con->ops->sparse_read()' earlier somewhere such as in 'prepare_message_data()' to parse and skip the extra spare-read extent array info and then we could reuse the 'cursor->total_resid' as others, right ?
>
> As we know the 'cursor->total_resid' is for pure data only, while for sparse-read it introduce some extra extent array info.
>
> For sparse-read we need to call 'con->ops->sparse_read()' several times for each sparse-read OP and each time it will only parse part of the extra info. That means we need to call 'con->ops->sparse_read()' many times in 'prepare_message_data()' in this approach.
>
> The the most important thing is that we couldn't do this in 'prepare_message_data()', because the 'prepare_message_data()' will be called just before parsing the "front" and "middle" of the msg.

Hi Xiubo,

I see, I missed that in my suggestion.

>
> Else if we did this in somewhere place out of 'prepare_message_data()', then we must do it in the following code just before  Line#1267:
>
> 1262         /* (page) data */      1263         if (data_len) {        1264                 if (!m->num_data_items)              1265                         return -EIO;                         1266    1267                 if (m->sparse_read) 1268                         ret = read_partial_sparse_msg_data(con); 1269                 else if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) 1270                         ret = read_partial_msg_data_bounce(con); 1271                 else           1272                         ret = read_partial_msg_data(con);    1273                 if (ret <= 0) 1274                         return ret; 1275         }
>
> But this still couldn't resolve the multiple sparse-read OPs case, once the first OP parsing finishes we couldn't jump back to call 'prepare_message_data()'.
>
> The sparse-read data in the socket buffer will be:
>
> OP1: <sparse-read header> <sparse-read extents> <sparse-read real data length> <real data 1>; OP2: <sparse-read header> <sparse-read extents> <sparse-read real data length> <real data 2>; OP3:....
>
> Currently the 'cursor->total_resid' will record the total length of <real data 1> + <real data 2> + ...
>
> And the 'cursor->sr_total_resid' will record all the above.
>
> The 'cursor->sr_total_resid', which is similar with 'cursor->total_resid', will just record the total data length for current sparse-read request and will determine whether should we skip parsing the "(page) data" in 'read_partial_message()'.
>
> I understand the intent behind cursor->sr_total_resid, but it would be
> nice to do without it because of its inherent redundancy.
>
> The above is why I added ''cursor->sr_total_resid'.
>
> Did you try calling con->ops->sparse_read() for the first time exactly
> where cursor->sr_total_resid is initialized in your patch?
>
> Yeah, I did but it didn't work as I mentioned above. If I am wrong, please correct me.

I think you are right, and also in that introducing a separate field
makes for an easier fix.  I continue to claim that a separate field is
_inherently_ redundant though -- it's just that due to the structure of
the current code, it's not obvious.

There is definitely a way to make this (to be precise: allow
read_partial_sparse_msg_data() to be invoked by the messenger on short
reads of the footer without causing any damage like attempting to parse
random data as sparse-read header and/or extents) happen without adding
additional fields to the cursor or elsewhere.  But it might involve
refactoring the entire state machine such that information isn't lost
between the messenger and the OSD client, so I'm not asking that you
take that on here.

I wouldn't object to cursor->sr_total_resid being added, I just don't
like it ;)

Thanks,

                Ilya
Ilya Dryomov Jan. 22, 2024, 9:26 p.m. UTC | #5
On Mon, Jan 22, 2024 at 8:13 PM Ilya Dryomov <idryomov@gmail.com> wrote:
> I wouldn't object to cursor->sr_total_resid being added, I just don't
> like it ;)

Actually, how about just reusing cursor->sr_resid, which happens to be
an int?  Set it to -1 when con->ops->sparse_read() returns 0 and check
for that at the top:

    struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor;
    bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
    u32 crc = 0;
    int ret = 1;

    if (cursor->sr_resid < 0)
            return 1;

    if (do_datacrc)
            crc = con->in_data_crc;

    [ ... no changes ... ]

    if (do_datacrc)
            con->in_data_crc = crc;

    if (ret < 0)
            return ret;

    cursor->sr_resid = -1;
    return 1;  /* must return > 0 to indicate success */

Thanks,

                Ilya
diff mbox series

Patch

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 2eaaabbe98cb..05e9b39a58f8 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -231,10 +231,12 @@  struct ceph_msg_data {
 
 struct ceph_msg_data_cursor {
 	size_t			total_resid;	/* across all data items */
+	size_t			sr_total_resid;	/* across all data items for sparse-read */
 
 	struct ceph_msg_data	*data;		/* current data item */
 	size_t			resid;		/* bytes not yet consumed */
 	int			sr_resid;	/* residual sparse_read len */
+	int			sr_resid_elen;	/* total sparse_read elen */
 	bool			need_crc;	/* crc update needed */
 	union {
 #ifdef CONFIG_BLOCK
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 3c8b78d9c4d1..eafd592af382 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1073,6 +1073,7 @@  void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor,
 	cursor->total_resid = length;
 	cursor->data = msg->data;
 	cursor->sr_resid = 0;
+	cursor->sr_resid_elen = 0;
 
 	__ceph_msg_data_cursor_init(cursor);
 }
diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c
index 4cb60bacf5f5..7425fa26e4c3 100644
--- a/net/ceph/messenger_v1.c
+++ b/net/ceph/messenger_v1.c
@@ -160,7 +160,9 @@  static size_t sizeof_footer(struct ceph_connection *con)
 static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
 {
 	/* Initialize data cursor if it's not a sparse read */
-	if (!msg->sparse_read)
+	if (msg->sparse_read)
+		msg->cursor.sr_total_resid = data_len;
+	else
 		ceph_msg_data_cursor_init(&msg->cursor, msg, data_len);
 }
 
@@ -1032,18 +1034,25 @@  static int read_partial_sparse_msg_data(struct ceph_connection *con)
 	bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
 	u32 crc = 0;
 	int ret = 1;
+	int len;
 
 	if (do_datacrc)
 		crc = con->in_data_crc;
 
-	do {
-		if (con->v1.in_sr_kvec.iov_base)
+	while (cursor->sr_total_resid && ret > 0) {
+		len = 0;
+		if (con->v1.in_sr_kvec.iov_base) {
 			ret = read_partial_message_chunk(con,
 							 &con->v1.in_sr_kvec,
 							 con->v1.in_sr_len,
 							 &crc);
-		else if (cursor->sr_resid > 0)
+			if (ret == 1)
+				len = con->v1.in_sr_len;
+		} else if (cursor->sr_resid > 0) {
 			ret = read_partial_sparse_msg_extent(con, &crc);
+			if (ret == 1)
+				len = cursor->sr_resid_elen;
+		}
 
 		if (ret <= 0) {
 			if (do_datacrc)
@@ -1051,11 +1060,13 @@  static int read_partial_sparse_msg_data(struct ceph_connection *con)
 			return ret;
 		}
 
+		cursor->sr_total_resid -= len;
+
 		memset(&con->v1.in_sr_kvec, 0, sizeof(con->v1.in_sr_kvec));
 		ret = con->ops->sparse_read(con, cursor,
 				(char **)&con->v1.in_sr_kvec.iov_base);
 		con->v1.in_sr_len = ret;
-	} while (ret > 0);
+	}
 
 	if (do_datacrc)
 		con->in_data_crc = crc;
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 848ef19055a0..b53b017afc0a 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -5946,6 +5946,7 @@  static int osd_sparse_read(struct ceph_connection *con,
 
 		/* send back the new length and nullify the ptr */
 		cursor->sr_resid = elen;
+		cursor->sr_resid_elen = elen;
 		ret = elen;
 		*pbuf = NULL;