diff mbox series

[v4,3/3] libceph: just wait for more data to be available on the socket

Message ID 20240118105047.792879-4-xiubli@redhat.com (mailing list archive)
State New, archived
Headers show
Series libceph: fix sparse-read failure bug | expand

Commit Message

Xiubo Li Jan. 18, 2024, 10:50 a.m. UTC
From: Xiubo Li <xiubli@redhat.com>

The messages from ceph maybe split into multiple socket packages
and we just need to wait for all the data to be availiable on the
sokcet.

This will add 'sr_total_resid' to record the total length for all
data items for sparse-read message and 'sr_resid_elen' to record
the current extent total length.

URL: https://tracker.ceph.com/issues/63586
Signed-off-by: Xiubo Li <xiubli@redhat.com>
---
 include/linux/ceph/messenger.h |  1 +
 net/ceph/messenger_v1.c        | 32 +++++++++++++++++++++-----------
 2 files changed, 22 insertions(+), 11 deletions(-)

Comments

Jeff Layton Jan. 18, 2024, 2:36 p.m. UTC | #1
On Thu, 2024-01-18 at 18:50 +0800, xiubli@redhat.com wrote:
> From: Xiubo Li <xiubli@redhat.com>
> 
> The messages from ceph maybe split into multiple socket packages
> and we just need to wait for all the data to be availiable on the
> sokcet.
>
> This will add 'sr_total_resid' to record the total length for all
> data items for sparse-read message and 'sr_resid_elen' to record
> the current extent total length.
> 

It's been a while since I was in the ceph messenger code, and my v1
memory is especially fuzzy. I really don't quite understand how tracking
yet another length field solves the stated problem.

I'd really appreciate a description of the problem that you saw and how
this solves it. The ceph bug is not very straightforward.

I get that we need to wait for a certain amount of data to be available
on the socket before we drive the sparse_read op, but I don't quite see
how that's being achieved here.

Also, why is this not a problem on messenger v2?

> URL: https://tracker.ceph.com/issues/63586
> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
>  include/linux/ceph/messenger.h |  1 +
>  net/ceph/messenger_v1.c        | 32 +++++++++++++++++++++-----------
>  2 files changed, 22 insertions(+), 11 deletions(-)
> 
> diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
> index 2eaaabbe98cb..ca6f82abed62 100644
> --- a/include/linux/ceph/messenger.h
> +++ b/include/linux/ceph/messenger.h
> @@ -231,6 +231,7 @@ struct ceph_msg_data {
>  
>  struct ceph_msg_data_cursor {
>  	size_t			total_resid;	/* across all data items */
> +	size_t			sr_total_resid;	/* across all data items for sparse-read */
>  
>  	struct ceph_msg_data	*data;		/* current data item */
>  	size_t			resid;		/* bytes not yet consumed */
> diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c
> index 4cb60bacf5f5..2733da891688 100644
> --- a/net/ceph/messenger_v1.c
> +++ b/net/ceph/messenger_v1.c
> @@ -160,7 +160,9 @@ static size_t sizeof_footer(struct ceph_connection *con)
>  static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
>  {
>  	/* Initialize data cursor if it's not a sparse read */
> -	if (!msg->sparse_read)
> +	if (msg->sparse_read)
> +		msg->cursor.sr_total_resid = data_len;
> +	else
>  		ceph_msg_data_cursor_init(&msg->cursor, msg, data_len);
>  }
>  
> @@ -1032,35 +1034,43 @@ static int read_partial_sparse_msg_data(struct ceph_connection *con)
>  	bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
>  	u32 crc = 0;
>  	int ret = 1;
> +	int len;
>  
>  	if (do_datacrc)
>  		crc = con->in_data_crc;
>  
> -	do {
> -		if (con->v1.in_sr_kvec.iov_base)
> +	while (cursor->sr_total_resid) {
> +		len = 0;
> +		if (con->v1.in_sr_kvec.iov_base) {
> +			len = con->v1.in_sr_kvec.iov_len;
>  			ret = read_partial_message_chunk(con,
>  							 &con->v1.in_sr_kvec,
>  							 con->v1.in_sr_len,
>  							 &crc);
> -		else if (cursor->sr_resid > 0)
> +			len = con->v1.in_sr_kvec.iov_len - len;
> +		} else if (cursor->sr_resid > 0) {
> +			len = cursor->sr_resid;
>  			ret = read_partial_sparse_msg_extent(con, &crc);
> -
> -		if (ret <= 0) {
> -			if (do_datacrc)
> -				con->in_data_crc = crc;
> -			return ret;
> +			len -= cursor->sr_resid;
>  		}
> +		cursor->sr_total_resid -= len;
> +		if (ret <= 0)
> +			break;
>  
>  		memset(&con->v1.in_sr_kvec, 0, sizeof(con->v1.in_sr_kvec));
>  		ret = con->ops->sparse_read(con, cursor,
>  				(char **)&con->v1.in_sr_kvec.iov_base);
> +		if (ret <= 0) {
> +			ret = ret ? : 1; /* must return > 0 to indicate success */

The above is a gcc-ism that we probably shouldn't be using. Can this be:

	ret = ret ? ret : 1;

?

> +			break;
> +		}
>  		con->v1.in_sr_len = ret;
> -	} while (ret > 0);
> +	}
>  
>  	if (do_datacrc)
>  		con->in_data_crc = crc;
>  
> -	return ret < 0 ? ret : 1;  /* must return > 0 to indicate success */
> +	return ret;
>  }
>  
>  static int read_partial_msg_data(struct ceph_connection *con)
Jeff Layton Jan. 18, 2024, 6:24 p.m. UTC | #2
On Thu, 2024-01-18 at 18:50 +0800, xiubli@redhat.com wrote:
> From: Xiubo Li <xiubli@redhat.com>
> 
> The messages from ceph maybe split into multiple socket packages
> and we just need to wait for all the data to be availiable on the
> sokcet.
> 
> This will add 'sr_total_resid' to record the total length for all
> data items for sparse-read message and 'sr_resid_elen' to record
> the current extent total length.
> 
> URL: https://tracker.ceph.com/issues/63586
> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
>  include/linux/ceph/messenger.h |  1 +
>  net/ceph/messenger_v1.c        | 32 +++++++++++++++++++++-----------
>  2 files changed, 22 insertions(+), 11 deletions(-)
> 
> diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
> index 2eaaabbe98cb..ca6f82abed62 100644
> --- a/include/linux/ceph/messenger.h
> +++ b/include/linux/ceph/messenger.h
> @@ -231,6 +231,7 @@ struct ceph_msg_data {
>  
>  struct ceph_msg_data_cursor {
>  	size_t			total_resid;	/* across all data items */
> +	size_t			sr_total_resid;	/* across all data items for sparse-read */
>  
>  	struct ceph_msg_data	*data;		/* current data item */
>  	size_t			resid;		/* bytes not yet consumed */
> diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c
> index 4cb60bacf5f5..2733da891688 100644
> --- a/net/ceph/messenger_v1.c
> +++ b/net/ceph/messenger_v1.c
> @@ -160,7 +160,9 @@ static size_t sizeof_footer(struct ceph_connection *con)
>  static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
>  {
>  	/* Initialize data cursor if it's not a sparse read */
> -	if (!msg->sparse_read)
> +	if (msg->sparse_read)
> +		msg->cursor.sr_total_resid = data_len;
> +	else
>  		ceph_msg_data_cursor_init(&msg->cursor, msg, data_len);
>  }
>  
> @@ -1032,35 +1034,43 @@ static int read_partial_sparse_msg_data(struct ceph_connection *con)
>  	bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
>  	u32 crc = 0;
>  	int ret = 1;
> +	int len;
>  
>  	if (do_datacrc)
>  		crc = con->in_data_crc;
>  
> -	do {
> -		if (con->v1.in_sr_kvec.iov_base)
> +	while (cursor->sr_total_resid) {
> +		len = 0;
> +		if (con->v1.in_sr_kvec.iov_base) {
> +			len = con->v1.in_sr_kvec.iov_len;
>  			ret = read_partial_message_chunk(con,
>  							 &con->v1.in_sr_kvec,
>  							 con->v1.in_sr_len,
>  							 &crc);
> -		else if (cursor->sr_resid > 0)
> +			len = con->v1.in_sr_kvec.iov_len - len;
> +		} else if (cursor->sr_resid > 0) {
> +			len = cursor->sr_resid;
>  			ret = read_partial_sparse_msg_extent(con, &crc);
> -
> -		if (ret <= 0) {
> -			if (do_datacrc)
> -				con->in_data_crc = crc;
> -			return ret;
> +			len -= cursor->sr_resid;
>  		}
> +		cursor->sr_total_resid -= len;
> +		if (ret <= 0)
> +			break;
>  
>  		memset(&con->v1.in_sr_kvec, 0, sizeof(con->v1.in_sr_kvec));
>  		ret = con->ops->sparse_read(con, cursor,
>  				(char **)&con->v1.in_sr_kvec.iov_base);
> +		if (ret <= 0) {
> +			ret = ret ? : 1; /* must return > 0 to indicate success */
> +			break;
> +		}
>  		con->v1.in_sr_len = ret;
> -	} while (ret > 0);
> +	}
>  
>  	if (do_datacrc)
>  		con->in_data_crc = crc;
>  
> -	return ret < 0 ? ret : 1;  /* must return > 0 to indicate success */
> +	return ret;
>  }
>  
>  static int read_partial_msg_data(struct ceph_connection *con)

Looking back over this code...

The way it works today, once we determine it's a sparse read, we call
read_sparse_msg_data. At that point we call either
read_partial_message_chunk (to read into the kvec) or
read_sparse_msg_extent if sr_resid is already set (indicating that we're
receiving an extent).

read_sparse_msg_extent calls ceph_tcp_recvpage in a loop until
cursor->sr_resid have been received. The exception there when
ceph_tcp_recvpage returns <= 0.

ceph_tcp_recvpage returns 0 if sock_recvmsg returns -EAGAIN (maybe also
in other cases). So it sounds like the client just timed out on a read
from the socket or caught a signal or something?

If that's correct, then do we know what ceph_tcp_recvpage returned when
the problem happened?
Xiubo Li Jan. 19, 2024, 4:35 a.m. UTC | #3
On 1/19/24 02:24, Jeff Layton wrote:
> On Thu, 2024-01-18 at 18:50 +0800, xiubli@redhat.com wrote:
>> From: Xiubo Li <xiubli@redhat.com>
>>
>> The messages from ceph maybe split into multiple socket packages
>> and we just need to wait for all the data to be availiable on the
>> sokcet.
>>
>> This will add 'sr_total_resid' to record the total length for all
>> data items for sparse-read message and 'sr_resid_elen' to record
>> the current extent total length.
>>
>> URL: https://tracker.ceph.com/issues/63586
>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>> ---
>>   include/linux/ceph/messenger.h |  1 +
>>   net/ceph/messenger_v1.c        | 32 +++++++++++++++++++++-----------
>>   2 files changed, 22 insertions(+), 11 deletions(-)
>>
>> diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
>> index 2eaaabbe98cb..ca6f82abed62 100644
>> --- a/include/linux/ceph/messenger.h
>> +++ b/include/linux/ceph/messenger.h
>> @@ -231,6 +231,7 @@ struct ceph_msg_data {
>>   
>>   struct ceph_msg_data_cursor {
>>   	size_t			total_resid;	/* across all data items */
>> +	size_t			sr_total_resid;	/* across all data items for sparse-read */
>>   
>>   	struct ceph_msg_data	*data;		/* current data item */
>>   	size_t			resid;		/* bytes not yet consumed */
>> diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c
>> index 4cb60bacf5f5..2733da891688 100644
>> --- a/net/ceph/messenger_v1.c
>> +++ b/net/ceph/messenger_v1.c
>> @@ -160,7 +160,9 @@ static size_t sizeof_footer(struct ceph_connection *con)
>>   static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
>>   {
>>   	/* Initialize data cursor if it's not a sparse read */
>> -	if (!msg->sparse_read)
>> +	if (msg->sparse_read)
>> +		msg->cursor.sr_total_resid = data_len;
>> +	else
>>   		ceph_msg_data_cursor_init(&msg->cursor, msg, data_len);
>>   }
>>   
>> @@ -1032,35 +1034,43 @@ static int read_partial_sparse_msg_data(struct ceph_connection *con)
>>   	bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
>>   	u32 crc = 0;
>>   	int ret = 1;
>> +	int len;
>>   
>>   	if (do_datacrc)
>>   		crc = con->in_data_crc;
>>   
>> -	do {
>> -		if (con->v1.in_sr_kvec.iov_base)
>> +	while (cursor->sr_total_resid) {
>> +		len = 0;
>> +		if (con->v1.in_sr_kvec.iov_base) {
>> +			len = con->v1.in_sr_kvec.iov_len;
>>   			ret = read_partial_message_chunk(con,
>>   							 &con->v1.in_sr_kvec,
>>   							 con->v1.in_sr_len,
>>   							 &crc);
>> -		else if (cursor->sr_resid > 0)
>> +			len = con->v1.in_sr_kvec.iov_len - len;
>> +		} else if (cursor->sr_resid > 0) {
>> +			len = cursor->sr_resid;
>>   			ret = read_partial_sparse_msg_extent(con, &crc);
>> -
>> -		if (ret <= 0) {
>> -			if (do_datacrc)
>> -				con->in_data_crc = crc;
>> -			return ret;
>> +			len -= cursor->sr_resid;
>>   		}
>> +		cursor->sr_total_resid -= len;
>> +		if (ret <= 0)
>> +			break;
>>   
>>   		memset(&con->v1.in_sr_kvec, 0, sizeof(con->v1.in_sr_kvec));
>>   		ret = con->ops->sparse_read(con, cursor,
>>   				(char **)&con->v1.in_sr_kvec.iov_base);
>> +		if (ret <= 0) {
>> +			ret = ret ? : 1; /* must return > 0 to indicate success */
>> +			break;
>> +		}
>>   		con->v1.in_sr_len = ret;
>> -	} while (ret > 0);
>> +	}
>>   
>>   	if (do_datacrc)
>>   		con->in_data_crc = crc;
>>   
>> -	return ret < 0 ? ret : 1;  /* must return > 0 to indicate success */
>> +	return ret;
>>   }
>>   
>>   static int read_partial_msg_data(struct ceph_connection *con)
> Looking back over this code...
>
> The way it works today, once we determine it's a sparse read, we call
> read_sparse_msg_data. At that point we call either
> read_partial_message_chunk (to read into the kvec) or
> read_sparse_msg_extent if sr_resid is already set (indicating that we're
> receiving an extent).
>
> read_sparse_msg_extent calls ceph_tcp_recvpage in a loop until
> cursor->sr_resid have been received. The exception there when
> ceph_tcp_recvpage returns <= 0.
>
> ceph_tcp_recvpage returns 0 if sock_recvmsg returns -EAGAIN (maybe also
> in other cases). So it sounds like the client just timed out on a read
> from the socket or caught a signal or something?
>
> If that's correct, then do we know what ceph_tcp_recvpage returned when
> the problem happened?

It should just return parital data, and we should continue from here in 
the next loop when the reset data comes.

Thanks

- Xiubo
Jeff Layton Jan. 19, 2024, 11:09 a.m. UTC | #4
On Fri, 2024-01-19 at 12:35 +0800, Xiubo Li wrote:
> On 1/19/24 02:24, Jeff Layton wrote:
> > On Thu, 2024-01-18 at 18:50 +0800, xiubli@redhat.com wrote:
> > > From: Xiubo Li <xiubli@redhat.com>
> > > 
> > > The messages from ceph maybe split into multiple socket packages
> > > and we just need to wait for all the data to be availiable on the
> > > sokcet.
> > > 
> > > This will add 'sr_total_resid' to record the total length for all
> > > data items for sparse-read message and 'sr_resid_elen' to record
> > > the current extent total length.
> > > 
> > > URL: https://tracker.ceph.com/issues/63586
> > > Signed-off-by: Xiubo Li <xiubli@redhat.com>
> > > ---
> > >   include/linux/ceph/messenger.h |  1 +
> > >   net/ceph/messenger_v1.c        | 32 +++++++++++++++++++++-----------
> > >   2 files changed, 22 insertions(+), 11 deletions(-)
> > > 
> > > diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
> > > index 2eaaabbe98cb..ca6f82abed62 100644
> > > --- a/include/linux/ceph/messenger.h
> > > +++ b/include/linux/ceph/messenger.h
> > > @@ -231,6 +231,7 @@ struct ceph_msg_data {
> > >   
> > >   struct ceph_msg_data_cursor {
> > >   	size_t			total_resid;	/* across all data items */
> > > +	size_t			sr_total_resid;	/* across all data items for sparse-read */
> > >   
> > >   	struct ceph_msg_data	*data;		/* current data item */
> > >   	size_t			resid;		/* bytes not yet consumed */
> > > diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c
> > > index 4cb60bacf5f5..2733da891688 100644
> > > --- a/net/ceph/messenger_v1.c
> > > +++ b/net/ceph/messenger_v1.c
> > > @@ -160,7 +160,9 @@ static size_t sizeof_footer(struct ceph_connection *con)
> > >   static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
> > >   {
> > >   	/* Initialize data cursor if it's not a sparse read */
> > > -	if (!msg->sparse_read)
> > > +	if (msg->sparse_read)
> > > +		msg->cursor.sr_total_resid = data_len;
> > > +	else
> > >   		ceph_msg_data_cursor_init(&msg->cursor, msg, data_len);
> > >   }
> > >   
> > > @@ -1032,35 +1034,43 @@ static int read_partial_sparse_msg_data(struct ceph_connection *con)
> > >   	bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
> > >   	u32 crc = 0;
> > >   	int ret = 1;
> > > +	int len;
> > >   
> > >   	if (do_datacrc)
> > >   		crc = con->in_data_crc;
> > >   
> > > -	do {
> > > -		if (con->v1.in_sr_kvec.iov_base)
> > > +	while (cursor->sr_total_resid) {
> > > +		len = 0;
> > > +		if (con->v1.in_sr_kvec.iov_base) {
> > > +			len = con->v1.in_sr_kvec.iov_len;
> > >   			ret = read_partial_message_chunk(con,
> > >   							 &con->v1.in_sr_kvec,
> > >   							 con->v1.in_sr_len,
> > >   							 &crc);
> > > -		else if (cursor->sr_resid > 0)
> > > +			len = con->v1.in_sr_kvec.iov_len - len;
> > > +		} else if (cursor->sr_resid > 0) {
> > > +			len = cursor->sr_resid;
> > >   			ret = read_partial_sparse_msg_extent(con, &crc);
> > > -
> > > -		if (ret <= 0) {
> > > -			if (do_datacrc)
> > > -				con->in_data_crc = crc;
> > > -			return ret;
> > > +			len -= cursor->sr_resid;
> > >   		}
> > > +		cursor->sr_total_resid -= len;
> > > +		if (ret <= 0)
> > > +			break;
> > >   
> > >   		memset(&con->v1.in_sr_kvec, 0, sizeof(con->v1.in_sr_kvec));
> > >   		ret = con->ops->sparse_read(con, cursor,
> > >   				(char **)&con->v1.in_sr_kvec.iov_base);
> > > +		if (ret <= 0) {
> > > +			ret = ret ? : 1; /* must return > 0 to indicate success */
> > > +			break;
> > > +		}
> > >   		con->v1.in_sr_len = ret;
> > > -	} while (ret > 0);
> > > +	}
> > >   
> > >   	if (do_datacrc)
> > >   		con->in_data_crc = crc;
> > >   
> > > -	return ret < 0 ? ret : 1;  /* must return > 0 to indicate success */
> > > +	return ret;
> > >   }
> > >   
> > >   static int read_partial_msg_data(struct ceph_connection *con)
> > Looking back over this code...
> > 
> > The way it works today, once we determine it's a sparse read, we call
> > read_sparse_msg_data. At that point we call either
> > read_partial_message_chunk (to read into the kvec) or
> > read_sparse_msg_extent if sr_resid is already set (indicating that we're
> > receiving an extent).
> > 
> > read_sparse_msg_extent calls ceph_tcp_recvpage in a loop until
> > cursor->sr_resid have been received. The exception there when
> > ceph_tcp_recvpage returns <= 0.
> > 
> > ceph_tcp_recvpage returns 0 if sock_recvmsg returns -EAGAIN (maybe also
> > in other cases). So it sounds like the client just timed out on a read
> > from the socket or caught a signal or something?
> > 
> > If that's correct, then do we know what ceph_tcp_recvpage returned when
> > the problem happened?
> 
> It should just return parital data, and we should continue from here in 
> the next loop when the reset data comes.
> 

Tracking this extra length seems like the wrong fix. We're already
looping in read_sparse_msg_extent until the sr_resid goes to 0. ISTM
that it's just that read_sparse_msg_extent is returning inappropriately
in the face of timeouts.

IOW, it does this:

                ret = ceph_tcp_recvpage(con->sock, rpage, (int)off, len);
                if (ret <= 0)
                        return ret;

...should it just not be returning there when ret == 0? Maybe it should
be retrying the recvpage instead?
Xiubo Li Jan. 22, 2024, 2:52 a.m. UTC | #5
On 1/19/24 19:09, Jeff Layton wrote:
> On Fri, 2024-01-19 at 12:35 +0800, Xiubo Li wrote:
>> On 1/19/24 02:24, Jeff Layton wrote:
>>> On Thu, 2024-01-18 at 18:50 +0800, xiubli@redhat.com wrote:
>>>> From: Xiubo Li <xiubli@redhat.com>
>>>>
>>>> The messages from ceph maybe split into multiple socket packages
>>>> and we just need to wait for all the data to be availiable on the
>>>> sokcet.
>>>>
>>>> This will add 'sr_total_resid' to record the total length for all
>>>> data items for sparse-read message and 'sr_resid_elen' to record
>>>> the current extent total length.
>>>>
>>>> URL: https://tracker.ceph.com/issues/63586
>>>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>>>> ---
>>>>    include/linux/ceph/messenger.h |  1 +
>>>>    net/ceph/messenger_v1.c        | 32 +++++++++++++++++++++-----------
>>>>    2 files changed, 22 insertions(+), 11 deletions(-)
>>>>
>>>> diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
>>>> index 2eaaabbe98cb..ca6f82abed62 100644
>>>> --- a/include/linux/ceph/messenger.h
>>>> +++ b/include/linux/ceph/messenger.h
>>>> @@ -231,6 +231,7 @@ struct ceph_msg_data {
>>>>    
>>>>    struct ceph_msg_data_cursor {
>>>>    	size_t			total_resid;	/* across all data items */
>>>> +	size_t			sr_total_resid;	/* across all data items for sparse-read */
>>>>    
>>>>    	struct ceph_msg_data	*data;		/* current data item */
>>>>    	size_t			resid;		/* bytes not yet consumed */
>>>> diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c
>>>> index 4cb60bacf5f5..2733da891688 100644
>>>> --- a/net/ceph/messenger_v1.c
>>>> +++ b/net/ceph/messenger_v1.c
>>>> @@ -160,7 +160,9 @@ static size_t sizeof_footer(struct ceph_connection *con)
>>>>    static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
>>>>    {
>>>>    	/* Initialize data cursor if it's not a sparse read */
>>>> -	if (!msg->sparse_read)
>>>> +	if (msg->sparse_read)
>>>> +		msg->cursor.sr_total_resid = data_len;
>>>> +	else
>>>>    		ceph_msg_data_cursor_init(&msg->cursor, msg, data_len);
>>>>    }
>>>>    
>>>> @@ -1032,35 +1034,43 @@ static int read_partial_sparse_msg_data(struct ceph_connection *con)
>>>>    	bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
>>>>    	u32 crc = 0;
>>>>    	int ret = 1;
>>>> +	int len;
>>>>    
>>>>    	if (do_datacrc)
>>>>    		crc = con->in_data_crc;
>>>>    
>>>> -	do {
>>>> -		if (con->v1.in_sr_kvec.iov_base)
>>>> +	while (cursor->sr_total_resid) {
>>>> +		len = 0;
>>>> +		if (con->v1.in_sr_kvec.iov_base) {
>>>> +			len = con->v1.in_sr_kvec.iov_len;
>>>>    			ret = read_partial_message_chunk(con,
>>>>    							 &con->v1.in_sr_kvec,
>>>>    							 con->v1.in_sr_len,
>>>>    							 &crc);
>>>> -		else if (cursor->sr_resid > 0)
>>>> +			len = con->v1.in_sr_kvec.iov_len - len;
>>>> +		} else if (cursor->sr_resid > 0) {
>>>> +			len = cursor->sr_resid;
>>>>    			ret = read_partial_sparse_msg_extent(con, &crc);
>>>> -
>>>> -		if (ret <= 0) {
>>>> -			if (do_datacrc)
>>>> -				con->in_data_crc = crc;
>>>> -			return ret;
>>>> +			len -= cursor->sr_resid;
>>>>    		}
>>>> +		cursor->sr_total_resid -= len;
>>>> +		if (ret <= 0)
>>>> +			break;
>>>>    
>>>>    		memset(&con->v1.in_sr_kvec, 0, sizeof(con->v1.in_sr_kvec));
>>>>    		ret = con->ops->sparse_read(con, cursor,
>>>>    				(char **)&con->v1.in_sr_kvec.iov_base);
>>>> +		if (ret <= 0) {
>>>> +			ret = ret ? : 1; /* must return > 0 to indicate success */
>>>> +			break;
>>>> +		}
>>>>    		con->v1.in_sr_len = ret;
>>>> -	} while (ret > 0);
>>>> +	}
>>>>    
>>>>    	if (do_datacrc)
>>>>    		con->in_data_crc = crc;
>>>>    
>>>> -	return ret < 0 ? ret : 1;  /* must return > 0 to indicate success */
>>>> +	return ret;
>>>>    }
>>>>    
>>>>    static int read_partial_msg_data(struct ceph_connection *con)
>>> Looking back over this code...
>>>
>>> The way it works today, once we determine it's a sparse read, we call
>>> read_sparse_msg_data. At that point we call either
>>> read_partial_message_chunk (to read into the kvec) or
>>> read_sparse_msg_extent if sr_resid is already set (indicating that we're
>>> receiving an extent).
>>>
>>> read_sparse_msg_extent calls ceph_tcp_recvpage in a loop until
>>> cursor->sr_resid have been received. The exception there when
>>> ceph_tcp_recvpage returns <= 0.
>>>
>>> ceph_tcp_recvpage returns 0 if sock_recvmsg returns -EAGAIN (maybe also
>>> in other cases). So it sounds like the client just timed out on a read
>>> from the socket or caught a signal or something?
>>>
>>> If that's correct, then do we know what ceph_tcp_recvpage returned when
>>> the problem happened?
>> It should just return parital data, and we should continue from here in
>> the next loop when the reset data comes.
>>
> Tracking this extra length seems like the wrong fix. We're already
> looping in read_sparse_msg_extent until the sr_resid goes to 0.
Yeah, it is and it works well.
>   ISTM
> that it's just that read_sparse_msg_extent is returning inappropriately
> in the face of timeouts.
>
> IOW, it does this:
>
>                  ret = ceph_tcp_recvpage(con->sock, rpage, (int)off, len);
>                  if (ret <= 0)
>                          return ret;
>
> ...should it just not be returning there when ret == 0? Maybe it should
> be retrying the recvpage instead?

Currently the the ceph_tcp_recvpage() will read data without blocking. 
If so we will change the logic here then all the other non-sparse-read 
cases will be changed to.

Please note this won't fix anything here in this bug.

Because currently the sparse-read code works well if in step 4 it 
partially read the sparse-read data or extents.

But in case of partially reading 'footer' in step 5. What we need to 
guarantee is that in the second loop we could skip triggering a new 
sparse-read in step 4:

1, /* header */         ===> will skip and do nothing if it has already 
read the 'header' data in last loop

2, /* front */             ===> will skip and do nothing if it has 
already read the 'front' data in last loop

3, /* middle */         ===> will skip and do nothing if it has already 
read the 'middle' data in last loop

4, /* (page) data */   ===> sparse-read here, it also should skip and do 
nothing if it has already read the whole 'sparse-read' data in last 
loop, but it won't. This is the ROOT CAUSE of this bug.

5, /* footer */            ===> the 'read_partial()' will only read 
partial 'footer' data then need to loop start from /* header */ when the 
data comes

My patch could guarantee that the sparse-read code will do nothing. 
While currently the code will trigger a new sparse-read from beginning 
again, which is incorrect.

Jeff, please let me know if you have better approaches.  The last one 
Ilya mentioned didn't work.

Thanks

- Xiubo
Jeff Layton Jan. 22, 2024, 11:44 a.m. UTC | #6
On Mon, 2024-01-22 at 10:52 +0800, Xiubo Li wrote:
> On 1/19/24 19:09, Jeff Layton wrote:
> > On Fri, 2024-01-19 at 12:35 +0800, Xiubo Li wrote:
> > > On 1/19/24 02:24, Jeff Layton wrote:
> > > > On Thu, 2024-01-18 at 18:50 +0800, xiubli@redhat.com wrote:
> > > > > From: Xiubo Li <xiubli@redhat.com>
> > > > > 
> > > > > The messages from ceph maybe split into multiple socket packages
> > > > > and we just need to wait for all the data to be availiable on the
> > > > > sokcet.
> > > > > 
> > > > > This will add 'sr_total_resid' to record the total length for all
> > > > > data items for sparse-read message and 'sr_resid_elen' to record
> > > > > the current extent total length.
> > > > > 
> > > > > URL: https://tracker.ceph.com/issues/63586
> > > > > Signed-off-by: Xiubo Li <xiubli@redhat.com>
> > > > > ---
> > > > >    include/linux/ceph/messenger.h |  1 +
> > > > >    net/ceph/messenger_v1.c        | 32 +++++++++++++++++++++-----------
> > > > >    2 files changed, 22 insertions(+), 11 deletions(-)
> > > > > 
> > > > > diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
> > > > > index 2eaaabbe98cb..ca6f82abed62 100644
> > > > > --- a/include/linux/ceph/messenger.h
> > > > > +++ b/include/linux/ceph/messenger.h
> > > > > @@ -231,6 +231,7 @@ struct ceph_msg_data {
> > > > >    
> > > > >    struct ceph_msg_data_cursor {
> > > > >    	size_t			total_resid;	/* across all data items */
> > > > > +	size_t			sr_total_resid;	/* across all data items for sparse-read */
> > > > >    
> > > > >    	struct ceph_msg_data	*data;		/* current data item */
> > > > >    	size_t			resid;		/* bytes not yet consumed */
> > > > > diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c
> > > > > index 4cb60bacf5f5..2733da891688 100644
> > > > > --- a/net/ceph/messenger_v1.c
> > > > > +++ b/net/ceph/messenger_v1.c
> > > > > @@ -160,7 +160,9 @@ static size_t sizeof_footer(struct ceph_connection *con)
> > > > >    static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
> > > > >    {
> > > > >    	/* Initialize data cursor if it's not a sparse read */
> > > > > -	if (!msg->sparse_read)
> > > > > +	if (msg->sparse_read)
> > > > > +		msg->cursor.sr_total_resid = data_len;
> > > > > +	else
> > > > >    		ceph_msg_data_cursor_init(&msg->cursor, msg, data_len);
> > > > >    }
> > > > >    
> > > > > @@ -1032,35 +1034,43 @@ static int read_partial_sparse_msg_data(struct ceph_connection *con)
> > > > >    	bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
> > > > >    	u32 crc = 0;
> > > > >    	int ret = 1;
> > > > > +	int len;
> > > > >    
> > > > >    	if (do_datacrc)
> > > > >    		crc = con->in_data_crc;
> > > > >    
> > > > > -	do {
> > > > > -		if (con->v1.in_sr_kvec.iov_base)
> > > > > +	while (cursor->sr_total_resid) {
> > > > > +		len = 0;
> > > > > +		if (con->v1.in_sr_kvec.iov_base) {
> > > > > +			len = con->v1.in_sr_kvec.iov_len;
> > > > >    			ret = read_partial_message_chunk(con,
> > > > >    							 &con->v1.in_sr_kvec,
> > > > >    							 con->v1.in_sr_len,
> > > > >    							 &crc);
> > > > > -		else if (cursor->sr_resid > 0)
> > > > > +			len = con->v1.in_sr_kvec.iov_len - len;
> > > > > +		} else if (cursor->sr_resid > 0) {
> > > > > +			len = cursor->sr_resid;
> > > > >    			ret = read_partial_sparse_msg_extent(con, &crc);
> > > > > -
> > > > > -		if (ret <= 0) {
> > > > > -			if (do_datacrc)
> > > > > -				con->in_data_crc = crc;
> > > > > -			return ret;
> > > > > +			len -= cursor->sr_resid;
> > > > >    		}
> > > > > +		cursor->sr_total_resid -= len;
> > > > > +		if (ret <= 0)
> > > > > +			break;
> > > > >    
> > > > >    		memset(&con->v1.in_sr_kvec, 0, sizeof(con->v1.in_sr_kvec));
> > > > >    		ret = con->ops->sparse_read(con, cursor,
> > > > >    				(char **)&con->v1.in_sr_kvec.iov_base);
> > > > > +		if (ret <= 0) {
> > > > > +			ret = ret ? : 1; /* must return > 0 to indicate success */
> > > > > +			break;
> > > > > +		}
> > > > >    		con->v1.in_sr_len = ret;
> > > > > -	} while (ret > 0);
> > > > > +	}
> > > > >    
> > > > >    	if (do_datacrc)
> > > > >    		con->in_data_crc = crc;
> > > > >    
> > > > > -	return ret < 0 ? ret : 1;  /* must return > 0 to indicate success */
> > > > > +	return ret;
> > > > >    }
> > > > >    
> > > > >    static int read_partial_msg_data(struct ceph_connection *con)
> > > > Looking back over this code...
> > > > 
> > > > The way it works today, once we determine it's a sparse read, we call
> > > > read_sparse_msg_data. At that point we call either
> > > > read_partial_message_chunk (to read into the kvec) or
> > > > read_sparse_msg_extent if sr_resid is already set (indicating that we're
> > > > receiving an extent).
> > > > 
> > > > read_sparse_msg_extent calls ceph_tcp_recvpage in a loop until
> > > > cursor->sr_resid have been received. The exception there when
> > > > ceph_tcp_recvpage returns <= 0.
> > > > 
> > > > ceph_tcp_recvpage returns 0 if sock_recvmsg returns -EAGAIN (maybe also
> > > > in other cases). So it sounds like the client just timed out on a read
> > > > from the socket or caught a signal or something?
> > > > 
> > > > If that's correct, then do we know what ceph_tcp_recvpage returned when
> > > > the problem happened?
> > > It should just return parital data, and we should continue from here in
> > > the next loop when the reset data comes.
> > > 
> > Tracking this extra length seems like the wrong fix. We're already
> > looping in read_sparse_msg_extent until the sr_resid goes to 0.
> Yeah, it is and it works well.
> >   ISTM
> > that it's just that read_sparse_msg_extent is returning inappropriately
> > in the face of timeouts.
> > 
> > IOW, it does this:
> > 
> >                  ret = ceph_tcp_recvpage(con->sock, rpage, (int)off, len);
> >                  if (ret <= 0)
> >                          return ret;
> > 
> > ...should it just not be returning there when ret == 0? Maybe it should
> > be retrying the recvpage instead?
> 
> Currently the the ceph_tcp_recvpage() will read data without blocking. 

Yes.

> If so we will change the logic here then all the other non-sparse-read 
> cases will be changed to.
> 

No. read_sparse_msg_data is only called from the sparse-read codepath.
If we change it, only that will be affected.


> Please note this won't fix anything here in this bug.
> 
> Because currently the sparse-read code works well if in step 4 it 
> partially read the sparse-read data or extents.
> 
> But in case of partially reading 'footer' in step 5. What we need to 
> guarantee is that in the second loop we could skip triggering a new 
> sparse-read in step 4:
> 
> 1, /* header */         ===> will skip and do nothing if it has already 
> read the 'header' data in last loop
> 
> 2, /* front */             ===> will skip and do nothing if it has 
> already read the 'front' data in last loop
> 
> 3, /* middle */         ===> will skip and do nothing if it has already 
> read the 'middle' data in last loop
> 
> 4, /* (page) data */   ===> sparse-read here, it also should skip and do 
> nothing if it has already read the whole 'sparse-read' data in last 
> loop, but it won't. This is the ROOT CAUSE of this bug.
> 
> 5, /* footer */            ===> the 'read_partial()' will only read 
> partial 'footer' data then need to loop start from /* header */ when the 
> data comes
> 
> My patch could guarantee that the sparse-read code will do nothing. 
> While currently the code will trigger a new sparse-read from beginning 
> again, which is incorrect.
> 
> Jeff, please let me know if you have better approaches.  The last one 
> Ilya mentioned didn't work.

Your patch tries to track sr_resid independently in a new variable
sr_total_resid, but I think that's unnecessary.

read_sparse_msg_data returns under two conditions:

1. It has read all of the sparse read data (i.e. sr_resid is 0), in
which case it returns 1.

...or...

2. ceph_tcp_recvpage returns a negative error code or 0.

After your patch, the only way you'd get a case where sr_total_resid
is >0 is if case #2 happens. Clearly if we receive all of the data then
sr_total_resid will also be 0.

We want to return an error if there is a hard error from
ceph_tcp_recvpage, but it looks like it also returns 0 if the socket
read returns -EAGAIN. So, it seems to be that doing something like this
would be a sufficient fix. What am I missing?

diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c
index f9a50d7f0d20..cf94ebdb3b34 100644
--- a/net/ceph/messenger_v1.c
+++ b/net/ceph/messenger_v1.c
@@ -1015,8 +1015,10 @@ static int read_sparse_msg_extent(struct ceph_connection *con, u32 *crc)
                /* clamp to what remains in extent */
                len = min_t(int, len, cursor->sr_resid);
                ret = ceph_tcp_recvpage(con->sock, rpage, (int)off, len);
-               if (ret <= 0)
+               if (ret < 0)
                        return ret;
+               else if (ret == 0)
+                       continue;
                *crc = ceph_crc32c_page(*crc, rpage, off, ret);
                ceph_msg_data_advance(cursor, (size_t)ret);
                cursor->sr_resid -= ret;
Jeff Layton Jan. 22, 2024, 3:02 p.m. UTC | #7
On Thu, 2024-01-18 at 18:50 +0800, xiubli@redhat.com wrote:
> From: Xiubo Li <xiubli@redhat.com>
> 
> The messages from ceph maybe split into multiple socket packages
> and we just need to wait for all the data to be availiable on the
> sokcet.
> 
> This will add 'sr_total_resid' to record the total length for all
> data items for sparse-read message and 'sr_resid_elen' to record
> the current extent total length.
> 
> URL: https://tracker.ceph.com/issues/63586
> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
>  include/linux/ceph/messenger.h |  1 +
>  net/ceph/messenger_v1.c        | 32 +++++++++++++++++++++-----------
>  2 files changed, 22 insertions(+), 11 deletions(-)
> 
> diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
> index 2eaaabbe98cb..ca6f82abed62 100644
> --- a/include/linux/ceph/messenger.h
> +++ b/include/linux/ceph/messenger.h
> @@ -231,6 +231,7 @@ struct ceph_msg_data {
>  
>  struct ceph_msg_data_cursor {
>  	size_t			total_resid;	/* across all data items */
> +	size_t			sr_total_resid;	/* across all data items for sparse-read */
>  
>  	struct ceph_msg_data	*data;		/* current data item */
>  	size_t			resid;		/* bytes not yet consumed */
> diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c
> index 4cb60bacf5f5..2733da891688 100644
> --- a/net/ceph/messenger_v1.c
> +++ b/net/ceph/messenger_v1.c
> @@ -160,7 +160,9 @@ static size_t sizeof_footer(struct ceph_connection *con)
>  static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
>  {
>  	/* Initialize data cursor if it's not a sparse read */
> -	if (!msg->sparse_read)
> +	if (msg->sparse_read)
> +		msg->cursor.sr_total_resid = data_len;
> +	else
>  		ceph_msg_data_cursor_init(&msg->cursor, msg, data_len);
>  }
> 
> 

Sorry, disregard my last email.

I went and looked at the patch in the tree, and I better understand what
you're trying to do. We already have a value that gets set to data_len
called total_resid, but that is a nonsense value in a sparse read,
because we can advance farther through the receive buffer than the
amount of data in the socket.

So, I think you do need a separate value like this that's only used in
sparse reads, or you'll have to change how ->total_resid is handled
during a sparse read. The first way is simpler, so I agree with the
approach.

That said, if you're going to add a new value to ceph_msg_data_cursor,
then you really ought to initialize it in ceph_msg_data_cursor_init. You
don't necessarily have to use it elsewhere, but that would be cleaner
than the if statement above.

More comments below.

>  
> @@ -1032,35 +1034,43 @@ static int read_partial_sparse_msg_data(struct ceph_connection *con)
>  	bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
>  	u32 crc = 0;
>  	int ret = 1;
> +	int len;
>  
>  	if (do_datacrc)
>  		crc = con->in_data_crc;
>  
> -	do {
> -		if (con->v1.in_sr_kvec.iov_base)
> +	while (cursor->sr_total_resid) {
> +		len = 0;
> +		if (con->v1.in_sr_kvec.iov_base) {
> +			len = con->v1.in_sr_kvec.iov_len;
>  			ret = read_partial_message_chunk(con,
>  							 &con->v1.in_sr_kvec,
>  							 con->v1.in_sr_len,
>  							 &crc);
> -		else if (cursor->sr_resid > 0)
> +			len = con->v1.in_sr_kvec.iov_len - len;

len is going to be 0 after the above statement. I'm not sure I
understand the point of messing with its value in the above block.

> +		} else if (cursor->sr_resid > 0) {
> +			len = cursor->sr_resid;
>  			ret = read_partial_sparse_msg_extent(con, &crc);
> -
> -		if (ret <= 0) {
> -			if (do_datacrc)
> -				con->in_data_crc = crc;
> -			return ret;
> +			len -= cursor->sr_resid;


Won't "len" also be 0 after the above?

>  		}
> +		cursor->sr_total_resid -= len;


...and so sr_total_resid always decrements by 0? That doesn't look like
it's doing the right thing.

> +		if (ret <= 0)
> +			break;
>  
>  		memset(&con->v1.in_sr_kvec, 0, sizeof(con->v1.in_sr_kvec));
>  		ret = con->ops->sparse_read(con, cursor,
>  				(char **)&con->v1.in_sr_kvec.iov_base);
> +		if (ret <= 0) {
> +			ret = ret ? : 1; /* must return > 0 to indicate success */
> +			break;
> +		}
>  		con->v1.in_sr_len = ret;
> -	} while (ret > 0);
> +	}
>  
>  	if (do_datacrc)
>  		con->in_data_crc = crc;
>  
> -	return ret < 0 ? ret : 1;  /* must return > 0 to indicate success */
> +	return ret;
>  }
>  
>  static int read_partial_msg_data(struct ceph_connection *con)
Ilya Dryomov Jan. 22, 2024, 4:55 p.m. UTC | #8
On Mon, Jan 22, 2024 at 4:02 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Thu, 2024-01-18 at 18:50 +0800, xiubli@redhat.com wrote:
> > From: Xiubo Li <xiubli@redhat.com>
> >
> > The messages from ceph maybe split into multiple socket packages
> > and we just need to wait for all the data to be availiable on the
> > sokcet.
> >
> > This will add 'sr_total_resid' to record the total length for all
> > data items for sparse-read message and 'sr_resid_elen' to record
> > the current extent total length.
> >
> > URL: https://tracker.ceph.com/issues/63586
> > Signed-off-by: Xiubo Li <xiubli@redhat.com>
> > ---
> >  include/linux/ceph/messenger.h |  1 +
> >  net/ceph/messenger_v1.c        | 32 +++++++++++++++++++++-----------
> >  2 files changed, 22 insertions(+), 11 deletions(-)
> >
> > diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
> > index 2eaaabbe98cb..ca6f82abed62 100644
> > --- a/include/linux/ceph/messenger.h
> > +++ b/include/linux/ceph/messenger.h
> > @@ -231,6 +231,7 @@ struct ceph_msg_data {
> >
> >  struct ceph_msg_data_cursor {
> >       size_t                  total_resid;    /* across all data items */
> > +     size_t                  sr_total_resid; /* across all data items for sparse-read */
> >
> >       struct ceph_msg_data    *data;          /* current data item */
> >       size_t                  resid;          /* bytes not yet consumed */
> > diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c
> > index 4cb60bacf5f5..2733da891688 100644
> > --- a/net/ceph/messenger_v1.c
> > +++ b/net/ceph/messenger_v1.c
> > @@ -160,7 +160,9 @@ static size_t sizeof_footer(struct ceph_connection *con)
> >  static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
> >  {
> >       /* Initialize data cursor if it's not a sparse read */
> > -     if (!msg->sparse_read)
> > +     if (msg->sparse_read)
> > +             msg->cursor.sr_total_resid = data_len;
> > +     else
> >               ceph_msg_data_cursor_init(&msg->cursor, msg, data_len);
> >  }
> >
> >
>
> Sorry, disregard my last email.
>
> I went and looked at the patch in the tree, and I better understand what
> you're trying to do. We already have a value that gets set to data_len
> called total_resid, but that is a nonsense value in a sparse read,
> because we can advance farther through the receive buffer than the
> amount of data in the socket.

Hi Jeff,

I see that total_resid is set to sparse_data_requested(), which is
effectively the size of the receive buffer, not data_len.  (This is
ignoring the seemingly unnecessary complication of trying to support
normal reads mixed with sparse reads in the same message, which I'm
pretty sure doesn't work anyway.)

With that, total_resid should represent the amount that needs to be
filled in (advanced through) in the receive buffer.  When total_resid
reaches 0, wouldn't that mean that the amount of data in the socket is
also 0?  If not, where would the remaining data in the socket go?

Thanks,

                Ilya
Jeff Layton Jan. 22, 2024, 5:14 p.m. UTC | #9
On Mon, 2024-01-22 at 17:55 +0100, Ilya Dryomov wrote:
> On Mon, Jan 22, 2024 at 4:02 PM Jeff Layton <jlayton@kernel.org> wrote:
> > 
> > On Thu, 2024-01-18 at 18:50 +0800, xiubli@redhat.com wrote:
> > > From: Xiubo Li <xiubli@redhat.com>
> > > 
> > > The messages from ceph maybe split into multiple socket packages
> > > and we just need to wait for all the data to be availiable on the
> > > sokcet.
> > > 
> > > This will add 'sr_total_resid' to record the total length for all
> > > data items for sparse-read message and 'sr_resid_elen' to record
> > > the current extent total length.
> > > 
> > > URL: https://tracker.ceph.com/issues/63586
> > > Signed-off-by: Xiubo Li <xiubli@redhat.com>
> > > ---
> > >  include/linux/ceph/messenger.h |  1 +
> > >  net/ceph/messenger_v1.c        | 32 +++++++++++++++++++++-----------
> > >  2 files changed, 22 insertions(+), 11 deletions(-)
> > > 
> > > diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
> > > index 2eaaabbe98cb..ca6f82abed62 100644
> > > --- a/include/linux/ceph/messenger.h
> > > +++ b/include/linux/ceph/messenger.h
> > > @@ -231,6 +231,7 @@ struct ceph_msg_data {
> > > 
> > >  struct ceph_msg_data_cursor {
> > >       size_t                  total_resid;    /* across all data items */
> > > +     size_t                  sr_total_resid; /* across all data items for sparse-read */
> > > 
> > >       struct ceph_msg_data    *data;          /* current data item */
> > >       size_t                  resid;          /* bytes not yet consumed */
> > > diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c
> > > index 4cb60bacf5f5..2733da891688 100644
> > > --- a/net/ceph/messenger_v1.c
> > > +++ b/net/ceph/messenger_v1.c
> > > @@ -160,7 +160,9 @@ static size_t sizeof_footer(struct ceph_connection *con)
> > >  static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
> > >  {
> > >       /* Initialize data cursor if it's not a sparse read */
> > > -     if (!msg->sparse_read)
> > > +     if (msg->sparse_read)
> > > +             msg->cursor.sr_total_resid = data_len;
> > > +     else
> > >               ceph_msg_data_cursor_init(&msg->cursor, msg, data_len);
> > >  }
> > > 
> > > 
> > 
> > Sorry, disregard my last email.
> > 
> > I went and looked at the patch in the tree, and I better understand what
> > you're trying to do. We already have a value that gets set to data_len
> > called total_resid, but that is a nonsense value in a sparse read,
> > because we can advance farther through the receive buffer than the
> > amount of data in the socket.
> 
> Hi Jeff,
> 
> I see that total_resid is set to sparse_data_requested(), which is
> effectively the size of the receive buffer, not data_len.  (This is
> ignoring the seemingly unnecessary complication of trying to support
> normal reads mixed with sparse reads in the same message, which I'm
> pretty sure doesn't work anyway.)
> 

Oh right. I missed that bit when I was re-reviewing this. Once we're in
a sparse read we'll override that value. Ok, so maybe we don't need
sr_total_resid.

> With that, total_resid should represent the amount that needs to be
> filled in (advanced through) in the receive buffer.  When total_resid
> reaches 0, wouldn't that mean that the amount of data in the socket is
> also 0?  If not, where would the remaining data in the socket go?
> 

With a properly formed reply, then I think yes, there should be no
remaining data in the socket at the end of the receive.

At this point I think I must just be confused about the actual problem.
I think I need a detailed description of it before I can properly review
this patch.
Ilya Dryomov Jan. 22, 2024, 7:41 p.m. UTC | #10
On Mon, Jan 22, 2024 at 6:14 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Mon, 2024-01-22 at 17:55 +0100, Ilya Dryomov wrote:
> > On Mon, Jan 22, 2024 at 4:02 PM Jeff Layton <jlayton@kernel.org> wrote:
> > >
> > > On Thu, 2024-01-18 at 18:50 +0800, xiubli@redhat.com wrote:
> > > > From: Xiubo Li <xiubli@redhat.com>
> > > >
> > > > The messages from ceph maybe split into multiple socket packages
> > > > and we just need to wait for all the data to be availiable on the
> > > > sokcet.
> > > >
> > > > This will add 'sr_total_resid' to record the total length for all
> > > > data items for sparse-read message and 'sr_resid_elen' to record
> > > > the current extent total length.
> > > >
> > > > URL: https://tracker.ceph.com/issues/63586
> > > > Signed-off-by: Xiubo Li <xiubli@redhat.com>
> > > > ---
> > > >  include/linux/ceph/messenger.h |  1 +
> > > >  net/ceph/messenger_v1.c        | 32 +++++++++++++++++++++-----------
> > > >  2 files changed, 22 insertions(+), 11 deletions(-)
> > > >
> > > > diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
> > > > index 2eaaabbe98cb..ca6f82abed62 100644
> > > > --- a/include/linux/ceph/messenger.h
> > > > +++ b/include/linux/ceph/messenger.h
> > > > @@ -231,6 +231,7 @@ struct ceph_msg_data {
> > > >
> > > >  struct ceph_msg_data_cursor {
> > > >       size_t                  total_resid;    /* across all data items */
> > > > +     size_t                  sr_total_resid; /* across all data items for sparse-read */
> > > >
> > > >       struct ceph_msg_data    *data;          /* current data item */
> > > >       size_t                  resid;          /* bytes not yet consumed */
> > > > diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c
> > > > index 4cb60bacf5f5..2733da891688 100644
> > > > --- a/net/ceph/messenger_v1.c
> > > > +++ b/net/ceph/messenger_v1.c
> > > > @@ -160,7 +160,9 @@ static size_t sizeof_footer(struct ceph_connection *con)
> > > >  static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
> > > >  {
> > > >       /* Initialize data cursor if it's not a sparse read */
> > > > -     if (!msg->sparse_read)
> > > > +     if (msg->sparse_read)
> > > > +             msg->cursor.sr_total_resid = data_len;
> > > > +     else
> > > >               ceph_msg_data_cursor_init(&msg->cursor, msg, data_len);
> > > >  }
> > > >
> > > >
> > >
> > > Sorry, disregard my last email.
> > >
> > > I went and looked at the patch in the tree, and I better understand what
> > > you're trying to do. We already have a value that gets set to data_len
> > > called total_resid, but that is a nonsense value in a sparse read,
> > > because we can advance farther through the receive buffer than the
> > > amount of data in the socket.
> >
> > Hi Jeff,
> >
> > I see that total_resid is set to sparse_data_requested(), which is
> > effectively the size of the receive buffer, not data_len.  (This is
> > ignoring the seemingly unnecessary complication of trying to support
> > normal reads mixed with sparse reads in the same message, which I'm
> > pretty sure doesn't work anyway.)
> >
>
> Oh right. I missed that bit when I was re-reviewing this. Once we're in
> a sparse read we'll override that value. Ok, so maybe we don't need
> sr_total_resid.
>
> > With that, total_resid should represent the amount that needs to be
> > filled in (advanced through) in the receive buffer.  When total_resid
> > reaches 0, wouldn't that mean that the amount of data in the socket is
> > also 0?  If not, where would the remaining data in the socket go?
> >
>
> With a properly formed reply, then I think yes, there should be no
> remaining data in the socket at the end of the receive.

There would be no actual data, but a message footer which follows the
data section and ends the message would be outstanding.

>
> At this point I think I must just be confused about the actual problem.
> I think I need a detailed description of it before I can properly review
> this patch.

AFAIU the problem is that a short read may occur while reading the
message footer from the socket.  Later, when the socket is ready for
another read, the messenger invokes all read_partial* handlers,
including read_partial_sparse_msg_data().  The contract between the
messenger and these handlers is that the handler should bail if the
area of the message it's responsible for is already processed.  So,
in this case, it's expected that read_sparse_msg_data() would bail,
allowing the messenger to invoke read_partial() for the footer and
pick up where it left off.

However read_sparse_msg_data() violates that contract and ends up
calling into the state machine in the OSD client.  The state machine
assumes that it's a new op and interprets some piece of the footer (or
even completely random memory?) as the sparse-read header and returns
bogus extent length, etc.

(BTW it's why I suggested the rename from read_sparse_msg_data() to
read_partial_sparse_msg_data() in another patch -- to highlight that
it's one of the "partial" handlers and the respective behavior.)

Thanks,

                Ilya
Xiubo Li Jan. 23, 2024, 12:53 a.m. UTC | #11
On 1/23/24 03:41, Ilya Dryomov wrote:
> On Mon, Jan 22, 2024 at 6:14 PM Jeff Layton <jlayton@kernel.org> wrote:
>> On Mon, 2024-01-22 at 17:55 +0100, Ilya Dryomov wrote:
>>> On Mon, Jan 22, 2024 at 4:02 PM Jeff Layton <jlayton@kernel.org> wrote:
>>>> On Thu, 2024-01-18 at 18:50 +0800, xiubli@redhat.com wrote:
>>>>> From: Xiubo Li <xiubli@redhat.com>
>>>>>
>>>>> The messages from ceph maybe split into multiple socket packages
>>>>> and we just need to wait for all the data to be availiable on the
>>>>> sokcet.
>>>>>
>>>>> This will add 'sr_total_resid' to record the total length for all
>>>>> data items for sparse-read message and 'sr_resid_elen' to record
>>>>> the current extent total length.
>>>>>
>>>>> URL: https://tracker.ceph.com/issues/63586
>>>>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>>>>> ---
>>>>>   include/linux/ceph/messenger.h |  1 +
>>>>>   net/ceph/messenger_v1.c        | 32 +++++++++++++++++++++-----------
>>>>>   2 files changed, 22 insertions(+), 11 deletions(-)
>>>>>
>>>>> diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
>>>>> index 2eaaabbe98cb..ca6f82abed62 100644
>>>>> --- a/include/linux/ceph/messenger.h
>>>>> +++ b/include/linux/ceph/messenger.h
>>>>> @@ -231,6 +231,7 @@ struct ceph_msg_data {
>>>>>
>>>>>   struct ceph_msg_data_cursor {
>>>>>        size_t                  total_resid;    /* across all data items */
>>>>> +     size_t                  sr_total_resid; /* across all data items for sparse-read */
>>>>>
>>>>>        struct ceph_msg_data    *data;          /* current data item */
>>>>>        size_t                  resid;          /* bytes not yet consumed */
>>>>> diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c
>>>>> index 4cb60bacf5f5..2733da891688 100644
>>>>> --- a/net/ceph/messenger_v1.c
>>>>> +++ b/net/ceph/messenger_v1.c
>>>>> @@ -160,7 +160,9 @@ static size_t sizeof_footer(struct ceph_connection *con)
>>>>>   static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
>>>>>   {
>>>>>        /* Initialize data cursor if it's not a sparse read */
>>>>> -     if (!msg->sparse_read)
>>>>> +     if (msg->sparse_read)
>>>>> +             msg->cursor.sr_total_resid = data_len;
>>>>> +     else
>>>>>                ceph_msg_data_cursor_init(&msg->cursor, msg, data_len);
>>>>>   }
>>>>>
>>>>>
>>>> Sorry, disregard my last email.
>>>>
>>>> I went and looked at the patch in the tree, and I better understand what
>>>> you're trying to do. We already have a value that gets set to data_len
>>>> called total_resid, but that is a nonsense value in a sparse read,
>>>> because we can advance farther through the receive buffer than the
>>>> amount of data in the socket.
>>> Hi Jeff,
>>>
>>> I see that total_resid is set to sparse_data_requested(), which is
>>> effectively the size of the receive buffer, not data_len.  (This is
>>> ignoring the seemingly unnecessary complication of trying to support
>>> normal reads mixed with sparse reads in the same message, which I'm
>>> pretty sure doesn't work anyway.)
>>>
>> Oh right. I missed that bit when I was re-reviewing this. Once we're in
>> a sparse read we'll override that value. Ok, so maybe we don't need
>> sr_total_resid.
>>
Oh, I get you now. Yeah we can just reuse the 'total_resid' instead of 
adding a new one.

>>> With that, total_resid should represent the amount that needs to be
>>> filled in (advanced through) in the receive buffer.  When total_resid
>>> reaches 0, wouldn't that mean that the amount of data in the socket is
>>> also 0?  If not, where would the remaining data in the socket go?
>>>
>> With a properly formed reply, then I think yes, there should be no
>> remaining data in the socket at the end of the receive.
> There would be no actual data, but a message footer which follows the
> data section and ends the message would be outstanding.

Yeah, correct.

>> At this point I think I must just be confused about the actual problem.
>> I think I need a detailed description of it before I can properly review
>> this patch.
> AFAIU the problem is that a short read may occur while reading the
> message footer from the socket.  Later, when the socket is ready for
> another read, the messenger invokes all read_partial* handlers,
> including read_partial_sparse_msg_data().  The contract between the
> messenger and these handlers is that the handler should bail if the
> area of the message it's responsible for is already processed.  So,
> in this case, it's expected that read_sparse_msg_data() would bail,
> allowing the messenger to invoke read_partial() for the footer and
> pick up where it left off.
>
> However read_sparse_msg_data() violates that contract and ends up
> calling into the state machine in the OSD client.  The state machine
> assumes that it's a new op and interprets some piece of the footer (or
> even completely random memory?) as the sparse-read header and returns
> bogus extent length, etc.
>
> (BTW it's why I suggested the rename from read_sparse_msg_data() to
> read_partial_sparse_msg_data() in another patch -- to highlight that
> it's one of the "partial" handlers and the respective behavior.)

Yeah, Ilya is correct.

Thanks

- Xiubo

> Thanks,
>
>                  Ilya
>
diff mbox series

Patch

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 2eaaabbe98cb..ca6f82abed62 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -231,6 +231,7 @@  struct ceph_msg_data {
 
 struct ceph_msg_data_cursor {
 	size_t			total_resid;	/* across all data items */
+	size_t			sr_total_resid;	/* across all data items for sparse-read */
 
 	struct ceph_msg_data	*data;		/* current data item */
 	size_t			resid;		/* bytes not yet consumed */
diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c
index 4cb60bacf5f5..2733da891688 100644
--- a/net/ceph/messenger_v1.c
+++ b/net/ceph/messenger_v1.c
@@ -160,7 +160,9 @@  static size_t sizeof_footer(struct ceph_connection *con)
 static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
 {
 	/* Initialize data cursor if it's not a sparse read */
-	if (!msg->sparse_read)
+	if (msg->sparse_read)
+		msg->cursor.sr_total_resid = data_len;
+	else
 		ceph_msg_data_cursor_init(&msg->cursor, msg, data_len);
 }
 
@@ -1032,35 +1034,43 @@  static int read_partial_sparse_msg_data(struct ceph_connection *con)
 	bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
 	u32 crc = 0;
 	int ret = 1;
+	int len;
 
 	if (do_datacrc)
 		crc = con->in_data_crc;
 
-	do {
-		if (con->v1.in_sr_kvec.iov_base)
+	while (cursor->sr_total_resid) {
+		len = 0;
+		if (con->v1.in_sr_kvec.iov_base) {
+			len = con->v1.in_sr_kvec.iov_len;
 			ret = read_partial_message_chunk(con,
 							 &con->v1.in_sr_kvec,
 							 con->v1.in_sr_len,
 							 &crc);
-		else if (cursor->sr_resid > 0)
+			len = con->v1.in_sr_kvec.iov_len - len;
+		} else if (cursor->sr_resid > 0) {
+			len = cursor->sr_resid;
 			ret = read_partial_sparse_msg_extent(con, &crc);
-
-		if (ret <= 0) {
-			if (do_datacrc)
-				con->in_data_crc = crc;
-			return ret;
+			len -= cursor->sr_resid;
 		}
+		cursor->sr_total_resid -= len;
+		if (ret <= 0)
+			break;
 
 		memset(&con->v1.in_sr_kvec, 0, sizeof(con->v1.in_sr_kvec));
 		ret = con->ops->sparse_read(con, cursor,
 				(char **)&con->v1.in_sr_kvec.iov_base);
+		if (ret <= 0) {
+			ret = ret ? : 1; /* must return > 0 to indicate success */
+			break;
+		}
 		con->v1.in_sr_len = ret;
-	} while (ret > 0);
+	}
 
 	if (do_datacrc)
 		con->in_data_crc = crc;
 
-	return ret < 0 ? ret : 1;  /* must return > 0 to indicate success */
+	return ret;
 }
 
 static int read_partial_msg_data(struct ceph_connection *con)