diff mbox series

ceph: return the real size readed when hit EOF

Message ID 20211019115138.414187-1-xiubli@redhat.com (mailing list archive)
State New, archived
Headers show
Series ceph: return the real size readed when hit EOF | expand

Commit Message

Xiubo Li Oct. 19, 2021, 11:51 a.m. UTC
From: Xiubo Li <xiubli@redhat.com>

At the same time set the ki_pos to the file size.

Signed-off-by: Xiubo Li <xiubli@redhat.com>
---
 fs/ceph/file.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

Comments

Xiubo Li Oct. 19, 2021, 11:54 a.m. UTC | #1
Without this, such as in case for the file encrypt feature, when doing 
the following test:

1), echo "1234567890" > dir/a.txt

2), vim dir/a.txt

It will always show the zeroed contents after string "1234567890", 
something like:

"1234567890@@@@@...."

On 10/19/21 7:51 PM, xiubli@redhat.com wrote:
> From: Xiubo Li <xiubli@redhat.com>
>
> At the same time set the ki_pos to the file size.
>
> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
>   fs/ceph/file.c | 14 +++++++++-----
>   1 file changed, 9 insertions(+), 5 deletions(-)
>
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index 91173d3aa161..1abc3b591740 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -847,6 +847,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
>   	ssize_t ret;
>   	u64 off = iocb->ki_pos;
>   	u64 len = iov_iter_count(to);
> +	u64 i_size = i_size_read(inode);
>   
>   	dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len,
>   	     (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
> @@ -870,7 +871,6 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
>   		struct page **pages;
>   		int num_pages;
>   		size_t page_off;
> -		u64 i_size;
>   		bool more;
>   		int idx;
>   		size_t left;
> @@ -909,7 +909,6 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
>   
>   		ceph_osdc_put_request(req);
>   
> -		i_size = i_size_read(inode);
>   		dout("sync_read %llu~%llu got %zd i_size %llu%s\n",
>   		     off, len, ret, i_size, (more ? " MORE" : ""));
>   
> @@ -954,10 +953,15 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
>   
>   	if (off > iocb->ki_pos) {
>   		if (ret >= 0 &&
> -		    iov_iter_count(to) > 0 && off >= i_size_read(inode))
> +		    iov_iter_count(to) > 0 &&
> +		    off >= i_size_read(inode)) {
>   			*retry_op = CHECK_EOF;
> -		ret = off - iocb->ki_pos;
> -		iocb->ki_pos = off;
> +			ret = i_size - iocb->ki_pos;
> +			iocb->ki_pos = i_size;
> +		} else {
> +			ret = off - iocb->ki_pos;
> +			iocb->ki_pos = off;
> +		}
>   	}
>   
>   	dout("sync_read result %zd retry_op %d\n", ret, *retry_op);
Jeff Layton Oct. 19, 2021, 12:59 p.m. UTC | #2
On Tue, 2021-10-19 at 19:51 +0800, xiubli@redhat.com wrote:
> From: Xiubo Li <xiubli@redhat.com>
> 
> At the same time set the ki_pos to the file size.
> 

It would be good to put the comments in your follow up email into the
patch description here, so that it explains what you're fixing and why.

> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
>  fs/ceph/file.c | 14 +++++++++-----
>  1 file changed, 9 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index 91173d3aa161..1abc3b591740 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -847,6 +847,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
>  	ssize_t ret;
>  	u64 off = iocb->ki_pos;
>  	u64 len = iov_iter_count(to);
> +	u64 i_size = i_size_read(inode);
>  

Was there a reason to change where the i_size is fetched, or did you
just not see the point in fetching it on each loop? I wonder if we can
hit any races vs. truncates with this?

Oh well, all of this non-buffered I/O seems somewhat racy anyway. ;)

>  	dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len,
>  	     (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
> @@ -870,7 +871,6 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
>  		struct page **pages;
>  		int num_pages;
>  		size_t page_off;
> -		u64 i_size;
>  		bool more;
>  		int idx;
>  		size_t left;
> @@ -909,7 +909,6 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
>  
>  		ceph_osdc_put_request(req);
>  
> -		i_size = i_size_read(inode);
>  		dout("sync_read %llu~%llu got %zd i_size %llu%s\n",
>  		     off, len, ret, i_size, (more ? " MORE" : ""));
>  
> @@ -954,10 +953,15 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
>  
>  	if (off > iocb->ki_pos) {
>  		if (ret >= 0 &&

Do we need to check ret here? I think that if ret < 0, then "off" must
be smaller than "i_size", no?

> -		    iov_iter_count(to) > 0 && off >= i_size_read(inode))
> +		    iov_iter_count(to) > 0 &&
> +		    off >= i_size_read(inode)) {
>  			*retry_op = CHECK_EOF;
> -		ret = off - iocb->ki_pos;
> -		iocb->ki_pos = off;
> +			ret = i_size - iocb->ki_pos;
> +			iocb->ki_pos = i_size;
> +		} else {
> +			ret = off - iocb->ki_pos;
> +			iocb->ki_pos = off;
> +		}
>  	}
>  
>  	dout("sync_read result %zd retry_op %d\n", ret, *retry_op);
Xiubo Li Oct. 20, 2021, 2:04 a.m. UTC | #3
On 10/19/21 8:59 PM, Jeff Layton wrote:
> On Tue, 2021-10-19 at 19:51 +0800, xiubli@redhat.com wrote:
>> From: Xiubo Li <xiubli@redhat.com>
>>
>> At the same time set the ki_pos to the file size.
>>
> It would be good to put the comments in your follow up email into the
> patch description here, so that it explains what you're fixing and why.
Sure.
>
>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>> ---
>>   fs/ceph/file.c | 14 +++++++++-----
>>   1 file changed, 9 insertions(+), 5 deletions(-)
>>
>> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
>> index 91173d3aa161..1abc3b591740 100644
>> --- a/fs/ceph/file.c
>> +++ b/fs/ceph/file.c
>> @@ -847,6 +847,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
>>   	ssize_t ret;
>>   	u64 off = iocb->ki_pos;
>>   	u64 len = iov_iter_count(to);
>> +	u64 i_size = i_size_read(inode);
>>   
> Was there a reason to change where the i_size is fetched, or did you
> just not see the point in fetching it on each loop? I wonder if we can
> hit any races vs. truncates with this?

 From my reading from the code, it shouldn't.

While doing the truncate it will down_write(&inode->i_rwsem). And the 
sync read will hold down_read() it. It should be safe.

>
> Oh well, all of this non-buffered I/O seems somewhat racy anyway. ;)
>
>>   	dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len,
>>   	     (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
>> @@ -870,7 +871,6 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
>>   		struct page **pages;
>>   		int num_pages;
>>   		size_t page_off;
>> -		u64 i_size;
>>   		bool more;
>>   		int idx;
>>   		size_t left;
>> @@ -909,7 +909,6 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
>>   
>>   		ceph_osdc_put_request(req);
>>   
>> -		i_size = i_size_read(inode);
>>   		dout("sync_read %llu~%llu got %zd i_size %llu%s\n",
>>   		     off, len, ret, i_size, (more ? " MORE" : ""));
>>   
>> @@ -954,10 +953,15 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
>>   
>>   	if (off > iocb->ki_pos) {
>>   		if (ret >= 0 &&
> Do we need to check ret here?

It seems this check makes no sense even in the following case I 
mentioned. Will check it more and try to remove it.


> I think that if ret < 0, then "off" must
> be smaller than "i_size", no?

 From current code, there has one case that it's no, such as if the file 
size is 10, and the ceph may return 4K contents and then the read length 
will be 4K too, and if it just fails in case:

                         copied = copy_page_to_iter(pages[idx++],
                                                    page_off, len, to);
                         off += copied;
                         left -= copied;
                         if (copied < len) {
                                 ret = -EFAULT;
                                 break;
                         }

And if the "copied = 1K" for some reasons, the "off" will be larger than 
the "i_size" but ret < 0.

BRs

Xiubo



>
>> -		    iov_iter_count(to) > 0 && off >= i_size_read(inode))
>> +		    iov_iter_count(to) > 0 &&
>> +		    off >= i_size_read(inode)) {
>>   			*retry_op = CHECK_EOF;
>> -		ret = off - iocb->ki_pos;
>> -		iocb->ki_pos = off;
>> +			ret = i_size - iocb->ki_pos;
>> +			iocb->ki_pos = i_size;
>> +		} else {
>> +			ret = off - iocb->ki_pos;
>> +			iocb->ki_pos = off;
>> +		}
>>   	}
>>   
>>   	dout("sync_read result %zd retry_op %d\n", ret, *retry_op);
Xiubo Li Oct. 20, 2021, 2:15 a.m. UTC | #4
On 10/20/21 10:04 AM, Xiubo Li wrote:
>
> On 10/19/21 8:59 PM, Jeff Layton wrote:
>> On Tue, 2021-10-19 at 19:51 +0800, xiubli@redhat.com wrote:
>>> From: Xiubo Li <xiubli@redhat.com>
>>>
>>> At the same time set the ki_pos to the file size.
>>>
>> It would be good to put the comments in your follow up email into the
>> patch description here, so that it explains what you're fixing and why.
> Sure.
>>
>>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>>> ---
>>>   fs/ceph/file.c | 14 +++++++++-----
>>>   1 file changed, 9 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
>>> index 91173d3aa161..1abc3b591740 100644
>>> --- a/fs/ceph/file.c
>>> +++ b/fs/ceph/file.c
>>> @@ -847,6 +847,7 @@ static ssize_t ceph_sync_read(struct kiocb 
>>> *iocb, struct iov_iter *to,
>>>       ssize_t ret;
>>>       u64 off = iocb->ki_pos;
>>>       u64 len = iov_iter_count(to);
>>> +    u64 i_size = i_size_read(inode);
>> Was there a reason to change where the i_size is fetched, or did you
>> just not see the point in fetching it on each loop? I wonder if we can
>> hit any races vs. truncates with this?
>
> From my reading from the code, it shouldn't.
>
> While doing the truncate it will down_write(&inode->i_rwsem). And the 
> sync read will hold down_read() it. It should be safe.
>
And also this should be safe if the truncate is in a different kclient:

When the MDS received the truncate request, it must revoke the 'Fr' caps 
from all the kclients first. So if another kclient is still reading the 
file, the truncate will be pause.


>>
>> Oh well, all of this non-buffered I/O seems somewhat racy anyway. ;)
>>
>>>       dout("sync_read on file %p %llu~%u %s\n", file, off, 
>>> (unsigned)len,
>>>            (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
>>> @@ -870,7 +871,6 @@ static ssize_t ceph_sync_read(struct kiocb 
>>> *iocb, struct iov_iter *to,
>>>           struct page **pages;
>>>           int num_pages;
>>>           size_t page_off;
>>> -        u64 i_size;
>>>           bool more;
>>>           int idx;
>>>           size_t left;
>>> @@ -909,7 +909,6 @@ static ssize_t ceph_sync_read(struct kiocb 
>>> *iocb, struct iov_iter *to,
>>>             ceph_osdc_put_request(req);
>>>   -        i_size = i_size_read(inode);
>>>           dout("sync_read %llu~%llu got %zd i_size %llu%s\n",
>>>                off, len, ret, i_size, (more ? " MORE" : ""));
>>>   @@ -954,10 +953,15 @@ static ssize_t ceph_sync_read(struct kiocb 
>>> *iocb, struct iov_iter *to,
>>>         if (off > iocb->ki_pos) {
>>>           if (ret >= 0 &&
>> Do we need to check ret here?
>
> It seems this check makes no sense even in the following case I 
> mentioned. Will check it more and try to remove it.
>
>
>> I think that if ret < 0, then "off" must
>> be smaller than "i_size", no?
>
> From current code, there has one case that it's no, such as if the 
> file size is 10, and the ceph may return 4K contents and then the read 
> length will be 4K too, and if it just fails in case:
>
>                         copied = copy_page_to_iter(pages[idx++],
>                                                    page_off, len, to);
>                         off += copied;
>                         left -= copied;
>                         if (copied < len) {
>                                 ret = -EFAULT;
>                                 break;
>                         }
>
> And if the "copied = 1K" for some reasons, the "off" will be larger 
> than the "i_size" but ret < 0.
>
> BRs
>
> Xiubo
>
>
>
>>
>>> -            iov_iter_count(to) > 0 && off >= i_size_read(inode))
>>> +            iov_iter_count(to) > 0 &&
>>> +            off >= i_size_read(inode)) {
>>>               *retry_op = CHECK_EOF;
>>> -        ret = off - iocb->ki_pos;
>>> -        iocb->ki_pos = off;
>>> +            ret = i_size - iocb->ki_pos;
>>> +            iocb->ki_pos = i_size;
>>> +        } else {
>>> +            ret = off - iocb->ki_pos;
>>> +            iocb->ki_pos = off;
>>> +        }
>>>       }
>>>         dout("sync_read result %zd retry_op %d\n", ret, *retry_op);
diff mbox series

Patch

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 91173d3aa161..1abc3b591740 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -847,6 +847,7 @@  static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
 	ssize_t ret;
 	u64 off = iocb->ki_pos;
 	u64 len = iov_iter_count(to);
+	u64 i_size = i_size_read(inode);
 
 	dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len,
 	     (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
@@ -870,7 +871,6 @@  static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
 		struct page **pages;
 		int num_pages;
 		size_t page_off;
-		u64 i_size;
 		bool more;
 		int idx;
 		size_t left;
@@ -909,7 +909,6 @@  static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
 
 		ceph_osdc_put_request(req);
 
-		i_size = i_size_read(inode);
 		dout("sync_read %llu~%llu got %zd i_size %llu%s\n",
 		     off, len, ret, i_size, (more ? " MORE" : ""));
 
@@ -954,10 +953,15 @@  static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
 
 	if (off > iocb->ki_pos) {
 		if (ret >= 0 &&
-		    iov_iter_count(to) > 0 && off >= i_size_read(inode))
+		    iov_iter_count(to) > 0 &&
+		    off >= i_size_read(inode)) {
 			*retry_op = CHECK_EOF;
-		ret = off - iocb->ki_pos;
-		iocb->ki_pos = off;
+			ret = i_size - iocb->ki_pos;
+			iocb->ki_pos = i_size;
+		} else {
+			ret = off - iocb->ki_pos;
+			iocb->ki_pos = off;
+		}
 	}
 
 	dout("sync_read result %zd retry_op %d\n", ret, *retry_op);