Message ID | 20211019115138.414187-1-xiubli@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | ceph: return the real size readed when hit EOF | expand |
Without this, such as in case for the file encrypt feature, when doing the following test: 1), echo "1234567890" > dir/a.txt 2), vim dir/a.txt It will always show the zeroed contents after string "1234567890", something like: "1234567890@@@@@...." On 10/19/21 7:51 PM, xiubli@redhat.com wrote: > From: Xiubo Li <xiubli@redhat.com> > > At the same time set the ki_pos to the file size. > > Signed-off-by: Xiubo Li <xiubli@redhat.com> > --- > fs/ceph/file.c | 14 +++++++++----- > 1 file changed, 9 insertions(+), 5 deletions(-) > > diff --git a/fs/ceph/file.c b/fs/ceph/file.c > index 91173d3aa161..1abc3b591740 100644 > --- a/fs/ceph/file.c > +++ b/fs/ceph/file.c > @@ -847,6 +847,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, > ssize_t ret; > u64 off = iocb->ki_pos; > u64 len = iov_iter_count(to); > + u64 i_size = i_size_read(inode); > > dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len, > (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); > @@ -870,7 +871,6 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, > struct page **pages; > int num_pages; > size_t page_off; > - u64 i_size; > bool more; > int idx; > size_t left; > @@ -909,7 +909,6 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, > > ceph_osdc_put_request(req); > > - i_size = i_size_read(inode); > dout("sync_read %llu~%llu got %zd i_size %llu%s\n", > off, len, ret, i_size, (more ? " MORE" : "")); > > @@ -954,10 +953,15 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, > > if (off > iocb->ki_pos) { > if (ret >= 0 && > - iov_iter_count(to) > 0 && off >= i_size_read(inode)) > + iov_iter_count(to) > 0 && > + off >= i_size_read(inode)) { > *retry_op = CHECK_EOF; > - ret = off - iocb->ki_pos; > - iocb->ki_pos = off; > + ret = i_size - iocb->ki_pos; > + iocb->ki_pos = i_size; > + } else { > + ret = off - iocb->ki_pos; > + iocb->ki_pos = off; > + } > } > > dout("sync_read result %zd retry_op %d\n", ret, *retry_op);
On Tue, 2021-10-19 at 19:51 +0800, xiubli@redhat.com wrote: > From: Xiubo Li <xiubli@redhat.com> > > At the same time set the ki_pos to the file size. > It would be good to put the comments in your follow up email into the patch description here, so that it explains what you're fixing and why. > Signed-off-by: Xiubo Li <xiubli@redhat.com> > --- > fs/ceph/file.c | 14 +++++++++----- > 1 file changed, 9 insertions(+), 5 deletions(-) > > diff --git a/fs/ceph/file.c b/fs/ceph/file.c > index 91173d3aa161..1abc3b591740 100644 > --- a/fs/ceph/file.c > +++ b/fs/ceph/file.c > @@ -847,6 +847,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, > ssize_t ret; > u64 off = iocb->ki_pos; > u64 len = iov_iter_count(to); > + u64 i_size = i_size_read(inode); > Was there a reason to change where the i_size is fetched, or did you just not see the point in fetching it on each loop? I wonder if we can hit any races vs. truncates with this? Oh well, all of this non-buffered I/O seems somewhat racy anyway. ;) > dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len, > (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); > @@ -870,7 +871,6 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, > struct page **pages; > int num_pages; > size_t page_off; > - u64 i_size; > bool more; > int idx; > size_t left; > @@ -909,7 +909,6 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, > > ceph_osdc_put_request(req); > > - i_size = i_size_read(inode); > dout("sync_read %llu~%llu got %zd i_size %llu%s\n", > off, len, ret, i_size, (more ? " MORE" : "")); > > @@ -954,10 +953,15 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, > > if (off > iocb->ki_pos) { > if (ret >= 0 && Do we need to check ret here? I think that if ret < 0, then "off" must be smaller than "i_size", no? > - iov_iter_count(to) > 0 && off >= i_size_read(inode)) > + iov_iter_count(to) > 0 && > + off >= i_size_read(inode)) { > *retry_op = CHECK_EOF; > - ret = off - iocb->ki_pos; > - iocb->ki_pos = off; > + ret = i_size - iocb->ki_pos; > + iocb->ki_pos = i_size; > + } else { > + ret = off - iocb->ki_pos; > + iocb->ki_pos = off; > + } > } > > dout("sync_read result %zd retry_op %d\n", ret, *retry_op);
On 10/19/21 8:59 PM, Jeff Layton wrote: > On Tue, 2021-10-19 at 19:51 +0800, xiubli@redhat.com wrote: >> From: Xiubo Li <xiubli@redhat.com> >> >> At the same time set the ki_pos to the file size. >> > It would be good to put the comments in your follow up email into the > patch description here, so that it explains what you're fixing and why. Sure. > >> Signed-off-by: Xiubo Li <xiubli@redhat.com> >> --- >> fs/ceph/file.c | 14 +++++++++----- >> 1 file changed, 9 insertions(+), 5 deletions(-) >> >> diff --git a/fs/ceph/file.c b/fs/ceph/file.c >> index 91173d3aa161..1abc3b591740 100644 >> --- a/fs/ceph/file.c >> +++ b/fs/ceph/file.c >> @@ -847,6 +847,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, >> ssize_t ret; >> u64 off = iocb->ki_pos; >> u64 len = iov_iter_count(to); >> + u64 i_size = i_size_read(inode); >> > Was there a reason to change where the i_size is fetched, or did you > just not see the point in fetching it on each loop? I wonder if we can > hit any races vs. truncates with this? From my reading from the code, it shouldn't. While doing the truncate it will down_write(&inode->i_rwsem). And the sync read will hold down_read() it. It should be safe. > > Oh well, all of this non-buffered I/O seems somewhat racy anyway. ;) > >> dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len, >> (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); >> @@ -870,7 +871,6 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, >> struct page **pages; >> int num_pages; >> size_t page_off; >> - u64 i_size; >> bool more; >> int idx; >> size_t left; >> @@ -909,7 +909,6 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, >> >> ceph_osdc_put_request(req); >> >> - i_size = i_size_read(inode); >> dout("sync_read %llu~%llu got %zd i_size %llu%s\n", >> off, len, ret, i_size, (more ? " MORE" : "")); >> >> @@ -954,10 +953,15 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, >> >> if (off > iocb->ki_pos) { >> if (ret >= 0 && > Do we need to check ret here? It seems this check makes no sense even in the following case I mentioned. Will check it more and try to remove it. > I think that if ret < 0, then "off" must > be smaller than "i_size", no? From current code, there has one case that it's no, such as if the file size is 10, and the ceph may return 4K contents and then the read length will be 4K too, and if it just fails in case: copied = copy_page_to_iter(pages[idx++], page_off, len, to); off += copied; left -= copied; if (copied < len) { ret = -EFAULT; break; } And if the "copied = 1K" for some reasons, the "off" will be larger than the "i_size" but ret < 0. BRs Xiubo > >> - iov_iter_count(to) > 0 && off >= i_size_read(inode)) >> + iov_iter_count(to) > 0 && >> + off >= i_size_read(inode)) { >> *retry_op = CHECK_EOF; >> - ret = off - iocb->ki_pos; >> - iocb->ki_pos = off; >> + ret = i_size - iocb->ki_pos; >> + iocb->ki_pos = i_size; >> + } else { >> + ret = off - iocb->ki_pos; >> + iocb->ki_pos = off; >> + } >> } >> >> dout("sync_read result %zd retry_op %d\n", ret, *retry_op);
On 10/20/21 10:04 AM, Xiubo Li wrote: > > On 10/19/21 8:59 PM, Jeff Layton wrote: >> On Tue, 2021-10-19 at 19:51 +0800, xiubli@redhat.com wrote: >>> From: Xiubo Li <xiubli@redhat.com> >>> >>> At the same time set the ki_pos to the file size. >>> >> It would be good to put the comments in your follow up email into the >> patch description here, so that it explains what you're fixing and why. > Sure. >> >>> Signed-off-by: Xiubo Li <xiubli@redhat.com> >>> --- >>> fs/ceph/file.c | 14 +++++++++----- >>> 1 file changed, 9 insertions(+), 5 deletions(-) >>> >>> diff --git a/fs/ceph/file.c b/fs/ceph/file.c >>> index 91173d3aa161..1abc3b591740 100644 >>> --- a/fs/ceph/file.c >>> +++ b/fs/ceph/file.c >>> @@ -847,6 +847,7 @@ static ssize_t ceph_sync_read(struct kiocb >>> *iocb, struct iov_iter *to, >>> ssize_t ret; >>> u64 off = iocb->ki_pos; >>> u64 len = iov_iter_count(to); >>> + u64 i_size = i_size_read(inode); >> Was there a reason to change where the i_size is fetched, or did you >> just not see the point in fetching it on each loop? I wonder if we can >> hit any races vs. truncates with this? > > From my reading from the code, it shouldn't. > > While doing the truncate it will down_write(&inode->i_rwsem). And the > sync read will hold down_read() it. It should be safe. > And also this should be safe if the truncate is in a different kclient: When the MDS received the truncate request, it must revoke the 'Fr' caps from all the kclients first. So if another kclient is still reading the file, the truncate will be pause. >> >> Oh well, all of this non-buffered I/O seems somewhat racy anyway. ;) >> >>> dout("sync_read on file %p %llu~%u %s\n", file, off, >>> (unsigned)len, >>> (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); >>> @@ -870,7 +871,6 @@ static ssize_t ceph_sync_read(struct kiocb >>> *iocb, struct iov_iter *to, >>> struct page **pages; >>> int num_pages; >>> size_t page_off; >>> - u64 i_size; >>> bool more; >>> int idx; >>> size_t left; >>> @@ -909,7 +909,6 @@ static ssize_t ceph_sync_read(struct kiocb >>> *iocb, struct iov_iter *to, >>> ceph_osdc_put_request(req); >>> - i_size = i_size_read(inode); >>> dout("sync_read %llu~%llu got %zd i_size %llu%s\n", >>> off, len, ret, i_size, (more ? " MORE" : "")); >>> @@ -954,10 +953,15 @@ static ssize_t ceph_sync_read(struct kiocb >>> *iocb, struct iov_iter *to, >>> if (off > iocb->ki_pos) { >>> if (ret >= 0 && >> Do we need to check ret here? > > It seems this check makes no sense even in the following case I > mentioned. Will check it more and try to remove it. > > >> I think that if ret < 0, then "off" must >> be smaller than "i_size", no? > > From current code, there has one case that it's no, such as if the > file size is 10, and the ceph may return 4K contents and then the read > length will be 4K too, and if it just fails in case: > > copied = copy_page_to_iter(pages[idx++], > page_off, len, to); > off += copied; > left -= copied; > if (copied < len) { > ret = -EFAULT; > break; > } > > And if the "copied = 1K" for some reasons, the "off" will be larger > than the "i_size" but ret < 0. > > BRs > > Xiubo > > > >> >>> - iov_iter_count(to) > 0 && off >= i_size_read(inode)) >>> + iov_iter_count(to) > 0 && >>> + off >= i_size_read(inode)) { >>> *retry_op = CHECK_EOF; >>> - ret = off - iocb->ki_pos; >>> - iocb->ki_pos = off; >>> + ret = i_size - iocb->ki_pos; >>> + iocb->ki_pos = i_size; >>> + } else { >>> + ret = off - iocb->ki_pos; >>> + iocb->ki_pos = off; >>> + } >>> } >>> dout("sync_read result %zd retry_op %d\n", ret, *retry_op);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 91173d3aa161..1abc3b591740 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -847,6 +847,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, ssize_t ret; u64 off = iocb->ki_pos; u64 len = iov_iter_count(to); + u64 i_size = i_size_read(inode); dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); @@ -870,7 +871,6 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, struct page **pages; int num_pages; size_t page_off; - u64 i_size; bool more; int idx; size_t left; @@ -909,7 +909,6 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, ceph_osdc_put_request(req); - i_size = i_size_read(inode); dout("sync_read %llu~%llu got %zd i_size %llu%s\n", off, len, ret, i_size, (more ? " MORE" : "")); @@ -954,10 +953,15 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, if (off > iocb->ki_pos) { if (ret >= 0 && - iov_iter_count(to) > 0 && off >= i_size_read(inode)) + iov_iter_count(to) > 0 && + off >= i_size_read(inode)) { *retry_op = CHECK_EOF; - ret = off - iocb->ki_pos; - iocb->ki_pos = off; + ret = i_size - iocb->ki_pos; + iocb->ki_pos = i_size; + } else { + ret = off - iocb->ki_pos; + iocb->ki_pos = off; + } } dout("sync_read result %zd retry_op %d\n", ret, *retry_op);