Message ID | 20230601145904.1385409-4-hch@lst.de (mailing list archive) |
---|---|
State | Accepted |
Commit | 182c25e9c157f37bd0ab5a82fe2417e2223df459 |
Headers | show |
Series | [f2fs-dev,01/12] backing_dev: remove current->backing_dev_info | expand |
On Thu, Jun 01, 2023 at 04:58:55PM +0200, Christoph Hellwig wrote: > All callers of generic_perform_write need to updated ki_pos, move it into > common code. > @@ -4034,7 +4037,6 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) > endbyte = pos + status - 1; > err = filemap_write_and_wait_range(mapping, pos, endbyte); > if (err == 0) { > - iocb->ki_pos = endbyte + 1; > written += status; > invalidate_mapping_pages(mapping, > pos >> PAGE_SHIFT, > @@ -4047,8 +4049,6 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) > } > } else { > written = generic_perform_write(iocb, from); > - if (likely(written > 0)) > - iocb->ki_pos += written; > } > out: > return written ? written : err; [another late reply, sorry] That part is somewhat fishy - there's a case where you return a positive value and advance ->ki_pos by more than that amount. I really wonder if all callers of ->write_iter() are OK with that. Consider e.g. this: ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { loff_t pos, *ppos = file_ppos(f.file); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_write(f.file, buf, count, ppos); if (ret >= 0 && ppos) f.file->f_pos = pos; fdput_pos(f); } return ret; } ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { ssize_t ret; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; if (unlikely(!access_ok(buf, count))) return -EFAULT; ret = rw_verify_area(WRITE, file, pos, count); if (ret) return ret; if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; file_start_write(file); if (file->f_op->write) ret = file->f_op->write(file, buf, count, pos); else if (file->f_op->write_iter) ret = new_sync_write(file, buf, count, pos); else ret = -EINVAL; if (ret > 0) { fsnotify_modify(file); add_wchar(current, ret); } inc_syscw(current); file_end_write(file); return ret; } static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos) { struct kiocb kiocb; struct iov_iter iter; ssize_t ret; init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = (ppos ? *ppos : 0); iov_iter_ubuf(&iter, ITER_SOURCE, (void __user *)buf, len); ret = call_write_iter(filp, &kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); if (ret > 0 && ppos) *ppos = kiocb.ki_pos; return ret; } Suppose ->write_iter() ends up doing returning a positive value smaller than the increment of kiocb.ki_pos. What do we get? ret is positive, so kiocb.ki_pos gets copied into *ppos, which is ksys_write's pos and there we copy it into file->f_pos. Is it really OK to have write() return 4096 and advance the file position by 16K? AFAICS, userland wouldn't get any indication of something odd going on - just a short write to a regular file, with followup write of remaining 12K getting quietly written in the range 16K..28K. I don't remember what POSIX says about that, but it would qualify as nasty surprise for any userland program - sure, one can check fsync() results before closing the sucker and see if everything looks fine, but the way it's usually discussed could easily lead to assumption that (synchronous) O_DIRECT writes would not be affected by anything of that sort.
On Sun, Aug 27, 2023 at 08:41:22PM +0100, Al Viro wrote: > On Thu, Jun 01, 2023 at 04:58:55PM +0200, Christoph Hellwig wrote: > > All callers of generic_perform_write need to updated ki_pos, move it into > > common code. > > > @@ -4034,7 +4037,6 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) > > endbyte = pos + status - 1; > > err = filemap_write_and_wait_range(mapping, pos, endbyte); > > if (err == 0) { > > - iocb->ki_pos = endbyte + 1; > > written += status; > > invalidate_mapping_pages(mapping, > > pos >> PAGE_SHIFT, > > @@ -4047,8 +4049,6 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) > > } > > } else { > > written = generic_perform_write(iocb, from); > > - if (likely(written > 0)) > > - iocb->ki_pos += written; > > } > > out: > > return written ? written : err; > > [another late reply, sorry] > > That part is somewhat fishy - there's a case where you return a positive value > and advance ->ki_pos by more than that amount. I really wonder if all callers > of ->write_iter() are OK with that. Consider e.g. this: > > ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count) > { > struct fd f = fdget_pos(fd); > ssize_t ret = -EBADF; > > if (f.file) { > loff_t pos, *ppos = file_ppos(f.file); > if (ppos) { > pos = *ppos; > ppos = &pos; > } > ret = vfs_write(f.file, buf, count, ppos); > if (ret >= 0 && ppos) > f.file->f_pos = pos; > fdput_pos(f); > } > > return ret; > } > > ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) > { > ssize_t ret; > > if (!(file->f_mode & FMODE_WRITE)) > return -EBADF; > if (!(file->f_mode & FMODE_CAN_WRITE)) > return -EINVAL; > if (unlikely(!access_ok(buf, count))) > return -EFAULT; > > ret = rw_verify_area(WRITE, file, pos, count); > if (ret) > return ret; > if (count > MAX_RW_COUNT) > count = MAX_RW_COUNT; > file_start_write(file); > if (file->f_op->write) > ret = file->f_op->write(file, buf, count, pos); > else if (file->f_op->write_iter) > ret = new_sync_write(file, buf, count, pos); > else > ret = -EINVAL; > if (ret > 0) { > fsnotify_modify(file); > add_wchar(current, ret); > } > inc_syscw(current); > file_end_write(file); > return ret; > } > > static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos) > { > struct kiocb kiocb; > struct iov_iter iter; > ssize_t ret; > > init_sync_kiocb(&kiocb, filp); > kiocb.ki_pos = (ppos ? *ppos : 0); > iov_iter_ubuf(&iter, ITER_SOURCE, (void __user *)buf, len); > > ret = call_write_iter(filp, &kiocb, &iter); > BUG_ON(ret == -EIOCBQUEUED); > if (ret > 0 && ppos) > *ppos = kiocb.ki_pos; > return ret; > } > > Suppose ->write_iter() ends up doing returning a positive value smaller than > the increment of kiocb.ki_pos. What do we get? ret is positive, so > kiocb.ki_pos gets copied into *ppos, which is ksys_write's pos and there > we copy it into file->f_pos. > > Is it really OK to have write() return 4096 and advance the file position > by 16K? AFAICS, userland wouldn't get any indication of something > odd going on - just a short write to a regular file, with followup write > of remaining 12K getting quietly written in the range 16K..28K. > > I don't remember what POSIX says about that, but it would qualify as > nasty surprise for any userland program - sure, one can check fsync() > results before closing the sucker and see if everything looks fine, > but the way it's usually discussed could easily lead to assumption that > (synchronous) O_DIRECT writes would not be affected by anything of that > sort. IOW, I suspect that the right thing to do would be something along the lines of direct_write_fallback(): on error revert the ->ki_pos update from buffered write If we fail filemap_write_and_wait_range() on the range the buffered write went into, we only report the "number of bytes which we direct-written", to quote the comment in there. Which is fine, but buffered write has already advanced iocb->ki_pos, so we need to roll that back. Otherwise we end up with e.g. write(2) advancing position by more than the amount it reports having written. Fixes: 182c25e9c157 "filemap: update ki_pos in generic_perform_write" Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- diff --git a/fs/libfs.c b/fs/libfs.c index 5b851315eeed..712c57828c0e 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1646,6 +1646,7 @@ ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter, * We don't know how much we wrote, so just return the number of * bytes which were direct-written */ + iocb->ki_pos -= buffered_written; if (direct_written) return direct_written; return err;
On Sun, Aug 27, 2023 at 08:41:22PM +0100, Al Viro wrote: > That part is somewhat fishy - there's a case where you return a positive value > and advance ->ki_pos by more than that amount. I really wonder if all callers > of ->write_iter() are OK with that. Speaking of which, in case of negative return value we'd better *not* use ->ki_pos; consider e.g. generic_file_write_iter() with O_DSYNC and vfs_fsync_range() failure. An error gets returned, but ->ki_pos is left advanced. Normal write(2) is fine - it will only update file->f_pos if ->write_iter() has returned a non-negative. However, io_uring kiocb_done() starts with if (req->flags & REQ_F_CUR_POS) req->file->f_pos = rw->kiocb.ki_pos; if (ret >= 0 && (rw->kiocb.ki_complete == io_complete_rw)) { if (!__io_complete_rw_common(req, ret)) { /* * Safe to call io_end from here as we're inline * from the submission path. */ io_req_io_end(req); io_req_set_res(req, final_ret, io_put_kbuf(req, issue_flags)); return IOU_OK; } } else { io_rw_done(&rw->kiocb, ret); } Note that ->f_pos update is *NOT* conditional upon ret >= 0 - it happens no matter what, provided that original request had ->kiocb.ki_pos equal to -1 (on a non-FMODE_STREAM file). Jens, is there any reason for doing that unconditionally? IMO it's a bad idea - there's a wide scope for fuckups that way, especially since write(2) is not sensitive to that and this use of -1 ki_pos is not particularly encouraged on io_uring side either, AFAICT. Worse, it's handling of failure exits in the first place, which already gets little testing...
On Sun, Aug 27, 2023 at 08:41:22PM +0100, Al Viro wrote: > That part is somewhat fishy - there's a case where you return a positive value > and advance ->ki_pos by more than that amount. I really wonder if all callers > of ->write_iter() are OK with that. Consider e.g. this: This should not exist in the latest version merged by Jens. Can you check if you still see issues in the version in the block tree or linux-next. > Suppose ->write_iter() ends up doing returning a positive value smaller than > the increment of kiocb.ki_pos. What do we get? ret is positive, so > kiocb.ki_pos gets copied into *ppos, which is ksys_write's pos and there > we copy it into file->f_pos. > > Is it really OK to have write() return 4096 and advance the file position > by 16K? AFAICS, userland wouldn't get any indication of something > odd going on - just a short write to a regular file, with followup write > of remaining 12K getting quietly written in the range 16K..28K. > > I don't remember what POSIX says about that, but it would qualify as > nasty surprise for any userland program - sure, one can check fsync() > results before closing the sucker and see if everything looks fine, > but the way it's usually discussed could easily lead to assumption that > (synchronous) O_DIRECT writes would not be affected by anything of that > sort. ki_pos should always be updated by the write return value. Everything else is a bug.
On Sun, Aug 27, 2023 at 10:45:18PM +0100, Al Viro wrote: > IOW, I suspect that the right thing to do would be something along the lines > of The idea looks sensible to me, but we'll also need to do it for the filemap_write_and_wait_range failure case.
On Mon, Aug 28, 2023 at 02:32:59PM +0200, Christoph Hellwig wrote: > On Sun, Aug 27, 2023 at 10:45:18PM +0100, Al Viro wrote: > > IOW, I suspect that the right thing to do would be something along the lines > > of > > The idea looks sensible to me, but we'll also need to do it for the > filemap_write_and_wait_range failure case. Huh? That's precisely where this patch is doing that... That function in mainline is if (unlikely(buffered_written < 0)) { if (direct_written) return direct_written; return buffered_written; } /* * We need to ensure that the page cache pages are written to disk and * invalidated to preserve the expected O_DIRECT semantics. */ err = filemap_write_and_wait_range(mapping, pos, end); if (err < 0) { /* * We don't know how much we wrote, so just return the number of * bytes which were direct-written */ if (direct_written) return direct_written; return err; } invalidate_mapping_pages(mapping, pos >> PAGE_SHIFT, end >> PAGE_SHIFT); return direct_written + buffered_written; The first failure exit does not need any work - the caller had not bumped ->ki_pos; the second one (after that 'if (err < 0) {' line) does and that's where the patch upthread adds iocb->ki_pos -= buffered_written. Or am I completely misparsing what you've written?
On Mon, Aug 28, 2023 at 02:30:23PM +0200, Christoph Hellwig wrote: > On Sun, Aug 27, 2023 at 08:41:22PM +0100, Al Viro wrote: > > That part is somewhat fishy - there's a case where you return a positive value > > and advance ->ki_pos by more than that amount. I really wonder if all callers > > of ->write_iter() are OK with that. Consider e.g. this: > > This should not exist in the latest version merged by Jens. Can you > check if you still see issues in the version in the block tree or > linux-next. Still does - the problem has migrated into direct_write_fallback(), but that hadn't changed the situation. We are still left with ->ki_pos bumped by generic_perform_write() (evaluated as an argument of direct_write_fallback() now) and *not* retraced in case when direct_write_fallback() decides to discard the buffered write result. Both in -next and in mainline (since 6.5-rc1).
On Mon, Aug 28, 2023 at 02:56:15PM +0100, Al Viro wrote: > The first failure exit does not need any work - the caller had not bumped > ->ki_pos; the second one (after that 'if (err < 0) {' line) does and that's > where the patch upthread adds iocb->ki_pos -= buffered_written. > > Or am I completely misparsing what you've written? No, I misread the patch. Looks good: Acked-by: Christoph Hellwig <hch@lst.de>
> direct_write_fallback(): on error revert the ->ki_pos update from buffered write
Al, Christian: can you send this fix on top Linus?
On Wed, Sep 13, 2023 at 01:00:10PM +0200, Christoph Hellwig wrote: > > direct_write_fallback(): on error revert the ->ki_pos update from buffered write > > Al, Christian: can you send this fix on top Linus? Wasn't aware of this, sorry. I've picked it up and placed it with another set of small fixes I already have. I'm happy to have Al take it ofc.
diff --git a/fs/ceph/file.c b/fs/ceph/file.c index c8ef72f723badd..767f4dfe7def64 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1891,8 +1891,6 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) * can not run at the same time */ written = generic_perform_write(iocb, from); - if (likely(written >= 0)) - iocb->ki_pos = pos + written; ceph_end_io_write(inode); } diff --git a/fs/ext4/file.c b/fs/ext4/file.c index bc430270c23c19..ea0ada3985cba2 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -289,12 +289,9 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb, out: inode_unlock(inode); - if (likely(ret > 0)) { - iocb->ki_pos += ret; - ret = generic_write_sync(iocb, ret); - } - - return ret; + if (unlikely(ret <= 0)) + return ret; + return generic_write_sync(iocb, ret); } static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset, diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 4f423d367a44b9..7134fe8bd008cb 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4520,7 +4520,6 @@ static ssize_t f2fs_buffered_write_iter(struct kiocb *iocb, ret = generic_perform_write(iocb, from); if (ret > 0) { - iocb->ki_pos += ret; f2fs_update_iostat(F2FS_I_SB(inode), inode, APP_BUFFERED_IO, ret); } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 665ce3fc62eaf4..e8bb4c48a3210a 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -655,7 +655,6 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) goto out; written = result; - iocb->ki_pos += written; nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); if (mntflags & NFS_MOUNT_WRITE_EAGER) { diff --git a/mm/filemap.c b/mm/filemap.c index 33b54660ad2b39..15907af4a57ff5 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3957,7 +3957,10 @@ ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i) balance_dirty_pages_ratelimited(mapping); } while (iov_iter_count(i)); - return written ? written : status; + if (!written) + return status; + iocb->ki_pos += written; + return written; } EXPORT_SYMBOL(generic_perform_write); @@ -4034,7 +4037,6 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) endbyte = pos + status - 1; err = filemap_write_and_wait_range(mapping, pos, endbyte); if (err == 0) { - iocb->ki_pos = endbyte + 1; written += status; invalidate_mapping_pages(mapping, pos >> PAGE_SHIFT, @@ -4047,8 +4049,6 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) } } else { written = generic_perform_write(iocb, from); - if (likely(written > 0)) - iocb->ki_pos += written; } out: return written ? written : err;