Message ID | 20230519125705.598234-9-amir73il@gmail.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | fuse: Add support for passthrough read/write | expand |
On Fri, May 19, 2023 at 8:59 PM Amir Goldstein <amir73il@gmail.com> wrote: > > Similar to update size/mtime at the end of fuse_perform_write(), > we need to bump the attr version when we update the inode size. > > Signed-off-by: Amir Goldstein <amir73il@gmail.com> > --- > fs/fuse/passthrough.c | 53 ++++++++++++++++++++++++++++++++++--------- > 1 file changed, 42 insertions(+), 11 deletions(-) > > diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c > index 10b370bcc423..8352d6b91e0e 100644 > --- a/fs/fuse/passthrough.c > +++ b/fs/fuse/passthrough.c > @@ -14,15 +14,42 @@ struct fuse_aio_req { > struct kiocb *iocb_fuse; > }; > > -static void fuse_aio_cleanup_handler(struct fuse_aio_req *aio_req) > +static void fuse_file_start_write(struct file *fuse_file, > + struct file *backing_file, > + loff_t pos, size_t count) > +{ > + struct inode *inode = file_inode(fuse_file); > + struct fuse_inode *fi = get_fuse_inode(inode); > + > + if (inode->i_size < pos + count) > + set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); > + > + file_start_write(backing_file); > +} > + > +static void fuse_file_end_write(struct file *fuse_file, > + struct file *backing_file, > + loff_t pos, ssize_t res) > +{ > + struct inode *inode = file_inode(fuse_file); > + struct fuse_inode *fi = get_fuse_inode(inode); > + > + file_end_write(backing_file); > + > + fuse_write_update_attr(inode, pos, res); Hi Amir, This function(fuse_file_end_write) will execute in interrupt context, but fuse_write_update_attr() uses fuse_inode->lock, this will cause soft lockup. So we may have to change all the fuse_inode->lock usage to fixup this bug, but I think this is one ugly resolution. Or why should we do aio_clearup_handler()? What is the difference between fuse_passthrough_write_iter() with ovl_write_iter()? 
Thanks, Tianci > + clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); > +} > + > +static void fuse_aio_cleanup_handler(struct fuse_aio_req *aio_req, long res) > { > struct kiocb *iocb = &aio_req->iocb; > struct kiocb *iocb_fuse = aio_req->iocb_fuse; > + struct file *filp = iocb->ki_filp; > + struct file *fuse_filp = iocb_fuse->ki_filp; > > if (iocb->ki_flags & IOCB_WRITE) { > - __sb_writers_acquired(file_inode(iocb->ki_filp)->i_sb, > - SB_FREEZE_WRITE); > - file_end_write(iocb->ki_filp); > + __sb_writers_acquired(file_inode(filp)->i_sb, SB_FREEZE_WRITE); > + fuse_file_end_write(fuse_filp, filp, iocb->ki_pos, res); > } > > iocb_fuse->ki_pos = iocb->ki_pos; > @@ -35,7 +62,7 @@ static void fuse_aio_rw_complete(struct kiocb *iocb, long res) > container_of(iocb, struct fuse_aio_req, iocb); > struct kiocb *iocb_fuse = aio_req->iocb_fuse; > > - fuse_aio_cleanup_handler(aio_req); > + fuse_aio_cleanup_handler(aio_req, res); > iocb_fuse->ki_complete(iocb_fuse, res); > } > > @@ -71,7 +98,7 @@ ssize_t fuse_passthrough_read_iter(struct kiocb *iocb_fuse, > aio_req->iocb.ki_complete = fuse_aio_rw_complete; > ret = call_read_iter(passthrough_filp, &aio_req->iocb, iter); > if (ret != -EIOCBQUEUED) > - fuse_aio_cleanup_handler(aio_req); > + fuse_aio_cleanup_handler(aio_req, ret); > } > out: > revert_creds(old_cred); > @@ -87,22 +114,25 @@ ssize_t fuse_passthrough_write_iter(struct kiocb *iocb_fuse, > struct inode *fuse_inode = file_inode(fuse_filp); > struct file *passthrough_filp = ff->passthrough->filp; > struct inode *passthrough_inode = file_inode(passthrough_filp); > + size_t count = iov_iter_count(iter); > const struct cred *old_cred; > ssize_t ret; > rwf_t rwf; > > - if (!iov_iter_count(iter)) > + if (!count) > return 0; > > inode_lock(fuse_inode); > > old_cred = override_creds(ff->passthrough->cred); > if (is_sync_kiocb(iocb_fuse)) { > - file_start_write(passthrough_filp); > + fuse_file_start_write(fuse_filp, passthrough_filp, > + iocb_fuse->ki_pos, count); > rwf = 
iocb_to_rw_flags(iocb_fuse->ki_flags, FUSE_IOCB_MASK); > ret = vfs_iter_write(passthrough_filp, iter, &iocb_fuse->ki_pos, > rwf); > - file_end_write(passthrough_filp); > + fuse_file_end_write(fuse_filp, passthrough_filp, > + iocb_fuse->ki_pos, ret); > } else { > struct fuse_aio_req *aio_req; > > @@ -112,7 +142,8 @@ ssize_t fuse_passthrough_write_iter(struct kiocb *iocb_fuse, > goto out; > } > > - file_start_write(passthrough_filp); > + fuse_file_start_write(fuse_filp, passthrough_filp, > + iocb_fuse->ki_pos, count); > __sb_writers_release(passthrough_inode->i_sb, SB_FREEZE_WRITE); > > aio_req->iocb_fuse = iocb_fuse; > @@ -120,7 +151,7 @@ ssize_t fuse_passthrough_write_iter(struct kiocb *iocb_fuse, > aio_req->iocb.ki_complete = fuse_aio_rw_complete; > ret = call_write_iter(passthrough_filp, &aio_req->iocb, iter); > if (ret != -EIOCBQUEUED) > - fuse_aio_cleanup_handler(aio_req); > + fuse_aio_cleanup_handler(aio_req, ret); > } > out: > revert_creds(old_cred); > -- > 2.34.1 >
On Mon, Sep 25, 2023 at 9:41 AM Zhang Tianci <zhangtianci.1997@bytedance.com> wrote: > > On Fri, May 19, 2023 at 8:59 PM Amir Goldstein <amir73il@gmail.com> wrote: > > > > Similar to update size/mtime at the end of fuse_perform_write(), > > we need to bump the attr version when we update the inode size. > > > > Signed-off-by: Amir Goldstein <amir73il@gmail.com> > > --- > > fs/fuse/passthrough.c | 53 ++++++++++++++++++++++++++++++++++--------- > > 1 file changed, 42 insertions(+), 11 deletions(-) > > > > diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c > > index 10b370bcc423..8352d6b91e0e 100644 > > --- a/fs/fuse/passthrough.c > > +++ b/fs/fuse/passthrough.c > > @@ -14,15 +14,42 @@ struct fuse_aio_req { > > struct kiocb *iocb_fuse; > > }; > > > > -static void fuse_aio_cleanup_handler(struct fuse_aio_req *aio_req) > > +static void fuse_file_start_write(struct file *fuse_file, > > + struct file *backing_file, > > + loff_t pos, size_t count) > > +{ > > + struct inode *inode = file_inode(fuse_file); > > + struct fuse_inode *fi = get_fuse_inode(inode); > > + > > + if (inode->i_size < pos + count) > > + set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); > > + > > + file_start_write(backing_file); > > +} > > + > > +static void fuse_file_end_write(struct file *fuse_file, > > + struct file *backing_file, > > + loff_t pos, ssize_t res) > > +{ > > + struct inode *inode = file_inode(fuse_file); > > + struct fuse_inode *fi = get_fuse_inode(inode); > > + > > + file_end_write(backing_file); > > + > > + fuse_write_update_attr(inode, pos, res); > > Hi Amir, > This function(fuse_file_end_write) will execute in interrupt context, but > fuse_write_update_attr() uses fuse_inode->lock, this will cause soft lockup. > > So we may have to change all the fuse_inode->lock usage to fixup this bug, but > I think this is one ugly resolution. > > Or why should we do aio_clearup_handler()? What is the difference between > fuse_passthrough_write_iter() with ovl_write_iter()? 
> [CC Jens and Christian] Heh, very good question. Does this answer your question: https://lore.kernel.org/linux-unionfs/20230912173653.3317828-2-amir73il@gmail.com/ I queued this patch to overlayfs for 6.7, because I think overlayfs has a bug that can manifest with concurrent aio completions. For people who just joined, this is a patch review of the FUSE passthrough feature, which is expected to share the common "kiocb_clone" io passthrough helpers with overlayfs. Jens, Are there any IOCB flags that overlayfs (or backing_aio) need to set or clear, besides IOCB_DIO_CALLER_COMP, that would prevent calling completion from interrupt context? Or is the proper way to deal with this is to defer completion to workqueue in the common backing_aio helpers that I am re-factoring from overlayfs? IIUC, that could also help overlayfs support IOCB_DIO_CALLER_COMP? Is my understanding correct? Thanks, Amir.
On 9/25/23 4:43 AM, Amir Goldstein wrote: > Jens, > > Are there any IOCB flags that overlayfs (or backing_aio) need > to set or clear, besides IOCB_DIO_CALLER_COMP, that > would prevent calling completion from interrupt context? There are a few flags that may get set (like WAITQ or ALLOC_CACHE), but I think that should all work ok as-is as the former is just state in that iocb and that is persistent (and only for the read path), and ALLOC_CACHE should just be propagated. > Or is the proper way to deal with this is to defer completion > to workqueue in the common backing_aio helpers that > I am re-factoring from overlayfs? No, deferring to a workqueue would defeat the purpose of the flag, which is telling you that the caller will ensure that the end_io callback will happen from task context and need not be deferred to a workqueue. I can take a peek at how to wire it up properly for overlayfs, have some travel coming up in a few days. > IIUC, that could also help overlayfs support > IOCB_DIO_CALLER_COMP? > > Is my understanding correct? If you peek at fs.h and find the CALLER_COMP references, it'll tell you a bit about how it works. This is new with the 6.6-rc kernel, there's a series of patches from me that went in through the iomap tree that hooked that up. Those commits have an explanation as well.
On Tue, Sep 26, 2023 at 6:31 PM Jens Axboe <axboe@kernel.dk> wrote: > > On 9/25/23 4:43 AM, Amir Goldstein wrote: > > Jens, > > > > Are there any IOCB flags that overlayfs (or backing_aio) need > > to set or clear, besides IOCB_DIO_CALLER_COMP, that > > would prevent calling completion from interrupt context? > > There are a few flags that may get set (like WAITQ or ALLOC_CACHE), but > I think that should all work ok as-is as the former is just state in > that iocb and that is persistent (and only for the read path), and > ALLOC_CACHE should just be propagated. > > > Or is the proper way to deal with this is to defer completion > > to workqueue in the common backing_aio helpers that > > I am re-factoring from overlayfs? > > No, deferring to a workqueue would defeat the purpose of the flag, which > is telling you that the caller will ensure that the end_io callback will > happen from task context and need not be deferred to a workqueue. I can > take a peek at how to wire it up properly for overlayfs, have some > travel coming up in a few days. > No worries, this is not urgent. I queued a change to overlayfs to take a spin lock on completion for the 6.7 merge window, so if I can get a ACK/NACK until then It would be nice. https://lore.kernel.org/linux-unionfs/20230912173653.3317828-2-amir73il@gmail.com/ > > IIUC, that could also help overlayfs support > > IOCB_DIO_CALLER_COMP? > > > > Is my understanding correct? > > If you peek at fs.h and find the CALLER_COMP references, it'll tell you > a bit about how it works. This is new with the 6.6-rc kernel, there's a > series of patches from me that went in through the iomap tree that > hooked that up. Those commits have an explanation as well. > Sorry, I think my question wasn't clear. I wasn't asking specifically about CALLER_COMP. Zhang Tianci commented in review (above) that I am not allowed to take the inode spinlock in the ovl io completion context, because it may be called from interrupt. 
I wasn't sure if his statement was correct, so this is what I am asking - whether overlayfs can set any IOCB flags that will force the completion to be called from task context - this is kind of the opposite of CALLER_COMP. Let me know if I wasn't able to explain myself. I am not that fluent in aio jargon. Thanks, Amir.
On 9/26/23 9:48 AM, Amir Goldstein wrote: > On Tue, Sep 26, 2023 at 6:31?PM Jens Axboe <axboe@kernel.dk> wrote: >> >> On 9/25/23 4:43 AM, Amir Goldstein wrote: >>> Jens, >>> >>> Are there any IOCB flags that overlayfs (or backing_aio) need >>> to set or clear, besides IOCB_DIO_CALLER_COMP, that >>> would prevent calling completion from interrupt context? >> >> There are a few flags that may get set (like WAITQ or ALLOC_CACHE), but >> I think that should all work ok as-is as the former is just state in >> that iocb and that is persistent (and only for the read path), and >> ALLOC_CACHE should just be propagated. >> >>> Or is the proper way to deal with this is to defer completion >>> to workqueue in the common backing_aio helpers that >>> I am re-factoring from overlayfs? >> >> No, deferring to a workqueue would defeat the purpose of the flag, which >> is telling you that the caller will ensure that the end_io callback will >> happen from task context and need not be deferred to a workqueue. I can >> take a peek at how to wire it up properly for overlayfs, have some >> travel coming up in a few days. >> > > No worries, this is not urgent. > I queued a change to overlayfs to take a spin lock on completion > for the 6.7 merge window, so if I can get a ACK/NACK until then > It would be nice. > > https://lore.kernel.org/linux-unionfs/20230912173653.3317828-2-amir73il@gmail.com/ That's not going to work for ovl_copyattr(), as ->ki_complete() may very well be called from interrupt context in general. >>> IIUC, that could also help overlayfs support >>> IOCB_DIO_CALLER_COMP? >>> >>> Is my understanding correct? >> >> If you peek at fs.h and find the CALLER_COMP references, it'll tell you >> a bit about how it works. This is new with the 6.6-rc kernel, there's a >> series of patches from me that went in through the iomap tree that >> hooked that up. Those commits have an explanation as well. >> > > Sorry, I think my question wasn't clear. 
> I wasn't asking specifically about CALLER_COMP. > > Zhang Tianci commented in review (above) that I am not allowed > to take the inode spinlock in the ovl io completion context, because > it may be called from interrupt. That is correct, the inode spinlock is not IRQ safe. > I wasn't sure if his statement was correct, so this is what I am > asking - whether overlayfs can set any IOCB flags that will force > the completion to be called from task context - this is kind of the > opposite of CALLER_COMP. > > Let me know if I wasn't able to explain myself. > I am not that fluent in aio jargon. Ah gotcha. I don't think that'd really work for your case as you don't need to propagate it, you can just punt your completion handling to a context that is sane for you, like a workqueue. That is provided that you don't need any task context there, which presumably you don't since eg copyattr is already called from IRQ context. From that context you could then grab the inode lock.
On Tue, Sep 26, 2023 at 7:19 PM Jens Axboe <axboe@kernel.dk> wrote: > > On 9/26/23 9:48 AM, Amir Goldstein wrote: > > On Tue, Sep 26, 2023 at 6:31?PM Jens Axboe <axboe@kernel.dk> wrote: > >> > >> On 9/25/23 4:43 AM, Amir Goldstein wrote: > >>> Jens, > >>> > >>> Are there any IOCB flags that overlayfs (or backing_aio) need > >>> to set or clear, besides IOCB_DIO_CALLER_COMP, that > >>> would prevent calling completion from interrupt context? > >> > >> There are a few flags that may get set (like WAITQ or ALLOC_CACHE), but > >> I think that should all work ok as-is as the former is just state in > >> that iocb and that is persistent (and only for the read path), and > >> ALLOC_CACHE should just be propagated. > >> > >>> Or is the proper way to deal with this is to defer completion > >>> to workqueue in the common backing_aio helpers that > >>> I am re-factoring from overlayfs? > >> > >> No, deferring to a workqueue would defeat the purpose of the flag, which > >> is telling you that the caller will ensure that the end_io callback will > >> happen from task context and need not be deferred to a workqueue. I can > >> take a peek at how to wire it up properly for overlayfs, have some > >> travel coming up in a few days. > >> > > > > No worries, this is not urgent. > > I queued a change to overlayfs to take a spin lock on completion > > for the 6.7 merge window, so if I can get a ACK/NACK until then > > It would be nice. > > > > https://lore.kernel.org/linux-unionfs/20230912173653.3317828-2-amir73il@gmail.com/ > > That's not going to work for ovl_copyattr(), as ->ki_complete() may very > well be called from interrupt context in general. > > >>> IIUC, that could also help overlayfs support > >>> IOCB_DIO_CALLER_COMP? > >>> > >>> Is my understanding correct? > >> > >> If you peek at fs.h and find the CALLER_COMP references, it'll tell you > >> a bit about how it works. 
This is new with the 6.6-rc kernel, there's a > >> series of patches from me that went in through the iomap tree that > >> hooked that up. Those commits have an explanation as well. > >> > > > > Sorry, I think my question wasn't clear. > > I wasn't asking specifically about CALLER_COMP. > > > > Zhang Tianci commented in review (above) that I am not allowed > > to take the inode spinlock in the ovl io completion context, because > > it may be called from interrupt. > > That is correct, the inode spinlock is not IRQ safe. > > > I wasn't sure if his statement was correct, so this is what I am > > asking - whether overlayfs can set any IOCB flags that will force > > the completion to be called from task context - this is kind of the > > opposite of CALLER_COMP. > > > > Let me know if I wasn't able to explain myself. > > I am not that fluent in aio jargon. > > Ah gotcha. I don't think that'd really work for your case as you don't > need to propagate it, you can just punt your completion handling to a > context that is sane for you, like a workqueue. That is provided that > you don't need any task context there, which presumably you don't since > eg copyattr is already called from IRQ context. > > From that context you could then grab the inode lock. > Yes, that's what I thought. Thanks for confirming! Amir.
diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c index 10b370bcc423..8352d6b91e0e 100644 --- a/fs/fuse/passthrough.c +++ b/fs/fuse/passthrough.c @@ -14,15 +14,42 @@ struct fuse_aio_req { struct kiocb *iocb_fuse; }; -static void fuse_aio_cleanup_handler(struct fuse_aio_req *aio_req) +static void fuse_file_start_write(struct file *fuse_file, + struct file *backing_file, + loff_t pos, size_t count) +{ + struct inode *inode = file_inode(fuse_file); + struct fuse_inode *fi = get_fuse_inode(inode); + + if (inode->i_size < pos + count) + set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); + + file_start_write(backing_file); +} + +static void fuse_file_end_write(struct file *fuse_file, + struct file *backing_file, + loff_t pos, ssize_t res) +{ + struct inode *inode = file_inode(fuse_file); + struct fuse_inode *fi = get_fuse_inode(inode); + + file_end_write(backing_file); + + fuse_write_update_attr(inode, pos, res); + clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); +} + +static void fuse_aio_cleanup_handler(struct fuse_aio_req *aio_req, long res) { struct kiocb *iocb = &aio_req->iocb; struct kiocb *iocb_fuse = aio_req->iocb_fuse; + struct file *filp = iocb->ki_filp; + struct file *fuse_filp = iocb_fuse->ki_filp; if (iocb->ki_flags & IOCB_WRITE) { - __sb_writers_acquired(file_inode(iocb->ki_filp)->i_sb, - SB_FREEZE_WRITE); - file_end_write(iocb->ki_filp); + __sb_writers_acquired(file_inode(filp)->i_sb, SB_FREEZE_WRITE); + fuse_file_end_write(fuse_filp, filp, iocb->ki_pos, res); } iocb_fuse->ki_pos = iocb->ki_pos; @@ -35,7 +62,7 @@ static void fuse_aio_rw_complete(struct kiocb *iocb, long res) container_of(iocb, struct fuse_aio_req, iocb); struct kiocb *iocb_fuse = aio_req->iocb_fuse; - fuse_aio_cleanup_handler(aio_req); + fuse_aio_cleanup_handler(aio_req, res); iocb_fuse->ki_complete(iocb_fuse, res); } @@ -71,7 +98,7 @@ ssize_t fuse_passthrough_read_iter(struct kiocb *iocb_fuse, aio_req->iocb.ki_complete = fuse_aio_rw_complete; ret = call_read_iter(passthrough_filp, 
&aio_req->iocb, iter); if (ret != -EIOCBQUEUED) - fuse_aio_cleanup_handler(aio_req); + fuse_aio_cleanup_handler(aio_req, ret); } out: revert_creds(old_cred); @@ -87,22 +114,25 @@ ssize_t fuse_passthrough_write_iter(struct kiocb *iocb_fuse, struct inode *fuse_inode = file_inode(fuse_filp); struct file *passthrough_filp = ff->passthrough->filp; struct inode *passthrough_inode = file_inode(passthrough_filp); + size_t count = iov_iter_count(iter); const struct cred *old_cred; ssize_t ret; rwf_t rwf; - if (!iov_iter_count(iter)) + if (!count) return 0; inode_lock(fuse_inode); old_cred = override_creds(ff->passthrough->cred); if (is_sync_kiocb(iocb_fuse)) { - file_start_write(passthrough_filp); + fuse_file_start_write(fuse_filp, passthrough_filp, + iocb_fuse->ki_pos, count); rwf = iocb_to_rw_flags(iocb_fuse->ki_flags, FUSE_IOCB_MASK); ret = vfs_iter_write(passthrough_filp, iter, &iocb_fuse->ki_pos, rwf); - file_end_write(passthrough_filp); + fuse_file_end_write(fuse_filp, passthrough_filp, + iocb_fuse->ki_pos, ret); } else { struct fuse_aio_req *aio_req; @@ -112,7 +142,8 @@ ssize_t fuse_passthrough_write_iter(struct kiocb *iocb_fuse, goto out; } - file_start_write(passthrough_filp); + fuse_file_start_write(fuse_filp, passthrough_filp, + iocb_fuse->ki_pos, count); __sb_writers_release(passthrough_inode->i_sb, SB_FREEZE_WRITE); aio_req->iocb_fuse = iocb_fuse; @@ -120,7 +151,7 @@ ssize_t fuse_passthrough_write_iter(struct kiocb *iocb_fuse, aio_req->iocb.ki_complete = fuse_aio_rw_complete; ret = call_write_iter(passthrough_filp, &aio_req->iocb, iter); if (ret != -EIOCBQUEUED) - fuse_aio_cleanup_handler(aio_req); + fuse_aio_cleanup_handler(aio_req, ret); } out: revert_creds(old_cred);
Similar to updating size/mtime at the end of fuse_perform_write(), we need to bump the attr version when we update the inode size. Signed-off-by: Amir Goldstein <amir73il@gmail.com> --- fs/fuse/passthrough.c | 53 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 11 deletions(-)