Message ID | 20170831134722.GA6912@lst.de (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Thu, Aug 31, 2017 at 4:47 PM, Christoph Hellwig <hch@lst.de> wrote: > I think something like the following patch (totally untested, > just an idea) should fix the issue, right? I think that is not enough. > > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c > index c4893e226fd8..555fcae9a18f 100644 > --- a/fs/xfs/xfs_file.c > +++ b/fs/xfs/xfs_file.c > @@ -135,7 +135,7 @@ xfs_file_fsync( > struct xfs_inode *ip = XFS_I(inode); > struct xfs_mount *mp = ip->i_mount; > int error = 0; > - int log_flushed = 0; > + unsigned int flushseq; > xfs_lsn_t lsn = 0; > > trace_xfs_file_fsync(ip); > @@ -143,6 +143,7 @@ xfs_file_fsync( > error = file_write_and_wait_range(file, start, end); > if (error) > return error; > + flushseq = READ_ONCE(mp->m_flushseq); imagine that flush was submitted and completed before file_write_and_wait_range() but m_flushseq incremented after. maybe here READ m_flush_submitted_seq... > > if (XFS_FORCED_SHUTDOWN(mp)) > return -EIO; > @@ -181,7 +182,7 @@ xfs_file_fsync( > } > > if (lsn) { > - error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed); > + error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); > ip->i_itemp->ili_fsync_fields = 0; > } > xfs_iunlock(ip, XFS_ILOCK_SHARED); > @@ -193,8 +194,9 @@ xfs_file_fsync( > * an already allocated file and thus do not have any metadata to > * commit. > */ > - if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) && > - mp->m_logdev_targp == mp->m_ddev_targp) > + if (!XFS_IS_REALTIME_INODE(ip) && > + mp->m_logdev_targp == mp->m_ddev_targp && > + flushseq == READ_ONCE(mp->m_flushseq)) ... and here READ m_flush_completed_seq if (m_flush_completed_seq > m_flush_submitted_seq) it is safe to skip issue flush. Then probably READ_ONCE() is not enough and need smb_rmb? > xfs_blkdev_issue_flush(mp->m_ddev_targp); > > return error; > diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c > index bcb2f860e508..3c0cbb98581e 100644 > --- a/fs/xfs/xfs_log.c > +++ b/fs/xfs/xfs_log.c > @@ -2922,6 +2922,8 @@ xlog_state_done_syncing( > iclog->ic_state = XLOG_STATE_DONE_SYNC; > } > > + log->l_mp->m_flushseq++; I recon this should use WRITE_ONCE or smp_wmb() and then also increment m_flush_submitted_seq *before* issueing flush If state machine does not allow more than a single flush to be in flight (?) then the 2 seq counters could be reduced to single seq counter with (m_flushseq % 2) == 1 for submitted and (m_flushseq % 2) == 0 for completed and the test in fsync would be (flushseq % 2) == (READ_ONCE(mp->m_flushseq) % 2) ... maybe? Amir.
On Thu, Aug 31, 2017 at 05:37:06PM +0300, Amir Goldstein wrote: > On Thu, Aug 31, 2017 at 4:47 PM, Christoph Hellwig <hch@lst.de> wrote: > > I think something like the following patch (totally untested, > > just an idea) should fix the issue, right? > > I think that is not enough. > > > > > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c > > index c4893e226fd8..555fcae9a18f 100644 > > --- a/fs/xfs/xfs_file.c > > +++ b/fs/xfs/xfs_file.c > > @@ -135,7 +135,7 @@ xfs_file_fsync( > > struct xfs_inode *ip = XFS_I(inode); > > struct xfs_mount *mp = ip->i_mount; > > int error = 0; > > - int log_flushed = 0; > > + unsigned int flushseq; > > xfs_lsn_t lsn = 0; > > > > trace_xfs_file_fsync(ip); > > @@ -143,6 +143,7 @@ xfs_file_fsync( > > error = file_write_and_wait_range(file, start, end); > > if (error) > > return error; > > + flushseq = READ_ONCE(mp->m_flushseq); > > imagine that flush was submitted and completed before > file_write_and_wait_range() but m_flushseq incremented after. > maybe here READ m_flush_submitted_seq... > > > > > if (XFS_FORCED_SHUTDOWN(mp)) > > return -EIO; > > @@ -181,7 +182,7 @@ xfs_file_fsync( > > } > > > > if (lsn) { > > - error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed); > > + error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); > > ip->i_itemp->ili_fsync_fields = 0; > > } > > xfs_iunlock(ip, XFS_ILOCK_SHARED); > > @@ -193,8 +194,9 @@ xfs_file_fsync( > > * an already allocated file and thus do not have any metadata to > > * commit. > > */ > > - if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) && > > - mp->m_logdev_targp == mp->m_ddev_targp) > > + if (!XFS_IS_REALTIME_INODE(ip) && > > + mp->m_logdev_targp == mp->m_ddev_targp && > > + flushseq == READ_ONCE(mp->m_flushseq)) > > ... and here READ m_flush_completed_seq > if (m_flush_completed_seq > m_flush_submitted_seq) > it is safe to skip issue flush. > Then probably READ_ONCE() is not enough and need smb_rmb? > IIUC, basically we need to guarantee that a flush submits after file_write_and_wait() and completes before we return. If we do something like the above, I wonder if that means we could wait for the submit == complete if we observe submit was bumped since it was initially sampled above (rather than issue another flush, which would be necessary if a submit hadn't occurred))..? If we do end up with something like this, I think it's a bit cleaner to stuff the counter(s) in the xfs_buftarg structure and update them from the generic buffer submit/completion code based on XBF_FLUSH. FWIW, I suspect we could also update said counter(s) from xfs_blkdev_issue_flush(). Brian > > xfs_blkdev_issue_flush(mp->m_ddev_targp); > > > > return error; > > diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c > > index bcb2f860e508..3c0cbb98581e 100644 > > --- a/fs/xfs/xfs_log.c > > +++ b/fs/xfs/xfs_log.c > > @@ -2922,6 +2922,8 @@ xlog_state_done_syncing( > > iclog->ic_state = XLOG_STATE_DONE_SYNC; > > } > > > > + log->l_mp->m_flushseq++; > > I recon this should use WRITE_ONCE or smp_wmb() > and then also increment m_flush_submitted_seq *before* > issueing flush > > If state machine does not allow more than a single flush > to be in flight (?) then the 2 seq counters could be reduced > to single seq counter with (m_flushseq % 2) == 1 for submitted > and (m_flushseq % 2) == 0 for completed and the test in fsync > would be (flushseq % 2) == (READ_ONCE(mp->m_flushseq) % 2) > > ... maybe? > > Amir. > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Aug 31, 2017 at 7:39 PM, Brian Foster <bfoster@redhat.com> wrote: > On Thu, Aug 31, 2017 at 05:37:06PM +0300, Amir Goldstein wrote: >> On Thu, Aug 31, 2017 at 4:47 PM, Christoph Hellwig <hch@lst.de> wrote: >> > I think something like the following patch (totally untested, >> > just an idea) should fix the issue, right? >> >> I think that is not enough. >> >> > >> > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c >> > index c4893e226fd8..555fcae9a18f 100644 >> > --- a/fs/xfs/xfs_file.c >> > +++ b/fs/xfs/xfs_file.c >> > @@ -135,7 +135,7 @@ xfs_file_fsync( >> > struct xfs_inode *ip = XFS_I(inode); >> > struct xfs_mount *mp = ip->i_mount; >> > int error = 0; >> > - int log_flushed = 0; >> > + unsigned int flushseq; >> > xfs_lsn_t lsn = 0; >> > >> > trace_xfs_file_fsync(ip); >> > @@ -143,6 +143,7 @@ xfs_file_fsync( >> > error = file_write_and_wait_range(file, start, end); >> > if (error) >> > return error; >> > + flushseq = READ_ONCE(mp->m_flushseq); >> >> imagine that flush was submitted and completed before >> file_write_and_wait_range() but m_flushseq incremented after. >> maybe here READ m_flush_submitted_seq... >> >> > >> > if (XFS_FORCED_SHUTDOWN(mp)) >> > return -EIO; >> > @@ -181,7 +182,7 @@ xfs_file_fsync( >> > } >> > >> > if (lsn) { >> > - error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed); >> > + error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); >> > ip->i_itemp->ili_fsync_fields = 0; >> > } >> > xfs_iunlock(ip, XFS_ILOCK_SHARED); >> > @@ -193,8 +194,9 @@ xfs_file_fsync( >> > * an already allocated file and thus do not have any metadata to >> > * commit. >> > */ >> > - if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) && >> > - mp->m_logdev_targp == mp->m_ddev_targp) >> > + if (!XFS_IS_REALTIME_INODE(ip) && >> > + mp->m_logdev_targp == mp->m_ddev_targp && >> > + flushseq == READ_ONCE(mp->m_flushseq)) >> >> ... and here READ m_flush_completed_seq >> if (m_flush_completed_seq > m_flush_submitted_seq) >> it is safe to skip issue flush. >> Then probably READ_ONCE() is not enough and need smb_rmb? >> > > IIUC, basically we need to guarantee that a flush submits after > file_write_and_wait() and completes before we return. Yeh. unless we check if file_write_and_wait() submitted anything at all. > If we do something > like the above, I wonder if that means we could wait for the submit == > complete if we observe submit was bumped since it was initially sampled > above (rather than issue another flush, which would be necessary if a > submit hadn't occurred))..? > > If we do end up with something like this, I think it's a bit cleaner to > stuff the counter(s) in the xfs_buftarg structure and update them from > the generic buffer submit/completion code based on XBF_FLUSH. FWIW, I > suspect we could also update said counter(s) from > xfs_blkdev_issue_flush(). > I think what you are suggesting is to optimize more cases which are not optimized now. That is probably possible, but also more complicated to get right and not sure if the workloads that gain from this are important enough. If I am not mistaken the way to fix the current optimization is to record the last SYNC_DONE lsn (which is sort of what Christoph suggested) and the last WANY_SYNC|ACTIVE lsn. After file_write_and_wait() need to save pre_sync_lsn and before return need to make sure that post_sync_lsn >= pre_sync_lsn or issue a flush. Amir.
On Thu, Aug 31, 2017 at 10:20:19PM +0300, Amir Goldstein wrote: > On Thu, Aug 31, 2017 at 7:39 PM, Brian Foster <bfoster@redhat.com> wrote: > > On Thu, Aug 31, 2017 at 05:37:06PM +0300, Amir Goldstein wrote: > >> On Thu, Aug 31, 2017 at 4:47 PM, Christoph Hellwig <hch@lst.de> wrote: > >> > I think something like the following patch (totally untested, > >> > just an idea) should fix the issue, right? > >> > >> I think that is not enough. > >> > >> > > >> > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c > >> > index c4893e226fd8..555fcae9a18f 100644 > >> > --- a/fs/xfs/xfs_file.c > >> > +++ b/fs/xfs/xfs_file.c > >> > @@ -135,7 +135,7 @@ xfs_file_fsync( > >> > struct xfs_inode *ip = XFS_I(inode); > >> > struct xfs_mount *mp = ip->i_mount; > >> > int error = 0; > >> > - int log_flushed = 0; > >> > + unsigned int flushseq; > >> > xfs_lsn_t lsn = 0; > >> > > >> > trace_xfs_file_fsync(ip); > >> > @@ -143,6 +143,7 @@ xfs_file_fsync( > >> > error = file_write_and_wait_range(file, start, end); > >> > if (error) > >> > return error; > >> > + flushseq = READ_ONCE(mp->m_flushseq); > >> > >> imagine that flush was submitted and completed before > >> file_write_and_wait_range() but m_flushseq incremented after. > >> maybe here READ m_flush_submitted_seq... > >> > >> > > >> > if (XFS_FORCED_SHUTDOWN(mp)) > >> > return -EIO; > >> > @@ -181,7 +182,7 @@ xfs_file_fsync( > >> > } > >> > > >> > if (lsn) { > >> > - error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed); > >> > + error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); > >> > ip->i_itemp->ili_fsync_fields = 0; > >> > } > >> > xfs_iunlock(ip, XFS_ILOCK_SHARED); > >> > @@ -193,8 +194,9 @@ xfs_file_fsync( > >> > * an already allocated file and thus do not have any metadata to > >> > * commit. > >> > */ > >> > - if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) && > >> > - mp->m_logdev_targp == mp->m_ddev_targp) > >> > + if (!XFS_IS_REALTIME_INODE(ip) && > >> > + mp->m_logdev_targp == mp->m_ddev_targp && > >> > + flushseq == READ_ONCE(mp->m_flushseq)) > >> > >> ... and here READ m_flush_completed_seq > >> if (m_flush_completed_seq > m_flush_submitted_seq) > >> it is safe to skip issue flush. > >> Then probably READ_ONCE() is not enough and need smb_rmb? > >> > > > > IIUC, basically we need to guarantee that a flush submits after > > file_write_and_wait() and completes before we return. > > Yeh. unless we check if file_write_and_wait() submitted anything > at all. > Ok, thanks. > > If we do something > > like the above, I wonder if that means we could wait for the submit == > > complete if we observe submit was bumped since it was initially sampled > > above (rather than issue another flush, which would be necessary if a > > submit hadn't occurred))..? > > > > If we do end up with something like this, I think it's a bit cleaner to > > stuff the counter(s) in the xfs_buftarg structure and update them from > > the generic buffer submit/completion code based on XBF_FLUSH. FWIW, I > > suspect we could also update said counter(s) from > > xfs_blkdev_issue_flush(). > > > > I think what you are suggesting is to optimize more cases which are > not optimized now. That is probably possible, but also more complicated > to get right and not sure if the workloads that gain from this are important > enough. > Not necessarily. I'm just suggesting that the code could be factored more generically/elegantly such that the logic is easier to follow. That may facilitate optimizing more cases, but that's a secondary benefit. In practice, the log buffer code is the only place we actually set XBF_FLUSH, for example. We could toss the waiting idea for now and issue the flush unless we know one has submitted & completed since the writeback based on the counters. Waiting on a log buffer may already do the wait for us in most cases anyways, we just don't know if it submitted after the writeback or not without the submit counter. > If I am not mistaken the way to fix the current optimization is to record > the last SYNC_DONE lsn (which is sort of what Christoph suggested) > and the last WANY_SYNC|ACTIVE lsn. > After file_write_and_wait() need to save pre_sync_lsn and before > return need to make sure that post_sync_lsn >= pre_sync_lsn or > issue a flush. > Perhaps, but I'm not quite following what you mean by pre/post LSNs. Note that I believe log buffers can complete out of order, if that is relevant here. Either way, this still seems like underhanded logic IMO... If the requirement is a simple "issue a flush if we can't detect that one has submitted+completed on this device since our writeback completed" rule, why intentionally obfuscate that with internal log buffer state such as log buffer header LSN and log state machine values? Just track flush submission/completions as you suggested earlier and the fsync logic is much easier to follow. Then we don't need to work backwards from the XFS logging infrastructure just to try and verify whether a flush has occurred in all cases. :) Brian > Amir. > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Aug 31, 2017 at 11:10 PM, Brian Foster <bfoster@redhat.com> wrote: > On Thu, Aug 31, 2017 at 10:20:19PM +0300, Amir Goldstein wrote: >> On Thu, Aug 31, 2017 at 7:39 PM, Brian Foster <bfoster@redhat.com> wrote: ... >> > If we do something >> > like the above, I wonder if that means we could wait for the submit == >> > complete if we observe submit was bumped since it was initially sampled >> > above (rather than issue another flush, which would be necessary if a >> > submit hadn't occurred))..? >> > >> > If we do end up with something like this, I think it's a bit cleaner to >> > stuff the counter(s) in the xfs_buftarg structure and update them from >> > the generic buffer submit/completion code based on XBF_FLUSH. FWIW, I >> > suspect we could also update said counter(s) from >> > xfs_blkdev_issue_flush(). >> > >> >> I think what you are suggesting is to optimize more cases which are >> not optimized now. That is probably possible, but also more complicated >> to get right and not sure if the workloads that gain from this are important >> enough. >> > > Not necessarily. I'm just suggesting that the code could be factored > more generically/elegantly such that the logic is easier to follow. That > may facilitate optimizing more cases, but that's a secondary benefit. In > practice, the log buffer code is the only place we actually set > XBF_FLUSH, for example. > I guess that makes sense. Although it is going to end up with more code, so if we are not going for optimization for more cases (i.e. subsequent fdatasync) we should consider if the extra code is worth it. > >> If I am not mistaken the way to fix the current optimization is to record >> the last SYNC_DONE lsn (which is sort of what Christoph suggested) >> and the last WANY_SYNC|ACTIVE lsn. >> After file_write_and_wait() need to save pre_sync_lsn and before >> return need to make sure that post_sync_lsn >= pre_sync_lsn or >> issue a flush. >> > > Perhaps, but I'm not quite following what you mean by pre/post LSNs. > Note that I believe log buffers can complete out of order, if that is > relevant here. Either way, this still seems like underhanded logic > IMO... > > If the requirement is a simple "issue a flush if we can't detect that > one has submitted+completed on this device since our writeback > completed" rule, why intentionally obfuscate that with internal log > buffer state such as log buffer header LSN and log state machine values? > Just track flush submission/completions as you suggested earlier and the > fsync logic is much easier to follow. Then we don't need to work > backwards from the XFS logging infrastructure just to try and verify > whether a flush has occurred in all cases. :) > Your argument makes a lot of sense. I'm just trying to be extra cautious and looking for a small step solution. As Darrick wrote.. "safety first" :) Amir.
On Thu, Aug 31, 2017 at 05:37:06PM +0300, Amir Goldstein wrote: > > index bcb2f860e508..3c0cbb98581e 100644 > > --- a/fs/xfs/xfs_log.c > > +++ b/fs/xfs/xfs_log.c > > @@ -2922,6 +2922,8 @@ xlog_state_done_syncing( > > iclog->ic_state = XLOG_STATE_DONE_SYNC; > > } > > > > + log->l_mp->m_flushseq++; > > I recon this should use WRITE_ONCE or smp_wmb() > and then also increment m_flush_submitted_seq *before* > issueing flush It's under l_icloglock, so we shouldn't need a WRITE_ONCE. If we did we'd need an atomic_t. > If state machine does not allow more than a single flush > to be in flight (?) then the 2 seq counters could be reduced > to single seq counter with (m_flushseq % 2) == 1 for submitted > and (m_flushseq % 2) == 0 for completed and the test in fsync > would be (flushseq % 2) == (READ_ONCE(mp->m_flushseq) % 2) The state machine allows multiple flushes to in flight.
On Thu, Aug 31, 2017 at 10:20:19PM +0300, Amir Goldstein wrote: > > IIUC, basically we need to guarantee that a flush submits after > > file_write_and_wait() and completes before we return. > > Yeh. unless we check if file_write_and_wait() submitted anything > at all. Even if file_write_and_wait did not submit anything we need to make sure a flush was submitted and completed after entering xfs_file_fsync. For one to deal with the case where we wrote back data from the flusher threads or the VM, and also for the direct I/O case. Btw, do you have any results for your simple catch? I wonder how much of an issue it actually is in practice.
On Fri, Sep 1, 2017 at 12:52 PM, Christoph Hellwig <hch@lst.de> wrote: > On Thu, Aug 31, 2017 at 10:20:19PM +0300, Amir Goldstein wrote: >> > IIUC, basically we need to guarantee that a flush submits after >> > file_write_and_wait() and completes before we return. >> >> Yeh. unless we check if file_write_and_wait() submitted anything >> at all. > > Even if file_write_and_wait did not submit anything we need to > make sure a flush was submitted and completed after entering > xfs_file_fsync. For one to deal with the case where we wrote > back data from the flusher threads or the VM, and also for > the direct I/O case. > Right. > > Btw, do you have any results for your simple catch? I wonder > how much of an issue it actually is in practice. Well since the bug was demonstrated using crash simulator, I have no idea how often one would run into this with actual power failure, but does it really matter how often, if we *know* that it can happen? The sequence of bios that I illustrated in commit message is what I saw regardless of the crash simulator, and that sequence leaves a data block in the disk cache, with nothing that guaranties a FLUSH afterwards. So for an unlucky subject, that gives a window of data loss in case of power failure at least until the next time a flusher thread runs. That could be seconds, no? I am working on a variant of the flushseq you suggested using log->l_last_sync_lsn that may be simple enough and not as painful as turning off optimization for WANT_SYNC log buffer. Amir.
On Fri, Sep 01, 2017 at 01:37:28PM +0300, Amir Goldstein wrote: > Well since the bug was demonstrated using crash simulator, > I have no idea how often one would run into this with actual power > failure, but does it really matter how often, if we *know* that it can > happen? No, the issue is real. I wonder how much of an issue the additional flushes are in practice. > I am working on a variant of the flushseq you suggested using > log->l_last_sync_lsn that may be simple enough and not as painful > as turning off optimization for WANT_SYNC log buffer. Ok, I'll take a look once you post it.
On Fri, Sep 01, 2017 at 10:58:43AM +0300, Amir Goldstein wrote: > On Thu, Aug 31, 2017 at 11:10 PM, Brian Foster <bfoster@redhat.com> wrote: > > On Thu, Aug 31, 2017 at 10:20:19PM +0300, Amir Goldstein wrote: > >> On Thu, Aug 31, 2017 at 7:39 PM, Brian Foster <bfoster@redhat.com> wrote: > ... > >> > If we do something > >> > like the above, I wonder if that means we could wait for the submit == > >> > complete if we observe submit was bumped since it was initially sampled > >> > above (rather than issue another flush, which would be necessary if a > >> > submit hadn't occurred))..? > >> > > >> > If we do end up with something like this, I think it's a bit cleaner to > >> > stuff the counter(s) in the xfs_buftarg structure and update them from > >> > the generic buffer submit/completion code based on XBF_FLUSH. FWIW, I > >> > suspect we could also update said counter(s) from > >> > xfs_blkdev_issue_flush(). > >> > > >> > >> I think what you are suggesting is to optimize more cases which are > >> not optimized now. That is probably possible, but also more complicated > >> to get right and not sure if the workloads that gain from this are important > >> enough. > >> > > > > Not necessarily. I'm just suggesting that the code could be factored > > more generically/elegantly such that the logic is easier to follow. That > > may facilitate optimizing more cases, but that's a secondary benefit. In > > practice, the log buffer code is the only place we actually set > > XBF_FLUSH, for example. > > > > I guess that makes sense. > Although it is going to end up with more code, so if we are not going for > optimization for more cases (i.e. subsequent fdatasync) we should consider > if the extra code is worth it. > Incrementally more than Christoph's patch. I don't think that's an issue. > > > >> If I am not mistaken the way to fix the current optimization is to record > >> the last SYNC_DONE lsn (which is sort of what Christoph suggested) > >> and the last WANY_SYNC|ACTIVE lsn. > >> After file_write_and_wait() need to save pre_sync_lsn and before > >> return need to make sure that post_sync_lsn >= pre_sync_lsn or > >> issue a flush. > >> > > > > Perhaps, but I'm not quite following what you mean by pre/post LSNs. > > Note that I believe log buffers can complete out of order, if that is > > relevant here. Either way, this still seems like underhanded logic > > IMO... > > > > If the requirement is a simple "issue a flush if we can't detect that > > one has submitted+completed on this device since our writeback > > completed" rule, why intentionally obfuscate that with internal log > > buffer state such as log buffer header LSN and log state machine values? > > Just track flush submission/completions as you suggested earlier and the > > fsync logic is much easier to follow. Then we don't need to work > > backwards from the XFS logging infrastructure just to try and verify > > whether a flush has occurred in all cases. :) > > > > Your argument makes a lot of sense. I'm just trying to be extra cautious > and looking for a small step solution. As Darrick wrote.. "safety first" :) > Sure. FWIW, I think your patch suits that purpose as it basically disables the shady bits of the optimization. It looks like it's already in Darrick's for-next branch and Cc'd to stable as well. A cleaner rework of this mechanism can come after, hopefully restore the full optimization safely and perhaps clean out the whole log_flushed thing. Brian > Amir. > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index c4893e226fd8..555fcae9a18f 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -135,7 +135,7 @@ xfs_file_fsync( struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; int error = 0; - int log_flushed = 0; + unsigned int flushseq; xfs_lsn_t lsn = 0; trace_xfs_file_fsync(ip); @@ -143,6 +143,7 @@ xfs_file_fsync( error = file_write_and_wait_range(file, start, end); if (error) return error; + flushseq = READ_ONCE(mp->m_flushseq); if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; @@ -181,7 +182,7 @@ xfs_file_fsync( } if (lsn) { - error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed); + error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); ip->i_itemp->ili_fsync_fields = 0; } xfs_iunlock(ip, XFS_ILOCK_SHARED); @@ -193,8 +194,9 @@ xfs_file_fsync( * an already allocated file and thus do not have any metadata to * commit. */ - if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) && - mp->m_logdev_targp == mp->m_ddev_targp) + if (!XFS_IS_REALTIME_INODE(ip) && + mp->m_logdev_targp == mp->m_ddev_targp && + flushseq == READ_ONCE(mp->m_flushseq)) xfs_blkdev_issue_flush(mp->m_ddev_targp); return error; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index bcb2f860e508..3c0cbb98581e 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -2922,6 +2922,8 @@ xlog_state_done_syncing( iclog->ic_state = XLOG_STATE_DONE_SYNC; } + log->l_mp->m_flushseq++; + /* * Someone could be sleeping prior to writing out the next * iclog buffer, we wake them all, one will get to do the diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index e0792d036be2..556e01a0175e 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -79,6 +79,7 @@ typedef struct xfs_mount { struct percpu_counter m_icount; /* allocated inodes counter */ struct percpu_counter m_ifree; /* free inodes counter */ struct percpu_counter m_fdblocks; /* free block counter */ + unsigned int m_flushseq; struct xfs_buf *m_sb_bp; /* buffer for superblock */ char *m_fsname; /* filesystem name */