diff mbox

xfs: fix incorrect log_flushed on fsync

Message ID 20170831134722.GA6912@lst.de (mailing list archive)
State New, archived
Headers show

Commit Message

Christoph Hellwig Aug. 31, 2017, 1:47 p.m. UTC
I think something like the following patch (totally untested,
just an idea) should fix the issue, right?

Comments

Amir Goldstein Aug. 31, 2017, 2:37 p.m. UTC | #1
On Thu, Aug 31, 2017 at 4:47 PM, Christoph Hellwig <hch@lst.de> wrote:
> I think something like the following patch (totally untested,
> just an idea) should fix the issue, right?

I think that is not enough.

>
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index c4893e226fd8..555fcae9a18f 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -135,7 +135,7 @@ xfs_file_fsync(
>         struct xfs_inode        *ip = XFS_I(inode);
>         struct xfs_mount        *mp = ip->i_mount;
>         int                     error = 0;
> -       int                     log_flushed = 0;
> +       unsigned int            flushseq;
>         xfs_lsn_t               lsn = 0;
>
>         trace_xfs_file_fsync(ip);
> @@ -143,6 +143,7 @@ xfs_file_fsync(
>         error = file_write_and_wait_range(file, start, end);
>         if (error)
>                 return error;
> +       flushseq = READ_ONCE(mp->m_flushseq);

imagine that flush was submitted and completed before
file_write_and_wait_range() but m_flushseq incremented  after.
maybe here READ m_flush_submitted_seq...

>
>         if (XFS_FORCED_SHUTDOWN(mp))
>                 return -EIO;
> @@ -181,7 +182,7 @@ xfs_file_fsync(
>         }
>
>         if (lsn) {
> -               error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
> +               error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
>                 ip->i_itemp->ili_fsync_fields = 0;
>         }
>         xfs_iunlock(ip, XFS_ILOCK_SHARED);
> @@ -193,8 +194,9 @@ xfs_file_fsync(
>          * an already allocated file and thus do not have any metadata to
>          * commit.
>          */
> -       if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
> -           mp->m_logdev_targp == mp->m_ddev_targp)
> +       if (!XFS_IS_REALTIME_INODE(ip) &&
> +           mp->m_logdev_targp == mp->m_ddev_targp &&
> +           flushseq == READ_ONCE(mp->m_flushseq))

... and here READ m_flush_completed_seq
if (m_flush_completed_seq > m_flush_submitted_seq)
it is safe to skip issue flush.
Then probably READ_ONCE() is not enough and need smb_rmb?

>                 xfs_blkdev_issue_flush(mp->m_ddev_targp);
>
>         return error;
> diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
> index bcb2f860e508..3c0cbb98581e 100644
> --- a/fs/xfs/xfs_log.c
> +++ b/fs/xfs/xfs_log.c
> @@ -2922,6 +2922,8 @@ xlog_state_done_syncing(
>                 iclog->ic_state = XLOG_STATE_DONE_SYNC;
>         }
>
> +       log->l_mp->m_flushseq++;

I recon this should use WRITE_ONCE or smp_wmb()
and then also increment m_flush_submitted_seq *before*
issueing flush

If state machine does not allow more than a single flush
to be in flight (?) then the 2 seq counters could be reduced
to single seq counter with (m_flushseq % 2) == 1 for submitted
and  (m_flushseq % 2) == 0 for completed and the test in fsync
would be (flushseq % 2) == (READ_ONCE(mp->m_flushseq) % 2)

... maybe?

Amir.
Brian Foster Aug. 31, 2017, 4:39 p.m. UTC | #2
On Thu, Aug 31, 2017 at 05:37:06PM +0300, Amir Goldstein wrote:
> On Thu, Aug 31, 2017 at 4:47 PM, Christoph Hellwig <hch@lst.de> wrote:
> > I think something like the following patch (totally untested,
> > just an idea) should fix the issue, right?
> 
> I think that is not enough.
> 
> >
> > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> > index c4893e226fd8..555fcae9a18f 100644
> > --- a/fs/xfs/xfs_file.c
> > +++ b/fs/xfs/xfs_file.c
> > @@ -135,7 +135,7 @@ xfs_file_fsync(
> >         struct xfs_inode        *ip = XFS_I(inode);
> >         struct xfs_mount        *mp = ip->i_mount;
> >         int                     error = 0;
> > -       int                     log_flushed = 0;
> > +       unsigned int            flushseq;
> >         xfs_lsn_t               lsn = 0;
> >
> >         trace_xfs_file_fsync(ip);
> > @@ -143,6 +143,7 @@ xfs_file_fsync(
> >         error = file_write_and_wait_range(file, start, end);
> >         if (error)
> >                 return error;
> > +       flushseq = READ_ONCE(mp->m_flushseq);
> 
> imagine that flush was submitted and completed before
> file_write_and_wait_range() but m_flushseq incremented  after.
> maybe here READ m_flush_submitted_seq...
> 
> >
> >         if (XFS_FORCED_SHUTDOWN(mp))
> >                 return -EIO;
> > @@ -181,7 +182,7 @@ xfs_file_fsync(
> >         }
> >
> >         if (lsn) {
> > -               error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
> > +               error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
> >                 ip->i_itemp->ili_fsync_fields = 0;
> >         }
> >         xfs_iunlock(ip, XFS_ILOCK_SHARED);
> > @@ -193,8 +194,9 @@ xfs_file_fsync(
> >          * an already allocated file and thus do not have any metadata to
> >          * commit.
> >          */
> > -       if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
> > -           mp->m_logdev_targp == mp->m_ddev_targp)
> > +       if (!XFS_IS_REALTIME_INODE(ip) &&
> > +           mp->m_logdev_targp == mp->m_ddev_targp &&
> > +           flushseq == READ_ONCE(mp->m_flushseq))
> 
> ... and here READ m_flush_completed_seq
> if (m_flush_completed_seq > m_flush_submitted_seq)
> it is safe to skip issue flush.
> Then probably READ_ONCE() is not enough and need smb_rmb?
> 

IIUC, basically we need to guarantee that a flush submits after
file_write_and_wait() and completes before we return. If we do something
like the above, I wonder if that means we could wait for the submit ==
complete if we observe submit was bumped since it was initially sampled
above (rather than issue another flush, which would be necessary if a
submit hadn't occurred))..?

If we do end up with something like this, I think it's a bit cleaner to
stuff the counter(s) in the xfs_buftarg structure and update them from
the generic buffer submit/completion code based on XBF_FLUSH. FWIW, I
suspect we could also update said counter(s) from
xfs_blkdev_issue_flush().

Brian

> >                 xfs_blkdev_issue_flush(mp->m_ddev_targp);
> >
> >         return error;
> > diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
> > index bcb2f860e508..3c0cbb98581e 100644
> > --- a/fs/xfs/xfs_log.c
> > +++ b/fs/xfs/xfs_log.c
> > @@ -2922,6 +2922,8 @@ xlog_state_done_syncing(
> >                 iclog->ic_state = XLOG_STATE_DONE_SYNC;
> >         }
> >
> > +       log->l_mp->m_flushseq++;
> 
> I recon this should use WRITE_ONCE or smp_wmb()
> and then also increment m_flush_submitted_seq *before*
> issueing flush
> 
> If state machine does not allow more than a single flush
> to be in flight (?) then the 2 seq counters could be reduced
> to single seq counter with (m_flushseq % 2) == 1 for submitted
> and  (m_flushseq % 2) == 0 for completed and the test in fsync
> would be (flushseq % 2) == (READ_ONCE(mp->m_flushseq) % 2)
> 
> ... maybe?
> 
> Amir.
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Amir Goldstein Aug. 31, 2017, 7:20 p.m. UTC | #3
On Thu, Aug 31, 2017 at 7:39 PM, Brian Foster <bfoster@redhat.com> wrote:
> On Thu, Aug 31, 2017 at 05:37:06PM +0300, Amir Goldstein wrote:
>> On Thu, Aug 31, 2017 at 4:47 PM, Christoph Hellwig <hch@lst.de> wrote:
>> > I think something like the following patch (totally untested,
>> > just an idea) should fix the issue, right?
>>
>> I think that is not enough.
>>
>> >
>> > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
>> > index c4893e226fd8..555fcae9a18f 100644
>> > --- a/fs/xfs/xfs_file.c
>> > +++ b/fs/xfs/xfs_file.c
>> > @@ -135,7 +135,7 @@ xfs_file_fsync(
>> >         struct xfs_inode        *ip = XFS_I(inode);
>> >         struct xfs_mount        *mp = ip->i_mount;
>> >         int                     error = 0;
>> > -       int                     log_flushed = 0;
>> > +       unsigned int            flushseq;
>> >         xfs_lsn_t               lsn = 0;
>> >
>> >         trace_xfs_file_fsync(ip);
>> > @@ -143,6 +143,7 @@ xfs_file_fsync(
>> >         error = file_write_and_wait_range(file, start, end);
>> >         if (error)
>> >                 return error;
>> > +       flushseq = READ_ONCE(mp->m_flushseq);
>>
>> imagine that flush was submitted and completed before
>> file_write_and_wait_range() but m_flushseq incremented  after.
>> maybe here READ m_flush_submitted_seq...
>>
>> >
>> >         if (XFS_FORCED_SHUTDOWN(mp))
>> >                 return -EIO;
>> > @@ -181,7 +182,7 @@ xfs_file_fsync(
>> >         }
>> >
>> >         if (lsn) {
>> > -               error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
>> > +               error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
>> >                 ip->i_itemp->ili_fsync_fields = 0;
>> >         }
>> >         xfs_iunlock(ip, XFS_ILOCK_SHARED);
>> > @@ -193,8 +194,9 @@ xfs_file_fsync(
>> >          * an already allocated file and thus do not have any metadata to
>> >          * commit.
>> >          */
>> > -       if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
>> > -           mp->m_logdev_targp == mp->m_ddev_targp)
>> > +       if (!XFS_IS_REALTIME_INODE(ip) &&
>> > +           mp->m_logdev_targp == mp->m_ddev_targp &&
>> > +           flushseq == READ_ONCE(mp->m_flushseq))
>>
>> ... and here READ m_flush_completed_seq
>> if (m_flush_completed_seq > m_flush_submitted_seq)
>> it is safe to skip issue flush.
>> Then probably READ_ONCE() is not enough and need smb_rmb?
>>
>
> IIUC, basically we need to guarantee that a flush submits after
> file_write_and_wait() and completes before we return.

Yeh. unless we check if file_write_and_wait() submitted anything
at all.

> If we do something
> like the above, I wonder if that means we could wait for the submit ==
> complete if we observe submit was bumped since it was initially sampled
> above (rather than issue another flush, which would be necessary if a
> submit hadn't occurred))..?
>
> If we do end up with something like this, I think it's a bit cleaner to
> stuff the counter(s) in the xfs_buftarg structure and update them from
> the generic buffer submit/completion code based on XBF_FLUSH. FWIW, I
> suspect we could also update said counter(s) from
> xfs_blkdev_issue_flush().
>

I think what you are suggesting is to optimize more cases which are
not optimized now. That is probably possible, but also more complicated
to get right and not sure if the workloads that gain from this are important
enough.

If I am not mistaken the way to fix the current optimization is to record
the last SYNC_DONE lsn (which is sort of what Christoph suggested)
and the last WANY_SYNC|ACTIVE lsn.
After  file_write_and_wait() need to save pre_sync_lsn and before
return need to make sure that post_sync_lsn >= pre_sync_lsn or
issue a flush.

Amir.
Brian Foster Aug. 31, 2017, 8:10 p.m. UTC | #4
On Thu, Aug 31, 2017 at 10:20:19PM +0300, Amir Goldstein wrote:
> On Thu, Aug 31, 2017 at 7:39 PM, Brian Foster <bfoster@redhat.com> wrote:
> > On Thu, Aug 31, 2017 at 05:37:06PM +0300, Amir Goldstein wrote:
> >> On Thu, Aug 31, 2017 at 4:47 PM, Christoph Hellwig <hch@lst.de> wrote:
> >> > I think something like the following patch (totally untested,
> >> > just an idea) should fix the issue, right?
> >>
> >> I think that is not enough.
> >>
> >> >
> >> > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> >> > index c4893e226fd8..555fcae9a18f 100644
> >> > --- a/fs/xfs/xfs_file.c
> >> > +++ b/fs/xfs/xfs_file.c
> >> > @@ -135,7 +135,7 @@ xfs_file_fsync(
> >> >         struct xfs_inode        *ip = XFS_I(inode);
> >> >         struct xfs_mount        *mp = ip->i_mount;
> >> >         int                     error = 0;
> >> > -       int                     log_flushed = 0;
> >> > +       unsigned int            flushseq;
> >> >         xfs_lsn_t               lsn = 0;
> >> >
> >> >         trace_xfs_file_fsync(ip);
> >> > @@ -143,6 +143,7 @@ xfs_file_fsync(
> >> >         error = file_write_and_wait_range(file, start, end);
> >> >         if (error)
> >> >                 return error;
> >> > +       flushseq = READ_ONCE(mp->m_flushseq);
> >>
> >> imagine that flush was submitted and completed before
> >> file_write_and_wait_range() but m_flushseq incremented  after.
> >> maybe here READ m_flush_submitted_seq...
> >>
> >> >
> >> >         if (XFS_FORCED_SHUTDOWN(mp))
> >> >                 return -EIO;
> >> > @@ -181,7 +182,7 @@ xfs_file_fsync(
> >> >         }
> >> >
> >> >         if (lsn) {
> >> > -               error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
> >> > +               error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
> >> >                 ip->i_itemp->ili_fsync_fields = 0;
> >> >         }
> >> >         xfs_iunlock(ip, XFS_ILOCK_SHARED);
> >> > @@ -193,8 +194,9 @@ xfs_file_fsync(
> >> >          * an already allocated file and thus do not have any metadata to
> >> >          * commit.
> >> >          */
> >> > -       if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
> >> > -           mp->m_logdev_targp == mp->m_ddev_targp)
> >> > +       if (!XFS_IS_REALTIME_INODE(ip) &&
> >> > +           mp->m_logdev_targp == mp->m_ddev_targp &&
> >> > +           flushseq == READ_ONCE(mp->m_flushseq))
> >>
> >> ... and here READ m_flush_completed_seq
> >> if (m_flush_completed_seq > m_flush_submitted_seq)
> >> it is safe to skip issue flush.
> >> Then probably READ_ONCE() is not enough and need smb_rmb?
> >>
> >
> > IIUC, basically we need to guarantee that a flush submits after
> > file_write_and_wait() and completes before we return.
> 
> Yeh. unless we check if file_write_and_wait() submitted anything
> at all.
> 

Ok, thanks.

> > If we do something
> > like the above, I wonder if that means we could wait for the submit ==
> > complete if we observe submit was bumped since it was initially sampled
> > above (rather than issue another flush, which would be necessary if a
> > submit hadn't occurred))..?
> >
> > If we do end up with something like this, I think it's a bit cleaner to
> > stuff the counter(s) in the xfs_buftarg structure and update them from
> > the generic buffer submit/completion code based on XBF_FLUSH. FWIW, I
> > suspect we could also update said counter(s) from
> > xfs_blkdev_issue_flush().
> >
> 
> I think what you are suggesting is to optimize more cases which are
> not optimized now. That is probably possible, but also more complicated
> to get right and not sure if the workloads that gain from this are important
> enough.
> 

Not necessarily. I'm just suggesting that the code could be factored
more generically/elegantly such that the logic is easier to follow. That
may facilitate optimizing more cases, but that's a secondary benefit. In
practice, the log buffer code is the only place we actually set
XBF_FLUSH, for example.

We could toss the waiting idea for now and issue the flush unless we
know one has submitted & completed since the writeback based on the
counters. Waiting on a log buffer may already do the wait for us in most
cases anyways, we just don't know if it submitted after the writeback or
not without the submit counter.

> If I am not mistaken the way to fix the current optimization is to record
> the last SYNC_DONE lsn (which is sort of what Christoph suggested)
> and the last WANY_SYNC|ACTIVE lsn.
> After  file_write_and_wait() need to save pre_sync_lsn and before
> return need to make sure that post_sync_lsn >= pre_sync_lsn or
> issue a flush.
> 

Perhaps, but I'm not quite following what you mean by pre/post LSNs.
Note that I believe log buffers can complete out of order, if that is
relevant here. Either way, this still seems like underhanded logic
IMO...

If the requirement is a simple "issue a flush if we can't detect that
one has submitted+completed on this device since our writeback
completed" rule, why intentionally obfuscate that with internal log
buffer state such as log buffer header LSN and log state machine values?
Just track flush submission/completions as you suggested earlier and the
fsync logic is much easier to follow. Then we don't need to work
backwards from the XFS logging infrastructure just to try and verify
whether a flush has occurred in all cases. :)

Brian

> Amir.
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Amir Goldstein Sept. 1, 2017, 7:58 a.m. UTC | #5
On Thu, Aug 31, 2017 at 11:10 PM, Brian Foster <bfoster@redhat.com> wrote:
> On Thu, Aug 31, 2017 at 10:20:19PM +0300, Amir Goldstein wrote:
>> On Thu, Aug 31, 2017 at 7:39 PM, Brian Foster <bfoster@redhat.com> wrote:
...
>> > If we do something
>> > like the above, I wonder if that means we could wait for the submit ==
>> > complete if we observe submit was bumped since it was initially sampled
>> > above (rather than issue another flush, which would be necessary if a
>> > submit hadn't occurred))..?
>> >
>> > If we do end up with something like this, I think it's a bit cleaner to
>> > stuff the counter(s) in the xfs_buftarg structure and update them from
>> > the generic buffer submit/completion code based on XBF_FLUSH. FWIW, I
>> > suspect we could also update said counter(s) from
>> > xfs_blkdev_issue_flush().
>> >
>>
>> I think what you are suggesting is to optimize more cases which are
>> not optimized now. That is probably possible, but also more complicated
>> to get right and not sure if the workloads that gain from this are important
>> enough.
>>
>
> Not necessarily. I'm just suggesting that the code could be factored
> more generically/elegantly such that the logic is easier to follow. That
> may facilitate optimizing more cases, but that's a secondary benefit. In
> practice, the log buffer code is the only place we actually set
> XBF_FLUSH, for example.
>

I guess that makes sense.
Although it is going to end up with more code, so if we are not going for
optimization for more cases (i.e. subsequent fdatasync) we should consider
if the extra code is worth it.

>
>> If I am not mistaken the way to fix the current optimization is to record
>> the last SYNC_DONE lsn (which is sort of what Christoph suggested)
>> and the last WANY_SYNC|ACTIVE lsn.
>> After  file_write_and_wait() need to save pre_sync_lsn and before
>> return need to make sure that post_sync_lsn >= pre_sync_lsn or
>> issue a flush.
>>
>
> Perhaps, but I'm not quite following what you mean by pre/post LSNs.
> Note that I believe log buffers can complete out of order, if that is
> relevant here. Either way, this still seems like underhanded logic
> IMO...
>
> If the requirement is a simple "issue a flush if we can't detect that
> one has submitted+completed on this device since our writeback
> completed" rule, why intentionally obfuscate that with internal log
> buffer state such as log buffer header LSN and log state machine values?
> Just track flush submission/completions as you suggested earlier and the
> fsync logic is much easier to follow. Then we don't need to work
> backwards from the XFS logging infrastructure just to try and verify
> whether a flush has occurred in all cases. :)
>

Your argument makes a lot of sense. I'm just trying to be extra cautious
and looking for a small step solution. As Darrick wrote.. "safety first" :)

Amir.
Christoph Hellwig Sept. 1, 2017, 9:47 a.m. UTC | #6
On Thu, Aug 31, 2017 at 05:37:06PM +0300, Amir Goldstein wrote:
> > index bcb2f860e508..3c0cbb98581e 100644
> > --- a/fs/xfs/xfs_log.c
> > +++ b/fs/xfs/xfs_log.c
> > @@ -2922,6 +2922,8 @@ xlog_state_done_syncing(
> >                 iclog->ic_state = XLOG_STATE_DONE_SYNC;
> >         }
> >
> > +       log->l_mp->m_flushseq++;
> 
> I recon this should use WRITE_ONCE or smp_wmb()
> and then also increment m_flush_submitted_seq *before*
> issueing flush

It's under l_icloglock, so we shouldn't need a WRITE_ONCE.
If we did we'd need an atomic_t.

> If state machine does not allow more than a single flush
> to be in flight (?) then the 2 seq counters could be reduced
> to single seq counter with (m_flushseq % 2) == 1 for submitted
> and  (m_flushseq % 2) == 0 for completed and the test in fsync
> would be (flushseq % 2) == (READ_ONCE(mp->m_flushseq) % 2)

The state machine allows multiple flushes to in flight.
Christoph Hellwig Sept. 1, 2017, 9:52 a.m. UTC | #7
On Thu, Aug 31, 2017 at 10:20:19PM +0300, Amir Goldstein wrote:
> > IIUC, basically we need to guarantee that a flush submits after
> > file_write_and_wait() and completes before we return.
> 
> Yeh. unless we check if file_write_and_wait() submitted anything
> at all.

Even if file_write_and_wait did not submit anything we need to
make sure a flush was submitted and completed after entering
xfs_file_fsync.  For one to deal with the case where we wrote
back data from the flusher threads or the VM, and also for
the direct I/O case.


Btw, do you have any results for your simple catch?  I wonder
how much of an issue it actually is in practice.
Amir Goldstein Sept. 1, 2017, 10:37 a.m. UTC | #8
On Fri, Sep 1, 2017 at 12:52 PM, Christoph Hellwig <hch@lst.de> wrote:
> On Thu, Aug 31, 2017 at 10:20:19PM +0300, Amir Goldstein wrote:
>> > IIUC, basically we need to guarantee that a flush submits after
>> > file_write_and_wait() and completes before we return.
>>
>> Yeh. unless we check if file_write_and_wait() submitted anything
>> at all.
>
> Even if file_write_and_wait did not submit anything we need to
> make sure a flush was submitted and completed after entering
> xfs_file_fsync.  For one to deal with the case where we wrote
> back data from the flusher threads or the VM, and also for
> the direct I/O case.
>

Right.

>
> Btw, do you have any results for your simple catch?  I wonder
> how much of an issue it actually is in practice.

Well since the bug was demonstrated using crash simulator,
I have no idea how often one would run into this with actual power
failure, but does it really matter how often, if we *know* that it can
happen?

The sequence of bios that I illustrated in commit message is
what I saw regardless of the crash simulator, and that sequence
leaves a data block in the disk cache, with nothing that guaranties
a FLUSH afterwards. So for an unlucky subject, that gives a window
of data loss in case of power failure at least until the next time
a flusher thread runs. That could be seconds, no?

I am working on a variant of the flushseq you suggested using
log->l_last_sync_lsn that may be simple enough and not as painful
as turning off optimization for WANT_SYNC log buffer.

Amir.
Christoph Hellwig Sept. 1, 2017, 10:43 a.m. UTC | #9
On Fri, Sep 01, 2017 at 01:37:28PM +0300, Amir Goldstein wrote:
> Well since the bug was demonstrated using crash simulator,
> I have no idea how often one would run into this with actual power
> failure, but does it really matter how often, if we *know* that it can
> happen?

No, the issue is real.  I wonder how much of an issue the additional
flushes are in practice.

> I am working on a variant of the flushseq you suggested using
> log->l_last_sync_lsn that may be simple enough and not as painful
> as turning off optimization for WANT_SYNC log buffer.

Ok, I'll take a look once you post it.
Brian Foster Sept. 1, 2017, 10:46 a.m. UTC | #10
On Fri, Sep 01, 2017 at 10:58:43AM +0300, Amir Goldstein wrote:
> On Thu, Aug 31, 2017 at 11:10 PM, Brian Foster <bfoster@redhat.com> wrote:
> > On Thu, Aug 31, 2017 at 10:20:19PM +0300, Amir Goldstein wrote:
> >> On Thu, Aug 31, 2017 at 7:39 PM, Brian Foster <bfoster@redhat.com> wrote:
> ...
> >> > If we do something
> >> > like the above, I wonder if that means we could wait for the submit ==
> >> > complete if we observe submit was bumped since it was initially sampled
> >> > above (rather than issue another flush, which would be necessary if a
> >> > submit hadn't occurred))..?
> >> >
> >> > If we do end up with something like this, I think it's a bit cleaner to
> >> > stuff the counter(s) in the xfs_buftarg structure and update them from
> >> > the generic buffer submit/completion code based on XBF_FLUSH. FWIW, I
> >> > suspect we could also update said counter(s) from
> >> > xfs_blkdev_issue_flush().
> >> >
> >>
> >> I think what you are suggesting is to optimize more cases which are
> >> not optimized now. That is probably possible, but also more complicated
> >> to get right and not sure if the workloads that gain from this are important
> >> enough.
> >>
> >
> > Not necessarily. I'm just suggesting that the code could be factored
> > more generically/elegantly such that the logic is easier to follow. That
> > may facilitate optimizing more cases, but that's a secondary benefit. In
> > practice, the log buffer code is the only place we actually set
> > XBF_FLUSH, for example.
> >
> 
> I guess that makes sense.
> Although it is going to end up with more code, so if we are not going for
> optimization for more cases (i.e. subsequent fdatasync) we should consider
> if the extra code is worth it.
> 

Incrementally more than Christoph's patch. I don't think that's an
issue.

> >
> >> If I am not mistaken the way to fix the current optimization is to record
> >> the last SYNC_DONE lsn (which is sort of what Christoph suggested)
> >> and the last WANY_SYNC|ACTIVE lsn.
> >> After  file_write_and_wait() need to save pre_sync_lsn and before
> >> return need to make sure that post_sync_lsn >= pre_sync_lsn or
> >> issue a flush.
> >>
> >
> > Perhaps, but I'm not quite following what you mean by pre/post LSNs.
> > Note that I believe log buffers can complete out of order, if that is
> > relevant here. Either way, this still seems like underhanded logic
> > IMO...
> >
> > If the requirement is a simple "issue a flush if we can't detect that
> > one has submitted+completed on this device since our writeback
> > completed" rule, why intentionally obfuscate that with internal log
> > buffer state such as log buffer header LSN and log state machine values?
> > Just track flush submission/completions as you suggested earlier and the
> > fsync logic is much easier to follow. Then we don't need to work
> > backwards from the XFS logging infrastructure just to try and verify
> > whether a flush has occurred in all cases. :)
> >
> 
> Your argument makes a lot of sense. I'm just trying to be extra cautious
> and looking for a small step solution. As Darrick wrote.. "safety first" :)
> 

Sure. FWIW, I think your patch suits that purpose as it basically
disables the shady bits of the optimization. It looks like it's already
in Darrick's for-next branch and Cc'd to stable as well.

A cleaner rework of this mechanism can come after, hopefully restore the
full optimization safely and perhaps clean out the whole log_flushed
thing.

Brian

> Amir.
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index c4893e226fd8..555fcae9a18f 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -135,7 +135,7 @@  xfs_file_fsync(
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
 	int			error = 0;
-	int			log_flushed = 0;
+	unsigned int		flushseq;
 	xfs_lsn_t		lsn = 0;
 
 	trace_xfs_file_fsync(ip);
@@ -143,6 +143,7 @@  xfs_file_fsync(
 	error = file_write_and_wait_range(file, start, end);
 	if (error)
 		return error;
+	flushseq = READ_ONCE(mp->m_flushseq);
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
@@ -181,7 +182,7 @@  xfs_file_fsync(
 	}
 
 	if (lsn) {
-		error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
+		error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
 		ip->i_itemp->ili_fsync_fields = 0;
 	}
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -193,8 +194,9 @@  xfs_file_fsync(
 	 * an already allocated file and thus do not have any metadata to
 	 * commit.
 	 */
-	if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
-	    mp->m_logdev_targp == mp->m_ddev_targp)
+	if (!XFS_IS_REALTIME_INODE(ip) &&
+	    mp->m_logdev_targp == mp->m_ddev_targp &&
+	    flushseq == READ_ONCE(mp->m_flushseq))
 		xfs_blkdev_issue_flush(mp->m_ddev_targp);
 
 	return error;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index bcb2f860e508..3c0cbb98581e 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -2922,6 +2922,8 @@  xlog_state_done_syncing(
 		iclog->ic_state = XLOG_STATE_DONE_SYNC;
 	}
 
+	log->l_mp->m_flushseq++;
+
 	/*
 	 * Someone could be sleeping prior to writing out the next
 	 * iclog buffer, we wake them all, one will get to do the
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index e0792d036be2..556e01a0175e 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -79,6 +79,7 @@  typedef struct xfs_mount {
 	struct percpu_counter	m_icount;	/* allocated inodes counter */
 	struct percpu_counter	m_ifree;	/* free inodes counter */
 	struct percpu_counter	m_fdblocks;	/* free block counter */
+	unsigned int		m_flushseq;
 
 	struct xfs_buf		*m_sb_bp;	/* buffer for superblock */
 	char			*m_fsname;	/* filesystem name */