diff mbox

[PATCH-v4,6/7] ext4: add support for a lazytime mount option

Message ID 1416997437-26092-7-git-send-email-tytso@mit.edu (mailing list archive)
State Not Applicable, archived
Headers show

Commit Message

Theodore Ts'o Nov. 26, 2014, 10:23 a.m. UTC
Add an optimization for the MS_LAZYTIME mount option so that we will
opportunistically write out any inodes with the I_DIRTY_TIME flag set
in a particular inode table block when we need to update some inode in
that inode table block anyway.

Also add some temporary code so that we can set the lazytime mount
option without needing a modified /sbin/mount program which can set
MS_LAZYTIME.  We can eventually make this go away once util-linux has
added support.

Google-Bug-Id: 18297052

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/inode.c             | 49 ++++++++++++++++++++++++++++++++++++++++++---
 fs/ext4/super.c             |  9 +++++++++
 include/trace/events/ext4.h | 30 +++++++++++++++++++++++++++
 3 files changed, 85 insertions(+), 3 deletions(-)

Comments

Christoph Hellwig Nov. 26, 2014, 7:24 p.m. UTC | #1
The subject line seems incorrect, this seems to implement some form
of dirty inode writeback clustering.

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dave Chinner Nov. 26, 2014, 10:48 p.m. UTC | #2
On Wed, Nov 26, 2014 at 05:23:56AM -0500, Theodore Ts'o wrote:
> Add an optimization for the MS_LAZYTIME mount option so that we will
> opportunistically write out any inodes with the I_DIRTY_TIME flag set
> in a particular inode table block when we need to update some inode in
> that inode table block anyway.
> 
> Also add some temporary code so that we can set the lazytime mount
> option without needing a modified /sbin/mount program which can set
> MS_LAZYTIME.  We can eventually make this go away once util-linux has
> added support.
> 
> Google-Bug-Id: 18297052
> 
> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
> ---
>  fs/ext4/inode.c             | 49 ++++++++++++++++++++++++++++++++++++++++++---
>  fs/ext4/super.c             |  9 +++++++++
>  include/trace/events/ext4.h | 30 +++++++++++++++++++++++++++
>  3 files changed, 85 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 5653fa4..8308c82 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -4140,6 +4140,51 @@ static int ext4_inode_blocks_set(handle_t *handle,
>  }
>  
>  /*
> + * Opportunistically update the other time fields for other inodes in
> + * the same inode table block.
> + */
> +static void ext4_update_other_inodes_time(struct super_block *sb,
> +					  unsigned long orig_ino, char *buf)
> +{
> +	struct ext4_inode_info	*ei;
> +	struct ext4_inode	*raw_inode;
> +	unsigned long		ino;
> +	struct inode		*inode;
> +	int		i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
> +	int		inode_size = EXT4_INODE_SIZE(sb);
> +
> +	ino = orig_ino & ~(inodes_per_block - 1);
> +	for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) {
> +		if (ino == orig_ino)
> +			continue;
> +		inode = find_active_inode_nowait(sb, ino);
> +		if (!inode ||
> +		    (inode->i_state & I_DIRTY_TIME) == 0 ||
> +		    !spin_trylock(&inode->i_lock)) {
> +			iput(inode);
> +			continue;
> +		}
> +		inode->i_state &= ~I_DIRTY_TIME;
> +		inode->i_ts_dirty_day = 0;
> +		spin_unlock(&inode->i_lock);
> +		inode_requeue_dirtytime(inode);
> +
> +		ei = EXT4_I(inode);
> +		raw_inode = (struct ext4_inode *) buf;
> +
> +		spin_lock(&ei->i_raw_lock);
> +		EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
> +		EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
> +		EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
> +		ext4_inode_csum_set(inode, raw_inode, ei);
> +		spin_unlock(&ei->i_raw_lock);
> +		trace_ext4_other_inode_update_time(inode, orig_ino);
> +		iput(inode);
> +	}
> +}

Am I right in that this now does unlogged timestamp updates of
inodes? What happens when that buffer gets overwritten by log
recover after a crash? The timestamp updates get lost?

FYI, XFS has had all sorts of nasty log recovery corner cases
caused by log recovery overwriting non-logged inode updates like
this. In the past few years we've removed every single non-logged
inode update "optimisation" so that all changes (including timestamps)
are transactional so inode state on disk not matching what log
recovery wrote to disk for all the other inode metadata...

Optimistic unlogged inode updates are a slippery slope, and history
tells me that it doesn't lead to a nice place....

Cheers,

Dave.
Andreas Dilger Nov. 26, 2014, 11:10 p.m. UTC | #3
On Nov 26, 2014, at 3:48 PM, Dave Chinner <david@fromorbit.com> wrote:
> 
> On Wed, Nov 26, 2014 at 05:23:56AM -0500, Theodore Ts'o wrote:
>> Add an optimization for the MS_LAZYTIME mount option so that we will
>> opportunistically write out any inodes with the I_DIRTY_TIME flag set
>> in a particular inode table block when we need to update some inode
>> in that inode table block anyway.
>> 
>> Also add some temporary code so that we can set the lazytime mount
>> option without needing a modified /sbin/mount program which can set
>> MS_LAZYTIME.  We can eventually make this go away once util-linux has
>> added support.
>> 
>> Google-Bug-Id: 18297052
>> 
>> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
>> ---
>> fs/ext4/inode.c             | 49 ++++++++++++++++++++++++++++++++++++++++++---
>> fs/ext4/super.c             |  9 +++++++++
>> include/trace/events/ext4.h | 30 +++++++++++++++++++++++++++
>> 3 files changed, 85 insertions(+), 3 deletions(-)
>> 
>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> index 5653fa4..8308c82 100644
>> --- a/fs/ext4/inode.c
>> +++ b/fs/ext4/inode.c
>> @@ -4140,6 +4140,51 @@ static int ext4_inode_blocks_set(handle_t *handle,
>> }
>> 
>> /*
>> + * Opportunistically update the other time fields for other inodes in
>> + * the same inode table block.
>> + */
>> +static void ext4_update_other_inodes_time(struct super_block *sb,
>> +					  unsigned long orig_ino, char *buf)
>> +{
>> +	struct ext4_inode_info	*ei;
>> +	struct ext4_inode	*raw_inode;
>> +	unsigned long		ino;
>> +	struct inode		*inode;
>> +	int		i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
>> +	int		inode_size = EXT4_INODE_SIZE(sb);
>> +
>> +	ino = orig_ino & ~(inodes_per_block - 1);
>> +	for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) {
>> +		if (ino == orig_ino)
>> +			continue;
>> +		inode = find_active_inode_nowait(sb, ino);
>> +		if (!inode ||
>> +		    (inode->i_state & I_DIRTY_TIME) == 0 ||
>> +		    !spin_trylock(&inode->i_lock)) {
>> +			iput(inode);
>> +			continue;
>> +		}
>> +		inode->i_state &= ~I_DIRTY_TIME;
>> +		inode->i_ts_dirty_day = 0;
>> +		spin_unlock(&inode->i_lock);
>> +		inode_requeue_dirtytime(inode);
>> +
>> +		ei = EXT4_I(inode);
>> +		raw_inode = (struct ext4_inode *) buf;
>> +
>> +		spin_lock(&ei->i_raw_lock);
>> +		EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
>> +		EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
>> +		EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
>> +		ext4_inode_csum_set(inode, raw_inode, ei);
>> +		spin_unlock(&ei->i_raw_lock);
>> +		trace_ext4_other_inode_update_time(inode, orig_ino);
>> +		iput(inode);
>> +	}
>> +}
> 
> Am I right in that this now does unlogged timestamp updates of
> inodes? What happens when that buffer gets overwritten by log
> recover after a crash? The timestamp updates get lost?
> 
> FYI, XFS has had all sorts of nasty log recovery corner cases
> caused by log recovery overwriting non-logged inode updates like
> this. In the past few years we've removed every single non-logged
> inode update "optimisation" so that all changes (including timestamps)
> are transactional so inode state on disk not matching what log
> recovery wrote to disk for all the other inode metadata...
> 
> Optimistic unlogged inode updates are a slippery slope, and history
> tells me that it doesn't lead to a nice place....

Since ext4/jbd2 is logging the whole block, unlike XFS which is doing
logical journaling, this isn't an unlogged update.  It is just taking
advantage of the fact that the whole block is going to be logged and
written to the disk anyway.  If the only update needed for other inodes
in the block is the timestamp then they may as well be flushed to disk
at the same time and avoid the need for another update later on.

Cheers, Andreas





--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dave Chinner Nov. 26, 2014, 11:35 p.m. UTC | #4
On Wed, Nov 26, 2014 at 04:10:44PM -0700, Andreas Dilger wrote:
> On Nov 26, 2014, at 3:48 PM, Dave Chinner <david@fromorbit.com> wrote:
> > 
> > On Wed, Nov 26, 2014 at 05:23:56AM -0500, Theodore Ts'o wrote:
> >> Add an optimization for the MS_LAZYTIME mount option so that we will
> >> opportunistically write out any inodes with the I_DIRTY_TIME flag set
> >> in a particular inode table block when we need to update some inode
> >> in that inode table block anyway.
> >> 
> >> Also add some temporary code so that we can set the lazytime mount
> >> option without needing a modified /sbin/mount program which can set
> >> MS_LAZYTIME.  We can eventually make this go away once util-linux has
> >> added support.
> >> 
> >> Google-Bug-Id: 18297052
> >> 
> >> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
> >> ---
> >> fs/ext4/inode.c             | 49 ++++++++++++++++++++++++++++++++++++++++++---
> >> fs/ext4/super.c             |  9 +++++++++
> >> include/trace/events/ext4.h | 30 +++++++++++++++++++++++++++
> >> 3 files changed, 85 insertions(+), 3 deletions(-)
> >> 
> >> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> >> index 5653fa4..8308c82 100644
> >> --- a/fs/ext4/inode.c
> >> +++ b/fs/ext4/inode.c
> >> @@ -4140,6 +4140,51 @@ static int ext4_inode_blocks_set(handle_t *handle,
> >> }
> >> 
> >> /*
> >> + * Opportunistically update the other time fields for other inodes in
> >> + * the same inode table block.
> >> + */
> >> +static void ext4_update_other_inodes_time(struct super_block *sb,
> >> +					  unsigned long orig_ino, char *buf)
> >> +{
> >> +	struct ext4_inode_info	*ei;
> >> +	struct ext4_inode	*raw_inode;
> >> +	unsigned long		ino;
> >> +	struct inode		*inode;
> >> +	int		i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
> >> +	int		inode_size = EXT4_INODE_SIZE(sb);
> >> +
> >> +	ino = orig_ino & ~(inodes_per_block - 1);
> >> +	for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) {
> >> +		if (ino == orig_ino)
> >> +			continue;
> >> +		inode = find_active_inode_nowait(sb, ino);
> >> +		if (!inode ||
> >> +		    (inode->i_state & I_DIRTY_TIME) == 0 ||
> >> +		    !spin_trylock(&inode->i_lock)) {
> >> +			iput(inode);
> >> +			continue;
> >> +		}
> >> +		inode->i_state &= ~I_DIRTY_TIME;
> >> +		inode->i_ts_dirty_day = 0;
> >> +		spin_unlock(&inode->i_lock);
> >> +		inode_requeue_dirtytime(inode);
> >> +
> >> +		ei = EXT4_I(inode);
> >> +		raw_inode = (struct ext4_inode *) buf;
> >> +
> >> +		spin_lock(&ei->i_raw_lock);
> >> +		EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
> >> +		EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
> >> +		EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
> >> +		ext4_inode_csum_set(inode, raw_inode, ei);
> >> +		spin_unlock(&ei->i_raw_lock);
> >> +		trace_ext4_other_inode_update_time(inode, orig_ino);
> >> +		iput(inode);
> >> +	}
> >> +}
> > 
> > Am I right in that this now does unlogged timestamp updates of
> > inodes? What happens when that buffer gets overwritten by log
> > recover after a crash? The timestamp updates get lost?
> > 
> > FYI, XFS has had all sorts of nasty log recovery corner cases
> > caused by log recovery overwriting non-logged inode updates like
> > this. In the past few years we've removed every single non-logged
> > inode update "optimisation" so that all changes (including timestamps)
> > are transactional so inode state on disk not matching what log
> > recovery wrote to disk for all the other inode metadata...
> > 
> > Optimistic unlogged inode updates are a slippery slope, and history
> > tells me that it doesn't lead to a nice place....
> 
> Since ext4/jbd2 is logging the whole block, unlike XFS which is doing
> logical journaling, this isn't an unlogged update.  It is just taking
> advantage of the fact that the whole block is going to be logged and
> written to the disk anyway.

Urk - that's worse, isn't it? i.e the code above calls iput() from
within a current transaction context?  What happens if that drops
the last reference to the inode and it gets evicted due to racing
with an unlink? Won't that try to start another transaction to free
the inode (i.e. through ext4_evict_inode())?



>
> If the only update needed for other inodes
> in the block is the timestamp then they may as well be flushed to disk
> at the same time and avoid the need for another update later on.
> 
> Cheers, Andreas
> 
> 
> 
> 
> 
>
Jan Kara Nov. 27, 2014, 1:27 p.m. UTC | #5
On Thu 27-11-14 10:35:37, Dave Chinner wrote:
> On Wed, Nov 26, 2014 at 04:10:44PM -0700, Andreas Dilger wrote:
> > On Nov 26, 2014, at 3:48 PM, Dave Chinner <david@fromorbit.com> wrote:
> > > 
> > > On Wed, Nov 26, 2014 at 05:23:56AM -0500, Theodore Ts'o wrote:
> > >> Add an optimization for the MS_LAZYTIME mount option so that we will
> > >> opportunistically write out any inodes with the I_DIRTY_TIME flag set
> > >> in a particular inode table block when we need to update some inode
> > >> in that inode table block anyway.
> > >> 
> > >> Also add some temporary code so that we can set the lazytime mount
> > >> option without needing a modified /sbin/mount program which can set
> > >> MS_LAZYTIME.  We can eventually make this go away once util-linux has
> > >> added support.
> > >> 
> > >> Google-Bug-Id: 18297052
> > >> 
> > >> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
> > >> ---
> > >> fs/ext4/inode.c             | 49 ++++++++++++++++++++++++++++++++++++++++++---
> > >> fs/ext4/super.c             |  9 +++++++++
> > >> include/trace/events/ext4.h | 30 +++++++++++++++++++++++++++
> > >> 3 files changed, 85 insertions(+), 3 deletions(-)
> > >> 
> > >> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> > >> index 5653fa4..8308c82 100644
> > >> --- a/fs/ext4/inode.c
> > >> +++ b/fs/ext4/inode.c
> > >> @@ -4140,6 +4140,51 @@ static int ext4_inode_blocks_set(handle_t *handle,
> > >> }
> > >> 
> > >> /*
> > >> + * Opportunistically update the other time fields for other inodes in
> > >> + * the same inode table block.
> > >> + */
> > >> +static void ext4_update_other_inodes_time(struct super_block *sb,
> > >> +					  unsigned long orig_ino, char *buf)
> > >> +{
> > >> +	struct ext4_inode_info	*ei;
> > >> +	struct ext4_inode	*raw_inode;
> > >> +	unsigned long		ino;
> > >> +	struct inode		*inode;
> > >> +	int		i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
> > >> +	int		inode_size = EXT4_INODE_SIZE(sb);
> > >> +
> > >> +	ino = orig_ino & ~(inodes_per_block - 1);
> > >> +	for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) {
> > >> +		if (ino == orig_ino)
> > >> +			continue;
> > >> +		inode = find_active_inode_nowait(sb, ino);
> > >> +		if (!inode ||
> > >> +		    (inode->i_state & I_DIRTY_TIME) == 0 ||
> > >> +		    !spin_trylock(&inode->i_lock)) {
> > >> +			iput(inode);
> > >> +			continue;
> > >> +		}
> > >> +		inode->i_state &= ~I_DIRTY_TIME;
> > >> +		inode->i_ts_dirty_day = 0;
> > >> +		spin_unlock(&inode->i_lock);
> > >> +		inode_requeue_dirtytime(inode);
> > >> +
> > >> +		ei = EXT4_I(inode);
> > >> +		raw_inode = (struct ext4_inode *) buf;
> > >> +
> > >> +		spin_lock(&ei->i_raw_lock);
> > >> +		EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
> > >> +		EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
> > >> +		EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
> > >> +		ext4_inode_csum_set(inode, raw_inode, ei);
> > >> +		spin_unlock(&ei->i_raw_lock);
> > >> +		trace_ext4_other_inode_update_time(inode, orig_ino);
> > >> +		iput(inode);
> > >> +	}
> > >> +}
> > > 
> > > Am I right in that this now does unlogged timestamp updates of
> > > inodes? What happens when that buffer gets overwritten by log
> > > recover after a crash? The timestamp updates get lost?
> > > 
> > > FYI, XFS has had all sorts of nasty log recovery corner cases
> > > caused by log recovery overwriting non-logged inode updates like
> > > this. In the past few years we've removed every single non-logged
> > > inode update "optimisation" so that all changes (including timestamps)
> > > are transactional so inode state on disk not matching what log
> > > recovery wrote to disk for all the other inode metadata...
> > > 
> > > Optimistic unlogged inode updates are a slippery slope, and history
> > > tells me that it doesn't lead to a nice place....
> > 
> > Since ext4/jbd2 is logging the whole block, unlike XFS which is doing
> > logical journaling, this isn't an unlogged update.  It is just taking
> > advantage of the fact that the whole block is going to be logged and
> > written to the disk anyway.
> 
> Urk - that's worse, isn't it? i.e the code above calls iput() from
> within a current transaction context?  What happens if that drops
> the last reference to the inode and it gets evicted due to racing
> with an unlink? Won't that try to start another transaction to free
> the inode (i.e. through ext4_evict_inode())?
  Yeah, the patch looks buggy (and racy wrt concurrent updates of time
stamps as well). I think if we want to do this optimization, we would need
a function like "clear inode dirty bits for this range of inode numbers".
That is doable atomically within VFS and although it looks somewhat ugly,
the performance
Jan Kara Nov. 27, 2014, 1:32 p.m. UTC | #6
On Thu 27-11-14 14:27:52, Jan Kara wrote:
> On Thu 27-11-14 10:35:37, Dave Chinner wrote:
> > On Wed, Nov 26, 2014 at 04:10:44PM -0700, Andreas Dilger wrote:
> > > On Nov 26, 2014, at 3:48 PM, Dave Chinner <david@fromorbit.com> wrote:
> > > > 
> > > > On Wed, Nov 26, 2014 at 05:23:56AM -0500, Theodore Ts'o wrote:
> > > >> Add an optimization for the MS_LAZYTIME mount option so that we will
> > > >> opportunistically write out any inodes with the I_DIRTY_TIME flag set
> > > >> in a particular inode table block when we need to update some inode
> > > >> in that inode table block anyway.
> > > >> 
> > > >> Also add some temporary code so that we can set the lazytime mount
> > > >> option without needing a modified /sbin/mount program which can set
> > > >> MS_LAZYTIME.  We can eventually make this go away once util-linux has
> > > >> added support.
> > > >> 
> > > >> Google-Bug-Id: 18297052
> > > >> 
> > > >> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
> > > >> ---
> > > >> fs/ext4/inode.c             | 49 ++++++++++++++++++++++++++++++++++++++++++---
> > > >> fs/ext4/super.c             |  9 +++++++++
> > > >> include/trace/events/ext4.h | 30 +++++++++++++++++++++++++++
> > > >> 3 files changed, 85 insertions(+), 3 deletions(-)
> > > >> 
> > > >> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> > > >> index 5653fa4..8308c82 100644
> > > >> --- a/fs/ext4/inode.c
> > > >> +++ b/fs/ext4/inode.c
> > > >> @@ -4140,6 +4140,51 @@ static int ext4_inode_blocks_set(handle_t *handle,
> > > >> }
> > > >> 
> > > >> /*
> > > >> + * Opportunistically update the other time fields for other inodes in
> > > >> + * the same inode table block.
> > > >> + */
> > > >> +static void ext4_update_other_inodes_time(struct super_block *sb,
> > > >> +					  unsigned long orig_ino, char *buf)
> > > >> +{
> > > >> +	struct ext4_inode_info	*ei;
> > > >> +	struct ext4_inode	*raw_inode;
> > > >> +	unsigned long		ino;
> > > >> +	struct inode		*inode;
> > > >> +	int		i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
> > > >> +	int		inode_size = EXT4_INODE_SIZE(sb);
> > > >> +
> > > >> +	ino = orig_ino & ~(inodes_per_block - 1);
> > > >> +	for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) {
> > > >> +		if (ino == orig_ino)
> > > >> +			continue;
> > > >> +		inode = find_active_inode_nowait(sb, ino);
> > > >> +		if (!inode ||
> > > >> +		    (inode->i_state & I_DIRTY_TIME) == 0 ||
> > > >> +		    !spin_trylock(&inode->i_lock)) {
> > > >> +			iput(inode);
> > > >> +			continue;
> > > >> +		}
> > > >> +		inode->i_state &= ~I_DIRTY_TIME;
> > > >> +		inode->i_ts_dirty_day = 0;
> > > >> +		spin_unlock(&inode->i_lock);
> > > >> +		inode_requeue_dirtytime(inode);
> > > >> +
> > > >> +		ei = EXT4_I(inode);
> > > >> +		raw_inode = (struct ext4_inode *) buf;
> > > >> +
> > > >> +		spin_lock(&ei->i_raw_lock);
> > > >> +		EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
> > > >> +		EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
> > > >> +		EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
> > > >> +		ext4_inode_csum_set(inode, raw_inode, ei);
> > > >> +		spin_unlock(&ei->i_raw_lock);
> > > >> +		trace_ext4_other_inode_update_time(inode, orig_ino);
> > > >> +		iput(inode);
> > > >> +	}
> > > >> +}
> > > > 
> > > > Am I right in that this now does unlogged timestamp updates of
> > > > inodes? What happens when that buffer gets overwritten by log
> > > > recover after a crash? The timestamp updates get lost?
> > > > 
> > > > FYI, XFS has had all sorts of nasty log recovery corner cases
> > > > caused by log recovery overwriting non-logged inode updates like
> > > > this. In the past few years we've removed every single non-logged
> > > > inode update "optimisation" so that all changes (including timestamps)
> > > > are transactional so inode state on disk not matching what log
> > > > recovery wrote to disk for all the other inode metadata...
> > > > 
> > > > Optimistic unlogged inode updates are a slippery slope, and history
> > > > tells me that it doesn't lead to a nice place....
> > > 
> > > Since ext4/jbd2 is logging the whole block, unlike XFS which is doing
> > > logical journaling, this isn't an unlogged update.  It is just taking
> > > advantage of the fact that the whole block is going to be logged and
> > > written to the disk anyway.
> > 
> > Urk - that's worse, isn't it? i.e the code above calls iput() from
> > within a current transaction context?  What happens if that drops
> > the last reference to the inode and it gets evicted due to racing
> > with an unlink? Won't that try to start another transaction to free
> > the inode (i.e. through ext4_evict_inode())?
>   Yeah, the patch looks buggy (and racy wrt concurrent updates of time
> stamps as well). I think if we want to do this optimization, we would need
> a function like "clear inode dirty bits for this range of inode numbers".
> That is doable atomically within VFS and although it looks somewhat ugly,
> the performance
  Sorry, I sent this too early (did send instead of postpone). So the patch
looks buggy because of iput() but it isn't racy wrt time updates as I
checked now. So it would be enough to move calling of this outside of the
transaction and start new handle for each inode.

								Honza
Theodore Ts'o Nov. 27, 2014, 3:25 p.m. UTC | #7
This is what I'm currently playing with which I believe fixes the iput()
problem.  In fs/ext4/inode.c:

struct other_inode {
	unsigned long		orig_ino;
	struct ext4_inode	*raw_inode;
};
static int other_inode_match(struct inode * inode, unsigned long ino,
			     void *data);

/*
 * Opportunistically update the other time fields for other inodes in
 * the same inode table block.
 */
static void ext4_update_other_inodes_time(struct super_block *sb,
					  unsigned long orig_ino, char *buf)
{
	struct other_inode oi;
	unsigned long ino;
	int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
	int inode_size = EXT4_INODE_SIZE(sb);

	oi.orig_ino = orig_ino;
	ino = orig_ino & ~(inodes_per_block - 1);
	for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) {
		if (ino == orig_ino)
			continue;
		oi.raw_inode = (struct ext4_inode *) buf;
		(void) find_inode_nowait(sb, ino, other_inode_match, &oi);
	}
}

static int other_inode_match(struct inode * inode, unsigned long ino,
			     void *data)
{
	struct other_inode *oi = (struct other_inode *) data;

	if ((inode->i_ino != ino) ||
	    (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) ||
	    ((inode->i_state & I_DIRTY_TIME) == 0))
		return 0;
	spin_lock(&inode->i_lock);
	if (((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) == 0) &&
	    (inode->i_state & I_DIRTY_TIME)) {
		struct ext4_inode_info	*ei = EXT4_I(inode);

		inode->i_state &= ~I_DIRTY_TIME;
		inode->i_ts_dirty_day = 0;
		spin_unlock(&inode->i_lock);
		inode_requeue_dirtytime(inode);

		spin_lock(&ei->i_raw_lock);
		EXT4_INODE_SET_XTIME(i_ctime, inode, oi->raw_inode);
		EXT4_INODE_SET_XTIME(i_mtime, inode, oi->raw_inode);
		EXT4_INODE_SET_XTIME(i_atime, inode, oi->raw_inode);
		ext4_inode_csum_set(inode, oi->raw_inode, ei);
		spin_unlock(&ei->i_raw_lock);
		trace_ext4_other_inode_update_time(inode, oi->orig_ino);
		return -1;
	}
	spin_unlock(&inode->i_lock);
	return -1;
}

The above uses the following in fs/inode.c (which gets added instead of
find_active_inode_nowait):

/**
 * find_inode_nowait - find an inode in the inode cache
 * @sb:		super block of file system to search
 * @hashval:	hash value (usually inode number) to search for
 * @match:	callback used for comparisons between inodes
 * @data:	opaque data pointer to pass to @match
 *
 * Search for the inode specified by @hashval and @data in the inode
 * cache, where the helper function @match will return 0 if the inode
 * does not match, 1 if the inode does match, and -1 if the search
 * should be stopped.  The @match function must be responsible for
 * taking the i_lock spin_lock and checking i_state for an inode being
 * freed or being initialized, and incrementing the reference count
 * before returning 1.  It also must not sleep, since it is called with
 * the inode_hash_lock spinlock held.
 *
 * This is a even more generalized version of ilookup5() when the
 * function must never block --- find_inode() can block in
 * __wait_on_freeing_inode() --- or when the caller can not increment
 * the reference count because the resulting iput() might cause an
 * inode eviction().  The tradeoff is that the @match funtion must be
 * very carefully implemented.
 */
struct inode *find_inode_nowait(struct super_block *sb,
				unsigned long hashval,
				int (*match)(struct inode *, unsigned long,
					     void *),
				void *data)
{
	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
	struct inode *inode, *ret_inode = NULL;
	int mval;

	spin_lock(&inode_hash_lock);
	hlist_for_each_entry(inode, head, i_hash) {
		if (inode->i_sb != sb)
			continue;
		mval = match(inode, hashval, data);
		if (mval == 0)
			continue;
		if (mval == 1)
			ret_inode = inode;
		goto out;
	}
out:
	spin_unlock(&inode_hash_lock);
	return ret_inode;
}
EXPORT_SYMBOL(find_inode_nowait);

Comments?

							- Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kara Nov. 27, 2014, 3:41 p.m. UTC | #8
On Thu 27-11-14 10:25:24, Ted Tso wrote:
> This is what I'm currently playing with which I believe fixes the iput()
> problem.  In fs/ext4/inode.c:
> 
> struct other_inode {
> 	unsigned long		orig_ino;
> 	struct ext4_inode	*raw_inode;
> };
> static int other_inode_match(struct inode * inode, unsigned long ino,
> 			     void *data);
> 
> /*
>  * Opportunistically update the other time fields for other inodes in
>  * the same inode table block.
>  */
> static void ext4_update_other_inodes_time(struct super_block *sb,
> 					  unsigned long orig_ino, char *buf)
> {
> 	struct other_inode oi;
> 	unsigned long ino;
> 	int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
> 	int inode_size = EXT4_INODE_SIZE(sb);
> 
> 	oi.orig_ino = orig_ino;
> 	ino = orig_ino & ~(inodes_per_block - 1);
> 	for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) {
> 		if (ino == orig_ino)
> 			continue;
> 		oi.raw_inode = (struct ext4_inode *) buf;
> 		(void) find_inode_nowait(sb, ino, other_inode_match, &oi);
> 	}
> }
> 
> static int other_inode_match(struct inode * inode, unsigned long ino,
> 			     void *data)
> {
> 	struct other_inode *oi = (struct other_inode *) data;
> 
> 	if ((inode->i_ino != ino) ||
> 	    (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) ||
> 	    ((inode->i_state & I_DIRTY_TIME) == 0))
> 		return 0;
> 	spin_lock(&inode->i_lock);
> 	if (((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) == 0) &&
> 	    (inode->i_state & I_DIRTY_TIME)) {
> 		struct ext4_inode_info	*ei = EXT4_I(inode);
> 
> 		inode->i_state &= ~I_DIRTY_TIME;
> 		inode->i_ts_dirty_day = 0;
> 		spin_unlock(&inode->i_lock);
> 		inode_requeue_dirtytime(inode);
> 
> 		spin_lock(&ei->i_raw_lock);
> 		EXT4_INODE_SET_XTIME(i_ctime, inode, oi->raw_inode);
> 		EXT4_INODE_SET_XTIME(i_mtime, inode, oi->raw_inode);
> 		EXT4_INODE_SET_XTIME(i_atime, inode, oi->raw_inode);
> 		ext4_inode_csum_set(inode, oi->raw_inode, ei);
> 		spin_unlock(&ei->i_raw_lock);
> 		trace_ext4_other_inode_update_time(inode, oi->orig_ino);
> 		return -1;
> 	}
> 	spin_unlock(&inode->i_lock);
> 	return -1;
> }
  Hum, but this puts lots of stuff under inode_hash_lock, including
writeback list lock. I don't like this too much. I understand that getting
handle for each inode is rather more CPU intensive but it should still be a
clear win over the current situation and avoids entangling locks like this.

								Honza

> The above uses the following in fs/inode.c (which gets added instead of
> find_active_inode_nowait):
> 
> /**
>  * find_inode_nowait - find an inode in the inode cache
>  * @sb:		super block of file system to search
>  * @hashval:	hash value (usually inode number) to search for
>  * @match:	callback used for comparisons between inodes
>  * @data:	opaque data pointer to pass to @match
>  *
>  * Search for the inode specified by @hashval and @data in the inode
>  * cache, where the helper function @match will return 0 if the inode
>  * does not match, 1 if the inode does match, and -1 if the search
>  * should be stopped.  The @match function must be responsible for
>  * taking the i_lock spin_lock and checking i_state for an inode being
>  * freed or being initialized, and incrementing the reference count
>  * before returning 1.  It also must not sleep, since it is called with
>  * the inode_hash_lock spinlock held.
>  *
>  * This is a even more generalized version of ilookup5() when the
>  * function must never block --- find_inode() can block in
>  * __wait_on_freeing_inode() --- or when the caller can not increment
>  * the reference count because the resulting iput() might cause an
>  * inode eviction().  The tradeoff is that the @match funtion must be
>  * very carefully implemented.
>  */
> struct inode *find_inode_nowait(struct super_block *sb,
> 				unsigned long hashval,
> 				int (*match)(struct inode *, unsigned long,
> 					     void *),
> 				void *data)
> {
> 	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
> 	struct inode *inode, *ret_inode = NULL;
> 	int mval;
> 
> 	spin_lock(&inode_hash_lock);
> 	hlist_for_each_entry(inode, head, i_hash) {
> 		if (inode->i_sb != sb)
> 			continue;
> 		mval = match(inode, hashval, data);
> 		if (mval == 0)
> 			continue;
> 		if (mval == 1)
> 			ret_inode = inode;
> 		goto out;
> 	}
> out:
> 	spin_unlock(&inode_hash_lock);
> 	return ret_inode;
> }
> EXPORT_SYMBOL(find_inode_nowait);
> 
> Comments?
> 
> 							- Ted
Theodore Ts'o Nov. 27, 2014, 8:13 p.m. UTC | #9
On Thu, Nov 27, 2014 at 04:41:59PM +0100, Jan Kara wrote:
>   Hum, but this puts lots of stuff under inode_hash_lock, including
> writeback list lock. I don't like this too much. I understand that getting
> handle for each inode is rather more CPU intensive but it should still be a
> clear win over the current situation and avoids entangling locks like this.

Hmm, if we dropped the inode_requeue_dirtytime(), then we can avoid
taking the writeback list lock.  The net result is that we would have
some inodes still on the b_dirty_time list that were no longer
I_DIRTY_TIME, but since I_DIRTY_TIME wouldn't be set, it's mostly
harmless since when we do iterate over the b_dirty_time list, those
inodes can be quickly identified and skipped over.  (And if the inode
ever gets dirtied for real, then it will get moved onto the b_dirty
list and that will be that.)

The problem with getting a handle on the inode is not just that it is
more CPU intensive, but that can't let the iput_final() call happen
until after we have finished the transaction handle.  We could keep a
linked list of inodes attached to the handle, and then only call iput
on them once ext4_journal_stop(handle) gets called, but that's a
complication I'd like to avoid if at all possible.

Being able to opporunistically write the timestamps when we are
journalling an inode table block is a pretty big win, so if we end up
extending the hold time on inode_hash_lock (only when we come across a
I_DIRTY_TIME inode that we can clear) a tiny bit, there will be a lot
of workloads where I think it's a worthwhile tradeoff.  If we can
avoid entangling the writebakc list lock, does that make you happier
about this approach?

	     	      	       	     	 - Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5653fa4..8308c82 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4140,6 +4140,51 @@  static int ext4_inode_blocks_set(handle_t *handle,
 }
 
 /*
+ * Opportunistically update the other time fields for other inodes in
+ * the same inode table block.
+ */
+static void ext4_update_other_inodes_time(struct super_block *sb,
+					  unsigned long orig_ino, char *buf)
+{
+	struct ext4_inode_info	*ei;
+	struct ext4_inode	*raw_inode;
+	unsigned long		ino;
+	struct inode		*inode;
+	int		i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
+	int		inode_size = EXT4_INODE_SIZE(sb);
+
+	ino = orig_ino & ~(inodes_per_block - 1);
+	for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) {
+		if (ino == orig_ino)
+			continue;
+		inode = find_active_inode_nowait(sb, ino);
+		if (!inode ||
+		    (inode->i_state & I_DIRTY_TIME) == 0 ||
+		    !spin_trylock(&inode->i_lock)) {
+			iput(inode);
+			continue;
+		}
+		inode->i_state &= ~I_DIRTY_TIME;
+		inode->i_ts_dirty_day = 0;
+		spin_unlock(&inode->i_lock);
+		inode_requeue_dirtytime(inode);
+
+		ei = EXT4_I(inode);
+		raw_inode = (struct ext4_inode *) buf;
+
+		spin_lock(&ei->i_raw_lock);
+		EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
+		EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
+		EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
+		ext4_inode_csum_set(inode, raw_inode, ei);
+		spin_unlock(&ei->i_raw_lock);
+		trace_ext4_other_inode_update_time(inode, orig_ino);
+		iput(inode);
+	}
+}
+
+
+/*
  * Post the struct inode info into an on-disk inode location in the
  * buffer-cache.  This gobbles the caller's reference to the
  * buffer_head in the inode location struct.
@@ -4237,7 +4282,6 @@  static int ext4_do_update_inode(handle_t *handle,
 		for (block = 0; block < EXT4_N_BLOCKS; block++)
 			raw_inode->i_block[block] = ei->i_data[block];
 	}
-
 	if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
 		raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
 		if (ei->i_extra_isize) {
@@ -4248,10 +4292,9 @@  static int ext4_do_update_inode(handle_t *handle,
 				cpu_to_le16(ei->i_extra_isize);
 		}
 	}
-
 	ext4_inode_csum_set(inode, raw_inode, ei);
-
 	spin_unlock(&ei->i_raw_lock);
+	ext4_update_other_inodes_time(inode->i_sb, inode->i_ino, bh->b_data);
 
 	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
 	rc = ext4_handle_dirty_metadata(handle, NULL, bh);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 58859bc..93a2b7a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1132,6 +1132,7 @@  enum {
 	Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
 	Opt_usrquota, Opt_grpquota, Opt_i_version,
 	Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
+	Opt_lazytime, Opt_nolazytime,
 	Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
 	Opt_inode_readahead_blks, Opt_journal_ioprio,
 	Opt_dioread_nolock, Opt_dioread_lock,
@@ -1195,6 +1196,8 @@  static const match_table_t tokens = {
 	{Opt_i_version, "i_version"},
 	{Opt_stripe, "stripe=%u"},
 	{Opt_delalloc, "delalloc"},
+	{Opt_lazytime, "lazytime"},
+	{Opt_nolazytime, "nolazytime"},
 	{Opt_nodelalloc, "nodelalloc"},
 	{Opt_removed, "mblk_io_submit"},
 	{Opt_removed, "nomblk_io_submit"},
@@ -1452,6 +1455,12 @@  static int handle_mount_opt(struct super_block *sb, char *opt, int token,
 	case Opt_i_version:
 		sb->s_flags |= MS_I_VERSION;
 		return 1;
+	case Opt_lazytime:
+		sb->s_flags |= MS_LAZYTIME;
+		return 1;
+	case Opt_nolazytime:
+		sb->s_flags &= ~MS_LAZYTIME;
+		return 1;
 	}
 
 	for (m = ext4_mount_opts; m->token != Opt_err; m++)
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 6cfb841..6e5abd6 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -73,6 +73,36 @@  struct extent_status;
 	{ FALLOC_FL_ZERO_RANGE,		"ZERO_RANGE"})
 
 
+TRACE_EVENT(ext4_other_inode_update_time,
+	TP_PROTO(struct inode *inode, ino_t orig_ino),
+
+	TP_ARGS(inode, orig_ino),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,	dev			)
+		__field(	ino_t,	ino			)
+		__field(	ino_t,	orig_ino		)
+		__field(	uid_t,	uid			)
+		__field(	gid_t,	gid			)
+		__field(	__u16, mode			)
+	),
+
+	TP_fast_assign(
+		__entry->orig_ino = orig_ino;
+		__entry->dev	= inode->i_sb->s_dev;
+		__entry->ino	= inode->i_ino;
+		__entry->uid	= i_uid_read(inode);
+		__entry->gid	= i_gid_read(inode);
+		__entry->mode	= inode->i_mode;
+	),
+
+	TP_printk("dev %d,%d orig_ino %lu ino %lu mode 0%o uid %u gid %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long) __entry->orig_ino,
+		  (unsigned long) __entry->ino, __entry->mode,
+		  __entry->uid, __entry->gid)
+);
+
 TRACE_EVENT(ext4_free_inode,
 	TP_PROTO(struct inode *inode),