diff mbox

fs: make sure the timestamps for lazytime inodes eventually get written

Message ID 20150307053408.GA18002@thunk.org (mailing list archive)
State New, archived
Headers show

Commit Message

Theodore Ts'o March 7, 2015, 5:34 a.m. UTC
Hi Jan,

I believe the following should address all of the issues that you
raised.  Could you take a look and let me know what you think?

Thanks!!

						- Ted

commit e42f32d0ed65059e5cb689cc0ac28099a729ba9a
Author: Theodore Ts'o <tytso@mit.edu>
Date:   Sat Mar 7 00:22:37 2015 -0500

    fs: make sure the timestamps for lazytime inodes eventually get written
    
    Jan Kara pointed out that if there is an inode which is constantly
    getting dirtied with I_DIRTY_PAGES, an inode with an updated timestamp
    will never be written since inode->dirtied_when is constantly getting
    updated.  We fix this by adding an extra field to the inode,
    dirtied_time_when, so inodes with a stale dirtytime can get detected
    and handled.
    
    Also add a sysctl tunable, dirtytime_expire_seconds so we can properly
    debug this code and make sure it all works.
    
    Finally, if we have a dirtytime inode caused by an atime update, and
    there is no write activity on the file system, we need to have a
    secondary system to make sure these inodes get written out.  We do
    this by setting up a second delayed work structure which wakes up the
    CPU much more rarely compared to writeback_expire_centisecs.
    
    Signed-off-by: Theodore Ts'o <tytso@mit.edu>

--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Jan Kara March 8, 2015, 10:06 a.m. UTC | #1
Hi Ted,

On Sat 07-03-15 00:34:08, Ted Tso wrote:
> I believe the following should address all of the issues that you
> raised.  Could you take a look and let me know what you think?
> 
> Thanks!!
  Thanks for the patch. A few comments below:

> commit e42f32d0ed65059e5cb689cc0ac28099a729ba9a
> Author: Theodore Ts'o <tytso@mit.edu>
> Date:   Sat Mar 7 00:22:37 2015 -0500
> 
>     fs: make sure the timestamps for lazytime inodes eventually get written
>     
>     Jan Kara pointed out that if there is an inode which is constantly
>     getting dirtied with I_DIRTY_PAGES, an inode with an updated timestamp
>     will never be written since inode->dirtied_when is constantly getting
>     updated.  We fix this by adding an extra field to the inode,
>     dirtied_time_when, so inodes with a stale dirtytime can get detected
>     and handled.
>     
>     Also add a sysctl tunable, dirtytime_expire_seconds so we can properly
>     debug this code and make sure it all works.
>     
>     Finally, if we have a dirtytime inode caused by an atime update, and
>     there is no write activity on the file system, we need to have a
>     secondary system to make sure these inodes get written out.  We do
>     this by setting up a second delayed work structure which wakes up the
>     CPU much more rarely compared to writeback_expire_centisecs.
>     
>     Signed-off-by: Theodore Ts'o <tytso@mit.edu>
> 
> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
> index e907052..260d7e7 100644
> --- a/fs/fs-writeback.c
> +++ b/fs/fs-writeback.c
> @@ -53,6 +53,9 @@ struct wb_writeback_work {
>  	struct completion *done;	/* set if the caller waits */
>  };
>  
> +/* 12 hours in seconds */
> +unsigned int dirtytime_expire_interval = 12 * 60 * 60;
> +
>  /**
>   * writeback_in_progress - determine whether there is writeback in progress
>   * @bdi: the device's backing_dev_info structure.
> @@ -275,8 +278,8 @@ static int move_expired_inodes(struct list_head *delaying_queue,
>  
>  	if ((flags & EXPIRE_DIRTY_ATIME) == 0)
>  		older_than_this = work->older_than_this;
> -	else if ((work->reason == WB_REASON_SYNC) == 0) {
> -		expire_time = jiffies - (HZ * 86400);
> +	else if (!work->for_sync) {
> +		expire_time = jiffies - (dirtytime_expire_interval * HZ);
>  		older_than_this = &expire_time;
>  	}
>  	while (!list_empty(delaying_queue)) {
  This hunk should be a separate patch since it's completely unrelated.


> @@ -458,6 +461,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
>  		 */
>  		redirty_tail(inode, wb);
>  	} else if (inode->i_state & I_DIRTY_TIME) {
> +		inode->dirtied_when = jiffies;
>  		list_move(&inode->i_wb_list, &wb->b_dirty_time);
>  	} else {
>  		/* The inode is clean. Remove from writeback lists. */
> @@ -741,6 +745,13 @@ static long writeback_sb_inodes(struct super_block *sb,
>  		spin_lock(&inode->i_lock);
>  		if (!(inode->i_state & I_DIRTY_ALL))
>  			wrote++;
> +		if ((inode->i_state & I_DIRTY_TIME) &&
> +		    ((start_time - inode->dirtied_time_when) >
> +		     (dirtytime_expire_interval * HZ))) {
> +			inode->i_state &= ~I_DIRTY_TIME;
> +			inode->i_state |= I_DIRTY_SYNC;
> +			trace_writeback_lazytime(inode);
> +		}
  Hum, why is this here? A more logical place for it would IMO be in
__writeback_single_inode() where we modify inode state. Also we would then
immediately end up writing the inode instead of just queueing it to a
different writeback queue.

>  		requeue_inode(inode, wb, &wbc);
>  		inode_sync_complete(inode);
>  		spin_unlock(&inode->i_lock);
> @@ -1131,6 +1142,56 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
>  	rcu_read_unlock();
>  }
>  
> +/*
> + * Wake up bdi's periodically to make sure dirtytime inodes gets
> + * written back periodically.  We deliberately do *not* check the
> + * b_dirtytime list in wb_has_dirty_io(), since this would cause the
> + * kernel to be constantly waking up once there are any dirtytime
> + * inodes on the system.  So instead we define a separate delayed work
> + * function which gets called much more rarely.
> + *
> + * If there is any other write activity going on in the file system,
> + * this function won't be necessary.  But if the only thing that has
> + * happened on the file system is a dirtytime inode caused by an atime
> + * update, we need this infrastructure below to make sure that inode
> + * eventually gets pushed out to disk.
> + */
> +static void wakeup_dirtytime_writeback(struct work_struct *w);
> +static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback);
> +
> +static void wakeup_dirtytime_writeback(struct work_struct *w)
> +{
> +	struct backing_dev_info *bdi;
> +
> +	rcu_read_lock();
> +	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
> +		if (list_empty(&bdi->wb.b_dirty_time))
> +			continue;
> +		bdi_wakeup_thread(bdi);
> +	}
> +	rcu_read_unlock();
> +	schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
> +}
> +
> +static int __init start_dirtytime_writeback(void)
> +{
> +	schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
> +	return 0;
> +}
> +__initcall(start_dirtytime_writeback);
> +
> +int dirtytime_interval_handler(struct ctl_table *table, int write,
> +			       void __user *buffer, size_t *lenp, loff_t *ppos)
> +{
> +	int ret;
> +
> +	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
> +	if (ret == 0 && write)
> +		mod_delayed_work(system_wq, &dirtytime_work, 0);
> +	return ret;
> +}
> +
> +
>  static noinline void block_dump___mark_inode_dirty(struct inode *inode)
>  {
>  	if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
> @@ -1269,6 +1330,10 @@ void __mark_inode_dirty(struct inode *inode, int flags)
>  			}
>  
>  			inode->dirtied_when = jiffies;
> +			if (dirtytime)
> +				inode->dirtied_time_when = jiffies;
> +			if (flags & I_DIRTY_PAGES)
> +				dirtytime = 0;
>  			list_move(&inode->i_wb_list, dirtytime ?
>  				  &bdi->wb.b_dirty_time : &bdi->wb.b_dirty);
>  			spin_unlock(&bdi->wb.list_lock);
  I guess this would be more readable as:
			if (dirtytime)
				inode->dirtied_time_when = jiffies;
			if (inode->i_state & (I_DIRTY_INODE | I_DIRTY_PAGES))
				list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
			else {
				list_move(&inode->i_wb_list,
					  &bdi->wb.b_dirty_time);
			}
  Since that will clearly express the inode needs to end up in the list
which corresponds to current inode state. Also preferably the change in the
condition deciding in which list inode ends up should be split in a
separate patch since that's unrelated problem to the issue described in the
changelog.

Othewise the patch looks good.

								Honza
Theodore Ts'o March 8, 2015, 7:06 p.m. UTC | #2
On Sun, Mar 08, 2015 at 11:06:50AM +0100, Jan Kara wrote:
> > @@ -275,8 +278,8 @@ static int move_expired_inodes(struct list_head *delaying_queue,
> >  
> >  	if ((flags & EXPIRE_DIRTY_ATIME) == 0)
> >  		older_than_this = work->older_than_this;
> > -	else if ((work->reason == WB_REASON_SYNC) == 0) {
> > -		expire_time = jiffies - (HZ * 86400);
> > +	else if (!work->for_sync) {
> > +		expire_time = jiffies - (dirtytime_expire_interval * HZ);
> >  		older_than_this = &expire_time;
> >  	}
> >  	while (!list_empty(delaying_queue)) {
>   This hunk should be a separate patch since it's completely unrelated.

Along with all of the other changes that relate to adding a sysctl
tunable?  Sure, I can do that.

BTW, I know that originally we talked about not needing the tunable,
but it my experience it **really** helps with testing the future.  If
we ever want to try to create a automated test suite, it really helps
to have the tunable.

> > @@ -741,6 +745,13 @@ static long writeback_sb_inodes(struct super_block *sb,
> >  		spin_lock(&inode->i_lock);
> >  		if (!(inode->i_state & I_DIRTY_ALL))
> >  			wrote++;
> > +		if ((inode->i_state & I_DIRTY_TIME) &&
> > +		    ((start_time - inode->dirtied_time_when) >
> > +		     (dirtytime_expire_interval * HZ))) {
> > +			inode->i_state &= ~I_DIRTY_TIME;
> > +			inode->i_state |= I_DIRTY_SYNC;
> > +			trace_writeback_lazytime(inode);
> > +		}
>   Hum, why is this here? A more logical place for it would IMO be in
> __writeback_single_inode() where we modify inode state. Also we would then
> immediately end up writing the inode instead of just queueing it to a
> different writeback queue.

Good point, it woud be much better to put it there.  I'll move it in
the next version of the patch.

> > @@ -1269,6 +1330,10 @@ void __mark_inode_dirty(struct inode *inode, int flags)
> >  			}
> >  
> >  			inode->dirtied_when = jiffies;
> > +			if (dirtytime)
> > +				inode->dirtied_time_when = jiffies;
> > +			if (flags & I_DIRTY_PAGES)
> > +				dirtytime = 0;
> >  			list_move(&inode->i_wb_list, dirtytime ?
> >  				  &bdi->wb.b_dirty_time : &bdi->wb.b_dirty);
> >  			spin_unlock(&bdi->wb.list_lock);
>   I guess this would be more readable as:
> 			if (dirtytime)
> 				inode->dirtied_time_when = jiffies;
> 			if (inode->i_state & (I_DIRTY_INODE | I_DIRTY_PAGES))
> 				list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
> 			else {
> 				list_move(&inode->i_wb_list,
> 					  &bdi->wb.b_dirty_time);
> 			}
>   Since that will clearly express the inode needs to end up in the list
> which corresponds to current inode state. Also preferably the change in the
> condition deciding in which list inode ends up should be split in a
> separate patch since that's unrelated problem to the issue described in the
> changelog.

Agreed, I'll change this and resend.

						- Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index e907052..260d7e7 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -53,6 +53,9 @@  struct wb_writeback_work {
 	struct completion *done;	/* set if the caller waits */
 };
 
+/* 12 hours in seconds */
+unsigned int dirtytime_expire_interval = 12 * 60 * 60;
+
 /**
  * writeback_in_progress - determine whether there is writeback in progress
  * @bdi: the device's backing_dev_info structure.
@@ -275,8 +278,8 @@  static int move_expired_inodes(struct list_head *delaying_queue,
 
 	if ((flags & EXPIRE_DIRTY_ATIME) == 0)
 		older_than_this = work->older_than_this;
-	else if ((work->reason == WB_REASON_SYNC) == 0) {
-		expire_time = jiffies - (HZ * 86400);
+	else if (!work->for_sync) {
+		expire_time = jiffies - (dirtytime_expire_interval * HZ);
 		older_than_this = &expire_time;
 	}
 	while (!list_empty(delaying_queue)) {
@@ -458,6 +461,7 @@  static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
 		 */
 		redirty_tail(inode, wb);
 	} else if (inode->i_state & I_DIRTY_TIME) {
+		inode->dirtied_when = jiffies;
 		list_move(&inode->i_wb_list, &wb->b_dirty_time);
 	} else {
 		/* The inode is clean. Remove from writeback lists. */
@@ -741,6 +745,13 @@  static long writeback_sb_inodes(struct super_block *sb,
 		spin_lock(&inode->i_lock);
 		if (!(inode->i_state & I_DIRTY_ALL))
 			wrote++;
+		if ((inode->i_state & I_DIRTY_TIME) &&
+		    ((start_time - inode->dirtied_time_when) >
+		     (dirtytime_expire_interval * HZ))) {
+			inode->i_state &= ~I_DIRTY_TIME;
+			inode->i_state |= I_DIRTY_SYNC;
+			trace_writeback_lazytime(inode);
+		}
 		requeue_inode(inode, wb, &wbc);
 		inode_sync_complete(inode);
 		spin_unlock(&inode->i_lock);
@@ -1131,6 +1142,56 @@  void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
 	rcu_read_unlock();
 }
 
+/*
+ * Wake up bdi's periodically to make sure dirtytime inodes gets
+ * written back periodically.  We deliberately do *not* check the
+ * b_dirtytime list in wb_has_dirty_io(), since this would cause the
+ * kernel to be constantly waking up once there are any dirtytime
+ * inodes on the system.  So instead we define a separate delayed work
+ * function which gets called much more rarely.
+ *
+ * If there is any other write activity going on in the file system,
+ * this function won't be necessary.  But if the only thing that has
+ * happened on the file system is a dirtytime inode caused by an atime
+ * update, we need this infrastructure below to make sure that inode
+ * eventually gets pushed out to disk.
+ */
+static void wakeup_dirtytime_writeback(struct work_struct *w);
+static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback);
+
+static void wakeup_dirtytime_writeback(struct work_struct *w)
+{
+	struct backing_dev_info *bdi;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
+		if (list_empty(&bdi->wb.b_dirty_time))
+			continue;
+		bdi_wakeup_thread(bdi);
+	}
+	rcu_read_unlock();
+	schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
+}
+
+static int __init start_dirtytime_writeback(void)
+{
+	schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
+	return 0;
+}
+__initcall(start_dirtytime_writeback);
+
+int dirtytime_interval_handler(struct ctl_table *table, int write,
+			       void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret;
+
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (ret == 0 && write)
+		mod_delayed_work(system_wq, &dirtytime_work, 0);
+	return ret;
+}
+
+
 static noinline void block_dump___mark_inode_dirty(struct inode *inode)
 {
 	if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
@@ -1269,6 +1330,10 @@  void __mark_inode_dirty(struct inode *inode, int flags)
 			}
 
 			inode->dirtied_when = jiffies;
+			if (dirtytime)
+				inode->dirtied_time_when = jiffies;
+			if (flags & I_DIRTY_PAGES)
+				dirtytime = 0;
 			list_move(&inode->i_wb_list, dirtytime ?
 				  &bdi->wb.b_dirty_time : &bdi->wb.b_dirty);
 			spin_unlock(&bdi->wb.list_lock);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b4d71b5..f4131e8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -604,6 +604,7 @@  struct inode {
 	struct mutex		i_mutex;
 
 	unsigned long		dirtied_when;	/* jiffies of first dirtying */
+	unsigned long		dirtied_time_when;
 
 	struct hlist_node	i_hash;
 	struct list_head	i_wb_list;	/* backing dev IO list */
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 0004833..b2dd371e 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -130,6 +130,7 @@  extern int vm_dirty_ratio;
 extern unsigned long vm_dirty_bytes;
 extern unsigned int dirty_writeback_interval;
 extern unsigned int dirty_expire_interval;
+extern unsigned int dirtytime_expire_interval;
 extern int vm_highmem_is_dirtyable;
 extern int block_dump;
 extern int laptop_mode;
@@ -146,6 +147,8 @@  extern int dirty_ratio_handler(struct ctl_table *table, int write,
 extern int dirty_bytes_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos);
+int dirtytime_interval_handler(struct ctl_table *table, int write,
+			       void __user *buffer, size_t *lenp, loff_t *ppos);
 
 struct ctl_table;
 int dirty_writeback_centisecs_handler(struct ctl_table *, int,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 88ea2d6..ce410bb 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1228,6 +1228,14 @@  static struct ctl_table vm_table[] = {
 		.extra1		= &zero,
 	},
 	{
+		.procname	= "dirtytime_expire_seconds",
+		.data		= &dirtytime_expire_interval,
+		.maxlen		= sizeof(dirty_expire_interval),
+		.mode		= 0644,
+		.proc_handler	= dirtytime_interval_handler,
+		.extra1		= &zero,
+	},
+	{
 		.procname       = "nr_pdflush_threads",
 		.mode           = 0444 /* read-only */,
 		.proc_handler   = pdflush_proc_obsolete,