diff mbox

Btrfs: separate sequence numbers for delayed ref tracking and tree mod log

Message ID 1366740027-27627-1-git-send-email-list.btrfs@jan-o-sch.net (mailing list archive)
State New, archived
Headers show

Commit Message

Jan Schmidt April 23, 2013, 6 p.m. UTC
Sequence numbers for delayed refs have been introduced in the first version
of the qgroup patch set. To solve the problem of find_all_roots on a busy
file system, the tree mod log was introduced. The sequence numbers for that
were simply shared between those two users.

However, at one point in qgroup's quota accounting, there's a statement
accessing the previous sequence number, that's still just doing (seq - 1)
just as it had to in the very first version.

To satisfy that requirement, this patch makes the sequence number counter 64
bit and splits it into a major part (used for qgroup sequence number
counting) and a minor part (incremented for each tree modification in the
log). This enables us to go exactly one major step backwards, as required
for qgroups, while still incrementing the sequence counter for tree mod log
insertions to keep track of their order. Keeping them in a single variable
means there's no need to change all the code dealing with comparisons of two
sequence numbers.

The sequence number is reset to 0 on commit (not new in this patch), which
ensures we won't overflow the two 32 bit counters.

Without this fix, the qgroup tracking can occasionally go wrong and WARN_ONs
from the tree mod log code may happen.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/ctree.c       |   36 +++++++++++++++++++++++++++++++++---
 fs/btrfs/ctree.h       |    7 ++-----
 fs/btrfs/delayed-ref.c |    6 ++++--
 fs/btrfs/disk-io.c     |    2 +-
 fs/btrfs/extent-tree.c |    5 +++--
 fs/btrfs/qgroup.c      |   13 ++++++++-----
 fs/btrfs/transaction.c |    2 +-
 7 files changed, 52 insertions(+), 19 deletions(-)

Comments

Liu Bo April 24, 2013, 8:12 a.m. UTC | #1
On Tue, Apr 23, 2013 at 08:00:27PM +0200, Jan Schmidt wrote:
> Sequence numbers for delayed refs have been introduced in the first version
> of the qgroup patch set. To solve the problem of find_all_roots on a busy
> file system, the tree mod log was introduced. The sequence numbers for that
> were simply shared between those two users.

Can't we just separate them with two vars?

thanks,
liubo

> 
> However, at one point in qgroup's quota accounting, there's a statement
> accessing the previous sequence number, that's still just doing (seq - 1)
> just as it had to in the very first version.
> 
> To satisfy that requirement, this patch makes the sequence number counter 64
> bit and splits it into a major part (used for qgroup sequence number
> counting) and a minor part (incremented for each tree modification in the
> log). This enables us to go exactly one major step backwards, as required
> for qgroups, while still incrementing the sequence counter for tree mod log
> insertions to keep track of their order. Keeping them in a single variable
> means there's no need to change all the code dealing with comparisons of two
> sequence numbers.
> 
> The sequence number is reset to 0 on commit (not new in this patch), which
> ensures we won't overflow the two 32 bit counters.
> 
> Without this fix, the qgroup tracking can occasionally go wrong and WARN_ONs
> from the tree mod log code may happen.
> 
> Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
> ---
>  fs/btrfs/ctree.c       |   36 +++++++++++++++++++++++++++++++++---
>  fs/btrfs/ctree.h       |    7 ++-----
>  fs/btrfs/delayed-ref.c |    6 ++++--
>  fs/btrfs/disk-io.c     |    2 +-
>  fs/btrfs/extent-tree.c |    5 +++--
>  fs/btrfs/qgroup.c      |   13 ++++++++-----
>  fs/btrfs/transaction.c |    2 +-
>  7 files changed, 52 insertions(+), 19 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
> index 566d99b..b74136e 100644
> --- a/fs/btrfs/ctree.c
> +++ b/fs/btrfs/ctree.c
> @@ -361,6 +361,36 @@ static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
>  }
>  
>  /*
> + * increment the upper half of tree_mod_seq, set lower half zero
> + *
> + * must be called with fs_info->tree_mod_seq_lock held
> + */
> +static inline u64 btrfs_inc_tree_mod_seq_major(struct btrfs_fs_info *fs_info)
> +{
> +	u64 seq = atomic64_read(&fs_info->tree_mod_seq);
> +	seq &= 0xffffffff00000000ull;
> +	seq += 1ull << 32;
> +	atomic64_set(&fs_info->tree_mod_seq, seq);
> +	return seq;
> +}
> +
> +/*
> + * increment the lower half of tree_mod_seq
> + */
> +static inline u64 btrfs_inc_tree_mod_seq_minor(struct btrfs_fs_info *fs_info)
> +{
> +	return atomic64_inc_return(&fs_info->tree_mod_seq);
> +}
> +
> +/*
> + * return the last minor in the previous major tree_mod_seq number
> + */
> +u64 btrfs_tree_mod_seq_prev(u64 seq)
> +{
> +	return (seq & 0xffffffff00000000ull) - 1ull;
> +}
> +
> +/*
>   * This adds a new blocker to the tree mod log's blocker list if the @elem
>   * passed does not already have a sequence number set. So when a caller expects
>   * to record tree modifications, it should ensure to set elem->seq to zero
> @@ -376,10 +406,10 @@ u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
>  	tree_mod_log_write_lock(fs_info);
>  	spin_lock(&fs_info->tree_mod_seq_lock);
>  	if (!elem->seq) {
> -		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
> +		elem->seq = btrfs_inc_tree_mod_seq_major(fs_info);
>  		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
>  	}
> -	seq = btrfs_inc_tree_mod_seq(fs_info);
> +	seq = btrfs_inc_tree_mod_seq_minor(fs_info);
>  	spin_unlock(&fs_info->tree_mod_seq_lock);
>  	tree_mod_log_write_unlock(fs_info);
>  
> @@ -524,7 +554,7 @@ static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags,
>  	if (!tm)
>  		return -ENOMEM;
>  
> -	tm->seq = btrfs_inc_tree_mod_seq(fs_info);
> +	tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info);
>  	return tm->seq;
>  }
>  
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 412c306..5f34f89 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -1422,7 +1422,7 @@ struct btrfs_fs_info {
>  
>  	/* this protects tree_mod_seq_list */
>  	spinlock_t tree_mod_seq_lock;
> -	atomic_t tree_mod_seq;
> +	atomic64_t tree_mod_seq;
>  	struct list_head tree_mod_seq_list;
>  	struct seq_list tree_mod_seq_elem;
>  
> @@ -3334,10 +3334,7 @@ u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
>  			   struct seq_list *elem);
>  void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
>  			    struct seq_list *elem);
> -static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
> -{
> -	return atomic_inc_return(&fs_info->tree_mod_seq);
> -}
> +u64 btrfs_tree_mod_seq_prev(u64 seq);
>  int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq);
>  
>  /* root-item.c */
> diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
> index 116abec..c219463 100644
> --- a/fs/btrfs/delayed-ref.c
> +++ b/fs/btrfs/delayed-ref.c
> @@ -361,8 +361,10 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
>  		elem = list_first_entry(&fs_info->tree_mod_seq_list,
>  					struct seq_list, list);
>  		if (seq >= elem->seq) {
> -			pr_debug("holding back delayed_ref %llu, lowest is "
> -				 "%llu (%p)\n", seq, elem->seq, delayed_refs);
> +			pr_debug("holding back delayed_ref %#x.%x, lowest is %#x.%x (%p)\n",
> +				 (u32)(seq >> 32), (u32)seq,
> +				 (u32)(elem->seq >> 32), (u32)elem->seq,
> +				 delayed_refs);
>  			ret = 1;
>  		}
>  	}
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index f4628c7..0207132 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -2147,7 +2147,7 @@ int open_ctree(struct super_block *sb,
>  	atomic_set(&fs_info->async_submit_draining, 0);
>  	atomic_set(&fs_info->nr_async_bios, 0);
>  	atomic_set(&fs_info->defrag_running, 0);
> -	atomic_set(&fs_info->tree_mod_seq, 0);
> +	atomic64_set(&fs_info->tree_mod_seq, 0);
>  	fs_info->sb = sb;
>  	fs_info->max_inline = 8192 * 1024;
>  	fs_info->metadata_ratio = 0;
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index b441be3..f2f1654 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -2523,9 +2523,10 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
>  	    !trans->delayed_ref_elem.seq) {
>  		/* list without seq or seq without list */
>  		btrfs_err(fs_info,
> -			"qgroup accounting update error, list is%s empty, seq is %llu",
> +			"qgroup accounting update error, list is%s empty, seq is %#x.%x",
>  			list_empty(&trans->qgroup_ref_list) ? "" : " not",
> -			trans->delayed_ref_elem.seq);
> +			(u32)(trans->delayed_ref_elem.seq >> 32),
> +			(u32)trans->delayed_ref_elem.seq);
>  		BUG();
>  	}
>  
> diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
> index f175471..e5c5623 100644
> --- a/fs/btrfs/qgroup.c
> +++ b/fs/btrfs/qgroup.c
> @@ -1242,9 +1242,11 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
>  	case BTRFS_ADD_DELAYED_REF:
>  	case BTRFS_ADD_DELAYED_EXTENT:
>  		sgn = 1;
> +		seq = btrfs_tree_mod_seq_prev(node->seq);
>  		break;
>  	case BTRFS_DROP_DELAYED_REF:
>  		sgn = -1;
> +		seq = node->seq;
>  		break;
>  	case BTRFS_UPDATE_DELAYED_HEAD:
>  		return 0;
> @@ -1254,14 +1256,14 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
>  
>  	/*
>  	 * the delayed ref sequence number we pass depends on the direction of
> -	 * the operation. for add operations, we pass (node->seq - 1) to skip
> +	 * the operation. for add operations, we pass
> +	 * btrfs_tree_mod_seq_prev(node->seq) to skip
>  	 * the delayed ref's current sequence number, because we need the state
>  	 * of the tree before the add operation. for delete operations, we pass
>  	 * (node->seq) to include the delayed ref's current sequence number,
>  	 * because we need the state of the tree after the delete operation.
>  	 */
> -	ret = btrfs_find_all_roots(trans, fs_info, node->bytenr,
> -				   sgn > 0 ? node->seq - 1 : node->seq, &roots);
> +	ret = btrfs_find_all_roots(trans, fs_info, node->bytenr, seq, &roots);
>  	if (ret < 0)
>  		return ret;
>  
> @@ -1772,8 +1774,9 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
>  {
>  	if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq)
>  		return;
> -	printk(KERN_ERR "btrfs: qgroups not uptodate in trans handle %p: list is%s empty, seq is %llu\n",
> +	pr_err("btrfs: qgroups not uptodate in trans handle %p: list is%s empty, seq is %#x.%x\n",
>  		trans, list_empty(&trans->qgroup_ref_list) ? "" : " not",
> -		trans->delayed_ref_elem.seq);
> +		(u32)(trans->delayed_ref_elem.seq >> 32),
> +		(u32)trans->delayed_ref_elem.seq);
>  	BUG();
>  }
> diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
> index 94cbd10..d9edff7 100644
> --- a/fs/btrfs/transaction.c
> +++ b/fs/btrfs/transaction.c
> @@ -163,7 +163,7 @@ loop:
>  	if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log))
>  		WARN(1, KERN_ERR "btrfs: tree_mod_log rb tree not empty when "
>  			"creating a fresh transaction\n");
> -	atomic_set(&fs_info->tree_mod_seq, 0);
> +	atomic64_set(&fs_info->tree_mod_seq, 0);
>  
>  	spin_lock_init(&cur_trans->commit_lock);
>  	spin_lock_init(&cur_trans->delayed_refs.lock);
> -- 
> 1.7.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Schmidt April 24, 2013, 8:26 a.m. UTC | #2
On Wed, April 24, 2013 at 10:12 (+0200), Liu Bo wrote:
> On Tue, Apr 23, 2013 at 08:00:27PM +0200, Jan Schmidt wrote:
>> Sequence numbers for delayed refs have been introduced in the first version
>> of the qgroup patch set. To solve the problem of find_all_roots on a busy
>> file system, the tree mod log was introduced. The sequence numbers for that
>> were simply shared between those two users.
> 
> Can't we just separate them with two vars?

My reasoning comes a few lines below ...

> thanks,
> liubo
> 
>>
>> However, at one point in qgroup's quota accounting, there's a statement
>> accessing the previous sequence number, that's still just doing (seq - 1)
>> just as it had to in the very first version.
>>
>> To satisfy that requirement, this patch makes the sequence number counter 64
>> bit and splits it into a major part (used for qgroup sequence number
>> counting) and a minor part (incremented for each tree modification in the
>> log). This enables us to go exactly one major step backwards, as required
>> for qgroups, while still incrementing the sequence counter for tree mod log
>> insertions to keep track of their order. Keeping them in a single variable
>> means there's no need to change all the code dealing with comparisons of two
>> sequence numbers.

See the previous sentence :-)

And, it doesn't add too much complexity, setting and incrementing remains in
fact quite easy, even though we use the upper 32 bit and the lower 32 bit of
that integer independently.

Thanks,
-Jan

>>
>> The sequence number is reset to 0 on commit (not new in this patch), which
>> ensures we won't overflow the two 32 bit counters.
>>
>> Without this fix, the qgroup tracking can occasionally go wrong and WARN_ONs
>> from the tree mod log code may happen.
>>
>> Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
>> ---
>> [snip]
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Josef Bacik April 24, 2013, 1:04 p.m. UTC | #3
On Tue, Apr 23, 2013 at 12:00:27PM -0600, Jan Schmidt wrote:
> Sequence numbers for delayed refs have been introduced in the first version
> of the qgroup patch set. To solve the problem of find_all_roots on a busy
> file system, the tree mod log was introduced. The sequence numbers for that
> were simply shared between those two users.
> 
> However, at one point in qgroup's quota accounting, there's a statement
> accessing the previous sequence number, that's still just doing (seq - 1)
> just as it had to in the very first version.
> 
> To satisfy that requirement, this patch makes the sequence number counter 64
> bit and splits it into a major part (used for qgroup sequence number
> counting) and a minor part (incremented for each tree modification in the
> log). This enables us to go exactly one major step backwards, as required
> for qgroups, while still incrementing the sequence counter for tree mod log
> insertions to keep track of their order. Keeping them in a single variable
> means there's no need to change all the code dealing with comparisons of two
> sequence numbers.
> 
> The sequence number is reset to 0 on commit (not new in this patch), which
> ensures we won't overflow the two 32 bit counters.
> 
> Without this fix, the qgroup tracking can occasionally go wrong and WARN_ONs
> from the tree mod log code may happen.
> 
> Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
> ---
>  fs/btrfs/ctree.c       |   36 +++++++++++++++++++++++++++++++++---
>  fs/btrfs/ctree.h       |    7 ++-----
>  fs/btrfs/delayed-ref.c |    6 ++++--
>  fs/btrfs/disk-io.c     |    2 +-
>  fs/btrfs/extent-tree.c |    5 +++--
>  fs/btrfs/qgroup.c      |   13 ++++++++-----
>  fs/btrfs/transaction.c |    2 +-
>  7 files changed, 52 insertions(+), 19 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
> index 566d99b..b74136e 100644
> --- a/fs/btrfs/ctree.c
> +++ b/fs/btrfs/ctree.c
> @@ -361,6 +361,36 @@ static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
>  }
>  
>  /*
> + * increment the upper half of tree_mod_seq, set lower half zero
> + *
> + * must be called with fs_info->tree_mod_seq_lock held
> + */
> +static inline u64 btrfs_inc_tree_mod_seq_major(struct btrfs_fs_info *fs_info)
> +{
> +	u64 seq = atomic64_read(&fs_info->tree_mod_seq);
> +	seq &= 0xffffffff00000000ull;
> +	seq += 1ull << 32;
> +	atomic64_set(&fs_info->tree_mod_seq, seq);
> +	return seq;
> +}

This isn't going to work, you read in the value, inc it and then set the new
value.  If somebody comes in and inc's in between the read and the sync, like
btrfs_inc_tree_mod_seq_minor could do when you call tree_mod_alloc, you'll end
up losing the minor update.  Thanks,

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Schmidt April 24, 2013, 1:25 p.m. UTC | #4
On Wed, April 24, 2013 at 15:04 (+0200), Josef Bacik wrote:
> On Tue, Apr 23, 2013 at 12:00:27PM -0600, Jan Schmidt wrote:
>> Sequence numbers for delayed refs have been introduced in the first version
>> of the qgroup patch set. To solve the problem of find_all_roots on a busy
>> file system, the tree mod log was introduced. The sequence numbers for that
>> were simply shared between those two users.
>>
>> However, at one point in qgroup's quota accounting, there's a statement
>> accessing the previous sequence number, that's still just doing (seq - 1)
>> just as it had to in the very first version.
>>
>> To satisfy that requirement, this patch makes the sequence number counter 64
>> bit and splits it into a major part (used for qgroup sequence number
>> counting) and a minor part (incremented for each tree modification in the
>> log). This enables us to go exactly one major step backwards, as required
>> for qgroups, while still incrementing the sequence counter for tree mod log
>> insertions to keep track of their order. Keeping them in a single variable
>> means there's no need to change all the code dealing with comparisons of two
>> sequence numbers.
>>
>> The sequence number is reset to 0 on commit (not new in this patch), which
>> ensures we won't overflow the two 32 bit counters.
>>
>> Without this fix, the qgroup tracking can occasionally go wrong and WARN_ONs
>> from the tree mod log code may happen.
>>
>> Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
>> ---
>>  fs/btrfs/ctree.c       |   36 +++++++++++++++++++++++++++++++++---
>>  fs/btrfs/ctree.h       |    7 ++-----
>>  fs/btrfs/delayed-ref.c |    6 ++++--
>>  fs/btrfs/disk-io.c     |    2 +-
>>  fs/btrfs/extent-tree.c |    5 +++--
>>  fs/btrfs/qgroup.c      |   13 ++++++++-----
>>  fs/btrfs/transaction.c |    2 +-
>>  7 files changed, 52 insertions(+), 19 deletions(-)
>>
>> diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
>> index 566d99b..b74136e 100644
>> --- a/fs/btrfs/ctree.c
>> +++ b/fs/btrfs/ctree.c
>> @@ -361,6 +361,36 @@ static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
>>  }
>>  
>>  /*
>> + * increment the upper half of tree_mod_seq, set lower half zero
>> + *
>> + * must be called with fs_info->tree_mod_seq_lock held
>> + */
>> +static inline u64 btrfs_inc_tree_mod_seq_major(struct btrfs_fs_info *fs_info)
>> +{
>> +	u64 seq = atomic64_read(&fs_info->tree_mod_seq);
>> +	seq &= 0xffffffff00000000ull;
>> +	seq += 1ull << 32;
>> +	atomic64_set(&fs_info->tree_mod_seq, seq);
>> +	return seq;
>> +}
> 
> This isn't going to work, you read in the value, inc it and then set the new
> value.  If somebody comes in and inc's in between the read and the sync, like
> btrfs_inc_tree_mod_seq_minor could do when you call tree_mod_alloc, you'll end
> up losing the minor update.  Thanks,

I don't think I'll lose it. The minor update is made and returned to the one who
needs it, that number can still be used. There is no guarantee for two
concurrent modifications to which major a minor number belongs, though.

Thanks,
-Jan
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Josef Bacik April 24, 2013, 1:40 p.m. UTC | #5
On Wed, Apr 24, 2013 at 07:25:09AM -0600, Jan Schmidt wrote:
> On Wed, April 24, 2013 at 15:04 (+0200), Josef Bacik wrote:
> > On Tue, Apr 23, 2013 at 12:00:27PM -0600, Jan Schmidt wrote:
> >> Sequence numbers for delayed refs have been introduced in the first version
> >> of the qgroup patch set. To solve the problem of find_all_roots on a busy
> >> file system, the tree mod log was introduced. The sequence numbers for that
> >> were simply shared between those two users.
> >>
> >> However, at one point in qgroup's quota accounting, there's a statement
> >> accessing the previous sequence number, that's still just doing (seq - 1)
> >> just as it had to in the very first version.
> >>
> >> To satisfy that requirement, this patch makes the sequence number counter 64
> >> bit and splits it into a major part (used for qgroup sequence number
> >> counting) and a minor part (incremented for each tree modification in the
> >> log). This enables us to go exactly one major step backwards, as required
> >> for qgroups, while still incrementing the sequence counter for tree mod log
> >> insertions to keep track of their order. Keeping them in a single variable
> >> means there's no need to change all the code dealing with comparisons of two
> >> sequence numbers.
> >>
> >> The sequence number is reset to 0 on commit (not new in this patch), which
> >> ensures we won't overflow the two 32 bit counters.
> >>
> >> Without this fix, the qgroup tracking can occasionally go wrong and WARN_ONs
> >> from the tree mod log code may happen.
> >>
> >> Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
> >> ---
> >>  fs/btrfs/ctree.c       |   36 +++++++++++++++++++++++++++++++++---
> >>  fs/btrfs/ctree.h       |    7 ++-----
> >>  fs/btrfs/delayed-ref.c |    6 ++++--
> >>  fs/btrfs/disk-io.c     |    2 +-
> >>  fs/btrfs/extent-tree.c |    5 +++--
> >>  fs/btrfs/qgroup.c      |   13 ++++++++-----
> >>  fs/btrfs/transaction.c |    2 +-
> >>  7 files changed, 52 insertions(+), 19 deletions(-)
> >>
> >> diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
> >> index 566d99b..b74136e 100644
> >> --- a/fs/btrfs/ctree.c
> >> +++ b/fs/btrfs/ctree.c
> >> @@ -361,6 +361,36 @@ static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
> >>  }
> >>  
> >>  /*
> >> + * increment the upper half of tree_mod_seq, set lower half zero
> >> + *
> >> + * must be called with fs_info->tree_mod_seq_lock held
> >> + */
> >> +static inline u64 btrfs_inc_tree_mod_seq_major(struct btrfs_fs_info *fs_info)
> >> +{
> >> +	u64 seq = atomic64_read(&fs_info->tree_mod_seq);
> >> +	seq &= 0xffffffff00000000ull;
> >> +	seq += 1ull << 32;
> >> +	atomic64_set(&fs_info->tree_mod_seq, seq);
> >> +	return seq;
> >> +}
> > 
> > This isn't going to work, you read in the value, inc it and then set the new
> > value.  If somebody comes in and inc's in between the read and the sync, like
> > btrfs_inc_tree_mod_seq_minor could do when you call tree_mod_alloc, you'll end
> > up losing the minor update.  Thanks,
> 
> I don't think I'll lose it. The minor update is made and returned to the one who
> needs it, that number can still be used. There is no guarantee for two
> concurrent modifications to which major a minor number belongs, though.
> 

Ah yeah it's returned, but then you'll end up with two things that have the same
seq number when they shouldn't have.  I'm not sure if that's a bad thing or not,
but from up here it looks like a bad thing, at the very least it's going to
surprise anybody who goes to work on this code later and expects it to work out
like a normal atomic would.  So either wrap it in a spin_lock (it looks like you
already have a mod_seq_lock) or come up with something less crafty.  Thanks,

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Schmidt April 24, 2013, 1:59 p.m. UTC | #6
On Wed, April 24, 2013 at 15:40 (+0200), Josef Bacik wrote:
> On Wed, Apr 24, 2013 at 07:25:09AM -0600, Jan Schmidt wrote:
>> On Wed, April 24, 2013 at 15:04 (+0200), Josef Bacik wrote:
>>> On Tue, Apr 23, 2013 at 12:00:27PM -0600, Jan Schmidt wrote:
>>>> Sequence numbers for delayed refs have been introduced in the first version
>>>> of the qgroup patch set. To solve the problem of find_all_roots on a busy
>>>> file system, the tree mod log was introduced. The sequence numbers for that
>>>> were simply shared between those two users.
>>>>
>>>> However, at one point in qgroup's quota accounting, there's a statement
>>>> accessing the previous sequence number, that's still just doing (seq - 1)
>>>> just as it had to in the very first version.
>>>>
>>>> To satisfy that requirement, this patch makes the sequence number counter 64
>>>> bit and splits it into a major part (used for qgroup sequence number
>>>> counting) and a minor part (incremented for each tree modification in the
>>>> log). This enables us to go exactly one major step backwards, as required
>>>> for qgroups, while still incrementing the sequence counter for tree mod log
>>>> insertions to keep track of their order. Keeping them in a single variable
>>>> means there's no need to change all the code dealing with comparisons of two
>>>> sequence numbers.
>>>>
>>>> The sequence number is reset to 0 on commit (not new in this patch), which
>>>> ensures we won't overflow the two 32 bit counters.
>>>>
>>>> Without this fix, the qgroup tracking can occasionally go wrong and WARN_ONs
>>>> from the tree mod log code may happen.
>>>>
>>>> Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
>>>> ---
>>>>  fs/btrfs/ctree.c       |   36 +++++++++++++++++++++++++++++++++---
>>>>  fs/btrfs/ctree.h       |    7 ++-----
>>>>  fs/btrfs/delayed-ref.c |    6 ++++--
>>>>  fs/btrfs/disk-io.c     |    2 +-
>>>>  fs/btrfs/extent-tree.c |    5 +++--
>>>>  fs/btrfs/qgroup.c      |   13 ++++++++-----
>>>>  fs/btrfs/transaction.c |    2 +-
>>>>  7 files changed, 52 insertions(+), 19 deletions(-)
>>>>
>>>> diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
>>>> index 566d99b..b74136e 100644
>>>> --- a/fs/btrfs/ctree.c
>>>> +++ b/fs/btrfs/ctree.c
>>>> @@ -361,6 +361,36 @@ static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
>>>>  }
>>>>  
>>>>  /*
>>>> + * increment the upper half of tree_mod_seq, set lower half zero
>>>> + *
>>>> + * must be called with fs_info->tree_mod_seq_lock held
>>>> + */
>>>> +static inline u64 btrfs_inc_tree_mod_seq_major(struct btrfs_fs_info *fs_info)
>>>> +{
>>>> +	u64 seq = atomic64_read(&fs_info->tree_mod_seq);
>>>> +	seq &= 0xffffffff00000000ull;
>>>> +	seq += 1ull << 32;
>>>> +	atomic64_set(&fs_info->tree_mod_seq, seq);
>>>> +	return seq;
>>>> +}
>>>
>>> This isn't going to work, you read in the value, inc it and then set the new
>>> value.  If somebody comes in and inc's in between the read and the sync, like
>>> btrfs_inc_tree_mod_seq_minor could do when you call tree_mod_alloc, you'll end
>>> up losing the minor update.  Thanks,
>>
>> I don't think I'll lose it. The minor update is made and returned to the one who
>> needs it, that number can still be used. There is no guarantee for two
>> concurrent modifications to which major a minor number belongs, though.
>>
> 
> Ah yeah it's returned, but then you'll end up with two things that have the same
> seq number when they shouldn't have.  I'm not sure if that's a bad thing or not,
> but from up here it looks like a bad thing, at the very least it's going to
> surprise anybody who goes to work on this code later and expects it to work out
> like a normal atomic would.  So either wrap it in a spin_lock (it looks like you
> already have a mod_seq_lock) or come up with something less crafty.  Thanks,

Okay, here comes the deal: I put the tree_mod_seq_lock around
btrfs_inc_tree_mod_seq_minor, I don't expect it really hurts anyway. But as soon
as David comes by and proves it in fact does hurt in one of his manifold use
cases, I'll start arguing again why we can remove it without replace.

Thanks,
-Jan


> Josef
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 566d99b..b74136e 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -361,6 +361,36 @@  static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
 }
 
 /*
+ * increment the upper half of tree_mod_seq, set lower half zero
+ *
+ * must be called with fs_info->tree_mod_seq_lock held
+ */
+static inline u64 btrfs_inc_tree_mod_seq_major(struct btrfs_fs_info *fs_info)
+{
+	u64 seq = atomic64_read(&fs_info->tree_mod_seq);
+	seq &= 0xffffffff00000000ull;
+	seq += 1ull << 32;
+	atomic64_set(&fs_info->tree_mod_seq, seq);
+	return seq;
+}
+
+/*
+ * increment the lower half of tree_mod_seq
+ */
+static inline u64 btrfs_inc_tree_mod_seq_minor(struct btrfs_fs_info *fs_info)
+{
+	return atomic64_inc_return(&fs_info->tree_mod_seq);
+}
+
+/*
+ * return the last minor in the previous major tree_mod_seq number
+ */
+u64 btrfs_tree_mod_seq_prev(u64 seq)
+{
+	return (seq & 0xffffffff00000000ull) - 1ull;
+}
+
+/*
  * This adds a new blocker to the tree mod log's blocker list if the @elem
  * passed does not already have a sequence number set. So when a caller expects
  * to record tree modifications, it should ensure to set elem->seq to zero
@@ -376,10 +406,10 @@  u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
 	tree_mod_log_write_lock(fs_info);
 	spin_lock(&fs_info->tree_mod_seq_lock);
 	if (!elem->seq) {
-		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
+		elem->seq = btrfs_inc_tree_mod_seq_major(fs_info);
 		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
 	}
-	seq = btrfs_inc_tree_mod_seq(fs_info);
+	seq = btrfs_inc_tree_mod_seq_minor(fs_info);
 	spin_unlock(&fs_info->tree_mod_seq_lock);
 	tree_mod_log_write_unlock(fs_info);
 
@@ -524,7 +554,7 @@  static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags,
 	if (!tm)
 		return -ENOMEM;
 
-	tm->seq = btrfs_inc_tree_mod_seq(fs_info);
+	tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info);
 	return tm->seq;
 }
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 412c306..5f34f89 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1422,7 +1422,7 @@  struct btrfs_fs_info {
 
 	/* this protects tree_mod_seq_list */
 	spinlock_t tree_mod_seq_lock;
-	atomic_t tree_mod_seq;
+	atomic64_t tree_mod_seq;
 	struct list_head tree_mod_seq_list;
 	struct seq_list tree_mod_seq_elem;
 
@@ -3334,10 +3334,7 @@  u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
 			   struct seq_list *elem);
 void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
 			    struct seq_list *elem);
-static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
-{
-	return atomic_inc_return(&fs_info->tree_mod_seq);
-}
+u64 btrfs_tree_mod_seq_prev(u64 seq);
 int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq);
 
 /* root-item.c */
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 116abec..c219463 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -361,8 +361,10 @@  int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
 		elem = list_first_entry(&fs_info->tree_mod_seq_list,
 					struct seq_list, list);
 		if (seq >= elem->seq) {
-			pr_debug("holding back delayed_ref %llu, lowest is "
-				 "%llu (%p)\n", seq, elem->seq, delayed_refs);
+			pr_debug("holding back delayed_ref %#x.%x, lowest is %#x.%x (%p)\n",
+				 (u32)(seq >> 32), (u32)seq,
+				 (u32)(elem->seq >> 32), (u32)elem->seq,
+				 delayed_refs);
 			ret = 1;
 		}
 	}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f4628c7..0207132 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2147,7 +2147,7 @@  int open_ctree(struct super_block *sb,
 	atomic_set(&fs_info->async_submit_draining, 0);
 	atomic_set(&fs_info->nr_async_bios, 0);
 	atomic_set(&fs_info->defrag_running, 0);
-	atomic_set(&fs_info->tree_mod_seq, 0);
+	atomic64_set(&fs_info->tree_mod_seq, 0);
 	fs_info->sb = sb;
 	fs_info->max_inline = 8192 * 1024;
 	fs_info->metadata_ratio = 0;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b441be3..f2f1654 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2523,9 +2523,10 @@  int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
 	    !trans->delayed_ref_elem.seq) {
 		/* list without seq or seq without list */
 		btrfs_err(fs_info,
-			"qgroup accounting update error, list is%s empty, seq is %llu",
+			"qgroup accounting update error, list is%s empty, seq is %#x.%x",
 			list_empty(&trans->qgroup_ref_list) ? "" : " not",
-			trans->delayed_ref_elem.seq);
+			(u32)(trans->delayed_ref_elem.seq >> 32),
+			(u32)trans->delayed_ref_elem.seq);
 		BUG();
 	}
 
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index f175471..e5c5623 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1242,9 +1242,11 @@  int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
 	case BTRFS_ADD_DELAYED_REF:
 	case BTRFS_ADD_DELAYED_EXTENT:
 		sgn = 1;
+		seq = btrfs_tree_mod_seq_prev(node->seq);
 		break;
 	case BTRFS_DROP_DELAYED_REF:
 		sgn = -1;
+		seq = node->seq;
 		break;
 	case BTRFS_UPDATE_DELAYED_HEAD:
 		return 0;
@@ -1254,14 +1256,14 @@  int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
 
 	/*
 	 * the delayed ref sequence number we pass depends on the direction of
-	 * the operation. for add operations, we pass (node->seq - 1) to skip
+	 * the operation. for add operations, we pass
+	 * btrfs_tree_mod_seq_prev(node->seq) to skip
 	 * the delayed ref's current sequence number, because we need the state
 	 * of the tree before the add operation. for delete operations, we pass
 	 * (node->seq) to include the delayed ref's current sequence number,
 	 * because we need the state of the tree after the delete operation.
 	 */
-	ret = btrfs_find_all_roots(trans, fs_info, node->bytenr,
-				   sgn > 0 ? node->seq - 1 : node->seq, &roots);
+	ret = btrfs_find_all_roots(trans, fs_info, node->bytenr, seq, &roots);
 	if (ret < 0)
 		return ret;
 
@@ -1772,8 +1774,9 @@  void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
 {
 	if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq)
 		return;
-	printk(KERN_ERR "btrfs: qgroups not uptodate in trans handle %p: list is%s empty, seq is %llu\n",
+	pr_err("btrfs: qgroups not uptodate in trans handle %p: list is%s empty, seq is %#x.%x\n",
 		trans, list_empty(&trans->qgroup_ref_list) ? "" : " not",
-		trans->delayed_ref_elem.seq);
+		(u32)(trans->delayed_ref_elem.seq >> 32),
+		(u32)trans->delayed_ref_elem.seq);
 	BUG();
 }
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 94cbd10..d9edff7 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -163,7 +163,7 @@  loop:
 	if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log))
 		WARN(1, KERN_ERR "btrfs: tree_mod_log rb tree not empty when "
 			"creating a fresh transaction\n");
-	atomic_set(&fs_info->tree_mod_seq, 0);
+	atomic64_set(&fs_info->tree_mod_seq, 0);
 
 	spin_lock_init(&cur_trans->commit_lock);
 	spin_lock_init(&cur_trans->delayed_refs.lock);