[v2] btrfs: trace: Introduce trace events for sleepable tree lock
diff mbox series

Message ID 20190402094554.8353-1-wqu@suse.com
State New
Headers show
Series
  • [v2] btrfs: trace: Introduce trace events for sleepable tree lock
Related show

Commit Message

Qu Wenruo April 2, 2019, 9:45 a.m. UTC
There are two tree lock events which can sleep:
- btrfs_tree_read_lock()
- btrfs_tree_lock()

Sometimes we may need to look into the concurrency picture of the fs.
For that case, we need the execution time of above two functions and the
owner of @eb.

Here we introduce a trace events for user space tools like bcc, to get
the execution time of above two functions, and get detailed owner info
where eBPF code can't.

All the overhead is hidden behind the trace events, so if events are not
enabled, there is no overhead.

Also, since this patch and later user space tool only cares about the
execution time and owner, other info like bytenr is ignored in this
events.

Signed-off-by: Qu Wenruo <wqu@suse.com>
---
Changelog:
v2:
- Hide all ktime_get_ns() call behind trace events
  So there will be no overhead if trace events are not enabled.
---
 fs/btrfs/locking.c           | 12 ++++++++++++
 include/trace/events/btrfs.h | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+)

Comments

David Sterba April 3, 2019, 3:43 p.m. UTC | #1
On Tue, Apr 02, 2019 at 05:45:54PM +0800, Qu Wenruo wrote:
> There are two tree lock events which can sleep:
> - btrfs_tree_read_lock()
> - btrfs_tree_lock()
> 
> Sometimes we may need to look into the concurrency picture of the fs.
> For that case, we need the execution time of above two functions and the
> owner of @eb.
> 
> Here we introduce a trace events for user space tools like bcc, to get
> the execution time of above two functions, and get detailed owner info
> where eBPF code can't.
> 
> All the overhead is hidden behind the trace events, so if events are not
> enabled, there is no overhead.
> 
> Also, since this patch and later user space tool only cares about the
> execution time and owner, other info like bytenr is ignored in this
> events.

The time is definetelly the interesting part here, I'm thinking if the
block number could also provide some useful hints. The gathered trace
data can be used to group by blocks to compare the times, or see the
histogram of times.

The block at minimum would be useful and I don't want to add more
information there, like level or nritems. In case we do want that
information, a heavier tracepoint version should be added with all the
other data.

> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
> Changelog:
> v2:
> - Hide all ktime_get_ns() call behind trace events
>   So there will be no overhead if trace events are not enabled.
> ---
>  fs/btrfs/locking.c           | 12 ++++++++++++
>  include/trace/events/btrfs.h | 36 ++++++++++++++++++++++++++++++++++++
>  2 files changed, 48 insertions(+)
> 
> diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
> index 6df03ba36026..67b77f1d113e 100644
> --- a/fs/btrfs/locking.c
> +++ b/fs/btrfs/locking.c
> @@ -158,6 +158,10 @@ void btrfs_clear_lock_blocking_write(struct extent_buffer *eb)
>   */
>  void btrfs_tree_read_lock(struct extent_buffer *eb)
>  {
> +	u64 start_ns = 0;
> +
> +	if (trace_btrfs_tree_read_lock_enabled())
> +		start_ns = ktime_get_ns();
>  again:
>  	BUG_ON(!atomic_read(&eb->blocking_writers) &&
>  	       current->pid == eb->lock_owner);
> @@ -174,6 +178,7 @@ void btrfs_tree_read_lock(struct extent_buffer *eb)
>  		BUG_ON(eb->lock_nested);
>  		eb->lock_nested = true;
>  		read_unlock(&eb->lock);
> +		trace_btrfs_tree_read_lock(eb, start_ns);
>  		return;
>  	}
>  	if (atomic_read(&eb->blocking_writers)) {
> @@ -184,6 +189,7 @@ void btrfs_tree_read_lock(struct extent_buffer *eb)
>  	}
>  	btrfs_assert_tree_read_locks_get(eb);
>  	btrfs_assert_spinning_readers_get(eb);
> +	trace_btrfs_tree_read_lock(eb, start_ns);
>  }
>  
>  /*
> @@ -299,6 +305,11 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
>   */
>  void btrfs_tree_lock(struct extent_buffer *eb)
>  {
> +	u64 start_ns = 0;
> +
> +	if (trace_btrfs_tree_lock_enabled())
> +		start_ns = ktime_get_ns();
> +
>  	WARN_ON(eb->lock_owner == current->pid);
>  again:
>  	wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0);
> @@ -312,6 +323,7 @@ void btrfs_tree_lock(struct extent_buffer *eb)
>  	btrfs_assert_spinning_writers_get(eb);
>  	btrfs_assert_tree_write_locks_get(eb);
>  	eb->lock_owner = current->pid;
> +	trace_btrfs_tree_lock(eb, start_ns);
>  }
>  
>  /*
> diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
> index 8b12753fee78..406a71571254 100644
> --- a/include/trace/events/btrfs.h
> +++ b/include/trace/events/btrfs.h
> @@ -2005,6 +2005,42 @@ TRACE_EVENT(btrfs_convert_extent_bit,
>  		  __print_flags(__entry->clear_bits, "|", EXTENT_FLAGS))
>  );
>  
> +DECLARE_EVENT_CLASS(btrfs_sleep_tree_lock,
> +	TP_PROTO(const struct extent_buffer *eb, u64 start_ns),
> +
> +	TP_ARGS(eb, start_ns),
> +
> +	TP_STRUCT__entry_btrfs(
> +		__field(	u64,	start_ns	)
> +		__field(	u64,	end_ns		)
> +		__field(	u64,	owner		)
> +		__field(	int,	is_log_tree	)
> +	),
> +
> +	TP_fast_assign_btrfs(eb->fs_info,
> +		__entry->start_ns	= start_ns;
> +		__entry->end_ns		= ktime_get_ns();
> +		__entry->owner		= btrfs_header_owner(eb);
> +		__entry->is_log_tree	= (eb->log_index >= 0);
> +	),
> +
> +	TP_printk_btrfs("start_ns=%llu end_ns=%llu owner=%llu is_log_tree=%d",
> +		__entry->start_ns, __entry->end_ns, __entry->owner,
> +		__entry->is_log_tree)

I think the delta between start_ns and end_ns should be printed too, so
it's obvious without further processing:

	TP_printk_btrfs("start_ns=%llu end_ns=%llu diff=%llu owner=%llu is_log_tree=%d",
Qu Wenruo April 4, 2019, 12:56 a.m. UTC | #2
On 2019/4/3 下午11:43, David Sterba wrote:
> On Tue, Apr 02, 2019 at 05:45:54PM +0800, Qu Wenruo wrote:
>> There are two tree lock events which can sleep:
>> - btrfs_tree_read_lock()
>> - btrfs_tree_lock()
>>
>> Sometimes we may need to look into the concurrency picture of the fs.
>> For that case, we need the execution time of above two functions and the
>> owner of @eb.
>>
>> Here we introduce a trace events for user space tools like bcc, to get
>> the execution time of above two functions, and get detailed owner info
>> where eBPF code can't.
>>
>> All the overhead is hidden behind the trace events, so if events are not
>> enabled, there is no overhead.
>>
>> Also, since this patch and later user space tool only cares about the
>> execution time and owner, other info like bytenr is ignored in this
>> events.
> 
> The time is definetelly the interesting part here, I'm thinking if the
> block number could also provide some useful hints. The gathered trace
> data can be used to group by blocks to compare the times, or see the
> histogram of times.
> 
> The block at minimum would be useful and I don't want to add more
> information there, like level or nritems. In case we do want that
> information, a heavier tracepoint version should be added with all the
> other data.

The block number is not that useful in current form.
As a block number can be the same while contains data for different
trees between different transactions.

That's the reason why owner makes more sense in this situation.

To make block number makes sense, we need to pair
trace_btrfs_tree_lock() with trace_btrfs_tree_unlock().


> 
>> Signed-off-by: Qu Wenruo <wqu@suse.com>
>> ---
[snip]
>> +	),
>> +
>> +	TP_printk_btrfs("start_ns=%llu end_ns=%llu owner=%llu is_log_tree=%d",
>> +		__entry->start_ns, __entry->end_ns, __entry->owner,
>> +		__entry->is_log_tree)
> 
> I think the delta between start_ns and end_ns should be printed too, so
> it's obvious without further processing:

My idea is to put that processing part into user space.
So we can reduce the space usage of the event buffer.

But it will do no harm anyway, so I'll add the output.

Thanks,
Qu

> 
> 	TP_printk_btrfs("start_ns=%llu end_ns=%llu diff=%llu owner=%llu is_log_tree=%d",
>
David Sterba April 8, 2019, 11:38 p.m. UTC | #3
On Thu, Apr 04, 2019 at 08:56:15AM +0800, Qu Wenruo wrote:
> 
> 
> On 2019/4/3 下午11:43, David Sterba wrote:
> > On Tue, Apr 02, 2019 at 05:45:54PM +0800, Qu Wenruo wrote:
> >> There are two tree lock events which can sleep:
> >> - btrfs_tree_read_lock()
> >> - btrfs_tree_lock()
> >>
> >> Sometimes we may need to look into the concurrency picture of the fs.
> >> For that case, we need the execution time of above two functions and the
> >> owner of @eb.
> >>
> >> Here we introduce a trace events for user space tools like bcc, to get
> >> the execution time of above two functions, and get detailed owner info
> >> where eBPF code can't.
> >>
> >> All the overhead is hidden behind the trace events, so if events are not
> >> enabled, there is no overhead.
> >>
> >> Also, since this patch and later user space tool only cares about the
> >> execution time and owner, other info like bytenr is ignored in this
> >> events.
> > 
> > The time is definetelly the interesting part here, I'm thinking if the
> > block number could also provide some useful hints. The gathered trace
> > data can be used to group by blocks to compare the times, or see the
> > histogram of times.
> > 
> > The block at minimum would be useful and I don't want to add more
> > information there, like level or nritems. In case we do want that
> > information, a heavier tracepoint version should be added with all the
> > other data.
> 
> The block number is not that useful in current form.
> As a block number can be the same while contains data for different
> trees between different transactions.

Right, so the generation would have to be printed too, that's more than
I'd thought.

> That's the reason why owner makes more sense in this situation.
> 
> To make block number makes sense, we need to pair
> trace_btrfs_tree_lock() with trace_btrfs_tree_unlock().
> 
> 
> > 
> >> Signed-off-by: Qu Wenruo <wqu@suse.com>
> >> ---
> [snip]
> >> +	),
> >> +
> >> +	TP_printk_btrfs("start_ns=%llu end_ns=%llu owner=%llu is_log_tree=%d",
> >> +		__entry->start_ns, __entry->end_ns, __entry->owner,
> >> +		__entry->is_log_tree)
> > 
> > I think the delta between start_ns and end_ns should be printed too, so
> > it's obvious without further processing:
> 
> My idea is to put that processing part into user space.
> So we can reduce the space usage of the event buffer.
> 
> But it will do no harm anyway, so I'll add the output.

Postprocessing is always an option, but the calculated value is more for
a quick glance or simply is suitable for copy&paste of the trace log.
Qu Wenruo April 9, 2019, 1:15 p.m. UTC | #4
On 2019/4/9 上午7:38, David Sterba wrote:
> On Thu, Apr 04, 2019 at 08:56:15AM +0800, Qu Wenruo wrote:
>>
>>
>> On 2019/4/3 下午11:43, David Sterba wrote:
>>> On Tue, Apr 02, 2019 at 05:45:54PM +0800, Qu Wenruo wrote:
>>>> There are two tree lock events which can sleep:
>>>> - btrfs_tree_read_lock()
>>>> - btrfs_tree_lock()
>>>>
>>>> Sometimes we may need to look into the concurrency picture of the fs.
>>>> For that case, we need the execution time of above two functions and the
>>>> owner of @eb.
>>>>
>>>> Here we introduce a trace events for user space tools like bcc, to get
>>>> the execution time of above two functions, and get detailed owner info
>>>> where eBPF code can't.
>>>>
>>>> All the overhead is hidden behind the trace events, so if events are not
>>>> enabled, there is no overhead.
>>>>
>>>> Also, since this patch and later user space tool only cares about the
>>>> execution time and owner, other info like bytenr is ignored in this
>>>> events.
>>>
>>> The time is definetelly the interesting part here, I'm thinking if the
>>> block number could also provide some useful hints. The gathered trace
>>> data can be used to group by blocks to compare the times, or see the
>>> histogram of times.
>>>
>>> The block at minimum would be useful and I don't want to add more
>>> information there, like level or nritems. In case we do want that
>>> information, a heavier tracepoint version should be added with all the
>>> other data.
>>
>> The block number is not that useful in current form.
>> As a block number can be the same while contains data for different
>> trees between different transactions.
> 
> Right, so the generation would have to be printed too, that's more than
> I'd thought.

Generation itself already makes sense.
With tree locking it could be used as a minor indicator of transaction
locking (although still unreliable).

> 
>> That's the reason why owner makes more sense in this situation.
>>
>> To make block number makes sense, we need to pair
>> trace_btrfs_tree_lock() with trace_btrfs_tree_unlock().
>>
>>
>>>
>>>> Signed-off-by: Qu Wenruo <wqu@suse.com>
>>>> ---
>> [snip]
>>>> +	),
>>>> +
>>>> +	TP_printk_btrfs("start_ns=%llu end_ns=%llu owner=%llu is_log_tree=%d",
>>>> +		__entry->start_ns, __entry->end_ns, __entry->owner,
>>>> +		__entry->is_log_tree)
>>>
>>> I think the delta between start_ns and end_ns should be printed too, so
>>> it's obvious without further processing:
>>
>> My idea is to put that processing part into user space.
>> So we can reduce the space usage of the event buffer.
>>
>> But it will do no harm anyway, so I'll add the output.
> 
> Postprocessing is always an option, but the calculated value is more for
> a quick glance or simply is suitable for copy&paste of the trace log.
It won't do harm, but quick glance may not make sense for such heavy
calling trace.

Anyway, I'll add generation, block and diff_ns into the output.
Since bcc can do extra filter on the trace events, it shouldn't cause
much buffer space problem.

Thanks,
Qu

Patch
diff mbox series

diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 6df03ba36026..67b77f1d113e 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -158,6 +158,10 @@  void btrfs_clear_lock_blocking_write(struct extent_buffer *eb)
  */
 void btrfs_tree_read_lock(struct extent_buffer *eb)
 {
+	u64 start_ns = 0;
+
+	if (trace_btrfs_tree_read_lock_enabled())
+		start_ns = ktime_get_ns();
 again:
 	BUG_ON(!atomic_read(&eb->blocking_writers) &&
 	       current->pid == eb->lock_owner);
@@ -174,6 +178,7 @@  void btrfs_tree_read_lock(struct extent_buffer *eb)
 		BUG_ON(eb->lock_nested);
 		eb->lock_nested = true;
 		read_unlock(&eb->lock);
+		trace_btrfs_tree_read_lock(eb, start_ns);
 		return;
 	}
 	if (atomic_read(&eb->blocking_writers)) {
@@ -184,6 +189,7 @@  void btrfs_tree_read_lock(struct extent_buffer *eb)
 	}
 	btrfs_assert_tree_read_locks_get(eb);
 	btrfs_assert_spinning_readers_get(eb);
+	trace_btrfs_tree_read_lock(eb, start_ns);
 }
 
 /*
@@ -299,6 +305,11 @@  void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
  */
 void btrfs_tree_lock(struct extent_buffer *eb)
 {
+	u64 start_ns = 0;
+
+	if (trace_btrfs_tree_lock_enabled())
+		start_ns = ktime_get_ns();
+
 	WARN_ON(eb->lock_owner == current->pid);
 again:
 	wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0);
@@ -312,6 +323,7 @@  void btrfs_tree_lock(struct extent_buffer *eb)
 	btrfs_assert_spinning_writers_get(eb);
 	btrfs_assert_tree_write_locks_get(eb);
 	eb->lock_owner = current->pid;
+	trace_btrfs_tree_lock(eb, start_ns);
 }
 
 /*
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 8b12753fee78..406a71571254 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -2005,6 +2005,42 @@  TRACE_EVENT(btrfs_convert_extent_bit,
 		  __print_flags(__entry->clear_bits, "|", EXTENT_FLAGS))
 );
 
+DECLARE_EVENT_CLASS(btrfs_sleep_tree_lock,
+	TP_PROTO(const struct extent_buffer *eb, u64 start_ns),
+
+	TP_ARGS(eb, start_ns),
+
+	TP_STRUCT__entry_btrfs(
+		__field(	u64,	start_ns	)
+		__field(	u64,	end_ns		)
+		__field(	u64,	owner		)
+		__field(	int,	is_log_tree	)
+	),
+
+	TP_fast_assign_btrfs(eb->fs_info,
+		__entry->start_ns	= start_ns;
+		__entry->end_ns		= ktime_get_ns();
+		__entry->owner		= btrfs_header_owner(eb);
+		__entry->is_log_tree	= (eb->log_index >= 0);
+	),
+
+	TP_printk_btrfs("start_ns=%llu end_ns=%llu owner=%llu is_log_tree=%d",
+		__entry->start_ns, __entry->end_ns, __entry->owner,
+		__entry->is_log_tree)
+);
+
+DEFINE_EVENT(btrfs_sleep_tree_lock, btrfs_tree_read_lock,
+	TP_PROTO(const struct extent_buffer *eb, u64 start_ns),
+
+	TP_ARGS(eb, start_ns)
+);
+
+DEFINE_EVENT(btrfs_sleep_tree_lock, btrfs_tree_lock,
+	TP_PROTO(const struct extent_buffer *eb, u64 start_ns),
+
+	TP_ARGS(eb, start_ns)
+);
+
 #endif /* _TRACE_BTRFS_H */
 
 /* This part must be outside protection */