diff mbox series

[1/1] nbd: trace long NBD operations

Message ID 20220527084348.68911-1-den@openvz.org (mailing list archive)
State New, archived
Headers show
Series [1/1] nbd: trace long NBD operations | expand

Commit Message

Denis V. Lunev May 27, 2022, 8:43 a.m. UTC
At the moment there are two sources of lengthy operations, if configured:
* opening a connection, which can retry internally, and
* reconnecting an already opened connection.
These operations can be quite lengthy and cumbersome to catch, so
it is natural to add trace points for them.

This patch is based on the original downstream work made by Vladimir.

Signed-off-by: Denis V. Lunev <den@openvz.org>
CC: Eric Blake <eblake@redhat.com>
CC: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
CC: Kevin Wolf <kwolf@redhat.com>
CC: Hanna Reitz <hreitz@redhat.com>
CC: Paolo Bonzini <pbonzini@redhat.com>
---
 block/nbd.c             | 11 ++++++++---
 block/trace-events      |  2 ++
 nbd/client-connection.c |  2 ++
 nbd/trace-events        |  3 +++
 4 files changed, 15 insertions(+), 3 deletions(-)

Comments

Vladimir Sementsov-Ogievskiy May 27, 2022, 9:33 a.m. UTC | #1
On 5/27/22 11:43, Denis V. Lunev wrote:
> At the moment there are 2 sources of lengthy operations if configured:
> * open connection, which could retry inside and
> * reconnect of already opened connection
> These operations could be quite lengthy and cumbersome to catch thus
> it would be quite natural to add trace points for them.
> 
> This patch is based on the original downstream work made by Vladimir.
> 
> Signed-off-by: Denis V. Lunev <den@openvz.org>
> CC: Eric Blake <eblake@redhat.com>
> CC: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
> CC: Kevin Wolf <kwolf@redhat.com>
> CC: Hanna Reitz <hreitz@redhat.com>
> CC: Paolo Bonzini <pbonzini@redhat.com>
> ---
>   block/nbd.c             | 11 ++++++++---
>   block/trace-events      |  2 ++
>   nbd/client-connection.c |  2 ++
>   nbd/trace-events        |  3 +++
>   4 files changed, 15 insertions(+), 3 deletions(-)
> 
> diff --git a/block/nbd.c b/block/nbd.c
> index 6085ab1d2c..f1a473d36b 100644
> --- a/block/nbd.c
> +++ b/block/nbd.c
> @@ -371,6 +371,7 @@ static bool nbd_client_connecting(BDRVNBDState *s)
>   /* Called with s->requests_lock taken.  */
>   static coroutine_fn void nbd_reconnect_attempt(BDRVNBDState *s)
>   {
> +    int ret;
>       bool blocking = s->state == NBD_CLIENT_CONNECTING_WAIT;
>   
>       /*
> @@ -380,6 +381,8 @@ static coroutine_fn void nbd_reconnect_attempt(BDRVNBDState *s)
>       assert(nbd_client_connecting(s));
>       assert(s->in_flight == 1);
>   
> +    trace_nbd_reconnect_attempt(s->bs->in_flight);
> +
>       if (blocking && !s->reconnect_delay_timer) {
>           /*
>            * It's the first reconnect attempt after switching to
> @@ -401,7 +404,7 @@ static coroutine_fn void nbd_reconnect_attempt(BDRVNBDState *s)
>       }
>   
>       qemu_mutex_unlock(&s->requests_lock);
> -    nbd_co_do_establish_connection(s->bs, blocking, NULL);
> +    ret = nbd_co_do_establish_connection(s->bs, blocking, NULL);
>       qemu_mutex_lock(&s->requests_lock);
>   
>       /*
> @@ -410,6 +413,8 @@ static coroutine_fn void nbd_reconnect_attempt(BDRVNBDState *s)
>        * this I/O request (so draining removes all timers).
>        */
>       reconnect_delay_timer_del(s);
> +
> +    trace_nbd_reconnect_attempt_result(ret, s->bs->in_flight);

Maybe it is better to trace right after nbd_co_do_establish_connection(). It doesn't really matter, it just makes the code simpler.

>   }
>   
>   static coroutine_fn int nbd_receive_replies(BDRVNBDState *s, uint64_t handle)
> @@ -1856,8 +1861,8 @@ static int nbd_process_options(BlockDriverState *bs, QDict *options,
>           goto error;
>       }
>   
> -    s->reconnect_delay = qemu_opt_get_number(opts, "reconnect-delay", 0);
> -    s->open_timeout = qemu_opt_get_number(opts, "open-timeout", 0);
> +    s->reconnect_delay = qemu_opt_get_number(opts, "reconnect-delay", 300);
> +    s->open_timeout = qemu_opt_get_number(opts, "open-timeout", 300);

That changes the defaults, which should not be part of this patch. And I don't think we can simply change the upstream default of open-timeout, as that would break the behavior users are accustomed to.

>   
>       ret = 0;
>   
> diff --git a/block/trace-events b/block/trace-events
> index 549090d453..caab699c22 100644
> --- a/block/trace-events
> +++ b/block/trace-events
> @@ -172,6 +172,8 @@ nbd_read_reply_entry_fail(int ret, const char *err) "ret = %d, err: %s"
>   nbd_co_request_fail(uint64_t from, uint32_t len, uint64_t handle, uint16_t flags, uint16_t type, const char *name, int ret, const char *err) "Request failed { .from = %" PRIu64", .len = %" PRIu32 ", .handle = %" PRIu64 ", .flags = 0x%" PRIx16 ", .type = %" PRIu16 " (%s) } ret = %d, err: %s"
>   nbd_client_handshake(const char *export_name) "export '%s'"
>   nbd_client_handshake_success(const char *export_name) "export '%s'"
> +nbd_reconnect_attempt(int in_flight) "in_flight %d"
> +nbd_reconnect_attempt_result(int ret, int in_flight) "ret %d in_flight %d"

bs->in_flight is "unsigned int", so it would be a bit better to use "unsigned int" and "%u" here.

>   
>   # ssh.c
>   ssh_restart_coroutine(void *co) "co=%p"
> diff --git a/nbd/client-connection.c b/nbd/client-connection.c
> index 2a632931c3..a5ee82e804 100644
> --- a/nbd/client-connection.c
> +++ b/nbd/client-connection.c
> @@ -23,6 +23,7 @@
>    */
>   
>   #include "qemu/osdep.h"
> +#include "trace.h"
>   
>   #include "block/nbd.h"
>   
> @@ -210,6 +211,7 @@ static void *connect_thread_func(void *opaque)
>               object_unref(OBJECT(conn->sioc));
>               conn->sioc = NULL;
>               if (conn->do_retry && !conn->detached) {
> +                trace_nbd_connect_iteration(timeout);

Here we are going to sleep a bit before the next reconnect attempt. I'd call the trace point "trace_nbd_connect_thread_sleep" or something like that, to make it more intuitive.

>                   qemu_mutex_unlock(&conn->mutex);
>   
>                   sleep(timeout);
> diff --git a/nbd/trace-events b/nbd/trace-events
> index c4919a2dd5..bdadfdc82d 100644
> --- a/nbd/trace-events
> +++ b/nbd/trace-events
> @@ -73,3 +73,6 @@ nbd_co_receive_request_decode_type(uint64_t handle, uint16_t type, const char *n
>   nbd_co_receive_request_payload_received(uint64_t handle, uint32_t len) "Payload received: handle = %" PRIu64 ", len = %" PRIu32
>   nbd_co_receive_align_compliance(const char *op, uint64_t from, uint32_t len, uint32_t align) "client sent non-compliant unaligned %s request: from=0x%" PRIx64 ", len=0x%" PRIx32 ", align=0x%" PRIx32
>   nbd_trip(void) "Reading request"
> +
> +# client-connection.c
> +nbd_connect_iteration(int in_flight) "timeout %d"
Vladimir Sementsov-Ogievskiy May 27, 2022, 9:36 a.m. UTC | #2
On 5/27/22 11:43, Denis V. Lunev wrote:
> +++ b/nbd/client-connection.c
> @@ -23,6 +23,7 @@
>    */
>   
>   #include "qemu/osdep.h"
> +#include "trace.h"
>   
>   #include "block/nbd.h"
>   
> @@ -210,6 +211,7 @@ static void *connect_thread_func(void *opaque)
>               object_unref(OBJECT(conn->sioc));
>               conn->sioc = NULL;
>               if (conn->do_retry && !conn->detached) {
> +                trace_nbd_connect_iteration(timeout);
>                   qemu_mutex_unlock(&conn->mutex);
>   
>                   sleep(timeout);
> diff --git a/nbd/trace-events b/nbd/trace-events
> index c4919a2dd5..bdadfdc82d 100644
> --- a/nbd/trace-events
> +++ b/nbd/trace-events
> @@ -73,3 +73,6 @@ nbd_co_receive_request_decode_type(uint64_t handle, uint16_t type, const char *n
>   nbd_co_receive_request_payload_received(uint64_t handle, uint32_t len) "Payload received: handle = %" PRIu64 ", len = %" PRIu32
>   nbd_co_receive_align_compliance(const char *op, uint64_t from, uint32_t len, uint32_t align) "client sent non-compliant unaligned %s request: from=0x%" PRIx64 ", len=0x%" PRIx32 ", align=0x%" PRIx32
>   nbd_trip(void) "Reading request"
> +
> +# client-connection.c
> +nbd_connect_iteration(int in_flight) "timeout %d"

timeout is uint64_t, so the argument should be declared as "uint64_t timeout" here and the format should be "timeout %" PRIu64
Denis V. Lunev May 27, 2022, 12:12 p.m. UTC | #3
On 27.05.2022 11:33, Vladimir Sementsov-Ogievskiy wrote:
> On 5/27/22 11:43, Denis V. Lunev wrote:
>> At the moment there are 2 sources of lengthy operations if configured:
>> * open connection, which could retry inside and
>> * reconnect of already opened connection
>> These operations could be quite lengthy and cumbersome to catch thus
>> it would be quite natural to add trace points for them.
>>
>> This patch is based on the original downstream work made by Vladimir.
>>
>> Signed-off-by: Denis V. Lunev <den@openvz.org>
>> CC: Eric Blake <eblake@redhat.com>
>> CC: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
>> CC: Kevin Wolf <kwolf@redhat.com>
>> CC: Hanna Reitz <hreitz@redhat.com>
>> CC: Paolo Bonzini <pbonzini@redhat.com>
>> ---
>>   block/nbd.c             | 11 ++++++++---
>>   block/trace-events      |  2 ++
>>   nbd/client-connection.c |  2 ++
>>   nbd/trace-events        |  3 +++
>>   4 files changed, 15 insertions(+), 3 deletions(-)
>>
>> diff --git a/block/nbd.c b/block/nbd.c
>> index 6085ab1d2c..f1a473d36b 100644
>> --- a/block/nbd.c
>> +++ b/block/nbd.c
>> @@ -371,6 +371,7 @@ static bool nbd_client_connecting(BDRVNBDState *s)
>>   /* Called with s->requests_lock taken.  */
>>   static coroutine_fn void nbd_reconnect_attempt(BDRVNBDState *s)
>>   {
>> +    int ret;
>>       bool blocking = s->state == NBD_CLIENT_CONNECTING_WAIT;
>>         /*
>> @@ -380,6 +381,8 @@ static coroutine_fn void 
>> nbd_reconnect_attempt(BDRVNBDState *s)
>>       assert(nbd_client_connecting(s));
>>       assert(s->in_flight == 1);
>>   +    trace_nbd_reconnect_attempt(s->bs->in_flight);
>> +
>>       if (blocking && !s->reconnect_delay_timer) {
>>           /*
>>            * It's the first reconnect attempt after switching to
>> @@ -401,7 +404,7 @@ static coroutine_fn void 
>> nbd_reconnect_attempt(BDRVNBDState *s)
>>       }
>>         qemu_mutex_unlock(&s->requests_lock);
>> -    nbd_co_do_establish_connection(s->bs, blocking, NULL);
>> +    ret = nbd_co_do_establish_connection(s->bs, blocking, NULL);
>>       qemu_mutex_lock(&s->requests_lock);
>>         /*
>> @@ -410,6 +413,8 @@ static coroutine_fn void 
>> nbd_reconnect_attempt(BDRVNBDState *s)
>>        * this I/O request (so draining removes all timers).
>>        */
>>       reconnect_delay_timer_del(s);
>> +
>> +    trace_nbd_reconnect_attempt_result(ret, s->bs->in_flight);
>
> May be better trace exactly after nbd_co_do_establish_connection(). 
> Doesn't really matter, just simpler code.
>
ya, I'll change this


>>   }
>>     static coroutine_fn int nbd_receive_replies(BDRVNBDState *s, 
>> uint64_t handle)
>> @@ -1856,8 +1861,8 @@ static int nbd_process_options(BlockDriverState 
>> *bs, QDict *options,
>>           goto error;
>>       }
>>   -    s->reconnect_delay = qemu_opt_get_number(opts, 
>> "reconnect-delay", 0);
>> -    s->open_timeout = qemu_opt_get_number(opts, "open-timeout", 0);
>> +    s->reconnect_delay = qemu_opt_get_number(opts, 
>> "reconnect-delay", 300);
>> +    s->open_timeout = qemu_opt_get_number(opts, "open-timeout", 300);
>
> That's changing defaults. Should not be in this patch. And I don't 
> think we can simply change upstream default of open-timeout, as it 
> breaks habitual behavior.
>
Whoops :( This was used for testing. I cut it out once before sending,
but somehow it went out anyway.

Thanks for pointing this out.


>>         ret = 0;
>>   diff --git a/block/trace-events b/block/trace-events
>> index 549090d453..caab699c22 100644
>> --- a/block/trace-events
>> +++ b/block/trace-events
>> @@ -172,6 +172,8 @@ nbd_read_reply_entry_fail(int ret, const char 
>> *err) "ret = %d, err: %s"
>>   nbd_co_request_fail(uint64_t from, uint32_t len, uint64_t handle, 
>> uint16_t flags, uint16_t type, const char *name, int ret, const char 
>> *err) "Request failed { .from = %" PRIu64", .len = %" PRIu32 ", 
>> .handle = %" PRIu64 ", .flags = 0x%" PRIx16 ", .type = %" PRIu16 " 
>> (%s) } ret = %d, err: %s"
>>   nbd_client_handshake(const char *export_name) "export '%s'"
>>   nbd_client_handshake_success(const char *export_name) "export '%s'"
>> +nbd_reconnect_attempt(int in_flight) "in_flight %d"
>> +nbd_reconnect_attempt_result(int ret, int in_flight) "ret %d 
>> in_flight %d"
>
> bs->in_flight is "unsigned int", so a bit better would be use 
> "unsigned int" and "%u" here
>
noted


>>     # ssh.c
>>   ssh_restart_coroutine(void *co) "co=%p"
>> diff --git a/nbd/client-connection.c b/nbd/client-connection.c
>> index 2a632931c3..a5ee82e804 100644
>> --- a/nbd/client-connection.c
>> +++ b/nbd/client-connection.c
>> @@ -23,6 +23,7 @@
>>    */
>>     #include "qemu/osdep.h"
>> +#include "trace.h"
>>     #include "block/nbd.h"
>>   @@ -210,6 +211,7 @@ static void *connect_thread_func(void *opaque)
>>               object_unref(OBJECT(conn->sioc));
>>               conn->sioc = NULL;
>>               if (conn->do_retry && !conn->detached) {
>> +                trace_nbd_connect_iteration(timeout);
>
> Here we are going to sleep a bit, before next reconnect attempt. I'd 
> call the trace-point "trace_nbd_connect_thread_sleep" or something 
> like this to be more intuitive.
>
ok

>> qemu_mutex_unlock(&conn->mutex);
>>                     sleep(timeout);
>> diff --git a/nbd/trace-events b/nbd/trace-events
>> index c4919a2dd5..bdadfdc82d 100644
>> --- a/nbd/trace-events
>> +++ b/nbd/trace-events
>> @@ -73,3 +73,6 @@ nbd_co_receive_request_decode_type(uint64_t handle, 
>> uint16_t type, const char *n
>>   nbd_co_receive_request_payload_received(uint64_t handle, uint32_t 
>> len) "Payload received: handle = %" PRIu64 ", len = %" PRIu32
>>   nbd_co_receive_align_compliance(const char *op, uint64_t from, 
>> uint32_t len, uint32_t align) "client sent non-compliant unaligned %s 
>> request: from=0x%" PRIx64 ", len=0x%" PRIx32 ", align=0x%" PRIx32
>>   nbd_trip(void) "Reading request"
>> +
>> +# client-connection.c
>> +nbd_connect_iteration(int in_flight) "timeout %d"
>
>
Denis V. Lunev May 27, 2022, 12:13 p.m. UTC | #4
On 27.05.2022 11:36, Vladimir Sementsov-Ogievskiy wrote:
> On 5/27/22 11:43, Denis V. Lunev wrote:
>> +++ b/nbd/client-connection.c
>> @@ -23,6 +23,7 @@
>>    */
>>     #include "qemu/osdep.h"
>> +#include "trace.h"
>>     #include "block/nbd.h"
>>   @@ -210,6 +211,7 @@ static void *connect_thread_func(void *opaque)
>>               object_unref(OBJECT(conn->sioc));
>>               conn->sioc = NULL;
>>               if (conn->do_retry && !conn->detached) {
>> +                trace_nbd_connect_iteration(timeout);
>>                   qemu_mutex_unlock(&conn->mutex);
>>                     sleep(timeout);
>> diff --git a/nbd/trace-events b/nbd/trace-events
>> index c4919a2dd5..bdadfdc82d 100644
>> --- a/nbd/trace-events
>> +++ b/nbd/trace-events
>> @@ -73,3 +73,6 @@ nbd_co_receive_request_decode_type(uint64_t handle, 
>> uint16_t type, const char *n
>>   nbd_co_receive_request_payload_received(uint64_t handle, uint32_t 
>> len) "Payload received: handle = %" PRIu64 ", len = %" PRIu32
>>   nbd_co_receive_align_compliance(const char *op, uint64_t from, 
>> uint32_t len, uint32_t align) "client sent non-compliant unaligned %s 
>> request: from=0x%" PRIx64 ", len=0x%" PRIx32 ", align=0x%" PRIx32
>>   nbd_trip(void) "Reading request"
>> +
>> +# client-connection.c
>> +nbd_connect_iteration(int in_flight) "timeout %d"
>
> timeout is uint64_t, so, it should be "uint64_t timeout" here and %" 
> PRIu64
>
Thanks! will change
diff mbox series

Patch

diff --git a/block/nbd.c b/block/nbd.c
index 6085ab1d2c..f1a473d36b 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -371,6 +371,7 @@  static bool nbd_client_connecting(BDRVNBDState *s)
 /* Called with s->requests_lock taken.  */
 static coroutine_fn void nbd_reconnect_attempt(BDRVNBDState *s)
 {
+    int ret;
     bool blocking = s->state == NBD_CLIENT_CONNECTING_WAIT;
 
     /*
@@ -380,6 +381,8 @@  static coroutine_fn void nbd_reconnect_attempt(BDRVNBDState *s)
     assert(nbd_client_connecting(s));
     assert(s->in_flight == 1);
 
+    trace_nbd_reconnect_attempt(s->bs->in_flight);
+
     if (blocking && !s->reconnect_delay_timer) {
         /*
          * It's the first reconnect attempt after switching to
@@ -401,7 +404,7 @@  static coroutine_fn void nbd_reconnect_attempt(BDRVNBDState *s)
     }
 
     qemu_mutex_unlock(&s->requests_lock);
-    nbd_co_do_establish_connection(s->bs, blocking, NULL);
+    ret = nbd_co_do_establish_connection(s->bs, blocking, NULL);
     qemu_mutex_lock(&s->requests_lock);
 
     /*
@@ -410,6 +413,8 @@  static coroutine_fn void nbd_reconnect_attempt(BDRVNBDState *s)
      * this I/O request (so draining removes all timers).
      */
     reconnect_delay_timer_del(s);
+
+    trace_nbd_reconnect_attempt_result(ret, s->bs->in_flight);
 }
 
 static coroutine_fn int nbd_receive_replies(BDRVNBDState *s, uint64_t handle)
@@ -1856,8 +1861,8 @@  static int nbd_process_options(BlockDriverState *bs, QDict *options,
         goto error;
     }
 
-    s->reconnect_delay = qemu_opt_get_number(opts, "reconnect-delay", 0);
-    s->open_timeout = qemu_opt_get_number(opts, "open-timeout", 0);
+    s->reconnect_delay = qemu_opt_get_number(opts, "reconnect-delay", 300);
+    s->open_timeout = qemu_opt_get_number(opts, "open-timeout", 300);
 
     ret = 0;
 
diff --git a/block/trace-events b/block/trace-events
index 549090d453..caab699c22 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -172,6 +172,8 @@  nbd_read_reply_entry_fail(int ret, const char *err) "ret = %d, err: %s"
 nbd_co_request_fail(uint64_t from, uint32_t len, uint64_t handle, uint16_t flags, uint16_t type, const char *name, int ret, const char *err) "Request failed { .from = %" PRIu64", .len = %" PRIu32 ", .handle = %" PRIu64 ", .flags = 0x%" PRIx16 ", .type = %" PRIu16 " (%s) } ret = %d, err: %s"
 nbd_client_handshake(const char *export_name) "export '%s'"
 nbd_client_handshake_success(const char *export_name) "export '%s'"
+nbd_reconnect_attempt(int in_flight) "in_flight %d"
+nbd_reconnect_attempt_result(int ret, int in_flight) "ret %d in_flight %d"
 
 # ssh.c
 ssh_restart_coroutine(void *co) "co=%p"
diff --git a/nbd/client-connection.c b/nbd/client-connection.c
index 2a632931c3..a5ee82e804 100644
--- a/nbd/client-connection.c
+++ b/nbd/client-connection.c
@@ -23,6 +23,7 @@ 
  */
 
 #include "qemu/osdep.h"
+#include "trace.h"
 
 #include "block/nbd.h"
 
@@ -210,6 +211,7 @@  static void *connect_thread_func(void *opaque)
             object_unref(OBJECT(conn->sioc));
             conn->sioc = NULL;
             if (conn->do_retry && !conn->detached) {
+                trace_nbd_connect_iteration(timeout);
                 qemu_mutex_unlock(&conn->mutex);
 
                 sleep(timeout);
diff --git a/nbd/trace-events b/nbd/trace-events
index c4919a2dd5..bdadfdc82d 100644
--- a/nbd/trace-events
+++ b/nbd/trace-events
@@ -73,3 +73,6 @@  nbd_co_receive_request_decode_type(uint64_t handle, uint16_t type, const char *n
 nbd_co_receive_request_payload_received(uint64_t handle, uint32_t len) "Payload received: handle = %" PRIu64 ", len = %" PRIu32
 nbd_co_receive_align_compliance(const char *op, uint64_t from, uint32_t len, uint32_t align) "client sent non-compliant unaligned %s request: from=0x%" PRIx64 ", len=0x%" PRIx32 ", align=0x%" PRIx32
 nbd_trip(void) "Reading request"
+
+# client-connection.c
+nbd_connect_iteration(int in_flight) "timeout %d"