diff mbox series

[v9,1/8] Introduce yank feature

Message ID 7b3b182b6ab1a859a1e9fb4ebfa5ce0a7a441e10.1603909658.git.lukasstraub2@web.de
State New, archived
Headers show
Series Introduce 'yank' oob qmp command to recover from hanging qemu | expand

Commit Message

Lukas Straub Oct. 28, 2020, 6:45 p.m. UTC
The yank feature allows to recover from hanging qemu by "yanking"
at various parts. Other qemu systems can register themselves and
multiple yank functions. Then all yank functions for selected
instances can be called by the 'yank' out-of-band qmp command.
Available instances can be queried by a 'query-yank' oob command.

Signed-off-by: Lukas Straub <lukasstraub2@web.de>
Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 include/qemu/yank.h |  95 ++++++++++++++++++++
 qapi/misc.json      | 106 ++++++++++++++++++++++
 util/meson.build    |   1 +
 util/yank.c         | 213 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 415 insertions(+)
 create mode 100644 include/qemu/yank.h
 create mode 100644 util/yank.c

--
2.20.1

Comments

Markus Armbruster Oct. 29, 2020, 4:36 p.m. UTC | #1
Nothing major, looks almost ready to me.

Lukas Straub <lukasstraub2@web.de> writes:

> The yank feature allows to recover from hanging qemu by "yanking"
> at various parts. Other qemu systems can register themselves and
> multiple yank functions. Then all yank functions for selected
> instances can be called by the 'yank' out-of-band qmp command.
> Available instances can be queried by a 'query-yank' oob command.
>
> Signed-off-by: Lukas Straub <lukasstraub2@web.de>
> Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
> ---
>  include/qemu/yank.h |  95 ++++++++++++++++++++
>  qapi/misc.json      | 106 ++++++++++++++++++++++
>  util/meson.build    |   1 +
>  util/yank.c         | 213 ++++++++++++++++++++++++++++++++++++++++++++

checkpatch.pl warns:

    WARNING: added, moved or deleted file(s), does MAINTAINERS need updating?

Can we find a maintainer for the two new files?

>  4 files changed, 415 insertions(+)
>  create mode 100644 include/qemu/yank.h
>  create mode 100644 util/yank.c
>
> diff --git a/include/qemu/yank.h b/include/qemu/yank.h
> new file mode 100644
> index 0000000000..89755e62af
> --- /dev/null
> +++ b/include/qemu/yank.h
> @@ -0,0 +1,95 @@
> +/*
> + * QEMU yank feature
> + *
> + * Copyright (c) Lukas Straub <lukasstraub2@web.de>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + */
> +
> +#ifndef YANK_H
> +#define YANK_H
> +
> +#include "qapi/qapi-types-misc.h"
> +
> +typedef void (YankFn)(void *opaque);
> +
> +/**
> + * yank_register_instance: Register a new instance.
> + *
> + * This registers a new instance for yanking. Must be called before any yank
> + * function is registered for this instance.
> + *
> + * This function is thread-safe.
> + *
> + * @instance: The instance.
> + * @errp: Error object.
> + */
> +void yank_register_instance(const YankInstance *instance, Error **errp);
> +
> +/**
> + * yank_unregister_instance: Unregister a instance.
> + *
> + * This unregisters a instance. Must be called only after every yank function
> + * of the instance has been unregistered.
> + *
> + * This function is thread-safe.
> + *
> + * @instance: The instance.
> + */
> +void yank_unregister_instance(const YankInstance *instance);
> +
> +/**
> + * yank_register_function: Register a yank function
> + *
> + * This registers a yank function. All limitations of qmp oob commands apply
> + * to the yank function as well. See docs/devel/qapi-code-gen.txt under
> + * "An OOB-capable command handler must satisfy the following conditions".
> + *
> + * This function is thread-safe.
> + *
> + * @instance: The instance.
> + * @func: The yank function.
> + * @opaque: Will be passed to the yank function.
> + */
> +void yank_register_function(const YankInstance *instance,
> +                            YankFn *func,
> +                            void *opaque);
> +
> +/**
> + * yank_unregister_function: Unregister a yank function
> + *
> + * This unregisters a yank function.
> + *
> + * This function is thread-safe.
> + *
> + * @instance: The instance.
> + * @func: func that was passed to yank_register_function.
> + * @opaque: opaque that was passed to yank_register_function.
> + */
> +void yank_unregister_function(const YankInstance *instance,
> +                              YankFn *func,
> +                              void *opaque);
> +
> +/**
> + * yank_generic_iochannel: Generic yank function for iochannel
> + *
> + * This is a generic yank function which will call qio_channel_shutdown on the
> + * provided QIOChannel.
> + *
> + * @opaque: QIOChannel to shutdown
> + */
> +void yank_generic_iochannel(void *opaque);
> +
> +#define BLOCKDEV_YANK_INSTANCE(the_node_name) (&(YankInstance) { \
> +        .type = YANK_INSTANCE_TYPE_BLOCKDEV, \
> +        .u.blockdev.node_name = (the_node_name) })
> +
> +#define CHARDEV_YANK_INSTANCE(the_id) (&(YankInstance) { \
> +        .type = YANK_INSTANCE_TYPE_CHARDEV, \
> +        .u.chardev.id = (the_id) })
> +
> +#define MIGRATION_YANK_INSTANCE (&(YankInstance) { \
> +        .type = YANK_INSTANCE_TYPE_MIGRATION })
> +
> +#endif
> diff --git a/qapi/misc.json b/qapi/misc.json
> index 40df513856..3b7de02a4d 100644
> --- a/qapi/misc.json
> +++ b/qapi/misc.json
> @@ -568,3 +568,109 @@
>   'data': { '*option': 'str' },
>   'returns': ['CommandLineOptionInfo'],
>   'allow-preconfig': true }
> +
> +##
> +# @YankInstanceType:
> +#
> +# An enumeration of yank instance types. See "YankInstance" for more

Please write cross-references as @YankInstance.  This gives us a chance
to turn them into links (seems not to be implemented, yet).  More of the
same below.

> +# information.
> +#
> +# Since: 5.2
> +##
> +{ 'enum': 'YankInstanceType',
> +  'data': [ 'blockdev', 'chardev', 'migration' ] }
> +
> +##
> +# @YankInstanceBlockdev:
> +#
> +# Specifies which blockdev to yank. See "YankInstance" for more information.
> +#
> +# @node-name: the blockdev's node-name

Is this really about block devices?  A node-name identifies a block
graph node, which may or may not be associated with a block device.

If I understand PATCH 2 correctly, it makes *any* block node with driver
'nbd' yankable while it has a network connection.  If that's true, then
this is definitely about block graph nodes, not about block devices.

If it is about devices, let's say "block device", not "blockdev".

If it is about nodes, let's say "block graph node".  Also rename
YankInstanceType member 'blockdev' by 'block-node', YankInstanceBlockdev
to YankInstanceBlockNode.

> +#
> +# Since: 5.2
> +##
> +{ 'struct': 'YankInstanceBlockdev',
> +  'data': { 'node-name': 'str' } }
> +
> +##
> +# @YankInstanceChardev:
> +#
> +# Specifies which chardev to yank. See "YankInstance" for more information.

"character device", not "chardev".

> +#
> +# @id: the chardev's ID
> +#
> +# Since: 5.2
> +##
> +{ 'struct': 'YankInstanceChardev',
> +  'data': { 'id': 'str' } }

This is called 'label' in @ChardevInfo.  It's also called 'id' in
@chardev-add.  Oh well, 'id' it is.

> +
> +##
> +# @YankInstance:
> +#
> +# A yank instance can be yanked with the "yank" qmp command to recover from a
> +# hanging qemu.

QEMU

> +#
> +# Currently implemented yank instances:
> +#  -nbd block device:
> +#   Yanking it will shutdown the connection to the nbd server without
> +#   attempting to reconnect.
> +#  -socket chardev:
> +#   Yanking it will shutdown the connected socket.
> +#  -migration:
> +#   Yanking it will shutdown all migration connections.

To my surprise, this is recognized as bullet list markup.  But please
put a space between the bullet and the text anyway.

Also: "shutdown" is a noun, the verb is spelled "shut down".

In my review of v8, I asked how yanking migration is related to command
migrate_cancel.  Daniel explained:

    migrate_cancel will do a shutdown() on the primary migration socket only.
    In addition it will toggle the migration state.

    Yanking will do a shutdown on all migration sockets (important for
    multifd), but won't touch migration state or any other aspect of QEMU
    code.

    Overall yanking has less potential for things to go wrong than the
    migrate_cancel method, as it doesn't try to do any kind of cleanup
    or migration.

Would it make sense to work this into the documentation?

> +#
> +# Since: 5.2
> +##
> +{ 'union': 'YankInstance',
> +  'base': { 'type': 'YankInstanceType' },
> +  'discriminator': 'type',
> +  'data': {
> +      'blockdev': 'YankInstanceBlockdev',
> +      'chardev': 'YankInstanceChardev' } }
> +
> +##
> +# @yank:
> +#
> +# Recover from hanging qemu by yanking the specified instances. See

QEMU

"Try to recover" would be more precise, I think.

> +# "YankInstance" for more information.
> +#
> +# Takes a list of @YankInstance as argument.
> +#
> +# Returns: nothing.
> +#
> +# Example:
> +#
> +# -> { "execute": "yank",
> +#      "arguments": {
> +#          "instances": [
> +#               { "type": "block-node",
> +#                 "node-name": "nbd0" }
> +#          ] } }
> +# <- { "return": {} }
> +#
> +# Since: 5.2
> +##
> +{ 'command': 'yank',
> +  'data': { 'instances': ['YankInstance'] },
> +  'allow-oob': true }
> +
> +##
> +# @query-yank:
> +#
> +# Query yank instances. See "YankInstance" for more information.
> +#
> +# Returns: list of @YankInstance
> +#
> +# Example:
> +#
> +# -> { "execute": "query-yank" }
> +# <- { "return": [
> +#          { "type": "block-node",
> +#            "node-name": "nbd0" }
> +#      ] }
> +#
> +# Since: 5.2
> +##
> +{ 'command': 'query-yank',
> +  'returns': ['YankInstance'],
> +  'allow-oob': true }
> diff --git a/util/meson.build b/util/meson.build
> index c5159ad79d..dbda9d9123 100644
> --- a/util/meson.build
> +++ b/util/meson.build
> @@ -50,6 +50,7 @@ endif
>
>  if have_system
>    util_ss.add(when: 'CONFIG_GIO', if_true: [files('dbus.c'), gio])
> +  util_ss.add(files('yank.c'))
>  endif
>
>  if have_block
> diff --git a/util/yank.c b/util/yank.c
> new file mode 100644
> index 0000000000..0b3a816706
> --- /dev/null
> +++ b/util/yank.c
> @@ -0,0 +1,213 @@
> +/*
> + * QEMU yank feature
> + *
> + * Copyright (c) Lukas Straub <lukasstraub2@web.de>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qapi/error.h"
> +#include "qemu/thread.h"
> +#include "qemu/queue.h"
> +#include "qapi/qapi-commands-misc.h"
> +#include "qapi/qapi-visit-misc.h"
> +#include "qapi/clone-visitor.h"
> +#include "io/channel.h"
> +#include "qemu/yank.h"
> +
> +struct YankFuncAndParam {
> +    YankFn *func;
> +    void *opaque;
> +    QLIST_ENTRY(YankFuncAndParam) next;
> +};
> +
> +struct YankInstanceEntry {
> +    YankInstance *instance;
> +    QLIST_HEAD(, YankFuncAndParam) yankfns;
> +    QLIST_ENTRY(YankInstanceEntry) next;
> +};
> +
> +typedef struct YankFuncAndParam YankFuncAndParam;
> +typedef struct YankInstanceEntry YankInstanceEntry;
> +
> +/*
> + * This lock protects the yank_instance_list below.

Please add something like

    * Because it's taken by OOB-capable commands, it must be "fast",
    * i.e. it may only be held for a bounded, short time.  See
    * docs/devel/qapi-code-gen.txt for additional information.

> + */
> +static QemuMutex yank_lock;
> +
> +static QLIST_HEAD(, YankInstanceEntry) yank_instance_list
> +    = QLIST_HEAD_INITIALIZER(yank_instance_list);
> +
> +static int yank_compare_instances(const YankInstance *a, const YankInstance *b)
> +{
> +    if (a->type != b->type) {
> +        return 0;
> +    }
> +
> +    switch (a->type) {
> +    case YANK_INSTANCE_TYPE_BLOCKDEV:
> +        return !strcmp(a->u.blockdev.node_name, b->u.blockdev.node_name);
> +    break;
> +
> +    case YANK_INSTANCE_TYPE_CHARDEV:
> +        return !strcmp(a->u.chardev.id, b->u.chardev.id);
> +    break;
> +
> +    case YANK_INSTANCE_TYPE_MIGRATION:
> +        return 1;
> +    break;
> +
> +    default:
> +        abort();
> +    }
> +}

Please make this function return bool.  When a comparison function
returns int, I expect it to return negative value when less, zero when
equal, and positive value when greater.

> +
> +static YankInstanceEntry *yank_find_entry(const YankInstance *instance)
> +{
> +    YankInstanceEntry *entry;
> +
> +    QLIST_FOREACH(entry, &yank_instance_list, next) {
> +        if (yank_compare_instances(entry->instance, instance)) {
> +            return entry;
> +        }
> +    }
> +    return NULL;
> +}
> +
> +void yank_register_instance(const YankInstance *instance, Error **errp)
> +{
> +    YankInstanceEntry *entry;
> +
> +    qemu_mutex_lock(&yank_lock);
> +
> +    if (yank_find_entry(instance)) {
> +        error_setg(errp, "duplicate yank instance");

How could this ever be anything but a programming error?

If it is a programming error, use assert(), just like you do in
yank_unregister_instance() below.

> +        qemu_mutex_unlock(&yank_lock);
> +        return;
> +    }
> +
> +    entry = g_slice_new(YankInstanceEntry);

Why not g_new()?  Hmm, GLib documentation says

    For newly written code it is recommended to use the new g_slice API
    instead of g_malloc() and friends, as long as objects are not
    resized during their lifetime and the object size used at allocation
    time is still available when freeing.

I see.

> +    entry->instance = QAPI_CLONE(YankInstance, instance);

You clone so the callers can build the argument on the stack.  Another
way to skin this cat: caller builds on the heap, this function takes
ownership.  But we're at v9, and your code should work just fine :)

> +    QLIST_INIT(&entry->yankfns);
> +    QLIST_INSERT_HEAD(&yank_instance_list, entry, next);
> +
> +    qemu_mutex_unlock(&yank_lock);
> +}
> +
> +void yank_unregister_instance(const YankInstance *instance)
> +{
> +    YankInstanceEntry *entry;
> +
> +    qemu_mutex_lock(&yank_lock);
> +    entry = yank_find_entry(instance);
> +    assert(entry);
> +
> +    assert(QLIST_EMPTY(&entry->yankfns));
> +    QLIST_REMOVE(entry, next);
> +    qapi_free_YankInstance(entry->instance);
> +    g_slice_free(YankInstanceEntry, entry);
> +
> +    qemu_mutex_unlock(&yank_lock);
> +}
> +
> +void yank_register_function(const YankInstance *instance,
> +                            YankFn *func,
> +                            void *opaque)
> +{
> +    YankInstanceEntry *entry;
> +    YankFuncAndParam *func_entry;
> +
> +    qemu_mutex_lock(&yank_lock);
> +    entry = yank_find_entry(instance);
> +    assert(entry);
> +
> +    func_entry = g_slice_new(YankFuncAndParam);
> +    func_entry->func = func;
> +    func_entry->opaque = opaque;
> +
> +    QLIST_INSERT_HEAD(&entry->yankfns, func_entry, next);
> +    qemu_mutex_unlock(&yank_lock);
> +}
> +
> +void yank_unregister_function(const YankInstance *instance,
> +                              YankFn *func,
> +                              void *opaque)
> +{
> +    YankInstanceEntry *entry;
> +    YankFuncAndParam *func_entry;
> +
> +    qemu_mutex_lock(&yank_lock);
> +    entry = yank_find_entry(instance);
> +    assert(entry);
> +
> +    QLIST_FOREACH(func_entry, &entry->yankfns, next) {
> +        if (func_entry->func == func && func_entry->opaque == opaque) {
> +            QLIST_REMOVE(func_entry, next);
> +            g_slice_free(YankFuncAndParam, func_entry);
> +            qemu_mutex_unlock(&yank_lock);
> +            return;
> +        }
> +    }
> +
> +    abort();
> +}
> +
> +void yank_generic_iochannel(void *opaque)
> +{
> +    QIOChannel *ioc = QIO_CHANNEL(opaque);
> +
> +    qio_channel_shutdown(ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
> +}
> +
> +void qmp_yank(YankInstanceList *instances,
> +              Error **errp)
> +{
> +    YankInstanceList *tail;
> +    YankInstanceEntry *entry;
> +    YankFuncAndParam *func_entry;
> +
> +    qemu_mutex_lock(&yank_lock);
> +    for (tail = instances; tail; tail = tail->next) {
> +        entry = yank_find_entry(tail->value);
> +        if (!entry) {
> +            error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND, "Instance not found");

Quote error.h:

 * Note: use of error classes other than ERROR_CLASS_GENERIC_ERROR is
 * strongly discouraged.

Any particular reason for ERROR_CLASS_DEVICE_NOT_FOUND?  If not, then
error_setg(), please.

> +            qemu_mutex_unlock(&yank_lock);
> +            return;
> +        }
> +    }
> +    for (tail = instances; tail; tail = tail->next) {
> +        entry = yank_find_entry(tail->value);
> +        assert(entry);
> +        QLIST_FOREACH(func_entry, &entry->yankfns, next) {
> +            func_entry->func(func_entry->opaque);
> +        }
> +    }
> +    qemu_mutex_unlock(&yank_lock);
> +}
> +
> +YankInstanceList *qmp_query_yank(Error **errp)
> +{
> +    YankInstanceEntry *entry;
> +    YankInstanceList *ret;
> +
> +    ret = NULL;
> +
> +    qemu_mutex_lock(&yank_lock);
> +    QLIST_FOREACH(entry, &yank_instance_list, next) {
> +        YankInstanceList *new_entry;
> +        new_entry = g_new0(YankInstanceList, 1);
> +        new_entry->value = QAPI_CLONE(YankInstance, entry->instance);
> +        new_entry->next = ret;
> +        ret = new_entry;

There is a pull request pending that adds QAPI_LIST_PREPEND().  If it
gets pulled before you respin, using it would be nice.

> +    }
> +    qemu_mutex_unlock(&yank_lock);

Your critical sections all look "fast" to me.  Good.

> +
> +    return ret;
> +}
> +
> +static void __attribute__((__constructor__)) yank_init(void)
> +{
> +    qemu_mutex_init(&yank_lock);
> +}
> --
> 2.20.1
Lukas Straub Oct. 30, 2020, 10:32 a.m. UTC | #2
On Thu, 29 Oct 2020 17:36:14 +0100
Markus Armbruster <armbru@redhat.com> wrote:

> Nothing major, looks almost ready to me.
> 
> Lukas Straub <lukasstraub2@web.de> writes:
> 
> > The yank feature allows to recover from hanging qemu by "yanking"
> > at various parts. Other qemu systems can register themselves and
> > multiple yank functions. Then all yank functions for selected
> > instances can be called by the 'yank' out-of-band qmp command.
> > Available instances can be queried by a 'query-yank' oob command.
> >
> > Signed-off-by: Lukas Straub <lukasstraub2@web.de>
> > Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
> > ---
> >  include/qemu/yank.h |  95 ++++++++++++++++++++
> >  qapi/misc.json      | 106 ++++++++++++++++++++++
> >  util/meson.build    |   1 +
> >  util/yank.c         | 213 ++++++++++++++++++++++++++++++++++++++++++++  
> 
> checkpatch.pl warns:
> 
>     WARNING: added, moved or deleted file(s), does MAINTAINERS need updating?
> 
> Can we find a maintainer for the two new files?

Yes, I'm maintaining this for now, see patch 7.

> >  4 files changed, 415 insertions(+)
> >  create mode 100644 include/qemu/yank.h
> >  create mode 100644 util/yank.c
> >
> > diff --git a/include/qemu/yank.h b/include/qemu/yank.h
> > new file mode 100644
> > index 0000000000..89755e62af
> > --- /dev/null
> > +++ b/include/qemu/yank.h
> > @@ -0,0 +1,95 @@
> > +/*
> > + * QEMU yank feature
> > + *
> > + * Copyright (c) Lukas Straub <lukasstraub2@web.de>
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> > + * See the COPYING file in the top-level directory.
> > + */
> > +
> > +#ifndef YANK_H
> > +#define YANK_H
> > +
> > +#include "qapi/qapi-types-misc.h"
> > +
> > +typedef void (YankFn)(void *opaque);
> > +
> > +/**
> > + * yank_register_instance: Register a new instance.
> > + *
> > + * This registers a new instance for yanking. Must be called before any yank
> > + * function is registered for this instance.
> > + *
> > + * This function is thread-safe.
> > + *
> > + * @instance: The instance.
> > + * @errp: Error object.
> > + */
> > +void yank_register_instance(const YankInstance *instance, Error **errp);
> > +
> > +/**
> > + * yank_unregister_instance: Unregister a instance.
> > + *
> > + * This unregisters a instance. Must be called only after every yank function
> > + * of the instance has been unregistered.
> > + *
> > + * This function is thread-safe.
> > + *
> > + * @instance: The instance.
> > + */
> > +void yank_unregister_instance(const YankInstance *instance);
> > +
> > +/**
> > + * yank_register_function: Register a yank function
> > + *
> > + * This registers a yank function. All limitations of qmp oob commands apply
> > + * to the yank function as well. See docs/devel/qapi-code-gen.txt under
> > + * "An OOB-capable command handler must satisfy the following conditions".
> > + *
> > + * This function is thread-safe.
> > + *
> > + * @instance: The instance.
> > + * @func: The yank function.
> > + * @opaque: Will be passed to the yank function.
> > + */
> > +void yank_register_function(const YankInstance *instance,
> > +                            YankFn *func,
> > +                            void *opaque);
> > +
> > +/**
> > + * yank_unregister_function: Unregister a yank function
> > + *
> > + * This unregisters a yank function.
> > + *
> > + * This function is thread-safe.
> > + *
> > + * @instance: The instance.
> > + * @func: func that was passed to yank_register_function.
> > + * @opaque: opaque that was passed to yank_register_function.
> > + */
> > +void yank_unregister_function(const YankInstance *instance,
> > +                              YankFn *func,
> > +                              void *opaque);
> > +
> > +/**
> > + * yank_generic_iochannel: Generic yank function for iochannel
> > + *
> > + * This is a generic yank function which will call qio_channel_shutdown on the
> > + * provided QIOChannel.
> > + *
> > + * @opaque: QIOChannel to shutdown
> > + */
> > +void yank_generic_iochannel(void *opaque);
> > +
> > +#define BLOCKDEV_YANK_INSTANCE(the_node_name) (&(YankInstance) { \
> > +        .type = YANK_INSTANCE_TYPE_BLOCKDEV, \
> > +        .u.blockdev.node_name = (the_node_name) })
> > +
> > +#define CHARDEV_YANK_INSTANCE(the_id) (&(YankInstance) { \
> > +        .type = YANK_INSTANCE_TYPE_CHARDEV, \
> > +        .u.chardev.id = (the_id) })
> > +
> > +#define MIGRATION_YANK_INSTANCE (&(YankInstance) { \
> > +        .type = YANK_INSTANCE_TYPE_MIGRATION })
> > +
> > +#endif
> > diff --git a/qapi/misc.json b/qapi/misc.json
> > index 40df513856..3b7de02a4d 100644
> > --- a/qapi/misc.json
> > +++ b/qapi/misc.json
> > @@ -568,3 +568,109 @@
> >   'data': { '*option': 'str' },
> >   'returns': ['CommandLineOptionInfo'],
> >   'allow-preconfig': true }
> > +
> > +##
> > +# @YankInstanceType:
> > +#
> > +# An enumeration of yank instance types. See "YankInstance" for more  
> 
> Please write cross-references as @YankInstance.  This gives us a chance
> to turn them into links (seems not to be implemented, yet).  More of the
> same below.

Changed for the next version.

> > +# information.
> > +#
> > +# Since: 5.2
> > +##
> > +{ 'enum': 'YankInstanceType',
> > +  'data': [ 'blockdev', 'chardev', 'migration' ] }
> > +
> > +##
> > +# @YankInstanceBlockdev:
> > +#
> > +# Specifies which blockdev to yank. See "YankInstance" for more information.
> > +#
> > +# @node-name: the blockdev's node-name  
> 
> Is this really about block devices?  A node-name identifies a block
> graph node, which may or may not be associated with a block device.
> 
> If I understand PATCH 2 correctly, it makes *any* block node with driver
> 'nbd' yankable while it has a network connection.  If that's true, then
> this is definitely about block graph nodes, not about block devices.

Indeed, changed for the next version.

> If it is about devices, let's say "block device", not "blockdev".
> 
> If it is about nodes, let's say "block graph node".  Also rename
> YankInstanceType member 'blockdev' by 'block-node', YankInstanceBlockdev
> to YankInstanceBlockNode.
> 
> > +#
> > +# Since: 5.2
> > +##
> > +{ 'struct': 'YankInstanceBlockdev',
> > +  'data': { 'node-name': 'str' } }
> > +
> > +##
> > +# @YankInstanceChardev:
> > +#
> > +# Specifies which chardev to yank. See "YankInstance" for more information.  
> 
> "character device", not "chardev".

Changed for the next version.

> > +#
> > +# @id: the chardev's ID
> > +#
> > +# Since: 5.2
> > +##
> > +{ 'struct': 'YankInstanceChardev',
> > +  'data': { 'id': 'str' } }  
> 
> This is called 'label' in @ChardevInfo.  It's also called 'id' in
> @chardev-add.  Oh well, 'id' it is.
> 
> > +
> > +##
> > +# @YankInstance:
> > +#
> > +# A yank instance can be yanked with the "yank" qmp command to recover from a
> > +# hanging qemu.  
> 
> QEMU
> 
> > +#
> > +# Currently implemented yank instances:
> > +#  -nbd block device:
> > +#   Yanking it will shutdown the connection to the nbd server without
> > +#   attempting to reconnect.
> > +#  -socket chardev:
> > +#   Yanking it will shutdown the connected socket.
> > +#  -migration:
> > +#   Yanking it will shutdown all migration connections.  
> 
> To my surprise, this is recognized as bullet list markup.  But please
> put a space between the bullet and the text anyway.
> 
> Also: "shutdown" is a noun, the verb is spelled "shut down".

Both changed for the next version.

> In my review of v8, I asked how yanking migration is related to command
> migrate_cancel.  Daniel explained:
> 
>     migrate_cancel will do a shutdown() on the primary migration socket only.
>     In addition it will toggle the migration state.
> 
>     Yanking will do a shutdown on all migration sockets (important for
>     multifd), but won't touch migration state or any other aspect of QEMU
>     code.
> 
>     Overall yanking has less potential for things to go wrong than the
>     migrate_cancel method, as it doesn't try to do any kind of cleanup
>     or migration.
> 
> Would it make sense to work this into the documentation?

How about this?

  - migration:
    Yanking it will shut down all migration connections. Unlike
    @migrate_cancel, it will not notify the migration process,
    so migration will go into @failed state, instead of @cancelled
    state.

> > +#
> > +# Since: 5.2
> > +##
> > +{ 'union': 'YankInstance',
> > +  'base': { 'type': 'YankInstanceType' },
> > +  'discriminator': 'type',
> > +  'data': {
> > +      'blockdev': 'YankInstanceBlockdev',
> > +      'chardev': 'YankInstanceChardev' } }
> > +
> > +##
> > +# @yank:
> > +#
> > +# Recover from hanging qemu by yanking the specified instances. See  
> 
> QEMU
> 
> "Try to recover" would be more precise, I think.

Changed for the next version.

> > +# "YankInstance" for more information.
> > +#
> > +# Takes a list of @YankInstance as argument.
> > +#
> > +# Returns: nothing.
> > +#
> > +# Example:
> > +#
> > +# -> { "execute": "yank",
> > +#      "arguments": {
> > +#          "instances": [
> > +#               { "type": "block-node",
> > +#                 "node-name": "nbd0" }
> > +#          ] } }
> > +# <- { "return": {} }
> > +#
> > +# Since: 5.2
> > +##
> > +{ 'command': 'yank',
> > +  'data': { 'instances': ['YankInstance'] },
> > +  'allow-oob': true }
> > +
> > +##
> > +# @query-yank:
> > +#
> > +# Query yank instances. See "YankInstance" for more information.
> > +#
> > +# Returns: list of @YankInstance
> > +#
> > +# Example:
> > +#
> > +# -> { "execute": "query-yank" }
> > +# <- { "return": [
> > +#          { "type": "block-node",
> > +#            "node-name": "nbd0" }
> > +#      ] }
> > +#
> > +# Since: 5.2
> > +##
> > +{ 'command': 'query-yank',
> > +  'returns': ['YankInstance'],
> > +  'allow-oob': true }
> > diff --git a/util/meson.build b/util/meson.build
> > index c5159ad79d..dbda9d9123 100644
> > --- a/util/meson.build
> > +++ b/util/meson.build
> > @@ -50,6 +50,7 @@ endif
> >
> >  if have_system
> >    util_ss.add(when: 'CONFIG_GIO', if_true: [files('dbus.c'), gio])
> > +  util_ss.add(files('yank.c'))
> >  endif
> >
> >  if have_block
> > diff --git a/util/yank.c b/util/yank.c
> > new file mode 100644
> > index 0000000000..0b3a816706
> > --- /dev/null
> > +++ b/util/yank.c
> > @@ -0,0 +1,213 @@
> > +/*
> > + * QEMU yank feature
> > + *
> > + * Copyright (c) Lukas Straub <lukasstraub2@web.de>
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> > + * See the COPYING file in the top-level directory.
> > + */
> > +
> > +#include "qemu/osdep.h"
> > +#include "qapi/error.h"
> > +#include "qemu/thread.h"
> > +#include "qemu/queue.h"
> > +#include "qapi/qapi-commands-misc.h"
> > +#include "qapi/qapi-visit-misc.h"
> > +#include "qapi/clone-visitor.h"
> > +#include "io/channel.h"
> > +#include "qemu/yank.h"
> > +
> > +struct YankFuncAndParam {
> > +    YankFn *func;
> > +    void *opaque;
> > +    QLIST_ENTRY(YankFuncAndParam) next;
> > +};
> > +
> > +struct YankInstanceEntry {
> > +    YankInstance *instance;
> > +    QLIST_HEAD(, YankFuncAndParam) yankfns;
> > +    QLIST_ENTRY(YankInstanceEntry) next;
> > +};
> > +
> > +typedef struct YankFuncAndParam YankFuncAndParam;
> > +typedef struct YankInstanceEntry YankInstanceEntry;
> > +
> > +/*
> > + * This lock protects the yank_instance_list below.  
> 
> Please add something like
> 
>     * Because it's taken by OOB-capable commands, it must be "fast",
>     * i.e. it may only be held for a bounded, short time.  See
>     * docs/devel/qapi-code-gen.txt for additional information.

Changed for the next version.

> > + */
> > +static QemuMutex yank_lock;
> > +
> > +static QLIST_HEAD(, YankInstanceEntry) yank_instance_list
> > +    = QLIST_HEAD_INITIALIZER(yank_instance_list);
> > +
> > +static int yank_compare_instances(const YankInstance *a, const YankInstance *b)
> > +{
> > +    if (a->type != b->type) {
> > +        return 0;
> > +    }
> > +
> > +    switch (a->type) {
> > +    case YANK_INSTANCE_TYPE_BLOCKDEV:
> > +        return !strcmp(a->u.blockdev.node_name, b->u.blockdev.node_name);
> > +    break;
> > +
> > +    case YANK_INSTANCE_TYPE_CHARDEV:
> > +        return !strcmp(a->u.chardev.id, b->u.chardev.id);
> > +    break;
> > +
> > +    case YANK_INSTANCE_TYPE_MIGRATION:
> > +        return 1;
> > +    break;
> > +
> > +    default:
> > +        abort();
> > +    }
> > +}  
> 
> Please make this function return bool.  When a comparison function
> returns int, I expect it to return negative value when less, zero when
> equal, and positive value when greater.

Changed for the next version.

> > +
> > +static YankInstanceEntry *yank_find_entry(const YankInstance *instance)
> > +{
> > +    YankInstanceEntry *entry;
> > +
> > +    QLIST_FOREACH(entry, &yank_instance_list, next) {
> > +        if (yank_compare_instances(entry->instance, instance)) {
> > +            return entry;
> > +        }
> > +    }
> > +    return NULL;
> > +}
> > +
> > +void yank_register_instance(const YankInstance *instance, Error **errp)
> > +{
> > +    YankInstanceEntry *entry;
> > +
> > +    qemu_mutex_lock(&yank_lock);
> > +
> > +    if (yank_find_entry(instance)) {
> > +        error_setg(errp, "duplicate yank instance");  
> 
> How could this ever be anything but a programming error?
> 
> If it is a programming error, use assert(), just like you do in
> yank_unregister_instance() below.

The chardev code initializes the chardev-socket which in turn initiates the
connection, before checking for duplicate id. So we'll have to catch it here.

> 
> > +        qemu_mutex_unlock(&yank_lock);
> > +        return;
> > +    }
> > +
> > +    entry = g_slice_new(YankInstanceEntry);  
> 
> Why not g_new()?  Hmm, GLib documentation says
> 
>     For newly written code it is recommended to use the new g_slice API
>     instead of g_malloc() and friends, as long as objects are not
>     resized during their lifetime and the object size used at allocation
>     time is still available when freeing.
> 
> I see.
> 
> > +    entry->instance = QAPI_CLONE(YankInstance, instance);  
> 
> You clone so the callers can build the argument on the stack.  Another
> way to skin this cat: caller builds on the heap, this function takes
> ownership.  But we're at v9, and your code should work just fine :)

This makes the code simpler in the later patches.

> > +    QLIST_INIT(&entry->yankfns);
> > +    QLIST_INSERT_HEAD(&yank_instance_list, entry, next);
> > +
> > +    qemu_mutex_unlock(&yank_lock);
> > +}
> > +
> > +void yank_unregister_instance(const YankInstance *instance)
> > +{
> > +    YankInstanceEntry *entry;
> > +
> > +    qemu_mutex_lock(&yank_lock);
> > +    entry = yank_find_entry(instance);
> > +    assert(entry);
> > +
> > +    assert(QLIST_EMPTY(&entry->yankfns));
> > +    QLIST_REMOVE(entry, next);
> > +    qapi_free_YankInstance(entry->instance);
> > +    g_slice_free(YankInstanceEntry, entry);
> > +
> > +    qemu_mutex_unlock(&yank_lock);
> > +}
> > +
> > +void yank_register_function(const YankInstance *instance,
> > +                            YankFn *func,
> > +                            void *opaque)
> > +{
> > +    YankInstanceEntry *entry;
> > +    YankFuncAndParam *func_entry;
> > +
> > +    qemu_mutex_lock(&yank_lock);
> > +    entry = yank_find_entry(instance);
> > +    assert(entry);
> > +
> > +    func_entry = g_slice_new(YankFuncAndParam);
> > +    func_entry->func = func;
> > +    func_entry->opaque = opaque;
> > +
> > +    QLIST_INSERT_HEAD(&entry->yankfns, func_entry, next);
> > +    qemu_mutex_unlock(&yank_lock);
> > +}
> > +
> > +void yank_unregister_function(const YankInstance *instance,
> > +                              YankFn *func,
> > +                              void *opaque)
> > +{
> > +    YankInstanceEntry *entry;
> > +    YankFuncAndParam *func_entry;
> > +
> > +    qemu_mutex_lock(&yank_lock);
> > +    entry = yank_find_entry(instance);
> > +    assert(entry);
> > +
> > +    QLIST_FOREACH(func_entry, &entry->yankfns, next) {
> > +        if (func_entry->func == func && func_entry->opaque == opaque) {
> > +            QLIST_REMOVE(func_entry, next);
> > +            g_slice_free(YankFuncAndParam, func_entry);
> > +            qemu_mutex_unlock(&yank_lock);
> > +            return;
> > +        }
> > +    }
> > +
> > +    abort();
> > +}
> > +
> > +void yank_generic_iochannel(void *opaque)
> > +{
> > +    QIOChannel *ioc = QIO_CHANNEL(opaque);
> > +
> > +    qio_channel_shutdown(ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
> > +}
> > +
> > +void qmp_yank(YankInstanceList *instances,
> > +              Error **errp)
> > +{
> > +    YankInstanceList *tail;
> > +    YankInstanceEntry *entry;
> > +    YankFuncAndParam *func_entry;
> > +
> > +    qemu_mutex_lock(&yank_lock);
> > +    for (tail = instances; tail; tail = tail->next) {
> > +        entry = yank_find_entry(tail->value);
> > +        if (!entry) {
> > +            error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND, "Instance not found");  
> 
> Quote error.h:
> 
>  * Note: use of error classes other than ERROR_CLASS_GENERIC_ERROR is
>  * strongly discouraged.
> 
> Any particular reason for ERROR_CLASS_DEVICE_NOT_FOUND?  If not, then
> error_setg(), please.

There may be a time-to-check to time-to-use race condition here. Suppose
the management application (via QMP) calls 'blockev-del nbd0', then QEMU
hangs, so after a timeout it calls 'yank nbd0' while shortly before that
QEMU unhangs for some reason and removes the blockdev. Then yank returns
this error.

QMP errors should not be parsed except for the error class, so the the
management application can only differentiate between this (ignorable)
error and other (possibly fatal) ones by parsing the error class.

> 
> > +            qemu_mutex_unlock(&yank_lock);
> > +            return;
> > +        }
> > +    }
> > +    for (tail = instances; tail; tail = tail->next) {
> > +        entry = yank_find_entry(tail->value);
> > +        assert(entry);
> > +        QLIST_FOREACH(func_entry, &entry->yankfns, next) {
> > +            func_entry->func(func_entry->opaque);
> > +        }
> > +    }
> > +    qemu_mutex_unlock(&yank_lock);
> > +}
> > +
> > +YankInstanceList *qmp_query_yank(Error **errp)
> > +{
> > +    YankInstanceEntry *entry;
> > +    YankInstanceList *ret;
> > +
> > +    ret = NULL;
> > +
> > +    qemu_mutex_lock(&yank_lock);
> > +    QLIST_FOREACH(entry, &yank_instance_list, next) {
> > +        YankInstanceList *new_entry;
> > +        new_entry = g_new0(YankInstanceList, 1);
> > +        new_entry->value = QAPI_CLONE(YankInstance, entry->instance);
> > +        new_entry->next = ret;
> > +        ret = new_entry;  
> 
> There is a pull request pending that adds QAPI_LIST_PREPEND().  If it
> gets pulled before you respin, using it would be nice.
> 
> > +    }
> > +    qemu_mutex_unlock(&yank_lock);  
> 
> Your critical sections all look "fast" to me.  Good.
> 
> > +
> > +    return ret;
> > +}
> > +
> > +static void __attribute__((__constructor__)) yank_init(void)
> > +{
> > +    qemu_mutex_init(&yank_lock);
> > +}
> > --
> > 2.20.1  
>
Markus Armbruster Oct. 30, 2020, 2:02 p.m. UTC | #3
Lukas Straub <lukasstraub2@web.de> writes:

> On Thu, 29 Oct 2020 17:36:14 +0100
> Markus Armbruster <armbru@redhat.com> wrote:
>
>> Nothing major, looks almost ready to me.
>> 
>> Lukas Straub <lukasstraub2@web.de> writes:
>> 
>> > The yank feature allows to recover from hanging qemu by "yanking"
>> > at various parts. Other qemu systems can register themselves and
>> > multiple yank functions. Then all yank functions for selected
>> > instances can be called by the 'yank' out-of-band qmp command.
>> > Available instances can be queried by a 'query-yank' oob command.
>> >
>> > Signed-off-by: Lukas Straub <lukasstraub2@web.de>
>> > Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
>> > ---
>> >  include/qemu/yank.h |  95 ++++++++++++++++++++
>> >  qapi/misc.json      | 106 ++++++++++++++++++++++
>> >  util/meson.build    |   1 +
>> >  util/yank.c         | 213 ++++++++++++++++++++++++++++++++++++++++++++  
>> 
>> checkpatch.pl warns:
>> 
>>     WARNING: added, moved or deleted file(s), does MAINTAINERS need updating?
>> 
>> Can we find a maintainer for the two new files?
>
> Yes, I'm maintaining this for now, see patch 7.

Thanks!  Would it make sense to add the yank stuff to a new QAPI module
yank.json instead of misc.jaon, so the new MAINTAINERS stanza can cover
it?

[...]
>> > diff --git a/qapi/misc.json b/qapi/misc.json
>> > index 40df513856..3b7de02a4d 100644
>> > --- a/qapi/misc.json
>> > +++ b/qapi/misc.json
[...]
>> > +##
>> > +# @YankInstance:
>> > +#
>> > +# A yank instance can be yanked with the "yank" qmp command to recover from a
>> > +# hanging qemu.  
>> 
>> QEMU
>> 
>> > +#
>> > +# Currently implemented yank instances:
>> > +#  -nbd block device:
>> > +#   Yanking it will shutdown the connection to the nbd server without
>> > +#   attempting to reconnect.
>> > +#  -socket chardev:
>> > +#   Yanking it will shutdown the connected socket.
>> > +#  -migration:
>> > +#   Yanking it will shutdown all migration connections.  
>> 
>> To my surprise, this is recognized as bullet list markup.  But please
>> put a space between the bullet and the text anyway.
>> 
>> Also: "shutdown" is a noun, the verb is spelled "shut down".
>
> Both changed for the next version.
>
>> In my review of v8, I asked how yanking migration is related to command
>> migrate_cancel.  Daniel explained:
>> 
>>     migrate_cancel will do a shutdown() on the primary migration socket only.
>>     In addition it will toggle the migration state.
>> 
>>     Yanking will do a shutdown on all migration sockets (important for
>>     multifd), but won't touch migration state or any other aspect of QEMU
>>     code.
>> 
>>     Overall yanking has less potential for things to go wrong than the
>>     migrate_cancel method, as it doesn't try to do any kind of cleanup
>>     or migration.
>> 
>> Would it make sense to work this into the documentation?
>
> How about this?
>
>   - migration:
>     Yanking it will shut down all migration connections. Unlike
>     @migrate_cancel, it will not notify the migration process,
>     so migration will go into @failed state, instead of @cancelled
>     state.

Works for me.  Advice on when to use it rather than migrate_cancel would
be nice, though.

>> > +#
>> > +# Since: 5.2
>> > +##
>> > +{ 'union': 'YankInstance',
>> > +  'base': { 'type': 'YankInstanceType' },
>> > +  'discriminator': 'type',
>> > +  'data': {
>> > +      'blockdev': 'YankInstanceBlockdev',
>> > +      'chardev': 'YankInstanceChardev' } }
>> > +
>> > +##
>> > +# @yank:
>> > +#
>> > +# Recover from hanging qemu by yanking the specified instances. See  
>> 
>> QEMU
>> 
>> "Try to recover" would be more precise, I think.
>
> Changed for the next version.
>
>> > +# "YankInstance" for more information.
>> > +#
>> > +# Takes a list of @YankInstance as argument.
>> > +#
>> > +# Returns: nothing.
>> > +#
>> > +# Example:
>> > +#
>> > +# -> { "execute": "yank",
>> > +#      "arguments": {
>> > +#          "instances": [
>> > +#               { "type": "block-node",
>> > +#                 "node-name": "nbd0" }
>> > +#          ] } }
>> > +# <- { "return": {} }
>> > +#
>> > +# Since: 5.2
>> > +##
>> > +{ 'command': 'yank',
>> > +  'data': { 'instances': ['YankInstance'] },
>> > +  'allow-oob': true }
>> > +
[...]
>> > diff --git a/util/yank.c b/util/yank.c
>> > new file mode 100644
>> > index 0000000000..0b3a816706
>> > --- /dev/null
>> > +++ b/util/yank.c
[...]
>> > +void qmp_yank(YankInstanceList *instances,
>> > +              Error **errp)
>> > +{
>> > +    YankInstanceList *tail;
>> > +    YankInstanceEntry *entry;
>> > +    YankFuncAndParam *func_entry;
>> > +
>> > +    qemu_mutex_lock(&yank_lock);
>> > +    for (tail = instances; tail; tail = tail->next) {
>> > +        entry = yank_find_entry(tail->value);
>> > +        if (!entry) {
>> > +            error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND, "Instance not found");  
>> 
>> Quote error.h:
>> 
>>  * Note: use of error classes other than ERROR_CLASS_GENERIC_ERROR is
>>  * strongly discouraged.
>> 
>> Any particular reason for ERROR_CLASS_DEVICE_NOT_FOUND?  If not, then
>> error_setg(), please.
>
> There may be a time-to-check to time-to-use race condition here. Suppose
> the management application (via QMP) calls 'blockev-del nbd0', then QEMU
> hangs, so after a timeout it calls 'yank nbd0' while shortly before that
> QEMU unhangs for some reason and removes the blockdev. Then yank returns
> this error.
>
> QMP errors should not be parsed except for the error class, so the the
> management application can only differentiate between this (ignorable)
> error and other (possibly fatal) ones by parsing the error class.

I see.

DeviceNotFound doesn't really fit, but none of the other error classes
is any better.

I think the line

      # Returns: nothing.

in the QAPI schema (quoted above) should be expanded to something like


      # Returns: - Nothing on success
                 - If the YankInstance doesn't exist, DeviceNotFound

>> > +            qemu_mutex_unlock(&yank_lock);
>> > +            return;
>> > +        }
>> > +    }
>> > +    for (tail = instances; tail; tail = tail->next) {
>> > +        entry = yank_find_entry(tail->value);
>> > +        assert(entry);
>> > +        QLIST_FOREACH(func_entry, &entry->yankfns, next) {
>> > +            func_entry->func(func_entry->opaque);
>> > +        }
>> > +    }
>> > +    qemu_mutex_unlock(&yank_lock);
>> > +}
[...]
Lukas Straub Oct. 30, 2020, 3:19 p.m. UTC | #4
On Fri, 30 Oct 2020 15:02:09 +0100
Markus Armbruster <armbru@redhat.com> wrote:

> Lukas Straub <lukasstraub2@web.de> writes:
> 
> > On Thu, 29 Oct 2020 17:36:14 +0100
> > Markus Armbruster <armbru@redhat.com> wrote:
> >  
> >> Nothing major, looks almost ready to me.
> >> 
> >> Lukas Straub <lukasstraub2@web.de> writes:
> >>   
> >> > The yank feature allows to recover from hanging qemu by "yanking"
> >> > at various parts. Other qemu systems can register themselves and
> >> > multiple yank functions. Then all yank functions for selected
> >> > instances can be called by the 'yank' out-of-band qmp command.
> >> > Available instances can be queried by a 'query-yank' oob command.
> >> >
> >> > Signed-off-by: Lukas Straub <lukasstraub2@web.de>
> >> > Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
> >> > ---
> >> >  include/qemu/yank.h |  95 ++++++++++++++++++++
> >> >  qapi/misc.json      | 106 ++++++++++++++++++++++
> >> >  util/meson.build    |   1 +
> >> >  util/yank.c         | 213 ++++++++++++++++++++++++++++++++++++++++++++    
> >> 
> >> checkpatch.pl warns:
> >> 
> >>     WARNING: added, moved or deleted file(s), does MAINTAINERS need updating?
> >> 
> >> Can we find a maintainer for the two new files?  
> >
> > Yes, I'm maintaining this for now, see patch 7.  
> 
> Thanks!  Would it make sense to add the yank stuff to a new QAPI module
> yank.json instead of misc.jaon, so the new MAINTAINERS stanza can cover
> it?

Yes, makes sense. Changed for the next version.

> [...]
> >> > diff --git a/qapi/misc.json b/qapi/misc.json
> >> > index 40df513856..3b7de02a4d 100644
> >> > --- a/qapi/misc.json
> >> > +++ b/qapi/misc.json  
> [...]
> >> > +##
> >> > +# @YankInstance:
> >> > +#
> >> > +# A yank instance can be yanked with the "yank" qmp command to recover from a
> >> > +# hanging qemu.    
> >> 
> >> QEMU
> >>   
> >> > +#
> >> > +# Currently implemented yank instances:
> >> > +#  -nbd block device:
> >> > +#   Yanking it will shutdown the connection to the nbd server without
> >> > +#   attempting to reconnect.
> >> > +#  -socket chardev:
> >> > +#   Yanking it will shutdown the connected socket.
> >> > +#  -migration:
> >> > +#   Yanking it will shutdown all migration connections.    
> >> 
> >> To my surprise, this is recognized as bullet list markup.  But please
> >> put a space between the bullet and the text anyway.
> >> 
> >> Also: "shutdown" is a noun, the verb is spelled "shut down".  
> >
> > Both changed for the next version.
> >  
> >> In my review of v8, I asked how yanking migration is related to command
> >> migrate_cancel.  Daniel explained:
> >> 
> >>     migrate_cancel will do a shutdown() on the primary migration socket only.
> >>     In addition it will toggle the migration state.
> >> 
> >>     Yanking will do a shutdown on all migration sockets (important for
> >>     multifd), but won't touch migration state or any other aspect of QEMU
> >>     code.
> >> 
> >>     Overall yanking has less potential for things to go wrong than the
> >>     migrate_cancel method, as it doesn't try to do any kind of cleanup
> >>     or migration.
> >> 
> >> Would it make sense to work this into the documentation?  
> >
> > How about this?
> >
> >   - migration:
> >     Yanking it will shut down all migration connections. Unlike
> >     @migrate_cancel, it will not notify the migration process,
> >     so migration will go into @failed state, instead of @cancelled
> >     state.  
> 
> Works for me.  Advice on when to use it rather than migrate_cancel would
> be nice, though.

Ok, Changed for the next version.

> >> > +#
> >> > +# Since: 5.2
> >> > +##
> >> > +{ 'union': 'YankInstance',
> >> > +  'base': { 'type': 'YankInstanceType' },
> >> > +  'discriminator': 'type',
> >> > +  'data': {
> >> > +      'blockdev': 'YankInstanceBlockdev',
> >> > +      'chardev': 'YankInstanceChardev' } }
> >> > +
> >> > +##
> >> > +# @yank:
> >> > +#
> >> > +# Recover from hanging qemu by yanking the specified instances. See    
> >> 
> >> QEMU
> >> 
> >> "Try to recover" would be more precise, I think.  
> >
> > Changed for the next version.
> >  
> >> > +# "YankInstance" for more information.
> >> > +#
> >> > +# Takes a list of @YankInstance as argument.
> >> > +#
> >> > +# Returns: nothing.
> >> > +#
> >> > +# Example:
> >> > +#
> >> > +# -> { "execute": "yank",
> >> > +#      "arguments": {
> >> > +#          "instances": [
> >> > +#               { "type": "block-node",
> >> > +#                 "node-name": "nbd0" }
> >> > +#          ] } }
> >> > +# <- { "return": {} }
> >> > +#
> >> > +# Since: 5.2
> >> > +##
> >> > +{ 'command': 'yank',
> >> > +  'data': { 'instances': ['YankInstance'] },
> >> > +  'allow-oob': true }
> >> > +  
> [...]
> >> > diff --git a/util/yank.c b/util/yank.c
> >> > new file mode 100644
> >> > index 0000000000..0b3a816706
> >> > --- /dev/null
> >> > +++ b/util/yank.c  
> [...]
> >> > +void qmp_yank(YankInstanceList *instances,
> >> > +              Error **errp)
> >> > +{
> >> > +    YankInstanceList *tail;
> >> > +    YankInstanceEntry *entry;
> >> > +    YankFuncAndParam *func_entry;
> >> > +
> >> > +    qemu_mutex_lock(&yank_lock);
> >> > +    for (tail = instances; tail; tail = tail->next) {
> >> > +        entry = yank_find_entry(tail->value);
> >> > +        if (!entry) {
> >> > +            error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND, "Instance not found");    
> >> 
> >> Quote error.h:
> >> 
> >>  * Note: use of error classes other than ERROR_CLASS_GENERIC_ERROR is
> >>  * strongly discouraged.
> >> 
> >> Any particular reason for ERROR_CLASS_DEVICE_NOT_FOUND?  If not, then
> >> error_setg(), please.  
> >
> > There may be a time-to-check to time-to-use race condition here. Suppose
> > the management application (via QMP) calls 'blockev-del nbd0', then QEMU
> > hangs, so after a timeout it calls 'yank nbd0' while shortly before that
> > QEMU unhangs for some reason and removes the blockdev. Then yank returns
> > this error.
> >
> > QMP errors should not be parsed except for the error class, so the the
> > management application can only differentiate between this (ignorable)
> > error and other (possibly fatal) ones by parsing the error class.  
> 
> I see.
> 
> DeviceNotFound doesn't really fit, but none of the other error classes
> is any better.
> 
> I think the line
> 
>       # Returns: nothing.
> 
> in the QAPI schema (quoted above) should be expanded to something like
> 
> 
>       # Returns: - Nothing on success
>                  - If the YankInstance doesn't exist, DeviceNotFound

Changed for the next version.

> >> > +            qemu_mutex_unlock(&yank_lock);
> >> > +            return;
> >> > +        }
> >> > +    }
> >> > +    for (tail = instances; tail; tail = tail->next) {
> >> > +        entry = yank_find_entry(tail->value);
> >> > +        assert(entry);
> >> > +        QLIST_FOREACH(func_entry, &entry->yankfns, next) {
> >> > +            func_entry->func(func_entry->opaque);
> >> > +        }
> >> > +    }
> >> > +    qemu_mutex_unlock(&yank_lock);
> >> > +}  
> [...]
> 



--
diff mbox series

Patch

diff --git a/include/qemu/yank.h b/include/qemu/yank.h
new file mode 100644
index 0000000000..89755e62af
--- /dev/null
+++ b/include/qemu/yank.h
@@ -0,0 +1,95 @@ 
+/*
+ * QEMU yank feature
+ *
+ * Copyright (c) Lukas Straub <lukasstraub2@web.de>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef YANK_H
+#define YANK_H
+
+#include "qapi/qapi-types-misc.h"
+
+typedef void (YankFn)(void *opaque);
+
+/**
+ * yank_register_instance: Register a new instance.
+ *
+ * This registers a new instance for yanking. Must be called before any yank
+ * function is registered for this instance.
+ *
+ * This function is thread-safe.
+ *
+ * @instance: The instance.
+ * @errp: Error object.
+ */
+void yank_register_instance(const YankInstance *instance, Error **errp);
+
+/**
+ * yank_unregister_instance: Unregister a instance.
+ *
+ * This unregisters a instance. Must be called only after every yank function
+ * of the instance has been unregistered.
+ *
+ * This function is thread-safe.
+ *
+ * @instance: The instance.
+ */
+void yank_unregister_instance(const YankInstance *instance);
+
+/**
+ * yank_register_function: Register a yank function
+ *
+ * This registers a yank function. All limitations of qmp oob commands apply
+ * to the yank function as well. See docs/devel/qapi-code-gen.txt under
+ * "An OOB-capable command handler must satisfy the following conditions".
+ *
+ * This function is thread-safe.
+ *
+ * @instance: The instance.
+ * @func: The yank function.
+ * @opaque: Will be passed to the yank function.
+ */
+void yank_register_function(const YankInstance *instance,
+                            YankFn *func,
+                            void *opaque);
+
+/**
+ * yank_unregister_function: Unregister a yank function
+ *
+ * This unregisters a yank function.
+ *
+ * This function is thread-safe.
+ *
+ * @instance: The instance.
+ * @func: func that was passed to yank_register_function.
+ * @opaque: opaque that was passed to yank_register_function.
+ */
+void yank_unregister_function(const YankInstance *instance,
+                              YankFn *func,
+                              void *opaque);
+
+/**
+ * yank_generic_iochannel: Generic yank function for iochannel
+ *
+ * This is a generic yank function which will call qio_channel_shutdown on the
+ * provided QIOChannel.
+ *
+ * @opaque: QIOChannel to shutdown
+ */
+void yank_generic_iochannel(void *opaque);
+
+#define BLOCKDEV_YANK_INSTANCE(the_node_name) (&(YankInstance) { \
+        .type = YANK_INSTANCE_TYPE_BLOCKDEV, \
+        .u.blockdev.node_name = (the_node_name) })
+
+#define CHARDEV_YANK_INSTANCE(the_id) (&(YankInstance) { \
+        .type = YANK_INSTANCE_TYPE_CHARDEV, \
+        .u.chardev.id = (the_id) })
+
+#define MIGRATION_YANK_INSTANCE (&(YankInstance) { \
+        .type = YANK_INSTANCE_TYPE_MIGRATION })
+
+#endif
diff --git a/qapi/misc.json b/qapi/misc.json
index 40df513856..3b7de02a4d 100644
--- a/qapi/misc.json
+++ b/qapi/misc.json
@@ -568,3 +568,109 @@ 
  'data': { '*option': 'str' },
  'returns': ['CommandLineOptionInfo'],
  'allow-preconfig': true }
+
+##
+# @YankInstanceType:
+#
+# An enumeration of yank instance types. See "YankInstance" for more
+# information.
+#
+# Since: 5.2
+##
+{ 'enum': 'YankInstanceType',
+  'data': [ 'blockdev', 'chardev', 'migration' ] }
+
+##
+# @YankInstanceBlockdev:
+#
+# Specifies which blockdev to yank. See "YankInstance" for more information.
+#
+# @node-name: the blockdev's node-name
+#
+# Since: 5.2
+##
+{ 'struct': 'YankInstanceBlockdev',
+  'data': { 'node-name': 'str' } }
+
+##
+# @YankInstanceChardev:
+#
+# Specifies which chardev to yank. See "YankInstance" for more information.
+#
+# @id: the chardev's ID
+#
+# Since: 5.2
+##
+{ 'struct': 'YankInstanceChardev',
+  'data': { 'id': 'str' } }
+
+##
+# @YankInstance:
+#
+# A yank instance can be yanked with the "yank" qmp command to recover from a
+# hanging qemu.
+#
+# Currently implemented yank instances:
+#  -nbd block device:
+#   Yanking it will shutdown the connection to the nbd server without
+#   attempting to reconnect.
+#  -socket chardev:
+#   Yanking it will shutdown the connected socket.
+#  -migration:
+#   Yanking it will shutdown all migration connections.
+#
+# Since: 5.2
+##
+{ 'union': 'YankInstance',
+  'base': { 'type': 'YankInstanceType' },
+  'discriminator': 'type',
+  'data': {
+      'blockdev': 'YankInstanceBlockdev',
+      'chardev': 'YankInstanceChardev' } }
+
+##
+# @yank:
+#
+# Recover from hanging qemu by yanking the specified instances. See
+# "YankInstance" for more information.
+#
+# Takes a list of @YankInstance as argument.
+#
+# Returns: nothing.
+#
+# Example:
+#
+# -> { "execute": "yank",
+#      "arguments": {
+#          "instances": [
+#               { "type": "block-node",
+#                 "node-name": "nbd0" }
+#          ] } }
+# <- { "return": {} }
+#
+# Since: 5.2
+##
+{ 'command': 'yank',
+  'data': { 'instances': ['YankInstance'] },
+  'allow-oob': true }
+
+##
+# @query-yank:
+#
+# Query yank instances. See "YankInstance" for more information.
+#
+# Returns: list of @YankInstance
+#
+# Example:
+#
+# -> { "execute": "query-yank" }
+# <- { "return": [
+#          { "type": "block-node",
+#            "node-name": "nbd0" }
+#      ] }
+#
+# Since: 5.2
+##
+{ 'command': 'query-yank',
+  'returns': ['YankInstance'],
+  'allow-oob': true }
diff --git a/util/meson.build b/util/meson.build
index c5159ad79d..dbda9d9123 100644
--- a/util/meson.build
+++ b/util/meson.build
@@ -50,6 +50,7 @@  endif

 if have_system
   util_ss.add(when: 'CONFIG_GIO', if_true: [files('dbus.c'), gio])
+  util_ss.add(files('yank.c'))
 endif

 if have_block
diff --git a/util/yank.c b/util/yank.c
new file mode 100644
index 0000000000..0b3a816706
--- /dev/null
+++ b/util/yank.c
@@ -0,0 +1,213 @@ 
+/*
+ * QEMU yank feature
+ *
+ * Copyright (c) Lukas Straub <lukasstraub2@web.de>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/thread.h"
+#include "qemu/queue.h"
+#include "qapi/qapi-commands-misc.h"
+#include "qapi/qapi-visit-misc.h"
+#include "qapi/clone-visitor.h"
+#include "io/channel.h"
+#include "qemu/yank.h"
+
+struct YankFuncAndParam {
+    YankFn *func;
+    void *opaque;
+    QLIST_ENTRY(YankFuncAndParam) next;
+};
+
+struct YankInstanceEntry {
+    YankInstance *instance;
+    QLIST_HEAD(, YankFuncAndParam) yankfns;
+    QLIST_ENTRY(YankInstanceEntry) next;
+};
+
+typedef struct YankFuncAndParam YankFuncAndParam;
+typedef struct YankInstanceEntry YankInstanceEntry;
+
+/*
+ * This lock protects the yank_instance_list below.
+ */
+static QemuMutex yank_lock;
+
+static QLIST_HEAD(, YankInstanceEntry) yank_instance_list
+    = QLIST_HEAD_INITIALIZER(yank_instance_list);
+
+static int yank_compare_instances(const YankInstance *a, const YankInstance *b)
+{
+    if (a->type != b->type) {
+        return 0;
+    }
+
+    switch (a->type) {
+    case YANK_INSTANCE_TYPE_BLOCKDEV:
+        return !strcmp(a->u.blockdev.node_name, b->u.blockdev.node_name);
+    break;
+
+    case YANK_INSTANCE_TYPE_CHARDEV:
+        return !strcmp(a->u.chardev.id, b->u.chardev.id);
+    break;
+
+    case YANK_INSTANCE_TYPE_MIGRATION:
+        return 1;
+    break;
+
+    default:
+        abort();
+    }
+}
+
+static YankInstanceEntry *yank_find_entry(const YankInstance *instance)
+{
+    YankInstanceEntry *entry;
+
+    QLIST_FOREACH(entry, &yank_instance_list, next) {
+        if (yank_compare_instances(entry->instance, instance)) {
+            return entry;
+        }
+    }
+    return NULL;
+}
+
+void yank_register_instance(const YankInstance *instance, Error **errp)
+{
+    YankInstanceEntry *entry;
+
+    qemu_mutex_lock(&yank_lock);
+
+    if (yank_find_entry(instance)) {
+        error_setg(errp, "duplicate yank instance");
+        qemu_mutex_unlock(&yank_lock);
+        return;
+    }
+
+    entry = g_slice_new(YankInstanceEntry);
+    entry->instance = QAPI_CLONE(YankInstance, instance);
+    QLIST_INIT(&entry->yankfns);
+    QLIST_INSERT_HEAD(&yank_instance_list, entry, next);
+
+    qemu_mutex_unlock(&yank_lock);
+}
+
+void yank_unregister_instance(const YankInstance *instance)
+{
+    YankInstanceEntry *entry;
+
+    qemu_mutex_lock(&yank_lock);
+    entry = yank_find_entry(instance);
+    assert(entry);
+
+    assert(QLIST_EMPTY(&entry->yankfns));
+    QLIST_REMOVE(entry, next);
+    qapi_free_YankInstance(entry->instance);
+    g_slice_free(YankInstanceEntry, entry);
+
+    qemu_mutex_unlock(&yank_lock);
+}
+
+void yank_register_function(const YankInstance *instance,
+                            YankFn *func,
+                            void *opaque)
+{
+    YankInstanceEntry *entry;
+    YankFuncAndParam *func_entry;
+
+    qemu_mutex_lock(&yank_lock);
+    entry = yank_find_entry(instance);
+    assert(entry);
+
+    func_entry = g_slice_new(YankFuncAndParam);
+    func_entry->func = func;
+    func_entry->opaque = opaque;
+
+    QLIST_INSERT_HEAD(&entry->yankfns, func_entry, next);
+    qemu_mutex_unlock(&yank_lock);
+}
+
+void yank_unregister_function(const YankInstance *instance,
+                              YankFn *func,
+                              void *opaque)
+{
+    YankInstanceEntry *entry;
+    YankFuncAndParam *func_entry;
+
+    qemu_mutex_lock(&yank_lock);
+    entry = yank_find_entry(instance);
+    assert(entry);
+
+    QLIST_FOREACH(func_entry, &entry->yankfns, next) {
+        if (func_entry->func == func && func_entry->opaque == opaque) {
+            QLIST_REMOVE(func_entry, next);
+            g_slice_free(YankFuncAndParam, func_entry);
+            qemu_mutex_unlock(&yank_lock);
+            return;
+        }
+    }
+
+    abort();
+}
+
+void yank_generic_iochannel(void *opaque)
+{
+    QIOChannel *ioc = QIO_CHANNEL(opaque);
+
+    qio_channel_shutdown(ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
+}
+
+void qmp_yank(YankInstanceList *instances,
+              Error **errp)
+{
+    YankInstanceList *tail;
+    YankInstanceEntry *entry;
+    YankFuncAndParam *func_entry;
+
+    qemu_mutex_lock(&yank_lock);
+    for (tail = instances; tail; tail = tail->next) {
+        entry = yank_find_entry(tail->value);
+        if (!entry) {
+            error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND, "Instance not found");
+            qemu_mutex_unlock(&yank_lock);
+            return;
+        }
+    }
+    for (tail = instances; tail; tail = tail->next) {
+        entry = yank_find_entry(tail->value);
+        assert(entry);
+        QLIST_FOREACH(func_entry, &entry->yankfns, next) {
+            func_entry->func(func_entry->opaque);
+        }
+    }
+    qemu_mutex_unlock(&yank_lock);
+}
+
+YankInstanceList *qmp_query_yank(Error **errp)
+{
+    YankInstanceEntry *entry;
+    YankInstanceList *ret;
+
+    ret = NULL;
+
+    qemu_mutex_lock(&yank_lock);
+    QLIST_FOREACH(entry, &yank_instance_list, next) {
+        YankInstanceList *new_entry;
+        new_entry = g_new0(YankInstanceList, 1);
+        new_entry->value = QAPI_CLONE(YankInstance, entry->instance);
+        new_entry->next = ret;
+        ret = new_entry;
+    }
+    qemu_mutex_unlock(&yank_lock);
+
+    return ret;
+}
+
+static void __attribute__((__constructor__)) yank_init(void)
+{
+    qemu_mutex_init(&yank_lock);
+}