diff mbox series

[v26,05/17] vfio: Add VM state change handler to know state of VM

Message ID 1600817059-26721-6-git-send-email-kwankhede@nvidia.com (mailing list archive)
State New, archived
Headers show
Series Add migration support for VFIO devices | expand

Commit Message

Kirti Wankhede Sept. 22, 2020, 11:24 p.m. UTC
VM state change handler gets called on change in VM's state. This is used to set
VFIO device state to _RUNNING.

Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
Reviewed-by: Neo Jia <cjia@nvidia.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
 hw/vfio/migration.c           | 136 ++++++++++++++++++++++++++++++++++++++++++
 hw/vfio/trace-events          |   3 +-
 include/hw/vfio/vfio-common.h |   4 ++
 3 files changed, 142 insertions(+), 1 deletion(-)

Comments

Cornelia Huck Sept. 24, 2020, 3:02 p.m. UTC | #1
On Wed, 23 Sep 2020 04:54:07 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> VM state change handler gets called on change in VM's state. This is used to set
> VFIO device state to _RUNNING.
> 
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> Reviewed-by: Neo Jia <cjia@nvidia.com>
> Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> ---
>  hw/vfio/migration.c           | 136 ++++++++++++++++++++++++++++++++++++++++++
>  hw/vfio/trace-events          |   3 +-
>  include/hw/vfio/vfio-common.h |   4 ++
>  3 files changed, 142 insertions(+), 1 deletion(-)
> 

(...)

> +static int vfio_migration_set_state(VFIODevice *vbasedev, uint32_t mask,
> +                                    uint32_t value)

I think I've mentioned that before, but this function could really
benefit from a comment what mask and value mean. 

> +{
> +    VFIOMigration *migration = vbasedev->migration;
> +    VFIORegion *region = &migration->region;
> +    off_t dev_state_off = region->fd_offset +
> +                      offsetof(struct vfio_device_migration_info, device_state);
> +    uint32_t device_state;
> +    int ret;
> +
> +    ret = vfio_mig_read(vbasedev, &device_state, sizeof(device_state),
> +                        dev_state_off);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    device_state = (device_state & mask) | value;
> +
> +    if (!VFIO_DEVICE_STATE_VALID(device_state)) {
> +        return -EINVAL;
> +    }
> +
> +    ret = vfio_mig_write(vbasedev, &device_state, sizeof(device_state),
> +                         dev_state_off);
> +    if (ret < 0) {
> +        ret = vfio_mig_read(vbasedev, &device_state, sizeof(device_state),
> +                          dev_state_off);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +
> +        if (VFIO_DEVICE_STATE_IS_ERROR(device_state)) {
> +            hw_error("%s: Device is in error state 0x%x",
> +                     vbasedev->name, device_state);
> +            return -EFAULT;

Is -EFAULT a good return value here? Maybe -EIO?

> +        }
> +    }
> +
> +    vbasedev->device_state = device_state;
> +    trace_vfio_migration_set_state(vbasedev->name, device_state);
> +    return 0;
> +}
> +
> +static void vfio_vmstate_change(void *opaque, int running, RunState state)
> +{
> +    VFIODevice *vbasedev = opaque;
> +
> +    if ((vbasedev->vm_running != running)) {
> +        int ret;
> +        uint32_t value = 0, mask = 0;
> +
> +        if (running) {
> +            value = VFIO_DEVICE_STATE_RUNNING;
> +            if (vbasedev->device_state & VFIO_DEVICE_STATE_RESUMING) {
> +                mask = ~VFIO_DEVICE_STATE_RESUMING;

I've been staring at this for some time and I think that the desired
result is
- set _RUNNING
- if _RESUMING was set, clear it, but leave the other bits intact
- if _RESUMING was not set, clear everything previously set
This would really benefit from a comment (or am I the only one
struggling here?)

> +            }
> +        } else {
> +            mask = ~VFIO_DEVICE_STATE_RUNNING;
> +        }
> +
> +        ret = vfio_migration_set_state(vbasedev, mask, value);
> +        if (ret) {
> +            /*
> +             * vm_state_notify() doesn't support reporting failure. If such
> +             * error reporting support added in furure, migration should be
> +             * aborted.


"We should abort the migration in this case, but vm_state_notify()
currently does not support reporting failures."

?

Can/should we mark the failing device in some way?

> +             */
> +            error_report("%s: Failed to set device state 0x%x",
> +                         vbasedev->name, value & mask);
> +        }
> +        vbasedev->vm_running = running;
> +        trace_vfio_vmstate_change(vbasedev->name, running, RunState_str(state),
> +                                  value & mask);
> +    }
> +}
> +
>  static int vfio_migration_init(VFIODevice *vbasedev,
>                                 struct vfio_region_info *info)
>  {

(...)
Alex Williamson Sept. 25, 2020, 8:20 p.m. UTC | #2
On Wed, 23 Sep 2020 04:54:07 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> VM state change handler gets called on change in VM's state. This is used to set
> VFIO device state to _RUNNING.
> 
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> Reviewed-by: Neo Jia <cjia@nvidia.com>
> Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> ---
>  hw/vfio/migration.c           | 136 ++++++++++++++++++++++++++++++++++++++++++
>  hw/vfio/trace-events          |   3 +-
>  include/hw/vfio/vfio-common.h |   4 ++
>  3 files changed, 142 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
> index 2f760f1f9c47..a30d628ba963 100644
> --- a/hw/vfio/migration.c
> +++ b/hw/vfio/migration.c
> @@ -10,6 +10,7 @@
>  #include "qemu/osdep.h"
>  #include <linux/vfio.h>
>  
> +#include "sysemu/runstate.h"
>  #include "hw/vfio/vfio-common.h"
>  #include "cpu.h"
>  #include "migration/migration.h"
> @@ -22,6 +23,58 @@
>  #include "exec/ram_addr.h"
>  #include "pci.h"
>  #include "trace.h"
> +#include "hw/hw.h"
> +
> +static inline int vfio_mig_access(VFIODevice *vbasedev, void *val, int count,
> +                                  off_t off, bool iswrite)
> +{
> +    int ret;
> +
> +    ret = iswrite ? pwrite(vbasedev->fd, val, count, off) :
> +                    pread(vbasedev->fd, val, count, off);
> +    if (ret < count) {
> +        error_report("vfio_mig_%s%d %s: failed at offset 0x%lx, err: %s",
> +                     iswrite ? "write" : "read", count * 8,
> +                     vbasedev->name, off, strerror(errno));

This would suggest from the log that there's, for example, a
vfio_mig_read8 function, which doesn't exist.

> +        return (ret < 0) ? ret : -EINVAL;
> +    }
> +    return 0;
> +}
> +
> +static int vfio_mig_rw(VFIODevice *vbasedev, __u8 *buf, size_t count,
> +                       off_t off, bool iswrite)
> +{
> +    int ret, done = 0;
> +    __u8 *tbuf = buf;
> +
> +    while (count) {
> +        int bytes = 0;
> +
> +        if (count >= 8 && !(off % 8)) {
> +            bytes = 8;
> +        } else if (count >= 4 && !(off % 4)) {
> +            bytes = 4;
> +        } else if (count >= 2 && !(off % 2)) {
> +            bytes = 2;
> +        } else {
> +            bytes = 1;
> +        }
> +
> +        ret = vfio_mig_access(vbasedev, tbuf, bytes, off, iswrite);
> +        if (ret) {
> +            return ret;
> +        }
> +
> +        count -= bytes;
> +        done += bytes;
> +        off += bytes;
> +        tbuf += bytes;
> +    }
> +    return done;
> +}
> +
> +#define vfio_mig_read(f, v, c, o)       vfio_mig_rw(f, (__u8 *)v, c, o, false)
> +#define vfio_mig_write(f, v, c, o)      vfio_mig_rw(f, (__u8 *)v, c, o, true)
>  
>  static void vfio_migration_region_exit(VFIODevice *vbasedev)
>  {
> @@ -70,6 +123,82 @@ err:
>      return ret;
>  }
>  
> +static int vfio_migration_set_state(VFIODevice *vbasedev, uint32_t mask,
> +                                    uint32_t value)
> +{
> +    VFIOMigration *migration = vbasedev->migration;
> +    VFIORegion *region = &migration->region;
> +    off_t dev_state_off = region->fd_offset +
> +                      offsetof(struct vfio_device_migration_info, device_state);
> +    uint32_t device_state;
> +    int ret;
> +
> +    ret = vfio_mig_read(vbasedev, &device_state, sizeof(device_state),
> +                        dev_state_off);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    device_state = (device_state & mask) | value;

Agree with Connie that mask and value args are not immediately obvious
how they're used.  I don't have a naming convention that would be more
clear and the names do make some sense once they're understood, but a
comment to indicate mask bits are preserved, value bits are set,
remaining bits are cleared would probably help the reader.

> +
> +    if (!VFIO_DEVICE_STATE_VALID(device_state)) {
> +        return -EINVAL;
> +    }
> +
> +    ret = vfio_mig_write(vbasedev, &device_state, sizeof(device_state),
> +                         dev_state_off);
> +    if (ret < 0) {
> +        ret = vfio_mig_read(vbasedev, &device_state, sizeof(device_state),
> +                          dev_state_off);
> +        if (ret < 0) {
> +            return ret;

Seems like we're in pretty bad shape here, should this be combined with
below to trigger a hw_error?

> +        }
> +
> +        if (VFIO_DEVICE_STATE_IS_ERROR(device_state)) {
> +            hw_error("%s: Device is in error state 0x%x",
> +                     vbasedev->name, device_state);
> +            return -EFAULT;
> +        }
> +    }
> +
> +    vbasedev->device_state = device_state;
> +    trace_vfio_migration_set_state(vbasedev->name, device_state);
> +    return 0;

So we return success even if we failed to write the desired state as
long as we were able to read back any non-error state?
vbasedev->device_state remains correct, but it seems confusing form a
caller perspective that a set-state can succeed but it's then necessary
to check the state.

> +}
> +
> +static void vfio_vmstate_change(void *opaque, int running, RunState state)
> +{
> +    VFIODevice *vbasedev = opaque;
> +
> +    if ((vbasedev->vm_running != running)) {
> +        int ret;
> +        uint32_t value = 0, mask = 0;
> +
> +        if (running) {
> +            value = VFIO_DEVICE_STATE_RUNNING;
> +            if (vbasedev->device_state & VFIO_DEVICE_STATE_RESUMING) {
> +                mask = ~VFIO_DEVICE_STATE_RESUMING;
> +            }
> +        } else {
> +            mask = ~VFIO_DEVICE_STATE_RUNNING;
> +        }
> +
> +        ret = vfio_migration_set_state(vbasedev, mask, value);
> +        if (ret) {
> +            /*
> +             * vm_state_notify() doesn't support reporting failure. If such
> +             * error reporting support added in furure, migration should be
> +             * aborted.
> +             */
> +            error_report("%s: Failed to set device state 0x%x",
> +                         vbasedev->name, value & mask);
> +        }

Here for instance we assume that success means the device is now in the
desired state, but we'd actually need to evaluate
vbasedev->device_state to determine that.

> +        vbasedev->vm_running = running;
> +        trace_vfio_vmstate_change(vbasedev->name, running, RunState_str(state),
> +                                  value & mask);
> +    }
> +}
> +
>  static int vfio_migration_init(VFIODevice *vbasedev,
>                                 struct vfio_region_info *info)
>  {
> @@ -87,8 +216,11 @@ static int vfio_migration_init(VFIODevice *vbasedev,
>                       vbasedev->name);
>          g_free(vbasedev->migration);
>          vbasedev->migration = NULL;
> +        return ret;
>      }
>  
> +    vbasedev->vm_state = qemu_add_vm_change_state_handler(vfio_vmstate_change,
> +                                                          vbasedev);
>      return ret;
>  }
>  
> @@ -131,6 +263,10 @@ add_blocker:
>  
>  void vfio_migration_finalize(VFIODevice *vbasedev)
>  {
> +    if (vbasedev->vm_state) {
> +        qemu_del_vm_change_state_handler(vbasedev->vm_state);
> +    }
> +
>      if (vbasedev->migration_blocker) {
>          migrate_del_blocker(vbasedev->migration_blocker);
>          error_free(vbasedev->migration_blocker);
> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
> index 8fe913175d85..6524734bf7b4 100644
> --- a/hw/vfio/trace-events
> +++ b/hw/vfio/trace-events
> @@ -149,4 +149,5 @@ vfio_display_edid_write_error(void) ""
>  
>  # migration.c
>  vfio_migration_probe(const char *name, uint32_t index) " (%s) Region %d"
> -
> +vfio_migration_set_state(char *name, uint32_t state) " (%s) state %d"
> +vfio_vmstate_change(char *name, int running, const char *reason, uint32_t dev_state) " (%s) running %d reason %s device state %d"
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index 8275c4c68f45..25e3b1a3b90a 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -29,6 +29,7 @@
>  #ifdef CONFIG_LINUX
>  #include <linux/vfio.h>
>  #endif
> +#include "sysemu/sysemu.h"
>  
>  #define VFIO_MSG_PREFIX "vfio %s: "
>  
> @@ -119,6 +120,9 @@ typedef struct VFIODevice {
>      unsigned int flags;
>      VFIOMigration *migration;
>      Error *migration_blocker;
> +    VMChangeStateEntry *vm_state;
> +    uint32_t device_state;
> +    int vm_running;

Could these be placed in VFIOMigration?  Thanks,

Alex

>  } VFIODevice;
>  
>  struct VFIODeviceOps {
Dr. David Alan Gilbert Sept. 29, 2020, 11:03 a.m. UTC | #3
* Cornelia Huck (cohuck@redhat.com) wrote:
> On Wed, 23 Sep 2020 04:54:07 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
> > VM state change handler gets called on change in VM's state. This is used to set
> > VFIO device state to _RUNNING.
> > 
> > Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> > Reviewed-by: Neo Jia <cjia@nvidia.com>
> > Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> > ---
> >  hw/vfio/migration.c           | 136 ++++++++++++++++++++++++++++++++++++++++++
> >  hw/vfio/trace-events          |   3 +-
> >  include/hw/vfio/vfio-common.h |   4 ++
> >  3 files changed, 142 insertions(+), 1 deletion(-)
> > 
> 
> (...)
> 
> > +static int vfio_migration_set_state(VFIODevice *vbasedev, uint32_t mask,
> > +                                    uint32_t value)
> 
> I think I've mentioned that before, but this function could really
> benefit from a comment what mask and value mean. 
> 
> > +{
> > +    VFIOMigration *migration = vbasedev->migration;
> > +    VFIORegion *region = &migration->region;
> > +    off_t dev_state_off = region->fd_offset +
> > +                      offsetof(struct vfio_device_migration_info, device_state);
> > +    uint32_t device_state;
> > +    int ret;
> > +
> > +    ret = vfio_mig_read(vbasedev, &device_state, sizeof(device_state),
> > +                        dev_state_off);
> > +    if (ret < 0) {
> > +        return ret;
> > +    }
> > +
> > +    device_state = (device_state & mask) | value;
> > +
> > +    if (!VFIO_DEVICE_STATE_VALID(device_state)) {
> > +        return -EINVAL;
> > +    }
> > +
> > +    ret = vfio_mig_write(vbasedev, &device_state, sizeof(device_state),
> > +                         dev_state_off);
> > +    if (ret < 0) {
> > +        ret = vfio_mig_read(vbasedev, &device_state, sizeof(device_state),
> > +                          dev_state_off);
> > +        if (ret < 0) {
> > +            return ret;
> > +        }
> > +
> > +        if (VFIO_DEVICE_STATE_IS_ERROR(device_state)) {
> > +            hw_error("%s: Device is in error state 0x%x",
> > +                     vbasedev->name, device_state);
> > +            return -EFAULT;
> 
> Is -EFAULT a good return value here? Maybe -EIO?
> 
> > +        }
> > +    }
> > +
> > +    vbasedev->device_state = device_state;
> > +    trace_vfio_migration_set_state(vbasedev->name, device_state);
> > +    return 0;
> > +}
> > +
> > +static void vfio_vmstate_change(void *opaque, int running, RunState state)
> > +{
> > +    VFIODevice *vbasedev = opaque;
> > +
> > +    if ((vbasedev->vm_running != running)) {
> > +        int ret;
> > +        uint32_t value = 0, mask = 0;
> > +
> > +        if (running) {
> > +            value = VFIO_DEVICE_STATE_RUNNING;
> > +            if (vbasedev->device_state & VFIO_DEVICE_STATE_RESUMING) {
> > +                mask = ~VFIO_DEVICE_STATE_RESUMING;
> 
> I've been staring at this for some time and I think that the desired
> result is
> - set _RUNNING
> - if _RESUMING was set, clear it, but leave the other bits intact
> - if _RESUMING was not set, clear everything previously set
> This would really benefit from a comment (or am I the only one
> struggling here?)
> 
> > +            }
> > +        } else {
> > +            mask = ~VFIO_DEVICE_STATE_RUNNING;
> > +        }
> > +
> > +        ret = vfio_migration_set_state(vbasedev, mask, value);
> > +        if (ret) {
> > +            /*
> > +             * vm_state_notify() doesn't support reporting failure. If such
> > +             * error reporting support added in furure, migration should be
> > +             * aborted.
> 
> 
> "We should abort the migration in this case, but vm_state_notify()
> currently does not support reporting failures."
> 
> ?
> 
> Can/should we mark the failing device in some way?

I think you can call qemu_file_set_error on the migration stream to
force an error.

Dave

> > +             */
> > +            error_report("%s: Failed to set device state 0x%x",
> > +                         vbasedev->name, value & mask);
> > +        }
> > +        vbasedev->vm_running = running;
> > +        trace_vfio_vmstate_change(vbasedev->name, running, RunState_str(state),
> > +                                  value & mask);
> > +    }
> > +}
> > +
> >  static int vfio_migration_init(VFIODevice *vbasedev,
> >                                 struct vfio_region_info *info)
> >  {
> 
> (...)
Kirti Wankhede Oct. 17, 2020, 8:24 p.m. UTC | #4
On 9/29/2020 4:33 PM, Dr. David Alan Gilbert wrote:
> * Cornelia Huck (cohuck@redhat.com) wrote:
>> On Wed, 23 Sep 2020 04:54:07 +0530
>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>
>>> VM state change handler gets called on change in VM's state. This is used to set
>>> VFIO device state to _RUNNING.
>>>
>>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
>>> Reviewed-by: Neo Jia <cjia@nvidia.com>
>>> Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
>>> ---
>>>   hw/vfio/migration.c           | 136 ++++++++++++++++++++++++++++++++++++++++++
>>>   hw/vfio/trace-events          |   3 +-
>>>   include/hw/vfio/vfio-common.h |   4 ++
>>>   3 files changed, 142 insertions(+), 1 deletion(-)
>>>
>>
>> (...)
>>
>>> +static int vfio_migration_set_state(VFIODevice *vbasedev, uint32_t mask,
>>> +                                    uint32_t value)
>>
>> I think I've mentioned that before, but this function could really
>> benefit from a comment what mask and value mean.
>>

Adding a comment as:

/*
  *  Write device_state field to inform the vendor driver about the 
device state
  *  to be transitioned to.
  *  vbasedev: VFIO device
  *  mask : bits set in the mask are preserved in device_state
  *  value: bits set in the value are set in device_state
  *  Remaining bits in device_state are cleared.
  */


>>> +{
>>> +    VFIOMigration *migration = vbasedev->migration;
>>> +    VFIORegion *region = &migration->region;
>>> +    off_t dev_state_off = region->fd_offset +
>>> +                      offsetof(struct vfio_device_migration_info, device_state);
>>> +    uint32_t device_state;
>>> +    int ret;
>>> +
>>> +    ret = vfio_mig_read(vbasedev, &device_state, sizeof(device_state),
>>> +                        dev_state_off);
>>> +    if (ret < 0) {
>>> +        return ret;
>>> +    }
>>> +
>>> +    device_state = (device_state & mask) | value;
>>> +
>>> +    if (!VFIO_DEVICE_STATE_VALID(device_state)) {
>>> +        return -EINVAL;
>>> +    }
>>> +
>>> +    ret = vfio_mig_write(vbasedev, &device_state, sizeof(device_state),
>>> +                         dev_state_off);
>>> +    if (ret < 0) {
>>> +        ret = vfio_mig_read(vbasedev, &device_state, sizeof(device_state),
>>> +                          dev_state_off);
>>> +        if (ret < 0) {
>>> +            return ret;
>>> +        }
>>> +
>>> +        if (VFIO_DEVICE_STATE_IS_ERROR(device_state)) {
>>> +            hw_error("%s: Device is in error state 0x%x",
>>> +                     vbasedev->name, device_state);
>>> +            return -EFAULT;
>>
>> Is -EFAULT a good return value here? Maybe -EIO?
>>

Ok. Changing to -EIO.

>>> +        }
>>> +    }
>>> +
>>> +    vbasedev->device_state = device_state;
>>> +    trace_vfio_migration_set_state(vbasedev->name, device_state);
>>> +    return 0;
>>> +}
>>> +
>>> +static void vfio_vmstate_change(void *opaque, int running, RunState state)
>>> +{
>>> +    VFIODevice *vbasedev = opaque;
>>> +
>>> +    if ((vbasedev->vm_running != running)) {
>>> +        int ret;
>>> +        uint32_t value = 0, mask = 0;
>>> +
>>> +        if (running) {
>>> +            value = VFIO_DEVICE_STATE_RUNNING;
>>> +            if (vbasedev->device_state & VFIO_DEVICE_STATE_RESUMING) {
>>> +                mask = ~VFIO_DEVICE_STATE_RESUMING;
>>
>> I've been staring at this for some time and I think that the desired
>> result is
>> - set _RUNNING
>> - if _RESUMING was set, clear it, but leave the other bits intact

Upto here, you're correct.

>> - if _RESUMING was not set, clear everything previously set
>> This would really benefit from a comment (or am I the only one
>> struggling here?)
>>

Here mask should be ~0. Correcting it.


>>> +            }
>>> +        } else {
>>> +            mask = ~VFIO_DEVICE_STATE_RUNNING;
>>> +        }
>>> +
>>> +        ret = vfio_migration_set_state(vbasedev, mask, value);
>>> +        if (ret) {
>>> +            /*
>>> +             * vm_state_notify() doesn't support reporting failure. If such
>>> +             * error reporting support added in furure, migration should be
>>> +             * aborted.
>>
>>
>> "We should abort the migration in this case, but vm_state_notify()
>> currently does not support reporting failures."
>>
>> ?
>>

Ok. Updating comment as suggested here.

>> Can/should we mark the failing device in some way?
> 
> I think you can call qemu_file_set_error on the migration stream to
> force an error.
> 

It should be as below, right?
qemu_file_set_error(migrate_get_current()->to_dst_file, ret);


Thanks,
Kirti

> Dave
> 
>>> +             */
>>> +            error_report("%s: Failed to set device state 0x%x",
>>> +                         vbasedev->name, value & mask);
>>> +        }
>>> +        vbasedev->vm_running = running;
>>> +        trace_vfio_vmstate_change(vbasedev->name, running, RunState_str(state),
>>> +                                  value & mask);
>>> +    }
>>> +}
>>> +
>>>   static int vfio_migration_init(VFIODevice *vbasedev,
>>>                                  struct vfio_region_info *info)
>>>   {
>>
>> (...)
Kirti Wankhede Oct. 17, 2020, 8:30 p.m. UTC | #5
On 9/26/2020 1:50 AM, Alex Williamson wrote:
> On Wed, 23 Sep 2020 04:54:07 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> VM state change handler gets called on change in VM's state. This is used to set
>> VFIO device state to _RUNNING.
>>
>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
>> Reviewed-by: Neo Jia <cjia@nvidia.com>
>> Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
>> ---
>>   hw/vfio/migration.c           | 136 ++++++++++++++++++++++++++++++++++++++++++
>>   hw/vfio/trace-events          |   3 +-
>>   include/hw/vfio/vfio-common.h |   4 ++
>>   3 files changed, 142 insertions(+), 1 deletion(-)
>>
>> diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
>> index 2f760f1f9c47..a30d628ba963 100644
>> --- a/hw/vfio/migration.c
>> +++ b/hw/vfio/migration.c
>> @@ -10,6 +10,7 @@
>>   #include "qemu/osdep.h"
>>   #include <linux/vfio.h>
>>   
>> +#include "sysemu/runstate.h"
>>   #include "hw/vfio/vfio-common.h"
>>   #include "cpu.h"
>>   #include "migration/migration.h"
>> @@ -22,6 +23,58 @@
>>   #include "exec/ram_addr.h"
>>   #include "pci.h"
>>   #include "trace.h"
>> +#include "hw/hw.h"
>> +
>> +static inline int vfio_mig_access(VFIODevice *vbasedev, void *val, int count,
>> +                                  off_t off, bool iswrite)
>> +{
>> +    int ret;
>> +
>> +    ret = iswrite ? pwrite(vbasedev->fd, val, count, off) :
>> +                    pread(vbasedev->fd, val, count, off);
>> +    if (ret < count) {
>> +        error_report("vfio_mig_%s%d %s: failed at offset 0x%lx, err: %s",
>> +                     iswrite ? "write" : "read", count * 8,
>> +                     vbasedev->name, off, strerror(errno));
> 
> This would suggest from the log that there's, for example, a
> vfio_mig_read8 function, which doesn't exist.
> 

Changing to:
error_report("vfio_mig_%s %d byte %s: failed at offset 0x%lx, err: %s",
              iswrite ? "write" : "read", count,
              vbasedev->name, off, strerror(errno));
Hope this address your concern.

>> +        return (ret < 0) ? ret : -EINVAL;
>> +    }
>> +    return 0;
>> +}
>> +
>> +static int vfio_mig_rw(VFIODevice *vbasedev, __u8 *buf, size_t count,
>> +                       off_t off, bool iswrite)
>> +{
>> +    int ret, done = 0;
>> +    __u8 *tbuf = buf;
>> +
>> +    while (count) {
>> +        int bytes = 0;
>> +
>> +        if (count >= 8 && !(off % 8)) {
>> +            bytes = 8;
>> +        } else if (count >= 4 && !(off % 4)) {
>> +            bytes = 4;
>> +        } else if (count >= 2 && !(off % 2)) {
>> +            bytes = 2;
>> +        } else {
>> +            bytes = 1;
>> +        }
>> +
>> +        ret = vfio_mig_access(vbasedev, tbuf, bytes, off, iswrite);
>> +        if (ret) {
>> +            return ret;
>> +        }
>> +
>> +        count -= bytes;
>> +        done += bytes;
>> +        off += bytes;
>> +        tbuf += bytes;
>> +    }
>> +    return done;
>> +}
>> +
>> +#define vfio_mig_read(f, v, c, o)       vfio_mig_rw(f, (__u8 *)v, c, o, false)
>> +#define vfio_mig_write(f, v, c, o)      vfio_mig_rw(f, (__u8 *)v, c, o, true)
>>   
>>   static void vfio_migration_region_exit(VFIODevice *vbasedev)
>>   {
>> @@ -70,6 +123,82 @@ err:
>>       return ret;
>>   }
>>   
>> +static int vfio_migration_set_state(VFIODevice *vbasedev, uint32_t mask,
>> +                                    uint32_t value)
>> +{
>> +    VFIOMigration *migration = vbasedev->migration;
>> +    VFIORegion *region = &migration->region;
>> +    off_t dev_state_off = region->fd_offset +
>> +                      offsetof(struct vfio_device_migration_info, device_state);
>> +    uint32_t device_state;
>> +    int ret;
>> +
>> +    ret = vfio_mig_read(vbasedev, &device_state, sizeof(device_state),
>> +                        dev_state_off);
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +
>> +    device_state = (device_state & mask) | value;
> 
> Agree with Connie that mask and value args are not immediately obvious
> how they're used.  I don't have a naming convention that would be more
> clear and the names do make some sense once they're understood, but a
> comment to indicate mask bits are preserved, value bits are set,
> remaining bits are cleared would probably help the reader.
> 

Added comment.

>> +
>> +    if (!VFIO_DEVICE_STATE_VALID(device_state)) {
>> +        return -EINVAL;
>> +    }
>> +
>> +    ret = vfio_mig_write(vbasedev, &device_state, sizeof(device_state),
>> +                         dev_state_off);
>> +    if (ret < 0) {
>> +        ret = vfio_mig_read(vbasedev, &device_state, sizeof(device_state),
>> +                          dev_state_off);
>> +        if (ret < 0) {
>> +            return ret;
> 
> Seems like we're in pretty bad shape here, should this be combined with
> below to trigger a hw_error?
> 

Ok.

>> +        }
>> +
>> +        if (VFIO_DEVICE_STATE_IS_ERROR(device_state)) {
>> +            hw_error("%s: Device is in error state 0x%x",
>> +                     vbasedev->name, device_state);
>> +            return -EFAULT;
>> +        }
>> +    }
>> +
>> +    vbasedev->device_state = device_state;
>> +    trace_vfio_migration_set_state(vbasedev->name, device_state);
>> +    return 0;
> 
> So we return success even if we failed to write the desired state as
> long as we were able to read back any non-error state?
> vbasedev->device_state remains correct, but it seems confusing form a
> caller perspective that a set-state can succeed but it's then necessary
> to check the state.
> 

Correcting here. If vfio_mig_write() had retured error, return error 
from vfio_migration_set_state()

>> +}
>> +
>> +static void vfio_vmstate_change(void *opaque, int running, RunState state)
>> +{
>> +    VFIODevice *vbasedev = opaque;
>> +
>> +    if ((vbasedev->vm_running != running)) {
>> +        int ret;
>> +        uint32_t value = 0, mask = 0;
>> +
>> +        if (running) {
>> +            value = VFIO_DEVICE_STATE_RUNNING;
>> +            if (vbasedev->device_state & VFIO_DEVICE_STATE_RESUMING) {
>> +                mask = ~VFIO_DEVICE_STATE_RESUMING;
>> +            }
>> +        } else {
>> +            mask = ~VFIO_DEVICE_STATE_RUNNING;
>> +        }
>> +
>> +        ret = vfio_migration_set_state(vbasedev, mask, value);
>> +        if (ret) {
>> +            /*
>> +             * vm_state_notify() doesn't support reporting failure. If such
>> +             * error reporting support added in furure, migration should be
>> +             * aborted.
>> +             */
>> +            error_report("%s: Failed to set device state 0x%x",
>> +                         vbasedev->name, value & mask);
>> +        }
> 
> Here for instance we assume that success means the device is now in the
> desired state, but we'd actually need to evaluate
> vbasedev->device_state to determine that.
> 

Updating.

>> +        vbasedev->vm_running = running;
>> +        trace_vfio_vmstate_change(vbasedev->name, running, RunState_str(state),
>> +                                  value & mask);
>> +    }
>> +}
>> +
>>   static int vfio_migration_init(VFIODevice *vbasedev,
>>                                  struct vfio_region_info *info)
>>   {
>> @@ -87,8 +216,11 @@ static int vfio_migration_init(VFIODevice *vbasedev,
>>                        vbasedev->name);
>>           g_free(vbasedev->migration);
>>           vbasedev->migration = NULL;
>> +        return ret;
>>       }
>>   
>> +    vbasedev->vm_state = qemu_add_vm_change_state_handler(vfio_vmstate_change,
>> +                                                          vbasedev);
>>       return ret;
>>   }
>>   
>> @@ -131,6 +263,10 @@ add_blocker:
>>   
>>   void vfio_migration_finalize(VFIODevice *vbasedev)
>>   {
>> +    if (vbasedev->vm_state) {
>> +        qemu_del_vm_change_state_handler(vbasedev->vm_state);
>> +    }
>> +
>>       if (vbasedev->migration_blocker) {
>>           migrate_del_blocker(vbasedev->migration_blocker);
>>           error_free(vbasedev->migration_blocker);
>> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
>> index 8fe913175d85..6524734bf7b4 100644
>> --- a/hw/vfio/trace-events
>> +++ b/hw/vfio/trace-events
>> @@ -149,4 +149,5 @@ vfio_display_edid_write_error(void) ""
>>   
>>   # migration.c
>>   vfio_migration_probe(const char *name, uint32_t index) " (%s) Region %d"
>> -
>> +vfio_migration_set_state(char *name, uint32_t state) " (%s) state %d"
>> +vfio_vmstate_change(char *name, int running, const char *reason, uint32_t dev_state) " (%s) running %d reason %s device state %d"
>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
>> index 8275c4c68f45..25e3b1a3b90a 100644
>> --- a/include/hw/vfio/vfio-common.h
>> +++ b/include/hw/vfio/vfio-common.h
>> @@ -29,6 +29,7 @@
>>   #ifdef CONFIG_LINUX
>>   #include <linux/vfio.h>
>>   #endif
>> +#include "sysemu/sysemu.h"
>>   
>>   #define VFIO_MSG_PREFIX "vfio %s: "
>>   
>> @@ -119,6 +120,9 @@ typedef struct VFIODevice {
>>       unsigned int flags;
>>       VFIOMigration *migration;
>>       Error *migration_blocker;
>> +    VMChangeStateEntry *vm_state;
>> +    uint32_t device_state;
>> +    int vm_running;
> 
> Could these be placed in VFIOMigration?  Thanks,
>

I think device_state should be part of VFIODevice since its about device 
rather than only related to migration, others can be moved to VFIOMigration.

Thanks,
Kirti


> Alex
> 
>>   } VFIODevice;
>>   
>>   struct VFIODeviceOps {
>
Alex Williamson Oct. 17, 2020, 11:44 p.m. UTC | #6
On Sun, 18 Oct 2020 02:00:44 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 9/26/2020 1:50 AM, Alex Williamson wrote:
> > On Wed, 23 Sep 2020 04:54:07 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> >> VM state change handler gets called on change in VM's state. This is used to set
> >> VFIO device state to _RUNNING.
> >>
> >> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> >> Reviewed-by: Neo Jia <cjia@nvidia.com>
> >> Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> >> ---
> >>   hw/vfio/migration.c           | 136 ++++++++++++++++++++++++++++++++++++++++++
> >>   hw/vfio/trace-events          |   3 +-
> >>   include/hw/vfio/vfio-common.h |   4 ++
> >>   3 files changed, 142 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
> >> index 2f760f1f9c47..a30d628ba963 100644
> >> --- a/hw/vfio/migration.c
> >> +++ b/hw/vfio/migration.c
> >> @@ -10,6 +10,7 @@
> >>   #include "qemu/osdep.h"
> >>   #include <linux/vfio.h>
> >>   
> >> +#include "sysemu/runstate.h"
> >>   #include "hw/vfio/vfio-common.h"
> >>   #include "cpu.h"
> >>   #include "migration/migration.h"
> >> @@ -22,6 +23,58 @@
> >>   #include "exec/ram_addr.h"
> >>   #include "pci.h"
> >>   #include "trace.h"
> >> +#include "hw/hw.h"
> >> +
> >> +static inline int vfio_mig_access(VFIODevice *vbasedev, void *val, int count,
> >> +                                  off_t off, bool iswrite)
> >> +{
> >> +    int ret;
> >> +
> >> +    ret = iswrite ? pwrite(vbasedev->fd, val, count, off) :
> >> +                    pread(vbasedev->fd, val, count, off);
> >> +    if (ret < count) {
> >> +        error_report("vfio_mig_%s%d %s: failed at offset 0x%lx, err: %s",
> >> +                     iswrite ? "write" : "read", count * 8,
> >> +                     vbasedev->name, off, strerror(errno));  
> > 
> > This would suggest from the log that there's, for example, a
> > vfio_mig_read8 function, which doesn't exist.
> >   
> 
> Changing to:
> error_report("vfio_mig_%s %d byte %s: failed at offset 0x%lx, err: %s",
>               iswrite ? "write" : "read", count,
>               vbasedev->name, off, strerror(errno));
> Hope this address your concern.
> 
> >> +        return (ret < 0) ? ret : -EINVAL;
> >> +    }
> >> +    return 0;
> >> +}
> >> +
> >> +static int vfio_mig_rw(VFIODevice *vbasedev, __u8 *buf, size_t count,
> >> +                       off_t off, bool iswrite)
> >> +{
> >> +    int ret, done = 0;
> >> +    __u8 *tbuf = buf;
> >> +
> >> +    while (count) {
> >> +        int bytes = 0;
> >> +
> >> +        if (count >= 8 && !(off % 8)) {
> >> +            bytes = 8;
> >> +        } else if (count >= 4 && !(off % 4)) {
> >> +            bytes = 4;
> >> +        } else if (count >= 2 && !(off % 2)) {
> >> +            bytes = 2;
> >> +        } else {
> >> +            bytes = 1;
> >> +        }
> >> +
> >> +        ret = vfio_mig_access(vbasedev, tbuf, bytes, off, iswrite);
> >> +        if (ret) {
> >> +            return ret;
> >> +        }
> >> +
> >> +        count -= bytes;
> >> +        done += bytes;
> >> +        off += bytes;
> >> +        tbuf += bytes;
> >> +    }
> >> +    return done;
> >> +}
> >> +
> >> +#define vfio_mig_read(f, v, c, o)       vfio_mig_rw(f, (__u8 *)v, c, o, false)
> >> +#define vfio_mig_write(f, v, c, o)      vfio_mig_rw(f, (__u8 *)v, c, o, true)
> >>   
> >>   static void vfio_migration_region_exit(VFIODevice *vbasedev)
> >>   {
> >> @@ -70,6 +123,82 @@ err:
> >>       return ret;
> >>   }
> >>   
> >> +static int vfio_migration_set_state(VFIODevice *vbasedev, uint32_t mask,
> >> +                                    uint32_t value)
> >> +{
> >> +    VFIOMigration *migration = vbasedev->migration;
> >> +    VFIORegion *region = &migration->region;
> >> +    off_t dev_state_off = region->fd_offset +
> >> +                      offsetof(struct vfio_device_migration_info, device_state);
> >> +    uint32_t device_state;
> >> +    int ret;
> >> +
> >> +    ret = vfio_mig_read(vbasedev, &device_state, sizeof(device_state),
> >> +                        dev_state_off);
> >> +    if (ret < 0) {
> >> +        return ret;
> >> +    }
> >> +
> >> +    device_state = (device_state & mask) | value;  
> > 
> > Agree with Connie that mask and value args are not immediately obvious
> > how they're used.  I don't have a naming convention that would be more
> > clear and the names do make some sense once they're understood, but a
> > comment to indicate mask bits are preserved, value bits are set,
> > remaining bits are cleared would probably help the reader.
> >   
> 
> Added comment.
> 
> >> +
> >> +    if (!VFIO_DEVICE_STATE_VALID(device_state)) {
> >> +        return -EINVAL;
> >> +    }
> >> +
> >> +    ret = vfio_mig_write(vbasedev, &device_state, sizeof(device_state),
> >> +                         dev_state_off);
> >> +    if (ret < 0) {
> >> +        ret = vfio_mig_read(vbasedev, &device_state, sizeof(device_state),
> >> +                          dev_state_off);
> >> +        if (ret < 0) {
> >> +            return ret;  
> > 
> > Seems like we're in pretty bad shape here, should this be combined with
> > below to trigger a hw_error?
> >   
> 
> Ok.
> 
> >> +        }
> >> +
> >> +        if (VFIO_DEVICE_STATE_IS_ERROR(device_state)) {
> >> +            hw_error("%s: Device is in error state 0x%x",
> >> +                     vbasedev->name, device_state);
> >> +            return -EFAULT;
> >> +        }
> >> +    }
> >> +
> >> +    vbasedev->device_state = device_state;
> >> +    trace_vfio_migration_set_state(vbasedev->name, device_state);
> >> +    return 0;  
> > 
> > So we return success even if we failed to write the desired state as
> > long as we were able to read back any non-error state?
> > vbasedev->device_state remains correct, but it seems confusing form a
> > caller perspective that a set-state can succeed but it's then necessary
> > to check the state.
> >   
> 
> Correcting here. If vfio_mig_write() had retured error, return error 
> from vfio_migration_set_state()
> 
> >> +}
> >> +
> >> +static void vfio_vmstate_change(void *opaque, int running, RunState state)
> >> +{
> >> +    VFIODevice *vbasedev = opaque;
> >> +
> >> +    if ((vbasedev->vm_running != running)) {
> >> +        int ret;
> >> +        uint32_t value = 0, mask = 0;
> >> +
> >> +        if (running) {
> >> +            value = VFIO_DEVICE_STATE_RUNNING;
> >> +            if (vbasedev->device_state & VFIO_DEVICE_STATE_RESUMING) {
> >> +                mask = ~VFIO_DEVICE_STATE_RESUMING;
> >> +            }
> >> +        } else {
> >> +            mask = ~VFIO_DEVICE_STATE_RUNNING;
> >> +        }
> >> +
> >> +        ret = vfio_migration_set_state(vbasedev, mask, value);
> >> +        if (ret) {
> >> +            /*
> >> +             * vm_state_notify() doesn't support reporting failure. If such
> >> +             * error reporting support added in furure, migration should be
> >> +             * aborted.
> >> +             */
> >> +            error_report("%s: Failed to set device state 0x%x",
> >> +                         vbasedev->name, value & mask);
> >> +        }  
> > 
> > Here for instance we assume that success means the device is now in the
> > desired state, but we'd actually need to evaluate
> > vbasedev->device_state to determine that.
> >   
> 
> Updating.
> 
> >> +        vbasedev->vm_running = running;
> >> +        trace_vfio_vmstate_change(vbasedev->name, running, RunState_str(state),
> >> +                                  value & mask);
> >> +    }
> >> +}
> >> +
> >>   static int vfio_migration_init(VFIODevice *vbasedev,
> >>                                  struct vfio_region_info *info)
> >>   {
> >> @@ -87,8 +216,11 @@ static int vfio_migration_init(VFIODevice *vbasedev,
> >>                        vbasedev->name);
> >>           g_free(vbasedev->migration);
> >>           vbasedev->migration = NULL;
> >> +        return ret;
> >>       }
> >>   
> >> +    vbasedev->vm_state = qemu_add_vm_change_state_handler(vfio_vmstate_change,
> >> +                                                          vbasedev);
> >>       return ret;
> >>   }
> >>   
> >> @@ -131,6 +263,10 @@ add_blocker:
> >>   
> >>   void vfio_migration_finalize(VFIODevice *vbasedev)
> >>   {
> >> +    if (vbasedev->vm_state) {
> >> +        qemu_del_vm_change_state_handler(vbasedev->vm_state);
> >> +    }
> >> +
> >>       if (vbasedev->migration_blocker) {
> >>           migrate_del_blocker(vbasedev->migration_blocker);
> >>           error_free(vbasedev->migration_blocker);
> >> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
> >> index 8fe913175d85..6524734bf7b4 100644
> >> --- a/hw/vfio/trace-events
> >> +++ b/hw/vfio/trace-events
> >> @@ -149,4 +149,5 @@ vfio_display_edid_write_error(void) ""
> >>   
> >>   # migration.c
> >>   vfio_migration_probe(const char *name, uint32_t index) " (%s) Region %d"
> >> -
> >> +vfio_migration_set_state(char *name, uint32_t state) " (%s) state %d"
> >> +vfio_vmstate_change(char *name, int running, const char *reason, uint32_t dev_state) " (%s) running %d reason %s device state %d"
> >> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> >> index 8275c4c68f45..25e3b1a3b90a 100644
> >> --- a/include/hw/vfio/vfio-common.h
> >> +++ b/include/hw/vfio/vfio-common.h
> >> @@ -29,6 +29,7 @@
> >>   #ifdef CONFIG_LINUX
> >>   #include <linux/vfio.h>
> >>   #endif
> >> +#include "sysemu/sysemu.h"
> >>   
> >>   #define VFIO_MSG_PREFIX "vfio %s: "
> >>   
> >> @@ -119,6 +120,9 @@ typedef struct VFIODevice {
> >>       unsigned int flags;
> >>       VFIOMigration *migration;
> >>       Error *migration_blocker;
> >> +    VMChangeStateEntry *vm_state;
> >> +    uint32_t device_state;
> >> +    int vm_running;  
> > 
> > Could these be placed in VFIOMigration?  Thanks,
> >  
> 
> I think device_state should be part of VFIODevice since its about device 
> rather than only related to migration, others can be moved to VFIOMigration.

But these are only valid when migration is supported and thus when
VFIOMigration exists.  Thanks,

Alex
Kirti Wankhede Oct. 18, 2020, 5:43 p.m. UTC | #7
<snip>

>>>> +vfio_migration_set_state(char *name, uint32_t state) " (%s) state %d"
>>>> +vfio_vmstate_change(char *name, int running, const char *reason, uint32_t dev_state) " (%s) running %d reason %s device state %d"
>>>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
>>>> index 8275c4c68f45..25e3b1a3b90a 100644
>>>> --- a/include/hw/vfio/vfio-common.h
>>>> +++ b/include/hw/vfio/vfio-common.h
>>>> @@ -29,6 +29,7 @@
>>>>    #ifdef CONFIG_LINUX
>>>>    #include <linux/vfio.h>
>>>>    #endif
>>>> +#include "sysemu/sysemu.h"
>>>>    
>>>>    #define VFIO_MSG_PREFIX "vfio %s: "
>>>>    
>>>> @@ -119,6 +120,9 @@ typedef struct VFIODevice {
>>>>        unsigned int flags;
>>>>        VFIOMigration *migration;
>>>>        Error *migration_blocker;
>>>> +    VMChangeStateEntry *vm_state;
>>>> +    uint32_t device_state;
>>>> +    int vm_running;
>>>
>>> Could these be placed in VFIOMigration?  Thanks,
>>>   
>>
>> I think device_state should be part of VFIODevice since its about device
>> rather than only related to migration, others can be moved to VFIOMigration.
> 
> But these are only valid when migration is supported and thus when
> VFIOMigration exists.  Thanks,
> 

Even though it is used when migration is supported, its device's attribute.

Thanks,
Kirti
Alex Williamson Oct. 19, 2020, 5:51 p.m. UTC | #8
On Sun, 18 Oct 2020 23:13:39 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> <snip>
> 
> >>>> +vfio_migration_set_state(char *name, uint32_t state) " (%s) state %d"
> >>>> +vfio_vmstate_change(char *name, int running, const char *reason, uint32_t dev_state) " (%s) running %d reason %s device state %d"
> >>>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> >>>> index 8275c4c68f45..25e3b1a3b90a 100644
> >>>> --- a/include/hw/vfio/vfio-common.h
> >>>> +++ b/include/hw/vfio/vfio-common.h
> >>>> @@ -29,6 +29,7 @@
> >>>>    #ifdef CONFIG_LINUX
> >>>>    #include <linux/vfio.h>
> >>>>    #endif
> >>>> +#include "sysemu/sysemu.h"
> >>>>    
> >>>>    #define VFIO_MSG_PREFIX "vfio %s: "
> >>>>    
> >>>> @@ -119,6 +120,9 @@ typedef struct VFIODevice {
> >>>>        unsigned int flags;
> >>>>        VFIOMigration *migration;
> >>>>        Error *migration_blocker;
> >>>> +    VMChangeStateEntry *vm_state;
> >>>> +    uint32_t device_state;
> >>>> +    int vm_running;  
> >>>
> >>> Could these be placed in VFIOMigration?  Thanks,
> >>>     
> >>
> >> I think device_state should be part of VFIODevice since its about device
> >> rather than only related to migration, others can be moved to VFIOMigration.  
> > 
> > But these are only valid when migration is supported and thus when
> > VFIOMigration exists.  Thanks,
> >   
> 
> Even though it is used when migration is supported, its device's attribute.

device_state is a local copy of the migration region register, so it
serves no purpose when a migration region is not present.  In fact the
initial value would indicate the device is stopped, which is incorrect.
vm_running is never initialized and cannot be set other than through a
migration region update of device_state, so at least two of these
values show incorrect state when migration is not supported by the
device.  vm_state is unused when migration isn't present, so if nothing
else the pointer here is wasteful.  It's not clear to me what
justification is being presented here as a "device's attribute",
supporting migration as indicated by a non-NULL migration pointer is
also a device attribute and these are attributes further defining the
state of that support.

BTW, device_state is used in patch 03/ but only defined here in 05/, so
the series would fail to compile on bisect.  Thanks,

Alex
Cornelia Huck Oct. 20, 2020, 10:23 a.m. UTC | #9
On Mon, 19 Oct 2020 11:51:36 -0600
Alex Williamson <alex.williamson@redhat.com> wrote:

> On Sun, 18 Oct 2020 23:13:39 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
> > <snip>
> >   
> > >>>> +vfio_migration_set_state(char *name, uint32_t state) " (%s) state %d"
> > >>>> +vfio_vmstate_change(char *name, int running, const char *reason, uint32_t dev_state) " (%s) running %d reason %s device state %d"
> > >>>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> > >>>> index 8275c4c68f45..25e3b1a3b90a 100644
> > >>>> --- a/include/hw/vfio/vfio-common.h
> > >>>> +++ b/include/hw/vfio/vfio-common.h
> > >>>> @@ -29,6 +29,7 @@
> > >>>>    #ifdef CONFIG_LINUX
> > >>>>    #include <linux/vfio.h>
> > >>>>    #endif
> > >>>> +#include "sysemu/sysemu.h"
> > >>>>    
> > >>>>    #define VFIO_MSG_PREFIX "vfio %s: "
> > >>>>    
> > >>>> @@ -119,6 +120,9 @@ typedef struct VFIODevice {
> > >>>>        unsigned int flags;
> > >>>>        VFIOMigration *migration;
> > >>>>        Error *migration_blocker;
> > >>>> +    VMChangeStateEntry *vm_state;
> > >>>> +    uint32_t device_state;
> > >>>> +    int vm_running;    
> > >>>
> > >>> Could these be placed in VFIOMigration?  Thanks,
> > >>>       
> > >>
> > >> I think device_state should be part of VFIODevice since its about device
> > >> rather than only related to migration, others can be moved to VFIOMigration.    
> > > 
> > > But these are only valid when migration is supported and thus when
> > > VFIOMigration exists.  Thanks,
> > >     
> > 
> > Even though it is used when migration is supported, its device's attribute.  
> 
> device_state is a local copy of the migration region register, so it
> serves no purpose when a migration region is not present.  In fact the
> initial value would indicate the device is stopped, which is incorrect.
> vm_running is never initialized and cannot be set other than through a
> migration region update of device_state, so at least two of these
> values show incorrect state when migration is not supported by the
> device.  vm_state is unused when migration isn't present, so if nothing
> else the pointer here is wasteful.  It's not clear to me what
> justification is being presented here as a "device's attribute",
> supporting migration as indicated by a non-NULL migration pointer is
> also a device attribute and these are attributes further defining the
> state of that support.

Agreed. Also, it is not obvious from the naming that 'device_state' is
related to migration, and it is easy to assume that this field is
useful even in the non-migration case. Moving it would solve that
problem.
Cornelia Huck Oct. 20, 2020, 10:51 a.m. UTC | #10
On Sun, 18 Oct 2020 01:54:56 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 9/29/2020 4:33 PM, Dr. David Alan Gilbert wrote:
> > * Cornelia Huck (cohuck@redhat.com) wrote:  
> >> On Wed, 23 Sep 2020 04:54:07 +0530
> >> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>  
> >>> VM state change handler gets called on change in VM's state. This is used to set
> >>> VFIO device state to _RUNNING.
> >>>
> >>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> >>> Reviewed-by: Neo Jia <cjia@nvidia.com>
> >>> Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> >>> ---
> >>>   hw/vfio/migration.c           | 136 ++++++++++++++++++++++++++++++++++++++++++
> >>>   hw/vfio/trace-events          |   3 +-
> >>>   include/hw/vfio/vfio-common.h |   4 ++
> >>>   3 files changed, 142 insertions(+), 1 deletion(-)
> >>>  
> >>
> >> (...)
> >>  
> >>> +static int vfio_migration_set_state(VFIODevice *vbasedev, uint32_t mask,
> >>> +                                    uint32_t value)  
> >>
> >> I think I've mentioned that before, but this function could really
> >> benefit from a comment what mask and value mean.
> >>  
> 
> Adding a comment as:
> 
> /*
>   *  Write device_state field to inform the vendor driver about the 
> device state
>   *  to be transitioned to.
>   *  vbasedev: VFIO device
>   *  mask : bits set in the mask are preserved in device_state
>   *  value: bits set in the value are set in device_state
>   *  Remaining bits in device_state are cleared.
>   */

Maybe:

"Change the device_state register for device @vbasedev. Bits set in
@mask are preserved, bits set in @value are set, and bits not set in
either @mask or @value are cleared in device_state. If the register
cannot be accessed, the resulting state would be invalid, or the device
enters an error state, an error is returned." ?

> 
> 
> >>> +{
> >>> +    VFIOMigration *migration = vbasedev->migration;
> >>> +    VFIORegion *region = &migration->region;
> >>> +    off_t dev_state_off = region->fd_offset +
> >>> +                      offsetof(struct vfio_device_migration_info, device_state);
> >>> +    uint32_t device_state;
> >>> +    int ret;
> >>> +
> >>> +    ret = vfio_mig_read(vbasedev, &device_state, sizeof(device_state),
> >>> +                        dev_state_off);
> >>> +    if (ret < 0) {
> >>> +        return ret;
> >>> +    }
> >>> +
> >>> +    device_state = (device_state & mask) | value;
> >>> +
> >>> +    if (!VFIO_DEVICE_STATE_VALID(device_state)) {
> >>> +        return -EINVAL;
> >>> +    }
> >>> +
> >>> +    ret = vfio_mig_write(vbasedev, &device_state, sizeof(device_state),
> >>> +                         dev_state_off);
> >>> +    if (ret < 0) {
> >>> +        ret = vfio_mig_read(vbasedev, &device_state, sizeof(device_state),
> >>> +                          dev_state_off);
> >>> +        if (ret < 0) {
> >>> +            return ret;
> >>> +        }
> >>> +
> >>> +        if (VFIO_DEVICE_STATE_IS_ERROR(device_state)) {
> >>> +            hw_error("%s: Device is in error state 0x%x",
> >>> +                     vbasedev->name, device_state);
> >>> +            return -EFAULT;  
> >>
> >> Is -EFAULT a good return value here? Maybe -EIO?
> >>  
> 
> Ok. Changing to -EIO.
> 
> >>> +        }
> >>> +    }
> >>> +
> >>> +    vbasedev->device_state = device_state;
> >>> +    trace_vfio_migration_set_state(vbasedev->name, device_state);
> >>> +    return 0;
> >>> +}
> >>> +
> >>> +static void vfio_vmstate_change(void *opaque, int running, RunState state)
> >>> +{
> >>> +    VFIODevice *vbasedev = opaque;
> >>> +
> >>> +    if ((vbasedev->vm_running != running)) {
> >>> +        int ret;
> >>> +        uint32_t value = 0, mask = 0;
> >>> +
> >>> +        if (running) {
> >>> +            value = VFIO_DEVICE_STATE_RUNNING;
> >>> +            if (vbasedev->device_state & VFIO_DEVICE_STATE_RESUMING) {
> >>> +                mask = ~VFIO_DEVICE_STATE_RESUMING;  
> >>
> >> I've been staring at this for some time and I think that the desired
> >> result is
> >> - set _RUNNING
> >> - if _RESUMING was set, clear it, but leave the other bits intact  
> 
> Upto here, you're correct.
> 
> >> - if _RESUMING was not set, clear everything previously set
> >> This would really benefit from a comment (or am I the only one
> >> struggling here?)
> >>  
> 
> Here mask should be ~0. Correcting it.

Hm, now I'm confused. With value == _RUNNING, ~_RUNNING and ~0 as mask
should be equivalent, shouldn't they?

> 
> 
> >>> +            }
> >>> +        } else {
> >>> +            mask = ~VFIO_DEVICE_STATE_RUNNING;
> >>> +        }
> >>> +
> >>> +        ret = vfio_migration_set_state(vbasedev, mask, value);
> >>> +        if (ret) {
> >>> +            /*
> >>> +             * vm_state_notify() doesn't support reporting failure. If such
> >>> +             * error reporting support added in furure, migration should be
> >>> +             * aborted.  
> >>
> >>
> >> "We should abort the migration in this case, but vm_state_notify()
> >> currently does not support reporting failures."
> >>
> >> ?
> >>  
> 
> Ok. Updating comment as suggested here.
> 
> >> Can/should we mark the failing device in some way?  
> > 
> > I think you can call qemu_file_set_error on the migration stream to
> > force an error.
> >   
> 
> It should be as below, right?
> qemu_file_set_error(migrate_get_current()->to_dst_file, ret);

Does this indicate in any way which device was causing problems? (I'm
not sure how visible the error_report would be?)

> 
> 
> Thanks,
> Kirti
> 
> > Dave
> >   
> >>> +             */
> >>> +            error_report("%s: Failed to set device state 0x%x",
> >>> +                         vbasedev->name, value & mask);
> >>> +        }
> >>> +        vbasedev->vm_running = running;
> >>> +        trace_vfio_vmstate_change(vbasedev->name, running, RunState_str(state),
> >>> +                                  value & mask);
> >>> +    }
> >>> +}
> >>> +
> >>>   static int vfio_migration_init(VFIODevice *vbasedev,
> >>>                                  struct vfio_region_info *info)
> >>>   {  
> >>
> >> (...)  
>
Kirti Wankhede Oct. 21, 2020, 5:33 a.m. UTC | #11
On 10/20/2020 4:21 PM, Cornelia Huck wrote:
> On Sun, 18 Oct 2020 01:54:56 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> On 9/29/2020 4:33 PM, Dr. David Alan Gilbert wrote:
>>> * Cornelia Huck (cohuck@redhat.com) wrote:
>>>> On Wed, 23 Sep 2020 04:54:07 +0530
>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>>   
>>>>> VM state change handler gets called on change in VM's state. This is used to set
>>>>> VFIO device state to _RUNNING.
>>>>>
>>>>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
>>>>> Reviewed-by: Neo Jia <cjia@nvidia.com>
>>>>> Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
>>>>> ---
>>>>>    hw/vfio/migration.c           | 136 ++++++++++++++++++++++++++++++++++++++++++
>>>>>    hw/vfio/trace-events          |   3 +-
>>>>>    include/hw/vfio/vfio-common.h |   4 ++
>>>>>    3 files changed, 142 insertions(+), 1 deletion(-)
>>>>>   
>>>>
>>>> (...)
>>>>   
>>>>> +static int vfio_migration_set_state(VFIODevice *vbasedev, uint32_t mask,
>>>>> +                                    uint32_t value)
>>>>
>>>> I think I've mentioned that before, but this function could really
>>>> benefit from a comment what mask and value mean.
>>>>   
>>
>> Adding a comment as:
>>
>> /*
>>    *  Write device_state field to inform the vendor driver about the
>> device state
>>    *  to be transitioned to.
>>    *  vbasedev: VFIO device
>>    *  mask : bits set in the mask are preserved in device_state
>>    *  value: bits set in the value are set in device_state
>>    *  Remaining bits in device_state are cleared.
>>    */
> 
> Maybe:
> 
> "Change the device_state register for device @vbasedev. Bits set in
> @mask are preserved, bits set in @value are set, and bits not set in
> either @mask or @value are cleared in device_state. If the register
> cannot be accessed, the resulting state would be invalid, or the device
> enters an error state, an error is returned." ?
> 

Ok.
>>
>>
>>>>> +{
>>>>> +    VFIOMigration *migration = vbasedev->migration;
>>>>> +    VFIORegion *region = &migration->region;
>>>>> +    off_t dev_state_off = region->fd_offset +
>>>>> +                      offsetof(struct vfio_device_migration_info, device_state);
>>>>> +    uint32_t device_state;
>>>>> +    int ret;
>>>>> +
>>>>> +    ret = vfio_mig_read(vbasedev, &device_state, sizeof(device_state),
>>>>> +                        dev_state_off);
>>>>> +    if (ret < 0) {
>>>>> +        return ret;
>>>>> +    }
>>>>> +
>>>>> +    device_state = (device_state & mask) | value;
>>>>> +
>>>>> +    if (!VFIO_DEVICE_STATE_VALID(device_state)) {
>>>>> +        return -EINVAL;
>>>>> +    }
>>>>> +
>>>>> +    ret = vfio_mig_write(vbasedev, &device_state, sizeof(device_state),
>>>>> +                         dev_state_off);
>>>>> +    if (ret < 0) {
>>>>> +        ret = vfio_mig_read(vbasedev, &device_state, sizeof(device_state),
>>>>> +                          dev_state_off);
>>>>> +        if (ret < 0) {
>>>>> +            return ret;
>>>>> +        }
>>>>> +
>>>>> +        if (VFIO_DEVICE_STATE_IS_ERROR(device_state)) {
>>>>> +            hw_error("%s: Device is in error state 0x%x",
>>>>> +                     vbasedev->name, device_state);
>>>>> +            return -EFAULT;
>>>>
>>>> Is -EFAULT a good return value here? Maybe -EIO?
>>>>   
>>
>> Ok. Changing to -EIO.
>>
>>>>> +        }
>>>>> +    }
>>>>> +
>>>>> +    vbasedev->device_state = device_state;
>>>>> +    trace_vfio_migration_set_state(vbasedev->name, device_state);
>>>>> +    return 0;
>>>>> +}
>>>>> +
>>>>> +static void vfio_vmstate_change(void *opaque, int running, RunState state)
>>>>> +{
>>>>> +    VFIODevice *vbasedev = opaque;
>>>>> +
>>>>> +    if ((vbasedev->vm_running != running)) {
>>>>> +        int ret;
>>>>> +        uint32_t value = 0, mask = 0;
>>>>> +
>>>>> +        if (running) {
>>>>> +            value = VFIO_DEVICE_STATE_RUNNING;
>>>>> +            if (vbasedev->device_state & VFIO_DEVICE_STATE_RESUMING) {
>>>>> +                mask = ~VFIO_DEVICE_STATE_RESUMING;
>>>>
>>>> I've been staring at this for some time and I think that the desired
>>>> result is
>>>> - set _RUNNING
>>>> - if _RESUMING was set, clear it, but leave the other bits intact
>>
>> Upto here, you're correct.
>>
>>>> - if _RESUMING was not set, clear everything previously set
>>>> This would really benefit from a comment (or am I the only one
>>>> struggling here?)
>>>>   
>>
>> Here mask should be ~0. Correcting it.
> 
> Hm, now I'm confused. With value == _RUNNING, ~_RUNNING and ~0 as mask
> should be equivalent, shouldn't they?
> 

I too got confused after reading your comment.
Lets walk through the device states and transitions can happen here:

if running
  - device state could be either _SAVING or _RESUMING or _STOP. Both 
_SAVING and _RESUMING can't be set at a time, that is the error state. 
_STOP means 0.
  - Transition from _SAVING to _RUNNING can happen if there is migration 
failure, in that case we have to clear _SAVING
- Transition from _RESUMING to _RUNNING can happen on resuming and we 
have to clear _RESUMING.
- In both the above cases, we have to set _RUNNING and clear rest 2 bits.
Then:
mask = ~VFIO_DEVICE_STATE_MASK;
value = VFIO_DEVICE_STATE_RUNNING;

if !running
- device state could be either _RUNNING or _SAVING|_RUNNING. Here we 
have to reset running bit.
Then:
mask = ~VFIO_DEVICE_STATE_RUNNING;
value = 0;

I'll add comment in the code above.


>>
>>
>>>>> +            }
>>>>> +        } else {
>>>>> +            mask = ~VFIO_DEVICE_STATE_RUNNING;
>>>>> +        }
>>>>> +
>>>>> +        ret = vfio_migration_set_state(vbasedev, mask, value);
>>>>> +        if (ret) {
>>>>> +            /*
>>>>> +             * vm_state_notify() doesn't support reporting failure. If such
>>>>> +             * error reporting support added in furure, migration should be
>>>>> +             * aborted.
>>>>
>>>>
>>>> "We should abort the migration in this case, but vm_state_notify()
>>>> currently does not support reporting failures."
>>>>
>>>> ?
>>>>   
>>
>> Ok. Updating comment as suggested here.
>>
>>>> Can/should we mark the failing device in some way?
>>>
>>> I think you can call qemu_file_set_error on the migration stream to
>>> force an error.
>>>    
>>
>> It should be as below, right?
>> qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
> 
> Does this indicate in any way which device was causing problems? (I'm
> not sure how visible the error_report would be?)
> 

I think it doesn't indicate which device caused problem but it sets 
error on migration stream

Thanks,
Kirti

>>
>>
>> Thanks,
>> Kirti
>>
>>> Dave
>>>    
>>>>> +             */
>>>>> +            error_report("%s: Failed to set device state 0x%x",
>>>>> +                         vbasedev->name, value & mask);
>>>>> +        }
>>>>> +        vbasedev->vm_running = running;
>>>>> +        trace_vfio_vmstate_change(vbasedev->name, running, RunState_str(state),
>>>>> +                                  value & mask);
>>>>> +    }
>>>>> +}
>>>>> +
>>>>>    static int vfio_migration_init(VFIODevice *vbasedev,
>>>>>                                   struct vfio_region_info *info)
>>>>>    {
>>>>
>>>> (...)
>>
>
Cornelia Huck Oct. 22, 2020, 7:51 a.m. UTC | #12
On Wed, 21 Oct 2020 11:03:23 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 10/20/2020 4:21 PM, Cornelia Huck wrote:
> > On Sun, 18 Oct 2020 01:54:56 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> >> On 9/29/2020 4:33 PM, Dr. David Alan Gilbert wrote:  
> >>> * Cornelia Huck (cohuck@redhat.com) wrote:  
> >>>> On Wed, 23 Sep 2020 04:54:07 +0530
> >>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:

> >>>>> +static void vfio_vmstate_change(void *opaque, int running, RunState state)
> >>>>> +{
> >>>>> +    VFIODevice *vbasedev = opaque;
> >>>>> +
> >>>>> +    if ((vbasedev->vm_running != running)) {
> >>>>> +        int ret;
> >>>>> +        uint32_t value = 0, mask = 0;
> >>>>> +
> >>>>> +        if (running) {
> >>>>> +            value = VFIO_DEVICE_STATE_RUNNING;
> >>>>> +            if (vbasedev->device_state & VFIO_DEVICE_STATE_RESUMING) {
> >>>>> +                mask = ~VFIO_DEVICE_STATE_RESUMING;  
> >>>>
> >>>> I've been staring at this for some time and I think that the desired
> >>>> result is
> >>>> - set _RUNNING
> >>>> - if _RESUMING was set, clear it, but leave the other bits intact  
> >>
> >> Upto here, you're correct.
> >>  
> >>>> - if _RESUMING was not set, clear everything previously set
> >>>> This would really benefit from a comment (or am I the only one
> >>>> struggling here?)
> >>>>     
> >>
> >> Here mask should be ~0. Correcting it.  
> > 
> > Hm, now I'm confused. With value == _RUNNING, ~_RUNNING and ~0 as mask
> > should be equivalent, shouldn't they?
> >   
> 
> I too got confused after reading your comment.
> Lets walk through the device states and transitions can happen here:
> 
> if running
>   - device state could be either _SAVING or _RESUMING or _STOP. Both 
> _SAVING and _RESUMING can't be set at a time, that is the error state. 
> _STOP means 0.
>   - Transition from _SAVING to _RUNNING can happen if there is migration 
> failure, in that case we have to clear _SAVING
> - Transition from _RESUMING to _RUNNING can happen on resuming and we 
> have to clear _RESUMING.
> - In both the above cases, we have to set _RUNNING and clear rest 2 bits.
> Then:
> mask = ~VFIO_DEVICE_STATE_MASK;
> value = VFIO_DEVICE_STATE_RUNNING;

ok

> 
> if !running
> - device state could be either _RUNNING or _SAVING|_RUNNING. Here we 
> have to reset running bit.
> Then:
> mask = ~VFIO_DEVICE_STATE_RUNNING;
> value = 0;

ok

> 
> I'll add comment in the code above.

That will help.

I'm a bit worried though that all that reasoning which flags are set or
cleared when is quite complex, and it's easy to make mistakes.

Can we model this as a FSM, where an event (running state changes)
transitions the device state from one state to another? I (personally)
find FSMs easier to comprehend, but I'm not sure whether that change
would be too invasive. If others can parse the state changes with that
mask/value interface, I won't object to it.

> 
> 
> >>
> >>  
> >>>>> +            }
> >>>>> +        } else {
> >>>>> +            mask = ~VFIO_DEVICE_STATE_RUNNING;
> >>>>> +        }
Kirti Wankhede Oct. 22, 2020, 3:42 p.m. UTC | #13
On 10/22/2020 1:21 PM, Cornelia Huck wrote:
> On Wed, 21 Oct 2020 11:03:23 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> On 10/20/2020 4:21 PM, Cornelia Huck wrote:
>>> On Sun, 18 Oct 2020 01:54:56 +0530
>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>    
>>>> On 9/29/2020 4:33 PM, Dr. David Alan Gilbert wrote:
>>>>> * Cornelia Huck (cohuck@redhat.com) wrote:
>>>>>> On Wed, 23 Sep 2020 04:54:07 +0530
>>>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>>>>>>> +static void vfio_vmstate_change(void *opaque, int running, RunState state)
>>>>>>> +{
>>>>>>> +    VFIODevice *vbasedev = opaque;
>>>>>>> +
>>>>>>> +    if ((vbasedev->vm_running != running)) {
>>>>>>> +        int ret;
>>>>>>> +        uint32_t value = 0, mask = 0;
>>>>>>> +
>>>>>>> +        if (running) {
>>>>>>> +            value = VFIO_DEVICE_STATE_RUNNING;
>>>>>>> +            if (vbasedev->device_state & VFIO_DEVICE_STATE_RESUMING) {
>>>>>>> +                mask = ~VFIO_DEVICE_STATE_RESUMING;
>>>>>>
>>>>>> I've been staring at this for some time and I think that the desired
>>>>>> result is
>>>>>> - set _RUNNING
>>>>>> - if _RESUMING was set, clear it, but leave the other bits intact
>>>>
>>>> Upto here, you're correct.
>>>>   
>>>>>> - if _RESUMING was not set, clear everything previously set
>>>>>> This would really benefit from a comment (or am I the only one
>>>>>> struggling here?)
>>>>>>      
>>>>
>>>> Here mask should be ~0. Correcting it.
>>>
>>> Hm, now I'm confused. With value == _RUNNING, ~_RUNNING and ~0 as mask
>>> should be equivalent, shouldn't they?
>>>    
>>
>> I too got confused after reading your comment.
>> Lets walk through the device states and transitions can happen here:
>>
>> if running
>>    - device state could be either _SAVING or _RESUMING or _STOP. Both
>> _SAVING and _RESUMING can't be set at a time, that is the error state.
>> _STOP means 0.
>>    - Transition from _SAVING to _RUNNING can happen if there is migration
>> failure, in that case we have to clear _SAVING
>> - Transition from _RESUMING to _RUNNING can happen on resuming and we
>> have to clear _RESUMING.
>> - In both the above cases, we have to set _RUNNING and clear rest 2 bits.
>> Then:
>> mask = ~VFIO_DEVICE_STATE_MASK;
>> value = VFIO_DEVICE_STATE_RUNNING;
> 
> ok
> 
>>
>> if !running
>> - device state could be either _RUNNING or _SAVING|_RUNNING. Here we
>> have to reset running bit.
>> Then:
>> mask = ~VFIO_DEVICE_STATE_RUNNING;
>> value = 0;
> 
> ok
> 
>>
>> I'll add comment in the code above.
> 
> That will help.
> 
> I'm a bit worried though that all that reasoning which flags are set or
> cleared when is quite complex, and it's easy to make mistakes.
> 
> Can we model this as a FSM, where an event (running state changes)
> transitions the device state from one state to another? I (personally)
> find FSMs easier to comprehend, but I'm not sure whether that change
> would be too invasive. If others can parse the state changes with that
> mask/value interface, I won't object to it.
> 

I agree FSM will be easy and for long term may be easy to maintain. But 
at this moment it will be intrusive change. For now we can go ahead with 
this code and later we can change to FSM model, if all agrees on it.

Thanks,
Kirti


>>
>>
>>>>
>>>>   
>>>>>>> +            }
>>>>>>> +        } else {
>>>>>>> +            mask = ~VFIO_DEVICE_STATE_RUNNING;
>>>>>>> +        }
>
Cornelia Huck Oct. 22, 2020, 3:49 p.m. UTC | #14
On Thu, 22 Oct 2020 21:12:58 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 10/22/2020 1:21 PM, Cornelia Huck wrote:

> > I'm a bit worried though that all that reasoning which flags are set or
> > cleared when is quite complex, and it's easy to make mistakes.
> > 
> > Can we model this as a FSM, where an event (running state changes)
> > transitions the device state from one state to another? I (personally)
> > find FSMs easier to comprehend, but I'm not sure whether that change
> > would be too invasive. If others can parse the state changes with that
> > mask/value interface, I won't object to it.
> >   
> 
> I agree FSM will be easy and for long term may be easy to maintain. But 
> at this moment it will be intrusive change. For now we can go ahead with 
> this code and later we can change to FSM model, if all agrees on it.

Yes, we can certainly revisit this later.
diff mbox series

Patch

diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 2f760f1f9c47..a30d628ba963 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -10,6 +10,7 @@ 
 #include "qemu/osdep.h"
 #include <linux/vfio.h>
 
+#include "sysemu/runstate.h"
 #include "hw/vfio/vfio-common.h"
 #include "cpu.h"
 #include "migration/migration.h"
@@ -22,6 +23,58 @@ 
 #include "exec/ram_addr.h"
 #include "pci.h"
 #include "trace.h"
+#include "hw/hw.h"
+
+static inline int vfio_mig_access(VFIODevice *vbasedev, void *val, int count,
+                                  off_t off, bool iswrite)
+{
+    int ret;
+
+    ret = iswrite ? pwrite(vbasedev->fd, val, count, off) :
+                    pread(vbasedev->fd, val, count, off);
+    if (ret < count) {
+        error_report("vfio_mig_%s%d %s: failed at offset 0x%lx, err: %s",
+                     iswrite ? "write" : "read", count * 8,
+                     vbasedev->name, off, strerror(errno));
+        return (ret < 0) ? ret : -EINVAL;
+    }
+    return 0;
+}
+
+static int vfio_mig_rw(VFIODevice *vbasedev, __u8 *buf, size_t count,
+                       off_t off, bool iswrite)
+{
+    int ret, done = 0;
+    __u8 *tbuf = buf;
+
+    while (count) {
+        int bytes = 0;
+
+        if (count >= 8 && !(off % 8)) {
+            bytes = 8;
+        } else if (count >= 4 && !(off % 4)) {
+            bytes = 4;
+        } else if (count >= 2 && !(off % 2)) {
+            bytes = 2;
+        } else {
+            bytes = 1;
+        }
+
+        ret = vfio_mig_access(vbasedev, tbuf, bytes, off, iswrite);
+        if (ret) {
+            return ret;
+        }
+
+        count -= bytes;
+        done += bytes;
+        off += bytes;
+        tbuf += bytes;
+    }
+    return done;
+}
+
+#define vfio_mig_read(f, v, c, o)       vfio_mig_rw(f, (__u8 *)v, c, o, false)
+#define vfio_mig_write(f, v, c, o)      vfio_mig_rw(f, (__u8 *)v, c, o, true)
 
 static void vfio_migration_region_exit(VFIODevice *vbasedev)
 {
@@ -70,6 +123,82 @@  err:
     return ret;
 }
 
+static int vfio_migration_set_state(VFIODevice *vbasedev, uint32_t mask,
+                                    uint32_t value)
+{
+    VFIOMigration *migration = vbasedev->migration;
+    VFIORegion *region = &migration->region;
+    off_t dev_state_off = region->fd_offset +
+                      offsetof(struct vfio_device_migration_info, device_state);
+    uint32_t device_state;
+    int ret;
+
+    ret = vfio_mig_read(vbasedev, &device_state, sizeof(device_state),
+                        dev_state_off);
+    if (ret < 0) {
+        return ret;
+    }
+
+    device_state = (device_state & mask) | value;
+
+    if (!VFIO_DEVICE_STATE_VALID(device_state)) {
+        return -EINVAL;
+    }
+
+    ret = vfio_mig_write(vbasedev, &device_state, sizeof(device_state),
+                         dev_state_off);
+    if (ret < 0) {
+        ret = vfio_mig_read(vbasedev, &device_state, sizeof(device_state),
+                          dev_state_off);
+        if (ret < 0) {
+            return ret;
+        }
+
+        if (VFIO_DEVICE_STATE_IS_ERROR(device_state)) {
+            hw_error("%s: Device is in error state 0x%x",
+                     vbasedev->name, device_state);
+            return -EFAULT;
+        }
+    }
+
+    vbasedev->device_state = device_state;
+    trace_vfio_migration_set_state(vbasedev->name, device_state);
+    return 0;
+}
+
+static void vfio_vmstate_change(void *opaque, int running, RunState state)
+{
+    VFIODevice *vbasedev = opaque;
+
+    if ((vbasedev->vm_running != running)) {
+        int ret;
+        uint32_t value = 0, mask = 0;
+
+        if (running) {
+            value = VFIO_DEVICE_STATE_RUNNING;
+            if (vbasedev->device_state & VFIO_DEVICE_STATE_RESUMING) {
+                mask = ~VFIO_DEVICE_STATE_RESUMING;
+            }
+        } else {
+            mask = ~VFIO_DEVICE_STATE_RUNNING;
+        }
+
+        ret = vfio_migration_set_state(vbasedev, mask, value);
+        if (ret) {
+            /*
+             * vm_state_notify() doesn't support reporting failure. If such
+             * error reporting support added in furure, migration should be
+             * aborted.
+             */
+            error_report("%s: Failed to set device state 0x%x",
+                         vbasedev->name, value & mask);
+        }
+        vbasedev->vm_running = running;
+        trace_vfio_vmstate_change(vbasedev->name, running, RunState_str(state),
+                                  value & mask);
+    }
+}
+
 static int vfio_migration_init(VFIODevice *vbasedev,
                                struct vfio_region_info *info)
 {
@@ -87,8 +216,11 @@  static int vfio_migration_init(VFIODevice *vbasedev,
                      vbasedev->name);
         g_free(vbasedev->migration);
         vbasedev->migration = NULL;
+        return ret;
     }
 
+    vbasedev->vm_state = qemu_add_vm_change_state_handler(vfio_vmstate_change,
+                                                          vbasedev);
     return ret;
 }
 
@@ -131,6 +263,10 @@  add_blocker:
 
 void vfio_migration_finalize(VFIODevice *vbasedev)
 {
+    if (vbasedev->vm_state) {
+        qemu_del_vm_change_state_handler(vbasedev->vm_state);
+    }
+
     if (vbasedev->migration_blocker) {
         migrate_del_blocker(vbasedev->migration_blocker);
         error_free(vbasedev->migration_blocker);
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 8fe913175d85..6524734bf7b4 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -149,4 +149,5 @@  vfio_display_edid_write_error(void) ""
 
 # migration.c
 vfio_migration_probe(const char *name, uint32_t index) " (%s) Region %d"
-
+vfio_migration_set_state(char *name, uint32_t state) " (%s) state %d"
+vfio_vmstate_change(char *name, int running, const char *reason, uint32_t dev_state) " (%s) running %d reason %s device state %d"
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 8275c4c68f45..25e3b1a3b90a 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -29,6 +29,7 @@ 
 #ifdef CONFIG_LINUX
 #include <linux/vfio.h>
 #endif
+#include "sysemu/sysemu.h"
 
 #define VFIO_MSG_PREFIX "vfio %s: "
 
@@ -119,6 +120,9 @@  typedef struct VFIODevice {
     unsigned int flags;
     VFIOMigration *migration;
     Error *migration_blocker;
+    VMChangeStateEntry *vm_state;
+    uint32_t device_state;
+    int vm_running;
 } VFIODevice;
 
 struct VFIODeviceOps {