diff mbox series

[mlx5-next,2/7] vfio: Add an API to check migration state transition validity

Message ID c87f55d6fec77a22b110d3c9611744e6b28bba46.1632305919.git.leonro@nvidia.com (mailing list archive)
State Not Applicable
Headers show
Series Add mlx5 live migration driver | expand

Commit Message

Leon Romanovsky Sept. 22, 2021, 10:38 a.m. UTC
From: Yishai Hadas <yishaih@nvidia.com>

Add an API in the core layer to check migration state transition validity
as part of a migration flow.

The valid transitions follow the expected usage as described in
uapi/vfio.h and triggered by QEMU.

This ensures that all migration implementations follow a consistent
migration state machine.

Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Reviewed-by: Kirti Wankhede <kwankhede@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/vfio/vfio.c  | 41 +++++++++++++++++++++++++++++++++++++++++
 include/linux/vfio.h |  1 +
 2 files changed, 42 insertions(+)

Comments

Shameer Kolothum Sept. 23, 2021, 10:33 a.m. UTC | #1
> -----Original Message-----
> From: Leon Romanovsky [mailto:leon@kernel.org]
> Sent: 22 September 2021 11:39
> To: Doug Ledford <dledford@redhat.com>; Jason Gunthorpe <jgg@nvidia.com>
> Cc: Yishai Hadas <yishaih@nvidia.com>; Alex Williamson
> <alex.williamson@redhat.com>; Bjorn Helgaas <bhelgaas@google.com>; David
> S. Miller <davem@davemloft.net>; Jakub Kicinski <kuba@kernel.org>; Kirti
> Wankhede <kwankhede@nvidia.com>; kvm@vger.kernel.org;
> linux-kernel@vger.kernel.org; linux-pci@vger.kernel.org;
> linux-rdma@vger.kernel.org; netdev@vger.kernel.org; Saeed Mahameed
> <saeedm@nvidia.com>
> Subject: [PATCH mlx5-next 2/7] vfio: Add an API to check migration state
> transition validity
> 
> From: Yishai Hadas <yishaih@nvidia.com>
> 
> Add an API in the core layer to check migration state transition validity
> as part of a migration flow.
> 
> The valid transitions follow the expected usage as described in
> uapi/vfio.h and triggered by QEMU.
> 
> This ensures that all migration implementations follow a consistent
> migration state machine.
> 
> Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
> Reviewed-by: Kirti Wankhede <kwankhede@nvidia.com>
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> ---
>  drivers/vfio/vfio.c  | 41 +++++++++++++++++++++++++++++++++++++++++
>  include/linux/vfio.h |  1 +
>  2 files changed, 42 insertions(+)
> 
> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> index 3c034fe14ccb..c3ca33e513c8 100644
> --- a/drivers/vfio/vfio.c
> +++ b/drivers/vfio/vfio.c
> @@ -1664,6 +1664,47 @@ static int vfio_device_fops_release(struct inode
> *inode, struct file *filep)
>  	return 0;
>  }
> 
> +/**
> + * vfio_change_migration_state_allowed - Checks whether a migration state
> + *   transition is valid.
> + * @new_state: The new state to move to.
> + * @old_state: The old state.
> + * Return: true if the transition is valid.
> + */
> +bool vfio_change_migration_state_allowed(u32 new_state, u32 old_state)
> +{
> +	enum { MAX_STATE = VFIO_DEVICE_STATE_RESUMING };
> +	static const u8 vfio_from_state_table[MAX_STATE + 1][MAX_STATE + 1] = {
> +		[VFIO_DEVICE_STATE_STOP] = {
> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
> +			[VFIO_DEVICE_STATE_RESUMING] = 1,
> +		},
> +		[VFIO_DEVICE_STATE_RUNNING] = {
> +			[VFIO_DEVICE_STATE_STOP] = 1,
> +			[VFIO_DEVICE_STATE_SAVING] = 1,
> +			[VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RUNNING]
> = 1,

Do we need to allow _RESUMING state here or not? As per the "State transitions"
section from uapi/linux/vfio.h, 

" * 4. To start the resuming phase, the device state should be transitioned from
 *    the _RUNNING to the _RESUMING state."

IIRC, I have seen that transition happening on the destination dev while testing the 
HiSilicon ACC dev migration. 

Thanks,
Shameer

> +		},
> +		[VFIO_DEVICE_STATE_SAVING] = {
> +			[VFIO_DEVICE_STATE_STOP] = 1,
> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
> +		},
> +		[VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RUNNING] = {
> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
> +			[VFIO_DEVICE_STATE_SAVING] = 1,
> +		},
> +		[VFIO_DEVICE_STATE_RESUMING] = {
> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
> +			[VFIO_DEVICE_STATE_STOP] = 1,
> +		},
> +	};
> +
> +	if (new_state > MAX_STATE || old_state > MAX_STATE)
> +		return false;
> +
> +	return vfio_from_state_table[old_state][new_state];
> +}
> +EXPORT_SYMBOL_GPL(vfio_change_migration_state_allowed);
> +
>  static long vfio_device_fops_unl_ioctl(struct file *filep,
>  				       unsigned int cmd, unsigned long arg)
>  {
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index b53a9557884a..e65137a708f1 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -83,6 +83,7 @@ extern struct vfio_device
> *vfio_device_get_from_dev(struct device *dev);
>  extern void vfio_device_put(struct vfio_device *device);
> 
>  int vfio_assign_device_set(struct vfio_device *device, void *set_id);
> +bool vfio_change_migration_state_allowed(u32 new_state, u32 old_state);
> 
>  /* events for the backend driver notify callback */
>  enum vfio_iommu_notify_type {
> --
> 2.31.1
Leon Romanovsky Sept. 23, 2021, 11:17 a.m. UTC | #2
On Thu, Sep 23, 2021 at 10:33:10AM +0000, Shameerali Kolothum Thodi wrote:
> 
> 
> > -----Original Message-----
> > From: Leon Romanovsky [mailto:leon@kernel.org]
> > Sent: 22 September 2021 11:39
> > To: Doug Ledford <dledford@redhat.com>; Jason Gunthorpe <jgg@nvidia.com>
> > Cc: Yishai Hadas <yishaih@nvidia.com>; Alex Williamson
> > <alex.williamson@redhat.com>; Bjorn Helgaas <bhelgaas@google.com>; David
> > S. Miller <davem@davemloft.net>; Jakub Kicinski <kuba@kernel.org>; Kirti
> > Wankhede <kwankhede@nvidia.com>; kvm@vger.kernel.org;
> > linux-kernel@vger.kernel.org; linux-pci@vger.kernel.org;
> > linux-rdma@vger.kernel.org; netdev@vger.kernel.org; Saeed Mahameed
> > <saeedm@nvidia.com>
> > Subject: [PATCH mlx5-next 2/7] vfio: Add an API to check migration state
> > transition validity
> > 
> > From: Yishai Hadas <yishaih@nvidia.com>
> > 
> > Add an API in the core layer to check migration state transition validity
> > as part of a migration flow.
> > 
> > The valid transitions follow the expected usage as described in
> > uapi/vfio.h and triggered by QEMU.
> > 
> > This ensures that all migration implementations follow a consistent
> > migration state machine.
> > 
> > Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
> > Reviewed-by: Kirti Wankhede <kwankhede@nvidia.com>
> > Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> > Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> > ---
> >  drivers/vfio/vfio.c  | 41 +++++++++++++++++++++++++++++++++++++++++
> >  include/linux/vfio.h |  1 +
> >  2 files changed, 42 insertions(+)
> > 
> > diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> > index 3c034fe14ccb..c3ca33e513c8 100644
> > --- a/drivers/vfio/vfio.c
> > +++ b/drivers/vfio/vfio.c
> > @@ -1664,6 +1664,47 @@ static int vfio_device_fops_release(struct inode
> > *inode, struct file *filep)
> >  	return 0;
> >  }
> > 
> > +/**
> > + * vfio_change_migration_state_allowed - Checks whether a migration state
> > + *   transition is valid.
> > + * @new_state: The new state to move to.
> > + * @old_state: The old state.
> > + * Return: true if the transition is valid.
> > + */
> > +bool vfio_change_migration_state_allowed(u32 new_state, u32 old_state)
> > +{
> > +	enum { MAX_STATE = VFIO_DEVICE_STATE_RESUMING };
> > +	static const u8 vfio_from_state_table[MAX_STATE + 1][MAX_STATE + 1] = {
> > +		[VFIO_DEVICE_STATE_STOP] = {
> > +			[VFIO_DEVICE_STATE_RUNNING] = 1,
> > +			[VFIO_DEVICE_STATE_RESUMING] = 1,
> > +		},
> > +		[VFIO_DEVICE_STATE_RUNNING] = {
> > +			[VFIO_DEVICE_STATE_STOP] = 1,
> > +			[VFIO_DEVICE_STATE_SAVING] = 1,
> > +			[VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RUNNING]
> > = 1,
> 
> Do we need to allow _RESUMING state here or not? As per the "State transitions"
> section from uapi/linux/vfio.h, 

It looks like we missed this state transition.

Thanks

> 
> " * 4. To start the resuming phase, the device state should be transitioned from
>  *    the _RUNNING to the _RESUMING state."
> 
> IIRC, I have seen that transition happening on the destination dev while testing the 
> HiSilicon ACC dev migration. 
> 
> Thanks,
> Shameer
> 
> > +		},
> > +		[VFIO_DEVICE_STATE_SAVING] = {
> > +			[VFIO_DEVICE_STATE_STOP] = 1,
> > +			[VFIO_DEVICE_STATE_RUNNING] = 1,
> > +		},
> > +		[VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RUNNING] = {
> > +			[VFIO_DEVICE_STATE_RUNNING] = 1,
> > +			[VFIO_DEVICE_STATE_SAVING] = 1,
> > +		},
> > +		[VFIO_DEVICE_STATE_RESUMING] = {
> > +			[VFIO_DEVICE_STATE_RUNNING] = 1,
> > +			[VFIO_DEVICE_STATE_STOP] = 1,
> > +		},
> > +	};
> > +
> > +	if (new_state > MAX_STATE || old_state > MAX_STATE)
> > +		return false;
> > +
> > +	return vfio_from_state_table[old_state][new_state];
> > +}
> > +EXPORT_SYMBOL_GPL(vfio_change_migration_state_allowed);
> > +
> >  static long vfio_device_fops_unl_ioctl(struct file *filep,
> >  				       unsigned int cmd, unsigned long arg)
> >  {
> > diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> > index b53a9557884a..e65137a708f1 100644
> > --- a/include/linux/vfio.h
> > +++ b/include/linux/vfio.h
> > @@ -83,6 +83,7 @@ extern struct vfio_device
> > *vfio_device_get_from_dev(struct device *dev);
> >  extern void vfio_device_put(struct vfio_device *device);
> > 
> >  int vfio_assign_device_set(struct vfio_device *device, void *set_id);
> > +bool vfio_change_migration_state_allowed(u32 new_state, u32 old_state);
> > 
> >  /* events for the backend driver notify callback */
> >  enum vfio_iommu_notify_type {
> > --
> > 2.31.1
>
Max Gurtovoy Sept. 23, 2021, 1:55 p.m. UTC | #3
On 9/23/2021 2:17 PM, Leon Romanovsky wrote:
> On Thu, Sep 23, 2021 at 10:33:10AM +0000, Shameerali Kolothum Thodi wrote:
>>
>>> -----Original Message-----
>>> From: Leon Romanovsky [mailto:leon@kernel.org]
>>> Sent: 22 September 2021 11:39
>>> To: Doug Ledford <dledford@redhat.com>; Jason Gunthorpe <jgg@nvidia.com>
>>> Cc: Yishai Hadas <yishaih@nvidia.com>; Alex Williamson
>>> <alex.williamson@redhat.com>; Bjorn Helgaas <bhelgaas@google.com>; David
>>> S. Miller <davem@davemloft.net>; Jakub Kicinski <kuba@kernel.org>; Kirti
>>> Wankhede <kwankhede@nvidia.com>; kvm@vger.kernel.org;
>>> linux-kernel@vger.kernel.org; linux-pci@vger.kernel.org;
>>> linux-rdma@vger.kernel.org; netdev@vger.kernel.org; Saeed Mahameed
>>> <saeedm@nvidia.com>
>>> Subject: [PATCH mlx5-next 2/7] vfio: Add an API to check migration state
>>> transition validity
>>>
>>> From: Yishai Hadas <yishaih@nvidia.com>
>>>
>>> Add an API in the core layer to check migration state transition validity
>>> as part of a migration flow.
>>>
>>> The valid transitions follow the expected usage as described in
>>> uapi/vfio.h and triggered by QEMU.
>>>
>>> This ensures that all migration implementations follow a consistent
>>> migration state machine.
>>>
>>> Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
>>> Reviewed-by: Kirti Wankhede <kwankhede@nvidia.com>
>>> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
>>> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
>>> ---
>>>   drivers/vfio/vfio.c  | 41 +++++++++++++++++++++++++++++++++++++++++
>>>   include/linux/vfio.h |  1 +
>>>   2 files changed, 42 insertions(+)
>>>
>>> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
>>> index 3c034fe14ccb..c3ca33e513c8 100644
>>> --- a/drivers/vfio/vfio.c
>>> +++ b/drivers/vfio/vfio.c
>>> @@ -1664,6 +1664,47 @@ static int vfio_device_fops_release(struct inode
>>> *inode, struct file *filep)
>>>   	return 0;
>>>   }
>>>
>>> +/**
>>> + * vfio_change_migration_state_allowed - Checks whether a migration state
>>> + *   transition is valid.
>>> + * @new_state: The new state to move to.
>>> + * @old_state: The old state.
>>> + * Return: true if the transition is valid.
>>> + */
>>> +bool vfio_change_migration_state_allowed(u32 new_state, u32 old_state)
>>> +{
>>> +	enum { MAX_STATE = VFIO_DEVICE_STATE_RESUMING };
>>> +	static const u8 vfio_from_state_table[MAX_STATE + 1][MAX_STATE + 1] = {
>>> +		[VFIO_DEVICE_STATE_STOP] = {
>>> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
>>> +			[VFIO_DEVICE_STATE_RESUMING] = 1,
>>> +		},
>>> +		[VFIO_DEVICE_STATE_RUNNING] = {
>>> +			[VFIO_DEVICE_STATE_STOP] = 1,
>>> +			[VFIO_DEVICE_STATE_SAVING] = 1,
>>> +			[VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RUNNING]
>>> = 1,
>> Do we need to allow _RESUMING state here or not? As per the "State transitions"
>> section from uapi/linux/vfio.h,
> It looks like we missed this state transition.
>
> Thanks

I'm not sure this state transition is valid.

Kirti, When we would like to move from RUNNING to RESUMING ?

Sameerali, can you please re-test and update if you see this transition ?


>
>> " * 4. To start the resuming phase, the device state should be transitioned from
>>   *    the _RUNNING to the _RESUMING state."
>>
>> IIRC, I have seen that transition happening on the destination dev while testing the
>> HiSilicon ACC dev migration.
>>
>> Thanks,
>> Shameer
>>
>>> +		},
>>> +		[VFIO_DEVICE_STATE_SAVING] = {
>>> +			[VFIO_DEVICE_STATE_STOP] = 1,
>>> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
>>> +		},
>>> +		[VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RUNNING] = {
>>> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
>>> +			[VFIO_DEVICE_STATE_SAVING] = 1,
>>> +		},
>>> +		[VFIO_DEVICE_STATE_RESUMING] = {
>>> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
>>> +			[VFIO_DEVICE_STATE_STOP] = 1,
>>> +		},
>>> +	};
>>> +
>>> +	if (new_state > MAX_STATE || old_state > MAX_STATE)
>>> +		return false;
>>> +
>>> +	return vfio_from_state_table[old_state][new_state];
>>> +}
>>> +EXPORT_SYMBOL_GPL(vfio_change_migration_state_allowed);
>>> +
>>>   static long vfio_device_fops_unl_ioctl(struct file *filep,
>>>   				       unsigned int cmd, unsigned long arg)
>>>   {
>>> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
>>> index b53a9557884a..e65137a708f1 100644
>>> --- a/include/linux/vfio.h
>>> +++ b/include/linux/vfio.h
>>> @@ -83,6 +83,7 @@ extern struct vfio_device
>>> *vfio_device_get_from_dev(struct device *dev);
>>>   extern void vfio_device_put(struct vfio_device *device);
>>>
>>>   int vfio_assign_device_set(struct vfio_device *device, void *set_id);
>>> +bool vfio_change_migration_state_allowed(u32 new_state, u32 old_state);
>>>
>>>   /* events for the backend driver notify callback */
>>>   enum vfio_iommu_notify_type {
>>> --
>>> 2.31.1
Shameer Kolothum Sept. 24, 2021, 7:44 a.m. UTC | #4
> -----Original Message-----
> From: Max Gurtovoy [mailto:mgurtovoy@nvidia.com]
> Sent: 23 September 2021 14:56
> To: Leon Romanovsky <leon@kernel.org>; Shameerali Kolothum Thodi
> <shameerali.kolothum.thodi@huawei.com>
> Cc: Doug Ledford <dledford@redhat.com>; Jason Gunthorpe
> <jgg@nvidia.com>; Yishai Hadas <yishaih@nvidia.com>; Alex Williamson
> <alex.williamson@redhat.com>; Bjorn Helgaas <bhelgaas@google.com>; David
> S. Miller <davem@davemloft.net>; Jakub Kicinski <kuba@kernel.org>; Kirti
> Wankhede <kwankhede@nvidia.com>; kvm@vger.kernel.org;
> linux-kernel@vger.kernel.org; linux-pci@vger.kernel.org;
> linux-rdma@vger.kernel.org; netdev@vger.kernel.org; Saeed Mahameed
> <saeedm@nvidia.com>
> Subject: Re: [PATCH mlx5-next 2/7] vfio: Add an API to check migration state
> transition validity
> 
> 
> On 9/23/2021 2:17 PM, Leon Romanovsky wrote:
> > On Thu, Sep 23, 2021 at 10:33:10AM +0000, Shameerali Kolothum Thodi
> wrote:
> >>
> >>> -----Original Message-----
> >>> From: Leon Romanovsky [mailto:leon@kernel.org]
> >>> Sent: 22 September 2021 11:39
> >>> To: Doug Ledford <dledford@redhat.com>; Jason Gunthorpe
> <jgg@nvidia.com>
> >>> Cc: Yishai Hadas <yishaih@nvidia.com>; Alex Williamson
> >>> <alex.williamson@redhat.com>; Bjorn Helgaas <bhelgaas@google.com>;
> David
> >>> S. Miller <davem@davemloft.net>; Jakub Kicinski <kuba@kernel.org>; Kirti
> >>> Wankhede <kwankhede@nvidia.com>; kvm@vger.kernel.org;
> >>> linux-kernel@vger.kernel.org; linux-pci@vger.kernel.org;
> >>> linux-rdma@vger.kernel.org; netdev@vger.kernel.org; Saeed Mahameed
> >>> <saeedm@nvidia.com>
> >>> Subject: [PATCH mlx5-next 2/7] vfio: Add an API to check migration state
> >>> transition validity
> >>>
> >>> From: Yishai Hadas <yishaih@nvidia.com>
> >>>
> >>> Add an API in the core layer to check migration state transition validity
> >>> as part of a migration flow.
> >>>
> >>> The valid transitions follow the expected usage as described in
> >>> uapi/vfio.h and triggered by QEMU.
> >>>
> >>> This ensures that all migration implementations follow a consistent
> >>> migration state machine.
> >>>
> >>> Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
> >>> Reviewed-by: Kirti Wankhede <kwankhede@nvidia.com>
> >>> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> >>> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> >>> ---
> >>>   drivers/vfio/vfio.c  | 41
> +++++++++++++++++++++++++++++++++++++++++
> >>>   include/linux/vfio.h |  1 +
> >>>   2 files changed, 42 insertions(+)
> >>>
> >>> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> >>> index 3c034fe14ccb..c3ca33e513c8 100644
> >>> --- a/drivers/vfio/vfio.c
> >>> +++ b/drivers/vfio/vfio.c
> >>> @@ -1664,6 +1664,47 @@ static int vfio_device_fops_release(struct
> inode
> >>> *inode, struct file *filep)
> >>>   	return 0;
> >>>   }
> >>>
> >>> +/**
> >>> + * vfio_change_migration_state_allowed - Checks whether a migration
> state
> >>> + *   transition is valid.
> >>> + * @new_state: The new state to move to.
> >>> + * @old_state: The old state.
> >>> + * Return: true if the transition is valid.
> >>> + */
> >>> +bool vfio_change_migration_state_allowed(u32 new_state, u32
> old_state)
> >>> +{
> >>> +	enum { MAX_STATE = VFIO_DEVICE_STATE_RESUMING };
> >>> +	static const u8 vfio_from_state_table[MAX_STATE + 1][MAX_STATE +
> 1] = {
> >>> +		[VFIO_DEVICE_STATE_STOP] = {
> >>> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
> >>> +			[VFIO_DEVICE_STATE_RESUMING] = 1,
> >>> +		},
> >>> +		[VFIO_DEVICE_STATE_RUNNING] = {
> >>> +			[VFIO_DEVICE_STATE_STOP] = 1,
> >>> +			[VFIO_DEVICE_STATE_SAVING] = 1,
> >>> +			[VFIO_DEVICE_STATE_SAVING |
> VFIO_DEVICE_STATE_RUNNING]
> >>> = 1,
> >> Do we need to allow _RESUMING state here or not? As per the "State
> transitions"
> >> section from uapi/linux/vfio.h,
> > It looks like we missed this state transition.
> >
> > Thanks
> 
> I'm not sure this state transition is valid.
> 
> Kirti, When we would like to move from RUNNING to RESUMING ?

I guess it depends on what you report as your dev default state. 

For HiSilicon ACC migration driver, we set the default to _RUNNING.

And when the migration starts, the destination side Qemu, set the 
device state to _RESUMING(vfio_load_state()).

From the documentation, it looks like the assumption on default state of
the VFIO dev is _RUNNING.

"
*  001b => Device running, which is the default state
"

> 
> Sameerali, can you please re-test and update if you see this transition ?

Yes. And if I change the default state to _STOP, then the transition
is from _STOP --> _RESUMING.

But the documentation on State transitions doesn't have _STOP --> _RESUMING
transition as valid.

Thanks,
Shameer 

> 
> 
> >
> >> " * 4. To start the resuming phase, the device state should be transitioned
> from
> >>   *    the _RUNNING to the _RESUMING state."
> >>
> >> IIRC, I have seen that transition happening on the destination dev while
> testing the
> >> HiSilicon ACC dev migration.
> >>
> >> Thanks,
> >> Shameer
> >>
> >>> +		},
> >>> +		[VFIO_DEVICE_STATE_SAVING] = {
> >>> +			[VFIO_DEVICE_STATE_STOP] = 1,
> >>> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
> >>> +		},
> >>> +		[VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RUNNING]
> = {
> >>> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
> >>> +			[VFIO_DEVICE_STATE_SAVING] = 1,
> >>> +		},
> >>> +		[VFIO_DEVICE_STATE_RESUMING] = {
> >>> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
> >>> +			[VFIO_DEVICE_STATE_STOP] = 1,
> >>> +		},
> >>> +	};
> >>> +
> >>> +	if (new_state > MAX_STATE || old_state > MAX_STATE)
> >>> +		return false;
> >>> +
> >>> +	return vfio_from_state_table[old_state][new_state];
> >>> +}
> >>> +EXPORT_SYMBOL_GPL(vfio_change_migration_state_allowed);
> >>> +
> >>>   static long vfio_device_fops_unl_ioctl(struct file *filep,
> >>>   				       unsigned int cmd, unsigned long arg)
> >>>   {
> >>> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> >>> index b53a9557884a..e65137a708f1 100644
> >>> --- a/include/linux/vfio.h
> >>> +++ b/include/linux/vfio.h
> >>> @@ -83,6 +83,7 @@ extern struct vfio_device
> >>> *vfio_device_get_from_dev(struct device *dev);
> >>>   extern void vfio_device_put(struct vfio_device *device);
> >>>
> >>>   int vfio_assign_device_set(struct vfio_device *device, void *set_id);
> >>> +bool vfio_change_migration_state_allowed(u32 new_state, u32
> old_state);
> >>>
> >>>   /* events for the backend driver notify callback */
> >>>   enum vfio_iommu_notify_type {
> >>> --
> >>> 2.31.1
Kirti Wankhede Sept. 24, 2021, 9:37 a.m. UTC | #5
On 9/24/2021 1:14 PM, Shameerali Kolothum Thodi wrote:
> 
> 
>> -----Original Message-----
>> From: Max Gurtovoy [mailto:mgurtovoy@nvidia.com]
>> Sent: 23 September 2021 14:56
>> To: Leon Romanovsky <leon@kernel.org>; Shameerali Kolothum Thodi
>> <shameerali.kolothum.thodi@huawei.com>
>> Cc: Doug Ledford <dledford@redhat.com>; Jason Gunthorpe
>> <jgg@nvidia.com>; Yishai Hadas <yishaih@nvidia.com>; Alex Williamson
>> <alex.williamson@redhat.com>; Bjorn Helgaas <bhelgaas@google.com>; David
>> S. Miller <davem@davemloft.net>; Jakub Kicinski <kuba@kernel.org>; Kirti
>> Wankhede <kwankhede@nvidia.com>; kvm@vger.kernel.org;
>> linux-kernel@vger.kernel.org; linux-pci@vger.kernel.org;
>> linux-rdma@vger.kernel.org; netdev@vger.kernel.org; Saeed Mahameed
>> <saeedm@nvidia.com>
>> Subject: Re: [PATCH mlx5-next 2/7] vfio: Add an API to check migration state
>> transition validity
>>
>>
>> On 9/23/2021 2:17 PM, Leon Romanovsky wrote:
>>> On Thu, Sep 23, 2021 at 10:33:10AM +0000, Shameerali Kolothum Thodi
>> wrote:
>>>>
>>>>> -----Original Message-----
>>>>> From: Leon Romanovsky [mailto:leon@kernel.org]
>>>>> Sent: 22 September 2021 11:39
>>>>> To: Doug Ledford <dledford@redhat.com>; Jason Gunthorpe
>> <jgg@nvidia.com>
>>>>> Cc: Yishai Hadas <yishaih@nvidia.com>; Alex Williamson
>>>>> <alex.williamson@redhat.com>; Bjorn Helgaas <bhelgaas@google.com>;
>> David
>>>>> S. Miller <davem@davemloft.net>; Jakub Kicinski <kuba@kernel.org>; Kirti
>>>>> Wankhede <kwankhede@nvidia.com>; kvm@vger.kernel.org;
>>>>> linux-kernel@vger.kernel.org; linux-pci@vger.kernel.org;
>>>>> linux-rdma@vger.kernel.org; netdev@vger.kernel.org; Saeed Mahameed
>>>>> <saeedm@nvidia.com>
>>>>> Subject: [PATCH mlx5-next 2/7] vfio: Add an API to check migration state
>>>>> transition validity
>>>>>
>>>>> From: Yishai Hadas <yishaih@nvidia.com>
>>>>>
>>>>> Add an API in the core layer to check migration state transition validity
>>>>> as part of a migration flow.
>>>>>
>>>>> The valid transitions follow the expected usage as described in
>>>>> uapi/vfio.h and triggered by QEMU.
>>>>>
>>>>> This ensures that all migration implementations follow a consistent
>>>>> migration state machine.
>>>>>
>>>>> Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
>>>>> Reviewed-by: Kirti Wankhede <kwankhede@nvidia.com>
>>>>> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
>>>>> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
>>>>> ---
>>>>>    drivers/vfio/vfio.c  | 41
>> +++++++++++++++++++++++++++++++++++++++++
>>>>>    include/linux/vfio.h |  1 +
>>>>>    2 files changed, 42 insertions(+)
>>>>>
>>>>> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
>>>>> index 3c034fe14ccb..c3ca33e513c8 100644
>>>>> --- a/drivers/vfio/vfio.c
>>>>> +++ b/drivers/vfio/vfio.c
>>>>> @@ -1664,6 +1664,47 @@ static int vfio_device_fops_release(struct
>> inode
>>>>> *inode, struct file *filep)
>>>>>    	return 0;
>>>>>    }
>>>>>
>>>>> +/**
>>>>> + * vfio_change_migration_state_allowed - Checks whether a migration
>> state
>>>>> + *   transition is valid.
>>>>> + * @new_state: The new state to move to.
>>>>> + * @old_state: The old state.
>>>>> + * Return: true if the transition is valid.
>>>>> + */
>>>>> +bool vfio_change_migration_state_allowed(u32 new_state, u32
>> old_state)
>>>>> +{
>>>>> +	enum { MAX_STATE = VFIO_DEVICE_STATE_RESUMING };
>>>>> +	static const u8 vfio_from_state_table[MAX_STATE + 1][MAX_STATE +
>> 1] = {
>>>>> +		[VFIO_DEVICE_STATE_STOP] = {
>>>>> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
>>>>> +			[VFIO_DEVICE_STATE_RESUMING] = 1,
>>>>> +		},
>>>>> +		[VFIO_DEVICE_STATE_RUNNING] = {
>>>>> +			[VFIO_DEVICE_STATE_STOP] = 1,
>>>>> +			[VFIO_DEVICE_STATE_SAVING] = 1,
>>>>> +			[VFIO_DEVICE_STATE_SAVING |
>> VFIO_DEVICE_STATE_RUNNING]
>>>>> = 1,
>>>> Do we need to allow _RESUMING state here or not? As per the "State
>> transitions"
>>>> section from uapi/linux/vfio.h,
>>> It looks like we missed this state transition.
>>>
>>> Thanks
>>
>> I'm not sure this state transition is valid.
>>
>> Kirti, When we would like to move from RUNNING to RESUMING ?
> 
> I guess it depends on what you report as your dev default state.
> 
> For HiSilicon ACC migration driver, we set the default to _RUNNING.
> 
> And when the migration starts, the destination side Qemu, set the
> device state to _RESUMING(vfio_load_state()).
> 
>  From the documentation, it looks like the assumption on default state of
> the VFIO dev is _RUNNING.
> 

That's right. in QEMU VFIO device state at init is running to maintain 
backward compatibility since migration support was added later.

RUNNING -> RESUMING state transition is valid.

Thanks,
Kirti

> "
> *  001b => Device running, which is the default state
> "
> 
>>
>> Sameerali, can you please re-test and update if you see this transition ?
> 
> Yes. And if I change the default state to _STOP, then the transition
> is from _STOP --> _RESUMING.
> 
> But the documentation on State transitions doesn't have _STOP --> _RESUMING
> transition as valid.
> 
> Thanks,
> Shameer
> 
>>
>>
>>>
>>>> " * 4. To start the resuming phase, the device state should be transitioned
>> from
>>>>    *    the _RUNNING to the _RESUMING state."
>>>>
>>>> IIRC, I have seen that transition happening on the destination dev while
>> testing the
>>>> HiSilicon ACC dev migration.
>>>>
>>>> Thanks,
>>>> Shameer
>>>>
>>>>> +		},
>>>>> +		[VFIO_DEVICE_STATE_SAVING] = {
>>>>> +			[VFIO_DEVICE_STATE_STOP] = 1,
>>>>> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
>>>>> +		},
>>>>> +		[VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RUNNING]
>> = {
>>>>> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
>>>>> +			[VFIO_DEVICE_STATE_SAVING] = 1,
>>>>> +		},
>>>>> +		[VFIO_DEVICE_STATE_RESUMING] = {
>>>>> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
>>>>> +			[VFIO_DEVICE_STATE_STOP] = 1,
>>>>> +		},
>>>>> +	};
>>>>> +
>>>>> +	if (new_state > MAX_STATE || old_state > MAX_STATE)
>>>>> +		return false;
>>>>> +
>>>>> +	return vfio_from_state_table[old_state][new_state];
>>>>> +}
>>>>> +EXPORT_SYMBOL_GPL(vfio_change_migration_state_allowed);
>>>>> +
>>>>>    static long vfio_device_fops_unl_ioctl(struct file *filep,
>>>>>    				       unsigned int cmd, unsigned long arg)
>>>>>    {
>>>>> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
>>>>> index b53a9557884a..e65137a708f1 100644
>>>>> --- a/include/linux/vfio.h
>>>>> +++ b/include/linux/vfio.h
>>>>> @@ -83,6 +83,7 @@ extern struct vfio_device
>>>>> *vfio_device_get_from_dev(struct device *dev);
>>>>>    extern void vfio_device_put(struct vfio_device *device);
>>>>>
>>>>>    int vfio_assign_device_set(struct vfio_device *device, void *set_id);
>>>>> +bool vfio_change_migration_state_allowed(u32 new_state, u32
>> old_state);
>>>>>
>>>>>    /* events for the backend driver notify callback */
>>>>>    enum vfio_iommu_notify_type {
>>>>> --
>>>>> 2.31.1
Max Gurtovoy Sept. 26, 2021, 9:09 a.m. UTC | #6
On 9/24/2021 10:44 AM, Shameerali Kolothum Thodi wrote:
>
>> -----Original Message-----
>> From: Max Gurtovoy [mailto:mgurtovoy@nvidia.com]
>> Sent: 23 September 2021 14:56
>> To: Leon Romanovsky <leon@kernel.org>; Shameerali Kolothum Thodi
>> <shameerali.kolothum.thodi@huawei.com>
>> Cc: Doug Ledford <dledford@redhat.com>; Jason Gunthorpe
>> <jgg@nvidia.com>; Yishai Hadas <yishaih@nvidia.com>; Alex Williamson
>> <alex.williamson@redhat.com>; Bjorn Helgaas <bhelgaas@google.com>; David
>> S. Miller <davem@davemloft.net>; Jakub Kicinski <kuba@kernel.org>; Kirti
>> Wankhede <kwankhede@nvidia.com>; kvm@vger.kernel.org;
>> linux-kernel@vger.kernel.org; linux-pci@vger.kernel.org;
>> linux-rdma@vger.kernel.org; netdev@vger.kernel.org; Saeed Mahameed
>> <saeedm@nvidia.com>
>> Subject: Re: [PATCH mlx5-next 2/7] vfio: Add an API to check migration state
>> transition validity
>>
>>
>> On 9/23/2021 2:17 PM, Leon Romanovsky wrote:
>>> On Thu, Sep 23, 2021 at 10:33:10AM +0000, Shameerali Kolothum Thodi
>> wrote:
>>>>> -----Original Message-----
>>>>> From: Leon Romanovsky [mailto:leon@kernel.org]
>>>>> Sent: 22 September 2021 11:39
>>>>> To: Doug Ledford <dledford@redhat.com>; Jason Gunthorpe
>> <jgg@nvidia.com>
>>>>> Cc: Yishai Hadas <yishaih@nvidia.com>; Alex Williamson
>>>>> <alex.williamson@redhat.com>; Bjorn Helgaas <bhelgaas@google.com>;
>> David
>>>>> S. Miller <davem@davemloft.net>; Jakub Kicinski <kuba@kernel.org>; Kirti
>>>>> Wankhede <kwankhede@nvidia.com>; kvm@vger.kernel.org;
>>>>> linux-kernel@vger.kernel.org; linux-pci@vger.kernel.org;
>>>>> linux-rdma@vger.kernel.org; netdev@vger.kernel.org; Saeed Mahameed
>>>>> <saeedm@nvidia.com>
>>>>> Subject: [PATCH mlx5-next 2/7] vfio: Add an API to check migration state
>>>>> transition validity
>>>>>
>>>>> From: Yishai Hadas <yishaih@nvidia.com>
>>>>>
>>>>> Add an API in the core layer to check migration state transition validity
>>>>> as part of a migration flow.
>>>>>
>>>>> The valid transitions follow the expected usage as described in
>>>>> uapi/vfio.h and triggered by QEMU.
>>>>>
>>>>> This ensures that all migration implementations follow a consistent
>>>>> migration state machine.
>>>>>
>>>>> Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
>>>>> Reviewed-by: Kirti Wankhede <kwankhede@nvidia.com>
>>>>> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
>>>>> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
>>>>> ---
>>>>>    drivers/vfio/vfio.c  | 41
>> +++++++++++++++++++++++++++++++++++++++++
>>>>>    include/linux/vfio.h |  1 +
>>>>>    2 files changed, 42 insertions(+)
>>>>>
>>>>> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
>>>>> index 3c034fe14ccb..c3ca33e513c8 100644
>>>>> --- a/drivers/vfio/vfio.c
>>>>> +++ b/drivers/vfio/vfio.c
>>>>> @@ -1664,6 +1664,47 @@ static int vfio_device_fops_release(struct
>> inode
>>>>> *inode, struct file *filep)
>>>>>    	return 0;
>>>>>    }
>>>>>
>>>>> +/**
>>>>> + * vfio_change_migration_state_allowed - Checks whether a migration
>> state
>>>>> + *   transition is valid.
>>>>> + * @new_state: The new state to move to.
>>>>> + * @old_state: The old state.
>>>>> + * Return: true if the transition is valid.
>>>>> + */
>>>>> +bool vfio_change_migration_state_allowed(u32 new_state, u32
>> old_state)
>>>>> +{
>>>>> +	enum { MAX_STATE = VFIO_DEVICE_STATE_RESUMING };
>>>>> +	static const u8 vfio_from_state_table[MAX_STATE + 1][MAX_STATE +
>> 1] = {
>>>>> +		[VFIO_DEVICE_STATE_STOP] = {
>>>>> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
>>>>> +			[VFIO_DEVICE_STATE_RESUMING] = 1,
>>>>> +		},
>>>>> +		[VFIO_DEVICE_STATE_RUNNING] = {
>>>>> +			[VFIO_DEVICE_STATE_STOP] = 1,
>>>>> +			[VFIO_DEVICE_STATE_SAVING] = 1,
>>>>> +			[VFIO_DEVICE_STATE_SAVING |
>> VFIO_DEVICE_STATE_RUNNING]
>>>>> = 1,
>>>> Do we need to allow _RESUMING state here or not? As per the "State
>> transitions"
>>>> section from uapi/linux/vfio.h,
>>> It looks like we missed this state transition.
>>>
>>> Thanks
>> I'm not sure this state transition is valid.
>>
>> Kirti, When we would like to move from RUNNING to RESUMING ?
> I guess it depends on what you report as your dev default state.
>
> For HiSilicon ACC migration driver, we set the default to _RUNNING.

Where do you set it and report it ?


>
> And when the migration starts, the destination side Qemu, set the
> device state to _RESUMING(vfio_load_state()).
>
>  From the documentation, it looks like the assumption on default state of
> the VFIO dev is _RUNNING.
>
> "
> *  001b => Device running, which is the default state
> "
>
>> Sameerali, can you please re-test and update if you see this transition ?
> Yes. And if I change the default state to _STOP, then the transition
> is from _STOP --> _RESUMING.
>
> But the documentation on State transitions doesn't have _STOP --> _RESUMING
> transition as valid.
>
> Thanks,
> Shameer
>
>>
>>>> " * 4. To start the resuming phase, the device state should be transitioned
>> from
>>>>    *    the _RUNNING to the _RESUMING state."
>>>>
>>>> IIRC, I have seen that transition happening on the destination dev while
>> testing the
>>>> HiSilicon ACC dev migration.
>>>>
>>>> Thanks,
>>>> Shameer
>>>>
>>>>> +		},
>>>>> +		[VFIO_DEVICE_STATE_SAVING] = {
>>>>> +			[VFIO_DEVICE_STATE_STOP] = 1,
>>>>> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
>>>>> +		},
>>>>> +		[VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RUNNING]
>> = {
>>>>> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
>>>>> +			[VFIO_DEVICE_STATE_SAVING] = 1,
>>>>> +		},
>>>>> +		[VFIO_DEVICE_STATE_RESUMING] = {
>>>>> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
>>>>> +			[VFIO_DEVICE_STATE_STOP] = 1,
>>>>> +		},
>>>>> +	};
>>>>> +
>>>>> +	if (new_state > MAX_STATE || old_state > MAX_STATE)
>>>>> +		return false;
>>>>> +
>>>>> +	return vfio_from_state_table[old_state][new_state];
>>>>> +}
>>>>> +EXPORT_SYMBOL_GPL(vfio_change_migration_state_allowed);
>>>>> +
>>>>>    static long vfio_device_fops_unl_ioctl(struct file *filep,
>>>>>    				       unsigned int cmd, unsigned long arg)
>>>>>    {
>>>>> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
>>>>> index b53a9557884a..e65137a708f1 100644
>>>>> --- a/include/linux/vfio.h
>>>>> +++ b/include/linux/vfio.h
>>>>> @@ -83,6 +83,7 @@ extern struct vfio_device
>>>>> *vfio_device_get_from_dev(struct device *dev);
>>>>>    extern void vfio_device_put(struct vfio_device *device);
>>>>>
>>>>>    int vfio_assign_device_set(struct vfio_device *device, void *set_id);
>>>>> +bool vfio_change_migration_state_allowed(u32 new_state, u32
>> old_state);
>>>>>    /* events for the backend driver notify callback */
>>>>>    enum vfio_iommu_notify_type {
>>>>> --
>>>>> 2.31.1
Shameer Kolothum Sept. 26, 2021, 4:17 p.m. UTC | #7
> -----Original Message-----
> From: Max Gurtovoy [mailto:mgurtovoy@nvidia.com]
> Sent: 26 September 2021 10:10
> To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>;
> Leon Romanovsky <leon@kernel.org>
> Cc: Doug Ledford <dledford@redhat.com>; Jason Gunthorpe
> <jgg@nvidia.com>; Yishai Hadas <yishaih@nvidia.com>; Alex Williamson
> <alex.williamson@redhat.com>; Bjorn Helgaas <bhelgaas@google.com>; David
> S. Miller <davem@davemloft.net>; Jakub Kicinski <kuba@kernel.org>; Kirti
> Wankhede <kwankhede@nvidia.com>; kvm@vger.kernel.org;
> linux-kernel@vger.kernel.org; linux-pci@vger.kernel.org;
> linux-rdma@vger.kernel.org; netdev@vger.kernel.org; Saeed Mahameed
> <saeedm@nvidia.com>; liulongfang <liulongfang@huawei.com>
> Subject: Re: [PATCH mlx5-next 2/7] vfio: Add an API to check migration state
> transition validity
> 
> 
> On 9/24/2021 10:44 AM, Shameerali Kolothum Thodi wrote:
> >
> >> -----Original Message-----
> >> From: Max Gurtovoy [mailto:mgurtovoy@nvidia.com]
> >> Sent: 23 September 2021 14:56
> >> To: Leon Romanovsky <leon@kernel.org>; Shameerali Kolothum Thodi
> >> <shameerali.kolothum.thodi@huawei.com>
> >> Cc: Doug Ledford <dledford@redhat.com>; Jason Gunthorpe
> >> <jgg@nvidia.com>; Yishai Hadas <yishaih@nvidia.com>; Alex Williamson
> >> <alex.williamson@redhat.com>; Bjorn Helgaas <bhelgaas@google.com>;
> >> David S. Miller <davem@davemloft.net>; Jakub Kicinski
> >> <kuba@kernel.org>; Kirti Wankhede <kwankhede@nvidia.com>;
> >> kvm@vger.kernel.org; linux-kernel@vger.kernel.org;
> >> linux-pci@vger.kernel.org; linux-rdma@vger.kernel.org;
> >> netdev@vger.kernel.org; Saeed Mahameed <saeedm@nvidia.com>
> >> Subject: Re: [PATCH mlx5-next 2/7] vfio: Add an API to check
> >> migration state transition validity
> >>
> >>
> >> On 9/23/2021 2:17 PM, Leon Romanovsky wrote:
> >>> On Thu, Sep 23, 2021 at 10:33:10AM +0000, Shameerali Kolothum Thodi
> >> wrote:
> >>>>> -----Original Message-----
> >>>>> From: Leon Romanovsky [mailto:leon@kernel.org]
> >>>>> Sent: 22 September 2021 11:39
> >>>>> To: Doug Ledford <dledford@redhat.com>; Jason Gunthorpe
> >> <jgg@nvidia.com>
> >>>>> Cc: Yishai Hadas <yishaih@nvidia.com>; Alex Williamson
> >>>>> <alex.williamson@redhat.com>; Bjorn Helgaas <bhelgaas@google.com>;
> >> David
> >>>>> S. Miller <davem@davemloft.net>; Jakub Kicinski <kuba@kernel.org>;
> >>>>> Kirti Wankhede <kwankhede@nvidia.com>; kvm@vger.kernel.org;
> >>>>> linux-kernel@vger.kernel.org; linux-pci@vger.kernel.org;
> >>>>> linux-rdma@vger.kernel.org; netdev@vger.kernel.org; Saeed Mahameed
> >>>>> <saeedm@nvidia.com>
> >>>>> Subject: [PATCH mlx5-next 2/7] vfio: Add an API to check migration
> >>>>> state transition validity
> >>>>>
> >>>>> From: Yishai Hadas <yishaih@nvidia.com>
> >>>>>
> >>>>> Add an API in the core layer to check migration state transition
> >>>>> validity as part of a migration flow.
> >>>>>
> >>>>> The valid transitions follow the expected usage as described in
> >>>>> uapi/vfio.h and triggered by QEMU.
> >>>>>
> >>>>> This ensures that all migration implementations follow a
> >>>>> consistent migration state machine.
> >>>>>
> >>>>> Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
> >>>>> Reviewed-by: Kirti Wankhede <kwankhede@nvidia.com>
> >>>>> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> >>>>> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> >>>>> ---
> >>>>>    drivers/vfio/vfio.c  | 41
> >> +++++++++++++++++++++++++++++++++++++++++
> >>>>>    include/linux/vfio.h |  1 +
> >>>>>    2 files changed, 42 insertions(+)
> >>>>>
> >>>>> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index
> >>>>> 3c034fe14ccb..c3ca33e513c8 100644
> >>>>> --- a/drivers/vfio/vfio.c
> >>>>> +++ b/drivers/vfio/vfio.c
> >>>>> @@ -1664,6 +1664,47 @@ static int vfio_device_fops_release(struct
> >> inode
> >>>>> *inode, struct file *filep)
> >>>>>    	return 0;
> >>>>>    }
> >>>>>
> >>>>> +/**
> >>>>> + * vfio_change_migration_state_allowed - Checks whether a
> >>>>> +migration
> >> state
> >>>>> + *   transition is valid.
> >>>>> + * @new_state: The new state to move to.
> >>>>> + * @old_state: The old state.
> >>>>> + * Return: true if the transition is valid.
> >>>>> + */
> >>>>> +bool vfio_change_migration_state_allowed(u32 new_state, u32
> >> old_state)
> >>>>> +{
> >>>>> +	enum { MAX_STATE = VFIO_DEVICE_STATE_RESUMING };
> >>>>> +	static const u8 vfio_from_state_table[MAX_STATE + 1][MAX_STATE +
> >> 1] = {
> >>>>> +		[VFIO_DEVICE_STATE_STOP] = {
> >>>>> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
> >>>>> +			[VFIO_DEVICE_STATE_RESUMING] = 1,
> >>>>> +		},
> >>>>> +		[VFIO_DEVICE_STATE_RUNNING] = {
> >>>>> +			[VFIO_DEVICE_STATE_STOP] = 1,
> >>>>> +			[VFIO_DEVICE_STATE_SAVING] = 1,
> >>>>> +			[VFIO_DEVICE_STATE_SAVING |
> >> VFIO_DEVICE_STATE_RUNNING]
> >>>>> = 1,
> >>>> Do we need to allow _RESUMING state here or not? As per the "State
> >> transitions"
> >>>> section from uapi/linux/vfio.h,
> >>> It looks like we missed this state transition.
> >>>
> >>> Thanks
> >> I'm not sure this state transition is valid.
> >>
> >> Kirti, When we would like to move from RUNNING to RESUMING ?
> > I guess it depends on what you report as your dev default state.
> >
> > For HiSilicon ACC migration driver, we set the default to _RUNNING.
> 
> Where do you set it and report it ?

Currently, in _open_device() we set the device_state to _RUNNING.

I think in your case the default of vmig->vfio_dev_state == 0 (_STOP).

> 
> >
> > And when the migration starts, the destination side Qemu, set the
> > device state to _RESUMING(vfio_load_state()).
> >
> >  From the documentation, it looks like the assumption on default state
> > of the VFIO dev is _RUNNING.
> >
> > "
> > *  001b => Device running, which is the default state "
> >
> >> Sameerali, can you please re-test and update if you see this transition ?
> > Yes. And if I change the default state to _STOP, then the transition
> > is from _STOP --> _RESUMING.
> >
> > But the documentation on State transitions doesn't have _STOP -->
> > _RESUMING transition as valid.
> >
> > Thanks,
> > Shameer
> >
> >>
> >>>> " * 4. To start the resuming phase, the device state should be
> >>>> transitioned
> >> from
> >>>>    *    the _RUNNING to the _RESUMING state."
> >>>>
> >>>> IIRC, I have seen that transition happening on the destination dev
> >>>> while
> >> testing the
> >>>> HiSilicon ACC dev migration.
> >>>>
> >>>> Thanks,
> >>>> Shameer
> >>>>
> >>>>> +		},
> >>>>> +		[VFIO_DEVICE_STATE_SAVING] = {
> >>>>> +			[VFIO_DEVICE_STATE_STOP] = 1,
> >>>>> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
> >>>>> +		},
> >>>>> +		[VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RUNNING]
> >> = {
> >>>>> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
> >>>>> +			[VFIO_DEVICE_STATE_SAVING] = 1,
> >>>>> +		},
> >>>>> +		[VFIO_DEVICE_STATE_RESUMING] = {
> >>>>> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
> >>>>> +			[VFIO_DEVICE_STATE_STOP] = 1,
> >>>>> +		},
> >>>>> +	};
> >>>>> +
> >>>>> +	if (new_state > MAX_STATE || old_state > MAX_STATE)
> >>>>> +		return false;
> >>>>> +
> >>>>> +	return vfio_from_state_table[old_state][new_state];
> >>>>> +}
> >>>>> +EXPORT_SYMBOL_GPL(vfio_change_migration_state_allowed);
> >>>>> +
> >>>>>    static long vfio_device_fops_unl_ioctl(struct file *filep,
> >>>>>    				       unsigned int cmd, unsigned long arg)
> >>>>>    {
> >>>>> diff --git a/include/linux/vfio.h b/include/linux/vfio.h index
> >>>>> b53a9557884a..e65137a708f1 100644
> >>>>> --- a/include/linux/vfio.h
> >>>>> +++ b/include/linux/vfio.h
> >>>>> @@ -83,6 +83,7 @@ extern struct vfio_device
> >>>>> *vfio_device_get_from_dev(struct device *dev);
> >>>>>    extern void vfio_device_put(struct vfio_device *device);
> >>>>>
> >>>>>    int vfio_assign_device_set(struct vfio_device *device, void
> >>>>> *set_id);
> >>>>> +bool vfio_change_migration_state_allowed(u32 new_state, u32
> >> old_state);
> >>>>>    /* events for the backend driver notify callback */
> >>>>>    enum vfio_iommu_notify_type {
> >>>>> --
> >>>>> 2.31.1
Max Gurtovoy Sept. 27, 2021, 6:24 p.m. UTC | #8
On 9/26/2021 7:17 PM, Shameerali Kolothum Thodi wrote:
>
>> -----Original Message-----
>> From: Max Gurtovoy [mailto:mgurtovoy@nvidia.com]
>> Sent: 26 September 2021 10:10
>> To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>;
>> Leon Romanovsky <leon@kernel.org>
>> Cc: Doug Ledford <dledford@redhat.com>; Jason Gunthorpe
>> <jgg@nvidia.com>; Yishai Hadas <yishaih@nvidia.com>; Alex Williamson
>> <alex.williamson@redhat.com>; Bjorn Helgaas <bhelgaas@google.com>; David
>> S. Miller <davem@davemloft.net>; Jakub Kicinski <kuba@kernel.org>; Kirti
>> Wankhede <kwankhede@nvidia.com>; kvm@vger.kernel.org;
>> linux-kernel@vger.kernel.org; linux-pci@vger.kernel.org;
>> linux-rdma@vger.kernel.org; netdev@vger.kernel.org; Saeed Mahameed
>> <saeedm@nvidia.com>; liulongfang <liulongfang@huawei.com>
>> Subject: Re: [PATCH mlx5-next 2/7] vfio: Add an API to check migration state
>> transition validity
>>
>>
>> On 9/24/2021 10:44 AM, Shameerali Kolothum Thodi wrote:
>>>> -----Original Message-----
>>>> From: Max Gurtovoy [mailto:mgurtovoy@nvidia.com]
>>>> Sent: 23 September 2021 14:56
>>>> To: Leon Romanovsky <leon@kernel.org>; Shameerali Kolothum Thodi
>>>> <shameerali.kolothum.thodi@huawei.com>
>>>> Cc: Doug Ledford <dledford@redhat.com>; Jason Gunthorpe
>>>> <jgg@nvidia.com>; Yishai Hadas <yishaih@nvidia.com>; Alex Williamson
>>>> <alex.williamson@redhat.com>; Bjorn Helgaas <bhelgaas@google.com>;
>>>> David S. Miller <davem@davemloft.net>; Jakub Kicinski
>>>> <kuba@kernel.org>; Kirti Wankhede <kwankhede@nvidia.com>;
>>>> kvm@vger.kernel.org; linux-kernel@vger.kernel.org;
>>>> linux-pci@vger.kernel.org; linux-rdma@vger.kernel.org;
>>>> netdev@vger.kernel.org; Saeed Mahameed <saeedm@nvidia.com>
>>>> Subject: Re: [PATCH mlx5-next 2/7] vfio: Add an API to check
>>>> migration state transition validity
>>>>
>>>>
>>>> On 9/23/2021 2:17 PM, Leon Romanovsky wrote:
>>>>> On Thu, Sep 23, 2021 at 10:33:10AM +0000, Shameerali Kolothum Thodi
>>>> wrote:
>>>>>>> -----Original Message-----
>>>>>>> From: Leon Romanovsky [mailto:leon@kernel.org]
>>>>>>> Sent: 22 September 2021 11:39
>>>>>>> To: Doug Ledford <dledford@redhat.com>; Jason Gunthorpe
>>>> <jgg@nvidia.com>
>>>>>>> Cc: Yishai Hadas <yishaih@nvidia.com>; Alex Williamson
>>>>>>> <alex.williamson@redhat.com>; Bjorn Helgaas <bhelgaas@google.com>;
>>>> David
>>>>>>> S. Miller <davem@davemloft.net>; Jakub Kicinski <kuba@kernel.org>;
>>>>>>> Kirti Wankhede <kwankhede@nvidia.com>; kvm@vger.kernel.org;
>>>>>>> linux-kernel@vger.kernel.org; linux-pci@vger.kernel.org;
>>>>>>> linux-rdma@vger.kernel.org; netdev@vger.kernel.org; Saeed Mahameed
>>>>>>> <saeedm@nvidia.com>
>>>>>>> Subject: [PATCH mlx5-next 2/7] vfio: Add an API to check migration
>>>>>>> state transition validity
>>>>>>>
>>>>>>> From: Yishai Hadas <yishaih@nvidia.com>
>>>>>>>
>>>>>>> Add an API in the core layer to check migration state transition
>>>>>>> validity as part of a migration flow.
>>>>>>>
>>>>>>> The valid transitions follow the expected usage as described in
>>>>>>> uapi/vfio.h and triggered by QEMU.
>>>>>>>
>>>>>>> This ensures that all migration implementations follow a
>>>>>>> consistent migration state machine.
>>>>>>>
>>>>>>> Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
>>>>>>> Reviewed-by: Kirti Wankhede <kwankhede@nvidia.com>
>>>>>>> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
>>>>>>> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
>>>>>>> ---
>>>>>>>     drivers/vfio/vfio.c  | 41
>>>> +++++++++++++++++++++++++++++++++++++++++
>>>>>>>     include/linux/vfio.h |  1 +
>>>>>>>     2 files changed, 42 insertions(+)
>>>>>>>
>>>>>>> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index
>>>>>>> 3c034fe14ccb..c3ca33e513c8 100644
>>>>>>> --- a/drivers/vfio/vfio.c
>>>>>>> +++ b/drivers/vfio/vfio.c
>>>>>>> @@ -1664,6 +1664,47 @@ static int vfio_device_fops_release(struct
>>>> inode
>>>>>>> *inode, struct file *filep)
>>>>>>>     	return 0;
>>>>>>>     }
>>>>>>>
>>>>>>> +/**
>>>>>>> + * vfio_change_migration_state_allowed - Checks whether a
>>>>>>> +migration
>>>> state
>>>>>>> + *   transition is valid.
>>>>>>> + * @new_state: The new state to move to.
>>>>>>> + * @old_state: The old state.
>>>>>>> + * Return: true if the transition is valid.
>>>>>>> + */
>>>>>>> +bool vfio_change_migration_state_allowed(u32 new_state, u32
>>>> old_state)
>>>>>>> +{
>>>>>>> +	enum { MAX_STATE = VFIO_DEVICE_STATE_RESUMING };
>>>>>>> +	static const u8 vfio_from_state_table[MAX_STATE + 1][MAX_STATE +
>>>> 1] = {
>>>>>>> +		[VFIO_DEVICE_STATE_STOP] = {
>>>>>>> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
>>>>>>> +			[VFIO_DEVICE_STATE_RESUMING] = 1,
>>>>>>> +		},
>>>>>>> +		[VFIO_DEVICE_STATE_RUNNING] = {
>>>>>>> +			[VFIO_DEVICE_STATE_STOP] = 1,
>>>>>>> +			[VFIO_DEVICE_STATE_SAVING] = 1,
>>>>>>> +			[VFIO_DEVICE_STATE_SAVING |
>>>> VFIO_DEVICE_STATE_RUNNING]
>>>>>>> = 1,
>>>>>> Do we need to allow _RESUMING state here or not? As per the "State
>>>> transitions"
>>>>>> section from uapi/linux/vfio.h,
>>>>> It looks like we missed this state transition.
>>>>>
>>>>> Thanks
>>>> I'm not sure this state transition is valid.
>>>>
>>>> Kirti, When we would like to move from RUNNING to RESUMING ?
>>> I guess it depends on what you report as your dev default state.
>>>
>>> For HiSilicon ACC migration driver, we set the default to _RUNNING.
>> Where do you set it and report it ?
> Currently, in _open_device() we set the device_state to _RUNNING.

Why do you do it ?

>
> I think in your case the default of vmig->vfio_dev_state == 0 (_STOP).
>
>>> And when the migration starts, the destination side Qemu, set the
>>> device state to _RESUMING(vfio_load_state()).
>>>
>>>   From the documentation, it looks like the assumption on default state
>>> of the VFIO dev is _RUNNING.
>>>
>>> "
>>> *  001b => Device running, which is the default state "
>>>
>>>> Sameerali, can you please re-test and update if you see this transition ?
>>> Yes. And if I change the default state to _STOP, then the transition
>>> is from _STOP --> _RESUMING.
>>>
>>> But the documentation on State transitions doesn't have _STOP -->
>>> _RESUMING transition as valid.
>>>
>>> Thanks,
>>> Shameer
>>>
>>>>>> " * 4. To start the resuming phase, the device state should be
>>>>>> transitioned
>>>> from
>>>>>>     *    the _RUNNING to the _RESUMING state."
>>>>>>
>>>>>> IIRC, I have seen that transition happening on the destination dev
>>>>>> while
>>>> testing the
>>>>>> HiSilicon ACC dev migration.
>>>>>>
>>>>>> Thanks,
>>>>>> Shameer
>>>>>>
>>>>>>> +		},
>>>>>>> +		[VFIO_DEVICE_STATE_SAVING] = {
>>>>>>> +			[VFIO_DEVICE_STATE_STOP] = 1,
>>>>>>> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
>>>>>>> +		},
>>>>>>> +		[VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RUNNING]
>>>> = {
>>>>>>> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
>>>>>>> +			[VFIO_DEVICE_STATE_SAVING] = 1,
>>>>>>> +		},
>>>>>>> +		[VFIO_DEVICE_STATE_RESUMING] = {
>>>>>>> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
>>>>>>> +			[VFIO_DEVICE_STATE_STOP] = 1,
>>>>>>> +		},
>>>>>>> +	};
>>>>>>> +
>>>>>>> +	if (new_state > MAX_STATE || old_state > MAX_STATE)
>>>>>>> +		return false;
>>>>>>> +
>>>>>>> +	return vfio_from_state_table[old_state][new_state];
>>>>>>> +}
>>>>>>> +EXPORT_SYMBOL_GPL(vfio_change_migration_state_allowed);
>>>>>>> +
>>>>>>>     static long vfio_device_fops_unl_ioctl(struct file *filep,
>>>>>>>     				       unsigned int cmd, unsigned long arg)
>>>>>>>     {
>>>>>>> diff --git a/include/linux/vfio.h b/include/linux/vfio.h index
>>>>>>> b53a9557884a..e65137a708f1 100644
>>>>>>> --- a/include/linux/vfio.h
>>>>>>> +++ b/include/linux/vfio.h
>>>>>>> @@ -83,6 +83,7 @@ extern struct vfio_device
>>>>>>> *vfio_device_get_from_dev(struct device *dev);
>>>>>>>     extern void vfio_device_put(struct vfio_device *device);
>>>>>>>
>>>>>>>     int vfio_assign_device_set(struct vfio_device *device, void
>>>>>>> *set_id);
>>>>>>> +bool vfio_change_migration_state_allowed(u32 new_state, u32
>>>> old_state);
>>>>>>>     /* events for the backend driver notify callback */
>>>>>>>     enum vfio_iommu_notify_type {
>>>>>>> --
>>>>>>> 2.31.1
Shameer Kolothum Sept. 27, 2021, 6:29 p.m. UTC | #9
> -----Original Message-----
> From: Max Gurtovoy [mailto:mgurtovoy@nvidia.com]
> Sent: 27 September 2021 19:24
> To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>;
> Leon Romanovsky <leon@kernel.org>
> Cc: Doug Ledford <dledford@redhat.com>; Jason Gunthorpe
> <jgg@nvidia.com>; Yishai Hadas <yishaih@nvidia.com>; Alex Williamson
> <alex.williamson@redhat.com>; Bjorn Helgaas <bhelgaas@google.com>; David
> S. Miller <davem@davemloft.net>; Jakub Kicinski <kuba@kernel.org>; Kirti
> Wankhede <kwankhede@nvidia.com>; kvm@vger.kernel.org;
> linux-kernel@vger.kernel.org; linux-pci@vger.kernel.org;
> linux-rdma@vger.kernel.org; netdev@vger.kernel.org; Saeed Mahameed
> <saeedm@nvidia.com>; liulongfang <liulongfang@huawei.com>
> Subject: Re: [PATCH mlx5-next 2/7] vfio: Add an API to check migration state
> transition validity
> 
> 
> On 9/26/2021 7:17 PM, Shameerali Kolothum Thodi wrote:
> >
> >> -----Original Message-----
> >> From: Max Gurtovoy [mailto:mgurtovoy@nvidia.com]
> >> Sent: 26 September 2021 10:10
> >> To: Shameerali Kolothum Thodi
> <shameerali.kolothum.thodi@huawei.com>;
> >> Leon Romanovsky <leon@kernel.org>
> >> Cc: Doug Ledford <dledford@redhat.com>; Jason Gunthorpe
> >> <jgg@nvidia.com>; Yishai Hadas <yishaih@nvidia.com>; Alex Williamson
> >> <alex.williamson@redhat.com>; Bjorn Helgaas <bhelgaas@google.com>;
> >> David S. Miller <davem@davemloft.net>; Jakub Kicinski
> >> <kuba@kernel.org>; Kirti Wankhede <kwankhede@nvidia.com>;
> >> kvm@vger.kernel.org; linux-kernel@vger.kernel.org;
> >> linux-pci@vger.kernel.org; linux-rdma@vger.kernel.org;
> >> netdev@vger.kernel.org; Saeed Mahameed <saeedm@nvidia.com>;
> >> liulongfang <liulongfang@huawei.com>
> >> Subject: Re: [PATCH mlx5-next 2/7] vfio: Add an API to check
> >> migration state transition validity
> >>
> >>
> >> On 9/24/2021 10:44 AM, Shameerali Kolothum Thodi wrote:
> >>>> -----Original Message-----
> >>>> From: Max Gurtovoy [mailto:mgurtovoy@nvidia.com]
> >>>> Sent: 23 September 2021 14:56
> >>>> To: Leon Romanovsky <leon@kernel.org>; Shameerali Kolothum Thodi
> >>>> <shameerali.kolothum.thodi@huawei.com>
> >>>> Cc: Doug Ledford <dledford@redhat.com>; Jason Gunthorpe
> >>>> <jgg@nvidia.com>; Yishai Hadas <yishaih@nvidia.com>; Alex
> >>>> Williamson <alex.williamson@redhat.com>; Bjorn Helgaas
> >>>> <bhelgaas@google.com>; David S. Miller <davem@davemloft.net>;
> Jakub
> >>>> Kicinski <kuba@kernel.org>; Kirti Wankhede <kwankhede@nvidia.com>;
> >>>> kvm@vger.kernel.org; linux-kernel@vger.kernel.org;
> >>>> linux-pci@vger.kernel.org; linux-rdma@vger.kernel.org;
> >>>> netdev@vger.kernel.org; Saeed Mahameed <saeedm@nvidia.com>
> >>>> Subject: Re: [PATCH mlx5-next 2/7] vfio: Add an API to check
> >>>> migration state transition validity
> >>>>
> >>>>
> >>>> On 9/23/2021 2:17 PM, Leon Romanovsky wrote:
> >>>>> On Thu, Sep 23, 2021 at 10:33:10AM +0000, Shameerali Kolothum
> >>>>> Thodi
> >>>> wrote:
> >>>>>>> -----Original Message-----
> >>>>>>> From: Leon Romanovsky [mailto:leon@kernel.org]
> >>>>>>> Sent: 22 September 2021 11:39
> >>>>>>> To: Doug Ledford <dledford@redhat.com>; Jason Gunthorpe
> >>>> <jgg@nvidia.com>
> >>>>>>> Cc: Yishai Hadas <yishaih@nvidia.com>; Alex Williamson
> >>>>>>> <alex.williamson@redhat.com>; Bjorn Helgaas
> >>>>>>> <bhelgaas@google.com>;
> >>>> David
> >>>>>>> S. Miller <davem@davemloft.net>; Jakub Kicinski
> >>>>>>> <kuba@kernel.org>; Kirti Wankhede <kwankhede@nvidia.com>;
> >>>>>>> kvm@vger.kernel.org; linux-kernel@vger.kernel.org;
> >>>>>>> linux-pci@vger.kernel.org; linux-rdma@vger.kernel.org;
> >>>>>>> netdev@vger.kernel.org; Saeed Mahameed <saeedm@nvidia.com>
> >>>>>>> Subject: [PATCH mlx5-next 2/7] vfio: Add an API to check
> >>>>>>> migration state transition validity
> >>>>>>>
> >>>>>>> From: Yishai Hadas <yishaih@nvidia.com>
> >>>>>>>
> >>>>>>> Add an API in the core layer to check migration state transition
> >>>>>>> validity as part of a migration flow.
> >>>>>>>
> >>>>>>> The valid transitions follow the expected usage as described in
> >>>>>>> uapi/vfio.h and triggered by QEMU.
> >>>>>>>
> >>>>>>> This ensures that all migration implementations follow a
> >>>>>>> consistent migration state machine.
> >>>>>>>
> >>>>>>> Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
> >>>>>>> Reviewed-by: Kirti Wankhede <kwankhede@nvidia.com>
> >>>>>>> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> >>>>>>> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> >>>>>>> ---
> >>>>>>>     drivers/vfio/vfio.c  | 41
> >>>> +++++++++++++++++++++++++++++++++++++++++
> >>>>>>>     include/linux/vfio.h |  1 +
> >>>>>>>     2 files changed, 42 insertions(+)
> >>>>>>>
> >>>>>>> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index
> >>>>>>> 3c034fe14ccb..c3ca33e513c8 100644
> >>>>>>> --- a/drivers/vfio/vfio.c
> >>>>>>> +++ b/drivers/vfio/vfio.c
> >>>>>>> @@ -1664,6 +1664,47 @@ static int
> >>>>>>> vfio_device_fops_release(struct
> >>>> inode
> >>>>>>> *inode, struct file *filep)
> >>>>>>>     	return 0;
> >>>>>>>     }
> >>>>>>>
> >>>>>>> +/**
> >>>>>>> + * vfio_change_migration_state_allowed - Checks whether a
> >>>>>>> +migration
> >>>> state
> >>>>>>> + *   transition is valid.
> >>>>>>> + * @new_state: The new state to move to.
> >>>>>>> + * @old_state: The old state.
> >>>>>>> + * Return: true if the transition is valid.
> >>>>>>> + */
> >>>>>>> +bool vfio_change_migration_state_allowed(u32 new_state, u32
> >>>> old_state)
> >>>>>>> +{
> >>>>>>> +	enum { MAX_STATE = VFIO_DEVICE_STATE_RESUMING };
> >>>>>>> +	static const u8 vfio_from_state_table[MAX_STATE +
> 1][MAX_STATE
> >>>>>>> ++
> >>>> 1] = {
> >>>>>>> +		[VFIO_DEVICE_STATE_STOP] = {
> >>>>>>> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
> >>>>>>> +			[VFIO_DEVICE_STATE_RESUMING] = 1,
> >>>>>>> +		},
> >>>>>>> +		[VFIO_DEVICE_STATE_RUNNING] = {
> >>>>>>> +			[VFIO_DEVICE_STATE_STOP] = 1,
> >>>>>>> +			[VFIO_DEVICE_STATE_SAVING] = 1,
> >>>>>>> +			[VFIO_DEVICE_STATE_SAVING |
> >>>> VFIO_DEVICE_STATE_RUNNING]
> >>>>>>> = 1,
> >>>>>> Do we need to allow _RESUMING state here or not? As per the
> >>>>>> "State
> >>>> transitions"
> >>>>>> section from uapi/linux/vfio.h,
> >>>>> It looks like we missed this state transition.
> >>>>>
> >>>>> Thanks
> >>>> I'm not sure this state transition is valid.
> >>>>
> >>>> Kirti, When we would like to move from RUNNING to RESUMING ?
> >>> I guess it depends on what you report as your dev default state.
> >>>
> >>> For HiSilicon ACC migration driver, we set the default to _RUNNING.
> >> Where do you set it and report it ?
> > Currently, in _open_device() we set the device_state to _RUNNING.
> 
> Why do you do it ?

It is by the assumption that the default state to be _RUNNING and then
we take it from there for migration state changes.

Any particular reason why it needs to be in _STOP state? We need to update
the documentation in that case to allow _STOP --> _RESUMING.

> 
> >
> > I think in your case the default of vmig->vfio_dev_state == 0 (_STOP).
> >
> >>> And when the migration starts, the destination side Qemu, set the
> >>> device state to _RESUMING(vfio_load_state()).
> >>>
> >>>   From the documentation, it looks like the assumption on default
> >>> state of the VFIO dev is _RUNNING.
> >>>
> >>> "
> >>> *  001b => Device running, which is the default state "
> >>>
> >>>> Sameerali, can you please re-test and update if you see this transition ?
> >>> Yes. And if I change the default state to _STOP, then the transition
> >>> is from _STOP --> _RESUMING.
> >>>
> >>> But the documentation on State transitions doesn't have _STOP -->
> >>> _RESUMING transition as valid.
> >>>
> >>> Thanks,
> >>> Shameer
> >>>
> >>>>>> " * 4. To start the resuming phase, the device state should be
> >>>>>> transitioned
> >>>> from
> >>>>>>     *    the _RUNNING to the _RESUMING state."
> >>>>>>
> >>>>>> IIRC, I have seen that transition happening on the destination
> >>>>>> dev while
> >>>> testing the
> >>>>>> HiSilicon ACC dev migration.
> >>>>>>
> >>>>>> Thanks,
> >>>>>> Shameer
> >>>>>>
> >>>>>>> +		},
> >>>>>>> +		[VFIO_DEVICE_STATE_SAVING] = {
> >>>>>>> +			[VFIO_DEVICE_STATE_STOP] = 1,
> >>>>>>> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
> >>>>>>> +		},
> >>>>>>> +		[VFIO_DEVICE_STATE_SAVING |
> VFIO_DEVICE_STATE_RUNNING]
> >>>> = {
> >>>>>>> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
> >>>>>>> +			[VFIO_DEVICE_STATE_SAVING] = 1,
> >>>>>>> +		},
> >>>>>>> +		[VFIO_DEVICE_STATE_RESUMING] = {
> >>>>>>> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
> >>>>>>> +			[VFIO_DEVICE_STATE_STOP] = 1,
> >>>>>>> +		},
> >>>>>>> +	};
> >>>>>>> +
> >>>>>>> +	if (new_state > MAX_STATE || old_state > MAX_STATE)
> >>>>>>> +		return false;
> >>>>>>> +
> >>>>>>> +	return vfio_from_state_table[old_state][new_state];
> >>>>>>> +}
> >>>>>>> +EXPORT_SYMBOL_GPL(vfio_change_migration_state_allowed);
> >>>>>>> +
> >>>>>>>     static long vfio_device_fops_unl_ioctl(struct file *filep,
> >>>>>>>     				       unsigned int cmd, unsigned long arg)
> >>>>>>>     {
> >>>>>>> diff --git a/include/linux/vfio.h b/include/linux/vfio.h index
> >>>>>>> b53a9557884a..e65137a708f1 100644
> >>>>>>> --- a/include/linux/vfio.h
> >>>>>>> +++ b/include/linux/vfio.h
> >>>>>>> @@ -83,6 +83,7 @@ extern struct vfio_device
> >>>>>>> *vfio_device_get_from_dev(struct device *dev);
> >>>>>>>     extern void vfio_device_put(struct vfio_device *device);
> >>>>>>>
> >>>>>>>     int vfio_assign_device_set(struct vfio_device *device, void
> >>>>>>> *set_id);
> >>>>>>> +bool vfio_change_migration_state_allowed(u32 new_state, u32
> >>>> old_state);
> >>>>>>>     /* events for the backend driver notify callback */
> >>>>>>>     enum vfio_iommu_notify_type {
> >>>>>>> --
> >>>>>>> 2.31.1
Alex Williamson Sept. 27, 2021, 10:46 p.m. UTC | #10
On Wed, 22 Sep 2021 13:38:51 +0300
Leon Romanovsky <leon@kernel.org> wrote:

> From: Yishai Hadas <yishaih@nvidia.com>
> 
> Add an API in the core layer to check migration state transition validity
> as part of a migration flow.
> 
> The valid transitions follow the expected usage as described in
> uapi/vfio.h and triggered by QEMU.
> 
> This ensures that all migration implementations follow a consistent
> migration state machine.
> 
> Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
> Reviewed-by: Kirti Wankhede <kwankhede@nvidia.com>
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> ---
>  drivers/vfio/vfio.c  | 41 +++++++++++++++++++++++++++++++++++++++++
>  include/linux/vfio.h |  1 +
>  2 files changed, 42 insertions(+)
> 
> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> index 3c034fe14ccb..c3ca33e513c8 100644
> --- a/drivers/vfio/vfio.c
> +++ b/drivers/vfio/vfio.c
> @@ -1664,6 +1664,47 @@ static int vfio_device_fops_release(struct inode *inode, struct file *filep)
>  	return 0;
>  }
>  
> +/**
> + * vfio_change_migration_state_allowed - Checks whether a migration state
> + *   transition is valid.
> + * @new_state: The new state to move to.
> + * @old_state: The old state.
> + * Return: true if the transition is valid.
> + */
> +bool vfio_change_migration_state_allowed(u32 new_state, u32 old_state)
> +{
> +	enum { MAX_STATE = VFIO_DEVICE_STATE_RESUMING };
> +	static const u8 vfio_from_state_table[MAX_STATE + 1][MAX_STATE + 1] = {
> +		[VFIO_DEVICE_STATE_STOP] = {
> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
> +			[VFIO_DEVICE_STATE_RESUMING] = 1,
> +		},

Our state transition diagram is pretty weak on reachable transitions
out of the _STOP state, why do we select only these two as valid?

Consistent behavior to userspace is of course nice, but I wonder if we
were expecting a device reset to get us back to _RUNNING, or if the
drivers would make use of the protocol through which a driver can nak
(write error, no state change) or fault (_ERROR device state) a state
change.

There does need to be a way to get back to _RUNNING to support a
migration failure without a reset, but would that be from _SAVING
or from _STOP and what's our rationale for the excluded states?

I'll see if I can dig through emails to find what was intended to be
reachable from _STOP.  Kirti or Connie, do you recall?

Also, I think the _ERROR state is implicitly handled correctly here,
its value is >MAX_STATE so we can't transition into or out of it, but a
comment to indicate that it's been considered for this would be nice.

> +		[VFIO_DEVICE_STATE_RUNNING] = {
> +			[VFIO_DEVICE_STATE_STOP] = 1,
> +			[VFIO_DEVICE_STATE_SAVING] = 1,
> +			[VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RUNNING] = 1,
> +		},

Shameer's comment is correct here, _RESUMING is a valid next state
since the default state is _RUNNING.

> +		[VFIO_DEVICE_STATE_SAVING] = {
> +			[VFIO_DEVICE_STATE_STOP] = 1,
> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
> +		},

What's the rationale that we can't return to _SAVING|_RUNNING here?

> +		[VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RUNNING] = {
> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
> +			[VFIO_DEVICE_STATE_SAVING] = 1,
> +		},

Can't we always _STOP the device at any point?

> +		[VFIO_DEVICE_STATE_RESUMING] = {
> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
> +			[VFIO_DEVICE_STATE_STOP] = 1,
> +		},

Couldn't it be possible to switch immediately to _RUNNING|_SAVING for
tracing purposes?  Or _SAVING, perhaps to validate the restored state
without starting the device?  Thanks,

Alex

> +	};
> +
> +	if (new_state > MAX_STATE || old_state > MAX_STATE)
> +		return false;
> +
> +	return vfio_from_state_table[old_state][new_state];
> +}
> +EXPORT_SYMBOL_GPL(vfio_change_migration_state_allowed);
> +
>  static long vfio_device_fops_unl_ioctl(struct file *filep,
>  				       unsigned int cmd, unsigned long arg)
>  {
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index b53a9557884a..e65137a708f1 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -83,6 +83,7 @@ extern struct vfio_device *vfio_device_get_from_dev(struct device *dev);
>  extern void vfio_device_put(struct vfio_device *device);
>  
>  int vfio_assign_device_set(struct vfio_device *device, void *set_id);
> +bool vfio_change_migration_state_allowed(u32 new_state, u32 old_state);
>  
>  /* events for the backend driver notify callback */
>  enum vfio_iommu_notify_type {
Jason Gunthorpe Sept. 27, 2021, 11:12 p.m. UTC | #11
On Mon, Sep 27, 2021 at 04:46:48PM -0600, Alex Williamson wrote:
> > +	enum { MAX_STATE = VFIO_DEVICE_STATE_RESUMING };
> > +	static const u8 vfio_from_state_table[MAX_STATE + 1][MAX_STATE + 1] = {
> > +		[VFIO_DEVICE_STATE_STOP] = {
> > +			[VFIO_DEVICE_STATE_RUNNING] = 1,
> > +			[VFIO_DEVICE_STATE_RESUMING] = 1,
> > +		},
> 
> Our state transition diagram is pretty weak on reachable transitions
> out of the _STOP state, why do we select only these two as valid?

I have no particular opinion on specific states here, however adding
more states means more stuff for drivers to implement and more risk
driver writers will mess up this uAPI.

So only on those grounds I'd suggest to keep this to the minimum
needed instead of the maximum logically possible..

Also, probably the FSM comment from the uapi header file should be
moved into a function comment above this function?

Jason
Alex Williamson Sept. 28, 2021, 7:19 p.m. UTC | #12
On Mon, 27 Sep 2021 20:12:39 -0300
Jason Gunthorpe <jgg@ziepe.ca> wrote:

> On Mon, Sep 27, 2021 at 04:46:48PM -0600, Alex Williamson wrote:
> > > +	enum { MAX_STATE = VFIO_DEVICE_STATE_RESUMING };
> > > +	static const u8 vfio_from_state_table[MAX_STATE + 1][MAX_STATE + 1] = {
> > > +		[VFIO_DEVICE_STATE_STOP] = {
> > > +			[VFIO_DEVICE_STATE_RUNNING] = 1,
> > > +			[VFIO_DEVICE_STATE_RESUMING] = 1,
> > > +		},  
> > 
> > Our state transition diagram is pretty weak on reachable transitions
> > out of the _STOP state, why do we select only these two as valid?  
> 
> I have no particular opinion on specific states here, however adding
> more states means more stuff for drivers to implement and more risk
> driver writers will mess up this uAPI.

It looks like state transitions were largely discussed in v9 and v10 of
the migration proposals:

https://lore.kernel.org/all/1573578220-7530-2-git-send-email-kwankhede@nvidia.com/
https://lore.kernel.org/all/1576527700-21805-2-git-send-email-kwankhede@nvidia.com/

I'm not seeing that we really excluded many transitions there.

> So only on those grounds I'd suggest to keep this to the minimum
> needed instead of the maximum logically possible..
> 
> Also, probably the FSM comment from the uapi header file should be
> moved into a function comment above this function?

It's not clear this function shouldn't be anything more than:

	if (new_state > MAX_STATE || old_state > MAX_STATE)
		return false;	/* exited via device reset, */
				/* entered via transition fault */

	return true;

That's still only 5 fully interconnected states to work between, and
potentially a 6th if we decide _RESUMING|_RUNNING is valid for a device
supporting post-copy.

In defining the device state, we tried to steer away from defining it
in terms of the QEMU migration API, but rather as a set of controls
that could be used to support that API to leave us some degree of
independence that QEMU implementation might evolve.

To that extent, it actually seems easier for a device implementation to
focus on bit definition rather than the state machine node.

I'd also vote that any clarification of state validity and transitions
belongs in the uAPI header and a transition test function should
reference that header as the source of truth, rather than the other way
around.  Thanks,

Alex
Jason Gunthorpe Sept. 28, 2021, 7:35 p.m. UTC | #13
On Tue, Sep 28, 2021 at 01:19:58PM -0600, Alex Williamson wrote:

> In defining the device state, we tried to steer away from defining it
> in terms of the QEMU migration API, but rather as a set of controls
> that could be used to support that API to leave us some degree of
> independence that QEMU implementation might evolve.

That is certainly a different perspective, it would have been
better to not express this idea as a FSM in that case...

So each state in mlx5vf_pci_set_device_state() should call the correct
combination of (un)freeze, (un)quiesce and so on so each state
reflects a defined operation of the device?

Jason
Alex Williamson Sept. 28, 2021, 8:18 p.m. UTC | #14
On Tue, 28 Sep 2021 16:35:50 -0300
Jason Gunthorpe <jgg@ziepe.ca> wrote:

> On Tue, Sep 28, 2021 at 01:19:58PM -0600, Alex Williamson wrote:
> 
> > In defining the device state, we tried to steer away from defining it
> > in terms of the QEMU migration API, but rather as a set of controls
> > that could be used to support that API to leave us some degree of
> > independence that QEMU implementation might evolve.  
> 
> That is certainly a different perspective, it would have been
> better to not express this idea as a FSM in that case...
> 
> So each state in mlx5vf_pci_set_device_state() should call the correct
> combination of (un)freeze, (un)quiesce and so on so each state
> reflects a defined operation of the device?

I'd expect so, for instance the implementation of entering the _STOP
state presumes a previous state that where the device is apparently
already quiesced.  That doesn't support a direct _RUNNING -> _STOP
transition where I argued in the linked threads that those states
should be reachable from any other state.  Thanks,

Alex
Max Gurtovoy Sept. 29, 2021, 10:44 a.m. UTC | #15
On 9/28/2021 2:12 AM, Jason Gunthorpe wrote:
> On Mon, Sep 27, 2021 at 04:46:48PM -0600, Alex Williamson wrote:
>>> +	enum { MAX_STATE = VFIO_DEVICE_STATE_RESUMING };
>>> +	static const u8 vfio_from_state_table[MAX_STATE + 1][MAX_STATE + 1] = {
>>> +		[VFIO_DEVICE_STATE_STOP] = {
>>> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
>>> +			[VFIO_DEVICE_STATE_RESUMING] = 1,
>>> +		},
>> Our state transition diagram is pretty weak on reachable transitions
>> out of the _STOP state, why do we select only these two as valid?
> I have no particular opinion on specific states here, however adding
> more states means more stuff for drivers to implement and more risk
> driver writers will mess up this uAPI.

_STOP == 000b => Device Stopped, not saving or resuming (from UAPI).

This is the default initial state and not RUNNING.

The user application should move device from STOP => RUNNING or STOP => 
RESUMING.

Maybe we need to extend the comment in the UAPI file.

>
> So only on those grounds I'd suggest to keep this to the minimum
> needed instead of the maximum logically possible..
>
> Also, probably the FSM comment from the uapi header file should be
> moved into a function comment above this function?
>
> Jason
Max Gurtovoy Sept. 29, 2021, 10:57 a.m. UTC | #16
On 9/28/2021 10:19 PM, Alex Williamson wrote:
> On Mon, 27 Sep 2021 20:12:39 -0300
> Jason Gunthorpe <jgg@ziepe.ca> wrote:
>
>> On Mon, Sep 27, 2021 at 04:46:48PM -0600, Alex Williamson wrote:
>>>> +	enum { MAX_STATE = VFIO_DEVICE_STATE_RESUMING };
>>>> +	static const u8 vfio_from_state_table[MAX_STATE + 1][MAX_STATE + 1] = {
>>>> +		[VFIO_DEVICE_STATE_STOP] = {
>>>> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
>>>> +			[VFIO_DEVICE_STATE_RESUMING] = 1,
>>>> +		},
>>> Our state transition diagram is pretty weak on reachable transitions
>>> out of the _STOP state, why do we select only these two as valid?
>> I have no particular opinion on specific states here, however adding
>> more states means more stuff for drivers to implement and more risk
>> driver writers will mess up this uAPI.
> It looks like state transitions were largely discussed in v9 and v10 of
> the migration proposals:
>
> https://lore.kernel.org/all/1573578220-7530-2-git-send-email-kwankhede@nvidia.com/
> https://lore.kernel.org/all/1576527700-21805-2-git-send-email-kwankhede@nvidia.com/
>
> I'm not seeing that we really excluded many transitions there.
>
>> So only on those grounds I'd suggest to keep this to the minimum
>> needed instead of the maximum logically possible..
>>
>> Also, probably the FSM comment from the uapi header file should be
>> moved into a function comment above this function?
> It's not clear this function shouldn't be anything more than:
>
> 	if (new_state > MAX_STATE || old_state > MAX_STATE)
> 		return false;	/* exited via device reset, */
> 				/* entered via transition fault */
>
> 	return true;
>
> That's still only 5 fully interconnected states to work between, and
> potentially a 6th if we decide _RESUMING|_RUNNING is valid for a device
> supporting post-copy.
>
> In defining the device state, we tried to steer away from defining it
> in terms of the QEMU migration API, but rather as a set of controls
> that could be used to support that API to leave us some degree of
> independence that QEMU implementation might evolve.

The state machine is not related to QEMU specifically.

The state machine defines an agreement between user application (let's 
say QEMU) and VFIO.

If a user application would like to move, for example, from RESUMING to 
SAVING state, then the kernel should fail. I don't that there is a 
device that can support it.

If you prefer we check this inside our mlx5 vfio driver, we can do it. 
But we think that this is a common logic according to the defined FSM.

Do you prefer code duplication in vendor vfio-pci drivers ?

> To that extent, it actually seems easier for a device implementation to
> focus on bit definition rather than the state machine node.
>
> I'd also vote that any clarification of state validity and transitions
> belongs in the uAPI header and a transition test function should
> reference that header as the source of truth, rather than the other way
> around.  Thanks,

Yes, I guess this is possible.

>
> Alex
>
Alex Williamson Sept. 29, 2021, 12:35 p.m. UTC | #17
On Wed, 29 Sep 2021 13:44:10 +0300
Max Gurtovoy <mgurtovoy@nvidia.com> wrote:

> On 9/28/2021 2:12 AM, Jason Gunthorpe wrote:
> > On Mon, Sep 27, 2021 at 04:46:48PM -0600, Alex Williamson wrote:  
> >>> +	enum { MAX_STATE = VFIO_DEVICE_STATE_RESUMING };
> >>> +	static const u8 vfio_from_state_table[MAX_STATE + 1][MAX_STATE + 1] = {
> >>> +		[VFIO_DEVICE_STATE_STOP] = {
> >>> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
> >>> +			[VFIO_DEVICE_STATE_RESUMING] = 1,
> >>> +		},  
> >> Our state transition diagram is pretty weak on reachable transitions
> >> out of the _STOP state, why do we select only these two as valid?  
> > I have no particular opinion on specific states here, however adding
> > more states means more stuff for drivers to implement and more risk
> > driver writers will mess up this uAPI.  
> 
> _STOP == 000b => Device Stopped, not saving or resuming (from UAPI).
> 
> This is the default initial state and not RUNNING.
> 
> The user application should move device from STOP => RUNNING or STOP => 
> RESUMING.
> 
> Maybe we need to extend the comment in the UAPI file.


include/uapi/linux/vfio.h:
...
 *  +------- _RESUMING
 *  |+------ _SAVING
 *  ||+----- _RUNNING
 *  |||
 *  000b => Device Stopped, not saving or resuming
 *  001b => Device running, which is the default state
                            ^^^^^^^^^^^^^^^^^^^^^^^^^^
...
 * State transitions:
 *
 *              _RESUMING  _RUNNING    Pre-copy    Stop-and-copy   _STOP
 *                (100b)     (001b)     (011b)        (010b)       (000b)
 * 0. Running or default state
 *                             |
                 ^^^^^^^^^^^^^
...
 * 0. Default state of VFIO device is _RUNNING when the user application starts.
      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The uAPI is pretty clear here.  A default state of _STOP is not
compatible with existing devices and userspace that does not support
migration.  Thanks,

Alex
Max Gurtovoy Sept. 29, 2021, 1:26 p.m. UTC | #18
On 9/29/2021 3:35 PM, Alex Williamson wrote:
> On Wed, 29 Sep 2021 13:44:10 +0300
> Max Gurtovoy <mgurtovoy@nvidia.com> wrote:
>
>> On 9/28/2021 2:12 AM, Jason Gunthorpe wrote:
>>> On Mon, Sep 27, 2021 at 04:46:48PM -0600, Alex Williamson wrote:
>>>>> +	enum { MAX_STATE = VFIO_DEVICE_STATE_RESUMING };
>>>>> +	static const u8 vfio_from_state_table[MAX_STATE + 1][MAX_STATE + 1] = {
>>>>> +		[VFIO_DEVICE_STATE_STOP] = {
>>>>> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
>>>>> +			[VFIO_DEVICE_STATE_RESUMING] = 1,
>>>>> +		},
>>>> Our state transition diagram is pretty weak on reachable transitions
>>>> out of the _STOP state, why do we select only these two as valid?
>>> I have no particular opinion on specific states here, however adding
>>> more states means more stuff for drivers to implement and more risk
>>> driver writers will mess up this uAPI.
>> _STOP == 000b => Device Stopped, not saving or resuming (from UAPI).
>>
>> This is the default initial state and not RUNNING.
>>
>> The user application should move device from STOP => RUNNING or STOP =>
>> RESUMING.
>>
>> Maybe we need to extend the comment in the UAPI file.
>
> include/uapi/linux/vfio.h:
> ...
>   *  +------- _RESUMING
>   *  |+------ _SAVING
>   *  ||+----- _RUNNING
>   *  |||
>   *  000b => Device Stopped, not saving or resuming
>   *  001b => Device running, which is the default state
>                              ^^^^^^^^^^^^^^^^^^^^^^^^^^
> ...
>   * State transitions:
>   *
>   *              _RESUMING  _RUNNING    Pre-copy    Stop-and-copy   _STOP
>   *                (100b)     (001b)     (011b)        (010b)       (000b)
>   * 0. Running or default state
>   *                             |
>                   ^^^^^^^^^^^^^
> ...
>   * 0. Default state of VFIO device is _RUNNING when the user application starts.
>        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>
> The uAPI is pretty clear here.  A default state of _STOP is not
> compatible with existing devices and userspace that does not support
> migration.  Thanks,

Why do you need this state machine for userspace that doesn't support 
migration ?

What is the definition of RUNNING state for a paused VM that is waiting 
for incoming migration blob ?

>
> Alex
>
Alex Williamson Sept. 29, 2021, 1:50 p.m. UTC | #19
On Wed, 29 Sep 2021 16:26:55 +0300
Max Gurtovoy <mgurtovoy@nvidia.com> wrote:

> On 9/29/2021 3:35 PM, Alex Williamson wrote:
> > On Wed, 29 Sep 2021 13:44:10 +0300
> > Max Gurtovoy <mgurtovoy@nvidia.com> wrote:
> >  
> >> On 9/28/2021 2:12 AM, Jason Gunthorpe wrote:  
> >>> On Mon, Sep 27, 2021 at 04:46:48PM -0600, Alex Williamson wrote:  
> >>>>> +	enum { MAX_STATE = VFIO_DEVICE_STATE_RESUMING };
> >>>>> +	static const u8 vfio_from_state_table[MAX_STATE + 1][MAX_STATE + 1] = {
> >>>>> +		[VFIO_DEVICE_STATE_STOP] = {
> >>>>> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
> >>>>> +			[VFIO_DEVICE_STATE_RESUMING] = 1,
> >>>>> +		},  
> >>>> Our state transition diagram is pretty weak on reachable transitions
> >>>> out of the _STOP state, why do we select only these two as valid?  
> >>> I have no particular opinion on specific states here, however adding
> >>> more states means more stuff for drivers to implement and more risk
> >>> driver writers will mess up this uAPI.  
> >> _STOP == 000b => Device Stopped, not saving or resuming (from UAPI).
> >>
> >> This is the default initial state and not RUNNING.
> >>
> >> The user application should move device from STOP => RUNNING or STOP =>
> >> RESUMING.
> >>
> >> Maybe we need to extend the comment in the UAPI file.  
> >
> > include/uapi/linux/vfio.h:
> > ...
> >   *  +------- _RESUMING
> >   *  |+------ _SAVING
> >   *  ||+----- _RUNNING
> >   *  |||
> >   *  000b => Device Stopped, not saving or resuming
> >   *  001b => Device running, which is the default state
> >                              ^^^^^^^^^^^^^^^^^^^^^^^^^^
> > ...
> >   * State transitions:
> >   *
> >   *              _RESUMING  _RUNNING    Pre-copy    Stop-and-copy   _STOP
> >   *                (100b)     (001b)     (011b)        (010b)       (000b)
> >   * 0. Running or default state
> >   *                             |
> >                   ^^^^^^^^^^^^^
> > ...
> >   * 0. Default state of VFIO device is _RUNNING when the user application starts.
> >        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> >
> > The uAPI is pretty clear here.  A default state of _STOP is not
> > compatible with existing devices and userspace that does not support
> > migration.  Thanks,  
> 
> Why do you need this state machine for userspace that doesn't support 
> migration ?

For userspace that doesn't support migration, there's one state,
_RUNNING.  That's what we're trying to be compatible and consistent
with.  Migration is an extension, not a base requirement.

> What is the definition of RUNNING state for a paused VM that is waiting 
> for incoming migration blob ?

A VM supporting migration of the device would move the device to
_RESUMING to load the incoming data.  If the VM leaves the device in
_RUNNING, then it doesn't support migration of the device and it's out
of scope how it handles that device state.  Existing devices continue
running regardless of whether the VM state is paused, it's only devices
supporting migration where userspace could optionally have the device
run state follow the VM run state.  Thanks,

Alex
Max Gurtovoy Sept. 29, 2021, 2:36 p.m. UTC | #20
On 9/29/2021 4:50 PM, Alex Williamson wrote:
> On Wed, 29 Sep 2021 16:26:55 +0300
> Max Gurtovoy <mgurtovoy@nvidia.com> wrote:
>
>> On 9/29/2021 3:35 PM, Alex Williamson wrote:
>>> On Wed, 29 Sep 2021 13:44:10 +0300
>>> Max Gurtovoy <mgurtovoy@nvidia.com> wrote:
>>>   
>>>> On 9/28/2021 2:12 AM, Jason Gunthorpe wrote:
>>>>> On Mon, Sep 27, 2021 at 04:46:48PM -0600, Alex Williamson wrote:
>>>>>>> +	enum { MAX_STATE = VFIO_DEVICE_STATE_RESUMING };
>>>>>>> +	static const u8 vfio_from_state_table[MAX_STATE + 1][MAX_STATE + 1] = {
>>>>>>> +		[VFIO_DEVICE_STATE_STOP] = {
>>>>>>> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
>>>>>>> +			[VFIO_DEVICE_STATE_RESUMING] = 1,
>>>>>>> +		},
>>>>>> Our state transition diagram is pretty weak on reachable transitions
>>>>>> out of the _STOP state, why do we select only these two as valid?
>>>>> I have no particular opinion on specific states here, however adding
>>>>> more states means more stuff for drivers to implement and more risk
>>>>> driver writers will mess up this uAPI.
>>>> _STOP == 000b => Device Stopped, not saving or resuming (from UAPI).
>>>>
>>>> This is the default initial state and not RUNNING.
>>>>
>>>> The user application should move device from STOP => RUNNING or STOP =>
>>>> RESUMING.
>>>>
>>>> Maybe we need to extend the comment in the UAPI file.
>>> include/uapi/linux/vfio.h:
>>> ...
>>>    *  +------- _RESUMING
>>>    *  |+------ _SAVING
>>>    *  ||+----- _RUNNING
>>>    *  |||
>>>    *  000b => Device Stopped, not saving or resuming
>>>    *  001b => Device running, which is the default state
>>>                               ^^^^^^^^^^^^^^^^^^^^^^^^^^
>>> ...
>>>    * State transitions:
>>>    *
>>>    *              _RESUMING  _RUNNING    Pre-copy    Stop-and-copy   _STOP
>>>    *                (100b)     (001b)     (011b)        (010b)       (000b)
>>>    * 0. Running or default state
>>>    *                             |
>>>                    ^^^^^^^^^^^^^
>>> ...
>>>    * 0. Default state of VFIO device is _RUNNING when the user application starts.
>>>         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>>>
>>> The uAPI is pretty clear here.  A default state of _STOP is not
>>> compatible with existing devices and userspace that does not support
>>> migration.  Thanks,
>> Why do you need this state machine for userspace that doesn't support
>> migration ?
> For userspace that doesn't support migration, there's one state,
> _RUNNING.  That's what we're trying to be compatible and consistent
> with.  Migration is an extension, not a base requirement.

Userspace without migration doesn't care about this state.

We left with kernel now. vfio-pci today doesn't support migration, right 
? state is in theory is 0 (STOP).

This state machine is controlled by the migration SW. The drivers don't 
move state implicitly.

mlx5-vfio-pci support migration and will work fine with non-migration SW 
(it will stay with state = 0 unless someone will move it. but nobody 
will) exactly like vfio-pci does today.

So where is the problem ?

>> What is the definition of RUNNING state for a paused VM that is waiting
>> for incoming migration blob ?
> A VM supporting migration of the device would move the device to
> _RESUMING to load the incoming data.  If the VM leaves the device in
> _RUNNING, then it doesn't support migration of the device and it's out
> of scope how it handles that device state.  Existing devices continue
> running regardless of whether the VM state is paused, it's only devices
> supporting migration where userspace could optionally have the device
> run state follow the VM run state.  Thanks,
>
> Alex
>
Alex Williamson Sept. 29, 2021, 3:17 p.m. UTC | #21
On Wed, 29 Sep 2021 17:36:59 +0300
Max Gurtovoy <mgurtovoy@nvidia.com> wrote:

> On 9/29/2021 4:50 PM, Alex Williamson wrote:
> > On Wed, 29 Sep 2021 16:26:55 +0300
> > Max Gurtovoy <mgurtovoy@nvidia.com> wrote:
> >  
> >> On 9/29/2021 3:35 PM, Alex Williamson wrote:  
> >>> On Wed, 29 Sep 2021 13:44:10 +0300
> >>> Max Gurtovoy <mgurtovoy@nvidia.com> wrote:
> >>>     
> >>>> On 9/28/2021 2:12 AM, Jason Gunthorpe wrote:  
> >>>>> On Mon, Sep 27, 2021 at 04:46:48PM -0600, Alex Williamson wrote:  
> >>>>>>> +	enum { MAX_STATE = VFIO_DEVICE_STATE_RESUMING };
> >>>>>>> +	static const u8 vfio_from_state_table[MAX_STATE + 1][MAX_STATE + 1] = {
> >>>>>>> +		[VFIO_DEVICE_STATE_STOP] = {
> >>>>>>> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
> >>>>>>> +			[VFIO_DEVICE_STATE_RESUMING] = 1,
> >>>>>>> +		},  
> >>>>>> Our state transition diagram is pretty weak on reachable transitions
> >>>>>> out of the _STOP state, why do we select only these two as valid?  
> >>>>> I have no particular opinion on specific states here, however adding
> >>>>> more states means more stuff for drivers to implement and more risk
> >>>>> driver writers will mess up this uAPI.  
> >>>> _STOP == 000b => Device Stopped, not saving or resuming (from UAPI).
> >>>>
> >>>> This is the default initial state and not RUNNING.
> >>>>
> >>>> The user application should move device from STOP => RUNNING or STOP =>
> >>>> RESUMING.
> >>>>
> >>>> Maybe we need to extend the comment in the UAPI file.  
> >>> include/uapi/linux/vfio.h:
> >>> ...
> >>>    *  +------- _RESUMING
> >>>    *  |+------ _SAVING
> >>>    *  ||+----- _RUNNING
> >>>    *  |||
> >>>    *  000b => Device Stopped, not saving or resuming
> >>>    *  001b => Device running, which is the default state
> >>>                               ^^^^^^^^^^^^^^^^^^^^^^^^^^
> >>> ...
> >>>    * State transitions:
> >>>    *
> >>>    *              _RESUMING  _RUNNING    Pre-copy    Stop-and-copy   _STOP
> >>>    *                (100b)     (001b)     (011b)        (010b)       (000b)
> >>>    * 0. Running or default state
> >>>    *                             |
> >>>                    ^^^^^^^^^^^^^
> >>> ...
> >>>    * 0. Default state of VFIO device is _RUNNING when the user application starts.
> >>>         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> >>>
> >>> The uAPI is pretty clear here.  A default state of _STOP is not
> >>> compatible with existing devices and userspace that does not support
> >>> migration.  Thanks,  
> >> Why do you need this state machine for userspace that doesn't support
> >> migration ?  
> > For userspace that doesn't support migration, there's one state,
> > _RUNNING.  That's what we're trying to be compatible and consistent
> > with.  Migration is an extension, not a base requirement.  
> 
> Userspace without migration doesn't care about this state.
> 
> We left with kernel now. vfio-pci today doesn't support migration, right 
> ? state is in theory is 0 (STOP).
> 
> This state machine is controlled by the migration SW. The drivers don't 
> move state implicitly.
> 
> mlx5-vfio-pci support migration and will work fine with non-migration SW 
> (it will stay with state = 0 unless someone will move it. but nobody 
> will) exactly like vfio-pci does today.
> 
> So where is the problem ?

So you have a device that's actively modifying its internal state,
performing I/O, including DMA (thereby dirtying VM memory), all while
in the _STOP state?  And you don't see this as a problem?

There's a major inconsistency if the migration interface is telling us
something different than we can actually observe through the behavior of
the device.  Thanks,

Alex
Max Gurtovoy Sept. 29, 2021, 3:28 p.m. UTC | #22
On 9/29/2021 6:17 PM, Alex Williamson wrote:
> On Wed, 29 Sep 2021 17:36:59 +0300
> Max Gurtovoy <mgurtovoy@nvidia.com> wrote:
>
>> On 9/29/2021 4:50 PM, Alex Williamson wrote:
>>> On Wed, 29 Sep 2021 16:26:55 +0300
>>> Max Gurtovoy <mgurtovoy@nvidia.com> wrote:
>>>   
>>>> On 9/29/2021 3:35 PM, Alex Williamson wrote:
>>>>> On Wed, 29 Sep 2021 13:44:10 +0300
>>>>> Max Gurtovoy <mgurtovoy@nvidia.com> wrote:
>>>>>      
>>>>>> On 9/28/2021 2:12 AM, Jason Gunthorpe wrote:
>>>>>>> On Mon, Sep 27, 2021 at 04:46:48PM -0600, Alex Williamson wrote:
>>>>>>>>> +	enum { MAX_STATE = VFIO_DEVICE_STATE_RESUMING };
>>>>>>>>> +	static const u8 vfio_from_state_table[MAX_STATE + 1][MAX_STATE + 1] = {
>>>>>>>>> +		[VFIO_DEVICE_STATE_STOP] = {
>>>>>>>>> +			[VFIO_DEVICE_STATE_RUNNING] = 1,
>>>>>>>>> +			[VFIO_DEVICE_STATE_RESUMING] = 1,
>>>>>>>>> +		},
>>>>>>>> Our state transition diagram is pretty weak on reachable transitions
>>>>>>>> out of the _STOP state, why do we select only these two as valid?
>>>>>>> I have no particular opinion on specific states here, however adding
>>>>>>> more states means more stuff for drivers to implement and more risk
>>>>>>> driver writers will mess up this uAPI.
>>>>>> _STOP == 000b => Device Stopped, not saving or resuming (from UAPI).
>>>>>>
>>>>>> This is the default initial state and not RUNNING.
>>>>>>
>>>>>> The user application should move device from STOP => RUNNING or STOP =>
>>>>>> RESUMING.
>>>>>>
>>>>>> Maybe we need to extend the comment in the UAPI file.
>>>>> include/uapi/linux/vfio.h:
>>>>> ...
>>>>>     *  +------- _RESUMING
>>>>>     *  |+------ _SAVING
>>>>>     *  ||+----- _RUNNING
>>>>>     *  |||
>>>>>     *  000b => Device Stopped, not saving or resuming
>>>>>     *  001b => Device running, which is the default state
>>>>>                                ^^^^^^^^^^^^^^^^^^^^^^^^^^
>>>>> ...
>>>>>     * State transitions:
>>>>>     *
>>>>>     *              _RESUMING  _RUNNING    Pre-copy    Stop-and-copy   _STOP
>>>>>     *                (100b)     (001b)     (011b)        (010b)       (000b)
>>>>>     * 0. Running or default state
>>>>>     *                             |
>>>>>                     ^^^^^^^^^^^^^
>>>>> ...
>>>>>     * 0. Default state of VFIO device is _RUNNING when the user application starts.
>>>>>          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>>>>>
>>>>> The uAPI is pretty clear here.  A default state of _STOP is not
>>>>> compatible with existing devices and userspace that does not support
>>>>> migration.  Thanks,
>>>> Why do you need this state machine for userspace that doesn't support
>>>> migration ?
>>> For userspace that doesn't support migration, there's one state,
>>> _RUNNING.  That's what we're trying to be compatible and consistent
>>> with.  Migration is an extension, not a base requirement.
>> Userspace without migration doesn't care about this state.
>>
>> We left with kernel now. vfio-pci today doesn't support migration, right
>> ? state is in theory is 0 (STOP).
>>
>> This state machine is controlled by the migration SW. The drivers don't
>> move state implicitly.
>>
>> mlx5-vfio-pci support migration and will work fine with non-migration SW
>> (it will stay with state = 0 unless someone will move it. but nobody
>> will) exactly like vfio-pci does today.
>>
>> So where is the problem ?
> So you have a device that's actively modifying its internal state,
> performing I/O, including DMA (thereby dirtying VM memory), all while
> in the _STOP state?  And you don't see this as a problem?

I don't see how is it different from vfio-pci situation.

And you said you're worried from compatibility. I can't see a 
compatibility issue here.

Maybe we need to rename STOP state. We can call it READY or LIVE or 
NON_MIGRATION_STATE.

>
> There's a major inconsistency if the migration interface is telling us
> something different than we can actually observe through the behavior of
> the device.  Thanks,
>
> Alex
>
Jason Gunthorpe Sept. 29, 2021, 4:14 p.m. UTC | #23
On Wed, Sep 29, 2021 at 06:28:44PM +0300, Max Gurtovoy wrote:

> > So you have a device that's actively modifying its internal state,
> > performing I/O, including DMA (thereby dirtying VM memory), all while
> > in the _STOP state?  And you don't see this as a problem?
> 
> I don't see how is it different from vfio-pci situation.

vfio-pci provides no way to observe the migration state. It isn't
"000b"

> Maybe we need to rename STOP state. We can call it READY or LIVE or
> NON_MIGRATION_STATE.

It was a poor choice to use 000b as stop, but it doesn't really
matter. The mlx5 driver should just pre-init this readable to running.

Jason
Jason Gunthorpe Sept. 29, 2021, 4:16 p.m. UTC | #24
On Tue, Sep 28, 2021 at 02:18:44PM -0600, Alex Williamson wrote:
> On Tue, 28 Sep 2021 16:35:50 -0300
> Jason Gunthorpe <jgg@ziepe.ca> wrote:
> 
> > On Tue, Sep 28, 2021 at 01:19:58PM -0600, Alex Williamson wrote:
> > 
> > > In defining the device state, we tried to steer away from defining it
> > > in terms of the QEMU migration API, but rather as a set of controls
> > > that could be used to support that API to leave us some degree of
> > > independence that QEMU implementation might evolve.  
> > 
> > That is certainly a different perspective, it would have been
> > better to not express this idea as a FSM in that case...
> > 
> > So each state in mlx5vf_pci_set_device_state() should call the correct
> > combination of (un)freeze, (un)quiesce and so on so each state
> > reflects a defined operation of the device?
> 
> I'd expect so, for instance the implementation of entering the _STOP
> state presumes a previous state that where the device is apparently
> already quiesced.  That doesn't support a direct _RUNNING -> _STOP
> transition where I argued in the linked threads that those states
> should be reachable from any other state.  Thanks,

If we focus on mlx5 there are two device 'flags' to manage:
 - Device cannot issue DMAs
 - Device internal state cannot change (ie cannot receive DMAs)

This is necessary to co-ordinate across multiple devices that might be
doing peer to peer DMA between them. The whole multi-device complex
should be moved to "cannot issue DMA's" then the whole complex would
go to "state cannot change" and be serialized.

The expected sequence at the device is thus

Resuming
 full stop -> does not issue DMAs -> full operation
Suspend
 full operation -> does not issue DMAs -> full stop

Further the device has two actions
 - Trigger serializating the device state
 - Trigger de-serializing the device state

So, what is the behavior upon each state:

 *  000b => Device Stopped, not saving or resuming
     Does not issue DMAs
     Internal state cannot change

 *  001b => Device running, which is the default state
     Neither flags

 *  010b => Stop the device & save the device state, stop-and-copy state
     Does not issue DMAs
     Internal state cannot change

 *  011b => Device running and save the device state, pre-copy state
     Neither flags
     (future, DMA tracking turned on)

 *  100b => Device stopped and the device state is resuming
     Does not issue DMAs
     Internal state cannot change
     
 *  110b => Error state
    ???

 *  101b => Invalid state
 *  111b => Invalid state

    ???

What should the ??'s be? It looks like mlx5 doesn't use these, so it
should just refuse to enter these states in the first place..

The two actions:
 trigger serializing the device state
   Done when asked to go to 010b ?

 trigger de-serializing the device state
   Done when transition from 100b -> 000b ?

There is a missing state "Stop Active Transactions" which would be
only "does not issue DMAs". I've seen a proposal to add that.

I'm happy enough with this and it seems clean and easy enough to
implement.

Jason
Alex Williamson Sept. 29, 2021, 6:06 p.m. UTC | #25
On Wed, 29 Sep 2021 13:16:02 -0300
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Tue, Sep 28, 2021 at 02:18:44PM -0600, Alex Williamson wrote:
> > On Tue, 28 Sep 2021 16:35:50 -0300
> > Jason Gunthorpe <jgg@ziepe.ca> wrote:
> >   
> > > On Tue, Sep 28, 2021 at 01:19:58PM -0600, Alex Williamson wrote:
> > >   
> > > > In defining the device state, we tried to steer away from defining it
> > > > in terms of the QEMU migration API, but rather as a set of controls
> > > > that could be used to support that API to leave us some degree of
> > > > independence that QEMU implementation might evolve.    
> > > 
> > > That is certainly a different perspective, it would have been
> > > better to not express this idea as a FSM in that case...
> > > 
> > > So each state in mlx5vf_pci_set_device_state() should call the correct
> > > combination of (un)freeze, (un)quiesce and so on so each state
> > > reflects a defined operation of the device?  
> > 
> > I'd expect so, for instance the implementation of entering the _STOP
> > state presumes a previous state that where the device is apparently
> > already quiesced.  That doesn't support a direct _RUNNING -> _STOP
> > transition where I argued in the linked threads that those states
> > should be reachable from any other state.  Thanks,  
> 
> If we focus on mlx5 there are two device 'flags' to manage:
>  - Device cannot issue DMAs
>  - Device internal state cannot change (ie cannot receive DMAs)
> 
> This is necessary to co-ordinate across multiple devices that might be
> doing peer to peer DMA between them. The whole multi-device complex
> should be moved to "cannot issue DMA's" then the whole complex would
> go to "state cannot change" and be serialized.

Are you anticipating p2p from outside the VM?  The typical scenario
here would be that p2p occurs only intra-VM, so all the devices would
stop issuing DMA (modulo trying to quiesce devices simultaneously).

> The expected sequence at the device is thus
> 
> Resuming
>  full stop -> does not issue DMAs -> full operation
> Suspend
>  full operation -> does not issue DMAs -> full stop
> 
> Further the device has two actions
>  - Trigger serializating the device state
>  - Trigger de-serializing the device state
> 
> So, what is the behavior upon each state:
> 
>  *  000b => Device Stopped, not saving or resuming
>      Does not issue DMAs
>      Internal state cannot change
> 
>  *  001b => Device running, which is the default state
>      Neither flags
> 
>  *  010b => Stop the device & save the device state, stop-and-copy state
>      Does not issue DMAs
>      Internal state cannot change
> 
>  *  011b => Device running and save the device state, pre-copy state
>      Neither flags
>      (future, DMA tracking turned on)
> 
>  *  100b => Device stopped and the device state is resuming
>      Does not issue DMAs
>      Internal state cannot change

cannot change... other that as loaded via migration region.

>      
>  *  110b => Error state
>     ???
> 
>  *  101b => Invalid state
>  *  111b => Invalid state
> 
>     ???
> 
> What should the ??'s be? It looks like mlx5 doesn't use these, so it
> should just refuse to enter these states in the first place..

_SAVING and _RESUMING are considered mutually exclusive, therefore any
combination of both of them is invalid.  We've chosen to use the
combination of 110b as an error state to indicate the device state is
undefined, but not _RUNNING.  This state is only reachable by an
internal error of the driver during a state transition.

The expected protocol is that if the user write to the device_state
register returns an errno, the user reevaluates the device_state to
determine if the desired transition is unavailable (previous state
value is returned) or generated a fault (error state value returned).
Due to the undefined state of the device, the only exit from the error
state is to re-initialize the device state via a reset.  Therefore a
successful device reset should always return the device to the 001b
state.

The 111b state is also considered unreachable through normal means due
to the _SAVING | _RESUMING conflict, but suggests the device is also
_RUNNING in this undefined state.  This combination has no currently
defined use case and should not be reachable.

The 101b state indicates _RUNNING while _RESUMING, which is simply not
a mode that has been spec'd at this time as it would require some
mechanism for the device to fault in state on demand.
 
> The two actions:
>  trigger serializing the device state
>    Done when asked to go to 010b ?

When the _SAVING bit is set.  The exact mechanics depends on the size
and volatility of the device state.  A GPU might begin in pre-copy
(011b) to transmit chunks of framebuffer data, recording hashes of
blocks read by the user to avoid re-sending them during the
stop-and-copy (010b) phase.  A device with a small internal state
representation may choose to forgo providing data in the pre-copy phase
and entirely serialize internal state at stop-and-copy.

>  trigger de-serializing the device state
>    Done when transition from 100b -> 000b ?

100b -> 000b is not a required transition, generally this would be 100b
-> 001b, ie. end state of _RUNNING vs _STOP.

I think the requirement is that de-serialization is complete when the
_RESUMING bit is cleared.  Whether the driver chooses to de-serialize
piece-wise as each block of data is written to the device or in bulk
from a buffer is left to the implementation.  In either case, the
driver can fail the transition to !_RESUMING if the state is incomplete
or otherwise corrupt.  It would again be the driver's discretion if
the device enters the error state or remains in _RESUMING.  If the user
has no valid state with which to exit the _RESUMING phase, a device
reset should return the device to _RUNNING with a default initial state.

> There is a missing state "Stop Active Transactions" which would be
> only "does not issue DMAs". I've seen a proposal to add that.

This would be to get all devices to stop issuing DMA while internal
state can be modified to avoid the synchronization issue of trying to
stop devices concurrently?  For PCI devices we obviously have the bus
master bit to manage that, but I could see how a migration extension
for such support (perhaps even just wired through to BM for PCI) could
be useful.  Thanks,

Alex
Jason Gunthorpe Sept. 29, 2021, 6:26 p.m. UTC | #26
On Wed, Sep 29, 2021 at 12:06:55PM -0600, Alex Williamson wrote:
> On Wed, 29 Sep 2021 13:16:02 -0300
> Jason Gunthorpe <jgg@nvidia.com> wrote:
> 
> > On Tue, Sep 28, 2021 at 02:18:44PM -0600, Alex Williamson wrote:
> > > On Tue, 28 Sep 2021 16:35:50 -0300
> > > Jason Gunthorpe <jgg@ziepe.ca> wrote:
> > >   
> > > > On Tue, Sep 28, 2021 at 01:19:58PM -0600, Alex Williamson wrote:
> > > >   
> > > > > In defining the device state, we tried to steer away from defining it
> > > > > in terms of the QEMU migration API, but rather as a set of controls
> > > > > that could be used to support that API to leave us some degree of
> > > > > independence that QEMU implementation might evolve.    
> > > > 
> > > > That is certainly a different perspective, it would have been
> > > > better to not express this idea as a FSM in that case...
> > > > 
> > > > So each state in mlx5vf_pci_set_device_state() should call the correct
> > > > combination of (un)freeze, (un)quiesce and so on so each state
> > > > reflects a defined operation of the device?  
> > > 
> > > I'd expect so, for instance the implementation of entering the _STOP
> > > state presumes a previous state that where the device is apparently
> > > already quiesced.  That doesn't support a direct _RUNNING -> _STOP
> > > transition where I argued in the linked threads that those states
> > > should be reachable from any other state.  Thanks,  
> > 
> > If we focus on mlx5 there are two device 'flags' to manage:
> >  - Device cannot issue DMAs
> >  - Device internal state cannot change (ie cannot receive DMAs)
> > 
> > This is necessary to co-ordinate across multiple devices that might be
> > doing peer to peer DMA between them. The whole multi-device complex
> > should be moved to "cannot issue DMA's" then the whole complex would
> > go to "state cannot change" and be serialized.
> 
> Are you anticipating p2p from outside the VM?  The typical scenario
> here would be that p2p occurs only intra-VM, so all the devices would
> stop issuing DMA (modulo trying to quiesce devices simultaneously).

Inside the VM.

Your 'modulo trying to quiesce devices simultaneously' is correct -
this is a real issue that needs to be solved.

If we put one device in a state where it's internal state is immutable
it can no longer accept DMA messages from the other devices. So there
are two states in the HW model - do not generate DMAs and finally the
immutable internal state where even external DMAs are refused.

> > The expected sequence at the device is thus
> > 
> > Resuming
> >  full stop -> does not issue DMAs -> full operation
> > Suspend
> >  full operation -> does not issue DMAs -> full stop
> > 
> > Further the device has two actions
> >  - Trigger serializating the device state
> >  - Trigger de-serializing the device state
> > 
> > So, what is the behavior upon each state:
> > 
> >  *  000b => Device Stopped, not saving or resuming
> >      Does not issue DMAs
> >      Internal state cannot change
> > 
> >  *  001b => Device running, which is the default state
> >      Neither flags
> > 
> >  *  010b => Stop the device & save the device state, stop-and-copy state
> >      Does not issue DMAs
> >      Internal state cannot change
> > 
> >  *  011b => Device running and save the device state, pre-copy state
> >      Neither flags
> >      (future, DMA tracking turned on)
> > 
> >  *  100b => Device stopped and the device state is resuming
> >      Does not issue DMAs
> >      Internal state cannot change
> 
> cannot change... other that as loaded via migration region.

Yes

> The expected protocol is that if the user write to the device_state
> register returns an errno, the user reevaluates the device_state to
> determine if the desired transition is unavailable (previous state
> value is returned) or generated a fault (error state value
> returned).

Hmm, interesting, mlx5 should be doing this as well. Eg resuming with
corrupt state should fail and cannot be recovered except via reset.

> The 101b state indicates _RUNNING while _RESUMING, which is simply not
> a mode that has been spec'd at this time as it would require some
> mechanism for the device to fault in state on demand.

So lets error on these requests since we don't know what state to put
the device into.

> > The two actions:
> >  trigger serializing the device state
> >    Done when asked to go to 010b ?
> 
> When the _SAVING bit is set.  The exact mechanics depends on the size
> and volatility of the device state.  A GPU might begin in pre-copy
> (011b) to transmit chunks of framebuffer data, recording hashes of
> blocks read by the user to avoid re-sending them during the
> stop-and-copy (010b) phase.  

Here I am talking specifically about mlx5 which does not have a state
capture in pre-copy. So mlx5 should capture state on 010b only, and
the 011b is a NOP.

> >  trigger de-serializing the device state
> >    Done when transition from 100b -> 000b ?
> 
> 100b -> 000b is not a required transition, generally this would be 100b
> -> 001b, ie. end state of _RUNNING vs _STOP.

Sorry, I typo'd it, yes to _RUNNING

> I think the requirement is that de-serialization is complete when the
> _RESUMING bit is cleared.  Whether the driver chooses to de-serialize
> piece-wise as each block of data is written to the device or in bulk
> from a buffer is left to the implementation.  In either case, the
> driver can fail the transition to !_RESUMING if the state is incomplete
> or otherwise corrupt.  It would again be the driver's discretion if
> the device enters the error state or remains in _RESUMING.  If the user
> has no valid state with which to exit the _RESUMING phase, a device
> reset should return the device to _RUNNING with a default initial state.

That makes sense enough.

> > There is a missing state "Stop Active Transactions" which would be
> > only "does not issue DMAs". I've seen a proposal to add that.
> 
> This would be to get all devices to stop issuing DMA while internal
> state can be modified to avoid the synchronization issue of trying to
> stop devices concurrently?  

Yes, as above

> For PCI devices we obviously have the bus master bit to manage that,
> but I could see how a migration extension for such support (perhaps
> even just wired through to BM for PCI) could be useful.  Thanks,

I'm nervous to override the BM bit for something like this, the BM bit
isn't a gentle "please coherently stop what you are doing" it is a
hanbrake the OS pulls to ensure any PCI device becomes quiet.

Thanks,
Jason
Max Gurtovoy Sept. 29, 2021, 9:48 p.m. UTC | #27
On 9/29/2021 7:14 PM, Jason Gunthorpe wrote:
> On Wed, Sep 29, 2021 at 06:28:44PM +0300, Max Gurtovoy wrote:
>
>>> So you have a device that's actively modifying its internal state,
>>> performing I/O, including DMA (thereby dirtying VM memory), all while
>>> in the _STOP state?  And you don't see this as a problem?
>> I don't see how is it different from vfio-pci situation.
> vfio-pci provides no way to observe the migration state. It isn't
> "000b"

Alex said that there is a problem of compatibility.

I migration SW is not involved, nobody will read this migration state.

>> Maybe we need to rename STOP state. We can call it READY or LIVE or
>> NON_MIGRATION_STATE.
> It was a poor choice to use 000b as stop, but it doesn't really
> matter. The mlx5 driver should just pre-init this readable to running.

I guess we can do it for this reason. There is no functional problem nor 
compatibility issue here as was mentioned.

But still we need the kernel to track transitions. We don't want to 
allow moving from RESUMING to SAVING state for example. How this 
transition can be allowed ?

In this case we need to fail the request from the migration SW...


>
> Jason
Alex Williamson Sept. 29, 2021, 10:44 p.m. UTC | #28
On Thu, 30 Sep 2021 00:48:55 +0300
Max Gurtovoy <mgurtovoy@nvidia.com> wrote:

> On 9/29/2021 7:14 PM, Jason Gunthorpe wrote:
> > On Wed, Sep 29, 2021 at 06:28:44PM +0300, Max Gurtovoy wrote:
> >  
> >>> So you have a device that's actively modifying its internal state,
> >>> performing I/O, including DMA (thereby dirtying VM memory), all while
> >>> in the _STOP state?  And you don't see this as a problem?  
> >> I don't see how is it different from vfio-pci situation.  
> > vfio-pci provides no way to observe the migration state. It isn't
> > "000b"  
> 
> Alex said that there is a problem of compatibility.
> 
> I migration SW is not involved, nobody will read this migration state.

The _STOP state has a specific meaning regardless of whether userspace
reads the device state value.  I think what you're suggesting is that
the device reports itself as _STOP'd but it's actually _RUNNING.  Is
that the compatibility workaround, create a self inconsistency?

We cannot impose on userspace to move a device from _STOP to _RUNNING
simply because the device supports the migration region, nor should we
report a device state that is inconsistent with the actual device state.

> >> Maybe we need to rename STOP state. We can call it READY or LIVE or
> >> NON_MIGRATION_STATE.  
> > It was a poor choice to use 000b as stop, but it doesn't really
> > matter. The mlx5 driver should just pre-init this readable to running.  
> 
> I guess we can do it for this reason. There is no functional problem nor 
> compatibility issue here as was mentioned.
> 
> But still we need the kernel to track transitions. We don't want to 
> allow moving from RESUMING to SAVING state for example. How this 
> transition can be allowed ?
> 
> In this case we need to fail the request from the migration SW...

_RESUMING to _SAVING seems like a good way to test round trip migration
without running the device to modify the state.  Potentially it's a
means to update a saved device migration data stream to a newer format
using an intermediate driver version.

If a driver is written such that it simply sees clearing the _RESUME
bit as an indicator to de-serialize the data stream to the device, and
setting the _SAVING flag as an indicator to re-serialize that data
stream from the device, then this is just a means to make use of
existing data paths.

The uAPI specifies a means for drivers to reject a state change, but
that risks failing to support a transition which might find mainstream
use cases.  I don't think common code should be responsible for
filtering out viable transitions.  Thanks,

Alex
Jason Gunthorpe Sept. 29, 2021, 11:21 p.m. UTC | #29
On Thu, Sep 30, 2021 at 12:48:55AM +0300, Max Gurtovoy wrote:
> 
> On 9/29/2021 7:14 PM, Jason Gunthorpe wrote:
> > On Wed, Sep 29, 2021 at 06:28:44PM +0300, Max Gurtovoy wrote:
> > 
> > > > So you have a device that's actively modifying its internal state,
> > > > performing I/O, including DMA (thereby dirtying VM memory), all while
> > > > in the _STOP state?  And you don't see this as a problem?
> > > I don't see how is it different from vfio-pci situation.
> > vfio-pci provides no way to observe the migration state. It isn't
> > "000b"
> 
> Alex said that there is a problem of compatibility.

Yes, when a vfio_device first opens it must be running - ie able to do
DMA and otherwise operational.

When we add the migration extension this cannot change, so after
open_device() the device should be operational.

The reported state in the migration region should accurately reflect
what the device is currently doing. If the device is operational then
it must report running, not stopped.

Thus a driver cannot just zero initalize the migration "registers",
they have to be accurate.

> > > Maybe we need to rename STOP state. We can call it READY or LIVE or
> > > NON_MIGRATION_STATE.
> > It was a poor choice to use 000b as stop, but it doesn't really
> > matter. The mlx5 driver should just pre-init this readable to running.
> 
> I guess we can do it for this reason. There is no functional problem nor
> compatibility issue here as was mentioned.
> 
> But still we need the kernel to track transitions. We don't want to allow
> moving from RESUMING to SAVING state for example. How this transition can be
> allowed ?

It seems semantically fine to me, as per Alex's note what will happen
is defined:

driver will see RESUMING toggle off so it will trigger a
de-serialization

driver will see SAVING toggled on so it will serialize the new state
(either the pre-copy state or the post-copy state dpending on the
running bit)

Depending on the running bit the device may or may not be woken up.

If de-serialization fails then the state goes to error and SAVING is
ignored.

The driver logic probably looks something like this:

// Running toggles off
if (oldstate & RUNNING != newstate & RUNNING && oldstate & RUNNING)
    queice
    freeze

// Resuming toggles off
if (oldstate & RESUMING != newstate & RESUMING && oldstate & RESUMING)
   deserialize

// Saving toggles on
if (oldstate & SAVING != newstate & SAVING && newstate & SAVING)
   if (!(newstate & RUNNING))
     serialize post copy

// Running toggles on
if (oldstate & RUNNING != newstate & RUNNING && newstate & RUNNING)
   unfreeze
   unqueice

I'd have to check that carefully against the state chart from my last
email though..

And need to check how the "Stop Active Transactions" bit fits in there

Jason
Max Gurtovoy Sept. 30, 2021, 9:25 a.m. UTC | #30
On 9/30/2021 1:44 AM, Alex Williamson wrote:
> On Thu, 30 Sep 2021 00:48:55 +0300
> Max Gurtovoy <mgurtovoy@nvidia.com> wrote:
>
>> On 9/29/2021 7:14 PM, Jason Gunthorpe wrote:
>>> On Wed, Sep 29, 2021 at 06:28:44PM +0300, Max Gurtovoy wrote:
>>>   
>>>>> So you have a device that's actively modifying its internal state,
>>>>> performing I/O, including DMA (thereby dirtying VM memory), all while
>>>>> in the _STOP state?  And you don't see this as a problem?
>>>> I don't see how is it different from vfio-pci situation.
>>> vfio-pci provides no way to observe the migration state. It isn't
>>> "000b"
>> Alex said that there is a problem of compatibility.
>>
>> I migration SW is not involved, nobody will read this migration state.
> The _STOP state has a specific meaning regardless of whether userspace
> reads the device state value.  I think what you're suggesting is that
> the device reports itself as _STOP'd but it's actually _RUNNING.  Is
> that the compatibility workaround, create a self inconsistency?

 From migration point of view the device is stopped.

>
> We cannot impose on userspace to move a device from _STOP to _RUNNING
> simply because the device supports the migration region, nor should we
> report a device state that is inconsistent with the actual device state.

In this case we can think maybe moving to running during enabling the 
bus master..


>
>>>> Maybe we need to rename STOP state. We can call it READY or LIVE or
>>>> NON_MIGRATION_STATE.
>>> It was a poor choice to use 000b as stop, but it doesn't really
>>> matter. The mlx5 driver should just pre-init this readable to running.
>> I guess we can do it for this reason. There is no functional problem nor
>> compatibility issue here as was mentioned.
>>
>> But still we need the kernel to track transitions. We don't want to
>> allow moving from RESUMING to SAVING state for example. How this
>> transition can be allowed ?
>>
>> In this case we need to fail the request from the migration SW...
> _RESUMING to _SAVING seems like a good way to test round trip migration
> without running the device to modify the state.  Potentially it's a
> means to update a saved device migration data stream to a newer format
> using an intermediate driver version.

what do you mean by "without running the device to modify the state." ?

did you describe a case where you migrate from source to dst and then 
back to source with a new migration data format ?

>
> If a driver is written such that it simply sees clearing the _RESUME
> bit as an indicator to de-serialize the data stream to the device, and
> setting the _SAVING flag as an indicator to re-serialize that data
> stream from the device, then this is just a means to make use of
> existing data paths.
>
> The uAPI specifies a means for drivers to reject a state change, but
> that risks failing to support a transition which might find mainstream
> use cases.  I don't think common code should be responsible for
> filtering out viable transitions.  Thanks,
>
> Alex
>
Max Gurtovoy Sept. 30, 2021, 9:34 a.m. UTC | #31
On 9/30/2021 2:21 AM, Jason Gunthorpe wrote:
> On Thu, Sep 30, 2021 at 12:48:55AM +0300, Max Gurtovoy wrote:
>> On 9/29/2021 7:14 PM, Jason Gunthorpe wrote:
>>> On Wed, Sep 29, 2021 at 06:28:44PM +0300, Max Gurtovoy wrote:
>>>
>>>>> So you have a device that's actively modifying its internal state,
>>>>> performing I/O, including DMA (thereby dirtying VM memory), all while
>>>>> in the _STOP state?  And you don't see this as a problem?
>>>> I don't see how is it different from vfio-pci situation.
>>> vfio-pci provides no way to observe the migration state. It isn't
>>> "000b"
>> Alex said that there is a problem of compatibility.
> Yes, when a vfio_device first opens it must be running - ie able to do
> DMA and otherwise operational.

how can non resumed device do DMA ?

Also the bus master is not set.

>
> When we add the migration extension this cannot change, so after
> open_device() the device should be operational.

if it's waiting for incoming migration blob, it is not running.

>
> The reported state in the migration region should accurately reflect
> what the device is currently doing. If the device is operational then
> it must report running, not stopped.

STOP in migration meaning.

>
> Thus a driver cannot just zero initalize the migration "registers",
> they have to be accurate.
>
>>>> Maybe we need to rename STOP state. We can call it READY or LIVE or
>>>> NON_MIGRATION_STATE.
>>> It was a poor choice to use 000b as stop, but it doesn't really
>>> matter. The mlx5 driver should just pre-init this readable to running.
>> I guess we can do it for this reason. There is no functional problem nor
>> compatibility issue here as was mentioned.
>>
>> But still we need the kernel to track transitions. We don't want to allow
>> moving from RESUMING to SAVING state for example. How this transition can be
>> allowed ?
> It seems semantically fine to me, as per Alex's note what will happen
> is defined:
>
> driver will see RESUMING toggle off so it will trigger a
> de-serialization

You mean stop serialization ?

>
> driver will see SAVING toggled on so it will serialize the new state
> (either the pre-copy state or the post-copy state dpending on the
> running bit)

lets leave the bits and how you implement the state numbering aside.

If you finish resuming you can move to a new state (that we should add) 
=> RESUMED.

Now you suggested moving from RESUMED to SAVING to get the state again 
from the dst device ? and send it back to src ? before staring the VM 
and moving to RUNNING ?

where this is coming from ?

>
> Depending on the running bit the device may or may not be woken up.

lets take about logic here and not bits.

>
> If de-serialization fails then the state goes to error and SAVING is
> ignored.
>
> The driver logic probably looks something like this:
>
> // Running toggles off
> if (oldstate & RUNNING != newstate & RUNNING && oldstate & RUNNING)
>      queice
>      freeze
>
> // Resuming toggles off
> if (oldstate & RESUMING != newstate & RESUMING && oldstate & RESUMING)
>     deserialize
>
> // Saving toggles on
> if (oldstate & SAVING != newstate & SAVING && newstate & SAVING)
>     if (!(newstate & RUNNING))
>       serialize post copy
>
> // Running toggles on
> if (oldstate & RUNNING != newstate & RUNNING && newstate & RUNNING)
>     unfreeze
>     unqueice
>
> I'd have to check that carefully against the state chart from my last
> email though..
>
> And need to check how the "Stop Active Transactions" bit fits in there
>
> Jason
Alex Williamson Sept. 30, 2021, 12:41 p.m. UTC | #32
On Thu, 30 Sep 2021 12:25:23 +0300
Max Gurtovoy <mgurtovoy@nvidia.com> wrote:

> On 9/30/2021 1:44 AM, Alex Williamson wrote:
> > On Thu, 30 Sep 2021 00:48:55 +0300
> > Max Gurtovoy <mgurtovoy@nvidia.com> wrote:
> >  
> >> On 9/29/2021 7:14 PM, Jason Gunthorpe wrote:  
> >>> On Wed, Sep 29, 2021 at 06:28:44PM +0300, Max Gurtovoy wrote:
> >>>     
> >>>>> So you have a device that's actively modifying its internal state,
> >>>>> performing I/O, including DMA (thereby dirtying VM memory), all while
> >>>>> in the _STOP state?  And you don't see this as a problem?  
> >>>> I don't see how is it different from vfio-pci situation.  
> >>> vfio-pci provides no way to observe the migration state. It isn't
> >>> "000b"  
> >> Alex said that there is a problem of compatibility.
> >>
> >> I migration SW is not involved, nobody will read this migration state.  
> > The _STOP state has a specific meaning regardless of whether userspace
> > reads the device state value.  I think what you're suggesting is that
> > the device reports itself as _STOP'd but it's actually _RUNNING.  Is
> > that the compatibility workaround, create a self inconsistency?  
> 
>  From migration point of view the device is stopped.

The _RESUMING and _SAVING bits control the migration activity, the
_RUNNING bit controls the ability of the device to modify its internal
state and affect external state.  The initial state of the device is
absolutely not stopped.

> > We cannot impose on userspace to move a device from _STOP to _RUNNING
> > simply because the device supports the migration region, nor should we
> > report a device state that is inconsistent with the actual device state.  
> 
> In this case we can think maybe moving to running during enabling the 
> bus master..

There are no spontaneous state transitions, device_state changes only
via user manipulation of the register.

> >>>> Maybe we need to rename STOP state. We can call it READY or LIVE or
> >>>> NON_MIGRATION_STATE.  
> >>> It was a poor choice to use 000b as stop, but it doesn't really
> >>> matter. The mlx5 driver should just pre-init this readable to running.  
> >> I guess we can do it for this reason. There is no functional problem nor
> >> compatibility issue here as was mentioned.
> >>
> >> But still we need the kernel to track transitions. We don't want to
> >> allow moving from RESUMING to SAVING state for example. How this
> >> transition can be allowed ?
> >>
> >> In this case we need to fail the request from the migration SW...  
> > _RESUMING to _SAVING seems like a good way to test round trip migration
> > without running the device to modify the state.  Potentially it's a
> > means to update a saved device migration data stream to a newer format
> > using an intermediate driver version.  
> 
> what do you mean by "without running the device to modify the state." ?

If a device is !_RUNNING it should not be advancing its internal state,
therefore state-in == state-out.
 
> did you describe a case where you migrate from source to dst and then 
> back to source with a new migration data format ?

I'm speculating that as the driver evolves, the migration data stream
generated from the device's migration region can change.  Hopefully in
compatible ways.  The above sequence of restoring and extracting state
without the complication of the device running could help to validate
compatibility.  Thanks,

Alex
Jason Gunthorpe Sept. 30, 2021, 2:47 p.m. UTC | #33
On Thu, Sep 30, 2021 at 12:34:19PM +0300, Max Gurtovoy wrote:

> > When we add the migration extension this cannot change, so after
> > open_device() the device should be operational.
> 
> if it's waiting for incoming migration blob, it is not running.

It cannot be waiting for a migration blob after open_device, that is
not backwards compatible.

Just prior to open device the vfio pci layer will generate a FLR to
the function so we expect that post open_device has a fresh from reset
fully running device state.

> > The reported state in the migration region should accurately reflect
> > what the device is currently doing. If the device is operational then
> > it must report running, not stopped.
> 
> STOP in migration meaning.

As Alex and I have said several times STOP means the internal state is
not allowed to change.

> > driver will see RESUMING toggle off so it will trigger a
> > de-serialization
> 
> You mean stop serialization ?

No, I mean it will take all the migration data that has been uploaded
through the migration region and de-serialize it into active device
state.

> > driver will see SAVING toggled on so it will serialize the new state
> > (either the pre-copy state or the post-copy state dpending on the
> > running bit)
> 
> lets leave the bits and how you implement the state numbering aside.

You've missed the point. This isn't a FSM. It is a series of three
control bits that we have assigned logical meaning their combinatoins.

The algorithm I gave is a control centric algorithm not a state
centric algorithm and matches the direction Alex thought this was
being designed for.
 
> If you finish resuming you can move to a new state (that we should add) =>
> RESUMED.

It is not a state machine. Once you stop prentending this is
implementing a FSM Alex's position makes perfect sense.

Jason
Max Gurtovoy Sept. 30, 2021, 3:32 p.m. UTC | #34
On 9/30/2021 5:47 PM, Jason Gunthorpe wrote:
> On Thu, Sep 30, 2021 at 12:34:19PM +0300, Max Gurtovoy wrote:
>
>>> When we add the migration extension this cannot change, so after
>>> open_device() the device should be operational.
>> if it's waiting for incoming migration blob, it is not running.
> It cannot be waiting for a migration blob after open_device, that is
> not backwards compatible.
>
> Just prior to open device the vfio pci layer will generate a FLR to
> the function so we expect that post open_device has a fresh from reset
> fully running device state.

running also mean that the device doesn't have a clue on its internal 
state ? or running means unfreezed and unquiesced ?

>
>>> The reported state in the migration region should accurately reflect
>>> what the device is currently doing. If the device is operational then
>>> it must report running, not stopped.
>> STOP in migration meaning.
> As Alex and I have said several times STOP means the internal state is
> not allowed to change.
>
>>> driver will see RESUMING toggle off so it will trigger a
>>> de-serialization
>> You mean stop serialization ?
> No, I mean it will take all the migration data that has been uploaded
> through the migration region and de-serialize it into active device
> state.

you should feed the device way before that.

>
>>> driver will see SAVING toggled on so it will serialize the new state
>>> (either the pre-copy state or the post-copy state dpending on the
>>> running bit)
>> lets leave the bits and how you implement the state numbering aside.
> You've missed the point. This isn't a FSM. It is a series of three
> control bits that we have assigned logical meaning their combinatoins.
>
> The algorithm I gave is a control centric algorithm not a state
> centric algorithm and matches the direction Alex thought this was
> being designed for.
>   
>> If you finish resuming you can move to a new state (that we should add) =>
>> RESUMED.
> It is not a state machine. Once you stop prentending this is
> implementing a FSM Alex's position makes perfect sense.

You can look on it anyway you want. Three control bits or FSM. And I can 
look on it anyway I want.

The point is what bits/state you set during the resume phase:

1. you initialize at  _RUNNING bit == 001b. No problem.

2. state stream arrives, migration SW raise _RESUMING bit. should it be 
101b or 100b ? for now it's 100b. But according to your statement is 
should be 101b (invalid today) since device state can change. right ?

3. Then you should indicate that all the state was serialized to the 
device (actually to all the pci devices). 100b mean RESUMING and not 
RUNNING so maybe this can say RESUMED and state can't change now ?

4. all devices move to running 001b only after all devices moved to 100b.

Otherwise, devices will start changing each other internal states.

-Max.

>
> Jason
Jason Gunthorpe Sept. 30, 2021, 4:24 p.m. UTC | #35
On Thu, Sep 30, 2021 at 06:32:07PM +0300, Max Gurtovoy wrote:
> > Just prior to open device the vfio pci layer will generate a FLR to
> > the function so we expect that post open_device has a fresh from reset
> > fully running device state.
> 
> running also mean that the device doesn't have a clue on its internal state
> ? or running means unfreezed and unquiesced ?

The device just got FLR'd and it should be in a clean state and
operating. Think the VM is booting for the first time.

> > > > driver will see RESUMING toggle off so it will trigger a
> > > > de-serialization
> > > You mean stop serialization ?
> > No, I mean it will take all the migration data that has been uploaded
> > through the migration region and de-serialize it into active device
> > state.
> 
> you should feed the device way before that.

I don't know what this means, when the resuming bit is set the
migration data buffer is wiped and userspace should beging loading
it. When the resuming bit is cleared whatever is in the migration
buffer is deserialized into the current device internal state.

It is the opposite of saving. When the saving bit is set the current
device state is serialized into the migration buffer and userspace and
reads it out.

> 1. you initialize at  _RUNNING bit == 001b. No problem.
> 
> 2. state stream arrives, migration SW raise _RESUMING bit. should it be 101b
> or 100b ? for now it's 100b. But according to your statement is should be
> 101b (invalid today) since device state can change. right ?

Running means the device state chanages independently, the controlled
change of the device state via deserializing the migration buffer is
different. Both running and saving commands need running to be zero.

ie commands that are marked invalid in the uapi comment are rejected
at the start - and that is probably the core helper we should provide.

> 3. Then you should indicate that all the state was serialized to the device
> (actually to all the pci devices). 100b mean RESUMING and not RUNNING so
> maybe this can say RESUMED and state can't change now ?

State is not loaded into the device until the resuming bit is
cleared. There is no RESUMED state until we incorporate Artem's
proposal for an additional bit eg 1001b - running with DMA master
disabled.

Jason
Max Gurtovoy Sept. 30, 2021, 4:51 p.m. UTC | #36
On 9/30/2021 7:24 PM, Jason Gunthorpe wrote:
> On Thu, Sep 30, 2021 at 06:32:07PM +0300, Max Gurtovoy wrote:
>>> Just prior to open device the vfio pci layer will generate a FLR to
>>> the function so we expect that post open_device has a fresh from reset
>>> fully running device state.
>> running also mean that the device doesn't have a clue on its internal state
>> ? or running means unfreezed and unquiesced ?
> The device just got FLR'd and it should be in a clean state and
> operating. Think the VM is booting for the first time.

During the resume phase in the dst, the VM is paused and not booting. 
Migration SW is waiting to get memory and state from SRC. The device 
will start from the exact point that was in the src.

it's exactly "000b => Device Stopped, not saving or resuming"

>
>>>>> driver will see RESUMING toggle off so it will trigger a
>>>>> de-serialization
>>>> You mean stop serialization ?
>>> No, I mean it will take all the migration data that has been uploaded
>>> through the migration region and de-serialize it into active device
>>> state.
>> you should feed the device way before that.
> I don't know what this means, when the resuming bit is set the
> migration data buffer is wiped and userspace should beging loading
> it. When the resuming bit is cleared whatever is in the migration
> buffer is deserialized into the current device internal state.

Well, this is your design for the driver implementation. Nobody is 
preventing other drivers to start deserializing device state into the 
device during RESUMING bit on.

Or is this a must ?

>
> It is the opposite of saving. When the saving bit is set the current
> device state is serialized into the migration buffer and userspace and
> reads it out.

This is not new.

>> 1. you initialize at  _RUNNING bit == 001b. No problem.
>>
>> 2. state stream arrives, migration SW raise _RESUMING bit. should it be 101b
>> or 100b ? for now it's 100b. But according to your statement is should be
>> 101b (invalid today) since device state can change. right ?
> Running means the device state chanages independently, the controlled
> change of the device state via deserializing the migration buffer is
> different. Both running and saving commands need running to be zero.
>
> ie commands that are marked invalid in the uapi comment are rejected
> at the start - and that is probably the core helper we should provide.
>
>> 3. Then you should indicate that all the state was serialized to the device
>> (actually to all the pci devices). 100b mean RESUMING and not RUNNING so
>> maybe this can say RESUMED and state can't change now ?
> State is not loaded into the device until the resuming bit is
> cleared. There is no RESUMED state until we incorporate Artem's
> proposal for an additional bit eg 1001b - running with DMA master
> disabled.

So if we moved from 100b to 010b somehow, one should deserialized its 
buffer to the device, and then serialize it to migration region again ?

I guess its doable since the device is freeze and quiesced. But moving 
from 100b to 011b is not possible, right ?

>
> Jason
Jason Gunthorpe Sept. 30, 2021, 5:01 p.m. UTC | #37
On Thu, Sep 30, 2021 at 07:51:22PM +0300, Max Gurtovoy wrote:
> 
> On 9/30/2021 7:24 PM, Jason Gunthorpe wrote:
> > On Thu, Sep 30, 2021 at 06:32:07PM +0300, Max Gurtovoy wrote:
> > > > Just prior to open device the vfio pci layer will generate a FLR to
> > > > the function so we expect that post open_device has a fresh from reset
> > > > fully running device state.
> > > running also mean that the device doesn't have a clue on its internal state
> > > ? or running means unfreezed and unquiesced ?
> > The device just got FLR'd and it should be in a clean state and
> > operating. Think the VM is booting for the first time.
> 
> During the resume phase in the dst, the VM is paused and not booting.
> Migration SW is waiting to get memory and state from SRC. The device will
> start from the exact point that was in the src.
> 
> it's exactly "000b => Device Stopped, not saving or resuming"

For this case qmeu should open the VFIO device and immediately issue a
command to go to resuming. The kernel cannot know at open_device time
which case userspace is trying to do. Due to backwards compat we
assume userspace is going to boot a fresh VM.

> Well, this is your design for the driver implementation. Nobody is
> preventing other drivers to start deserializing device state into the device
> during RESUMING bit on.

It is a logical model. Devices can stream the migration data directly
into the internal state if they like. It just creates more conditions
where they have report an error state.

> So if we moved from 100b to 010b somehow, one should deserialized its buffer
> to the device, and then serialize it to migration region again ?

Yes.
 
> I guess its doable since the device is freeze and quiesced. But moving from
> 100b to 011b is not possible, right ?

Why not?

100b to 011b is no different than going indirectly 100b -> 001b -> 011b

The time spent in 001b is just negligable.

Jason
diff mbox series

Patch

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 3c034fe14ccb..c3ca33e513c8 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -1664,6 +1664,47 @@  static int vfio_device_fops_release(struct inode *inode, struct file *filep)
 	return 0;
 }
 
+/**
+ * vfio_change_migration_state_allowed - Checks whether a migration state
+ *   transition is valid.
+ * @new_state: The new state to move to.
+ * @old_state: The old state.
+ * Return: true if the transition is valid.
+ */
+bool vfio_change_migration_state_allowed(u32 new_state, u32 old_state)
+{
+	enum { MAX_STATE = VFIO_DEVICE_STATE_RESUMING };
+	static const u8 vfio_from_state_table[MAX_STATE + 1][MAX_STATE + 1] = {
+		[VFIO_DEVICE_STATE_STOP] = {
+			[VFIO_DEVICE_STATE_RUNNING] = 1,
+			[VFIO_DEVICE_STATE_RESUMING] = 1,
+		},
+		[VFIO_DEVICE_STATE_RUNNING] = {
+			[VFIO_DEVICE_STATE_STOP] = 1,
+			[VFIO_DEVICE_STATE_SAVING] = 1,
+			[VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RUNNING] = 1,
+		},
+		[VFIO_DEVICE_STATE_SAVING] = {
+			[VFIO_DEVICE_STATE_STOP] = 1,
+			[VFIO_DEVICE_STATE_RUNNING] = 1,
+		},
+		[VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RUNNING] = {
+			[VFIO_DEVICE_STATE_RUNNING] = 1,
+			[VFIO_DEVICE_STATE_SAVING] = 1,
+		},
+		[VFIO_DEVICE_STATE_RESUMING] = {
+			[VFIO_DEVICE_STATE_RUNNING] = 1,
+			[VFIO_DEVICE_STATE_STOP] = 1,
+		},
+	};
+
+	if (new_state > MAX_STATE || old_state > MAX_STATE)
+		return false;
+
+	return vfio_from_state_table[old_state][new_state];
+}
+EXPORT_SYMBOL_GPL(vfio_change_migration_state_allowed);
+
 static long vfio_device_fops_unl_ioctl(struct file *filep,
 				       unsigned int cmd, unsigned long arg)
 {
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index b53a9557884a..e65137a708f1 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -83,6 +83,7 @@  extern struct vfio_device *vfio_device_get_from_dev(struct device *dev);
 extern void vfio_device_put(struct vfio_device *device);
 
 int vfio_assign_device_set(struct vfio_device *device, void *set_id);
+bool vfio_change_migration_state_allowed(u32 new_state, u32 old_state);
 
 /* events for the backend driver notify callback */
 enum vfio_iommu_notify_type {