[vfio,10/11] vfio/virtio: Expose admin commands over virtio device

Message ID: 20230921124040.145386-11-yishaih@nvidia.com
State: New, archived
Series: Introduce a vfio driver over virtio devices

Commit Message

Yishai Hadas Sept. 21, 2023, 12:40 p.m. UTC
Expose admin commands over the virtio device, to be used by the
vfio-virtio driver in the next patches.

It includes: list query/use, legacy write/read, read notify_info.

Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
---
 drivers/vfio/pci/virtio/cmd.c | 146 ++++++++++++++++++++++++++++++++++
 drivers/vfio/pci/virtio/cmd.h |  27 +++++++
 2 files changed, 173 insertions(+)
 create mode 100644 drivers/vfio/pci/virtio/cmd.c
 create mode 100644 drivers/vfio/pci/virtio/cmd.h
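
For orientation, a minimal sketch (hypothetical caller, not part of this
patch) of how the vfio-virtio driver added later in the series is expected
to consume one of these helpers:

	/* Sketch only: read a legacy device-config register of the VF. */
	static int virtiovf_sketch_read_dev_cfg(struct virtiovf_pci_core_device *virtvdev,
						u8 offset, u8 size, u8 *buf)
	{
		/* Opcode name assumed from the admin-command definitions in this series. */
		return virtiovf_cmd_lr_read(virtvdev,
					    VIRTIO_ADMIN_CMD_LEGACY_DEV_CFG_READ,
					    offset, size, buf);
	}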

Comments

Michael S. Tsirkin Sept. 21, 2023, 1:08 p.m. UTC | #1
On Thu, Sep 21, 2023 at 03:40:39PM +0300, Yishai Hadas wrote:
> Expose admin commands over the virtio device, to be used by the
> vfio-virtio driver in the next patches.
> 
> It includes: list query/use, legacy write/read, read notify_info.
> 
> Signed-off-by: Yishai Hadas <yishaih@nvidia.com>


I don't get the motivation for this and the next patch.
We already have vdpa that seems to do exactly this:
drive virtio from userspace. Why do we need these extra 1000
lines of code in vfio - just because we can?
Not to mention the user confusion all this will cause.


> ---
>  drivers/vfio/pci/virtio/cmd.c | 146 ++++++++++++++++++++++++++++++++++
>  drivers/vfio/pci/virtio/cmd.h |  27 +++++++
>  2 files changed, 173 insertions(+)
>  create mode 100644 drivers/vfio/pci/virtio/cmd.c
>  create mode 100644 drivers/vfio/pci/virtio/cmd.h
> 
> diff --git a/drivers/vfio/pci/virtio/cmd.c b/drivers/vfio/pci/virtio/cmd.c
> new file mode 100644
> index 000000000000..f068239cdbb0
> --- /dev/null
> +++ b/drivers/vfio/pci/virtio/cmd.c
> @@ -0,0 +1,146 @@
> +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
> +/*
> + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved
> + */
> +
> +#include "cmd.h"
> +
> +int virtiovf_cmd_list_query(struct pci_dev *pdev, u8 *buf, int buf_size)
> +{
> +	struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev);
> +	struct scatterlist out_sg;
> +	struct virtio_admin_cmd cmd = {};
> +
> +	if (!virtio_dev)
> +		return -ENOTCONN;
> +
> +	sg_init_one(&out_sg, buf, buf_size);
> +	cmd.opcode = VIRTIO_ADMIN_CMD_LIST_QUERY;
> +	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
> +	cmd.result_sg = &out_sg;
> +
> +	return virtio_admin_cmd_exec(virtio_dev, &cmd);
> +}
> +
> +int virtiovf_cmd_list_use(struct pci_dev *pdev, u8 *buf, int buf_size)
> +{
> +	struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev);
> +	struct scatterlist in_sg;
> +	struct virtio_admin_cmd cmd = {};
> +
> +	if (!virtio_dev)
> +		return -ENOTCONN;
> +
> +	sg_init_one(&in_sg, buf, buf_size);
> +	cmd.opcode = VIRTIO_ADMIN_CMD_LIST_USE;
> +	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
> +	cmd.data_sg = &in_sg;
> +
> +	return virtio_admin_cmd_exec(virtio_dev, &cmd);
> +}
> +
> +int virtiovf_cmd_lr_write(struct virtiovf_pci_core_device *virtvdev, u16 opcode,
> +			  u8 offset, u8 size, u8 *buf)
> +{
> +	struct virtio_device *virtio_dev =
> +		virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev);
> +	struct virtio_admin_cmd_data_lr_write *in;
> +	struct scatterlist in_sg;
> +	struct virtio_admin_cmd cmd = {};
> +	int ret;
> +
> +	if (!virtio_dev)
> +		return -ENOTCONN;
> +
> +	in = kzalloc(sizeof(*in) + size, GFP_KERNEL);
> +	if (!in)
> +		return -ENOMEM;
> +
> +	in->offset = offset;
> +	memcpy(in->registers, buf, size);
> +	sg_init_one(&in_sg, in, sizeof(*in) + size);
> +	cmd.opcode = opcode;
> +	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
> +	cmd.group_member_id = virtvdev->vf_id + 1;
> +	cmd.data_sg = &in_sg;
> +	ret = virtio_admin_cmd_exec(virtio_dev, &cmd);
> +
> +	kfree(in);
> +	return ret;
> +}
> +
> +int virtiovf_cmd_lr_read(struct virtiovf_pci_core_device *virtvdev, u16 opcode,
> +			 u8 offset, u8 size, u8 *buf)
> +{
> +	struct virtio_device *virtio_dev =
> +		virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev);
> +	struct virtio_admin_cmd_data_lr_read *in;
> +	struct scatterlist in_sg, out_sg;
> +	struct virtio_admin_cmd cmd = {};
> +	int ret;
> +
> +	if (!virtio_dev)
> +		return -ENOTCONN;
> +
> +	in = kzalloc(sizeof(*in), GFP_KERNEL);
> +	if (!in)
> +		return -ENOMEM;
> +
> +	in->offset = offset;
> +	sg_init_one(&in_sg, in, sizeof(*in));
> +	sg_init_one(&out_sg, buf, size);
> +	cmd.opcode = opcode;
> +	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
> +	cmd.data_sg = &in_sg;
> +	cmd.result_sg = &out_sg;
> +	cmd.group_member_id = virtvdev->vf_id + 1;
> +	ret = virtio_admin_cmd_exec(virtio_dev, &cmd);
> +
> +	kfree(in);
> +	return ret;
> +}
> +
> +int virtiovf_cmd_lq_read_notify(struct virtiovf_pci_core_device *virtvdev,
> +				u8 req_bar_flags, u8 *bar, u64 *bar_offset)
> +{
> +	struct virtio_device *virtio_dev =
> +		virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev);
> +	struct virtio_admin_cmd_notify_info_result *out;
> +	struct scatterlist out_sg;
> +	struct virtio_admin_cmd cmd = {};
> +	int ret;
> +
> +	if (!virtio_dev)
> +		return -ENOTCONN;
> +
> +	out = kzalloc(sizeof(*out), GFP_KERNEL);
> +	if (!out)
> +		return -ENOMEM;
> +
> +	sg_init_one(&out_sg, out, sizeof(*out));
> +	cmd.opcode = VIRTIO_ADMIN_CMD_LEGACY_NOTIFY_INFO;
> +	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
> +	cmd.result_sg = &out_sg;
> +	cmd.group_member_id = virtvdev->vf_id + 1;
> +	ret = virtio_admin_cmd_exec(virtio_dev, &cmd);
> +	if (!ret) {
> +		struct virtio_admin_cmd_notify_info_data *entry;
> +		int i;
> +
> +		ret = -ENOENT;
> +		for (i = 0; i < VIRTIO_ADMIN_CMD_MAX_NOTIFY_INFO; i++) {
> +			entry = &out->entries[i];
> +			if (entry->flags == VIRTIO_ADMIN_CMD_NOTIFY_INFO_FLAGS_END)
> +				break;
> +			if (entry->flags != req_bar_flags)
> +				continue;
> +			*bar = entry->bar;
> +			*bar_offset = le64_to_cpu(entry->offset);
> +			ret = 0;
> +			break;
> +		}
> +	}
> +
> +	kfree(out);
> +	return ret;
> +}
> diff --git a/drivers/vfio/pci/virtio/cmd.h b/drivers/vfio/pci/virtio/cmd.h
> new file mode 100644
> index 000000000000..c2a3645f4b90
> --- /dev/null
> +++ b/drivers/vfio/pci/virtio/cmd.h
> @@ -0,0 +1,27 @@
> +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
> +/*
> + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
> + */
> +
> +#ifndef VIRTIO_VFIO_CMD_H
> +#define VIRTIO_VFIO_CMD_H
> +
> +#include <linux/kernel.h>
> +#include <linux/virtio.h>
> +#include <linux/vfio_pci_core.h>
> +#include <linux/virtio_pci.h>
> +
> +struct virtiovf_pci_core_device {
> +	struct vfio_pci_core_device core_device;
> +	int vf_id;
> +};
> +
> +int virtiovf_cmd_list_query(struct pci_dev *pdev, u8 *buf, int buf_size);
> +int virtiovf_cmd_list_use(struct pci_dev *pdev, u8 *buf, int buf_size);
> +int virtiovf_cmd_lr_write(struct virtiovf_pci_core_device *virtvdev, u16 opcode,
> +			  u8 offset, u8 size, u8 *buf);
> +int virtiovf_cmd_lr_read(struct virtiovf_pci_core_device *virtvdev, u16 opcode,
> +			 u8 offset, u8 size, u8 *buf);
> +int virtiovf_cmd_lq_read_notify(struct virtiovf_pci_core_device *virtvdev,
> +				u8 req_bar_flags, u8 *bar, u64 *bar_offset);
> +#endif /* VIRTIO_VFIO_CMD_H */
> -- 
> 2.27.0
Michael S. Tsirkin Sept. 21, 2023, 8:34 p.m. UTC | #2
On Thu, Sep 21, 2023 at 03:40:39PM +0300, Yishai Hadas wrote:
> Expose admin commands over the virtio device, to be used by the
> vfio-virtio driver in the next patches.
> 
> It includes: list query/use, legacy write/read, read notify_info.
> 
> Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
> ---
>  drivers/vfio/pci/virtio/cmd.c | 146 ++++++++++++++++++++++++++++++++++
>  drivers/vfio/pci/virtio/cmd.h |  27 +++++++
>  2 files changed, 173 insertions(+)
>  create mode 100644 drivers/vfio/pci/virtio/cmd.c
>  create mode 100644 drivers/vfio/pci/virtio/cmd.h
> 
> diff --git a/drivers/vfio/pci/virtio/cmd.c b/drivers/vfio/pci/virtio/cmd.c
> new file mode 100644
> index 000000000000..f068239cdbb0
> --- /dev/null
> +++ b/drivers/vfio/pci/virtio/cmd.c
> @@ -0,0 +1,146 @@
> +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
> +/*
> + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved
> + */
> +
> +#include "cmd.h"
> +
> +int virtiovf_cmd_list_query(struct pci_dev *pdev, u8 *buf, int buf_size)
> +{
> +	struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev);
> +	struct scatterlist out_sg;
> +	struct virtio_admin_cmd cmd = {};
> +
> +	if (!virtio_dev)
> +		return -ENOTCONN;
> +
> +	sg_init_one(&out_sg, buf, buf_size);
> +	cmd.opcode = VIRTIO_ADMIN_CMD_LIST_QUERY;
> +	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
> +	cmd.result_sg = &out_sg;
> +
> +	return virtio_admin_cmd_exec(virtio_dev, &cmd);
> +}
> +

in/out seem all wrong here. In virtio terminology, in means from
device to driver, out means from driver to device.

> +int virtiovf_cmd_list_use(struct pci_dev *pdev, u8 *buf, int buf_size)
> +{
> +	struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev);
> +	struct scatterlist in_sg;
> +	struct virtio_admin_cmd cmd = {};
> +
> +	if (!virtio_dev)
> +		return -ENOTCONN;
> +
> +	sg_init_one(&in_sg, buf, buf_size);
> +	cmd.opcode = VIRTIO_ADMIN_CMD_LIST_USE;
> +	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
> +	cmd.data_sg = &in_sg;
> +
> +	return virtio_admin_cmd_exec(virtio_dev, &cmd);
> +}
> +
> +int virtiovf_cmd_lr_write(struct virtiovf_pci_core_device *virtvdev, u16 opcode,


what is _lr short for?

> +			  u8 offset, u8 size, u8 *buf)
> +{
> +	struct virtio_device *virtio_dev =
> +		virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev);
> +	struct virtio_admin_cmd_data_lr_write *in;
> +	struct scatterlist in_sg;
> +	struct virtio_admin_cmd cmd = {};
> +	int ret;
> +
> +	if (!virtio_dev)
> +		return -ENOTCONN;
> +
> +	in = kzalloc(sizeof(*in) + size, GFP_KERNEL);
> +	if (!in)
> +		return -ENOMEM;
> +
> +	in->offset = offset;
> +	memcpy(in->registers, buf, size);
> +	sg_init_one(&in_sg, in, sizeof(*in) + size);
> +	cmd.opcode = opcode;
> +	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
> +	cmd.group_member_id = virtvdev->vf_id + 1;

weird. why + 1?

> +	cmd.data_sg = &in_sg;
> +	ret = virtio_admin_cmd_exec(virtio_dev, &cmd);
> +
> +	kfree(in);
> +	return ret;
> +}

How do you know it's safe to send this command, in particular at
this time? This seems to be doing zero checks, and zero synchronization
with the PF driver.


> +
> +int virtiovf_cmd_lr_read(struct virtiovf_pci_core_device *virtvdev, u16 opcode,
> +			 u8 offset, u8 size, u8 *buf)
> +{
> +	struct virtio_device *virtio_dev =
> +		virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev);
> +	struct virtio_admin_cmd_data_lr_read *in;
> +	struct scatterlist in_sg, out_sg;
> +	struct virtio_admin_cmd cmd = {};
> +	int ret;
> +
> +	if (!virtio_dev)
> +		return -ENOTCONN;
> +
> +	in = kzalloc(sizeof(*in), GFP_KERNEL);
> +	if (!in)
> +		return -ENOMEM;
> +
> +	in->offset = offset;
> +	sg_init_one(&in_sg, in, sizeof(*in));
> +	sg_init_one(&out_sg, buf, size);
> +	cmd.opcode = opcode;
> +	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
> +	cmd.data_sg = &in_sg;
> +	cmd.result_sg = &out_sg;
> +	cmd.group_member_id = virtvdev->vf_id + 1;
> +	ret = virtio_admin_cmd_exec(virtio_dev, &cmd);
> +
> +	kfree(in);
> +	return ret;
> +}
> +
> +int virtiovf_cmd_lq_read_notify(struct virtiovf_pci_core_device *virtvdev,

and what is lq short for?

> +				u8 req_bar_flags, u8 *bar, u64 *bar_offset)
> +{
> +	struct virtio_device *virtio_dev =
> +		virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev);
> +	struct virtio_admin_cmd_notify_info_result *out;
> +	struct scatterlist out_sg;
> +	struct virtio_admin_cmd cmd = {};
> +	int ret;
> +
> +	if (!virtio_dev)
> +		return -ENOTCONN;
> +
> +	out = kzalloc(sizeof(*out), GFP_KERNEL);
> +	if (!out)
> +		return -ENOMEM;
> +
> +	sg_init_one(&out_sg, out, sizeof(*out));
> +	cmd.opcode = VIRTIO_ADMIN_CMD_LEGACY_NOTIFY_INFO;
> +	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
> +	cmd.result_sg = &out_sg;
> +	cmd.group_member_id = virtvdev->vf_id + 1;
> +	ret = virtio_admin_cmd_exec(virtio_dev, &cmd);
> +	if (!ret) {
> +		struct virtio_admin_cmd_notify_info_data *entry;
> +		int i;
> +
> +		ret = -ENOENT;
> +		for (i = 0; i < VIRTIO_ADMIN_CMD_MAX_NOTIFY_INFO; i++) {
> +			entry = &out->entries[i];
> +			if (entry->flags == VIRTIO_ADMIN_CMD_NOTIFY_INFO_FLAGS_END)
> +				break;
> +			if (entry->flags != req_bar_flags)
> +				continue;
> +			*bar = entry->bar;
> +			*bar_offset = le64_to_cpu(entry->offset);
> +			ret = 0;
> +			break;
> +		}
> +	}
> +
> +	kfree(out);
> +	return ret;
> +}
> diff --git a/drivers/vfio/pci/virtio/cmd.h b/drivers/vfio/pci/virtio/cmd.h
> new file mode 100644
> index 000000000000..c2a3645f4b90
> --- /dev/null
> +++ b/drivers/vfio/pci/virtio/cmd.h
> @@ -0,0 +1,27 @@
> +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
> +/*
> + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
> + */
> +
> +#ifndef VIRTIO_VFIO_CMD_H
> +#define VIRTIO_VFIO_CMD_H
> +
> +#include <linux/kernel.h>
> +#include <linux/virtio.h>
> +#include <linux/vfio_pci_core.h>
> +#include <linux/virtio_pci.h>
> +
> +struct virtiovf_pci_core_device {
> +	struct vfio_pci_core_device core_device;
> +	int vf_id;
> +};
> +
> +int virtiovf_cmd_list_query(struct pci_dev *pdev, u8 *buf, int buf_size);
> +int virtiovf_cmd_list_use(struct pci_dev *pdev, u8 *buf, int buf_size);
> +int virtiovf_cmd_lr_write(struct virtiovf_pci_core_device *virtvdev, u16 opcode,
> +			  u8 offset, u8 size, u8 *buf);
> +int virtiovf_cmd_lr_read(struct virtiovf_pci_core_device *virtvdev, u16 opcode,
> +			 u8 offset, u8 size, u8 *buf);
> +int virtiovf_cmd_lq_read_notify(struct virtiovf_pci_core_device *virtvdev,
> +				u8 req_bar_flags, u8 *bar, u64 *bar_offset);
> +#endif /* VIRTIO_VFIO_CMD_H */
> -- 
> 2.27.0
Michael S. Tsirkin Sept. 22, 2023, 9:54 a.m. UTC | #3
On Thu, Sep 21, 2023 at 03:40:39PM +0300, Yishai Hadas wrote:
> Expose admin commands over the virtio device, to be used by the
> vfio-virtio driver in the next patches.
> 
> It includes: list query/use, legacy write/read, read notify_info.
> 
> Signed-off-by: Yishai Hadas <yishaih@nvidia.com>


This stuff is pure virtio spec. I think it should live under
drivers/virtio, too.

> ---
>  drivers/vfio/pci/virtio/cmd.c | 146 ++++++++++++++++++++++++++++++++++
>  drivers/vfio/pci/virtio/cmd.h |  27 +++++++
>  2 files changed, 173 insertions(+)
>  create mode 100644 drivers/vfio/pci/virtio/cmd.c
>  create mode 100644 drivers/vfio/pci/virtio/cmd.h
> 
> diff --git a/drivers/vfio/pci/virtio/cmd.c b/drivers/vfio/pci/virtio/cmd.c
> new file mode 100644
> index 000000000000..f068239cdbb0
> --- /dev/null
> +++ b/drivers/vfio/pci/virtio/cmd.c
> @@ -0,0 +1,146 @@
> +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
> +/*
> + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved
> + */
> +
> +#include "cmd.h"
> +
> +int virtiovf_cmd_list_query(struct pci_dev *pdev, u8 *buf, int buf_size)
> +{
> +	struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev);
> +	struct scatterlist out_sg;
> +	struct virtio_admin_cmd cmd = {};
> +
> +	if (!virtio_dev)
> +		return -ENOTCONN;
> +
> +	sg_init_one(&out_sg, buf, buf_size);
> +	cmd.opcode = VIRTIO_ADMIN_CMD_LIST_QUERY;
> +	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
> +	cmd.result_sg = &out_sg;
> +
> +	return virtio_admin_cmd_exec(virtio_dev, &cmd);
> +}
> +
> +int virtiovf_cmd_list_use(struct pci_dev *pdev, u8 *buf, int buf_size)
> +{
> +	struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev);
> +	struct scatterlist in_sg;
> +	struct virtio_admin_cmd cmd = {};
> +
> +	if (!virtio_dev)
> +		return -ENOTCONN;
> +
> +	sg_init_one(&in_sg, buf, buf_size);
> +	cmd.opcode = VIRTIO_ADMIN_CMD_LIST_USE;
> +	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
> +	cmd.data_sg = &in_sg;
> +
> +	return virtio_admin_cmd_exec(virtio_dev, &cmd);
> +}
> +
> +int virtiovf_cmd_lr_write(struct virtiovf_pci_core_device *virtvdev, u16 opcode,
> +			  u8 offset, u8 size, u8 *buf)
> +{
> +	struct virtio_device *virtio_dev =
> +		virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev);
> +	struct virtio_admin_cmd_data_lr_write *in;
> +	struct scatterlist in_sg;
> +	struct virtio_admin_cmd cmd = {};
> +	int ret;
> +
> +	if (!virtio_dev)
> +		return -ENOTCONN;
> +
> +	in = kzalloc(sizeof(*in) + size, GFP_KERNEL);
> +	if (!in)
> +		return -ENOMEM;
> +
> +	in->offset = offset;
> +	memcpy(in->registers, buf, size);
> +	sg_init_one(&in_sg, in, sizeof(*in) + size);
> +	cmd.opcode = opcode;
> +	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
> +	cmd.group_member_id = virtvdev->vf_id + 1;
> +	cmd.data_sg = &in_sg;
> +	ret = virtio_admin_cmd_exec(virtio_dev, &cmd);
> +
> +	kfree(in);
> +	return ret;
> +}
> +
> +int virtiovf_cmd_lr_read(struct virtiovf_pci_core_device *virtvdev, u16 opcode,
> +			 u8 offset, u8 size, u8 *buf)
> +{
> +	struct virtio_device *virtio_dev =
> +		virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev);
> +	struct virtio_admin_cmd_data_lr_read *in;
> +	struct scatterlist in_sg, out_sg;
> +	struct virtio_admin_cmd cmd = {};
> +	int ret;
> +
> +	if (!virtio_dev)
> +		return -ENOTCONN;
> +
> +	in = kzalloc(sizeof(*in), GFP_KERNEL);
> +	if (!in)
> +		return -ENOMEM;
> +
> +	in->offset = offset;
> +	sg_init_one(&in_sg, in, sizeof(*in));
> +	sg_init_one(&out_sg, buf, size);
> +	cmd.opcode = opcode;
> +	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
> +	cmd.data_sg = &in_sg;
> +	cmd.result_sg = &out_sg;
> +	cmd.group_member_id = virtvdev->vf_id + 1;
> +	ret = virtio_admin_cmd_exec(virtio_dev, &cmd);
> +
> +	kfree(in);
> +	return ret;
> +}
> +
> +int virtiovf_cmd_lq_read_notify(struct virtiovf_pci_core_device *virtvdev,
> +				u8 req_bar_flags, u8 *bar, u64 *bar_offset)
> +{
> +	struct virtio_device *virtio_dev =
> +		virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev);
> +	struct virtio_admin_cmd_notify_info_result *out;
> +	struct scatterlist out_sg;
> +	struct virtio_admin_cmd cmd = {};
> +	int ret;
> +
> +	if (!virtio_dev)
> +		return -ENOTCONN;
> +
> +	out = kzalloc(sizeof(*out), GFP_KERNEL);
> +	if (!out)
> +		return -ENOMEM;
> +
> +	sg_init_one(&out_sg, out, sizeof(*out));
> +	cmd.opcode = VIRTIO_ADMIN_CMD_LEGACY_NOTIFY_INFO;
> +	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
> +	cmd.result_sg = &out_sg;
> +	cmd.group_member_id = virtvdev->vf_id + 1;
> +	ret = virtio_admin_cmd_exec(virtio_dev, &cmd);
> +	if (!ret) {
> +		struct virtio_admin_cmd_notify_info_data *entry;
> +		int i;
> +
> +		ret = -ENOENT;
> +		for (i = 0; i < VIRTIO_ADMIN_CMD_MAX_NOTIFY_INFO; i++) {
> +			entry = &out->entries[i];
> +			if (entry->flags == VIRTIO_ADMIN_CMD_NOTIFY_INFO_FLAGS_END)
> +				break;
> +			if (entry->flags != req_bar_flags)
> +				continue;
> +			*bar = entry->bar;
> +			*bar_offset = le64_to_cpu(entry->offset);
> +			ret = 0;
> +			break;
> +		}
> +	}
> +
> +	kfree(out);
> +	return ret;
> +}
> diff --git a/drivers/vfio/pci/virtio/cmd.h b/drivers/vfio/pci/virtio/cmd.h
> new file mode 100644
> index 000000000000..c2a3645f4b90
> --- /dev/null
> +++ b/drivers/vfio/pci/virtio/cmd.h
> @@ -0,0 +1,27 @@
> +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
> +/*
> + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
> + */
> +
> +#ifndef VIRTIO_VFIO_CMD_H
> +#define VIRTIO_VFIO_CMD_H
> +
> +#include <linux/kernel.h>
> +#include <linux/virtio.h>
> +#include <linux/vfio_pci_core.h>
> +#include <linux/virtio_pci.h>
> +
> +struct virtiovf_pci_core_device {
> +	struct vfio_pci_core_device core_device;
> +	int vf_id;
> +};
> +
> +int virtiovf_cmd_list_query(struct pci_dev *pdev, u8 *buf, int buf_size);
> +int virtiovf_cmd_list_use(struct pci_dev *pdev, u8 *buf, int buf_size);
> +int virtiovf_cmd_lr_write(struct virtiovf_pci_core_device *virtvdev, u16 opcode,
> +			  u8 offset, u8 size, u8 *buf);
> +int virtiovf_cmd_lr_read(struct virtiovf_pci_core_device *virtvdev, u16 opcode,
> +			 u8 offset, u8 size, u8 *buf);
> +int virtiovf_cmd_lq_read_notify(struct virtiovf_pci_core_device *virtvdev,
> +				u8 req_bar_flags, u8 *bar, u64 *bar_offset);
> +#endif /* VIRTIO_VFIO_CMD_H */
> -- 
> 2.27.0
Yishai Hadas Sept. 26, 2023, 10:51 a.m. UTC | #4
On 21/09/2023 23:34, Michael S. Tsirkin wrote:
> On Thu, Sep 21, 2023 at 03:40:39PM +0300, Yishai Hadas wrote:
>> Expose admin commands over the virtio device, to be used by the
>> vfio-virtio driver in the next patches.
>>
>> It includes: list query/use, legacy write/read, read notify_info.
>>
>> Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
>> ---
>>   drivers/vfio/pci/virtio/cmd.c | 146 ++++++++++++++++++++++++++++++++++
>>   drivers/vfio/pci/virtio/cmd.h |  27 +++++++
>>   2 files changed, 173 insertions(+)
>>   create mode 100644 drivers/vfio/pci/virtio/cmd.c
>>   create mode 100644 drivers/vfio/pci/virtio/cmd.h
>>
>> diff --git a/drivers/vfio/pci/virtio/cmd.c b/drivers/vfio/pci/virtio/cmd.c
>> new file mode 100644
>> index 000000000000..f068239cdbb0
>> --- /dev/null
>> +++ b/drivers/vfio/pci/virtio/cmd.c
>> @@ -0,0 +1,146 @@
>> +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
>> +/*
>> + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved
>> + */
>> +
>> +#include "cmd.h"
>> +
>> +int virtiovf_cmd_list_query(struct pci_dev *pdev, u8 *buf, int buf_size)
>> +{
>> +	struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev);
>> +	struct scatterlist out_sg;
>> +	struct virtio_admin_cmd cmd = {};
>> +
>> +	if (!virtio_dev)
>> +		return -ENOTCONN;
>> +
>> +	sg_init_one(&out_sg, buf, buf_size);
>> +	cmd.opcode = VIRTIO_ADMIN_CMD_LIST_QUERY;
>> +	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
>> +	cmd.result_sg = &out_sg;
>> +
>> +	return virtio_admin_cmd_exec(virtio_dev, &cmd);
>> +}
>> +
> in/out seem all wrong here. In virtio terminology, in means from
> device to driver, out means from driver to device.
I was referring to in/out from the vfio POV, which prepares the command.

However, I can change it to follow the virtio terminology you suggested,
if that makes more sense.

Please also see my upcoming answer to your suggestion to put all of this
in the virtio layer.

>> +int virtiovf_cmd_list_use(struct pci_dev *pdev, u8 *buf, int buf_size)
>> +{
>> +	struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev);
>> +	struct scatterlist in_sg;
>> +	struct virtio_admin_cmd cmd = {};
>> +
>> +	if (!virtio_dev)
>> +		return -ENOTCONN;
>> +
>> +	sg_init_one(&in_sg, buf, buf_size);
>> +	cmd.opcode = VIRTIO_ADMIN_CMD_LIST_USE;
>> +	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
>> +	cmd.data_sg = &in_sg;
>> +
>> +	return virtio_admin_cmd_exec(virtio_dev, &cmd);
>> +}
>> +
>> +int virtiovf_cmd_lr_write(struct virtiovf_pci_core_device *virtvdev, u16 opcode,
>
> what is _lr short for?

This is an acronym for legacy_read.

The actual command is determined by the given opcode, which can be one of
LEGACY_COMMON_CFG_READ or LEGACY_DEV_CFG_READ.

I can rename it to '_legacy_read' (i.e. virtiovf_issue_legacy_read_cmd)
to be clearer.

>
>> +			  u8 offset, u8 size, u8 *buf)
>> +{
>> +	struct virtio_device *virtio_dev =
>> +		virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev);
>> +	struct virtio_admin_cmd_data_lr_write *in;
>> +	struct scatterlist in_sg;
>> +	struct virtio_admin_cmd cmd = {};
>> +	int ret;
>> +
>> +	if (!virtio_dev)
>> +		return -ENOTCONN;
>> +
>> +	in = kzalloc(sizeof(*in) + size, GFP_KERNEL);
>> +	if (!in)
>> +		return -ENOMEM;
>> +
>> +	in->offset = offset;
>> +	memcpy(in->registers, buf, size);
>> +	sg_init_one(&in_sg, in, sizeof(*in) + size);
>> +	cmd.opcode = opcode;
>> +	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
>> +	cmd.group_member_id = virtvdev->vf_id + 1;
> weird. why + 1?

This follows the virtio spec in that area.

"When sending commands with the SR-IOV group type, the driver specify a 
value for group_member_id
between 1 and NumVFs inclusive."

The 'virtvdev->vf_id' is set during vfio/virtio driver initialization by
pci_iov_vf_id(), whose first index is 0.
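
For reference, a minimal sketch (assuming the probe path of the coming
vfio-virtio driver; not part of this patch) of how vf_id ends up being
filled in and then mapped to group_member_id:

	/* Sketch only: vf_id is the zero-based VF index from the PCI core. */
	static int virtiovf_sketch_init_vf_id(struct virtiovf_pci_core_device *virtvdev)
	{
		int vf_id = pci_iov_vf_id(virtvdev->core_device.pdev);

		if (vf_id < 0)
			return vf_id;

		virtvdev->vf_id = vf_id;
		return 0;
	}

	/* Later, per the spec, group_member_id runs 1..NumVFs, hence: */
	/*	cmd.group_member_id = virtvdev->vf_id + 1; */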

>> +	cmd.data_sg = &in_sg;
>> +	ret = virtio_admin_cmd_exec(virtio_dev, &cmd);
>> +
>> +	kfree(in);
>> +	return ret;
>> +}
> How do you know it's safe to send this command, in particular at
> this time? This seems to be doing zero checks, and zero synchronization
> with the PF driver.
>
virtiovf_cmd_lr_read() and the other helpers take a virtio VF and get its
PF by calling virtio_pci_vf_get_pf_dev().

The VF can't go away via 'disable sriov', as it's owned/used by vfio.

The PF can't go away via rmmod/modprobe -r of virtio, because of the
'module in use' dependency between VFIO and VIRTIO.

The check below [1] was added only from a clean-code perspective; it might
theoretically fail in case the given VF doesn't use a virtio driver.

[1] if (!virtio_dev)
         return -ENOTCONN;

So, it looks safe as is.

>> +
>> +int virtiovf_cmd_lr_read(struct virtiovf_pci_core_device *virtvdev, u16 opcode,
>> +			 u8 offset, u8 size, u8 *buf)
>> +{
>> +	struct virtio_device *virtio_dev =
>> +		virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev);
>> +	struct virtio_admin_cmd_data_lr_read *in;
>> +	struct scatterlist in_sg, out_sg;
>> +	struct virtio_admin_cmd cmd = {};
>> +	int ret;
>> +
>> +	if (!virtio_dev)
>> +		return -ENOTCONN;
>> +
>> +	in = kzalloc(sizeof(*in), GFP_KERNEL);
>> +	if (!in)
>> +		return -ENOMEM;
>> +
>> +	in->offset = offset;
>> +	sg_init_one(&in_sg, in, sizeof(*in));
>> +	sg_init_one(&out_sg, buf, size);
>> +	cmd.opcode = opcode;
>> +	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
>> +	cmd.data_sg = &in_sg;
>> +	cmd.result_sg = &out_sg;
>> +	cmd.group_member_id = virtvdev->vf_id + 1;
>> +	ret = virtio_admin_cmd_exec(virtio_dev, &cmd);
>> +
>> +	kfree(in);
>> +	return ret;
>> +}
>> +
>> +int virtiovf_cmd_lq_read_notify(struct virtiovf_pci_core_device *virtvdev,
> and what is lq short for?

To be more explicit, I may rename it to virtiovf_cmd_legacy_notify_info()
to follow the spec opcode.

Yishai

>
>> +				u8 req_bar_flags, u8 *bar, u64 *bar_offset)
>> +{
>> +	struct virtio_device *virtio_dev =
>> +		virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev);
>> +	struct virtio_admin_cmd_notify_info_result *out;
>> +	struct scatterlist out_sg;
>> +	struct virtio_admin_cmd cmd = {};
>> +	int ret;
>> +
>> +	if (!virtio_dev)
>> +		return -ENOTCONN;
>> +
>> +	out = kzalloc(sizeof(*out), GFP_KERNEL);
>> +	if (!out)
>> +		return -ENOMEM;
>> +
>> +	sg_init_one(&out_sg, out, sizeof(*out));
>> +	cmd.opcode = VIRTIO_ADMIN_CMD_LEGACY_NOTIFY_INFO;
>> +	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
>> +	cmd.result_sg = &out_sg;
>> +	cmd.group_member_id = virtvdev->vf_id + 1;
>> +	ret = virtio_admin_cmd_exec(virtio_dev, &cmd);
>> +	if (!ret) {
>> +		struct virtio_admin_cmd_notify_info_data *entry;
>> +		int i;
>> +
>> +		ret = -ENOENT;
>> +		for (i = 0; i < VIRTIO_ADMIN_CMD_MAX_NOTIFY_INFO; i++) {
>> +			entry = &out->entries[i];
>> +			if (entry->flags == VIRTIO_ADMIN_CMD_NOTIFY_INFO_FLAGS_END)
>> +				break;
>> +			if (entry->flags != req_bar_flags)
>> +				continue;
>> +			*bar = entry->bar;
>> +			*bar_offset = le64_to_cpu(entry->offset);
>> +			ret = 0;
>> +			break;
>> +		}
>> +	}
>> +
>> +	kfree(out);
>> +	return ret;
>> +}
>> diff --git a/drivers/vfio/pci/virtio/cmd.h b/drivers/vfio/pci/virtio/cmd.h
>> new file mode 100644
>> index 000000000000..c2a3645f4b90
>> --- /dev/null
>> +++ b/drivers/vfio/pci/virtio/cmd.h
>> @@ -0,0 +1,27 @@
>> +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
>> +/*
>> + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
>> + */
>> +
>> +#ifndef VIRTIO_VFIO_CMD_H
>> +#define VIRTIO_VFIO_CMD_H
>> +
>> +#include <linux/kernel.h>
>> +#include <linux/virtio.h>
>> +#include <linux/vfio_pci_core.h>
>> +#include <linux/virtio_pci.h>
>> +
>> +struct virtiovf_pci_core_device {
>> +	struct vfio_pci_core_device core_device;
>> +	int vf_id;
>> +};
>> +
>> +int virtiovf_cmd_list_query(struct pci_dev *pdev, u8 *buf, int buf_size);
>> +int virtiovf_cmd_list_use(struct pci_dev *pdev, u8 *buf, int buf_size);
>> +int virtiovf_cmd_lr_write(struct virtiovf_pci_core_device *virtvdev, u16 opcode,
>> +			  u8 offset, u8 size, u8 *buf);
>> +int virtiovf_cmd_lr_read(struct virtiovf_pci_core_device *virtvdev, u16 opcode,
>> +			 u8 offset, u8 size, u8 *buf);
>> +int virtiovf_cmd_lq_read_notify(struct virtiovf_pci_core_device *virtvdev,
>> +				u8 req_bar_flags, u8 *bar, u64 *bar_offset);
>> +#endif /* VIRTIO_VFIO_CMD_H */
>> -- 
>> 2.27.0
Yishai Hadas Sept. 26, 2023, 11:14 a.m. UTC | #5
On 22/09/2023 12:54, Michael S. Tsirkin wrote:
> On Thu, Sep 21, 2023 at 03:40:39PM +0300, Yishai Hadas wrote:
>> Expose admin commands over the virtio device, to be used by the
>> vfio-virtio driver in the next patches.
>>
>> It includes: list query/use, legacy write/read, read notify_info.
>>
>> Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
>
> This stuff is pure virtio spec. I think it should live under
> drivers/virtio, too.

The motivation for putting it in the vfio layer comes from the following
main reasons:

1) Having it inside virtio may require exporting a symbol/function per
command.

    Today this would end up at 5 exported symbols, and in the future
(e.g. live migration) many more.

    With the current code we export only 2 generic symbols,
virtio_pci_vf_get_pf_dev() and virtio_admin_cmd_exec(), which can
accommodate any further extension.

2) For now there is no logic in this vfio layer; however, in the future
we may have some DMA/other logic that better fits the caller/client
layer (i.e. vfio).

By the way, this follows what was already done between the vfio/mlx5 and
mlx5_core modules, where mlx5_core exposes generic APIs to execute a
command and to get the PF from a given mlx5 VF.

This way, further commands can be added/extended easily and cleanly.

See for example here [1, 2].

[1] https://elixir.bootlin.com/linux/v6.6-rc3/source/drivers/vfio/pci/mlx5/cmd.c#L210

[2] https://elixir.bootlin.com/linux/v6.6-rc3/source/drivers/vfio/pci/mlx5/cmd.c#L683
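
To make the "further commands" point concrete, a sketch of how an
additional (purely hypothetical) command could be layered on the same two
exported symbols, mirroring the pattern used in this patch:

	/* Hypothetical future command; the opcode name is a placeholder. */
	static int virtiovf_sketch_future_cmd(struct virtiovf_pci_core_device *virtvdev,
					      void *out, int out_size)
	{
		struct virtio_device *virtio_dev =
			virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev);
		struct virtio_admin_cmd cmd = {};
		struct scatterlist result_sg;

		if (!virtio_dev)
			return -ENOTCONN;

		sg_init_one(&result_sg, out, out_size);
		cmd.opcode = VIRTIO_ADMIN_CMD_HYPOTHETICAL_OP; /* placeholder */
		cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
		cmd.group_member_id = virtvdev->vf_id + 1;
		cmd.result_sg = &result_sg;

		return virtio_admin_cmd_exec(virtio_dev, &cmd);
	}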

Yishai

>
>> ---
>>   drivers/vfio/pci/virtio/cmd.c | 146 ++++++++++++++++++++++++++++++++++
>>   drivers/vfio/pci/virtio/cmd.h |  27 +++++++
>>   2 files changed, 173 insertions(+)
>>   create mode 100644 drivers/vfio/pci/virtio/cmd.c
>>   create mode 100644 drivers/vfio/pci/virtio/cmd.h
>>
>> diff --git a/drivers/vfio/pci/virtio/cmd.c b/drivers/vfio/pci/virtio/cmd.c
>> new file mode 100644
>> index 000000000000..f068239cdbb0
>> --- /dev/null
>> +++ b/drivers/vfio/pci/virtio/cmd.c
>> @@ -0,0 +1,146 @@
>> +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
>> +/*
>> + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved
>> + */
>> +
>> +#include "cmd.h"
>> +
>> +int virtiovf_cmd_list_query(struct pci_dev *pdev, u8 *buf, int buf_size)
>> +{
>> +	struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev);
>> +	struct scatterlist out_sg;
>> +	struct virtio_admin_cmd cmd = {};
>> +
>> +	if (!virtio_dev)
>> +		return -ENOTCONN;
>> +
>> +	sg_init_one(&out_sg, buf, buf_size);
>> +	cmd.opcode = VIRTIO_ADMIN_CMD_LIST_QUERY;
>> +	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
>> +	cmd.result_sg = &out_sg;
>> +
>> +	return virtio_admin_cmd_exec(virtio_dev, &cmd);
>> +}
>> +
>> +int virtiovf_cmd_list_use(struct pci_dev *pdev, u8 *buf, int buf_size)
>> +{
>> +	struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev);
>> +	struct scatterlist in_sg;
>> +	struct virtio_admin_cmd cmd = {};
>> +
>> +	if (!virtio_dev)
>> +		return -ENOTCONN;
>> +
>> +	sg_init_one(&in_sg, buf, buf_size);
>> +	cmd.opcode = VIRTIO_ADMIN_CMD_LIST_USE;
>> +	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
>> +	cmd.data_sg = &in_sg;
>> +
>> +	return virtio_admin_cmd_exec(virtio_dev, &cmd);
>> +}
>> +
>> +int virtiovf_cmd_lr_write(struct virtiovf_pci_core_device *virtvdev, u16 opcode,
>> +			  u8 offset, u8 size, u8 *buf)
>> +{
>> +	struct virtio_device *virtio_dev =
>> +		virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev);
>> +	struct virtio_admin_cmd_data_lr_write *in;
>> +	struct scatterlist in_sg;
>> +	struct virtio_admin_cmd cmd = {};
>> +	int ret;
>> +
>> +	if (!virtio_dev)
>> +		return -ENOTCONN;
>> +
>> +	in = kzalloc(sizeof(*in) + size, GFP_KERNEL);
>> +	if (!in)
>> +		return -ENOMEM;
>> +
>> +	in->offset = offset;
>> +	memcpy(in->registers, buf, size);
>> +	sg_init_one(&in_sg, in, sizeof(*in) + size);
>> +	cmd.opcode = opcode;
>> +	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
>> +	cmd.group_member_id = virtvdev->vf_id + 1;
>> +	cmd.data_sg = &in_sg;
>> +	ret = virtio_admin_cmd_exec(virtio_dev, &cmd);
>> +
>> +	kfree(in);
>> +	return ret;
>> +}
>> +
>> +int virtiovf_cmd_lr_read(struct virtiovf_pci_core_device *virtvdev, u16 opcode,
>> +			 u8 offset, u8 size, u8 *buf)
>> +{
>> +	struct virtio_device *virtio_dev =
>> +		virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev);
>> +	struct virtio_admin_cmd_data_lr_read *in;
>> +	struct scatterlist in_sg, out_sg;
>> +	struct virtio_admin_cmd cmd = {};
>> +	int ret;
>> +
>> +	if (!virtio_dev)
>> +		return -ENOTCONN;
>> +
>> +	in = kzalloc(sizeof(*in), GFP_KERNEL);
>> +	if (!in)
>> +		return -ENOMEM;
>> +
>> +	in->offset = offset;
>> +	sg_init_one(&in_sg, in, sizeof(*in));
>> +	sg_init_one(&out_sg, buf, size);
>> +	cmd.opcode = opcode;
>> +	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
>> +	cmd.data_sg = &in_sg;
>> +	cmd.result_sg = &out_sg;
>> +	cmd.group_member_id = virtvdev->vf_id + 1;
>> +	ret = virtio_admin_cmd_exec(virtio_dev, &cmd);
>> +
>> +	kfree(in);
>> +	return ret;
>> +}
>> +
>> +int virtiovf_cmd_lq_read_notify(struct virtiovf_pci_core_device *virtvdev,
>> +				u8 req_bar_flags, u8 *bar, u64 *bar_offset)
>> +{
>> +	struct virtio_device *virtio_dev =
>> +		virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev);
>> +	struct virtio_admin_cmd_notify_info_result *out;
>> +	struct scatterlist out_sg;
>> +	struct virtio_admin_cmd cmd = {};
>> +	int ret;
>> +
>> +	if (!virtio_dev)
>> +		return -ENOTCONN;
>> +
>> +	out = kzalloc(sizeof(*out), GFP_KERNEL);
>> +	if (!out)
>> +		return -ENOMEM;
>> +
>> +	sg_init_one(&out_sg, out, sizeof(*out));
>> +	cmd.opcode = VIRTIO_ADMIN_CMD_LEGACY_NOTIFY_INFO;
>> +	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
>> +	cmd.result_sg = &out_sg;
>> +	cmd.group_member_id = virtvdev->vf_id + 1;
>> +	ret = virtio_admin_cmd_exec(virtio_dev, &cmd);
>> +	if (!ret) {
>> +		struct virtio_admin_cmd_notify_info_data *entry;
>> +		int i;
>> +
>> +		ret = -ENOENT;
>> +		for (i = 0; i < VIRTIO_ADMIN_CMD_MAX_NOTIFY_INFO; i++) {
>> +			entry = &out->entries[i];
>> +			if (entry->flags == VIRTIO_ADMIN_CMD_NOTIFY_INFO_FLAGS_END)
>> +				break;
>> +			if (entry->flags != req_bar_flags)
>> +				continue;
>> +			*bar = entry->bar;
>> +			*bar_offset = le64_to_cpu(entry->offset);
>> +			ret = 0;
>> +			break;
>> +		}
>> +	}
>> +
>> +	kfree(out);
>> +	return ret;
>> +}
>> diff --git a/drivers/vfio/pci/virtio/cmd.h b/drivers/vfio/pci/virtio/cmd.h
>> new file mode 100644
>> index 000000000000..c2a3645f4b90
>> --- /dev/null
>> +++ b/drivers/vfio/pci/virtio/cmd.h
>> @@ -0,0 +1,27 @@
>> +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
>> +/*
>> + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
>> + */
>> +
>> +#ifndef VIRTIO_VFIO_CMD_H
>> +#define VIRTIO_VFIO_CMD_H
>> +
>> +#include <linux/kernel.h>
>> +#include <linux/virtio.h>
>> +#include <linux/vfio_pci_core.h>
>> +#include <linux/virtio_pci.h>
>> +
>> +struct virtiovf_pci_core_device {
>> +	struct vfio_pci_core_device core_device;
>> +	int vf_id;
>> +};
>> +
>> +int virtiovf_cmd_list_query(struct pci_dev *pdev, u8 *buf, int buf_size);
>> +int virtiovf_cmd_list_use(struct pci_dev *pdev, u8 *buf, int buf_size);
>> +int virtiovf_cmd_lr_write(struct virtiovf_pci_core_device *virtvdev, u16 opcode,
>> +			  u8 offset, u8 size, u8 *buf);
>> +int virtiovf_cmd_lr_read(struct virtiovf_pci_core_device *virtvdev, u16 opcode,
>> +			 u8 offset, u8 size, u8 *buf);
>> +int virtiovf_cmd_lq_read_notify(struct virtiovf_pci_core_device *virtvdev,
>> +				u8 req_bar_flags, u8 *bar, u64 *bar_offset);
>> +#endif /* VIRTIO_VFIO_CMD_H */
>> -- 
>> 2.27.0
Michael S. Tsirkin Sept. 26, 2023, 11:25 a.m. UTC | #6
On Tue, Sep 26, 2023 at 01:51:13PM +0300, Yishai Hadas wrote:
> On 21/09/2023 23:34, Michael S. Tsirkin wrote:
> > On Thu, Sep 21, 2023 at 03:40:39PM +0300, Yishai Hadas wrote:
> > > Expose admin commands over the virtio device, to be used by the
> > > vfio-virtio driver in the next patches.
> > > 
> > > It includes: list query/use, legacy write/read, read notify_info.
> > > 
> > > Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
> > > ---
> > >   drivers/vfio/pci/virtio/cmd.c | 146 ++++++++++++++++++++++++++++++++++
> > >   drivers/vfio/pci/virtio/cmd.h |  27 +++++++
> > >   2 files changed, 173 insertions(+)
> > >   create mode 100644 drivers/vfio/pci/virtio/cmd.c
> > >   create mode 100644 drivers/vfio/pci/virtio/cmd.h
> > > 
> > > diff --git a/drivers/vfio/pci/virtio/cmd.c b/drivers/vfio/pci/virtio/cmd.c
> > > new file mode 100644
> > > index 000000000000..f068239cdbb0
> > > --- /dev/null
> > > +++ b/drivers/vfio/pci/virtio/cmd.c
> > > @@ -0,0 +1,146 @@
> > > +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
> > > +/*
> > > + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved
> > > + */
> > > +
> > > +#include "cmd.h"
> > > +
> > > +int virtiovf_cmd_list_query(struct pci_dev *pdev, u8 *buf, int buf_size)
> > > +{
> > > +	struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev);
> > > +	struct scatterlist out_sg;
> > > +	struct virtio_admin_cmd cmd = {};
> > > +
> > > +	if (!virtio_dev)
> > > +		return -ENOTCONN;
> > > +
> > > +	sg_init_one(&out_sg, buf, buf_size);
> > > +	cmd.opcode = VIRTIO_ADMIN_CMD_LIST_QUERY;
> > > +	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
> > > +	cmd.result_sg = &out_sg;
> > > +
> > > +	return virtio_admin_cmd_exec(virtio_dev, &cmd);
> > > +}
> > > +
> > in/out seem all wrong here. In virtio terminology, in means from
> > device to driver, out means from driver to device.
> I was referring to in/out from the vfio POV, which prepares the command.
> 
> However, I can change it to follow the virtio terminology you suggested,
> if that makes more sense.
> 
> Please also see my upcoming answer to your suggestion to put all of this
> in the virtio layer.
> 
> > > +int virtiovf_cmd_list_use(struct pci_dev *pdev, u8 *buf, int buf_size)
> > > +{
> > > +	struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev);
> > > +	struct scatterlist in_sg;
> > > +	struct virtio_admin_cmd cmd = {};
> > > +
> > > +	if (!virtio_dev)
> > > +		return -ENOTCONN;
> > > +
> > > +	sg_init_one(&in_sg, buf, buf_size);
> > > +	cmd.opcode = VIRTIO_ADMIN_CMD_LIST_USE;
> > > +	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
> > > +	cmd.data_sg = &in_sg;
> > > +
> > > +	return virtio_admin_cmd_exec(virtio_dev, &cmd);
> > > +}
> > > +
> > > +int virtiovf_cmd_lr_write(struct virtiovf_pci_core_device *virtvdev, u16 opcode,
> > 
> > what is _lr short for?
> 
> This is an acronym for legacy_read.
> 
> The actual command is determined by the given opcode, which can be one of
> LEGACY_COMMON_CFG_READ or LEGACY_DEV_CFG_READ.
> 
> I can rename it to '_legacy_read' (i.e. virtiovf_issue_legacy_read_cmd) to
> be clearer.
> 
> > 
> > > +			  u8 offset, u8 size, u8 *buf)
> > > +{
> > > +	struct virtio_device *virtio_dev =
> > > +		virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev);
> > > +	struct virtio_admin_cmd_data_lr_write *in;
> > > +	struct scatterlist in_sg;
> > > +	struct virtio_admin_cmd cmd = {};
> > > +	int ret;
> > > +
> > > +	if (!virtio_dev)
> > > +		return -ENOTCONN;
> > > +
> > > +	in = kzalloc(sizeof(*in) + size, GFP_KERNEL);
> > > +	if (!in)
> > > +		return -ENOMEM;
> > > +
> > > +	in->offset = offset;
> > > +	memcpy(in->registers, buf, size);
> > > +	sg_init_one(&in_sg, in, sizeof(*in) + size);
> > > +	cmd.opcode = opcode;
> > > +	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
> > > +	cmd.group_member_id = virtvdev->vf_id + 1;
> > weird. why + 1?
> 
> This follows the virtio spec in that area.
> 
> "When sending commands with the SR-IOV group type, the driver specify a
> value for group_member_id
> between 1 and NumVFs inclusive."

Ah, I get it. Pls add a comment.
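
One possible shape for such a comment (the wording is illustrative only,
not taken from a later revision of the patch):

	/*
	 * The virtio spec defines group_member_id for the SR-IOV group
	 * type as 1..NumVFs, while pci_iov_vf_id() is zero-based.
	 */
	cmd.group_member_id = virtvdev->vf_id + 1;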

> The 'virtvdev->vf_id' is set during vfio/virtio driver initialization by
> pci_iov_vf_id(), whose first index is 0.
> 
> > > +	cmd.data_sg = &in_sg;
> > > +	ret = virtio_admin_cmd_exec(virtio_dev, &cmd);
> > > +
> > > +	kfree(in);
> > > +	return ret;
> > > +}
> > How do you know it's safe to send this command, in particular at
> > this time? This seems to be doing zero checks, and zero synchronization
> > with the PF driver.
> > 
> virtiovf_cmd_lr_read() and the other helpers take a virtio VF and get its
> PF by calling virtio_pci_vf_get_pf_dev().
> 
> The VF can't go away via 'disable sriov', as it's owned/used by vfio.
> 
> The PF can't go away via rmmod/modprobe -r of virtio, because of the
> 'module in use' dependency between VFIO and VIRTIO.
> 
> The check below [1] was added only from a clean-code perspective; it might
> theoretically fail in case the given VF doesn't use a virtio driver.
> 
> [1] if (!virtio_dev)
>         return -ENOTCONN;
> 
> So, it looks safe as is.

Can the device be unbound from the module right after you did the check?
What about suspend - can this be called while suspend is in progress?


More importantly, virtio can decide to reset the device for its
own internal reasons (e.g. to recover from an error).
We used to do it when attaching XDP, and we can start doing it again.
That's one of the reasons why I want all this code under virtio, so we'll remember.


> > > +
> > > +int virtiovf_cmd_lr_read(struct virtiovf_pci_core_device *virtvdev, u16 opcode,
> > > +			 u8 offset, u8 size, u8 *buf)
> > > +{
> > > +	struct virtio_device *virtio_dev =
> > > +		virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev);
> > > +	struct virtio_admin_cmd_data_lr_read *in;
> > > +	struct scatterlist in_sg, out_sg;
> > > +	struct virtio_admin_cmd cmd = {};
> > > +	int ret;
> > > +
> > > +	if (!virtio_dev)
> > > +		return -ENOTCONN;
> > > +
> > > +	in = kzalloc(sizeof(*in), GFP_KERNEL);
> > > +	if (!in)
> > > +		return -ENOMEM;
> > > +
> > > +	in->offset = offset;
> > > +	sg_init_one(&in_sg, in, sizeof(*in));
> > > +	sg_init_one(&out_sg, buf, size);
> > > +	cmd.opcode = opcode;
> > > +	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
> > > +	cmd.data_sg = &in_sg;
> > > +	cmd.result_sg = &out_sg;
> > > +	cmd.group_member_id = virtvdev->vf_id + 1;
> > > +	ret = virtio_admin_cmd_exec(virtio_dev, &cmd);
> > > +
> > > +	kfree(in);
> > > +	return ret;
> > > +}
> > > +
> > > +int virtiovf_cmd_lq_read_notify(struct virtiovf_pci_core_device *virtvdev,
> > and what is lq short for?
> 
> To be more explicit, I may rename it to virtiovf_cmd_legacy_notify_info() to
> follow the spec opcode.
> 
> Yishai
> 
> > 
> > > +				u8 req_bar_flags, u8 *bar, u64 *bar_offset)
> > > +{
> > > +	struct virtio_device *virtio_dev =
> > > +		virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev);
> > > +	struct virtio_admin_cmd_notify_info_result *out;
> > > +	struct scatterlist out_sg;
> > > +	struct virtio_admin_cmd cmd = {};
> > > +	int ret;
> > > +
> > > +	if (!virtio_dev)
> > > +		return -ENOTCONN;
> > > +
> > > +	out = kzalloc(sizeof(*out), GFP_KERNEL);
> > > +	if (!out)
> > > +		return -ENOMEM;
> > > +
> > > +	sg_init_one(&out_sg, out, sizeof(*out));
> > > +	cmd.opcode = VIRTIO_ADMIN_CMD_LEGACY_NOTIFY_INFO;
> > > +	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
> > > +	cmd.result_sg = &out_sg;
> > > +	cmd.group_member_id = virtvdev->vf_id + 1;
> > > +	ret = virtio_admin_cmd_exec(virtio_dev, &cmd);
> > > +	if (!ret) {
> > > +		struct virtio_admin_cmd_notify_info_data *entry;
> > > +		int i;
> > > +
> > > +		ret = -ENOENT;
> > > +		for (i = 0; i < VIRTIO_ADMIN_CMD_MAX_NOTIFY_INFO; i++) {
> > > +			entry = &out->entries[i];
> > > +			if (entry->flags == VIRTIO_ADMIN_CMD_NOTIFY_INFO_FLAGS_END)
> > > +				break;
> > > +			if (entry->flags != req_bar_flags)
> > > +				continue;
> > > +			*bar = entry->bar;
> > > +			*bar_offset = le64_to_cpu(entry->offset);
> > > +			ret = 0;
> > > +			break;
> > > +		}
> > > +	}
> > > +
> > > +	kfree(out);
> > > +	return ret;
> > > +}
> > > diff --git a/drivers/vfio/pci/virtio/cmd.h b/drivers/vfio/pci/virtio/cmd.h
> > > new file mode 100644
> > > index 000000000000..c2a3645f4b90
> > > --- /dev/null
> > > +++ b/drivers/vfio/pci/virtio/cmd.h
> > > @@ -0,0 +1,27 @@
> > > +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
> > > +/*
> > > + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
> > > + */
> > > +
> > > +#ifndef VIRTIO_VFIO_CMD_H
> > > +#define VIRTIO_VFIO_CMD_H
> > > +
> > > +#include <linux/kernel.h>
> > > +#include <linux/virtio.h>
> > > +#include <linux/vfio_pci_core.h>
> > > +#include <linux/virtio_pci.h>
> > > +
> > > +struct virtiovf_pci_core_device {
> > > +	struct vfio_pci_core_device core_device;
> > > +	int vf_id;
> > > +};
> > > +
> > > +int virtiovf_cmd_list_query(struct pci_dev *pdev, u8 *buf, int buf_size);
> > > +int virtiovf_cmd_list_use(struct pci_dev *pdev, u8 *buf, int buf_size);
> > > +int virtiovf_cmd_lr_write(struct virtiovf_pci_core_device *virtvdev, u16 opcode,
> > > +			  u8 offset, u8 size, u8 *buf);
> > > +int virtiovf_cmd_lr_read(struct virtiovf_pci_core_device *virtvdev, u16 opcode,
> > > +			 u8 offset, u8 size, u8 *buf);
> > > +int virtiovf_cmd_lq_read_notify(struct virtiovf_pci_core_device *virtvdev,
> > > +				u8 req_bar_flags, u8 *bar, u64 *bar_offset);
> > > +#endif /* VIRTIO_VFIO_CMD_H */
> > > -- 
> > > 2.27.0
>
Michael S. Tsirkin Sept. 26, 2023, 11:41 a.m. UTC | #7
On Tue, Sep 26, 2023 at 02:14:01PM +0300, Yishai Hadas wrote:
> On 22/09/2023 12:54, Michael S. Tsirkin wrote:
> > On Thu, Sep 21, 2023 at 03:40:39PM +0300, Yishai Hadas wrote:
> > > Expose admin commands over the virtio device, to be used by the
> > > vfio-virtio driver in the next patches.
> > > 
> > > It includes: list query/use, legacy write/read, read notify_info.
> > > 
> > > Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
> > 
> > This stuff is pure virtio spec. I think it should live under
> > drivers/virtio, too.
> 
> The motivation for putting it in the vfio layer comes from the following
> main reasons:
> 
> 1) Having it inside virtio may require exporting a symbol/function per
> command.
> 
>    Today this would end up at 5 exported symbols, and in the future (e.g.
> live migration) many more.
>
>    With the current code we export only 2 generic symbols,
> virtio_pci_vf_get_pf_dev() and virtio_admin_cmd_exec(), which can
> accommodate any further extension.

Except, there's no reasonable way for virtio to know what is done with
the device then. You are not using just 2 symbols at all; instead you
are using the rich vq API, which was explicitly designed on the
assumption that the driver running the device - the one actually loaded
and running - is responsible for serializing accesses. And I *think*
your use won't conflict ATM mostly by luck. Witness the hack in patch 01
as exhibit 1 - nothing at all even hints at the fact that the reason for
the complicated dance is that another driver pokes at some of the vqs.


> 2) For now there is no logic in this vfio layer; however, in the future we
> may have some DMA/other logic that better fits the caller/client
> layer (i.e. vfio).

You are poking at the device without any locks etc. Maybe it looks like
no logic to you but it does not look like that from where I stand.

> By the way, this follows what was already done between the vfio/mlx5 and
> mlx5_core modules, where mlx5_core exposes generic APIs to execute a command
> and to get the PF from a given mlx5 VF.

This is up to mlx5 maintainers. In particular they only need to worry
that their patches work with specific hardware which they likely have.
virtio has to work with multiple vendors - hardware and software -
and exposing a low level API that I can't test on my laptop
is not at all my ideal.

> This way, further commands can be added/extended easily and
> cleanly.

Something for vfio maintainer to consider in case it was
assumed that it's just this one weird thing
but otherwise it's all generic vfio. It's not going to stop there,
will it? The duplication of functionality with vdpa will continue :(


I am much more interested in adding reusable functionality that
everyone benefits from than in vfio poking at the device in its
own weird ways that only benefit specific hardware.


> See for example here [1, 2].
> 
> [1] https://elixir.bootlin.com/linux/v6.6-rc3/source/drivers/vfio/pci/mlx5/cmd.c#L210
> 
> [2] https://elixir.bootlin.com/linux/v6.6-rc3/source/drivers/vfio/pci/mlx5/cmd.c#L683
> 
> Yishai



> > 
> > > ---
> > >   drivers/vfio/pci/virtio/cmd.c | 146 ++++++++++++++++++++++++++++++++++
> > >   drivers/vfio/pci/virtio/cmd.h |  27 +++++++
> > >   2 files changed, 173 insertions(+)
> > >   create mode 100644 drivers/vfio/pci/virtio/cmd.c
> > >   create mode 100644 drivers/vfio/pci/virtio/cmd.h
> > > 
> > > diff --git a/drivers/vfio/pci/virtio/cmd.c b/drivers/vfio/pci/virtio/cmd.c
> > > new file mode 100644
> > > index 000000000000..f068239cdbb0
> > > --- /dev/null
> > > +++ b/drivers/vfio/pci/virtio/cmd.c
> > > @@ -0,0 +1,146 @@
> > > +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
> > > +/*
> > > + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved
> > > + */
> > > +
> > > +#include "cmd.h"
> > > +
> > > +int virtiovf_cmd_list_query(struct pci_dev *pdev, u8 *buf, int buf_size)
> > > +{
> > > +	struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev);
> > > +	struct scatterlist out_sg;
> > > +	struct virtio_admin_cmd cmd = {};
> > > +
> > > +	if (!virtio_dev)
> > > +		return -ENOTCONN;
> > > +
> > > +	sg_init_one(&out_sg, buf, buf_size);
> > > +	cmd.opcode = VIRTIO_ADMIN_CMD_LIST_QUERY;
> > > +	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
> > > +	cmd.result_sg = &out_sg;
> > > +
> > > +	return virtio_admin_cmd_exec(virtio_dev, &cmd);
> > > +}
> > > +
> > > +int virtiovf_cmd_list_use(struct pci_dev *pdev, u8 *buf, int buf_size)
> > > +{
> > > +	struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev);
> > > +	struct scatterlist in_sg;
> > > +	struct virtio_admin_cmd cmd = {};
> > > +
> > > +	if (!virtio_dev)
> > > +		return -ENOTCONN;
> > > +
> > > +	sg_init_one(&in_sg, buf, buf_size);
> > > +	cmd.opcode = VIRTIO_ADMIN_CMD_LIST_USE;
> > > +	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
> > > +	cmd.data_sg = &in_sg;
> > > +
> > > +	return virtio_admin_cmd_exec(virtio_dev, &cmd);
> > > +}
> > > +
> > > +int virtiovf_cmd_lr_write(struct virtiovf_pci_core_device *virtvdev, u16 opcode,
> > > +			  u8 offset, u8 size, u8 *buf)
> > > +{
> > > +	struct virtio_device *virtio_dev =
> > > +		virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev);
> > > +	struct virtio_admin_cmd_data_lr_write *in;
> > > +	struct scatterlist in_sg;
> > > +	struct virtio_admin_cmd cmd = {};
> > > +	int ret;
> > > +
> > > +	if (!virtio_dev)
> > > +		return -ENOTCONN;
> > > +
> > > +	in = kzalloc(sizeof(*in) + size, GFP_KERNEL);
> > > +	if (!in)
> > > +		return -ENOMEM;
> > > +
> > > +	in->offset = offset;
> > > +	memcpy(in->registers, buf, size);
> > > +	sg_init_one(&in_sg, in, sizeof(*in) + size);
> > > +	cmd.opcode = opcode;
> > > +	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
> > > +	cmd.group_member_id = virtvdev->vf_id + 1;
> > > +	cmd.data_sg = &in_sg;
> > > +	ret = virtio_admin_cmd_exec(virtio_dev, &cmd);
> > > +
> > > +	kfree(in);
> > > +	return ret;
> > > +}
> > > +
> > > +int virtiovf_cmd_lr_read(struct virtiovf_pci_core_device *virtvdev, u16 opcode,
> > > +			 u8 offset, u8 size, u8 *buf)
> > > +{
> > > +	struct virtio_device *virtio_dev =
> > > +		virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev);
> > > +	struct virtio_admin_cmd_data_lr_read *in;
> > > +	struct scatterlist in_sg, out_sg;
> > > +	struct virtio_admin_cmd cmd = {};
> > > +	int ret;
> > > +
> > > +	if (!virtio_dev)
> > > +		return -ENOTCONN;
> > > +
> > > +	in = kzalloc(sizeof(*in), GFP_KERNEL);
> > > +	if (!in)
> > > +		return -ENOMEM;
> > > +
> > > +	in->offset = offset;
> > > +	sg_init_one(&in_sg, in, sizeof(*in));
> > > +	sg_init_one(&out_sg, buf, size);
> > > +	cmd.opcode = opcode;
> > > +	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
> > > +	cmd.data_sg = &in_sg;
> > > +	cmd.result_sg = &out_sg;
> > > +	cmd.group_member_id = virtvdev->vf_id + 1;
> > > +	ret = virtio_admin_cmd_exec(virtio_dev, &cmd);
> > > +
> > > +	kfree(in);
> > > +	return ret;
> > > +}
> > > +
> > > +int virtiovf_cmd_lq_read_notify(struct virtiovf_pci_core_device *virtvdev,
> > > +				u8 req_bar_flags, u8 *bar, u64 *bar_offset)
> > > +{
> > > +	struct virtio_device *virtio_dev =
> > > +		virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev);
> > > +	struct virtio_admin_cmd_notify_info_result *out;
> > > +	struct scatterlist out_sg;
> > > +	struct virtio_admin_cmd cmd = {};
> > > +	int ret;
> > > +
> > > +	if (!virtio_dev)
> > > +		return -ENOTCONN;
> > > +
> > > +	out = kzalloc(sizeof(*out), GFP_KERNEL);
> > > +	if (!out)
> > > +		return -ENOMEM;
> > > +
> > > +	sg_init_one(&out_sg, out, sizeof(*out));
> > > +	cmd.opcode = VIRTIO_ADMIN_CMD_LEGACY_NOTIFY_INFO;
> > > +	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
> > > +	cmd.result_sg = &out_sg;
> > > +	cmd.group_member_id = virtvdev->vf_id + 1;
> > > +	ret = virtio_admin_cmd_exec(virtio_dev, &cmd);
> > > +	if (!ret) {
> > > +		struct virtio_admin_cmd_notify_info_data *entry;
> > > +		int i;
> > > +
> > > +		ret = -ENOENT;
> > > +		for (i = 0; i < VIRTIO_ADMIN_CMD_MAX_NOTIFY_INFO; i++) {
> > > +			entry = &out->entries[i];
> > > +			if (entry->flags == VIRTIO_ADMIN_CMD_NOTIFY_INFO_FLAGS_END)
> > > +				break;
> > > +			if (entry->flags != req_bar_flags)
> > > +				continue;
> > > +			*bar = entry->bar;
> > > +			*bar_offset = le64_to_cpu(entry->offset);
> > > +			ret = 0;
> > > +			break;
> > > +		}
> > > +	}
> > > +
> > > +	kfree(out);
> > > +	return ret;
> > > +}
> > > diff --git a/drivers/vfio/pci/virtio/cmd.h b/drivers/vfio/pci/virtio/cmd.h
> > > new file mode 100644
> > > index 000000000000..c2a3645f4b90
> > > --- /dev/null
> > > +++ b/drivers/vfio/pci/virtio/cmd.h
> > > @@ -0,0 +1,27 @@
> > > +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
> > > +/*
> > > + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
> > > + */
> > > +
> > > +#ifndef VIRTIO_VFIO_CMD_H
> > > +#define VIRTIO_VFIO_CMD_H
> > > +
> > > +#include <linux/kernel.h>
> > > +#include <linux/virtio.h>
> > > +#include <linux/vfio_pci_core.h>
> > > +#include <linux/virtio_pci.h>
> > > +
> > > +struct virtiovf_pci_core_device {
> > > +	struct vfio_pci_core_device core_device;
> > > +	int vf_id;
> > > +};
> > > +
> > > +int virtiovf_cmd_list_query(struct pci_dev *pdev, u8 *buf, int buf_size);
> > > +int virtiovf_cmd_list_use(struct pci_dev *pdev, u8 *buf, int buf_size);
> > > +int virtiovf_cmd_lr_write(struct virtiovf_pci_core_device *virtvdev, u16 opcode,
> > > +			  u8 offset, u8 size, u8 *buf);
> > > +int virtiovf_cmd_lr_read(struct virtiovf_pci_core_device *virtvdev, u16 opcode,
> > > +			 u8 offset, u8 size, u8 *buf);
> > > +int virtiovf_cmd_lq_read_notify(struct virtiovf_pci_core_device *virtvdev,
> > > +				u8 req_bar_flags, u8 *bar, u64 *bar_offset);
> > > +#endif /* VIRTIO_VFIO_CMD_H */
> > > -- 
> > > 2.27.0
>
Jason Gunthorpe Sept. 27, 2023, 1:18 p.m. UTC | #8
On Tue, Sep 26, 2023 at 07:41:44AM -0400, Michael S. Tsirkin wrote:

> > By the way, this follows what was already done between the vfio/mlx5 and
> > mlx5_core modules, where mlx5_core exposes generic APIs to execute a command
> > and to get the PF from a given mlx5 VF.
> 
> This is up to mlx5 maintainers. In particular they only need to worry
> that their patches work with specific hardware which they likely have.
> virtio has to work with multiple vendors - hardware and software -
> and exposing a low level API that I can't test on my laptop
> is not at all my ideal.

mlx5 has a reasonable API from the lower level that allows the vfio
driver to safely issue commands. The API provides all the safety and
locking you have been questioning here.

Then the vfio driver can form the commands directly and in the way it
needs. This avoids spewing code into the core modules that is only
used by vfio - which has been a key design consideration for our
driver layering.

I suggest following the same design here as it has been well proven.
Provide a solid API to operate the admin queue and let VFIO use
it. One of the main purposes of the admin queue is to deliver commands
on behalf of the VF driver, so this is a logical and reasonable place
to put an API.
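
For illustration, one shape such an admin-queue API could take (the
acquire/release names are hypothetical; only virtio_admin_cmd_exec() is
mentioned earlier in this thread as an existing export):

	/* Resolve and reference the owning PF virtio device for a bound VF. */
	struct virtio_device *virtio_admin_acquire_pf(struct pci_dev *vf_pdev);
	void virtio_admin_release_pf(struct virtio_device *pf_vdev);

	/*
	 * Execute one admin command; the virtio core would serialize this
	 * against its own admin VQ users and against device reset.
	 */
	int virtio_admin_cmd_exec(struct virtio_device *pf_vdev,
				  struct virtio_admin_cmd *cmd);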

> > This way, further commands can be added/extended easily and
> > cleanly.
> 
> Something for vfio maintainer to consider in case it was
> assumed that it's just this one weird thing
> but otherwise it's all generic vfio. It's not going to stop there,
> will it? The duplication of functionality with vdpa will continue :(

VFIO live migration is expected to come as well once OASIS completes
its work.

Parav, are there other things?

Jason
Michael S. Tsirkin Sept. 27, 2023, 9:30 p.m. UTC | #9
On Wed, Sep 27, 2023 at 10:18:17AM -0300, Jason Gunthorpe wrote:
> On Tue, Sep 26, 2023 at 07:41:44AM -0400, Michael S. Tsirkin wrote:
> 
> > > By the way, this follows what was done already between vfio/mlx5 to
> > > mlx5_core modules where mlx5_core exposes generic APIs to execute a command
> > > and to get the a PF from a given mlx5 VF.
> > 
> > This is up to mlx5 maintainers. In particular they only need to worry
> > that their patches work with specific hardware which they likely have.
> > virtio has to work with multiple vendors - hardware and software -
> > and exposing a low level API that I can't test on my laptop
> > is not at all my ideal.
> 
> mlx5 has a reasonable API from the lower level that allows the vfio
> driver to safely issue commands. The API provides all the safety and
> locking you have been questioning here.
> 
> Then the vfio driver can form the commands directly and in the way it
> needs. This avoids spewing code into the core modules that is only
> used by vfio - which has been a key design consideration for our
> driver layering.
> 
> I suggest following the same design here as it has been well proven.
> Provide a solid API to operate the admin queue and let VFIO use
> it. One of the main purposes of the admin queue is to deliver commands
> on behalf of the VF driver, so this is a logical and reasonable place
> to put an API.

Not the way virtio is designed now. I guess mlx5 is designed in
a way that makes it safe.

> > > This way, we can enable further commands to be added/extended
> > > easily/cleanly.
> > 
> > Something for vfio maintainer to consider in case it was
> > assumed that it's just this one weird thing
> > but otherwise it's all generic vfio. It's not going to stop there,
> > will it? The duplication of functionality with vdpa will continue :(
> 
> VFIO live migration is expected to come as well once OASIS completes
> its work.

Exactly. Is there doubt vdpa will want to support live migration?
Put this code in a library please.

> Parav, are there other things?
> 
> Jason
Jason Gunthorpe Sept. 27, 2023, 11:16 p.m. UTC | #10
On Wed, Sep 27, 2023 at 05:30:04PM -0400, Michael S. Tsirkin wrote:
> On Wed, Sep 27, 2023 at 10:18:17AM -0300, Jason Gunthorpe wrote:
> > On Tue, Sep 26, 2023 at 07:41:44AM -0400, Michael S. Tsirkin wrote:
> > 
> > > > By the way, this follows what was done already between vfio/mlx5 to
> > > > mlx5_core modules where mlx5_core exposes generic APIs to execute a command
> > > > and to get the a PF from a given mlx5 VF.
> > > 
> > > This is up to mlx5 maintainers. In particular they only need to worry
> > > that their patches work with specific hardware which they likely have.
> > > virtio has to work with multiple vendors - hardware and software -
> > > and exposing a low level API that I can't test on my laptop
> > > is not at all my ideal.
> > 
> > mlx5 has a reasonable API from the lower level that allows the vfio
> > driver to safely issue commands. The API provides all the safety and
> > locking you have been questioning here.
> > 
> > Then the vfio driver can form the commands directly and in the way it
> > needs. This avoids spewing code into the core modules that is only
> > used by vfio - which has been a key design consideration for our
> > driver layering.
> > 
> > I suggest following the same design here as it has been well proven.
> > Provide a solid API to operate the admin queue and let VFIO use
> > it. One of the main purposes of the admin queue is to deliver commands
> > on behalf of the VF driver, so this is a logical and reasonable place
> > to put an API.
> 
> Not the way virtio is designed now. I guess mlx5 is designed in
> a way that makes it safe.

If you can't reliably issue commands from the VF at all, it doesn't
really matter where you put the code. Once that is established, then
an admin command execution interface is a nice cut point for
modularity.

The locking in mlx5 to make this safe is not too complex; if Feng
missed some items for virtio then he can work to fix it up.

> > VFIO live migration is expected to come as well once OASIS completes
> > its work.
> 
> Exactly. Is there doubt vdpa will want to support live migration?
> Put this code in a library please.

I have a doubt: you both said vdpa already does live migration, so
what will it even do with a live migration interface to a PCI
function?

It already has to use full mediation to operate a physical virtio
function, so it seems like it shouldn't need the migration interface?

Regardless, it is better kernel development hygiene to put the code
where it is used and wait for a second user to consolidate it than to
guess.

Jason
Michael S. Tsirkin Sept. 28, 2023, 5:26 a.m. UTC | #11
On Wed, Sep 27, 2023 at 08:16:00PM -0300, Jason Gunthorpe wrote:
> On Wed, Sep 27, 2023 at 05:30:04PM -0400, Michael S. Tsirkin wrote:
> > On Wed, Sep 27, 2023 at 10:18:17AM -0300, Jason Gunthorpe wrote:
> > > On Tue, Sep 26, 2023 at 07:41:44AM -0400, Michael S. Tsirkin wrote:
> > > 
> > > > > By the way, this follows what was done already between vfio/mlx5 to
> > > > > mlx5_core modules where mlx5_core exposes generic APIs to execute a command
> > > > > and to get the a PF from a given mlx5 VF.
> > > > 
> > > > This is up to mlx5 maintainers. In particular they only need to worry
> > > > that their patches work with specific hardware which they likely have.
> > > > virtio has to work with multiple vendors - hardware and software -
> > > > and exposing a low level API that I can't test on my laptop
> > > > is not at all my ideal.
> > > 
> > > mlx5 has a reasonable API from the lower level that allows the vfio
> > > driver to safely issue commands. The API provides all the safety and
> > > locking you have been questioning here.
> > > 
> > > Then the vfio driver can form the commands directly and in the way it
> > > needs. This avoids spewing code into the core modules that is only
> > > used by vfio - which has been a key design consideration for our
> > > driver layering.
> > > 
> > > I suggest following the same design here as it has been well proven.
> > > Provide a solid API to operate the admin queue and let VFIO use
> > > it. One of the main purposes of the admin queue is to deliver commands
> > > on behalf of the VF driver, so this is a logical and reasonable place
> > > to put an API.
> > 
> > Not the way virtio is designed now. I guess mlx5 is designed in
> > a way that makes it safe.
> 
> If you can't reliably issue commmands from the VF at all it doesn't
> really matter where you put the code. Once that is established up then
> an admin command execution interface is a nice cut point for
> modularity.
> 
> The locking in mlx5 to make this safe is not too complex, if Feng
> missed some items for virtio then he can work to fix it up.

The above two paragraphs don't make sense to me at all. The VF issues
no commands and there's no locking.

> > > VFIO live migration is expected to come as well once OASIS completes
> > > its work.
> > 
> > Exactly. Is there doubt vdpa will want to support live migration?
> > Put this code in a library please.
> 
> I have a doubt, you both said vdpa already does live migration, so
> what will it even do with a live migration interface to a PCI
> function?

This is not the thread to explain how vdpa live migration works now and
why it needs new interfaces, sorry. Suffice it to say that right now on the
virtio TC Parav from nvidia is arguing for vdpa to use admin commands for
migration.

> It already has to use full mediation to operate a physical virtio
> function, so it seems like it shouldn't need the migration interface?
> 
> Regardless, it is better kernel development hygiene to put the code
> where it is used and wait for a second user to consolidate it than to
> guess.
> 
> Jason

Sorry, no time right now to argue philosophy. I gave some hints on how to
make the virtio changes behave in a way that I'm ok with maintaining.
Hope they help.
Christoph Hellwig Oct. 2, 2023, 6:28 a.m. UTC | #12
On Tue, Sep 26, 2023 at 07:41:44AM -0400, Michael S. Tsirkin wrote:
> 
> Except, there's no reasonable way for virtio to know what is done with
> the device then. You are not using just 2 symbols at all, instead you
> are using the rich vq API which was explicitly designed for the driver
> running the device being responsible for serializing accesses. Which is
> actually loaded and running. And I *think* your use won't conflict ATM
> mostly by luck. Witness the hack in patch 01 as exhibit 1 - nothing
> at all even hints at the fact that the reason for the complicated
> dance is because another driver pokes at some of the vqs.

Fully agreed.  The smart NIC vendors are trying to make the same mess
in nvme, and we really need to stop them and agree on proper standardized
live migration features implemented in the core virtio/nvme code.
Jason Gunthorpe Oct. 2, 2023, 3:13 p.m. UTC | #13
On Sun, Oct 01, 2023 at 11:28:26PM -0700, Christoph Hellwig wrote:
> On Tue, Sep 26, 2023 at 07:41:44AM -0400, Michael S. Tsirkin wrote:
> > 
> > Except, there's no reasonable way for virtio to know what is done with
> > the device then. You are not using just 2 symbols at all, instead you
> > are using the rich vq API which was explicitly designed for the driver
> > running the device being responsible for serializing accesses. Which is
> > actually loaded and running. And I *think* your use won't conflict ATM
> > mostly by luck. Witness the hack in patch 01 as exhibit 1 - nothing
> > at all even hints at the fact that the reason for the complicated
> > dance is because another driver pokes at some of the vqs.
> 
> Fully agreed.  The smart nic vendors are trying to do the same mess
> in nvme, and we really need to stop them and agree on proper standarized
> live migration features implemented in the core virtio/nvme code.

??? This patch series is an implementation of changes that OASIS
approved.

The live migration work is going to OASIS first; no patches have been
presented.

This thread is arguing about how to split up the code for the
implementation of the standard, given that VFIO owns the VF and the
virtio core owns the PF. The standard defined that PF admin queue
operations are needed to perform operations on behalf of the VF.

Jason
Christoph Hellwig Oct. 5, 2023, 8:49 a.m. UTC | #14
On Mon, Oct 02, 2023 at 12:13:20PM -0300, Jason Gunthorpe wrote:
> ??? This patch series is an implementation of changes that OASIS
> approved.

I think you are fundamentally missing my point.  This is not about
who publishes a spec, but how we structure Linux code.

And the problem is that we treat vfio as a separate thing, and not as an
integral part of the driver.  vfio being separate totally makes sense
for the original purpose of vfio, that is, a no-op passthrough of
a device to userspace.

But for all the augmented vfio use cases it doesn't, for them the
augmented vfio functionality is an integral part of the core driver.
That is true for nvme, virtio and I'd argue mlx5 as well.

So we need to stop registering separate pci_drivers for this kind
of functionality, and instead have an interface to the driver to
switch to certain functionalities.

E.g. for this case there should be no new vfio-virtio device, but
instead you should be able to switch the virtio device to a
fake-legacy vfio mode.

That is assuming the whole thing actually makes sense; the use case seems
a bit fishy to start with, but I'll leave that argument to the virtio
maintainers.

Similarly for nvme.  We'll never accept a separate nvme live-migration
vfio driver.  This functionality needs to be part of the nvme driver,
probed there and fully controlled there.
Jason Gunthorpe Oct. 5, 2023, 11:10 a.m. UTC | #15
On Thu, Oct 05, 2023 at 01:49:54AM -0700, Christoph Hellwig wrote:

> But for all the augmented vfio use cases it doesn't, for them the
> augmented vfio functionality is an integral part of the core driver.
> That is true for nvme, virtio and I'd argue mlx5 as well.

I don't agree with this. I see the extra functionality as being an
integral part of the VF and VFIO. The PF driver is only providing a
proxied communication channel.

It is a limitation of PCI that the PF must act as a proxy.

> So we need to stop registering separate pci_drivers for this kind
> of functionality, and instead have an interface to the driver to
> switch to certain functionalities.

?? We must bind something to the VF's pci_driver; what do you imagine
that is?

> E.g. for this case there should be no new vfio-virtio device, but
> instead you should be able to switch the virtio device to an
> fake-legacy vfio mode.

Are you arguing about how we reach vfio_register_XX() and what
directory the file lives in?

I don't know what "fake-legacy" even means; VFIO is not legacy.

There is a lot of code on the VFIO and VMM side to take a VF and turn
it into a vPCI function. You can't just trivially duplicate VFIO in a
dozen drivers without creating a giant mess.

Further, userspace wants consistent ways to operate this stuff. If we
need a dozen ways to activate VFIO for every kind of driver, that is
not a positive direction.

Basically, I don't know what you are suggesting here. We talked about
this before, and my position is still the same. Continuing to have
/dev/vfio/XX be the kernel uAPI for the VMM to work with non-mediated
vPCI functions with live migration is the technically correct thing to
do.

Why wouldn't it be?

> Similarly for nvme.  We'll never accept a separate nvme-live migration
> vfio driver.  This functionality needs to be part of the nvme driver,
> probed there and fully controlled there.

We can debate where to put the files when the standard is done, but
at the end of the day it needs to create /dev/vfio/XXX.

Jason
Christoph Hellwig Oct. 6, 2023, 1:09 p.m. UTC | #16
On Thu, Oct 05, 2023 at 08:10:04AM -0300, Jason Gunthorpe wrote:
> > But for all the augmented vfio use cases it doesn't, for them the
> > augmented vfio functionality is an integral part of the core driver.
> > That is true for nvme, virtio and I'd argue mlx5 as well.
> 
> I don't agree with this. I see the extra functionality as being an
> integral part of the VF and VFIO. The PF driver is only providing a
> proxied communication channel.
> 
> It is a limitation of PCI that the PF must act as a proxy.

For anything related to live migration it very fundamentally is not, as a
function that is visible to a guest by definition can't drive the migration
itself.  That isn't really a limitation in PCI, but follows from the
fact that something else must control a live migration that is
transparent to the guest.

> 
> > So we need to stop registering separate pci_drivers for this kind
> > of functionality, and instead have an interface to the driver to
> > switch to certain functionalities.
> 
> ?? We must bind something to the VF's pci_driver, what do you imagine
> that is?

The driver that knows this hardware.  In this case the virtio subsystem,
in case of nvme the nvme driver, and in case of mlx5 the mlx5 driver.

> > E.g. for this case there should be no new vfio-virtio device, but
> > instead you should be able to switch the virtio device to an
> > fake-legacy vfio mode.
> 
> Are you aruging about how we reach to vfio_register_XX() and what
> directory the file lives?

No.  That layout logically follows from what codebase the functionality
is part of, though.

> I don't know what "fake-legacy" even means, VFIO is not legacy.

The driver we're talking about in this thread fakes up a virtio_pci
legacy devie to the guest on top of a "modern" virtio_pci device.

> There is alot of code in VFIO and the VMM side to take a VF and turn
> it into a vPCI function. You can't just trivially duplicate VFIO in a
> dozen drivers without creating a giant mess.

I do not advocate for duplicating it.  But the code that calls this
functionality belongs into the driver that deals with the compound
device that we're doing this work for.

> Further, userspace wants consistent ways to operate this stuff. If we
> need a dozen ways to activate VFIO for every kind of driver that is
> not a positive direction.

We don't need a dozen ways.  We just need a single attribute on the
pci (or $OTHERBUS) device that switches it to vfio mode.
Jason Gunthorpe Oct. 10, 2023, 1:10 p.m. UTC | #17
On Fri, Oct 06, 2023 at 06:09:09AM -0700, Christoph Hellwig wrote:
> On Thu, Oct 05, 2023 at 08:10:04AM -0300, Jason Gunthorpe wrote:
> > > But for all the augmented vfio use cases it doesn't, for them the
> > > augmented vfio functionality is an integral part of the core driver.
> > > That is true for nvme, virtio and I'd argue mlx5 as well.
> > 
> > I don't agree with this. I see the extra functionality as being an
> > integral part of the VF and VFIO. The PF driver is only providing a
> > proxied communication channel.
> > 
> > It is a limitation of PCI that the PF must act as a proxy.
> 
> For anything live migration it very fundamentally is not, as a function
> that is visible to a guest by definition can't drive the migration
> itself.  That isn't really a limitation in PCI, but follows form the
> fact that something else must control a live migration that is
> transparent to the guest.

We've talked around ideas like allowing the VF config space to do some
of the work. For simple devices we could get away with 1 VF config
space register. (VF config space is owned by the hypervisor, not the
guest)

Devices that need DMA as part of their migration could be imagined to
co-opt a VF PASID or something, e.g. using ENQCMD.

SIOVr2 is discussing a more flexible RID mapping - there is a possible
route where a "VF" could actually have two RIDs, a hypervisor RID and a
guest RID.

It really is PCI limitations that force this design of making a PF
driver do dual duty as a fully functional normal device and act as a
communication channel proxy to make a back channel into an SRIOV VF.

My view has always been that the VFIO live migration operations are
executed logically within the VF as they only affect the VF.

So we have a logical design separation where the VFIO world owns the
commands and the PF driver supplies the communication channel. This
works well for devices that already have a robust RPC interface to
their device FW.

> > ?? We must bind something to the VF's pci_driver, what do you imagine
> > that is?
> 
> The driver that knows this hardware.  In this case the virtio subsystem,
> in case of nvme the nvme driver, and in case of mlx5 the mlx5 driver.

But those are drivers operating the HW to create kernel devices. Here
we need a VFIO device. They can't co-exist; if you switch mlx5 from
normal to vfio you have to tear down the entire normal driver.

> > > E.g. for this case there should be no new vfio-virtio device, but
> > > instead you should be able to switch the virtio device to an
> > > fake-legacy vfio mode.
> > 
> > Are you aruging about how we reach to vfio_register_XX() and what
> > directory the file lives?
> 
> No.  That layout logically follows from what codebase the functionality
> is part of, though.

I don't understand what we are talking about really. Where do you
imagine the vfio_register_XX() goes?

> > I don't know what "fake-legacy" even means, VFIO is not legacy.
> 
> The driver we're talking about in this thread fakes up a virtio_pci
> legacy devie to the guest on top of a "modern" virtio_pci device.

I'm not sure I'd use the word fake; inb/outb are always trapped
operations in VMs. If the device provided a real IO BAR then VFIO
common code would trap and relay inb/outb to the device.

All this is doing is changing the inb/outb relay from using a physical
IO BAR to a DMA command ring.

The motivation is simply because normal IO BAR space is incredibly
limited and you can't get enough SRIOV functions when using it.
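
A minimal sketch of that relay, reusing the virtiovf_cmd_lr_write() helper
declared earlier in this thread; the wrapper name and how the legacy-write
opcode is chosen are assumptions for illustration:

/*
 * Sketch only: a trapped single-byte legacy register write is turned into
 * one legacy-write admin command on the PF via virtiovf_cmd_lr_write().
 */
static int
virtiovf_sketch_relay_write(struct virtiovf_pci_core_device *virtvdev,
			    u16 legacy_write_opcode, u8 offset, u8 val)
{
	return virtiovf_cmd_lr_write(virtvdev, legacy_write_opcode,
				     offset, sizeof(val), &val);
}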

> > There is alot of code in VFIO and the VMM side to take a VF and turn
> > it into a vPCI function. You can't just trivially duplicate VFIO in a
> > dozen drivers without creating a giant mess.
> 
> I do not advocate for duplicating it.  But the code that calls this
> functionality belongs into the driver that deals with the compound
> device that we're doing this work for.

On one hand, I don't really care - we can put the code where people
like.

However - the Intel GPU VFIO driver is such a bad experience that I don't
want to encourage people to make VFIO drivers, or code that is only
used by VFIO drivers, that are not under drivers/vfio review.

> > Further, userspace wants consistent ways to operate this stuff. If we
> > need a dozen ways to activate VFIO for every kind of driver that is
> > not a positive direction.
> 
> We don't need a dozen ways.  We just need a single attribute on the
> pci (or $OTHERBUS) devide that switches it to vfio mode.

Well, we sort of do that these days; it is just a convoluted bind thing.

Realistically switching modes requires unprobing the entire normal VF
driver. Having this be linked to the driver core probe/unprobe flows
is a good code reuse thing, IMHO.

We already spent a lot of effort making this quite general from the
userspace perspective. Nobody has yet come up with an idea to avoid the
ugly unbind/bind flow.

Be aware, there is a significant performance concern here. If you want
to create 1000 VFIO devices (this is a real thing), we *can't* probe a
normal driver first; it is too slow. We need a path that goes directly
from creating the RIDs to turning those RIDs into VFIO.

mlx5 takes *seconds* to complete its normal probe. We must avoid this.

Looking a few years into the future, with SIOVr1/2, the flow I want to
target is some uAPI commands:
  'create a PCI RID with params XYZ and attach a normal/VFIO/etc driver'
  'destroy a PCI RID'

We need to get away from this scheme of SRIOV where you bulk create a
bunch of empty VFs at one time and then have to somehow provision
them.

Jason
Michael S. Tsirkin Oct. 10, 2023, 1:56 p.m. UTC | #18
On Tue, Oct 10, 2023 at 10:10:31AM -0300, Jason Gunthorpe wrote:
> > > There is alot of code in VFIO and the VMM side to take a VF and turn
> > > it into a vPCI function. You can't just trivially duplicate VFIO in a
> > > dozen drivers without creating a giant mess.
> > 
> > I do not advocate for duplicating it.  But the code that calls this
> > functionality belongs into the driver that deals with the compound
> > device that we're doing this work for.
> 
> On one hand, I don't really care - we can put the code where people
> like.
> 
> However - the Intel GPU VFIO driver is such a bad experiance I don't
> want to encourage people to make VFIO drivers, or code that is only
> used by VFIO drivers, that are not under drivers/vfio review.

So if Alex feels it makes sense to add some virtio functionality
to vfio and is happy to maintain or let you maintain the UAPI,
then why would I say no? But we never expected devices to have
two drivers like this does, so just exposing a device pointer
and saying "use regular virtio APIs for the rest" does not
cut it; the new APIs have to make sense
so virtio drivers can develop normally without fear of stepping
on the toes of this admin driver.
Jason Gunthorpe Oct. 10, 2023, 2:08 p.m. UTC | #19
On Tue, Oct 10, 2023 at 09:56:00AM -0400, Michael S. Tsirkin wrote:

> > However - the Intel GPU VFIO driver is such a bad experiance I don't
> > want to encourage people to make VFIO drivers, or code that is only
> > used by VFIO drivers, that are not under drivers/vfio review.
> 
> So if Alex feels it makes sense to add some virtio functionality
> to vfio and is happy to maintain or let you maintain the UAPI
> then why would I say no? But we never expected devices to have
> two drivers like this does, so just exposing device pointer
> and saying "use regular virtio APIs for the rest" does not
> cut it, the new APIs have to make sense
> so virtio drivers can develop normally without fear of stepping
> on the toes of this admin driver.

Please work with Yishai to get something that makes sense to you. He
can post a v2 addressing the comments accumulated so far and then
go over what the API between the drivers is.

Thanks,
Jason
Michael S. Tsirkin Oct. 10, 2023, 2:54 p.m. UTC | #20
On Tue, Oct 10, 2023 at 11:08:49AM -0300, Jason Gunthorpe wrote:
> On Tue, Oct 10, 2023 at 09:56:00AM -0400, Michael S. Tsirkin wrote:
> 
> > > However - the Intel GPU VFIO driver is such a bad experiance I don't
> > > want to encourage people to make VFIO drivers, or code that is only
> > > used by VFIO drivers, that are not under drivers/vfio review.
> > 
> > So if Alex feels it makes sense to add some virtio functionality
> > to vfio and is happy to maintain or let you maintain the UAPI
> > then why would I say no? But we never expected devices to have
> > two drivers like this does, so just exposing device pointer
> > and saying "use regular virtio APIs for the rest" does not
> > cut it, the new APIs have to make sense
> > so virtio drivers can develop normally without fear of stepping
> > on the toes of this admin driver.
> 
> Please work with Yishai to get something that make sense to you. He
> can post a v2 with the accumulated comments addressed so far and then
> go over what the API between the drivers is.
> 
> Thanks,
> Jason

/me shrugs. I pretty much posted suggestions already. Should not be hard.
Anything unclear - post on list.
Yishai Hadas Oct. 10, 2023, 3:09 p.m. UTC | #21
On 10/10/2023 17:54, Michael S. Tsirkin wrote:
> On Tue, Oct 10, 2023 at 11:08:49AM -0300, Jason Gunthorpe wrote:
>> On Tue, Oct 10, 2023 at 09:56:00AM -0400, Michael S. Tsirkin wrote:
>>
>>>> However - the Intel GPU VFIO driver is such a bad experiance I don't
>>>> want to encourage people to make VFIO drivers, or code that is only
>>>> used by VFIO drivers, that are not under drivers/vfio review.
>>> So if Alex feels it makes sense to add some virtio functionality
>>> to vfio and is happy to maintain or let you maintain the UAPI
>>> then why would I say no? But we never expected devices to have
>>> two drivers like this does, so just exposing device pointer
>>> and saying "use regular virtio APIs for the rest" does not
>>> cut it, the new APIs have to make sense
>>> so virtio drivers can develop normally without fear of stepping
>>> on the toes of this admin driver.
>> Please work with Yishai to get something that make sense to you. He
>> can post a v2 with the accumulated comments addressed so far and then
>> go over what the API between the drivers is.
>>
>> Thanks,
>> Jason
> /me shrugs. I pretty much posted suggestions already. Should not be hard.
> Anything unclear - post on list.
>
Yes, this is the plan.

We are working to address the comments that we got so far in both VFIO &
VIRTIO, to retest, and to send the next version.

Re the API between the modules, it looks like we have the below
alternatives.

1) Proceed with the current approach, where we exposed a generic API to
execute any admin command, but make it much more solid inside VIRTIO.
2) Expose extra APIs from VIRTIO for commands where we can anticipate
future client usage, such as LIST_QUERY/LIST_USE, but still keep the
generic execute-admin-command API for the others.
3) Expose an API per command from VIRTIO and fully drop the generic
execute-admin-command API.

A few notes:
Option #1 looks the most generic; it drops the need to expose
multiple symbols / APIs per command, and for now we have a single client
for them (i.e. VFIO).
Options #2 & #3 may still require exposing the
virtio_pci_vf_get_pf_dev() API to let VFIO get the VIRTIO PF (struct
virtio_device *) from its PCI device; each command will get it as its
first argument.

Michael,
What do you suggest here?

Thanks,
Yishai
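
To make the three shapes concrete, the module boundary could be sketched
roughly as below; apart from virtio_admin_cmd_exec(), which the quoted patch
already calls, all names here are placeholders, not proposed symbols:

/* Option #1: one generic export; VFIO builds struct virtio_admin_cmd itself. */
int virtio_admin_cmd_exec(struct virtio_device *pf_vdev,
			  struct virtio_admin_cmd *cmd);

/*
 * Option #2: dedicated exports for commands with likely future users
 * (e.g. list query/use), plus the generic exec above for everything else.
 */
int virtio_admin_cmd_list_use(struct virtio_device *pf_vdev,
			      u8 *buf, int buf_size);

/*
 * Option #3: one export per command; the generic exec never crosses the
 * module boundary, so VFIO never touches struct virtio_admin_cmd at all.
 */
int virtio_admin_legacy_reg_write(struct virtio_device *pf_vdev,
				  u64 group_member_id,
				  u8 offset, u8 size, u8 *buf);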
Michael S. Tsirkin Oct. 10, 2023, 3:14 p.m. UTC | #22
On Tue, Oct 10, 2023 at 06:09:44PM +0300, Yishai Hadas wrote:
> On 10/10/2023 17:54, Michael S. Tsirkin wrote:
> > On Tue, Oct 10, 2023 at 11:08:49AM -0300, Jason Gunthorpe wrote:
> > > On Tue, Oct 10, 2023 at 09:56:00AM -0400, Michael S. Tsirkin wrote:
> > > 
> > > > > However - the Intel GPU VFIO driver is such a bad experiance I don't
> > > > > want to encourage people to make VFIO drivers, or code that is only
> > > > > used by VFIO drivers, that are not under drivers/vfio review.
> > > > So if Alex feels it makes sense to add some virtio functionality
> > > > to vfio and is happy to maintain or let you maintain the UAPI
> > > > then why would I say no? But we never expected devices to have
> > > > two drivers like this does, so just exposing device pointer
> > > > and saying "use regular virtio APIs for the rest" does not
> > > > cut it, the new APIs have to make sense
> > > > so virtio drivers can develop normally without fear of stepping
> > > > on the toes of this admin driver.
> > > Please work with Yishai to get something that make sense to you. He
> > > can post a v2 with the accumulated comments addressed so far and then
> > > go over what the API between the drivers is.
> > > 
> > > Thanks,
> > > Jason
> > /me shrugs. I pretty much posted suggestions already. Should not be hard.
> > Anything unclear - post on list.
> > 
> Yes, this is the plan.
> 
> We are working to address the comments that we got so far in both VFIO &
> VIRTIO, retest and send the next version.
> 
> Re the API between the modules, It looks like we have the below
> alternatives.
> 
> 1) Proceed with current approach where we exposed a generic API to execute
> any admin command, however, make it much more solid inside VIRTIO.
> 2) Expose extra APIs from VIRTIO for commands that we can consider future
> client usage of them as of LIST_QUERY/LIST_USE, however still have the
> generic execute admin command for others.
> 3) Expose API per command from VIRTIO and fully drop the generic execute
> admin command.
> 
> Few notes:
> Option #1 looks the most generic one, it drops the need to expose multiple
> symbols / APIs per command and for now we have a single client for them
> (i.e. VFIO).
> Options #2 & #3, may still require to expose the virtio_pci_vf_get_pf_dev()
> API to let VFIO get the VIRTIO PF (struct virtio_device *) from its PCI
> device, each command will get it as its first argument.
> 
> Michael,
> What do you suggest here ?
> 
> Thanks,
> Yishai

I suggest 3, but call it on the VF. Commands will switch to the PF
internally as needed. For example, Intel might be interested in exposing
admin commands through a memory BAR of the VF itself.
Yishai Hadas Oct. 10, 2023, 3:43 p.m. UTC | #23
On 10/10/2023 18:14, Michael S. Tsirkin wrote:
> On Tue, Oct 10, 2023 at 06:09:44PM +0300, Yishai Hadas wrote:
>> On 10/10/2023 17:54, Michael S. Tsirkin wrote:
>>> On Tue, Oct 10, 2023 at 11:08:49AM -0300, Jason Gunthorpe wrote:
>>>> On Tue, Oct 10, 2023 at 09:56:00AM -0400, Michael S. Tsirkin wrote:
>>>>
>>>>>> However - the Intel GPU VFIO driver is such a bad experiance I don't
>>>>>> want to encourage people to make VFIO drivers, or code that is only
>>>>>> used by VFIO drivers, that are not under drivers/vfio review.
>>>>> So if Alex feels it makes sense to add some virtio functionality
>>>>> to vfio and is happy to maintain or let you maintain the UAPI
>>>>> then why would I say no? But we never expected devices to have
>>>>> two drivers like this does, so just exposing device pointer
>>>>> and saying "use regular virtio APIs for the rest" does not
>>>>> cut it, the new APIs have to make sense
>>>>> so virtio drivers can develop normally without fear of stepping
>>>>> on the toes of this admin driver.
>>>> Please work with Yishai to get something that make sense to you. He
>>>> can post a v2 with the accumulated comments addressed so far and then
>>>> go over what the API between the drivers is.
>>>>
>>>> Thanks,
>>>> Jason
>>> /me shrugs. I pretty much posted suggestions already. Should not be hard.
>>> Anything unclear - post on list.
>>>
>> Yes, this is the plan.
>>
>> We are working to address the comments that we got so far in both VFIO &
>> VIRTIO, retest and send the next version.
>>
>> Re the API between the modules, It looks like we have the below
>> alternatives.
>>
>> 1) Proceed with current approach where we exposed a generic API to execute
>> any admin command, however, make it much more solid inside VIRTIO.
>> 2) Expose extra APIs from VIRTIO for commands that we can consider future
>> client usage of them as of LIST_QUERY/LIST_USE, however still have the
>> generic execute admin command for others.
>> 3) Expose API per command from VIRTIO and fully drop the generic execute
>> admin command.
>>
>> Few notes:
>> Option #1 looks the most generic one, it drops the need to expose multiple
>> symbols / APIs per command and for now we have a single client for them
>> (i.e. VFIO).
>> Options #2 & #3, may still require to expose the virtio_pci_vf_get_pf_dev()
>> API to let VFIO get the VIRTIO PF (struct virtio_device *) from its PCI
>> device, each command will get it as its first argument.
>>
>> Michael,
>> What do you suggest here ?
>>
>> Thanks,
>> Yishai
> I suggest 3 but call it on the VF. commands will switch to PF
> internally as needed. For example, intel might be interested in exposing
> admin commands through a memory BAR of VF itself.
>
The driver that owns the VF is VFIO, not a VIRTIO one.

The VIRTIO PF can only be obtained from the PCI device (i.e. struct
pci_dev).

In addition,
virtio_pci_vf_get_pf_dev() was implemented for now in virtio-pci, as it
works on a pci_dev.
Assuming that we'll put each command inside virtio as the generic layer,
we won't be able to call/use this API internally to get the PF because of
cyclic dependencies between the modules; the link will fail.

So in option #3 we may still need to get the VIRTIO PF outside, in VFIO,
and pass it as a pointer to VIRTIO with each command.

Does it work for you?

Yishai
Parav Pandit Oct. 10, 2023, 3:58 p.m. UTC | #24
> From: Yishai Hadas <yishaih@nvidia.com>
> Sent: Tuesday, October 10, 2023 9:14 PM
> 
> On 10/10/2023 18:14, Michael S. Tsirkin wrote:
> > On Tue, Oct 10, 2023 at 06:09:44PM +0300, Yishai Hadas wrote:
> >> On 10/10/2023 17:54, Michael S. Tsirkin wrote:
> >>> On Tue, Oct 10, 2023 at 11:08:49AM -0300, Jason Gunthorpe wrote:
> >>>> On Tue, Oct 10, 2023 at 09:56:00AM -0400, Michael S. Tsirkin wrote:
> >>>>
> >>>>>> However - the Intel GPU VFIO driver is such a bad experiance I
> >>>>>> don't want to encourage people to make VFIO drivers, or code that
> >>>>>> is only used by VFIO drivers, that are not under drivers/vfio review.
> >>>>> So if Alex feels it makes sense to add some virtio functionality
> >>>>> to vfio and is happy to maintain or let you maintain the UAPI then
> >>>>> why would I say no? But we never expected devices to have two
> >>>>> drivers like this does, so just exposing device pointer and saying
> >>>>> "use regular virtio APIs for the rest" does not cut it, the new
> >>>>> APIs have to make sense so virtio drivers can develop normally
> >>>>> without fear of stepping on the toes of this admin driver.
> >>>> Please work with Yishai to get something that make sense to you. He
> >>>> can post a v2 with the accumulated comments addressed so far and
> >>>> then go over what the API between the drivers is.
> >>>>
> >>>> Thanks,
> >>>> Jason
> >>> /me shrugs. I pretty much posted suggestions already. Should not be hard.
> >>> Anything unclear - post on list.
> >>>
> >> Yes, this is the plan.
> >>
> >> We are working to address the comments that we got so far in both
> >> VFIO & VIRTIO, retest and send the next version.
> >>
> >> Re the API between the modules, It looks like we have the below
> >> alternatives.
> >>
> >> 1) Proceed with current approach where we exposed a generic API to
> >> execute any admin command, however, make it much more solid inside
> VIRTIO.
> >> 2) Expose extra APIs from VIRTIO for commands that we can consider
> >> future client usage of them as of LIST_QUERY/LIST_USE, however still
> >> have the generic execute admin command for others.
> >> 3) Expose API per command from VIRTIO and fully drop the generic
> >> execute admin command.
> >>
> >> Few notes:
> >> Option #1 looks the most generic one, it drops the need to expose
> >> multiple symbols / APIs per command and for now we have a single
> >> client for them (i.e. VFIO).
> >> Options #2 & #3, may still require to expose the
> >> virtio_pci_vf_get_pf_dev() API to let VFIO get the VIRTIO PF (struct
> >> virtio_device *) from its PCI device, each command will get it as its first
> argument.
> >>
> >> Michael,
> >> What do you suggest here ?
> >>
> >> Thanks,
> >> Yishai
> > I suggest 3 but call it on the VF. commands will switch to PF
> > internally as needed. For example, intel might be interested in
> > exposing admin commands through a memory BAR of VF itself.
> >
> The driver who owns the VF is VFIO, it's not a VIRTIO one.
> 
> The ability to get the VIRTIO PF is from the PCI device (i.e. struct pci_dev).
> 
> In addition,
> virtio_pci_vf_get_pf_dev() was implemented for now in virtio-pci as it worked
> on pci_dev.
> Assuming that we'll put each command inside virtio as the generic layer, we
> won't be able to call/use this API internally to get the PF as of cyclic
> dependencies between the modules, link will fail.
> 
> So in option #3 we may still need to get outside into VFIO the VIRTIO PF and
> give it as pointer to VIRTIO upon each command.
>
I think
for #3 the virtio-level API signature should be:

virtio_admin_legacy_xyz_cmd(struct virtio_device *dev, u64 group_member_id, ....);

This maintains the right abstraction between vfio, the generic virtio layer and the virtio transport (pci) layer.
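
From the VFIO side such a call could look roughly like the sketch below; the
xyz command name follows the placeholder above, the remaining per-command
arguments are elided, and the wrapper itself is an assumption:

/*
 * Sketch of the proposed layering from the VFIO side: VFIO resolves the PF
 * virtio_device and the group member id, and the per-command virtio export
 * handles the transport details.
 */
static int virtiovf_sketch_call(struct virtiovf_pci_core_device *virtvdev)
{
	struct virtio_device *pf_vdev =
		virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev);

	if (!pf_vdev)
		return -ENOTCONN;

	/* Group member ids are 1-based relative to the VF index. */
	return virtio_admin_legacy_xyz_cmd(pf_vdev, virtvdev->vf_id + 1
					   /* , per-command args ... */);
}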
Michael S. Tsirkin Oct. 10, 2023, 3:58 p.m. UTC | #25
On Tue, Oct 10, 2023 at 06:43:32PM +0300, Yishai Hadas wrote:
> On 10/10/2023 18:14, Michael S. Tsirkin wrote:
> > On Tue, Oct 10, 2023 at 06:09:44PM +0300, Yishai Hadas wrote:
> > > On 10/10/2023 17:54, Michael S. Tsirkin wrote:
> > > > On Tue, Oct 10, 2023 at 11:08:49AM -0300, Jason Gunthorpe wrote:
> > > > > On Tue, Oct 10, 2023 at 09:56:00AM -0400, Michael S. Tsirkin wrote:
> > > > > 
> > > > > > > However - the Intel GPU VFIO driver is such a bad experiance I don't
> > > > > > > want to encourage people to make VFIO drivers, or code that is only
> > > > > > > used by VFIO drivers, that are not under drivers/vfio review.
> > > > > > So if Alex feels it makes sense to add some virtio functionality
> > > > > > to vfio and is happy to maintain or let you maintain the UAPI
> > > > > > then why would I say no? But we never expected devices to have
> > > > > > two drivers like this does, so just exposing device pointer
> > > > > > and saying "use regular virtio APIs for the rest" does not
> > > > > > cut it, the new APIs have to make sense
> > > > > > so virtio drivers can develop normally without fear of stepping
> > > > > > on the toes of this admin driver.
> > > > > Please work with Yishai to get something that make sense to you. He
> > > > > can post a v2 with the accumulated comments addressed so far and then
> > > > > go over what the API between the drivers is.
> > > > > 
> > > > > Thanks,
> > > > > Jason
> > > > /me shrugs. I pretty much posted suggestions already. Should not be hard.
> > > > Anything unclear - post on list.
> > > > 
> > > Yes, this is the plan.
> > > 
> > > We are working to address the comments that we got so far in both VFIO &
> > > VIRTIO, retest and send the next version.
> > > 
> > > Re the API between the modules, It looks like we have the below
> > > alternatives.
> > > 
> > > 1) Proceed with current approach where we exposed a generic API to execute
> > > any admin command, however, make it much more solid inside VIRTIO.
> > > 2) Expose extra APIs from VIRTIO for commands that we can consider future
> > > client usage of them as of LIST_QUERY/LIST_USE, however still have the
> > > generic execute admin command for others.
> > > 3) Expose API per command from VIRTIO and fully drop the generic execute
> > > admin command.
> > > 
> > > Few notes:
> > > Option #1 looks the most generic one, it drops the need to expose multiple
> > > symbols / APIs per command and for now we have a single client for them
> > > (i.e. VFIO).
> > > Options #2 & #3, may still require to expose the virtio_pci_vf_get_pf_dev()
> > > API to let VFIO get the VIRTIO PF (struct virtio_device *) from its PCI
> > > device, each command will get it as its first argument.
> > > 
> > > Michael,
> > > What do you suggest here ?
> > > 
> > > Thanks,
> > > Yishai
> > I suggest 3 but call it on the VF. commands will switch to PF
> > internally as needed. For example, intel might be interested in exposing
> > admin commands through a memory BAR of VF itself.
> > 
> The driver who owns the VF is VFIO, it's not a VIRTIO one.
> 
> The ability to get the VIRTIO PF is from the PCI device (i.e. struct
> pci_dev).
> 
> In addition,
> virtio_pci_vf_get_pf_dev() was implemented for now in virtio-pci as it
> worked on pci_dev.

On the pci_dev of the VF, yes? So again, just move this into each command,
that's all. I.e. pass the pci_dev to each.

> Assuming that we'll put each command inside virtio as the generic layer, we
> won't be able to call/use this API internally to get the PF as of cyclic
> dependencies between the modules, link will fail.
> 
> So in option #3 we may still need to get outside into VFIO the VIRTIO PF and
> give it as pointer to VIRTIO upon each command.
> 
> Does it work for you ?
> 
> Yishai
Jason Gunthorpe Oct. 10, 2023, 3:59 p.m. UTC | #26
On Tue, Oct 10, 2023 at 11:14:56AM -0400, Michael S. Tsirkin wrote:

> I suggest 3 but call it on the VF. commands will switch to PF
> internally as needed. For example, intel might be interested in exposing
> admin commands through a memory BAR of VF itself.

FWIW, we have been pushing back on such things in VFIO, so it will
have to be very carefully justified from a security standpoint.

Probably, since that is not standard, it should just live under some
Intel-only vfio driver behavior, not in virtio land.

It is also costly to switch between the PF and VF; it should not be done
pointlessly on the fast path.

Jason
Michael S. Tsirkin Oct. 10, 2023, 4:03 p.m. UTC | #27
On Tue, Oct 10, 2023 at 12:59:37PM -0300, Jason Gunthorpe wrote:
> On Tue, Oct 10, 2023 at 11:14:56AM -0400, Michael S. Tsirkin wrote:
> 
> > I suggest 3 but call it on the VF. commands will switch to PF
> > internally as needed. For example, intel might be interested in exposing
> > admin commands through a memory BAR of VF itself.
> 
> FWIW, we have been pushing back on such things in VFIO, so it will
> have to be very carefully security justified.
> 
> Probably since that is not standard it should just live in under some
> intel-only vfio driver behavior, not in virtio land.
> 
> It is also costly to switch between pf/vf, it should not be done
> pointlessly on the fast path.
> 
> Jason

Currently, the switch seems to be just a cast of private data.
I am suggesting keeping that cast inside virtio. Why is that
expensive?
Jason Gunthorpe Oct. 10, 2023, 4:07 p.m. UTC | #28
On Tue, Oct 10, 2023 at 12:03:29PM -0400, Michael S. Tsirkin wrote:
> On Tue, Oct 10, 2023 at 12:59:37PM -0300, Jason Gunthorpe wrote:
> > On Tue, Oct 10, 2023 at 11:14:56AM -0400, Michael S. Tsirkin wrote:
> > 
> > > I suggest 3 but call it on the VF. commands will switch to PF
> > > internally as needed. For example, intel might be interested in exposing
> > > admin commands through a memory BAR of VF itself.
> > 
> > FWIW, we have been pushing back on such things in VFIO, so it will
> > have to be very carefully security justified.
> > 
> > Probably since that is not standard it should just live in under some
> > intel-only vfio driver behavior, not in virtio land.
> > 
> > It is also costly to switch between pf/vf, it should not be done
> > pointlessly on the fast path.
> > 
> > Jason
> 
> Currently, the switch seems to be just a cast of private data.
> I am suggesting keeping that cast inside virtio. Why is that
> expensive?

pci_iov_get_pf_drvdata() does a bunch of sanity checks and function
calls. It was not intended to be used on a fast path.

Jason
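
One way to keep that lookup off the per-I/O path is to resolve the PF once,
e.g. when the VFIO device is opened, and cache it; the containing structure
and the open hook below are assumptions, only virtio_pci_vf_get_pf_dev()
comes from the discussion above:

/*
 * Sketch: cache the PF at open time so the fast path never repeats the
 * iov sanity checks on every command.
 */
struct virtiovf_sketch_device {
	struct virtiovf_pci_core_device core;
	struct virtio_device *pf_vdev;		/* cached at open time */
};

static int virtiovf_sketch_open(struct virtiovf_sketch_device *sdev)
{
	sdev->pf_vdev = virtio_pci_vf_get_pf_dev(sdev->core.core_device.pdev);
	return sdev->pf_vdev ? 0 : -ENOTCONN;
}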
Yishai Hadas Oct. 10, 2023, 4:09 p.m. UTC | #29
On 10/10/2023 18:58, Michael S. Tsirkin wrote:
> On Tue, Oct 10, 2023 at 06:43:32PM +0300, Yishai Hadas wrote:
>> On 10/10/2023 18:14, Michael S. Tsirkin wrote:
>>> On Tue, Oct 10, 2023 at 06:09:44PM +0300, Yishai Hadas wrote:
>>>> On 10/10/2023 17:54, Michael S. Tsirkin wrote:
>>>>> On Tue, Oct 10, 2023 at 11:08:49AM -0300, Jason Gunthorpe wrote:
>>>>>> On Tue, Oct 10, 2023 at 09:56:00AM -0400, Michael S. Tsirkin wrote:
>>>>>>
>>>>>>>> However - the Intel GPU VFIO driver is such a bad experiance I don't
>>>>>>>> want to encourage people to make VFIO drivers, or code that is only
>>>>>>>> used by VFIO drivers, that are not under drivers/vfio review.
>>>>>>> So if Alex feels it makes sense to add some virtio functionality
>>>>>>> to vfio and is happy to maintain or let you maintain the UAPI
>>>>>>> then why would I say no? But we never expected devices to have
>>>>>>> two drivers like this does, so just exposing device pointer
>>>>>>> and saying "use regular virtio APIs for the rest" does not
>>>>>>> cut it, the new APIs have to make sense
>>>>>>> so virtio drivers can develop normally without fear of stepping
>>>>>>> on the toes of this admin driver.
>>>>>> Please work with Yishai to get something that make sense to you. He
>>>>>> can post a v2 with the accumulated comments addressed so far and then
>>>>>> go over what the API between the drivers is.
>>>>>>
>>>>>> Thanks,
>>>>>> Jason
>>>>> /me shrugs. I pretty much posted suggestions already. Should not be hard.
>>>>> Anything unclear - post on list.
>>>>>
>>>> Yes, this is the plan.
>>>>
>>>> We are working to address the comments that we got so far in both VFIO &
>>>> VIRTIO, retest and send the next version.
>>>>
>>>> Re the API between the modules, It looks like we have the below
>>>> alternatives.
>>>>
>>>> 1) Proceed with current approach where we exposed a generic API to execute
>>>> any admin command, however, make it much more solid inside VIRTIO.
>>>> 2) Expose extra APIs from VIRTIO for commands that we can consider future
>>>> client usage of them as of LIST_QUERY/LIST_USE, however still have the
>>>> generic execute admin command for others.
>>>> 3) Expose API per command from VIRTIO and fully drop the generic execute
>>>> admin command.
>>>>
>>>> Few notes:
>>>> Option #1 looks the most generic one, it drops the need to expose multiple
>>>> symbols / APIs per command and for now we have a single client for them
>>>> (i.e. VFIO).
>>>> Options #2 & #3, may still require to expose the virtio_pci_vf_get_pf_dev()
>>>> API to let VFIO get the VIRTIO PF (struct virtio_device *) from its PCI
>>>> device, each command will get it as its first argument.
>>>>
>>>> Michael,
>>>> What do you suggest here ?
>>>>
>>>> Thanks,
>>>> Yishai
>>> I suggest 3 but call it on the VF. commands will switch to PF
>>> internally as needed. For example, intel might be interested in exposing
>>> admin commands through a memory BAR of VF itself.
>>>
>> The driver who owns the VF is VFIO, it's not a VIRTIO one.
>>
>> The ability to get the VIRTIO PF is from the PCI device (i.e. struct
>> pci_dev).
>>
>> In addition,
>> virtio_pci_vf_get_pf_dev() was implemented for now in virtio-pci as it
>> worked on pci_dev.
> On pci_dev of vf, yes? So again just move this into each command,
> that's all. I.e. pass pci_dev to each.

How about the cyclic dependencies issue inside VIRTIO that I mentioned
below?

In my suggestion it's fine: VFIO will get the PF and give it to VIRTIO
per command.

Yishai

>> Assuming that we'll put each command inside virtio as the generic layer, we
>> won't be able to call/use this API internally to get the PF as of cyclic
>> dependencies between the modules, link will fail.
>>
>> So in option #3 we may still need to get outside into VFIO the VIRTIO PF and
>> give it as pointer to VIRTIO upon each command.
>>
>> Does it work for you ?
>>
>> Yishai
Parav Pandit Oct. 10, 2023, 4:21 p.m. UTC | #30
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Tuesday, October 10, 2023 9:37 PM
> 
> On Tue, Oct 10, 2023 at 12:03:29PM -0400, Michael S. Tsirkin wrote:
> > On Tue, Oct 10, 2023 at 12:59:37PM -0300, Jason Gunthorpe wrote:
> > > On Tue, Oct 10, 2023 at 11:14:56AM -0400, Michael S. Tsirkin wrote:
> > >
> > > > I suggest 3 but call it on the VF. commands will switch to PF
> > > > internally as needed. For example, intel might be interested in
> > > > exposing admin commands through a memory BAR of VF itself.

If in the future one does admin commands on the VF memory BAR, there is no need for the cast either.
The vfio-virtio-pci driver can do it on the PCI VF device directly.

(though per-VF memory registers would be an anti-scale design for real hw; to discuss in another forum).
Michael S. Tsirkin Oct. 10, 2023, 8:38 p.m. UTC | #31
On Tue, Oct 10, 2023 at 04:21:15PM +0000, Parav Pandit wrote:
> 
> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: Tuesday, October 10, 2023 9:37 PM
> > 
> > On Tue, Oct 10, 2023 at 12:03:29PM -0400, Michael S. Tsirkin wrote:
> > > On Tue, Oct 10, 2023 at 12:59:37PM -0300, Jason Gunthorpe wrote:
> > > > On Tue, Oct 10, 2023 at 11:14:56AM -0400, Michael S. Tsirkin wrote:
> > > >
> > > > > I suggest 3 but call it on the VF. commands will switch to PF
> > > > > internally as needed. For example, intel might be interested in
> > > > > exposing admin commands through a memory BAR of VF itself.
> 
> If in the future if one does admin command on the VF memory BAR, there is no need of cast either.
> vfio-virtio-pci driver can do on the pci vf device directly.

This is why I want the API to get the VF pci device as a parameter.
I don't get what is cyclic about it, yet.

> (though per VF memory registers would be anti-scale design for real hw; to discuss in other forum).

Up to the hardware vendor, really.
Michael S. Tsirkin Oct. 10, 2023, 8:42 p.m. UTC | #32
On Tue, Oct 10, 2023 at 07:09:08PM +0300, Yishai Hadas wrote:
> 
> > > Assuming that we'll put each command inside virtio as the generic layer, we
> > > won't be able to call/use this API internally to get the PF as of cyclic
> > > dependencies between the modules, link will fail.

I just mean:
virtio_admin_legacy_io_write(struct pci_dev *,  ....)


Internally it starts from the VF, gets the PF (or the VF itself, or whatever
the transport is), sends the command, gets the status, and returns.

What is cyclic here?
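
A rough sketch of that shape, where the export takes the VF's pci_dev and
resolves the PF internally; the body is an assumption built from pieces
already visible in this thread (virtio_pci_vf_get_pf_dev(),
virtio_admin_cmd_exec(), the 1-based group_member_id), with the opcode and
payload setup elided:

/*
 * Sketch only: the per-command export takes the VF's pci_dev, resolves the
 * owning PF (or whatever the transport provides) internally and executes
 * the admin command there.
 */
int virtio_admin_legacy_io_write(struct pci_dev *vf_pdev,
				 u8 offset, u8 size, u8 *buf)
{
	struct virtio_device *pf_vdev = virtio_pci_vf_get_pf_dev(vf_pdev);
	struct virtio_admin_cmd cmd = {};
	int vf_id = pci_iov_vf_id(vf_pdev);

	if (!pf_vdev || vf_id < 0)
		return -ENOTCONN;

	cmd.group_member_id = vf_id + 1;
	/* ... set the legacy-write opcode and attach buf as the payload ... */
	return virtio_admin_cmd_exec(pf_vdev, &cmd);
}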
Christoph Hellwig Oct. 11, 2023, 6:12 a.m. UTC | #33
On Tue, Oct 10, 2023 at 06:43:32PM +0300, Yishai Hadas wrote:
> > I suggest 3 but call it on the VF. commands will switch to PF
> > internally as needed. For example, intel might be interested in exposing
> > admin commands through a memory BAR of VF itself.
> > 
> The driver who owns the VF is VFIO, it's not a VIRTIO one.

And to loop back to my previous discussion: that's the fundamental
problem here.  If it were owned by the virtio subsystem, which just
calls into vfio, we would not have this problem, including the
circular loops and exposed APIs.
Christoph Hellwig Oct. 11, 2023, 6:13 a.m. UTC | #34
On Tue, Oct 10, 2023 at 12:59:37PM -0300, Jason Gunthorpe wrote:
> On Tue, Oct 10, 2023 at 11:14:56AM -0400, Michael S. Tsirkin wrote:
> 
> > I suggest 3 but call it on the VF. commands will switch to PF
> > internally as needed. For example, intel might be interested in exposing
> > admin commands through a memory BAR of VF itself.
> 
> FWIW, we have been pushing back on such things in VFIO, so it will
> have to be very carefully security justified.
> 
> Probably since that is not standard it should just live in under some
> intel-only vfio driver behavior, not in virtio land.

Btw, what is that Intel thing everyone is talking about?  And why
would the virtio core support vendor-specific behavior like that?
Christoph Hellwig Oct. 11, 2023, 6:26 a.m. UTC | #35
On Tue, Oct 10, 2023 at 10:10:31AM -0300, Jason Gunthorpe wrote:
> We've talked around ideas like allowing the VF config space to do some
> of the work. For simple devices we could get away with 1 VF config
> space register. (VF config space is owned by the hypervisor, not the
> guest)

Which assumes you're actually using VFs and not multiple PFs, which
is a very limiting assumption.  It also keeps you from actually
using DMA during the live migration process, which again is a major
limitation once you have a non-trivial amount of state.

> SIOVr2 is discussing more a flexible RID mapping - there is a possible
> route where a "VF" could actually have two RIDs, a hypervisor RID and a
> guest RID.

Well, then you go down the SIOV route, which requires a complex driver
actually presenting the guest-visible device anyway.

> It really is PCI limitations that force this design of making a PF
> driver do dual duty as a fully functionally normal device and act as a
> communication channel proxy to make a back channel into a SRIOV VF.
> 
> My view has always been that the VFIO live migration operations are
> executed logically within the VF as they only effect the VF.
> 
> So we have a logical design seperation where VFIO world owns the
> commands and the PF driver supplies the communication channel. This
> works well for devices that already have a robust RPC interface to
> their device FW.

Independent of my above points on the doubts about VF-controlled live
migration for PCIe devices, I absolutely agree with you that the Linux
abstraction and user interface should be VF based.  Which further
reinforces my point that the VFIO driver for the controlled function
(PF or VF) and the Linux driver for the controlling function (better
be a PF in practice) must be very tightly integrated.  And the best
way to do that is to export the vfio nodes from the Linux driver
that knows the hardware and not split them out into a separate one.

> > The driver that knows this hardware.  In this case the virtio subsystem,
> > in case of nvme the nvme driver, and in case of mlx5 the mlx5 driver.
> 
> But those are drivers operating the HW to create kernel devices. Here
> we need a VFIO device. They can't co-exist, if you switch mlx5 from
> normal to vfio you have to tear down the entire normal driver.

Yes, absolutely.  And if we're smart enough we structure it in a way
that we never even initialize the bits of the driver that are only needed
for the normal kernel consumers.

> > No.  That layout logically follows from what codebase the functionality
> > is part of, though.
> 
> I don't understand what we are talking about really. Where do you
> imagine the vfio_register_XX() goes?

In the driver controlling the hardware.  E.g. for virtio in
drivers/virtio/ and for nvme in drivers/nvme/ and for mlx5
in the mlx5 driver directory.

> > > I don't know what "fake-legacy" even means, VFIO is not legacy.
> > 
> > The driver we're talking about in this thread fakes up a virtio_pci
> > legacy devie to the guest on top of a "modern" virtio_pci device.
> 
> I'm not sure I'd use the word fake, inb/outb are always trapped
> operations in VMs. If the device provided a real IO BAR then VFIO
> common code would trap and relay inb/outb to the device.
> 
> All this is doing is changing the inb/outb relay from using a physical
> IO BAR to a DMA command ring.
> 
> The motivation is simply because normal IO BAR space is incredibly
> limited and you can't get enough SRIOV functions when using it.

The fake is not meant as a judgement.  But it creates a virtio-legacy
device that in this form does not exist in hardware.  That's what
I call fake.  If you prefer a different term that's fine with me too.

> > > There is alot of code in VFIO and the VMM side to take a VF and turn
> > > it into a vPCI function. You can't just trivially duplicate VFIO in a
> > > dozen drivers without creating a giant mess.
> > 
> > I do not advocate for duplicating it.  But the code that calls this
> > functionality belongs into the driver that deals with the compound
> > device that we're doing this work for.
> 
> On one hand, I don't really care - we can put the code where people
> like.
> 
> However - the Intel GPU VFIO driver is such a bad experiance I don't
> want to encourage people to make VFIO drivers, or code that is only
> used by VFIO drivers, that are not under drivers/vfio review.

We can and should require vfio review for users of the vfio API.
But to be honest, code placement was not the problem with i915.  The
problem was that the mdev APIs (under drivers/vfio) were a complete
trainwreck when they were written, and that the driver had a horrible
hypervisor API abstraction.

> Be aware, there is a significant performance concern here. If you want
> to create 1000 VFIO devices (this is a real thing), we *can't* probe a
> normal driver first, it is too slow. We need a path that goes directly
> from creating the RIDs to turning those RIDs into VFIO.

And by calling the vfio functions from mlx5 you get this easily.

But I think you're totally mixing things up here anyway.

For mdev/SIOV like flows you must call vfio APIs from the main
driver anyway, as there is no pci_dev to probe on.  That's
what i915 does btw.

For "classic" vfio that requires a pci_dev (or $otherbus_dev) we need
to have a similar flow.  And I think the best way is to have the
bus-level attribute on the device and/or a device-specific side band
protocol to device how new functions are probed.  With that you
avoid all the duplicate PCI IDs for the binding, and actually allow to
sanely establush a communication channel between the functions.
Because without that there is no way to know how any two functions
related.  The driver might think they know, but there's all kinds of
whacky PCI passthough schemes that will break such a logic.
Michael S. Tsirkin Oct. 11, 2023, 6:43 a.m. UTC | #36
On Tue, Oct 10, 2023 at 11:13:30PM -0700, Christoph Hellwig wrote:
> On Tue, Oct 10, 2023 at 12:59:37PM -0300, Jason Gunthorpe wrote:
> > On Tue, Oct 10, 2023 at 11:14:56AM -0400, Michael S. Tsirkin wrote:
> > 
> > > I suggest 3 but call it on the VF. commands will switch to PF
> > > internally as needed. For example, intel might be interested in exposing
> > > admin commands through a memory BAR of VF itself.
> > 
> > FWIW, we have been pushing back on such things in VFIO, so it will
> > have to be very carefully security justified.
> > 
> > Probably since that is not standard it should just live in under some
> > intel-only vfio driver behavior, not in virtio land.
> 
> Btw, what is that intel thing everyone is talking about?  And why
> would the virtio core support vendor specific behavior like that?

It's not a thing it's Zhu Lingshan :) intel is just one of the vendors
that implemented vdpa support and so Zhu Lingshan from intel is working
on vdpa and has also proposed virtio spec extensions for migration.
intel's driver is called ifcvf.  vdpa composes all this stuff that is
added to vfio in userspace, so it's a different approach.
Christoph Hellwig Oct. 11, 2023, 6:59 a.m. UTC | #37
On Wed, Oct 11, 2023 at 02:43:37AM -0400, Michael S. Tsirkin wrote:
> > Btw, what is that intel thing everyone is talking about?  And why
> > would the virtio core support vendor specific behavior like that?
> 
> It's not a thing it's Zhu Lingshan :) intel is just one of the vendors
> that implemented vdpa support and so Zhu Lingshan from intel is working
> on vdpa and has also proposed virtio spec extensions for migration.
> intel's driver is called ifcvf.  vdpa composes all this stuff that is
> added to vfio in userspace, so it's a different approach.

Well, so let's call it virtio live migration instead of intel.

And please work all together in the virtio committee that you have
one way of communication between controlling and controlled functions.
If one extension does it one way and the other a different way that's
just creating a giant mess.
Yishai Hadas Oct. 11, 2023, 7:44 a.m. UTC | #38
On 10/10/2023 23:42, Michael S. Tsirkin wrote:
> On Tue, Oct 10, 2023 at 07:09:08PM +0300, Yishai Hadas wrote:
>>>> Assuming that we'll put each command inside virtio as the generic layer, we
>>>> won't be able to call/use this API internally to get the PF as of cyclic
>>>> dependencies between the modules, link will fail.
> I just mean:
> virtio_admin_legacy_io_write(sruct pci_device *,  ....)
>
>
> internally it starts from vf gets the pf (or vf itself or whatever
> the transport is) sends command gets status returns.
>
> what is cyclic here?
>
virtio-pci depends on virtio [1].

If we put the commands in the generic layer as we expect them to be
(i.e. virtio), then trying to internally call
virtio_pci_vf_get_pf_dev() to get the PF from the VF will end up in a
cyclic linker error, as shown below [2].

Given that, one could suggest putting the commands in virtio-pci;
however, this would fully bypass the generic layer of virtio and future
clients won't be able to use it.

In addition, passing in the VF PCI pointer instead of the VF group
member ID + the virtio PF device will require duplicating each command
in the future once we use SIOV devices.

Instead, we suggest the below API for the above example.

virtio_admin_legacy_io_write(virtio_device *virtio_dev,  u64 
group_member_id,  ....)
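
As a rough sketch only, this is how such a generic entry point and a
PCI-side caller could fit together (the parameters beyond the two shown
above and the caller name are assumptions for illustration, not final
code):

int virtio_admin_legacy_io_write(struct virtio_device *virtio_dev,
				 u64 group_member_id, u16 opcode,
				 u8 offset, u8 size, u8 *buf);

/* Hypothetical PCI-transport caller: resolve the owner virtio device
 * and the group member id, then call into the generic virtio layer. */
static int example_vf_legacy_write(struct virtio_device *pf_virtio_dev,
				   struct pci_dev *vf_pdev, u16 opcode,
				   u8 offset, u8 size, u8 *buf)
{
	int vf_id = pci_iov_vf_id(vf_pdev);

	if (vf_id < 0)
		return vf_id;

	return virtio_admin_legacy_io_write(pf_virtio_dev, vf_id + 1,
					    opcode, offset, size, buf);
}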

[1]

[yishaih@reg-l-vrt-209 linux]$ modinfo virtio-pci
filename: /lib/modules/6.6.0-rc2+/kernel/drivers/virtio/virtio_pci.ko
version:        1
license:        GPL
description:    virtio-pci
author:         Anthony Liguori <aliguori@us.ibm.com>
srcversion:     7355EAC9408D38891938391
alias:          pci:v00001AF4d*sv*sd*bc*sc*i*
depends: virtio_pci_modern_dev,virtio,virtio_ring,virtio_pci_legacy_dev
retpoline:      Y
intree:         Y
name:           virtio_pci
vermagic:       6.6.0-rc2+ SMP preempt mod_unload modversions
parm:           force_legacy:Force legacy mode for transitional virtio 1 
devices (bool)

[2]

depmod: ERROR: Cycle detected: virtio -> virtio_pci -> virtio
depmod: ERROR: Found 2 modules in dependency cycles!
make[2]: *** [scripts/Makefile.modinst:128: depmod] Error 1
make[1]: *** [/images/yishaih/src/kernel/linux/Makefile:1821: 
modules_install] Error 2

Yishai
Parav Pandit Oct. 11, 2023, 8 a.m. UTC | #39
Hi Christoph,

> From: Christoph Hellwig <hch@infradead.org>
> Sent: Wednesday, October 11, 2023 12:29 PM
> 
> On Wed, Oct 11, 2023 at 02:43:37AM -0400, Michael S. Tsirkin wrote:
> > > Btw, what is that intel thing everyone is talking about?  And why
> > > would the virtio core support vendor specific behavior like that?
> >
> > It's not a thing it's Zhu Lingshan :) intel is just one of the vendors
> > that implemented vdpa support and so Zhu Lingshan from intel is
> > working on vdpa and has also proposed virtio spec extensions for migration.
> > intel's driver is called ifcvf.  vdpa composes all this stuff that is
> > added to vfio in userspace, so it's a different approach.
> 
> Well, so let's call it virtio live migration instead of intel.
> 
> And please work all together in the virtio committee that you have one way of
> communication between controlling and controlled functions.
> If one extension does it one way and the other a different way that's just
> creating a giant mess.

We in the virtio committee are working on VF device migration where:
VF = controlled function
PF = controlling function

The second proposal is what Michael mentioned from Intel that somehow combines the controlled and controlling functions as a single entity on the VF.

The main reasons I find it weird are:
1. it must always do mediation to fake the device reset and FLR flows
2. dma cannot work, as you explained, for complex device state
3. it needs constant knowledge of every tiny detail of each virtio device type

Such a single entity appears very weird to me, but maybe it is just me.
Michael S. Tsirkin Oct. 11, 2023, 8:02 a.m. UTC | #40
On Wed, Oct 11, 2023 at 10:44:49AM +0300, Yishai Hadas wrote:
> On 10/10/2023 23:42, Michael S. Tsirkin wrote:
> > On Tue, Oct 10, 2023 at 07:09:08PM +0300, Yishai Hadas wrote:
> > > > > Assuming that we'll put each command inside virtio as the generic layer, we
> > > > > won't be able to call/use this API internally to get the PF as of cyclic
> > > > > dependencies between the modules, link will fail.
> > I just mean:
> > virtio_admin_legacy_io_write(sruct pci_device *,  ....)
> > 
> > 
> > internally it starts from vf gets the pf (or vf itself or whatever
> > the transport is) sends command gets status returns.
> > 
> > what is cyclic here?
> > 
> virtio-pci depends on virtio [1].
> 
> If we put the commands in the generic layer as we expect it to be (i.e.
> virtio), then trying to call internally call for virtio_pci_vf_get_pf_dev()
> to get the PF from the VF will end-up by a linker cyclic error as of below
> [2].
> 
> As of that, someone can suggest to put the commands in virtio-pci, however
> this will fully bypass the generic layer of virtio and future clients won't
> be able to use it.

virtio_pci would get the pci device.
virtio-pci converts that to the owner's virtio device + group member id
and calls virtio.

no cycles and minimal transport specific code, right?

> In addition, passing in the VF PCI pointer instead of the VF group member ID
> + the VIRTIO PF device, will require in the future to duplicate each command
> once we'll use SIOV devices.

I don't think anyone knows how SIOV will look. But shuffling
APIs around is not a big deal. We'll see.

> Instead, we suggest the below API for the above example.
> 
> virtio_admin_legacy_io_write(virtio_device *virtio_dev,  u64
> group_member_id,  ....)
> 
> [1]

> [yishaih@reg-l-vrt-209 linux]$ modinfo virtio-pci
> filename: /lib/modules/6.6.0-rc2+/kernel/drivers/virtio/virtio_pci.ko
> version:        1
> license:        GPL
> description:    virtio-pci
> author:         Anthony Liguori <aliguori@us.ibm.com>
> srcversion:     7355EAC9408D38891938391
> alias:          pci:v00001AF4d*sv*sd*bc*sc*i*
> depends: virtio_pci_modern_dev,virtio,virtio_ring,virtio_pci_legacy_dev
> retpoline:      Y
> intree:         Y
> name:           virtio_pci
> vermagic:       6.6.0-rc2+ SMP preempt mod_unload modversions
> parm:           force_legacy:Force legacy mode for transitional virtio 1
> devices (bool)
> 
> [2]
> 
> depmod: ERROR: Cycle detected: virtio -> virtio_pci -> virtio
> depmod: ERROR: Found 2 modules in dependency cycles!
> make[2]: *** [scripts/Makefile.modinst:128: depmod] Error 1
> make[1]: *** [/images/yishaih/src/kernel/linux/Makefile:1821:
> modules_install] Error 2
> 
> Yishai

virtio absolutely must not depend on virtio pci, it is used on
systems without pci at all.
Michael S. Tsirkin Oct. 11, 2023, 8:10 a.m. UTC | #41
On Wed, Oct 11, 2023 at 08:00:57AM +0000, Parav Pandit wrote:
> Hi Christoph,
> 
> > From: Christoph Hellwig <hch@infradead.org>
> > Sent: Wednesday, October 11, 2023 12:29 PM
> > 
> > On Wed, Oct 11, 2023 at 02:43:37AM -0400, Michael S. Tsirkin wrote:
> > > > Btw, what is that intel thing everyone is talking about?  And why
> > > > would the virtio core support vendor specific behavior like that?
> > >
> > > It's not a thing it's Zhu Lingshan :) intel is just one of the vendors
> > > that implemented vdpa support and so Zhu Lingshan from intel is
> > > working on vdpa and has also proposed virtio spec extensions for migration.
> > > intel's driver is called ifcvf.  vdpa composes all this stuff that is
> > > added to vfio in userspace, so it's a different approach.
> > 
> > Well, so let's call it virtio live migration instead of intel.
> > 
> > And please work all together in the virtio committee that you have one way of
> > communication between controlling and controlled functions.
> > If one extension does it one way and the other a different way that's just
> > creating a giant mess.
> 
> We in virtio committee are working on VF device migration where:
> VF = controlled function
> PF = controlling function
> 
> The second proposal is what Michael mentioned from Intel that somehow combine controlled and controlling function as single entity on VF.
> 
> The main reasons I find it weird are:
> 1. it must always need to do mediation to do fake the device reset, and flr flows
> 2. dma cannot work as you explained for complex device state
> 3. it needs constant knowledge of each tiny things for each virtio device type
> 
> Such single entity appears a bit very weird to me but maybe it is just me.

Yea it appears to include everyone from nvidia. Others are used to it -
this is exactly what happens with virtio generally. E.g. vhost
processes fast path in the kernel and control path is in userspace.
vdpa has been largely modeled after that, for better or worse.
Michael S. Tsirkin Oct. 11, 2023, 8:12 a.m. UTC | #42
On Tue, Oct 10, 2023 at 11:59:26PM -0700, Christoph Hellwig wrote:
> On Wed, Oct 11, 2023 at 02:43:37AM -0400, Michael S. Tsirkin wrote:
> > > Btw, what is that intel thing everyone is talking about?  And why
> > > would the virtio core support vendor specific behavior like that?
> > 
> > It's not a thing it's Zhu Lingshan :) intel is just one of the vendors
> > that implemented vdpa support and so Zhu Lingshan from intel is working
> > on vdpa and has also proposed virtio spec extensions for migration.
> > intel's driver is called ifcvf.  vdpa composes all this stuff that is
> > added to vfio in userspace, so it's a different approach.
> 
> Well, so let's call it virtio live migration instead of intel.
> 
> And please work all together in the virtio committee that you have
> one way of communication between controlling and controlled functions.
> If one extension does it one way and the other a different way that's
> just creating a giant mess.

Absolutely, this is exactly what I keep suggesting. Thanks for
bringing this up, will help me drive the point home!
Yishai Hadas Oct. 11, 2023, 8:58 a.m. UTC | #43
On 11/10/2023 11:02, Michael S. Tsirkin wrote:
> On Wed, Oct 11, 2023 at 10:44:49AM +0300, Yishai Hadas wrote:
>> On 10/10/2023 23:42, Michael S. Tsirkin wrote:
>>> On Tue, Oct 10, 2023 at 07:09:08PM +0300, Yishai Hadas wrote:
>>>>>> Assuming that we'll put each command inside virtio as the generic layer, we
>>>>>> won't be able to call/use this API internally to get the PF as of cyclic
>>>>>> dependencies between the modules, link will fail.
>>> I just mean:
>>> virtio_admin_legacy_io_write(sruct pci_device *,  ....)
>>>
>>>
>>> internally it starts from vf gets the pf (or vf itself or whatever
>>> the transport is) sends command gets status returns.
>>>
>>> what is cyclic here?
>>>
>> virtio-pci depends on virtio [1].
>>
>> If we put the commands in the generic layer as we expect it to be (i.e.
>> virtio), then trying to call internally call for virtio_pci_vf_get_pf_dev()
>> to get the PF from the VF will end-up by a linker cyclic error as of below
>> [2].
>>
>> As of that, someone can suggest to put the commands in virtio-pci, however
>> this will fully bypass the generic layer of virtio and future clients won't
>> be able to use it.
> virtio_pci would get pci device.
> virtio pci convers that to virtio device of owner + group member id and calls virtio.

Do you suggest another set of exported symbols (i.e. per command) in
virtio which will get the owner device + group member + the extra
command-specific parameters?

This will end up adding an exported symbol per command.

> no cycles and minimal transport specific code, right?

See my above note: if we can just call virtio without any further work
on the command's input, then YES.

If so, virtio will prepare the command by setting the relevant SG lists
and other data and will finally call:

vdev->config->exec_admin_cmd(vdev, cmd);

Was that your plan?

>
>> In addition, passing in the VF PCI pointer instead of the VF group member ID
>> + the VIRTIO PF device, will require in the future to duplicate each command
>> once we'll use SIOV devices.
> I don't think anyone knows how will SIOV look. But shuffling
> APIs around is not a big deal. We'll see.

As you are the maintainer it's up to you; just need to consider the
further duplication here.

Yishai

>
>> Instead, we suggest the below API for the above example.
>>
>> virtio_admin_legacy_io_write(virtio_device *virtio_dev,  u64
>> group_member_id,  ....)
>>
>> [1]
>> [yishaih@reg-l-vrt-209 linux]$ modinfo virtio-pci
>> filename: /lib/modules/6.6.0-rc2+/kernel/drivers/virtio/virtio_pci.ko
>> version:        1
>> license:        GPL
>> description:    virtio-pci
>> author:         Anthony Liguori <aliguori@us.ibm.com>
>> srcversion:     7355EAC9408D38891938391
>> alias:          pci:v00001AF4d*sv*sd*bc*sc*i*
>> depends: virtio_pci_modern_dev,virtio,virtio_ring,virtio_pci_legacy_dev
>> retpoline:      Y
>> intree:         Y
>> name:           virtio_pci
>> vermagic:       6.6.0-rc2+ SMP preempt mod_unload modversions
>> parm:           force_legacy:Force legacy mode for transitional virtio 1
>> devices (bool)
>>
>> [2]
>>
>> depmod: ERROR: Cycle detected: virtio -> virtio_pci -> virtio
>> depmod: ERROR: Found 2 modules in dependency cycles!
>> make[2]: *** [scripts/Makefile.modinst:128: depmod] Error 1
>> make[1]: *** [/images/yishaih/src/kernel/linux/Makefile:1821:
>> modules_install] Error 2
>>
>> Yishai
> virtio absolutely must not depend on virtio pci, it is used on
> systems without pci at all.
>
Michael S. Tsirkin Oct. 11, 2023, 9:03 a.m. UTC | #44
On Wed, Oct 11, 2023 at 11:58:11AM +0300, Yishai Hadas wrote:
> On 11/10/2023 11:02, Michael S. Tsirkin wrote:
> > On Wed, Oct 11, 2023 at 10:44:49AM +0300, Yishai Hadas wrote:
> > > On 10/10/2023 23:42, Michael S. Tsirkin wrote:
> > > > On Tue, Oct 10, 2023 at 07:09:08PM +0300, Yishai Hadas wrote:
> > > > > > > Assuming that we'll put each command inside virtio as the generic layer, we
> > > > > > > won't be able to call/use this API internally to get the PF as of cyclic
> > > > > > > dependencies between the modules, link will fail.
> > > > I just mean:
> > > > virtio_admin_legacy_io_write(sruct pci_device *,  ....)
> > > > 
> > > > 
> > > > internally it starts from vf gets the pf (or vf itself or whatever
> > > > the transport is) sends command gets status returns.
> > > > 
> > > > what is cyclic here?
> > > > 
> > > virtio-pci depends on virtio [1].
> > > 
> > > If we put the commands in the generic layer as we expect it to be (i.e.
> > > virtio), then trying to call internally call for virtio_pci_vf_get_pf_dev()
> > > to get the PF from the VF will end-up by a linker cyclic error as of below
> > > [2].
> > > 
> > > As of that, someone can suggest to put the commands in virtio-pci, however
> > > this will fully bypass the generic layer of virtio and future clients won't
> > > be able to use it.
> > virtio_pci would get pci device.
> > virtio pci convers that to virtio device of owner + group member id and calls virtio.
> 
> Do you suggest another set of exported symbols (i.e per command ) in virtio
> which will get the owner device + group member + the extra specific command
> parameters ?
> 
> This will end-up duplicating the number of export symbols per command.

Or make them inline.
Or maybe the specific commands should actually even live inside
virtio-pci; they are PCI specific after all.

> > no cycles and minimal transport specific code, right?
> 
> See my above note, if we may just call virtio without any further work on
> the command's input, than YES.
> 
> If so, virtio will prepare the command by setting the relevant SG lists and
> other data and finally will call:
> 
> vdev->config->exec_admin_cmd(vdev, cmd);
> 
> Was that your plan ?

Is vdev the PF? Then it won't support the transport where commands
are submitted through BAR0 of the VF itself.

> > 
> > > In addition, passing in the VF PCI pointer instead of the VF group member ID
> > > + the VIRTIO PF device, will require in the future to duplicate each command
> > > once we'll use SIOV devices.
> > I don't think anyone knows how will SIOV look. But shuffling
> > APIs around is not a big deal. We'll see.
> 
> As you are the maintainer it's up-to-you, just need to consider another
> further duplication here.
> 
> Yishai
> 
> > 
> > > Instead, we suggest the below API for the above example.
> > > 
> > > virtio_admin_legacy_io_write(virtio_device *virtio_dev,  u64
> > > group_member_id,  ....)
> > > 
> > > [1]
> > > [yishaih@reg-l-vrt-209 linux]$ modinfo virtio-pci
> > > filename: /lib/modules/6.6.0-rc2+/kernel/drivers/virtio/virtio_pci.ko
> > > version:        1
> > > license:        GPL
> > > description:    virtio-pci
> > > author:         Anthony Liguori <aliguori@us.ibm.com>
> > > srcversion:     7355EAC9408D38891938391
> > > alias:          pci:v00001AF4d*sv*sd*bc*sc*i*
> > > depends: virtio_pci_modern_dev,virtio,virtio_ring,virtio_pci_legacy_dev
> > > retpoline:      Y
> > > intree:         Y
> > > name:           virtio_pci
> > > vermagic:       6.6.0-rc2+ SMP preempt mod_unload modversions
> > > parm:           force_legacy:Force legacy mode for transitional virtio 1
> > > devices (bool)
> > > 
> > > [2]
> > > 
> > > depmod: ERROR: Cycle detected: virtio -> virtio_pci -> virtio
> > > depmod: ERROR: Found 2 modules in dependency cycles!
> > > make[2]: *** [scripts/Makefile.modinst:128: depmod] Error 1
> > > make[1]: *** [/images/yishaih/src/kernel/linux/Makefile:1821:
> > > modules_install] Error 2
> > > 
> > > Yishai
> > virtio absolutely must not depend on virtio pci, it is used on
> > systems without pci at all.
> >
Yishai Hadas Oct. 11, 2023, 11:25 a.m. UTC | #45
On 11/10/2023 12:03, Michael S. Tsirkin wrote:
> On Wed, Oct 11, 2023 at 11:58:11AM +0300, Yishai Hadas wrote:
>> On 11/10/2023 11:02, Michael S. Tsirkin wrote:
>>> On Wed, Oct 11, 2023 at 10:44:49AM +0300, Yishai Hadas wrote:
>>>> On 10/10/2023 23:42, Michael S. Tsirkin wrote:
>>>>> On Tue, Oct 10, 2023 at 07:09:08PM +0300, Yishai Hadas wrote:
>>>>>>>> Assuming that we'll put each command inside virtio as the generic layer, we
>>>>>>>> won't be able to call/use this API internally to get the PF as of cyclic
>>>>>>>> dependencies between the modules, link will fail.
>>>>> I just mean:
>>>>> virtio_admin_legacy_io_write(sruct pci_device *,  ....)
>>>>>
>>>>>
>>>>> internally it starts from vf gets the pf (or vf itself or whatever
>>>>> the transport is) sends command gets status returns.
>>>>>
>>>>> what is cyclic here?
>>>>>
>>>> virtio-pci depends on virtio [1].
>>>>
>>>> If we put the commands in the generic layer as we expect it to be (i.e.
>>>> virtio), then trying to call internally call for virtio_pci_vf_get_pf_dev()
>>>> to get the PF from the VF will end-up by a linker cyclic error as of below
>>>> [2].
>>>>
>>>> As of that, someone can suggest to put the commands in virtio-pci, however
>>>> this will fully bypass the generic layer of virtio and future clients won't
>>>> be able to use it.
>>> virtio_pci would get pci device.
>>> virtio pci convers that to virtio device of owner + group member id and calls virtio.
>> Do you suggest another set of exported symbols (i.e per command ) in virtio
>> which will get the owner device + group member + the extra specific command
>> parameters ?
>>
>> This will end-up duplicating the number of export symbols per command.
> Or make them inline.
> Or maybe actually even the specific commands should live inside virtio pci
> they are pci specific after all.

OK, let's leave them in virtio-pci as you suggested here.

You can see below [1] a scheme of how a specific command will look.

Few notes:
- virtio_pci_vf_get_pf_dev() will become a static function.

- The commands will be placed inside virtio_pci_common.c and will
locally use the above static function to get the owner PF.

- After preparing the command we may call vp_avq_cmd_exec() directly,
which is part of virtio-pci, rather than calling into virtio.

- vp_avq_cmd_exec() will be part of virtio_pci_modern.c as you asked in 
the ML.

- The AQ creation/destruction will still be done upon probing virtio
as it was in V0; it will use the underlying config->create/destroy_avq()
ops if they exist.

- virtio_admin_cmd_exec() won't be exported outside virtio any more;
we'll have an exported symbol in virtio-pci per command.

Is the above fine with you?

By the way, from an API namespace POV, are you fine with
virtio_admin_legacy_io_write(), or should we have '_pci' as part of the
name (i.e. virtio_pci_admin_legacy_io_write)?

[1]

int virtio_admin_legacy_io_write(struct pci_dev *pdev, u16 opcode, u8 offset,
                                 u8 size, u8 *buf)
{
     struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev);
     struct virtio_admin_cmd_legacy_wr_data *in;
     struct virtio_admin_cmd cmd = {};
     struct scatterlist in_sg;
     int ret;
     int vf_id;

     if (!virtio_dev)
         return -ENODEV;

     vf_id = pci_iov_vf_id(pdev);
     if (vf_id < 0)
         return -EINVAL;

     in = kzalloc(sizeof(*in) + size, GFP_KERNEL);
     if (!in)
         return -ENOMEM;

     in->offset = offset;
     memcpy(in->registers, buf, size);
     sg_init_one(&in_sg, in, sizeof(*in) + size);
     cmd.opcode = opcode;
     cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
     cmd.group_member_id = vf_id + 1;
     cmd.data_sg = &in_sg;
     ret = vp_avq_cmd_exec(virtio_dev, &cmd);

     kfree(in);
     return ret;
}
EXPORT_SYMBOL_GPL(virtio_admin_legacy_io_write);
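
For completeness, a hypothetical call site on the vfio-virtio side could
then be as simple as the following (the wrapper name and the opcode
constant are assumptions here, not the final driver code):

static int
virtiovf_example_legacy_cfg_write(struct virtiovf_pci_core_device *virtvdev,
				  u8 offset, u8 size, u8 *buf)
{
	/* Relay a trapped legacy common-config write through the
	 * admin queue of the owning PF. */
	return virtio_admin_legacy_io_write(virtvdev->core_device.pdev,
					    VIRTIO_ADMIN_CMD_LEGACY_COMMON_CFG_WRITE,
					    offset, size, buf);
}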

>
>>> no cycles and minimal transport specific code, right?
>> See my above note, if we may just call virtio without any further work on
>> the command's input, than YES.
>>
>> If so, virtio will prepare the command by setting the relevant SG lists and
>> other data and finally will call:
>>
>> vdev->config->exec_admin_cmd(vdev, cmd);
>>
>> Was that your plan ?
> is vdev the pf? then it won't support the transport where commands
> are submitted through bar0 of vf itself.

Yes, it's the PF.
Based on the current spec, for the existing admin commands we issue
commands only on the PF.

In any case, moving to the above suggested scheme, handling each command
separately and taking the VF PCI device as the first argument, we now
have full control over any future command.

Yishai

>>>> In addition, passing in the VF PCI pointer instead of the VF group member ID
>>>> + the VIRTIO PF device, will require in the future to duplicate each command
>>>> once we'll use SIOV devices.
>>> I don't think anyone knows how will SIOV look. But shuffling
>>> APIs around is not a big deal. We'll see.
>> As you are the maintainer it's up-to-you, just need to consider another
>> further duplication here.
>>
>> Yishai
>>
>>>> Instead, we suggest the below API for the above example.
>>>>
>>>> virtio_admin_legacy_io_write(virtio_device *virtio_dev,  u64
>>>> group_member_id,  ....)
>>>>
>>>> [1]
>>>> [yishaih@reg-l-vrt-209 linux]$ modinfo virtio-pci
>>>> filename: /lib/modules/6.6.0-rc2+/kernel/drivers/virtio/virtio_pci.ko
>>>> version:        1
>>>> license:        GPL
>>>> description:    virtio-pci
>>>> author:         Anthony Liguori <aliguori@us.ibm.com>
>>>> srcversion:     7355EAC9408D38891938391
>>>> alias:          pci:v00001AF4d*sv*sd*bc*sc*i*
>>>> depends: virtio_pci_modern_dev,virtio,virtio_ring,virtio_pci_legacy_dev
>>>> retpoline:      Y
>>>> intree:         Y
>>>> name:           virtio_pci
>>>> vermagic:       6.6.0-rc2+ SMP preempt mod_unload modversions
>>>> parm:           force_legacy:Force legacy mode for transitional virtio 1
>>>> devices (bool)
>>>>
>>>> [2]
>>>>
>>>> depmod: ERROR: Cycle detected: virtio -> virtio_pci -> virtio
>>>> depmod: ERROR: Found 2 modules in dependency cycles!
>>>> make[2]: *** [scripts/Makefile.modinst:128: depmod] Error 1
>>>> make[1]: *** [/images/yishaih/src/kernel/linux/Makefile:1821:
>>>> modules_install] Error 2
>>>>
>>>> Yishai
>>> virtio absolutely must not depend on virtio pci, it is used on
>>> systems without pci at all.
>>>
Jason Gunthorpe Oct. 11, 2023, 12:18 p.m. UTC | #46
On Wed, Oct 11, 2023 at 04:10:58AM -0400, Michael S. Tsirkin wrote:
> On Wed, Oct 11, 2023 at 08:00:57AM +0000, Parav Pandit wrote:
> > Hi Christoph,
> > 
> > > From: Christoph Hellwig <hch@infradead.org>
> > > Sent: Wednesday, October 11, 2023 12:29 PM
> > > 
> > > On Wed, Oct 11, 2023 at 02:43:37AM -0400, Michael S. Tsirkin wrote:
> > > > > Btw, what is that intel thing everyone is talking about?  And why
> > > > > would the virtio core support vendor specific behavior like that?
> > > >
> > > > It's not a thing it's Zhu Lingshan :) intel is just one of the vendors
> > > > that implemented vdpa support and so Zhu Lingshan from intel is
> > > > working on vdpa and has also proposed virtio spec extensions for migration.
> > > > intel's driver is called ifcvf.  vdpa composes all this stuff that is
> > > > added to vfio in userspace, so it's a different approach.
> > > 
> > > Well, so let's call it virtio live migration instead of intel.
> > > 
> > > And please work all together in the virtio committee that you have one way of
> > > communication between controlling and controlled functions.
> > > If one extension does it one way and the other a different way that's just
> > > creating a giant mess.
> > 
> > We in virtio committee are working on VF device migration where:
> > VF = controlled function
> > PF = controlling function
> > 
> > The second proposal is what Michael mentioned from Intel that somehow combine controlled and controlling function as single entity on VF.
> > 
> > The main reasons I find it weird are:
> > 1. it must always need to do mediation to do fake the device reset, and flr flows
> > 2. dma cannot work as you explained for complex device state
> > 3. it needs constant knowledge of each tiny things for each virtio device type
> > 
> > Such single entity appears a bit very weird to me but maybe it is just me.
> 
> Yea it appears to include everyone from nvidia. Others are used to it -
> this is exactly what happens with virtio generally. E.g. vhost
> processes fast path in the kernel and control path is in userspace.
> vdpa has been largely modeled after that, for better or worse.

As Parav says, you can't use DMA for any migration flows, and you open
a single VF scheme up to PCI P2P attacks from the VM. It is a pretty
bad design.

vfio reviewers will reject things like this that are not secure - we
just did for Intel E800, for instance.

With VDPA doing the same stuff as vfio I'm not sure who is auditing it
for security.

The simple way to be sure is to never touch the PCI function that has
DMA assigned to a VM from the hypervisor, except through config space.

Beyond that.. Well, think carefully about security.

IMHO the single-VF approach is not suitable for standardization.

Jason
Jason Gunthorpe Oct. 11, 2023, 1:57 p.m. UTC | #47
On Tue, Oct 10, 2023 at 11:26:42PM -0700, Christoph Hellwig wrote:
> On Tue, Oct 10, 2023 at 10:10:31AM -0300, Jason Gunthorpe wrote:
> > We've talked around ideas like allowing the VF config space to do some
> > of the work. For simple devices we could get away with 1 VF config
> > space register. (VF config space is owned by the hypervisor, not the
> > guest)
> 
> Which assumes you're actually using VFs and not multiple PFs, which
> is a very limiting assumption.  

? It doesn't matter whether it is a VF or PF, the same function's config
space could do simple migration.

> It also limits you from actually
> using DMA during the live migration process, which again is a major
> limitation once you have a non-trivial amount of state.

Yes, this is a dealbreaker for big cases. But we do see several
smaller/simpler devices that don't use DMA in their migration.

> > SIOVr2 is discussing more a flexible RID mapping - there is a possible
> > route where a "VF" could actually have two RIDs, a hypervisor RID and a
> > guest RID.
> 
> Well, then you go down the SIOV route, which requires a complex driver
> actually presenting the guest visible device anyway.

Yep
 
> Independent of my above points on the doubts on VF-controlled live
> migration for PCe device I absolutely agree with your that the Linux
> abstraction and user interface should be VF based.  Which further
> reinforeces my point that the VFIO driver for the controlled function
> (PF or VF) and the Linux driver for the controlling function (better
> be a PF in practice) must be very tightly integrated.  And the best
> way to do that is to export the vfio nodes from the Linux driver
> that knowns the hardware and not split out into a separate one.

I'm not sure how we get to "very tightly integrated". We have many
examples of live migration vfio drivers now and they do not seem to
require tight integration. The PF driver only has to provide a way to
execute a small number of proxied operations.

Regardless, I'm not too fussed about what directory the implementation
lives in, though I do prefer the current arrangement where VFIO only
stuff is in drivers/vfio. I like the process we have where subsystems
are responsible for the code that implements the subsystem ops.

> > However - the Intel GPU VFIO driver is such a bad experiance I don't
> > want to encourage people to make VFIO drivers, or code that is only
> > used by VFIO drivers, that are not under drivers/vfio review.
> 
> We can and should require vfio review for users of the vfio API.
> But to be honest code placement was not the problem with i915.  The
> problem was that the mdev APIs (under drivers/vfio) were a complete
> trainwreck when it was written, and that the driver had a horrible
> hypervisor API abstraction.

E800 also made some significant security mistakes that the VFIO side
caught. I think they would have been missed if it had gone through a
netdev tree.

Even unrelated to mdev, Intel GPU is still not using the vfio side
properly, and the way it hacked into KVM to try to get page tracking
is totally logically wrong (but Works For Me (tm))

Aside from technical concerns, I do have a big process worry
here. vfio is responsible for the security side of the review of
things implementing its ops.

> > Be aware, there is a significant performance concern here. If you want
> > to create 1000 VFIO devices (this is a real thing), we *can't* probe a
> > normal driver first, it is too slow. We need a path that goes directly
> > from creating the RIDs to turning those RIDs into VFIO.
> 
> And by calling the vfio funtions from mlx5 you get this easily.

"easily" I don't know about that :)

> For mdev/SIOV like flows you must call vfio APIs from the main
> driver anyway, as there is no pci_dev to probe on anyway.  That's
> what i915 does btw.

IMHO i915 is not an good example to copy.

mlx5 is already much closer to your ideal, and I would hold it up as the
right general direction for SIOV/mdev/etc, as we basically already do
a lot of the SIOV ideas.

mlx5 is a multi-subsystem device. It has driver components in net,
VDPA and infiniband. It can create non-PCI "functions".

It is not feasible, process wise, for all of this to live under one
directory. We *want* the driver split up by subystem and subsystem
maintainer.

So, we created the auxiliary_device stuff to manage this. It can do
what you are imagining, I think.

The core PF/VF driver is in charge of what to carve off to a
subsystem driver. IIRC mlx5 uses netlink to deliver commands to trigger
this (eg create a VDPA device). An auxiliary_device is created and the
target subsystem driver probes to that and autoloads. eg see
drivers/vdpa/mlx5/net/mlx5_vnet.c
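
For illustration only, a minimal sketch of what a VFIO-side driver
probing to such an auxiliary_device could look like; the
"mlx5_core.vfio_pci" match name and the function names are made up for
this example:

#include <linux/auxiliary_bus.h>
#include <linux/module.h>

static int example_vfio_aux_probe(struct auxiliary_device *adev,
				  const struct auxiliary_device_id *id)
{
	/* adev->dev.parent is the core device that carved us off; the
	 * core driver hands over its shared handle here and the VFIO
	 * parts register against it. */
	return 0;
}

static void example_vfio_aux_remove(struct auxiliary_device *adev)
{
}

static const struct auxiliary_device_id example_vfio_aux_ids[] = {
	{ .name = "mlx5_core.vfio_pci" },	/* "<module>.<name>" match */
	{}
};
MODULE_DEVICE_TABLE(auxiliary, example_vfio_aux_ids);

static struct auxiliary_driver example_vfio_aux_driver = {
	.name = "vfio_pci",
	.probe = example_vfio_aux_probe,
	.remove = example_vfio_aux_remove,
	.id_table = example_vfio_aux_ids,
};
module_auxiliary_driver(example_vfio_aux_driver);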

They are not 'tightly coupled', the opposite really. The
auxiliary_device comes along with an mlx5 API that allows all the
subsystems to do what they need on the HW mostly independently. For
mlx5 this is mostly a way to execute FW RPC commands.

So, if you want to turn the VFIO stuff inside out, I'd still suggest
to have the VFIO driver part under drivers/vfio and probe to an
auxiliary_device that represents the aspect of the HW to turn into VFIO
(or VDPA, or whatever). The 'core' driver can provide an appropriate
API between its VFIO part and its core part.

We lack a common uAPI to trigger this creation, but otherwise the
infrastructure exists and works well now. It allows subsystems to
remain together and complex devices to spread their functionality to
multiple subsystems.

The current pci_iov_get_pf_drvdata() hack in VFIO is really a shortcut
to doing the auxiliary_device stuff. (Actually we tried to build
this with auxiliary_device first; it did not work out, it needs more
driver core infrastructure.)

I can easily imagine all the current VFIO drivers probing to an
auxiliary_device and obtaining the VF pci_device and the handle for the
core functionality directly without the pci_iov_get_pf_drvdata()
approach.
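
As a sketch of that shortcut pattern (error handling and refcounting are
elided, and "example_pf_core_driver" stands in for the real PF driver
symbol):

static void *example_get_pf_handle(struct pci_dev *vf_pdev)
{
	/* Returns the PF's drvdata only while the named PF driver is
	 * bound to the PF of this VF. */
	void *pf_drvdata =
		pci_iov_get_pf_drvdata(vf_pdev, &example_pf_core_driver);

	return IS_ERR(pf_drvdata) ? NULL : pf_drvdata;
}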

> For "classic" vfio that requires a pci_dev (or $otherbus_dev) we need
> to have a similar flow.  And I think the best way is to have the
> bus-level attribute on the device and/or a device-specific side band
> protocol to device how new functions are probed.  With that you
> avoid all the duplicate PCI IDs for the binding, and actually allow to
> sanely establush a communication channel between the functions.
> Because without that there is no way to know how any two functions
> related.  The driver might think they know, but there's all kinds of
> whacky PCI passthough schemes that will break such a logic.

Yes, if things are not simple PF/VF then Linux struggles at the driver
core level. auxiliary_devices are a way out of that since one spot can
figure out how to assemble the multi-component device and then
delegate portions of the HW to other subsystems.

If something wants to probe its own driver to a PF/VF to assemble the
components it can do that and then bundle it up into an aux device and
trigger a VFIO/etc driver to run on that bundle of resources.

We don't *need* to put all the VFIO code someplace else to put the
control over slicing the HW into a shared core driver. mlx5 and
several other drivers now already demonstrates all of this.

Jason
Christoph Hellwig Oct. 11, 2023, 2:17 p.m. UTC | #48
On Wed, Oct 11, 2023 at 10:57:09AM -0300, Jason Gunthorpe wrote:
> > Independent of my above points on the doubts on VF-controlled live
> > migration for PCe device I absolutely agree with your that the Linux
> > abstraction and user interface should be VF based.  Which further
> > reinforeces my point that the VFIO driver for the controlled function
> > (PF or VF) and the Linux driver for the controlling function (better
> > be a PF in practice) must be very tightly integrated.  And the best
> > way to do that is to export the vfio nodes from the Linux driver
> > that knowns the hardware and not split out into a separate one.
> 
> I'm not sure how we get to "very tightly integrated". We have many
> examples of live migration vfio drivers now and they do not seem to
> require tight integration. The PF driver only has to provide a way to
> execute a small number of proxied operations.

Yes.  And for that I need to know what VF it actually is dealing
with.  Which is tight integration in my book.

> Regardless, I'm not too fussed about what directory the implementation
> lives in, though I do prefer the current arrangement where VFIO only
> stuff is in drivers/vfio. I like the process we have where subsystems
> are responsible for the code that implements the subsystem ops.

I really don't care about where the code lives (in the directory tree)
either.  But as you see with virtio, trying to split it out into
an arbitrary module causes all kinds of pain.

> 
> E800 also made some significant security mistakes that VFIO side
> caught. I think would have been missed if it went into a netdev
> tree.
> 
> Even unrelated to mdev, Intel GPU is still not using the vfio side
> properly, and the way it hacked into KVM to try to get page tracking
> is totally logically wrong (but Works For Me (tm))
> 
> Aside from technical concerns, I do have a big process worry
> here. vfio is responsible for the security side of the review of
> things implementing its ops.

Yes, anything exposing a vfio node needs vfio review, period.  And
I don't think where the code lived was the i915 problem.  The problem
was that they were the first open user of the mdev API, which was
just a badly designed hook for never-published code at that time, and
they then shoehorned it into a weird hypervisor abstraction.  There's
no good way to succeed with that.
Jason Gunthorpe Oct. 11, 2023, 2:58 p.m. UTC | #49
On Wed, Oct 11, 2023 at 07:17:25AM -0700, Christoph Hellwig wrote:
> On Wed, Oct 11, 2023 at 10:57:09AM -0300, Jason Gunthorpe wrote:
> > > Independent of my above points on the doubts on VF-controlled live
> > > migration for PCe device I absolutely agree with your that the Linux
> > > abstraction and user interface should be VF based.  Which further
> > > reinforeces my point that the VFIO driver for the controlled function
> > > (PF or VF) and the Linux driver for the controlling function (better
> > > be a PF in practice) must be very tightly integrated.  And the best
> > > way to do that is to export the vfio nodes from the Linux driver
> > > that knowns the hardware and not split out into a separate one.
> > 
> > I'm not sure how we get to "very tightly integrated". We have many
> > examples of live migration vfio drivers now and they do not seem to
> > require tight integration. The PF driver only has to provide a way to
> > execute a small number of proxied operations.
> 
> Yes.  And for that I need to know what VF it actually is dealing
> with.  Which is tight integration in my book.

Well, I see two modalities here

Simple devices with a fixed PF/VF relationship use a VF pci_device for
VFIO and pci_iov_get_pf_drvdata()/related APIs to assemble their
parts. This is very limited (and kind of hacky).

Complex devices can use an auxiliary_device for VFIO and assemble
their parts however they like.

After probe is done the VFIO code operates effectively identically
regardless of how the components were found.

Intel is going to submit their IDXD SIOV driver "soon" and I'd like to
pause there and have a real discussion about how to manage VFIO
lifecycle and dynamic "function" creation in this brave new world.

Ideally we can get a lifecycle API that works uniformly for PCI VFs
too. Then maybe this gets more resolved.

In my mind at least, definitely no mdevs and none of that sysfs GUID junk. :(

> I really don't care about where the code lives (in the directory tree)
> either.  But as you see with virtio trying to split it out into
> an arbitrary module causes all kinds of pain.

Trying to put VFIO-only code in virtio is what causes all the
issues. If you mis-design the API boundary everything will be painful,
no matter where you put the code.

Jason
Michael S. Tsirkin Oct. 11, 2023, 4:59 p.m. UTC | #50
On Wed, Oct 11, 2023 at 11:58:10AM -0300, Jason Gunthorpe wrote:
> Trying to put VFIO-only code in virtio is what causes all the
> issues. If you mis-design the API boundary everything will be painful,
> no matter where you put the code.

Are you implying the whole idea of adding these legacy virtio admin
commands to virtio spec was a design mistake?
It was nvidia guys who proposed it, so I'm surprised to hear you say this.
Michael S. Tsirkin Oct. 11, 2023, 5:03 p.m. UTC | #51
On Wed, Oct 11, 2023 at 09:18:49AM -0300, Jason Gunthorpe wrote:
> The simple way to be sure is to never touch the PCI function that has
> DMA assigned to a VM from the hypervisor, except through config space.

What makes config space different such that it's safe though?
Isn't this more of a "we can't avoid touching config space" than
that it's safe? The line doesn't look that bright to me -
if there's e.g. a memory area designed explicitly for
the hypervisor to poke at, that seems fine.
Michael S. Tsirkin Oct. 11, 2023, 5:05 p.m. UTC | #52
On Wed, Oct 11, 2023 at 09:18:49AM -0300, Jason Gunthorpe wrote:
> With VDPA doing the same stuff as vfio I'm not sure who is auditing it
> for security.

Check the signed off tags and who sends the pull requests if you want to
know.
Jason Gunthorpe Oct. 11, 2023, 5:19 p.m. UTC | #53
On Wed, Oct 11, 2023 at 12:59:30PM -0400, Michael S. Tsirkin wrote:
> On Wed, Oct 11, 2023 at 11:58:10AM -0300, Jason Gunthorpe wrote:
> > Trying to put VFIO-only code in virtio is what causes all the
> > issues. If you mis-design the API boundary everything will be painful,
> > no matter where you put the code.
> 
> Are you implying the whole idea of adding these legacy virtio admin
> commands to virtio spec was a design mistake?

No, I'm saying again that trying to relocate all the vfio code into
drivers/virtio is a mistake

Jason
Jason Gunthorpe Oct. 11, 2023, 5:20 p.m. UTC | #54
On Wed, Oct 11, 2023 at 01:03:09PM -0400, Michael S. Tsirkin wrote:
> On Wed, Oct 11, 2023 at 09:18:49AM -0300, Jason Gunthorpe wrote:
> > The simple way to be sure is to never touch the PCI function that has
> > DMA assigned to a VM from the hypervisor, except through config space.
> 
> What makes config space different that it's safe though?

The hypervisor fully mediates it and it is not accessible to P2P attacks.

> Isn't this more of a "we can't avoid touching config space" than
> that it's safe? The line doesn't look that bright to me -
> if there's e.g. a memory area designed explicitly for
> hypervisor to poke at, that seems fine.

It is not.

Jason
Michael S. Tsirkin Oct. 11, 2023, 8:20 p.m. UTC | #55
On Wed, Oct 11, 2023 at 02:19:44PM -0300, Jason Gunthorpe wrote:
> On Wed, Oct 11, 2023 at 12:59:30PM -0400, Michael S. Tsirkin wrote:
> > On Wed, Oct 11, 2023 at 11:58:10AM -0300, Jason Gunthorpe wrote:
> > > Trying to put VFIO-only code in virtio is what causes all the
> > > issues. If you mis-design the API boundary everything will be painful,
> > > no matter where you put the code.
> > 
> > Are you implying the whole idea of adding these legacy virtio admin
> > commands to virtio spec was a design mistake?
> 
> No, I'm saying again that trying to relocate all the vfio code into
> drivers/virtio is a mistake
> 
> Jason

Yea please don't. And by the same token, please do not put
implementations of the virtio spec under drivers/vfio.
Zhu, Lingshan Oct. 12, 2023, 10:29 a.m. UTC | #56
On 10/11/2023 4:00 PM, Parav Pandit via Virtualization wrote:
> Hi Christoph,
>
>> From: Christoph Hellwig <hch@infradead.org>
>> Sent: Wednesday, October 11, 2023 12:29 PM
>>
>> On Wed, Oct 11, 2023 at 02:43:37AM -0400, Michael S. Tsirkin wrote:
>>>> Btw, what is that intel thing everyone is talking about?  And why
>>>> would the virtio core support vendor specific behavior like that?
>>> It's not a thing it's Zhu Lingshan :) intel is just one of the vendors
>>> that implemented vdpa support and so Zhu Lingshan from intel is
>>> working on vdpa and has also proposed virtio spec extensions for migration.
>>> intel's driver is called ifcvf.  vdpa composes all this stuff that is
>>> added to vfio in userspace, so it's a different approach.
>> Well, so let's call it virtio live migration instead of intel.
>>
>> And please work all together in the virtio committee that you have one way of
>> communication between controlling and controlled functions.
>> If one extension does it one way and the other a different way that's just
>> creating a giant mess.
> We in virtio committee are working on VF device migration where:
> VF = controlled function
> PF = controlling function
>
> The second proposal is what Michael mentioned from Intel that somehow combine controlled and controlling function as single entity on VF.
>
> The main reasons I find it weird are:
> 1. it must always need to do mediation to do fake the device reset, and flr flows
> 2. dma cannot work as you explained for complex device state
> 3. it needs constant knowledge of each tiny things for each virtio device type
>
> Such single entity appears a bit very weird to me but maybe it is just me.
Sorry for the late reply, we have discussed this for weeks on the
virtio mailing list.
I have proposed a live migration solution which is a config space
solution.

We (me, Jason and Eugenio) have been working on this solution for more
than two years and we are implementing the basic virtio live migration
facilities.

The implementation is transport specific, e.g., for PCI we implement
new registers or extend existing ones, which work as other config
space registers do.

The reason we are arguing is:
I am not sure the admin vq based live migration solution is a good
choice, because:
1) it does not work for nesting
2) it does not work for bare metal
3) QoS problems
4) security leaks.

Sorry to spread the discussion here.

Thanks,
Zhu Lingshan
Zhu, Lingshan Oct. 12, 2023, 10:30 a.m. UTC | #57
On 10/11/2023 2:59 PM, Christoph Hellwig wrote:
> On Wed, Oct 11, 2023 at 02:43:37AM -0400, Michael S. Tsirkin wrote:
>>> Btw, what is that intel thing everyone is talking about?  And why
>>> would the virtio core support vendor specific behavior like that?
>> It's not a thing it's Zhu Lingshan :) intel is just one of the vendors
>> that implemented vdpa support and so Zhu Lingshan from intel is working
>> on vdpa and has also proposed virtio spec extensions for migration.
>> intel's driver is called ifcvf.  vdpa composes all this stuff that is
>> added to vfio in userspace, so it's a different approach.
> Well, so let's call it virtio live migration instead of intel.
>
> And please work all together in the virtio committee that you have
> one way of communication between controlling and controlled functions.
> If one extension does it one way and the other a different way that's
> just creating a giant mess.
I hope so. Jason Wang has proposed a solution to cooperate, but it was
sadly rejected...
Jason Gunthorpe Oct. 12, 2023, 1:27 p.m. UTC | #58
On Thu, Oct 12, 2023 at 06:29:47PM +0800, Zhu, Lingshan wrote:

> sorry for the late reply, we have discussed this for weeks in virtio mailing
> list. I have proposed a live migration solution which is a config space solution.

I'm sorry that can't be a serious proposal - config space can't do
DMA, it is not suitable.

Jason
Michael S. Tsirkin Oct. 13, 2023, 1:50 p.m. UTC | #59
On Fri, Oct 13, 2023 at 06:28:34PM +0800, Zhu, Lingshan wrote:
> 
> 
> On 10/12/2023 9:27 PM, Jason Gunthorpe wrote:
> 
>     On Thu, Oct 12, 2023 at 06:29:47PM +0800, Zhu, Lingshan wrote:
> 
> 
>         sorry for the late reply, we have discussed this for weeks in virtio mailing
>         list. I have proposed a live migration solution which is a config space solution.
> 
>     I'm sorry that can't be a serious proposal - config space can't do
>     DMA, it is not suitable.
> 
> config space only controls the live migration process and config the related
> facilities.
> We don't use config space to transfer data.
> 
> The new added registers work like queue_enable or features.
> 
> For example, we use DMA to report dirty pages and MMIO to fetch the dirty data.
> 
> I remember in another thread you said:"you can't use DMA for any migration
> flows"
> 
> And I agree to that statement, so we use config space registers to control the
> flow.
> 
> Thanks,
> Zhu Lingshan
> 
> 
>     Jason
> 

If you are using dma then I don't see what's wrong with admin vq.
dma is all it does.
Zhu, Lingshan Oct. 16, 2023, 8:33 a.m. UTC | #60
On 10/13/2023 9:50 PM, Michael S. Tsirkin wrote:
> On Fri, Oct 13, 2023 at 06:28:34PM +0800, Zhu, Lingshan wrote:
>>
>> On 10/12/2023 9:27 PM, Jason Gunthorpe wrote:
>>
>>      On Thu, Oct 12, 2023 at 06:29:47PM +0800, Zhu, Lingshan wrote:
>>
>>
>>          sorry for the late reply, we have discussed this for weeks in virtio mailing
>>          list. I have proposed a live migration solution which is a config space solution.
>>
>>      I'm sorry that can't be a serious proposal - config space can't do
>>      DMA, it is not suitable.
>>
>> config space only controls the live migration process and config the related
>> facilities.
>> We don't use config space to transfer data.
>>
>> The new added registers work like queue_enable or features.
>>
>> For example, we use DMA to report dirty pages and MMIO to fetch the dirty data.
>>
>> I remember in another thread you said:"you can't use DMA for any migration
>> flows"
>>
>> And I agree to that statement, so we use config space registers to control the
>> flow.
>>
>> Thanks,
>> Zhu Lingshan
>>
>>
>>      Jason
>>
> If you are using dma then I don't see what's wrong with admin vq.
> dma is all it does.
dma != admin vq,

and I think we have discussed many details of the pros and cons
of the admin vq live migration proposal in virtio-comment.
I am not sure we should spread the discussion here and repeat it all
over again.

Thanks
>
Michael S. Tsirkin Oct. 16, 2023, 8:52 a.m. UTC | #61
On Mon, Oct 16, 2023 at 04:33:10PM +0800, Zhu, Lingshan wrote:
> 
> 
> On 10/13/2023 9:50 PM, Michael S. Tsirkin wrote:
> > On Fri, Oct 13, 2023 at 06:28:34PM +0800, Zhu, Lingshan wrote:
> > > 
> > > On 10/12/2023 9:27 PM, Jason Gunthorpe wrote:
> > > 
> > >      On Thu, Oct 12, 2023 at 06:29:47PM +0800, Zhu, Lingshan wrote:
> > > 
> > > 
> > >          sorry for the late reply, we have discussed this for weeks in virtio mailing
> > >          list. I have proposed a live migration solution which is a config space solution.
> > > 
> > >      I'm sorry that can't be a serious proposal - config space can't do
> > >      DMA, it is not suitable.
> > > 
> > > config space only controls the live migration process and config the related
> > > facilities.
> > > We don't use config space to transfer data.
> > > 
> > > The new added registers work like queue_enable or features.
> > > 
> > > For example, we use DMA to report dirty pages and MMIO to fetch the dirty data.
> > > 
> > > I remember in another thread you said:"you can't use DMA for any migration
> > > flows"
> > > 
> > > And I agree to that statement, so we use config space registers to control the
> > > flow.
> > > 
> > > Thanks,
> > > Zhu Lingshan
> > > 
> > > 
> > >      Jason
> > > 
> > If you are using dma then I don't see what's wrong with admin vq.
> > dma is all it does.
> dma != admin vq,

Well they share the same issue that they don't work for nesting
because DMA cannot be intercepted.

> and I think we have discussed many details in pros and cons
> in admin vq live migration proposal in virtio-comment.
> I am not sure we should span the discussions here, repeat them over again.
> 
> Thanks
> > 

Yea let's not.
Zhu, Lingshan Oct. 16, 2023, 9:53 a.m. UTC | #62
On 10/16/2023 4:52 PM, Michael S. Tsirkin wrote:
> On Mon, Oct 16, 2023 at 04:33:10PM +0800, Zhu, Lingshan wrote:
>>
>> On 10/13/2023 9:50 PM, Michael S. Tsirkin wrote:
>>> On Fri, Oct 13, 2023 at 06:28:34PM +0800, Zhu, Lingshan wrote:
>>>> On 10/12/2023 9:27 PM, Jason Gunthorpe wrote:
>>>>
>>>>       On Thu, Oct 12, 2023 at 06:29:47PM +0800, Zhu, Lingshan wrote:
>>>>
>>>>
>>>>           sorry for the late reply, we have discussed this for weeks in virtio mailing
>>>>           list. I have proposed a live migration solution which is a config space solution.
>>>>
>>>>       I'm sorry that can't be a serious proposal - config space can't do
>>>>       DMA, it is not suitable.
>>>>
>>>> config space only controls the live migration process and config the related
>>>> facilities.
>>>> We don't use config space to transfer data.
>>>>
>>>> The new added registers work like queue_enable or features.
>>>>
>>>> For example, we use DMA to report dirty pages and MMIO to fetch the dirty data.
>>>>
>>>> I remember in another thread you said:"you can't use DMA for any migration
>>>> flows"
>>>>
>>>> And I agree to that statement, so we use config space registers to control the
>>>> flow.
>>>>
>>>> Thanks,
>>>> Zhu Lingshan
>>>>
>>>>
>>>>       Jason
>>>>
>>> If you are using dma then I don't see what's wrong with admin vq.
>>> dma is all it does.
>> dma != admin vq,
> Well they share the same issue that they don't work for nesting
> because DMA can not be intercepted.
(I hope this is not spam to the virtualization list, and I will try to
keep this short)
We only use dma for host memory access, e.g., the dirty page bitmap,
so there is no need to intercept it.
>
>> and I think we have discussed many details in pros and cons
>> in admin vq live migration proposal in virtio-comment.
>> I am not sure we should span the discussions here, repeat them over again.
>>
>> Thanks
> Yea let's not.
>

Patch

diff --git a/drivers/vfio/pci/virtio/cmd.c b/drivers/vfio/pci/virtio/cmd.c
new file mode 100644
index 000000000000..f068239cdbb0
--- /dev/null
+++ b/drivers/vfio/pci/virtio/cmd.c
@@ -0,0 +1,146 @@ 
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+
+#include "cmd.h"
+
+int virtiovf_cmd_list_query(struct pci_dev *pdev, u8 *buf, int buf_size)
+{
+	struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev);
+	struct scatterlist out_sg;
+	struct virtio_admin_cmd cmd = {};
+
+	if (!virtio_dev)
+		return -ENOTCONN;
+
+	sg_init_one(&out_sg, buf, buf_size);
+	cmd.opcode = VIRTIO_ADMIN_CMD_LIST_QUERY;
+	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
+	cmd.result_sg = &out_sg;
+
+	return virtio_admin_cmd_exec(virtio_dev, &cmd);
+}
+
+int virtiovf_cmd_list_use(struct pci_dev *pdev, u8 *buf, int buf_size)
+{
+	struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev);
+	struct scatterlist in_sg;
+	struct virtio_admin_cmd cmd = {};
+
+	if (!virtio_dev)
+		return -ENOTCONN;
+
+	sg_init_one(&in_sg, buf, buf_size);
+	cmd.opcode = VIRTIO_ADMIN_CMD_LIST_USE;
+	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
+	cmd.data_sg = &in_sg;
+
+	return virtio_admin_cmd_exec(virtio_dev, &cmd);
+}
+
+int virtiovf_cmd_lr_write(struct virtiovf_pci_core_device *virtvdev, u16 opcode,
+			  u8 offset, u8 size, u8 *buf)
+{
+	struct virtio_device *virtio_dev =
+		virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev);
+	struct virtio_admin_cmd_data_lr_write *in;
+	struct scatterlist in_sg;
+	struct virtio_admin_cmd cmd = {};
+	int ret;
+
+	if (!virtio_dev)
+		return -ENOTCONN;
+
+	in = kzalloc(sizeof(*in) + size, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	in->offset = offset;
+	memcpy(in->registers, buf, size);
+	sg_init_one(&in_sg, in, sizeof(*in) + size);
+	cmd.opcode = opcode;
+	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
+	cmd.group_member_id = virtvdev->vf_id + 1;
+	cmd.data_sg = &in_sg;
+	ret = virtio_admin_cmd_exec(virtio_dev, &cmd);
+
+	kfree(in);
+	return ret;
+}
+
+int virtiovf_cmd_lr_read(struct virtiovf_pci_core_device *virtvdev, u16 opcode,
+			 u8 offset, u8 size, u8 *buf)
+{
+	struct virtio_device *virtio_dev =
+		virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev);
+	struct virtio_admin_cmd_data_lr_read *in;
+	struct scatterlist in_sg, out_sg;
+	struct virtio_admin_cmd cmd = {};
+	int ret;
+
+	if (!virtio_dev)
+		return -ENOTCONN;
+
+	in = kzalloc(sizeof(*in), GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	in->offset = offset;
+	sg_init_one(&in_sg, in, sizeof(*in));
+	sg_init_one(&out_sg, buf, size);
+	cmd.opcode = opcode;
+	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
+	cmd.data_sg = &in_sg;
+	cmd.result_sg = &out_sg;
+	cmd.group_member_id = virtvdev->vf_id + 1;
+	ret = virtio_admin_cmd_exec(virtio_dev, &cmd);
+
+	kfree(in);
+	return ret;
+}
+
+int virtiovf_cmd_lq_read_notify(struct virtiovf_pci_core_device *virtvdev,
+				u8 req_bar_flags, u8 *bar, u64 *bar_offset)
+{
+	struct virtio_device *virtio_dev =
+		virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev);
+	struct virtio_admin_cmd_notify_info_result *out;
+	struct scatterlist out_sg;
+	struct virtio_admin_cmd cmd = {};
+	int ret;
+
+	if (!virtio_dev)
+		return -ENOTCONN;
+
+	out = kzalloc(sizeof(*out), GFP_KERNEL);
+	if (!out)
+		return -ENOMEM;
+
+	sg_init_one(&out_sg, out, sizeof(*out));
+	cmd.opcode = VIRTIO_ADMIN_CMD_LEGACY_NOTIFY_INFO;
+	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
+	cmd.result_sg = &out_sg;
+	cmd.group_member_id = virtvdev->vf_id + 1;
+	ret = virtio_admin_cmd_exec(virtio_dev, &cmd);
+	if (!ret) {
+		struct virtio_admin_cmd_notify_info_data *entry;
+		int i;
+
+		ret = -ENOENT;
+		for (i = 0; i < VIRTIO_ADMIN_CMD_MAX_NOTIFY_INFO; i++) {
+			entry = &out->entries[i];
+			if (entry->flags == VIRTIO_ADMIN_CMD_NOTIFY_INFO_FLAGS_END)
+				break;
+			if (entry->flags != req_bar_flags)
+				continue;
+			*bar = entry->bar;
+			*bar_offset = le64_to_cpu(entry->offset);
+			ret = 0;
+			break;
+		}
+	}
+
+	kfree(out);
+	return ret;
+}
diff --git a/drivers/vfio/pci/virtio/cmd.h b/drivers/vfio/pci/virtio/cmd.h
new file mode 100644
index 000000000000..c2a3645f4b90
--- /dev/null
+++ b/drivers/vfio/pci/virtio/cmd.h
@@ -0,0 +1,27 @@ 
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ */
+
+#ifndef VIRTIO_VFIO_CMD_H
+#define VIRTIO_VFIO_CMD_H
+
+#include <linux/kernel.h>
+#include <linux/virtio.h>
+#include <linux/vfio_pci_core.h>
+#include <linux/virtio_pci.h>
+
+struct virtiovf_pci_core_device {
+	struct vfio_pci_core_device core_device;
+	int vf_id;
+};
+
+int virtiovf_cmd_list_query(struct pci_dev *pdev, u8 *buf, int buf_size);
+int virtiovf_cmd_list_use(struct pci_dev *pdev, u8 *buf, int buf_size);
+int virtiovf_cmd_lr_write(struct virtiovf_pci_core_device *virtvdev, u16 opcode,
+			  u8 offset, u8 size, u8 *buf);
+int virtiovf_cmd_lr_read(struct virtiovf_pci_core_device *virtvdev, u16 opcode,
+			 u8 offset, u8 size, u8 *buf);
+int virtiovf_cmd_lq_read_notify(struct virtiovf_pci_core_device *virtvdev,
+				u8 req_bar_flags, u8 *bar, u64 *bar_offset);
+#endif /* VIRTIO_VFIO_CMD_H */
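As a closing illustration (not part of the posted patch), this is a rough sketch of how a caller such as the vfio-virtio driver added later in this series might drive these helpers. The function name, the command-list buffer size, the error handling, and the VIRTIO_ADMIN_CMD_NOTIFY_INFO_FLAGS_OWNER_MEM flag name are assumptions made for illustration; only the virtiovf_cmd_* helpers themselves come from this patch.

#include <linux/slab.h>
#include "cmd.h"

/*
 * Hypothetical caller: query the PF's admin command list, commit to it,
 * then look up where legacy queue notifications for this VF should go.
 */
static int virtiovf_sketch_init(struct virtiovf_pci_core_device *virtvdev)
{
	struct pci_dev *pdev = virtvdev->core_device.pdev;
	int buf_size = 8;	/* assumed size of the command-list bitmap */
	u64 notify_offset;
	u8 notify_bar;
	u8 *buf;
	int ret;

	buf = kzalloc(buf_size, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	/* Discover which admin commands the PF device supports ... */
	ret = virtiovf_cmd_list_query(pdev, buf, buf_size);
	if (ret)
		goto out;

	/* ... and tell the device which of them this driver will use. */
	ret = virtiovf_cmd_list_use(pdev, buf, buf_size);
	if (ret)
		goto out;

	/* Find the BAR/offset the VF's legacy notifications should target. */
	ret = virtiovf_cmd_lq_read_notify(virtvdev,
			VIRTIO_ADMIN_CMD_NOTIFY_INFO_FLAGS_OWNER_MEM, /* assumed name */
			&notify_bar, &notify_offset);
out:
	kfree(buf);
	return ret;
}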