Message ID | 20230921124040.145386-11-yishaih@nvidia.com (mailing list archive) |
---|---|
State | New, archived |
Series | Introduce a vfio driver over virtio devices |
On Thu, Sep 21, 2023 at 03:40:39PM +0300, Yishai Hadas wrote: > Expose admin commands over the virtio device, to be used by the > vfio-virtio driver in the next patches. > > It includes: list query/use, legacy write/read, read notify_info. > > Signed-off-by: Yishai Hadas <yishaih@nvidia.com> I don't get the motivation for this and the next patch. We already have vdpa that seems to do exactly this: drive virtio from userspace. Why do we need these extra 1000 lines of code in vfio - just because we can? Not to talk about user confusion all this will cause. > --- > drivers/vfio/pci/virtio/cmd.c | 146 ++++++++++++++++++++++++++++++++++ > drivers/vfio/pci/virtio/cmd.h | 27 +++++++ > 2 files changed, 173 insertions(+) > create mode 100644 drivers/vfio/pci/virtio/cmd.c > create mode 100644 drivers/vfio/pci/virtio/cmd.h > > diff --git a/drivers/vfio/pci/virtio/cmd.c b/drivers/vfio/pci/virtio/cmd.c > new file mode 100644 > index 000000000000..f068239cdbb0 > --- /dev/null > +++ b/drivers/vfio/pci/virtio/cmd.c > @@ -0,0 +1,146 @@ > +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB > +/* > + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved > + */ > + > +#include "cmd.h" > + > +int virtiovf_cmd_list_query(struct pci_dev *pdev, u8 *buf, int buf_size) > +{ > + struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev); > + struct scatterlist out_sg; > + struct virtio_admin_cmd cmd = {}; > + > + if (!virtio_dev) > + return -ENOTCONN; > + > + sg_init_one(&out_sg, buf, buf_size); > + cmd.opcode = VIRTIO_ADMIN_CMD_LIST_QUERY; > + cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV; > + cmd.result_sg = &out_sg; > + > + return virtio_admin_cmd_exec(virtio_dev, &cmd); > +} > + > +int virtiovf_cmd_list_use(struct pci_dev *pdev, u8 *buf, int buf_size) > +{ > + struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev); > + struct scatterlist in_sg; > + struct virtio_admin_cmd cmd = {}; > + > + if (!virtio_dev) > + return -ENOTCONN; > + > + sg_init_one(&in_sg, buf, buf_size); > + cmd.opcode = VIRTIO_ADMIN_CMD_LIST_USE; > + cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV; > + cmd.data_sg = &in_sg; > + > + return virtio_admin_cmd_exec(virtio_dev, &cmd); > +} > + > +int virtiovf_cmd_lr_write(struct virtiovf_pci_core_device *virtvdev, u16 opcode, > + u8 offset, u8 size, u8 *buf) > +{ > + struct virtio_device *virtio_dev = > + virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev); > + struct virtio_admin_cmd_data_lr_write *in; > + struct scatterlist in_sg; > + struct virtio_admin_cmd cmd = {}; > + int ret; > + > + if (!virtio_dev) > + return -ENOTCONN; > + > + in = kzalloc(sizeof(*in) + size, GFP_KERNEL); > + if (!in) > + return -ENOMEM; > + > + in->offset = offset; > + memcpy(in->registers, buf, size); > + sg_init_one(&in_sg, in, sizeof(*in) + size); > + cmd.opcode = opcode; > + cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV; > + cmd.group_member_id = virtvdev->vf_id + 1; > + cmd.data_sg = &in_sg; > + ret = virtio_admin_cmd_exec(virtio_dev, &cmd); > + > + kfree(in); > + return ret; > +} > + > +int virtiovf_cmd_lr_read(struct virtiovf_pci_core_device *virtvdev, u16 opcode, > + u8 offset, u8 size, u8 *buf) > +{ > + struct virtio_device *virtio_dev = > + virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev); > + struct virtio_admin_cmd_data_lr_read *in; > + struct scatterlist in_sg, out_sg; > + struct virtio_admin_cmd cmd = {}; > + int ret; > + > + if (!virtio_dev) > + return -ENOTCONN; > + > + in = kzalloc(sizeof(*in), GFP_KERNEL); > + if (!in) > + return -ENOMEM; > + > + in->offset = 
offset; > + sg_init_one(&in_sg, in, sizeof(*in)); > + sg_init_one(&out_sg, buf, size); > + cmd.opcode = opcode; > + cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV; > + cmd.data_sg = &in_sg; > + cmd.result_sg = &out_sg; > + cmd.group_member_id = virtvdev->vf_id + 1; > + ret = virtio_admin_cmd_exec(virtio_dev, &cmd); > + > + kfree(in); > + return ret; > +} > + > +int virtiovf_cmd_lq_read_notify(struct virtiovf_pci_core_device *virtvdev, > + u8 req_bar_flags, u8 *bar, u64 *bar_offset) > +{ > + struct virtio_device *virtio_dev = > + virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev); > + struct virtio_admin_cmd_notify_info_result *out; > + struct scatterlist out_sg; > + struct virtio_admin_cmd cmd = {}; > + int ret; > + > + if (!virtio_dev) > + return -ENOTCONN; > + > + out = kzalloc(sizeof(*out), GFP_KERNEL); > + if (!out) > + return -ENOMEM; > + > + sg_init_one(&out_sg, out, sizeof(*out)); > + cmd.opcode = VIRTIO_ADMIN_CMD_LEGACY_NOTIFY_INFO; > + cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV; > + cmd.result_sg = &out_sg; > + cmd.group_member_id = virtvdev->vf_id + 1; > + ret = virtio_admin_cmd_exec(virtio_dev, &cmd); > + if (!ret) { > + struct virtio_admin_cmd_notify_info_data *entry; > + int i; > + > + ret = -ENOENT; > + for (i = 0; i < VIRTIO_ADMIN_CMD_MAX_NOTIFY_INFO; i++) { > + entry = &out->entries[i]; > + if (entry->flags == VIRTIO_ADMIN_CMD_NOTIFY_INFO_FLAGS_END) > + break; > + if (entry->flags != req_bar_flags) > + continue; > + *bar = entry->bar; > + *bar_offset = le64_to_cpu(entry->offset); > + ret = 0; > + break; > + } > + } > + > + kfree(out); > + return ret; > +} > diff --git a/drivers/vfio/pci/virtio/cmd.h b/drivers/vfio/pci/virtio/cmd.h > new file mode 100644 > index 000000000000..c2a3645f4b90 > --- /dev/null > +++ b/drivers/vfio/pci/virtio/cmd.h > @@ -0,0 +1,27 @@ > +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB > +/* > + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. > + */ > + > +#ifndef VIRTIO_VFIO_CMD_H > +#define VIRTIO_VFIO_CMD_H > + > +#include <linux/kernel.h> > +#include <linux/virtio.h> > +#include <linux/vfio_pci_core.h> > +#include <linux/virtio_pci.h> > + > +struct virtiovf_pci_core_device { > + struct vfio_pci_core_device core_device; > + int vf_id; > +}; > + > +int virtiovf_cmd_list_query(struct pci_dev *pdev, u8 *buf, int buf_size); > +int virtiovf_cmd_list_use(struct pci_dev *pdev, u8 *buf, int buf_size); > +int virtiovf_cmd_lr_write(struct virtiovf_pci_core_device *virtvdev, u16 opcode, > + u8 offset, u8 size, u8 *buf); > +int virtiovf_cmd_lr_read(struct virtiovf_pci_core_device *virtvdev, u16 opcode, > + u8 offset, u8 size, u8 *buf); > +int virtiovf_cmd_lq_read_notify(struct virtiovf_pci_core_device *virtvdev, > + u8 req_bar_flags, u8 *bar, u64 *bar_offset); > +#endif /* VIRTIO_VFIO_CMD_H */ > -- > 2.27.0
On Thu, Sep 21, 2023 at 03:40:39PM +0300, Yishai Hadas wrote: > Expose admin commands over the virtio device, to be used by the > vfio-virtio driver in the next patches. > > It includes: list query/use, legacy write/read, read notify_info. > > Signed-off-by: Yishai Hadas <yishaih@nvidia.com> > --- > drivers/vfio/pci/virtio/cmd.c | 146 ++++++++++++++++++++++++++++++++++ > drivers/vfio/pci/virtio/cmd.h | 27 +++++++ > 2 files changed, 173 insertions(+) > create mode 100644 drivers/vfio/pci/virtio/cmd.c > create mode 100644 drivers/vfio/pci/virtio/cmd.h > > diff --git a/drivers/vfio/pci/virtio/cmd.c b/drivers/vfio/pci/virtio/cmd.c > new file mode 100644 > index 000000000000..f068239cdbb0 > --- /dev/null > +++ b/drivers/vfio/pci/virtio/cmd.c > @@ -0,0 +1,146 @@ > +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB > +/* > + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved > + */ > + > +#include "cmd.h" > + > +int virtiovf_cmd_list_query(struct pci_dev *pdev, u8 *buf, int buf_size) > +{ > + struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev); > + struct scatterlist out_sg; > + struct virtio_admin_cmd cmd = {}; > + > + if (!virtio_dev) > + return -ENOTCONN; > + > + sg_init_one(&out_sg, buf, buf_size); > + cmd.opcode = VIRTIO_ADMIN_CMD_LIST_QUERY; > + cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV; > + cmd.result_sg = &out_sg; > + > + return virtio_admin_cmd_exec(virtio_dev, &cmd); > +} > + in/out seem all wrong here. In virtio terminology, in means from device to driver, out means from driver to device. > +int virtiovf_cmd_list_use(struct pci_dev *pdev, u8 *buf, int buf_size) > +{ > + struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev); > + struct scatterlist in_sg; > + struct virtio_admin_cmd cmd = {}; > + > + if (!virtio_dev) > + return -ENOTCONN; > + > + sg_init_one(&in_sg, buf, buf_size); > + cmd.opcode = VIRTIO_ADMIN_CMD_LIST_USE; > + cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV; > + cmd.data_sg = &in_sg; > + > + return virtio_admin_cmd_exec(virtio_dev, &cmd); > +} > + > +int virtiovf_cmd_lr_write(struct virtiovf_pci_core_device *virtvdev, u16 opcode, what is _lr short for? > + u8 offset, u8 size, u8 *buf) > +{ > + struct virtio_device *virtio_dev = > + virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev); > + struct virtio_admin_cmd_data_lr_write *in; > + struct scatterlist in_sg; > + struct virtio_admin_cmd cmd = {}; > + int ret; > + > + if (!virtio_dev) > + return -ENOTCONN; > + > + in = kzalloc(sizeof(*in) + size, GFP_KERNEL); > + if (!in) > + return -ENOMEM; > + > + in->offset = offset; > + memcpy(in->registers, buf, size); > + sg_init_one(&in_sg, in, sizeof(*in) + size); > + cmd.opcode = opcode; > + cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV; > + cmd.group_member_id = virtvdev->vf_id + 1; weird. why + 1? > + cmd.data_sg = &in_sg; > + ret = virtio_admin_cmd_exec(virtio_dev, &cmd); > + > + kfree(in); > + return ret; > +} How do you know it's safe to send this command, in particular at this time? This seems to be doing zero checks, and zero synchronization with the PF driver. 
> + > +int virtiovf_cmd_lr_read(struct virtiovf_pci_core_device *virtvdev, u16 opcode, > + u8 offset, u8 size, u8 *buf) > +{ > + struct virtio_device *virtio_dev = > + virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev); > + struct virtio_admin_cmd_data_lr_read *in; > + struct scatterlist in_sg, out_sg; > + struct virtio_admin_cmd cmd = {}; > + int ret; > + > + if (!virtio_dev) > + return -ENOTCONN; > + > + in = kzalloc(sizeof(*in), GFP_KERNEL); > + if (!in) > + return -ENOMEM; > + > + in->offset = offset; > + sg_init_one(&in_sg, in, sizeof(*in)); > + sg_init_one(&out_sg, buf, size); > + cmd.opcode = opcode; > + cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV; > + cmd.data_sg = &in_sg; > + cmd.result_sg = &out_sg; > + cmd.group_member_id = virtvdev->vf_id + 1; > + ret = virtio_admin_cmd_exec(virtio_dev, &cmd); > + > + kfree(in); > + return ret; > +} > + > +int virtiovf_cmd_lq_read_notify(struct virtiovf_pci_core_device *virtvdev, and what is lq short for? > + u8 req_bar_flags, u8 *bar, u64 *bar_offset) > +{ > + struct virtio_device *virtio_dev = > + virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev); > + struct virtio_admin_cmd_notify_info_result *out; > + struct scatterlist out_sg; > + struct virtio_admin_cmd cmd = {}; > + int ret; > + > + if (!virtio_dev) > + return -ENOTCONN; > + > + out = kzalloc(sizeof(*out), GFP_KERNEL); > + if (!out) > + return -ENOMEM; > + > + sg_init_one(&out_sg, out, sizeof(*out)); > + cmd.opcode = VIRTIO_ADMIN_CMD_LEGACY_NOTIFY_INFO; > + cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV; > + cmd.result_sg = &out_sg; > + cmd.group_member_id = virtvdev->vf_id + 1; > + ret = virtio_admin_cmd_exec(virtio_dev, &cmd); > + if (!ret) { > + struct virtio_admin_cmd_notify_info_data *entry; > + int i; > + > + ret = -ENOENT; > + for (i = 0; i < VIRTIO_ADMIN_CMD_MAX_NOTIFY_INFO; i++) { > + entry = &out->entries[i]; > + if (entry->flags == VIRTIO_ADMIN_CMD_NOTIFY_INFO_FLAGS_END) > + break; > + if (entry->flags != req_bar_flags) > + continue; > + *bar = entry->bar; > + *bar_offset = le64_to_cpu(entry->offset); > + ret = 0; > + break; > + } > + } > + > + kfree(out); > + return ret; > +} > diff --git a/drivers/vfio/pci/virtio/cmd.h b/drivers/vfio/pci/virtio/cmd.h > new file mode 100644 > index 000000000000..c2a3645f4b90 > --- /dev/null > +++ b/drivers/vfio/pci/virtio/cmd.h > @@ -0,0 +1,27 @@ > +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB > +/* > + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. > + */ > + > +#ifndef VIRTIO_VFIO_CMD_H > +#define VIRTIO_VFIO_CMD_H > + > +#include <linux/kernel.h> > +#include <linux/virtio.h> > +#include <linux/vfio_pci_core.h> > +#include <linux/virtio_pci.h> > + > +struct virtiovf_pci_core_device { > + struct vfio_pci_core_device core_device; > + int vf_id; > +}; > + > +int virtiovf_cmd_list_query(struct pci_dev *pdev, u8 *buf, int buf_size); > +int virtiovf_cmd_list_use(struct pci_dev *pdev, u8 *buf, int buf_size); > +int virtiovf_cmd_lr_write(struct virtiovf_pci_core_device *virtvdev, u16 opcode, > + u8 offset, u8 size, u8 *buf); > +int virtiovf_cmd_lr_read(struct virtiovf_pci_core_device *virtvdev, u16 opcode, > + u8 offset, u8 size, u8 *buf); > +int virtiovf_cmd_lq_read_notify(struct virtiovf_pci_core_device *virtvdev, > + u8 req_bar_flags, u8 *bar, u64 *bar_offset); > +#endif /* VIRTIO_VFIO_CMD_H */ > -- > 2.27.0
On Thu, Sep 21, 2023 at 03:40:39PM +0300, Yishai Hadas wrote: > Expose admin commands over the virtio device, to be used by the > vfio-virtio driver in the next patches. > > It includes: list query/use, legacy write/read, read notify_info. > > Signed-off-by: Yishai Hadas <yishaih@nvidia.com> This stuff is pure virtio spec. I think it should live under drivers/virtio, too. > --- > drivers/vfio/pci/virtio/cmd.c | 146 ++++++++++++++++++++++++++++++++++ > drivers/vfio/pci/virtio/cmd.h | 27 +++++++ > 2 files changed, 173 insertions(+) > create mode 100644 drivers/vfio/pci/virtio/cmd.c > create mode 100644 drivers/vfio/pci/virtio/cmd.h > > diff --git a/drivers/vfio/pci/virtio/cmd.c b/drivers/vfio/pci/virtio/cmd.c > new file mode 100644 > index 000000000000..f068239cdbb0 > --- /dev/null > +++ b/drivers/vfio/pci/virtio/cmd.c > @@ -0,0 +1,146 @@ > +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB > +/* > + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved > + */ > + > +#include "cmd.h" > + > +int virtiovf_cmd_list_query(struct pci_dev *pdev, u8 *buf, int buf_size) > +{ > + struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev); > + struct scatterlist out_sg; > + struct virtio_admin_cmd cmd = {}; > + > + if (!virtio_dev) > + return -ENOTCONN; > + > + sg_init_one(&out_sg, buf, buf_size); > + cmd.opcode = VIRTIO_ADMIN_CMD_LIST_QUERY; > + cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV; > + cmd.result_sg = &out_sg; > + > + return virtio_admin_cmd_exec(virtio_dev, &cmd); > +} > + > +int virtiovf_cmd_list_use(struct pci_dev *pdev, u8 *buf, int buf_size) > +{ > + struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev); > + struct scatterlist in_sg; > + struct virtio_admin_cmd cmd = {}; > + > + if (!virtio_dev) > + return -ENOTCONN; > + > + sg_init_one(&in_sg, buf, buf_size); > + cmd.opcode = VIRTIO_ADMIN_CMD_LIST_USE; > + cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV; > + cmd.data_sg = &in_sg; > + > + return virtio_admin_cmd_exec(virtio_dev, &cmd); > +} > + > +int virtiovf_cmd_lr_write(struct virtiovf_pci_core_device *virtvdev, u16 opcode, > + u8 offset, u8 size, u8 *buf) > +{ > + struct virtio_device *virtio_dev = > + virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev); > + struct virtio_admin_cmd_data_lr_write *in; > + struct scatterlist in_sg; > + struct virtio_admin_cmd cmd = {}; > + int ret; > + > + if (!virtio_dev) > + return -ENOTCONN; > + > + in = kzalloc(sizeof(*in) + size, GFP_KERNEL); > + if (!in) > + return -ENOMEM; > + > + in->offset = offset; > + memcpy(in->registers, buf, size); > + sg_init_one(&in_sg, in, sizeof(*in) + size); > + cmd.opcode = opcode; > + cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV; > + cmd.group_member_id = virtvdev->vf_id + 1; > + cmd.data_sg = &in_sg; > + ret = virtio_admin_cmd_exec(virtio_dev, &cmd); > + > + kfree(in); > + return ret; > +} > + > +int virtiovf_cmd_lr_read(struct virtiovf_pci_core_device *virtvdev, u16 opcode, > + u8 offset, u8 size, u8 *buf) > +{ > + struct virtio_device *virtio_dev = > + virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev); > + struct virtio_admin_cmd_data_lr_read *in; > + struct scatterlist in_sg, out_sg; > + struct virtio_admin_cmd cmd = {}; > + int ret; > + > + if (!virtio_dev) > + return -ENOTCONN; > + > + in = kzalloc(sizeof(*in), GFP_KERNEL); > + if (!in) > + return -ENOMEM; > + > + in->offset = offset; > + sg_init_one(&in_sg, in, sizeof(*in)); > + sg_init_one(&out_sg, buf, size); > + cmd.opcode = opcode; > + cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV; > + cmd.data_sg = 
&in_sg; > + cmd.result_sg = &out_sg; > + cmd.group_member_id = virtvdev->vf_id + 1; > + ret = virtio_admin_cmd_exec(virtio_dev, &cmd); > + > + kfree(in); > + return ret; > +} > + > +int virtiovf_cmd_lq_read_notify(struct virtiovf_pci_core_device *virtvdev, > + u8 req_bar_flags, u8 *bar, u64 *bar_offset) > +{ > + struct virtio_device *virtio_dev = > + virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev); > + struct virtio_admin_cmd_notify_info_result *out; > + struct scatterlist out_sg; > + struct virtio_admin_cmd cmd = {}; > + int ret; > + > + if (!virtio_dev) > + return -ENOTCONN; > + > + out = kzalloc(sizeof(*out), GFP_KERNEL); > + if (!out) > + return -ENOMEM; > + > + sg_init_one(&out_sg, out, sizeof(*out)); > + cmd.opcode = VIRTIO_ADMIN_CMD_LEGACY_NOTIFY_INFO; > + cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV; > + cmd.result_sg = &out_sg; > + cmd.group_member_id = virtvdev->vf_id + 1; > + ret = virtio_admin_cmd_exec(virtio_dev, &cmd); > + if (!ret) { > + struct virtio_admin_cmd_notify_info_data *entry; > + int i; > + > + ret = -ENOENT; > + for (i = 0; i < VIRTIO_ADMIN_CMD_MAX_NOTIFY_INFO; i++) { > + entry = &out->entries[i]; > + if (entry->flags == VIRTIO_ADMIN_CMD_NOTIFY_INFO_FLAGS_END) > + break; > + if (entry->flags != req_bar_flags) > + continue; > + *bar = entry->bar; > + *bar_offset = le64_to_cpu(entry->offset); > + ret = 0; > + break; > + } > + } > + > + kfree(out); > + return ret; > +} > diff --git a/drivers/vfio/pci/virtio/cmd.h b/drivers/vfio/pci/virtio/cmd.h > new file mode 100644 > index 000000000000..c2a3645f4b90 > --- /dev/null > +++ b/drivers/vfio/pci/virtio/cmd.h > @@ -0,0 +1,27 @@ > +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB > +/* > + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. > + */ > + > +#ifndef VIRTIO_VFIO_CMD_H > +#define VIRTIO_VFIO_CMD_H > + > +#include <linux/kernel.h> > +#include <linux/virtio.h> > +#include <linux/vfio_pci_core.h> > +#include <linux/virtio_pci.h> > + > +struct virtiovf_pci_core_device { > + struct vfio_pci_core_device core_device; > + int vf_id; > +}; > + > +int virtiovf_cmd_list_query(struct pci_dev *pdev, u8 *buf, int buf_size); > +int virtiovf_cmd_list_use(struct pci_dev *pdev, u8 *buf, int buf_size); > +int virtiovf_cmd_lr_write(struct virtiovf_pci_core_device *virtvdev, u16 opcode, > + u8 offset, u8 size, u8 *buf); > +int virtiovf_cmd_lr_read(struct virtiovf_pci_core_device *virtvdev, u16 opcode, > + u8 offset, u8 size, u8 *buf); > +int virtiovf_cmd_lq_read_notify(struct virtiovf_pci_core_device *virtvdev, > + u8 req_bar_flags, u8 *bar, u64 *bar_offset); > +#endif /* VIRTIO_VFIO_CMD_H */ > -- > 2.27.0
On 21/09/2023 23:34, Michael S. Tsirkin wrote: > On Thu, Sep 21, 2023 at 03:40:39PM +0300, Yishai Hadas wrote: >> Expose admin commands over the virtio device, to be used by the >> vfio-virtio driver in the next patches. >> >> It includes: list query/use, legacy write/read, read notify_info. >> >> Signed-off-by: Yishai Hadas <yishaih@nvidia.com> >> --- >> drivers/vfio/pci/virtio/cmd.c | 146 ++++++++++++++++++++++++++++++++++ >> drivers/vfio/pci/virtio/cmd.h | 27 +++++++ >> 2 files changed, 173 insertions(+) >> create mode 100644 drivers/vfio/pci/virtio/cmd.c >> create mode 100644 drivers/vfio/pci/virtio/cmd.h >> >> diff --git a/drivers/vfio/pci/virtio/cmd.c b/drivers/vfio/pci/virtio/cmd.c >> new file mode 100644 >> index 000000000000..f068239cdbb0 >> --- /dev/null >> +++ b/drivers/vfio/pci/virtio/cmd.c >> @@ -0,0 +1,146 @@ >> +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB >> +/* >> + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved >> + */ >> + >> +#include "cmd.h" >> + >> +int virtiovf_cmd_list_query(struct pci_dev *pdev, u8 *buf, int buf_size) >> +{ >> + struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev); >> + struct scatterlist out_sg; >> + struct virtio_admin_cmd cmd = {}; >> + >> + if (!virtio_dev) >> + return -ENOTCONN; >> + >> + sg_init_one(&out_sg, buf, buf_size); >> + cmd.opcode = VIRTIO_ADMIN_CMD_LIST_QUERY; >> + cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV; >> + cmd.result_sg = &out_sg; >> + >> + return virtio_admin_cmd_exec(virtio_dev, &cmd); >> +} >> + > in/out seem all wrong here. In virtio terminology, in means from > device to driver, out means from driver to device. I referred here to in/out from vfio POV who prepares the command. However, I can replace it to follow the virtio terminology as you suggested if this more makes sense. Please see also my coming answer on your suggestion to put all of this in the virtio layer. >> +int virtiovf_cmd_list_use(struct pci_dev *pdev, u8 *buf, int buf_size) >> +{ >> + struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev); >> + struct scatterlist in_sg; >> + struct virtio_admin_cmd cmd = {}; >> + >> + if (!virtio_dev) >> + return -ENOTCONN; >> + >> + sg_init_one(&in_sg, buf, buf_size); >> + cmd.opcode = VIRTIO_ADMIN_CMD_LIST_USE; >> + cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV; >> + cmd.data_sg = &in_sg; >> + >> + return virtio_admin_cmd_exec(virtio_dev, &cmd); >> +} >> + >> +int virtiovf_cmd_lr_write(struct virtiovf_pci_core_device *virtvdev, u16 opcode, > > what is _lr short for? This was an acronym to legacy_read. The actual command is according to the given opcode which can be one among LEGACY_COMMON_CFG_READ, LEGACY_DEV_CFG_READ. I can rename it to '_legacy_read' (i.e. virtiovf_issue_legacy_read_cmd) to be clearer. > >> + u8 offset, u8 size, u8 *buf) >> +{ >> + struct virtio_device *virtio_dev = >> + virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev); >> + struct virtio_admin_cmd_data_lr_write *in; >> + struct scatterlist in_sg; >> + struct virtio_admin_cmd cmd = {}; >> + int ret; >> + >> + if (!virtio_dev) >> + return -ENOTCONN; >> + >> + in = kzalloc(sizeof(*in) + size, GFP_KERNEL); >> + if (!in) >> + return -ENOMEM; >> + >> + in->offset = offset; >> + memcpy(in->registers, buf, size); >> + sg_init_one(&in_sg, in, sizeof(*in) + size); >> + cmd.opcode = opcode; >> + cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV; >> + cmd.group_member_id = virtvdev->vf_id + 1; > weird. why + 1? This follows the virtio spec in that area. 
"When sending commands with the SR-IOV group type, the driver specify a value for group_member_id between 1 and NumVFs inclusive." The 'virtvdev->vf_id' was set upon vfio/virtio driver initialization by pci_iov_vf_id() which its first index is 0. >> + cmd.data_sg = &in_sg; >> + ret = virtio_admin_cmd_exec(virtio_dev, &cmd); >> + >> + kfree(in); >> + return ret; >> +} > How do you know it's safe to send this command, in particular at > this time? This seems to be doing zero checks, and zero synchronization > with the PF driver. > The virtiovf_cmd_lr_read()/other gets a virtio VF and it gets its PF by calling virtio_pci_vf_get_pf_dev(). The VF can't gone by 'disable sriov' as it's owned/used by vfio. The PF can't gone by rmmod/modprobe -r of virtio, as of the 'module in use'/dependencies between VFIO to VIRTIO. The below check [1] was done only from a clean code perspective, which might theoretically fail in case the given VF doesn't use a virtio driver. [1] if (!virtio_dev) return -ENOTCONN; So, it looks safe as is. >> + >> +int virtiovf_cmd_lr_read(struct virtiovf_pci_core_device *virtvdev, u16 opcode, >> + u8 offset, u8 size, u8 *buf) >> +{ >> + struct virtio_device *virtio_dev = >> + virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev); >> + struct virtio_admin_cmd_data_lr_read *in; >> + struct scatterlist in_sg, out_sg; >> + struct virtio_admin_cmd cmd = {}; >> + int ret; >> + >> + if (!virtio_dev) >> + return -ENOTCONN; >> + >> + in = kzalloc(sizeof(*in), GFP_KERNEL); >> + if (!in) >> + return -ENOMEM; >> + >> + in->offset = offset; >> + sg_init_one(&in_sg, in, sizeof(*in)); >> + sg_init_one(&out_sg, buf, size); >> + cmd.opcode = opcode; >> + cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV; >> + cmd.data_sg = &in_sg; >> + cmd.result_sg = &out_sg; >> + cmd.group_member_id = virtvdev->vf_id + 1; >> + ret = virtio_admin_cmd_exec(virtio_dev, &cmd); >> + >> + kfree(in); >> + return ret; >> +} >> + >> +int virtiovf_cmd_lq_read_notify(struct virtiovf_pci_core_device *virtvdev, > and what is lq short for? To be more explicit, I may replace to virtiovf_cmd_legacy_notify_info() to follow the spec opcode. 
Yishai > >> + u8 req_bar_flags, u8 *bar, u64 *bar_offset) >> +{ >> + struct virtio_device *virtio_dev = >> + virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev); >> + struct virtio_admin_cmd_notify_info_result *out; >> + struct scatterlist out_sg; >> + struct virtio_admin_cmd cmd = {}; >> + int ret; >> + >> + if (!virtio_dev) >> + return -ENOTCONN; >> + >> + out = kzalloc(sizeof(*out), GFP_KERNEL); >> + if (!out) >> + return -ENOMEM; >> + >> + sg_init_one(&out_sg, out, sizeof(*out)); >> + cmd.opcode = VIRTIO_ADMIN_CMD_LEGACY_NOTIFY_INFO; >> + cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV; >> + cmd.result_sg = &out_sg; >> + cmd.group_member_id = virtvdev->vf_id + 1; >> + ret = virtio_admin_cmd_exec(virtio_dev, &cmd); >> + if (!ret) { >> + struct virtio_admin_cmd_notify_info_data *entry; >> + int i; >> + >> + ret = -ENOENT; >> + for (i = 0; i < VIRTIO_ADMIN_CMD_MAX_NOTIFY_INFO; i++) { >> + entry = &out->entries[i]; >> + if (entry->flags == VIRTIO_ADMIN_CMD_NOTIFY_INFO_FLAGS_END) >> + break; >> + if (entry->flags != req_bar_flags) >> + continue; >> + *bar = entry->bar; >> + *bar_offset = le64_to_cpu(entry->offset); >> + ret = 0; >> + break; >> + } >> + } >> + >> + kfree(out); >> + return ret; >> +} >> diff --git a/drivers/vfio/pci/virtio/cmd.h b/drivers/vfio/pci/virtio/cmd.h >> new file mode 100644 >> index 000000000000..c2a3645f4b90 >> --- /dev/null >> +++ b/drivers/vfio/pci/virtio/cmd.h >> @@ -0,0 +1,27 @@ >> +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB >> +/* >> + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. >> + */ >> + >> +#ifndef VIRTIO_VFIO_CMD_H >> +#define VIRTIO_VFIO_CMD_H >> + >> +#include <linux/kernel.h> >> +#include <linux/virtio.h> >> +#include <linux/vfio_pci_core.h> >> +#include <linux/virtio_pci.h> >> + >> +struct virtiovf_pci_core_device { >> + struct vfio_pci_core_device core_device; >> + int vf_id; >> +}; >> + >> +int virtiovf_cmd_list_query(struct pci_dev *pdev, u8 *buf, int buf_size); >> +int virtiovf_cmd_list_use(struct pci_dev *pdev, u8 *buf, int buf_size); >> +int virtiovf_cmd_lr_write(struct virtiovf_pci_core_device *virtvdev, u16 opcode, >> + u8 offset, u8 size, u8 *buf); >> +int virtiovf_cmd_lr_read(struct virtiovf_pci_core_device *virtvdev, u16 opcode, >> + u8 offset, u8 size, u8 *buf); >> +int virtiovf_cmd_lq_read_notify(struct virtiovf_pci_core_device *virtvdev, >> + u8 req_bar_flags, u8 *bar, u64 *bar_offset); >> +#endif /* VIRTIO_VFIO_CMD_H */ >> -- >> 2.27.0
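For illustration only (not part of the series): a minimal sketch of the 0-based vf_id to 1-based group_member_id mapping described above, with the spec requirement captured in a comment as requested later in the thread. The helper name is hypothetical and assumes the cmd.h from this patch.

/* Sketch only, assuming cmd.h from this patch; not an existing helper. */
#include "cmd.h"

static u64 virtiovf_group_member_id(struct virtiovf_pci_core_device *virtvdev)
{
	/*
	 * The virtio spec numbers SR-IOV group members 1..NumVFs, while
	 * pci_iov_vf_id() returns a 0-based index, hence the + 1.
	 */
	return virtvdev->vf_id + 1;
}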
On 22/09/2023 12:54, Michael S. Tsirkin wrote:
> On Thu, Sep 21, 2023 at 03:40:39PM +0300, Yishai Hadas wrote:
>> Expose admin commands over the virtio device, to be used by the
>> vfio-virtio driver in the next patches.
>>
>> It includes: list query/use, legacy write/read, read notify_info.
>>
>> Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
>
> This stuff is pure virtio spec. I think it should live under
> drivers/virtio, too.

The motivation to put it in the vfio layer comes from the following main
reasons:

1) Having it inside virtio may require exporting a symbol/function per
command.

Today this would end up at 5 exported symbols, and in the future (e.g. live
migration) many more.

With the current code we export only 2 generic symbols,
virtio_pci_vf_get_pf_dev() and virtio_admin_cmd_exec(), which can fit any
further extension.

2) For now there is no logic in this vfio layer; however, in the future we
may have some DMA/other logic that better fits the caller/client layer
(i.e. vfio).

By the way, this follows what was already done between the vfio/mlx5 and
mlx5_core modules, where mlx5_core exposes generic APIs to execute a command
and to get the PF from a given mlx5 VF.

This way, we can enable further commands to be added/extended
easily/cleanly.

See for example here [1, 2].

[1] https://elixir.bootlin.com/linux/v6.6-rc3/source/drivers/vfio/pci/mlx5/cmd.c#L210
[2] https://elixir.bootlin.com/linux/v6.6-rc3/source/drivers/vfio/pci/mlx5/cmd.c#L683

Yishai

>
>> ---
>> drivers/vfio/pci/virtio/cmd.c | 146 ++++++++++++++++++++++++++++++++++
>> drivers/vfio/pci/virtio/cmd.h | 27 +++++++
>> 2 files changed, 173 insertions(+)
>> create mode 100644 drivers/vfio/pci/virtio/cmd.c
>> create mode 100644 drivers/vfio/pci/virtio/cmd.h
>>
>> diff --git a/drivers/vfio/pci/virtio/cmd.c b/drivers/vfio/pci/virtio/cmd.c
>> new file mode 100644
>> index 000000000000..f068239cdbb0
>> --- /dev/null
>> +++ b/drivers/vfio/pci/virtio/cmd.c
>> @@ -0,0 +1,146 @@
>> +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
>> +/*
>> + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES.
All rights reserved >> + */ >> + >> +#include "cmd.h" >> + >> +int virtiovf_cmd_list_query(struct pci_dev *pdev, u8 *buf, int buf_size) >> +{ >> + struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev); >> + struct scatterlist out_sg; >> + struct virtio_admin_cmd cmd = {}; >> + >> + if (!virtio_dev) >> + return -ENOTCONN; >> + >> + sg_init_one(&out_sg, buf, buf_size); >> + cmd.opcode = VIRTIO_ADMIN_CMD_LIST_QUERY; >> + cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV; >> + cmd.result_sg = &out_sg; >> + >> + return virtio_admin_cmd_exec(virtio_dev, &cmd); >> +} >> + >> +int virtiovf_cmd_list_use(struct pci_dev *pdev, u8 *buf, int buf_size) >> +{ >> + struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev); >> + struct scatterlist in_sg; >> + struct virtio_admin_cmd cmd = {}; >> + >> + if (!virtio_dev) >> + return -ENOTCONN; >> + >> + sg_init_one(&in_sg, buf, buf_size); >> + cmd.opcode = VIRTIO_ADMIN_CMD_LIST_USE; >> + cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV; >> + cmd.data_sg = &in_sg; >> + >> + return virtio_admin_cmd_exec(virtio_dev, &cmd); >> +} >> + >> +int virtiovf_cmd_lr_write(struct virtiovf_pci_core_device *virtvdev, u16 opcode, >> + u8 offset, u8 size, u8 *buf) >> +{ >> + struct virtio_device *virtio_dev = >> + virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev); >> + struct virtio_admin_cmd_data_lr_write *in; >> + struct scatterlist in_sg; >> + struct virtio_admin_cmd cmd = {}; >> + int ret; >> + >> + if (!virtio_dev) >> + return -ENOTCONN; >> + >> + in = kzalloc(sizeof(*in) + size, GFP_KERNEL); >> + if (!in) >> + return -ENOMEM; >> + >> + in->offset = offset; >> + memcpy(in->registers, buf, size); >> + sg_init_one(&in_sg, in, sizeof(*in) + size); >> + cmd.opcode = opcode; >> + cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV; >> + cmd.group_member_id = virtvdev->vf_id + 1; >> + cmd.data_sg = &in_sg; >> + ret = virtio_admin_cmd_exec(virtio_dev, &cmd); >> + >> + kfree(in); >> + return ret; >> +} >> + >> +int virtiovf_cmd_lr_read(struct virtiovf_pci_core_device *virtvdev, u16 opcode, >> + u8 offset, u8 size, u8 *buf) >> +{ >> + struct virtio_device *virtio_dev = >> + virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev); >> + struct virtio_admin_cmd_data_lr_read *in; >> + struct scatterlist in_sg, out_sg; >> + struct virtio_admin_cmd cmd = {}; >> + int ret; >> + >> + if (!virtio_dev) >> + return -ENOTCONN; >> + >> + in = kzalloc(sizeof(*in), GFP_KERNEL); >> + if (!in) >> + return -ENOMEM; >> + >> + in->offset = offset; >> + sg_init_one(&in_sg, in, sizeof(*in)); >> + sg_init_one(&out_sg, buf, size); >> + cmd.opcode = opcode; >> + cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV; >> + cmd.data_sg = &in_sg; >> + cmd.result_sg = &out_sg; >> + cmd.group_member_id = virtvdev->vf_id + 1; >> + ret = virtio_admin_cmd_exec(virtio_dev, &cmd); >> + >> + kfree(in); >> + return ret; >> +} >> + >> +int virtiovf_cmd_lq_read_notify(struct virtiovf_pci_core_device *virtvdev, >> + u8 req_bar_flags, u8 *bar, u64 *bar_offset) >> +{ >> + struct virtio_device *virtio_dev = >> + virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev); >> + struct virtio_admin_cmd_notify_info_result *out; >> + struct scatterlist out_sg; >> + struct virtio_admin_cmd cmd = {}; >> + int ret; >> + >> + if (!virtio_dev) >> + return -ENOTCONN; >> + >> + out = kzalloc(sizeof(*out), GFP_KERNEL); >> + if (!out) >> + return -ENOMEM; >> + >> + sg_init_one(&out_sg, out, sizeof(*out)); >> + cmd.opcode = VIRTIO_ADMIN_CMD_LEGACY_NOTIFY_INFO; >> + cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV; >> + cmd.result_sg = 
&out_sg; >> + cmd.group_member_id = virtvdev->vf_id + 1; >> + ret = virtio_admin_cmd_exec(virtio_dev, &cmd); >> + if (!ret) { >> + struct virtio_admin_cmd_notify_info_data *entry; >> + int i; >> + >> + ret = -ENOENT; >> + for (i = 0; i < VIRTIO_ADMIN_CMD_MAX_NOTIFY_INFO; i++) { >> + entry = &out->entries[i]; >> + if (entry->flags == VIRTIO_ADMIN_CMD_NOTIFY_INFO_FLAGS_END) >> + break; >> + if (entry->flags != req_bar_flags) >> + continue; >> + *bar = entry->bar; >> + *bar_offset = le64_to_cpu(entry->offset); >> + ret = 0; >> + break; >> + } >> + } >> + >> + kfree(out); >> + return ret; >> +} >> diff --git a/drivers/vfio/pci/virtio/cmd.h b/drivers/vfio/pci/virtio/cmd.h >> new file mode 100644 >> index 000000000000..c2a3645f4b90 >> --- /dev/null >> +++ b/drivers/vfio/pci/virtio/cmd.h >> @@ -0,0 +1,27 @@ >> +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB >> +/* >> + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. >> + */ >> + >> +#ifndef VIRTIO_VFIO_CMD_H >> +#define VIRTIO_VFIO_CMD_H >> + >> +#include <linux/kernel.h> >> +#include <linux/virtio.h> >> +#include <linux/vfio_pci_core.h> >> +#include <linux/virtio_pci.h> >> + >> +struct virtiovf_pci_core_device { >> + struct vfio_pci_core_device core_device; >> + int vf_id; >> +}; >> + >> +int virtiovf_cmd_list_query(struct pci_dev *pdev, u8 *buf, int buf_size); >> +int virtiovf_cmd_list_use(struct pci_dev *pdev, u8 *buf, int buf_size); >> +int virtiovf_cmd_lr_write(struct virtiovf_pci_core_device *virtvdev, u16 opcode, >> + u8 offset, u8 size, u8 *buf); >> +int virtiovf_cmd_lr_read(struct virtiovf_pci_core_device *virtvdev, u16 opcode, >> + u8 offset, u8 size, u8 *buf); >> +int virtiovf_cmd_lq_read_notify(struct virtiovf_pci_core_device *virtvdev, >> + u8 req_bar_flags, u8 *bar, u64 *bar_offset); >> +#endif /* VIRTIO_VFIO_CMD_H */ >> -- >> 2.27.0
On Tue, Sep 26, 2023 at 01:51:13PM +0300, Yishai Hadas wrote: > On 21/09/2023 23:34, Michael S. Tsirkin wrote: > > On Thu, Sep 21, 2023 at 03:40:39PM +0300, Yishai Hadas wrote: > > > Expose admin commands over the virtio device, to be used by the > > > vfio-virtio driver in the next patches. > > > > > > It includes: list query/use, legacy write/read, read notify_info. > > > > > > Signed-off-by: Yishai Hadas <yishaih@nvidia.com> > > > --- > > > drivers/vfio/pci/virtio/cmd.c | 146 ++++++++++++++++++++++++++++++++++ > > > drivers/vfio/pci/virtio/cmd.h | 27 +++++++ > > > 2 files changed, 173 insertions(+) > > > create mode 100644 drivers/vfio/pci/virtio/cmd.c > > > create mode 100644 drivers/vfio/pci/virtio/cmd.h > > > > > > diff --git a/drivers/vfio/pci/virtio/cmd.c b/drivers/vfio/pci/virtio/cmd.c > > > new file mode 100644 > > > index 000000000000..f068239cdbb0 > > > --- /dev/null > > > +++ b/drivers/vfio/pci/virtio/cmd.c > > > @@ -0,0 +1,146 @@ > > > +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB > > > +/* > > > + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved > > > + */ > > > + > > > +#include "cmd.h" > > > + > > > +int virtiovf_cmd_list_query(struct pci_dev *pdev, u8 *buf, int buf_size) > > > +{ > > > + struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev); > > > + struct scatterlist out_sg; > > > + struct virtio_admin_cmd cmd = {}; > > > + > > > + if (!virtio_dev) > > > + return -ENOTCONN; > > > + > > > + sg_init_one(&out_sg, buf, buf_size); > > > + cmd.opcode = VIRTIO_ADMIN_CMD_LIST_QUERY; > > > + cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV; > > > + cmd.result_sg = &out_sg; > > > + > > > + return virtio_admin_cmd_exec(virtio_dev, &cmd); > > > +} > > > + > > in/out seem all wrong here. In virtio terminology, in means from > > device to driver, out means from driver to device. > I referred here to in/out from vfio POV who prepares the command. > > However, I can replace it to follow the virtio terminology as you suggested > if this more makes sense. > > Please see also my coming answer on your suggestion to put all of this in > the virtio layer. > > > > +int virtiovf_cmd_list_use(struct pci_dev *pdev, u8 *buf, int buf_size) > > > +{ > > > + struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev); > > > + struct scatterlist in_sg; > > > + struct virtio_admin_cmd cmd = {}; > > > + > > > + if (!virtio_dev) > > > + return -ENOTCONN; > > > + > > > + sg_init_one(&in_sg, buf, buf_size); > > > + cmd.opcode = VIRTIO_ADMIN_CMD_LIST_USE; > > > + cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV; > > > + cmd.data_sg = &in_sg; > > > + > > > + return virtio_admin_cmd_exec(virtio_dev, &cmd); > > > +} > > > + > > > +int virtiovf_cmd_lr_write(struct virtiovf_pci_core_device *virtvdev, u16 opcode, > > > > what is _lr short for? > > This was an acronym to legacy_read. > > The actual command is according to the given opcode which can be one among > LEGACY_COMMON_CFG_READ, LEGACY_DEV_CFG_READ. > > I can rename it to '_legacy_read' (i.e. virtiovf_issue_legacy_read_cmd) to > be clearer. 
>
> >
> > > +			 u8 offset, u8 size, u8 *buf)
> > > +{
> > > +	struct virtio_device *virtio_dev =
> > > +		virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev);
> > > +	struct virtio_admin_cmd_data_lr_write *in;
> > > +	struct scatterlist in_sg;
> > > +	struct virtio_admin_cmd cmd = {};
> > > +	int ret;
> > > +
> > > +	if (!virtio_dev)
> > > +		return -ENOTCONN;
> > > +
> > > +	in = kzalloc(sizeof(*in) + size, GFP_KERNEL);
> > > +	if (!in)
> > > +		return -ENOMEM;
> > > +
> > > +	in->offset = offset;
> > > +	memcpy(in->registers, buf, size);
> > > +	sg_init_one(&in_sg, in, sizeof(*in) + size);
> > > +	cmd.opcode = opcode;
> > > +	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
> > > +	cmd.group_member_id = virtvdev->vf_id + 1;
> > weird. why + 1?
>
> This follows the virtio spec in that area.
>
> "When sending commands with the SR-IOV group type, the driver specify a
> value for group_member_id
> between 1 and NumVFs inclusive."

Ah, I get it. Please add a comment.

> The 'virtvdev->vf_id' was set upon vfio/virtio driver initialization by
> pci_iov_vf_id(), whose first index is 0.

> > > +	cmd.data_sg = &in_sg;
> > > +	ret = virtio_admin_cmd_exec(virtio_dev, &cmd);
> > > +
> > > +	kfree(in);
> > > +	return ret;
> > > +}
> > How do you know it's safe to send this command, in particular at
> > this time? This seems to be doing zero checks, and zero synchronization
> > with the PF driver.
> >
> The virtiovf_cmd_lr_read()/other commands get a virtio VF and get its PF by
> calling virtio_pci_vf_get_pf_dev().
>
> The VF can't go away via 'disable sriov', as it's owned/used by vfio.
>
> The PF can't go away via rmmod/modprobe -r of virtio, because of the 'module
> in use' dependency between VFIO and VIRTIO.
>
> The check below [1] was added only from a clean-code perspective; it might
> theoretically fail only if the given VF doesn't use a virtio driver.
>
> [1] if (!virtio_dev)
>         return -ENOTCONN;
>
> So, it looks safe as is.

Can the device be unbound from the module right after you did the check?
What about suspend - can this be called while suspend is in progress?

More importantly, virtio can decide to reset the device for its own
internal reasons (e.g. to recover from an error). We used to do it when
attaching XDP, and we can start doing it again.

That's one of the reasons why I want all this code under virtio, so
we'll remember.

> > > +
> > > +int virtiovf_cmd_lr_read(struct virtiovf_pci_core_device *virtvdev, u16 opcode,
> > > +			 u8 offset, u8 size, u8 *buf)
> > > +{
> > > +	struct virtio_device *virtio_dev =
> > > +		virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev);
> > > +	struct virtio_admin_cmd_data_lr_read *in;
> > > +	struct scatterlist in_sg, out_sg;
> > > +	struct virtio_admin_cmd cmd = {};
> > > +	int ret;
> > > +
> > > +	if (!virtio_dev)
> > > +		return -ENOTCONN;
> > > +
> > > +	in = kzalloc(sizeof(*in), GFP_KERNEL);
> > > +	if (!in)
> > > +		return -ENOMEM;
> > > +
> > > +	in->offset = offset;
> > > +	sg_init_one(&in_sg, in, sizeof(*in));
> > > +	sg_init_one(&out_sg, buf, size);
> > > +	cmd.opcode = opcode;
> > > +	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
> > > +	cmd.data_sg = &in_sg;
> > > +	cmd.result_sg = &out_sg;
> > > +	cmd.group_member_id = virtvdev->vf_id + 1;
> > > +	ret = virtio_admin_cmd_exec(virtio_dev, &cmd);
> > > +
> > > +	kfree(in);
> > > +	return ret;
> > > +}
> > > +
> > > +int virtiovf_cmd_lq_read_notify(struct virtiovf_pci_core_device *virtvdev,
> > and what is lq short for?
> > To be more explicit, I may replace to virtiovf_cmd_legacy_notify_info() to > follow the spec opcode. > > Yishai > > > > > > + u8 req_bar_flags, u8 *bar, u64 *bar_offset) > > > +{ > > > + struct virtio_device *virtio_dev = > > > + virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev); > > > + struct virtio_admin_cmd_notify_info_result *out; > > > + struct scatterlist out_sg; > > > + struct virtio_admin_cmd cmd = {}; > > > + int ret; > > > + > > > + if (!virtio_dev) > > > + return -ENOTCONN; > > > + > > > + out = kzalloc(sizeof(*out), GFP_KERNEL); > > > + if (!out) > > > + return -ENOMEM; > > > + > > > + sg_init_one(&out_sg, out, sizeof(*out)); > > > + cmd.opcode = VIRTIO_ADMIN_CMD_LEGACY_NOTIFY_INFO; > > > + cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV; > > > + cmd.result_sg = &out_sg; > > > + cmd.group_member_id = virtvdev->vf_id + 1; > > > + ret = virtio_admin_cmd_exec(virtio_dev, &cmd); > > > + if (!ret) { > > > + struct virtio_admin_cmd_notify_info_data *entry; > > > + int i; > > > + > > > + ret = -ENOENT; > > > + for (i = 0; i < VIRTIO_ADMIN_CMD_MAX_NOTIFY_INFO; i++) { > > > + entry = &out->entries[i]; > > > + if (entry->flags == VIRTIO_ADMIN_CMD_NOTIFY_INFO_FLAGS_END) > > > + break; > > > + if (entry->flags != req_bar_flags) > > > + continue; > > > + *bar = entry->bar; > > > + *bar_offset = le64_to_cpu(entry->offset); > > > + ret = 0; > > > + break; > > > + } > > > + } > > > + > > > + kfree(out); > > > + return ret; > > > +} > > > diff --git a/drivers/vfio/pci/virtio/cmd.h b/drivers/vfio/pci/virtio/cmd.h > > > new file mode 100644 > > > index 000000000000..c2a3645f4b90 > > > --- /dev/null > > > +++ b/drivers/vfio/pci/virtio/cmd.h > > > @@ -0,0 +1,27 @@ > > > +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB > > > +/* > > > + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. > > > + */ > > > + > > > +#ifndef VIRTIO_VFIO_CMD_H > > > +#define VIRTIO_VFIO_CMD_H > > > + > > > +#include <linux/kernel.h> > > > +#include <linux/virtio.h> > > > +#include <linux/vfio_pci_core.h> > > > +#include <linux/virtio_pci.h> > > > + > > > +struct virtiovf_pci_core_device { > > > + struct vfio_pci_core_device core_device; > > > + int vf_id; > > > +}; > > > + > > > +int virtiovf_cmd_list_query(struct pci_dev *pdev, u8 *buf, int buf_size); > > > +int virtiovf_cmd_list_use(struct pci_dev *pdev, u8 *buf, int buf_size); > > > +int virtiovf_cmd_lr_write(struct virtiovf_pci_core_device *virtvdev, u16 opcode, > > > + u8 offset, u8 size, u8 *buf); > > > +int virtiovf_cmd_lr_read(struct virtiovf_pci_core_device *virtvdev, u16 opcode, > > > + u8 offset, u8 size, u8 *buf); > > > +int virtiovf_cmd_lq_read_notify(struct virtiovf_pci_core_device *virtvdev, > > > + u8 req_bar_flags, u8 *bar, u64 *bar_offset); > > > +#endif /* VIRTIO_VFIO_CMD_H */ > > > -- > > > 2.27.0 >
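For illustration only, a hypothetical sketch of the kind of serialized entry point this objection is asking for: an admin-command wrapper living in virtio core that takes a lock and checks device state, so commands issued on behalf of a VF cannot race a PF reset, suspend, or unbind. Neither the function nor the fields it uses exist in the kernel; they are assumptions made purely for the example.

/* Hypothetical sketch only; admin_cmd_lock and admin_cmd_allowed are assumed fields. */
#include <linux/mutex.h>
#include <linux/virtio.h>

int virtio_admin_cmd_exec_checked(struct virtio_device *pf_dev,
				  struct virtio_admin_cmd *cmd)
{
	int ret = -EAGAIN;

	mutex_lock(&pf_dev->admin_cmd_lock);		/* assumed field */
	if (pf_dev->admin_cmd_allowed)			/* assumed: false during reset/suspend */
		ret = virtio_admin_cmd_exec(pf_dev, cmd);
	mutex_unlock(&pf_dev->admin_cmd_lock);

	return ret;
}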
On Tue, Sep 26, 2023 at 02:14:01PM +0300, Yishai Hadas wrote: > On 22/09/2023 12:54, Michael S. Tsirkin wrote: > > On Thu, Sep 21, 2023 at 03:40:39PM +0300, Yishai Hadas wrote: > > > Expose admin commands over the virtio device, to be used by the > > > vfio-virtio driver in the next patches. > > > > > > It includes: list query/use, legacy write/read, read notify_info. > > > > > > Signed-off-by: Yishai Hadas <yishaih@nvidia.com> > > > > This stuff is pure virtio spec. I think it should live under > > drivers/virtio, too. > > The motivation to put it in the vfio layer was from the below main reasons: > > 1) Having it inside virtio may require to export a symbol/function per > command. > > This will end up today by 5 and in the future (e.g. live migration) with > much more exported symbols. > > With current code we export only 2 generic symbols > virtio_pci_vf_get_pf_dev(), virtio_admin_cmd_exec() which may fit for any > further extension. Except, there's no reasonable way for virtio to know what is done with the device then. You are not using just 2 symbols at all, instead you are using the rich vq API which was explicitly designed for the driver running the device being responsible for serializing accesses. Which is actually loaded and running. And I *think* your use won't conflict ATM mostly by luck. Witness the hack in patch 01 as exhibit 1 - nothing at all even hints at the fact that the reason for the complicated dance is because another driver pokes at some of the vqs. > 2) For now there is no logic in this vfio layer, however, in the future we > may have some DMA/other logic that should better fit to the caller/client > layer (i.e. vfio). You are poking at the device without any locks etc. Maybe it looks like no logic to you but it does not look like that from where I stand. > By the way, this follows what was done already between vfio/mlx5 to > mlx5_core modules where mlx5_core exposes generic APIs to execute a command > and to get the a PF from a given mlx5 VF. This is up to mlx5 maintainers. In particular they only need to worry that their patches work with specific hardware which they likely have. virtio has to work with multiple vendors - hardware and software - and exposing a low level API that I can't test on my laptop is not at all my ideal. > This way, we can enable further commands to be added/extended > easily/cleanly. Something for vfio maintainer to consider in case it was assumed that it's just this one weird thing but otherwise it's all generic vfio. It's not going to stop there, will it? The duplication of functionality with vdpa will continue :( I am much more interested in adding reusable functionality that everyone benefits from than in vfio poking at the device in its own weird ways that only benefit specific hardware. > See for example here [1, 2]. 
> > [1] https://elixir.bootlin.com/linux/v6.6-rc3/source/drivers/vfio/pci/mlx5/cmd.c#L210 > > [2] https://elixir.bootlin.com/linux/v6.6-rc3/source/drivers/vfio/pci/mlx5/cmd.c#L683 > > Yishai > > > > > --- > > > drivers/vfio/pci/virtio/cmd.c | 146 ++++++++++++++++++++++++++++++++++ > > > drivers/vfio/pci/virtio/cmd.h | 27 +++++++ > > > 2 files changed, 173 insertions(+) > > > create mode 100644 drivers/vfio/pci/virtio/cmd.c > > > create mode 100644 drivers/vfio/pci/virtio/cmd.h > > > > > > diff --git a/drivers/vfio/pci/virtio/cmd.c b/drivers/vfio/pci/virtio/cmd.c > > > new file mode 100644 > > > index 000000000000..f068239cdbb0 > > > --- /dev/null > > > +++ b/drivers/vfio/pci/virtio/cmd.c > > > @@ -0,0 +1,146 @@ > > > +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB > > > +/* > > > + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved > > > + */ > > > + > > > +#include "cmd.h" > > > + > > > +int virtiovf_cmd_list_query(struct pci_dev *pdev, u8 *buf, int buf_size) > > > +{ > > > + struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev); > > > + struct scatterlist out_sg; > > > + struct virtio_admin_cmd cmd = {}; > > > + > > > + if (!virtio_dev) > > > + return -ENOTCONN; > > > + > > > + sg_init_one(&out_sg, buf, buf_size); > > > + cmd.opcode = VIRTIO_ADMIN_CMD_LIST_QUERY; > > > + cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV; > > > + cmd.result_sg = &out_sg; > > > + > > > + return virtio_admin_cmd_exec(virtio_dev, &cmd); > > > +} > > > + > > > +int virtiovf_cmd_list_use(struct pci_dev *pdev, u8 *buf, int buf_size) > > > +{ > > > + struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev); > > > + struct scatterlist in_sg; > > > + struct virtio_admin_cmd cmd = {}; > > > + > > > + if (!virtio_dev) > > > + return -ENOTCONN; > > > + > > > + sg_init_one(&in_sg, buf, buf_size); > > > + cmd.opcode = VIRTIO_ADMIN_CMD_LIST_USE; > > > + cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV; > > > + cmd.data_sg = &in_sg; > > > + > > > + return virtio_admin_cmd_exec(virtio_dev, &cmd); > > > +} > > > + > > > +int virtiovf_cmd_lr_write(struct virtiovf_pci_core_device *virtvdev, u16 opcode, > > > + u8 offset, u8 size, u8 *buf) > > > +{ > > > + struct virtio_device *virtio_dev = > > > + virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev); > > > + struct virtio_admin_cmd_data_lr_write *in; > > > + struct scatterlist in_sg; > > > + struct virtio_admin_cmd cmd = {}; > > > + int ret; > > > + > > > + if (!virtio_dev) > > > + return -ENOTCONN; > > > + > > > + in = kzalloc(sizeof(*in) + size, GFP_KERNEL); > > > + if (!in) > > > + return -ENOMEM; > > > + > > > + in->offset = offset; > > > + memcpy(in->registers, buf, size); > > > + sg_init_one(&in_sg, in, sizeof(*in) + size); > > > + cmd.opcode = opcode; > > > + cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV; > > > + cmd.group_member_id = virtvdev->vf_id + 1; > > > + cmd.data_sg = &in_sg; > > > + ret = virtio_admin_cmd_exec(virtio_dev, &cmd); > > > + > > > + kfree(in); > > > + return ret; > > > +} > > > + > > > +int virtiovf_cmd_lr_read(struct virtiovf_pci_core_device *virtvdev, u16 opcode, > > > + u8 offset, u8 size, u8 *buf) > > > +{ > > > + struct virtio_device *virtio_dev = > > > + virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev); > > > + struct virtio_admin_cmd_data_lr_read *in; > > > + struct scatterlist in_sg, out_sg; > > > + struct virtio_admin_cmd cmd = {}; > > > + int ret; > > > + > > > + if (!virtio_dev) > > > + return -ENOTCONN; > > > + > > > + in = kzalloc(sizeof(*in), GFP_KERNEL); > > > + if (!in) > > > + 
return -ENOMEM; > > > + > > > + in->offset = offset; > > > + sg_init_one(&in_sg, in, sizeof(*in)); > > > + sg_init_one(&out_sg, buf, size); > > > + cmd.opcode = opcode; > > > + cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV; > > > + cmd.data_sg = &in_sg; > > > + cmd.result_sg = &out_sg; > > > + cmd.group_member_id = virtvdev->vf_id + 1; > > > + ret = virtio_admin_cmd_exec(virtio_dev, &cmd); > > > + > > > + kfree(in); > > > + return ret; > > > +} > > > + > > > +int virtiovf_cmd_lq_read_notify(struct virtiovf_pci_core_device *virtvdev, > > > + u8 req_bar_flags, u8 *bar, u64 *bar_offset) > > > +{ > > > + struct virtio_device *virtio_dev = > > > + virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev); > > > + struct virtio_admin_cmd_notify_info_result *out; > > > + struct scatterlist out_sg; > > > + struct virtio_admin_cmd cmd = {}; > > > + int ret; > > > + > > > + if (!virtio_dev) > > > + return -ENOTCONN; > > > + > > > + out = kzalloc(sizeof(*out), GFP_KERNEL); > > > + if (!out) > > > + return -ENOMEM; > > > + > > > + sg_init_one(&out_sg, out, sizeof(*out)); > > > + cmd.opcode = VIRTIO_ADMIN_CMD_LEGACY_NOTIFY_INFO; > > > + cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV; > > > + cmd.result_sg = &out_sg; > > > + cmd.group_member_id = virtvdev->vf_id + 1; > > > + ret = virtio_admin_cmd_exec(virtio_dev, &cmd); > > > + if (!ret) { > > > + struct virtio_admin_cmd_notify_info_data *entry; > > > + int i; > > > + > > > + ret = -ENOENT; > > > + for (i = 0; i < VIRTIO_ADMIN_CMD_MAX_NOTIFY_INFO; i++) { > > > + entry = &out->entries[i]; > > > + if (entry->flags == VIRTIO_ADMIN_CMD_NOTIFY_INFO_FLAGS_END) > > > + break; > > > + if (entry->flags != req_bar_flags) > > > + continue; > > > + *bar = entry->bar; > > > + *bar_offset = le64_to_cpu(entry->offset); > > > + ret = 0; > > > + break; > > > + } > > > + } > > > + > > > + kfree(out); > > > + return ret; > > > +} > > > diff --git a/drivers/vfio/pci/virtio/cmd.h b/drivers/vfio/pci/virtio/cmd.h > > > new file mode 100644 > > > index 000000000000..c2a3645f4b90 > > > --- /dev/null > > > +++ b/drivers/vfio/pci/virtio/cmd.h > > > @@ -0,0 +1,27 @@ > > > +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB > > > +/* > > > + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. > > > + */ > > > + > > > +#ifndef VIRTIO_VFIO_CMD_H > > > +#define VIRTIO_VFIO_CMD_H > > > + > > > +#include <linux/kernel.h> > > > +#include <linux/virtio.h> > > > +#include <linux/vfio_pci_core.h> > > > +#include <linux/virtio_pci.h> > > > + > > > +struct virtiovf_pci_core_device { > > > + struct vfio_pci_core_device core_device; > > > + int vf_id; > > > +}; > > > + > > > +int virtiovf_cmd_list_query(struct pci_dev *pdev, u8 *buf, int buf_size); > > > +int virtiovf_cmd_list_use(struct pci_dev *pdev, u8 *buf, int buf_size); > > > +int virtiovf_cmd_lr_write(struct virtiovf_pci_core_device *virtvdev, u16 opcode, > > > + u8 offset, u8 size, u8 *buf); > > > +int virtiovf_cmd_lr_read(struct virtiovf_pci_core_device *virtvdev, u16 opcode, > > > + u8 offset, u8 size, u8 *buf); > > > +int virtiovf_cmd_lq_read_notify(struct virtiovf_pci_core_device *virtvdev, > > > + u8 req_bar_flags, u8 *bar, u64 *bar_offset); > > > +#endif /* VIRTIO_VFIO_CMD_H */ > > > -- > > > 2.27.0 >
On Tue, Sep 26, 2023 at 07:41:44AM -0400, Michael S. Tsirkin wrote: > > By the way, this follows what was done already between vfio/mlx5 to > > mlx5_core modules where mlx5_core exposes generic APIs to execute a command > > and to get the a PF from a given mlx5 VF. > > This is up to mlx5 maintainers. In particular they only need to worry > that their patches work with specific hardware which they likely have. > virtio has to work with multiple vendors - hardware and software - > and exposing a low level API that I can't test on my laptop > is not at all my ideal. mlx5 has a reasonable API from the lower level that allows the vfio driver to safely issue commands. The API provides all the safety and locking you have been questioning here. Then the vfio driver can form the commands directly and in the way it needs. This avoids spewing code into the core modules that is only used by vfio - which has been a key design consideration for our driver layering. I suggest following the same design here as it has been well proven. Provide a solid API to operate the admin queue and let VFIO use it. One of the main purposes of the admin queue is to deliver commands on behalf of the VF driver, so this is a logical and reasonable place to put an API. > > This way, we can enable further commands to be added/extended > > easily/cleanly. > > Something for vfio maintainer to consider in case it was > assumed that it's just this one weird thing > but otherwise it's all generic vfio. It's not going to stop there, > will it? The duplication of functionality with vdpa will continue :( VFIO live migration is expected to come as well once OASIS completes its work. Parav, are there other things? Jason
On Wed, Sep 27, 2023 at 10:18:17AM -0300, Jason Gunthorpe wrote: > On Tue, Sep 26, 2023 at 07:41:44AM -0400, Michael S. Tsirkin wrote: > > > > By the way, this follows what was done already between vfio/mlx5 to > > > mlx5_core modules where mlx5_core exposes generic APIs to execute a command > > > and to get the a PF from a given mlx5 VF. > > > > This is up to mlx5 maintainers. In particular they only need to worry > > that their patches work with specific hardware which they likely have. > > virtio has to work with multiple vendors - hardware and software - > > and exposing a low level API that I can't test on my laptop > > is not at all my ideal. > > mlx5 has a reasonable API from the lower level that allows the vfio > driver to safely issue commands. The API provides all the safety and > locking you have been questioning here. > > Then the vfio driver can form the commands directly and in the way it > needs. This avoids spewing code into the core modules that is only > used by vfio - which has been a key design consideration for our > driver layering. > > I suggest following the same design here as it has been well proven. > Provide a solid API to operate the admin queue and let VFIO use > it. One of the main purposes of the admin queue is to deliver commands > on behalf of the VF driver, so this is a logical and reasonable place > to put an API. Not the way virtio is designed now. I guess mlx5 is designed in a way that makes it safe. > > > This way, we can enable further commands to be added/extended > > > easily/cleanly. > > > > Something for vfio maintainer to consider in case it was > > assumed that it's just this one weird thing > > but otherwise it's all generic vfio. It's not going to stop there, > > will it? The duplication of functionality with vdpa will continue :( > > VFIO live migration is expected to come as well once OASIS completes > its work. Exactly. Is there doubt vdpa will want to support live migration? Put this code in a library please. > Parav, are there other things? > > Jason
On Wed, Sep 27, 2023 at 05:30:04PM -0400, Michael S. Tsirkin wrote: > On Wed, Sep 27, 2023 at 10:18:17AM -0300, Jason Gunthorpe wrote: > > On Tue, Sep 26, 2023 at 07:41:44AM -0400, Michael S. Tsirkin wrote: > > > > > > By the way, this follows what was done already between vfio/mlx5 to > > > > mlx5_core modules where mlx5_core exposes generic APIs to execute a command > > > > and to get the a PF from a given mlx5 VF. > > > > > > This is up to mlx5 maintainers. In particular they only need to worry > > > that their patches work with specific hardware which they likely have. > > > virtio has to work with multiple vendors - hardware and software - > > > and exposing a low level API that I can't test on my laptop > > > is not at all my ideal. > > > > mlx5 has a reasonable API from the lower level that allows the vfio > > driver to safely issue commands. The API provides all the safety and > > locking you have been questioning here. > > > > Then the vfio driver can form the commands directly and in the way it > > needs. This avoids spewing code into the core modules that is only > > used by vfio - which has been a key design consideration for our > > driver layering. > > > > I suggest following the same design here as it has been well proven. > > Provide a solid API to operate the admin queue and let VFIO use > > it. One of the main purposes of the admin queue is to deliver commands > > on behalf of the VF driver, so this is a logical and reasonable place > > to put an API. > > Not the way virtio is designed now. I guess mlx5 is designed in > a way that makes it safe. If you can't reliably issue commmands from the VF at all it doesn't really matter where you put the code. Once that is established up then an admin command execution interface is a nice cut point for modularity. The locking in mlx5 to make this safe is not too complex, if Feng missed some items for virtio then he can work to fix it up. > > VFIO live migration is expected to come as well once OASIS completes > > its work. > > Exactly. Is there doubt vdpa will want to support live migration? > Put this code in a library please. I have a doubt, you both said vdpa already does live migration, so what will it even do with a live migration interface to a PCI function? It already has to use full mediation to operate a physical virtio function, so it seems like it shouldn't need the migration interface? Regardless, it is better kernel development hygiene to put the code where it is used and wait for a second user to consolidate it than to guess. Jason
On Wed, Sep 27, 2023 at 08:16:00PM -0300, Jason Gunthorpe wrote: > On Wed, Sep 27, 2023 at 05:30:04PM -0400, Michael S. Tsirkin wrote: > > On Wed, Sep 27, 2023 at 10:18:17AM -0300, Jason Gunthorpe wrote: > > > On Tue, Sep 26, 2023 at 07:41:44AM -0400, Michael S. Tsirkin wrote: > > > > > > > > By the way, this follows what was done already between vfio/mlx5 to > > > > > mlx5_core modules where mlx5_core exposes generic APIs to execute a command > > > > > and to get the a PF from a given mlx5 VF. > > > > > > > > This is up to mlx5 maintainers. In particular they only need to worry > > > > that their patches work with specific hardware which they likely have. > > > > virtio has to work with multiple vendors - hardware and software - > > > > and exposing a low level API that I can't test on my laptop > > > > is not at all my ideal. > > > > > > mlx5 has a reasonable API from the lower level that allows the vfio > > > driver to safely issue commands. The API provides all the safety and > > > locking you have been questioning here. > > > > > > Then the vfio driver can form the commands directly and in the way it > > > needs. This avoids spewing code into the core modules that is only > > > used by vfio - which has been a key design consideration for our > > > driver layering. > > > > > > I suggest following the same design here as it has been well proven. > > > Provide a solid API to operate the admin queue and let VFIO use > > > it. One of the main purposes of the admin queue is to deliver commands > > > on behalf of the VF driver, so this is a logical and reasonable place > > > to put an API. > > > > Not the way virtio is designed now. I guess mlx5 is designed in > > a way that makes it safe. > > If you can't reliably issue commmands from the VF at all it doesn't > really matter where you put the code. Once that is established up then > an admin command execution interface is a nice cut point for > modularity. > > The locking in mlx5 to make this safe is not too complex, if Feng > missed some items for virtio then he can work to fix it up. Above two paragraphs don't make sense to me at all. VF issues no commands and there's no locking. > > > VFIO live migration is expected to come as well once OASIS completes > > > its work. > > > > Exactly. Is there doubt vdpa will want to support live migration? > > Put this code in a library please. > > I have a doubt, you both said vdpa already does live migration, so > what will it even do with a live migration interface to a PCI > function? This is not the thread to explain how vdpa live migration works now and why it needs new interfaces, sorry. Suffice is to say right now on virtio tc Parav from nvidia is arguing for vdpa to use admin commands for migration. > It already has to use full mediation to operate a physical virtio > function, so it seems like it shouldn't need the migration interface? > > Regardless, it is better kernel development hygiene to put the code > where it is used and wait for a second user to consolidate it than to > guess. > > Jason Sorry no time right now to argue philosophy. I gave some hints on how to make the virtio changes behave in a way that I'm ok with maintaining. Hope they help.
On Tue, Sep 26, 2023 at 07:41:44AM -0400, Michael S. Tsirkin wrote: > > Except, there's no reasonable way for virtio to know what is done with > the device then. You are not using just 2 symbols at all, instead you > are using the rich vq API which was explicitly designed for the driver > running the device being responsible for serializing accesses. Which is > actually loaded and running. And I *think* your use won't conflict ATM > mostly by luck. Witness the hack in patch 01 as exhibit 1 - nothing > at all even hints at the fact that the reason for the complicated > dance is because another driver pokes at some of the vqs. Fully agreed. The smart nic vendors are trying to do the same mess in nvme, and we really need to stop them and agree on proper standardized live migration features implemented in the core virtio/nvme code.
On Sun, Oct 01, 2023 at 11:28:26PM -0700, Christoph Hellwig wrote: > On Tue, Sep 26, 2023 at 07:41:44AM -0400, Michael S. Tsirkin wrote: > > > > Except, there's no reasonable way for virtio to know what is done with > > the device then. You are not using just 2 symbols at all, instead you > > are using the rich vq API which was explicitly designed for the driver > > running the device being responsible for serializing accesses. Which is > > actually loaded and running. And I *think* your use won't conflict ATM > > mostly by luck. Witness the hack in patch 01 as exhibit 1 - nothing > > at all even hints at the fact that the reason for the complicated > > dance is because another driver pokes at some of the vqs. > > Fully agreed. The smart nic vendors are trying to do the same mess > in nvme, and we really need to stop them and agree on proper standarized > live migration features implemented in the core virtio/nvme code. ??? This patch series is an implementation of changes that OASIS approved. The live migration work is going to OASIS first; no patches have been presented. This thread is arguing about how to split up the code for the implementation of the standard given that VFIO owns the VF and the virtio core owns the PF. The standard defined that PF admin queue operations are needed to do operations on behalf of the VF. Jason
On Mon, Oct 02, 2023 at 12:13:20PM -0300, Jason Gunthorpe wrote: > ??? This patch series is an implementation of changes that OASIS > approved. I think you are fundamentally missing my point. This is not about who publishes a spec, but how we structure Linux code. And the problem is that we treat vfio as a separate thing, and not an integral part of the driver. vfio being separate totally makes sense for the original purpose of vfio, that is, a no-op passthrough of a device to userspace. But for all the augmented vfio use cases it doesn't, for them the augmented vfio functionality is an integral part of the core driver. That is true for nvme, virtio and I'd argue mlx5 as well. So we need to stop registering separate pci_drivers for this kind of functionality, and instead have an interface to the driver to switch to certain functionalities. E.g. for this case there should be no new vfio-virtio device, but instead you should be able to switch the virtio device to a fake-legacy vfio mode. Assuming the whole thing actually makes sense, as the use case seems a bit fishy to start with, but I'll leave that argument to the virtio maintainers. Similarly for nvme. We'll never accept a separate nvme-live migration vfio driver. This functionality needs to be part of the nvme driver, probed there and fully controlled there.
On Thu, Oct 05, 2023 at 01:49:54AM -0700, Christoph Hellwig wrote: > But for all the augmented vfio use cases it doesn't, for them the > augmented vfio functionality is an integral part of the core driver. > That is true for nvme, virtio and I'd argue mlx5 as well. I don't agree with this. I see the extra functionality as being an integral part of the VF and VFIO. The PF driver is only providing a proxied communication channel. It is a limitation of PCI that the PF must act as a proxy. > So we need to stop registering separate pci_drivers for this kind > of functionality, and instead have an interface to the driver to > switch to certain functionalities. ?? We must bind something to the VF's pci_driver, what do you imagine that is? > E.g. for this case there should be no new vfio-virtio device, but > instead you should be able to switch the virtio device to an > fake-legacy vfio mode. Are you arguing about how we reach to vfio_register_XX() and what directory the file lives in? I don't know what "fake-legacy" even means, VFIO is not legacy. There is a lot of code in VFIO and the VMM side to take a VF and turn it into a vPCI function. You can't just trivially duplicate VFIO in a dozen drivers without creating a giant mess. Further, userspace wants consistent ways to operate this stuff. If we need a dozen ways to activate VFIO for every kind of driver that is not a positive direction. Basically, I don't know what you are suggesting here. We talked about this before, and my position is still the same. Continuing to have /dev/vfio/XX be the kernel uAPI for the VMM to work with non-mediated vPCI functions with live migration is the technically correct thing to do. Why wouldn't it be? > Similarly for nvme. We'll never accept a separate nvme-live migration > vfio driver. This functionality needs to be part of the nvme driver, > probed there and fully controlled there. We can debate where to put the files when the standard is done, but at the end of the day it needs to create /dev/vfio/XXX. Jason
On Thu, Oct 05, 2023 at 08:10:04AM -0300, Jason Gunthorpe wrote: > > But for all the augmented vfio use cases it doesn't, for them the > > augmented vfio functionality is an integral part of the core driver. > > That is true for nvme, virtio and I'd argue mlx5 as well. > > I don't agree with this. I see the extra functionality as being an > integral part of the VF and VFIO. The PF driver is only providing a > proxied communication channel. > > It is a limitation of PCI that the PF must act as a proxy. For anything live migration it very fundamentally is not, as a function that is visible to a guest by definition can't drive the migration itself. That isn't really a limitation in PCI, but follows from the fact that something else must control a live migration that is transparent to the guest. > > > So we need to stop registering separate pci_drivers for this kind > > of functionality, and instead have an interface to the driver to > > switch to certain functionalities. > > ?? We must bind something to the VF's pci_driver, what do you imagine > that is? The driver that knows this hardware. In this case the virtio subsystem, in case of nvme the nvme driver, and in case of mlx5 the mlx5 driver. > > E.g. for this case there should be no new vfio-virtio device, but > > instead you should be able to switch the virtio device to an > > fake-legacy vfio mode. > > Are you aruging about how we reach to vfio_register_XX() and what > directory the file lives? No. That layout logically follows from what codebase the functionality is part of, though. > I don't know what "fake-legacy" even means, VFIO is not legacy. The driver we're talking about in this thread fakes up a virtio_pci legacy device to the guest on top of a "modern" virtio_pci device. > There is alot of code in VFIO and the VMM side to take a VF and turn > it into a vPCI function. You can't just trivially duplicate VFIO in a > dozen drivers without creating a giant mess. I do not advocate for duplicating it. But the code that calls this functionality belongs into the driver that deals with the compound device that we're doing this work for. > Further, userspace wants consistent ways to operate this stuff. If we > need a dozen ways to activate VFIO for every kind of driver that is > not a positive direction. We don't need a dozen ways. We just need a single attribute on the pci (or $OTHERBUS) device that switches it to vfio mode.
On Fri, Oct 06, 2023 at 06:09:09AM -0700, Christoph Hellwig wrote: > On Thu, Oct 05, 2023 at 08:10:04AM -0300, Jason Gunthorpe wrote: > > > But for all the augmented vfio use cases it doesn't, for them the > > > augmented vfio functionality is an integral part of the core driver. > > > That is true for nvme, virtio and I'd argue mlx5 as well. > > > > I don't agree with this. I see the extra functionality as being an > > integral part of the VF and VFIO. The PF driver is only providing a > > proxied communication channel. > > > > It is a limitation of PCI that the PF must act as a proxy. > > For anything live migration it very fundamentally is not, as a function > that is visible to a guest by definition can't drive the migration > itself. That isn't really a limitation in PCI, but follows form the > fact that something else must control a live migration that is > transparent to the guest. We've talked around ideas like allowing the VF config space to do some of the work. For simple devices we could get away with 1 VF config space register. (VF config space is owned by the hypervisor, not the guest) Devices that need DMA as part of their migration could be imagined to co-opt a VF PASID or something, e.g. using ENQCMD. SIOVr2 is discussing a more flexible RID mapping - there is a possible route where a "VF" could actually have two RIDs, a hypervisor RID and a guest RID. It really is PCI limitations that force this design of making a PF driver do dual duty as a fully functional normal device and act as a communication channel proxy to make a back channel into a SRIOV VF. My view has always been that the VFIO live migration operations are executed logically within the VF as they only affect the VF. So we have a logical design separation where VFIO world owns the commands and the PF driver supplies the communication channel. This works well for devices that already have a robust RPC interface to their device FW. > > ?? We must bind something to the VF's pci_driver, what do you imagine > > that is? > > The driver that knows this hardware. In this case the virtio subsystem, > in case of nvme the nvme driver, and in case of mlx5 the mlx5 driver. But those are drivers operating the HW to create kernel devices. Here we need a VFIO device. They can't co-exist, if you switch mlx5 from normal to vfio you have to tear down the entire normal driver. > > > E.g. for this case there should be no new vfio-virtio device, but > > > instead you should be able to switch the virtio device to an > > > fake-legacy vfio mode. > > > > Are you aruging about how we reach to vfio_register_XX() and what > > directory the file lives? > > No. That layout logically follows from what codebase the functionality > is part of, though. I don't understand what we are talking about really. Where do you imagine the vfio_register_XX() goes? > > I don't know what "fake-legacy" even means, VFIO is not legacy. > > The driver we're talking about in this thread fakes up a virtio_pci > legacy devie to the guest on top of a "modern" virtio_pci device. I'm not sure I'd use the word fake, inb/outb are always trapped operations in VMs. If the device provided a real IO BAR then VFIO common code would trap and relay inb/outb to the device. All this is doing is changing the inb/outb relay from using a physical IO BAR to a DMA command ring. The motivation is simply because normal IO BAR space is incredibly limited and you can't get enough SRIOV functions when using it. 
> > There is alot of code in VFIO and the VMM side to take a VF and turn > > it into a vPCI function. You can't just trivially duplicate VFIO in a > > dozen drivers without creating a giant mess. > > I do not advocate for duplicating it. But the code that calls this > functionality belongs into the driver that deals with the compound > device that we're doing this work for. On one hand, I don't really care - we can put the code where people like. However - the Intel GPU VFIO driver is such a bad experience I don't want to encourage people to make VFIO drivers, or code that is only used by VFIO drivers, that are not under drivers/vfio review. > > Further, userspace wants consistent ways to operate this stuff. If we > > need a dozen ways to activate VFIO for every kind of driver that is > > not a positive direction. > > We don't need a dozen ways. We just need a single attribute on the > pci (or $OTHERBUS) devide that switches it to vfio mode. Well, we sort of do these days, it is just a convoluted bind thing. Realistically switching modes requires unprobing the entire normal VF driver. Having this be linked to the driver core probe/unprobe flows is a good code reuse thing, IMHO. We already spent a lot of effort making this quite general from the userspace perspective. Nobody yet came up with an idea to avoid the ugly unbind/bind flow. Be aware, there is a significant performance concern here. If you want to create 1000 VFIO devices (this is a real thing), we *can't* probe a normal driver first, it is too slow. We need a path that goes directly from creating the RIDs to turning those RIDs into VFIO. mlx5 takes *seconds* to complete its normal probe. We must avoid this. Looking a few years into the future, with SIOVr1/2, the flow I want to target is some uAPI commands: 'create a PCI RID with params XYZ and attach a normal/VFIO/etc driver' 'destroy a PCI RID' We need to get away from this scheme of SRIOV where you bulk create a bunch of empty VFs at one time and then have to somehow provision them. Jason
On Tue, Oct 10, 2023 at 10:10:31AM -0300, Jason Gunthorpe wrote: > > > There is alot of code in VFIO and the VMM side to take a VF and turn > > > it into a vPCI function. You can't just trivially duplicate VFIO in a > > > dozen drivers without creating a giant mess. > > > > I do not advocate for duplicating it. But the code that calls this > > functionality belongs into the driver that deals with the compound > > device that we're doing this work for. > > On one hand, I don't really care - we can put the code where people > like. > > However - the Intel GPU VFIO driver is such a bad experiance I don't > want to encourage people to make VFIO drivers, or code that is only > used by VFIO drivers, that are not under drivers/vfio review. So if Alex feels it makes sense to add some virtio functionality to vfio and is happy to maintain or let you maintain the UAPI then why would I say no? But we never expected devices to have two drivers like this does, so just exposing device pointer and saying "use regular virtio APIs for the rest" does not cut it, the new APIs have to make sense so virtio drivers can develop normally without fear of stepping on the toes of this admin driver.
On Tue, Oct 10, 2023 at 09:56:00AM -0400, Michael S. Tsirkin wrote: > > However - the Intel GPU VFIO driver is such a bad experiance I don't > > want to encourage people to make VFIO drivers, or code that is only > > used by VFIO drivers, that are not under drivers/vfio review. > > So if Alex feels it makes sense to add some virtio functionality > to vfio and is happy to maintain or let you maintain the UAPI > then why would I say no? But we never expected devices to have > two drivers like this does, so just exposing device pointer > and saying "use regular virtio APIs for the rest" does not > cut it, the new APIs have to make sense > so virtio drivers can develop normally without fear of stepping > on the toes of this admin driver. Please work with Yishai to get something that make sense to you. He can post a v2 with the accumulated comments addressed so far and then go over what the API between the drivers is. Thanks, Jason
On Tue, Oct 10, 2023 at 11:08:49AM -0300, Jason Gunthorpe wrote: > On Tue, Oct 10, 2023 at 09:56:00AM -0400, Michael S. Tsirkin wrote: > > > > However - the Intel GPU VFIO driver is such a bad experiance I don't > > > want to encourage people to make VFIO drivers, or code that is only > > > used by VFIO drivers, that are not under drivers/vfio review. > > > > So if Alex feels it makes sense to add some virtio functionality > > to vfio and is happy to maintain or let you maintain the UAPI > > then why would I say no? But we never expected devices to have > > two drivers like this does, so just exposing device pointer > > and saying "use regular virtio APIs for the rest" does not > > cut it, the new APIs have to make sense > > so virtio drivers can develop normally without fear of stepping > > on the toes of this admin driver. > > Please work with Yishai to get something that make sense to you. He > can post a v2 with the accumulated comments addressed so far and then > go over what the API between the drivers is. > > Thanks, > Jason /me shrugs. I pretty much posted suggestions already. Should not be hard. Anything unclear - post on list.
On 10/10/2023 17:54, Michael S. Tsirkin wrote: > On Tue, Oct 10, 2023 at 11:08:49AM -0300, Jason Gunthorpe wrote: >> On Tue, Oct 10, 2023 at 09:56:00AM -0400, Michael S. Tsirkin wrote: >> >>>> However - the Intel GPU VFIO driver is such a bad experiance I don't >>>> want to encourage people to make VFIO drivers, or code that is only >>>> used by VFIO drivers, that are not under drivers/vfio review. >>> So if Alex feels it makes sense to add some virtio functionality >>> to vfio and is happy to maintain or let you maintain the UAPI >>> then why would I say no? But we never expected devices to have >>> two drivers like this does, so just exposing device pointer >>> and saying "use regular virtio APIs for the rest" does not >>> cut it, the new APIs have to make sense >>> so virtio drivers can develop normally without fear of stepping >>> on the toes of this admin driver. >> Please work with Yishai to get something that make sense to you. He >> can post a v2 with the accumulated comments addressed so far and then >> go over what the API between the drivers is. >> >> Thanks, >> Jason > /me shrugs. I pretty much posted suggestions already. Should not be hard. > Anything unclear - post on list. > Yes, this is the plan. We are working to address the comments that we got so far in both VFIO & VIRTIO, retest and send the next version. Re the API between the modules, it looks like we have the below alternatives. 1) Proceed with the current approach where we exposed a generic API to execute any admin command, however, make it much more solid inside VIRTIO. 2) Expose extra APIs from VIRTIO for commands where we can anticipate future client usage, such as LIST_QUERY/LIST_USE, however still have the generic execute admin command for others. 3) Expose an API per command from VIRTIO and fully drop the generic execute admin command. A few notes: Option #1 looks like the most generic one; it drops the need to expose multiple symbols / APIs per command, and for now we have a single client for them (i.e. VFIO). Options #2 & #3 may still require exposing the virtio_pci_vf_get_pf_dev() API to let VFIO get the VIRTIO PF (struct virtio_device *) from its PCI device; each command will get it as its first argument. Michael, What do you suggest here? Thanks, Yishai
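[Editorial sketch] For readers following along, the three alternatives could be summarized roughly as the C prototypes below. These are illustrative only: apart from virtio_pci_vf_get_pf_dev(), which is mentioned above, the function and structure names are hypothetical, and whether the handle passed in should be the PF's virtio_device or the VF's pci_dev is exactly what the following messages debate.

#include <linux/types.h>

struct virtio_device;
struct virtio_admin_cmd_req;	/* hypothetical: opcode, group member id, data/result buffers */

/* Option 1: a single generic "execute any admin command" entry point. */
int virtio_admin_cmd_exec_generic(struct virtio_device *pf_vdev,
				  struct virtio_admin_cmd_req *req);

/*
 * Option 2: dedicated helpers for commands that may gain more users
 * (e.g. LIST_QUERY/LIST_USE), with the generic execute kept for the rest.
 */
int virtio_admin_cmd_list_query(struct virtio_device *pf_vdev, u8 *buf, int buf_size);
int virtio_admin_cmd_list_use(struct virtio_device *pf_vdev, u8 *buf, int buf_size);

/* Option 3: one exported helper per admin command, no generic execute at all. */
int virtio_admin_legacy_io_write(struct virtio_device *pf_vdev, u64 group_member_id,
				 u8 offset, u8 size, const u8 *buf);
int virtio_admin_legacy_io_read(struct virtio_device *pf_vdev, u64 group_member_id,
				u8 offset, u8 size, u8 *buf);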
On Tue, Oct 10, 2023 at 06:09:44PM +0300, Yishai Hadas wrote: > On 10/10/2023 17:54, Michael S. Tsirkin wrote: > > On Tue, Oct 10, 2023 at 11:08:49AM -0300, Jason Gunthorpe wrote: > > > On Tue, Oct 10, 2023 at 09:56:00AM -0400, Michael S. Tsirkin wrote: > > > > > > > > However - the Intel GPU VFIO driver is such a bad experiance I don't > > > > > want to encourage people to make VFIO drivers, or code that is only > > > > > used by VFIO drivers, that are not under drivers/vfio review. > > > > So if Alex feels it makes sense to add some virtio functionality > > > > to vfio and is happy to maintain or let you maintain the UAPI > > > > then why would I say no? But we never expected devices to have > > > > two drivers like this does, so just exposing device pointer > > > > and saying "use regular virtio APIs for the rest" does not > > > > cut it, the new APIs have to make sense > > > > so virtio drivers can develop normally without fear of stepping > > > > on the toes of this admin driver. > > > Please work with Yishai to get something that make sense to you. He > > > can post a v2 with the accumulated comments addressed so far and then > > > go over what the API between the drivers is. > > > > > > Thanks, > > > Jason > > /me shrugs. I pretty much posted suggestions already. Should not be hard. > > Anything unclear - post on list. > > > Yes, this is the plan. > > We are working to address the comments that we got so far in both VFIO & > VIRTIO, retest and send the next version. > > Re the API between the modules, It looks like we have the below > alternatives. > > 1) Proceed with current approach where we exposed a generic API to execute > any admin command, however, make it much more solid inside VIRTIO. > 2) Expose extra APIs from VIRTIO for commands that we can consider future > client usage of them as of LIST_QUERY/LIST_USE, however still have the > generic execute admin command for others. > 3) Expose API per command from VIRTIO and fully drop the generic execute > admin command. > > Few notes: > Option #1 looks the most generic one, it drops the need to expose multiple > symbols / APIs per command and for now we have a single client for them > (i.e. VFIO). > Options #2 & #3, may still require to expose the virtio_pci_vf_get_pf_dev() > API to let VFIO get the VIRTIO PF (struct virtio_device *) from its PCI > device, each command will get it as its first argument. > > Michael, > What do you suggest here ? > > Thanks, > Yishai I suggest 3 but call it on the VF. commands will switch to PF internally as needed. For example, intel might be interested in exposing admin commands through a memory BAR of VF itself.
On 10/10/2023 18:14, Michael S. Tsirkin wrote: > On Tue, Oct 10, 2023 at 06:09:44PM +0300, Yishai Hadas wrote: >> On 10/10/2023 17:54, Michael S. Tsirkin wrote: >>> On Tue, Oct 10, 2023 at 11:08:49AM -0300, Jason Gunthorpe wrote: >>>> On Tue, Oct 10, 2023 at 09:56:00AM -0400, Michael S. Tsirkin wrote: >>>> >>>>>> However - the Intel GPU VFIO driver is such a bad experiance I don't >>>>>> want to encourage people to make VFIO drivers, or code that is only >>>>>> used by VFIO drivers, that are not under drivers/vfio review. >>>>> So if Alex feels it makes sense to add some virtio functionality >>>>> to vfio and is happy to maintain or let you maintain the UAPI >>>>> then why would I say no? But we never expected devices to have >>>>> two drivers like this does, so just exposing device pointer >>>>> and saying "use regular virtio APIs for the rest" does not >>>>> cut it, the new APIs have to make sense >>>>> so virtio drivers can develop normally without fear of stepping >>>>> on the toes of this admin driver. >>>> Please work with Yishai to get something that make sense to you. He >>>> can post a v2 with the accumulated comments addressed so far and then >>>> go over what the API between the drivers is. >>>> >>>> Thanks, >>>> Jason >>> /me shrugs. I pretty much posted suggestions already. Should not be hard. >>> Anything unclear - post on list. >>> >> Yes, this is the plan. >> >> We are working to address the comments that we got so far in both VFIO & >> VIRTIO, retest and send the next version. >> >> Re the API between the modules, It looks like we have the below >> alternatives. >> >> 1) Proceed with current approach where we exposed a generic API to execute >> any admin command, however, make it much more solid inside VIRTIO. >> 2) Expose extra APIs from VIRTIO for commands that we can consider future >> client usage of them as of LIST_QUERY/LIST_USE, however still have the >> generic execute admin command for others. >> 3) Expose API per command from VIRTIO and fully drop the generic execute >> admin command. >> >> Few notes: >> Option #1 looks the most generic one, it drops the need to expose multiple >> symbols / APIs per command and for now we have a single client for them >> (i.e. VFIO). >> Options #2 & #3, may still require to expose the virtio_pci_vf_get_pf_dev() >> API to let VFIO get the VIRTIO PF (struct virtio_device *) from its PCI >> device, each command will get it as its first argument. >> >> Michael, >> What do you suggest here ? >> >> Thanks, >> Yishai > I suggest 3 but call it on the VF. commands will switch to PF > internally as needed. For example, intel might be interested in exposing > admin commands through a memory BAR of VF itself. > The driver who owns the VF is VFIO, it's not a VIRTIO one. The ability to get the VIRTIO PF is from the PCI device (i.e. struct pci_dev). In addition, virtio_pci_vf_get_pf_dev() was implemented for now in virtio-pci as it worked on pci_dev. Assuming that we'll put each command inside virtio as the generic layer, we won't be able to call/use this API internally to get the PF as of cyclic dependencies between the modules, link will fail. So in option #3 we may still need to get outside into VFIO the VIRTIO PF and give it as pointer to VIRTIO upon each command. Does it work for you ? Yishai
> From: Yishai Hadas <yishaih@nvidia.com> > Sent: Tuesday, October 10, 2023 9:14 PM > > On 10/10/2023 18:14, Michael S. Tsirkin wrote: > > On Tue, Oct 10, 2023 at 06:09:44PM +0300, Yishai Hadas wrote: > >> On 10/10/2023 17:54, Michael S. Tsirkin wrote: > >>> On Tue, Oct 10, 2023 at 11:08:49AM -0300, Jason Gunthorpe wrote: > >>>> On Tue, Oct 10, 2023 at 09:56:00AM -0400, Michael S. Tsirkin wrote: > >>>> > >>>>>> However - the Intel GPU VFIO driver is such a bad experiance I > >>>>>> don't want to encourage people to make VFIO drivers, or code that > >>>>>> is only used by VFIO drivers, that are not under drivers/vfio review. > >>>>> So if Alex feels it makes sense to add some virtio functionality > >>>>> to vfio and is happy to maintain or let you maintain the UAPI then > >>>>> why would I say no? But we never expected devices to have two > >>>>> drivers like this does, so just exposing device pointer and saying > >>>>> "use regular virtio APIs for the rest" does not cut it, the new > >>>>> APIs have to make sense so virtio drivers can develop normally > >>>>> without fear of stepping on the toes of this admin driver. > >>>> Please work with Yishai to get something that make sense to you. He > >>>> can post a v2 with the accumulated comments addressed so far and > >>>> then go over what the API between the drivers is. > >>>> > >>>> Thanks, > >>>> Jason > >>> /me shrugs. I pretty much posted suggestions already. Should not be hard. > >>> Anything unclear - post on list. > >>> > >> Yes, this is the plan. > >> > >> We are working to address the comments that we got so far in both > >> VFIO & VIRTIO, retest and send the next version. > >> > >> Re the API between the modules, It looks like we have the below > >> alternatives. > >> > >> 1) Proceed with current approach where we exposed a generic API to > >> execute any admin command, however, make it much more solid inside > VIRTIO. > >> 2) Expose extra APIs from VIRTIO for commands that we can consider > >> future client usage of them as of LIST_QUERY/LIST_USE, however still > >> have the generic execute admin command for others. > >> 3) Expose API per command from VIRTIO and fully drop the generic > >> execute admin command. > >> > >> Few notes: > >> Option #1 looks the most generic one, it drops the need to expose > >> multiple symbols / APIs per command and for now we have a single > >> client for them (i.e. VFIO). > >> Options #2 & #3, may still require to expose the > >> virtio_pci_vf_get_pf_dev() API to let VFIO get the VIRTIO PF (struct > >> virtio_device *) from its PCI device, each command will get it as its first > argument. > >> > >> Michael, > >> What do you suggest here ? > >> > >> Thanks, > >> Yishai > > I suggest 3 but call it on the VF. commands will switch to PF > > internally as needed. For example, intel might be interested in > > exposing admin commands through a memory BAR of VF itself. > > > The driver who owns the VF is VFIO, it's not a VIRTIO one. > > The ability to get the VIRTIO PF is from the PCI device (i.e. struct pci_dev). > > In addition, > virtio_pci_vf_get_pf_dev() was implemented for now in virtio-pci as it worked > on pci_dev. > Assuming that we'll put each command inside virtio as the generic layer, we > won't be able to call/use this API internally to get the PF as of cyclic > dependencies between the modules, link will fail. > > So in option #3 we may still need to get outside into VFIO the VIRTIO PF and > give it as pointer to VIRTIO upon each command. 
I think for #3 the virtio-level API signature should be: virtio_admin_legacy_xyz_cmd(struct virtio_device *dev, u64 group_member_id, ....); This maintains the right abstraction needed between the vfio, generic virtio and virtio transport (pci) layers.
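[Editorial sketch] As a rough illustration of the layering Parav describes (vfio -> generic virtio -> transport), a per-command core helper might look something like the code below. This is only a sketch under the assumption that the transport registers some admin-command hook with the core; virtio_transport_admin_legacy_write() is a hypothetical stand-in for that hook, not an existing virtio interface.

#include <linux/types.h>
#include <linux/errno.h>

struct virtio_device;

/*
 * Hypothetical transport hook: virtio_pci would implement this on top of
 * its admin virtqueue; other transports could plug in their own mechanism.
 */
int virtio_transport_admin_legacy_write(struct virtio_device *pf_vdev,
					u64 group_member_id, u8 offset,
					u8 size, const u8 *buf);

/* Generic virtio layer: no PCI knowledge, only argument checks and dispatch. */
int virtio_admin_legacy_xyz_cmd(struct virtio_device *dev, u64 group_member_id,
				u8 offset, u8 size, const u8 *buf)
{
	if (!size || !buf)
		return -EINVAL;

	return virtio_transport_admin_legacy_write(dev, group_member_id,
						   offset, size, buf);
}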
On Tue, Oct 10, 2023 at 06:43:32PM +0300, Yishai Hadas wrote: > On 10/10/2023 18:14, Michael S. Tsirkin wrote: > > On Tue, Oct 10, 2023 at 06:09:44PM +0300, Yishai Hadas wrote: > > > On 10/10/2023 17:54, Michael S. Tsirkin wrote: > > > > On Tue, Oct 10, 2023 at 11:08:49AM -0300, Jason Gunthorpe wrote: > > > > > On Tue, Oct 10, 2023 at 09:56:00AM -0400, Michael S. Tsirkin wrote: > > > > > > > > > > > > However - the Intel GPU VFIO driver is such a bad experiance I don't > > > > > > > want to encourage people to make VFIO drivers, or code that is only > > > > > > > used by VFIO drivers, that are not under drivers/vfio review. > > > > > > So if Alex feels it makes sense to add some virtio functionality > > > > > > to vfio and is happy to maintain or let you maintain the UAPI > > > > > > then why would I say no? But we never expected devices to have > > > > > > two drivers like this does, so just exposing device pointer > > > > > > and saying "use regular virtio APIs for the rest" does not > > > > > > cut it, the new APIs have to make sense > > > > > > so virtio drivers can develop normally without fear of stepping > > > > > > on the toes of this admin driver. > > > > > Please work with Yishai to get something that make sense to you. He > > > > > can post a v2 with the accumulated comments addressed so far and then > > > > > go over what the API between the drivers is. > > > > > > > > > > Thanks, > > > > > Jason > > > > /me shrugs. I pretty much posted suggestions already. Should not be hard. > > > > Anything unclear - post on list. > > > > > > > Yes, this is the plan. > > > > > > We are working to address the comments that we got so far in both VFIO & > > > VIRTIO, retest and send the next version. > > > > > > Re the API between the modules, It looks like we have the below > > > alternatives. > > > > > > 1) Proceed with current approach where we exposed a generic API to execute > > > any admin command, however, make it much more solid inside VIRTIO. > > > 2) Expose extra APIs from VIRTIO for commands that we can consider future > > > client usage of them as of LIST_QUERY/LIST_USE, however still have the > > > generic execute admin command for others. > > > 3) Expose API per command from VIRTIO and fully drop the generic execute > > > admin command. > > > > > > Few notes: > > > Option #1 looks the most generic one, it drops the need to expose multiple > > > symbols / APIs per command and for now we have a single client for them > > > (i.e. VFIO). > > > Options #2 & #3, may still require to expose the virtio_pci_vf_get_pf_dev() > > > API to let VFIO get the VIRTIO PF (struct virtio_device *) from its PCI > > > device, each command will get it as its first argument. > > > > > > Michael, > > > What do you suggest here ? > > > > > > Thanks, > > > Yishai > > I suggest 3 but call it on the VF. commands will switch to PF > > internally as needed. For example, intel might be interested in exposing > > admin commands through a memory BAR of VF itself. > > > The driver who owns the VF is VFIO, it's not a VIRTIO one. > > The ability to get the VIRTIO PF is from the PCI device (i.e. struct > pci_dev). > > In addition, > virtio_pci_vf_get_pf_dev() was implemented for now in virtio-pci as it > worked on pci_dev. On pci_dev of vf, yes? So again just move this into each command, that's all. I.e. pass pci_dev to each. 
> Assuming that we'll put each command inside virtio as the generic layer, we > won't be able to call/use this API internally to get the PF as of cyclic > dependencies between the modules, link will fail. > > So in option #3 we may still need to get outside into VFIO the VIRTIO PF and > give it as pointer to VIRTIO upon each command. > > Does it work for you ? > > Yishai
On Tue, Oct 10, 2023 at 11:14:56AM -0400, Michael S. Tsirkin wrote: > I suggest 3 but call it on the VF. commands will switch to PF > internally as needed. For example, intel might be interested in exposing > admin commands through a memory BAR of VF itself. FWIW, we have been pushing back on such things in VFIO, so it will have to be very carefully security justified. Probably, since that is not standard, it should just live under some intel-only vfio driver behavior, not in virtio land. It is also costly to switch between pf/vf; it should not be done pointlessly on the fast path. Jason
On Tue, Oct 10, 2023 at 12:59:37PM -0300, Jason Gunthorpe wrote: > On Tue, Oct 10, 2023 at 11:14:56AM -0400, Michael S. Tsirkin wrote: > > > I suggest 3 but call it on the VF. commands will switch to PF > > internally as needed. For example, intel might be interested in exposing > > admin commands through a memory BAR of VF itself. > > FWIW, we have been pushing back on such things in VFIO, so it will > have to be very carefully security justified. > > Probably since that is not standard it should just live in under some > intel-only vfio driver behavior, not in virtio land. > > It is also costly to switch between pf/vf, it should not be done > pointlessly on the fast path. > > Jason Currently, the switch seems to be just a cast of private data. I am suggesting keeping that cast inside virtio. Why is that expensive?
On Tue, Oct 10, 2023 at 12:03:29PM -0400, Michael S. Tsirkin wrote: > On Tue, Oct 10, 2023 at 12:59:37PM -0300, Jason Gunthorpe wrote: > > On Tue, Oct 10, 2023 at 11:14:56AM -0400, Michael S. Tsirkin wrote: > > > > > I suggest 3 but call it on the VF. commands will switch to PF > > > internally as needed. For example, intel might be interested in exposing > > > admin commands through a memory BAR of VF itself. > > > > FWIW, we have been pushing back on such things in VFIO, so it will > > have to be very carefully security justified. > > > > Probably since that is not standard it should just live in under some > > intel-only vfio driver behavior, not in virtio land. > > > > It is also costly to switch between pf/vf, it should not be done > > pointlessly on the fast path. > > > > Jason > > Currently, the switch seems to be just a cast of private data. > I am suggesting keeping that cast inside virtio. Why is that > expensive? pci_iov_get_pf_drvdata() does a bunch of sanity checks and function calls. It was not intended to be used on a fast path. Jason
On 10/10/2023 18:58, Michael S. Tsirkin wrote: > On Tue, Oct 10, 2023 at 06:43:32PM +0300, Yishai Hadas wrote: >> On 10/10/2023 18:14, Michael S. Tsirkin wrote: >>> On Tue, Oct 10, 2023 at 06:09:44PM +0300, Yishai Hadas wrote: >>>> On 10/10/2023 17:54, Michael S. Tsirkin wrote: >>>>> On Tue, Oct 10, 2023 at 11:08:49AM -0300, Jason Gunthorpe wrote: >>>>>> On Tue, Oct 10, 2023 at 09:56:00AM -0400, Michael S. Tsirkin wrote: >>>>>> >>>>>>>> However - the Intel GPU VFIO driver is such a bad experiance I don't >>>>>>>> want to encourage people to make VFIO drivers, or code that is only >>>>>>>> used by VFIO drivers, that are not under drivers/vfio review. >>>>>>> So if Alex feels it makes sense to add some virtio functionality >>>>>>> to vfio and is happy to maintain or let you maintain the UAPI >>>>>>> then why would I say no? But we never expected devices to have >>>>>>> two drivers like this does, so just exposing device pointer >>>>>>> and saying "use regular virtio APIs for the rest" does not >>>>>>> cut it, the new APIs have to make sense >>>>>>> so virtio drivers can develop normally without fear of stepping >>>>>>> on the toes of this admin driver. >>>>>> Please work with Yishai to get something that make sense to you. He >>>>>> can post a v2 with the accumulated comments addressed so far and then >>>>>> go over what the API between the drivers is. >>>>>> >>>>>> Thanks, >>>>>> Jason >>>>> /me shrugs. I pretty much posted suggestions already. Should not be hard. >>>>> Anything unclear - post on list. >>>>> >>>> Yes, this is the plan. >>>> >>>> We are working to address the comments that we got so far in both VFIO & >>>> VIRTIO, retest and send the next version. >>>> >>>> Re the API between the modules, It looks like we have the below >>>> alternatives. >>>> >>>> 1) Proceed with current approach where we exposed a generic API to execute >>>> any admin command, however, make it much more solid inside VIRTIO. >>>> 2) Expose extra APIs from VIRTIO for commands that we can consider future >>>> client usage of them as of LIST_QUERY/LIST_USE, however still have the >>>> generic execute admin command for others. >>>> 3) Expose API per command from VIRTIO and fully drop the generic execute >>>> admin command. >>>> >>>> Few notes: >>>> Option #1 looks the most generic one, it drops the need to expose multiple >>>> symbols / APIs per command and for now we have a single client for them >>>> (i.e. VFIO). >>>> Options #2 & #3, may still require to expose the virtio_pci_vf_get_pf_dev() >>>> API to let VFIO get the VIRTIO PF (struct virtio_device *) from its PCI >>>> device, each command will get it as its first argument. >>>> >>>> Michael, >>>> What do you suggest here ? >>>> >>>> Thanks, >>>> Yishai >>> I suggest 3 but call it on the VF. commands will switch to PF >>> internally as needed. For example, intel might be interested in exposing >>> admin commands through a memory BAR of VF itself. >>> >> The driver who owns the VF is VFIO, it's not a VIRTIO one. >> >> The ability to get the VIRTIO PF is from the PCI device (i.e. struct >> pci_dev). >> >> In addition, >> virtio_pci_vf_get_pf_dev() was implemented for now in virtio-pci as it >> worked on pci_dev. > On pci_dev of vf, yes? So again just move this into each command, > that's all. I.e. pass pci_dev to each. How about the cyclic dependencies issue inside VIRTIO that I mentioned below ? In my suggestion it's fine, VFIO will get the PF and give it to VIRTIO per command. 
Yishai >> Assuming that we'll put each command inside virtio as the generic layer, we >> won't be able to call/use this API internally to get the PF as of cyclic >> dependencies between the modules, link will fail. >> >> So in option #3 we may still need to get outside into VFIO the VIRTIO PF and >> give it as pointer to VIRTIO upon each command. >> >> Does it work for you ? >> >> Yishai
> From: Jason Gunthorpe <jgg@nvidia.com> > Sent: Tuesday, October 10, 2023 9:37 PM > > On Tue, Oct 10, 2023 at 12:03:29PM -0400, Michael S. Tsirkin wrote: > > On Tue, Oct 10, 2023 at 12:59:37PM -0300, Jason Gunthorpe wrote: > > > On Tue, Oct 10, 2023 at 11:14:56AM -0400, Michael S. Tsirkin wrote: > > > > > > > I suggest 3 but call it on the VF. commands will switch to PF > > > > internally as needed. For example, intel might be interested in > > > > exposing admin commands through a memory BAR of VF itself. If in the future one does an admin command on the VF memory BAR, there is no need for the cast either. The vfio-virtio-pci driver can do it on the pci vf device directly. (though per-VF memory registers would be an anti-scale design for real hw; to discuss in another forum).
On Tue, Oct 10, 2023 at 04:21:15PM +0000, Parav Pandit wrote: > > > From: Jason Gunthorpe <jgg@nvidia.com> > > Sent: Tuesday, October 10, 2023 9:37 PM > > > > On Tue, Oct 10, 2023 at 12:03:29PM -0400, Michael S. Tsirkin wrote: > > > On Tue, Oct 10, 2023 at 12:59:37PM -0300, Jason Gunthorpe wrote: > > > > On Tue, Oct 10, 2023 at 11:14:56AM -0400, Michael S. Tsirkin wrote: > > > > > > > > > I suggest 3 but call it on the VF. commands will switch to PF > > > > > internally as needed. For example, intel might be interested in > > > > > exposing admin commands through a memory BAR of VF itself. > > If in the future if one does admin command on the VF memory BAR, there is no need of cast either. > vfio-virtio-pci driver can do on the pci vf device directly. this is why I want the API to get the VF pci device as a parameter. I don't get what is cyclic about it, yet. > (though per VF memory registers would be anti-scale design for real hw; to discuss in other forum). up to hardware vendor really.
On Tue, Oct 10, 2023 at 07:09:08PM +0300, Yishai Hadas wrote: > > > > Assuming that we'll put each command inside virtio as the generic layer, we > > > won't be able to call/use this API internally to get the PF as of cyclic > > > dependencies between the modules, link will fail. I just mean: virtio_admin_legacy_io_write(struct pci_device *, ....) internally it starts from the vf, gets the pf (or the vf itself or whatever the transport is), sends the command, gets the status and returns. What is cyclic here?
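[Editorial sketch] To make the shape of that suggestion concrete, a pci_dev-based entry point living in the transport might look like the code below. Only virtio_pci_vf_get_pf_dev() was named earlier in the thread and pci_iov_vf_id() is the existing PCI core helper for the VF index; the other helper and the exact signatures are hypothetical, and the +1 assumes SR-IOV group members are identified by their 1-based VF number.

#include <linux/pci.h>
#include <linux/errno.h>

struct virtio_device;

/* Named earlier in the thread: VF pci_dev -> owning PF virtio_device. */
struct virtio_device *virtio_pci_vf_get_pf_dev(struct pci_dev *pdev);

/* Hypothetical core helper that actually queues the command on the PF. */
int virtio_admin_legacy_io_write_owner(struct virtio_device *pf_vdev,
				       u64 group_member_id, u8 offset,
				       u8 size, const u8 *buf);

/*
 * Sketch of Michael's variant: the caller only hands in the VF's pci_dev and
 * the transport resolves the owner and the group member id internally.
 */
int virtio_admin_legacy_io_write(struct pci_dev *vf_pdev, u8 offset, u8 size,
				 const u8 *buf)
{
	struct virtio_device *pf_vdev = virtio_pci_vf_get_pf_dev(vf_pdev);
	int vf_id = pci_iov_vf_id(vf_pdev);	/* 0-based VF index */

	if (!pf_vdev)
		return -ENOTCONN;
	if (vf_id < 0)
		return vf_id;

	return virtio_admin_legacy_io_write_owner(pf_vdev, vf_id + 1, offset,
						  size, buf);
}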
On Tue, Oct 10, 2023 at 06:43:32PM +0300, Yishai Hadas wrote: > > I suggest 3 but call it on the VF. commands will switch to PF > > internally as needed. For example, intel might be interested in exposing > > admin commands through a memory BAR of VF itself. > > > The driver who owns the VF is VFIO, it's not a VIRTIO one. And to loop back into my previous discussion: that's the fundamental problem here. If it is owned by the virtio subsystem, which just calls into vfio, we would not have this problem, including the circular loops and exposed APIs.
On Tue, Oct 10, 2023 at 12:59:37PM -0300, Jason Gunthorpe wrote: > On Tue, Oct 10, 2023 at 11:14:56AM -0400, Michael S. Tsirkin wrote: > > > I suggest 3 but call it on the VF. commands will switch to PF > > internally as needed. For example, intel might be interested in exposing > > admin commands through a memory BAR of VF itself. > > FWIW, we have been pushing back on such things in VFIO, so it will > have to be very carefully security justified. > > Probably since that is not standard it should just live in under some > intel-only vfio driver behavior, not in virtio land. Btw, what is that intel thing everyone is talking about? And why would the virtio core support vendor specific behavior like that?
On Tue, Oct 10, 2023 at 10:10:31AM -0300, Jason Gunthorpe wrote: > We've talked around ideas like allowing the VF config space to do some > of the work. For simple devices we could get away with 1 VF config > space register. (VF config space is owned by the hypervisor, not the > guest) Which assumes you're actually using VFs and not multiple PFs, which is a very limiting assumption. It also keeps you from actually using DMA during the live migration process, which again is a major limitation once you have a non-trivial amount of state. > SIOVr2 is discussing more a flexible RID mapping - there is a possible > route where a "VF" could actually have two RIDs, a hypervisor RID and a > guest RID. Well, then you go down the SIOV route, which requires a complex driver actually presenting the guest visible device anyway. > It really is PCI limitations that force this design of making a PF > driver do dual duty as a fully functionally normal device and act as a > communication channel proxy to make a back channel into a SRIOV VF. > > My view has always been that the VFIO live migration operations are > executed logically within the VF as they only effect the VF. > > So we have a logical design seperation where VFIO world owns the > commands and the PF driver supplies the communication channel. This > works well for devices that already have a robust RPC interface to > their device FW. Independent of my above points on the doubts about VF-controlled live migration for PCIe devices, I absolutely agree with you that the Linux abstraction and user interface should be VF based. Which further reinforces my point that the VFIO driver for the controlled function (PF or VF) and the Linux driver for the controlling function (better be a PF in practice) must be very tightly integrated. And the best way to do that is to export the vfio nodes from the Linux driver that knows the hardware and not split it out into a separate one. > > The driver that knows this hardware. In this case the virtio subsystem, > > in case of nvme the nvme driver, and in case of mlx5 the mlx5 driver. > > But those are drivers operating the HW to create kernel devices. Here > we need a VFIO device. They can't co-exist, if you switch mlx5 from > normal to vfio you have to tear down the entire normal driver. Yes, absolutely. And if we're smart enough we structure it in a way that we never even initialize the bits of the driver only needed for the normal kernel consumers. > > No. That layout logically follows from what codebase the functionality > > is part of, though. > > I don't understand what we are talking about really. Where do you > imagine the vfio_register_XX() goes? In the driver controlling the hardware. E.g. for virtio in drivers/virtio/ and for nvme in drivers/nvme/ and for mlx5 in the mlx5 driver directory. > > > I don't know what "fake-legacy" even means, VFIO is not legacy. > > > > The driver we're talking about in this thread fakes up a virtio_pci > > legacy devie to the guest on top of a "modern" virtio_pci device. > > I'm not sure I'd use the word fake, inb/outb are always trapped > operations in VMs. If the device provided a real IO BAR then VFIO > common code would trap and relay inb/outb to the device. > > All this is doing is changing the inb/outb relay from using a physical > IO BAR to a DMA command ring. > > The motivation is simply because normal IO BAR space is incredibly > limited and you can't get enough SRIOV functions when using it. The fake is not meant as a judgement. 
But it creates a virtio-legacy device that in this form does not exist in hardware. That's what I call fake. If you prefer a different term that's fine with me too. > > > There is alot of code in VFIO and the VMM side to take a VF and turn > > > it into a vPCI function. You can't just trivially duplicate VFIO in a > > > dozen drivers without creating a giant mess. > > > > I do not advocate for duplicating it. But the code that calls this > > functionality belongs into the driver that deals with the compound > > device that we're doing this work for. > > On one hand, I don't really care - we can put the code where people > like. > > However - the Intel GPU VFIO driver is such a bad experiance I don't > want to encourage people to make VFIO drivers, or code that is only > used by VFIO drivers, that are not under drivers/vfio review. We can and should require vfio review for users of the vfio API. But to be honest code placement was not the problem with i915. The problem was that the mdev APIs (under drivers/vfio) were a complete trainwreck when they were written, and that the driver had a horrible hypervisor API abstraction. > Be aware, there is a significant performance concern here. If you want > to create 1000 VFIO devices (this is a real thing), we *can't* probe a > normal driver first, it is too slow. We need a path that goes directly > from creating the RIDs to turning those RIDs into VFIO. And by calling the vfio functions from mlx5 you get this easily. But I think you're totally mixing things up here anyway. For mdev/SIOV-like flows you must call vfio APIs from the main driver anyway, as there is no pci_dev to probe on. That's what i915 does btw. For "classic" vfio that requires a pci_dev (or $otherbus_dev) we need to have a similar flow. And I think the best way is to have the bus-level attribute on the device and/or a device-specific side band protocol to decide how new functions are probed. With that you avoid all the duplicate PCI IDs for the binding, and actually allow sanely establishing a communication channel between the functions. Because without that there is no way to know how any two functions are related. The driver might think they know, but there's all kinds of whacky PCI passthrough schemes that will break such a logic.
On Tue, Oct 10, 2023 at 11:13:30PM -0700, Christoph Hellwig wrote: > On Tue, Oct 10, 2023 at 12:59:37PM -0300, Jason Gunthorpe wrote: > > On Tue, Oct 10, 2023 at 11:14:56AM -0400, Michael S. Tsirkin wrote: > > > > > I suggest 3 but call it on the VF. commands will switch to PF > > > internally as needed. For example, intel might be interested in exposing > > > admin commands through a memory BAR of VF itself. > > > > FWIW, we have been pushing back on such things in VFIO, so it will > > have to be very carefully security justified. > > > > Probably since that is not standard it should just live in under some > > intel-only vfio driver behavior, not in virtio land. > > Btw, what is that intel thing everyone is talking about? And why > would the virtio core support vendor specific behavior like that? It's not a thing it's Zhu Lingshan :) intel is just one of the vendors that implemented vdpa support and so Zhu Lingshan from intel is working on vdpa and has also proposed virtio spec extensions for migration. intel's driver is called ifcvf. vdpa composes all this stuff that is added to vfio in userspace, so it's a different approach.
On Wed, Oct 11, 2023 at 02:43:37AM -0400, Michael S. Tsirkin wrote: > > Btw, what is that intel thing everyone is talking about? And why > > would the virtio core support vendor specific behavior like that? > > It's not a thing it's Zhu Lingshan :) intel is just one of the vendors > that implemented vdpa support and so Zhu Lingshan from intel is working > on vdpa and has also proposed virtio spec extensions for migration. > intel's driver is called ifcvf. vdpa composes all this stuff that is > added to vfio in userspace, so it's a different approach. Well, so let's call it virtio live migration instead of intel. And please work all together in the virtio committee so that you have one way of communication between controlling and controlled functions. If one extension does it one way and the other a different way, that's just creating a giant mess.
On 10/10/2023 23:42, Michael S. Tsirkin wrote: > On Tue, Oct 10, 2023 at 07:09:08PM +0300, Yishai Hadas wrote: >>>> Assuming that we'll put each command inside virtio as the generic layer, we >>>> won't be able to call/use this API internally to get the PF as of cyclic >>>> dependencies between the modules, link will fail. > I just mean: > virtio_admin_legacy_io_write(sruct pci_device *, ....) > > > internally it starts from vf gets the pf (or vf itself or whatever > the transport is) sends command gets status returns. > > what is cyclic here? > virtio-pci depends on virtio [1]. If we put the commands in the generic layer as we expect it to be (i.e. virtio), then trying to call internally call for virtio_pci_vf_get_pf_dev() to get the PF from the VF will end-up by a linker cyclic error as of below [2]. As of that, someone can suggest to put the commands in virtio-pci, however this will fully bypass the generic layer of virtio and future clients won't be able to use it. In addition, passing in the VF PCI pointer instead of the VF group member ID + the VIRTIO PF device, will require in the future to duplicate each command once we'll use SIOV devices. Instead, we suggest the below API for the above example. virtio_admin_legacy_io_write(virtio_device *virtio_dev, u64 group_member_id, ....) [1] [yishaih@reg-l-vrt-209 linux]$ modinfo virtio-pci filename: /lib/modules/6.6.0-rc2+/kernel/drivers/virtio/virtio_pci.ko version: 1 license: GPL description: virtio-pci author: Anthony Liguori <aliguori@us.ibm.com> srcversion: 7355EAC9408D38891938391 alias: pci:v00001AF4d*sv*sd*bc*sc*i* depends: virtio_pci_modern_dev,virtio,virtio_ring,virtio_pci_legacy_dev retpoline: Y intree: Y name: virtio_pci vermagic: 6.6.0-rc2+ SMP preempt mod_unload modversions parm: force_legacy:Force legacy mode for transitional virtio 1 devices (bool) [2] depmod: ERROR: Cycle detected: virtio -> virtio_pci -> virtio depmod: ERROR: Found 2 modules in dependency cycles! make[2]: *** [scripts/Makefile.modinst:128: depmod] Error 1 make[1]: *** [/images/yishaih/src/kernel/linux/Makefile:1821: modules_install] Error 2 Yishai
Hi Christoph,

> From: Christoph Hellwig <hch@infradead.org>
> Sent: Wednesday, October 11, 2023 12:29 PM
>
> On Wed, Oct 11, 2023 at 02:43:37AM -0400, Michael S. Tsirkin wrote:
> > > Btw, what is that intel thing everyone is talking about? And why
> > > would the virtio core support vendor specific behavior like that?
> >
> > It's not a thing it's Zhu Lingshan :) intel is just one of the vendors
> > that implemented vdpa support and so Zhu Lingshan from intel is
> > working on vdpa and has also proposed virtio spec extensions for migration.
> > intel's driver is called ifcvf. vdpa composes all this stuff that is
> > added to vfio in userspace, so it's a different approach.
>
> Well, so let's call it virtio live migration instead of intel.
>
> And please work all together in the virtio committee that you have one way of
> communication between controlling and controlled functions.
> If one extension does it one way and the other a different way that's just
> creating a giant mess.

We in the virtio committee are working on VF device migration where:
VF = controlled function
PF = controlling function

The second proposal is the one Michael mentioned from Intel, which somehow combines the controlled and controlling functions as a single entity on the VF.

The main reasons I find it weird are:
1. it always needs mediation to fake the device reset and FLR flows
2. DMA cannot work, as you explained, for complex device state
3. it needs constant knowledge of every tiny detail of each virtio device type

Such a single entity appears very weird to me, but maybe it is just me.
On Wed, Oct 11, 2023 at 10:44:49AM +0300, Yishai Hadas wrote: > On 10/10/2023 23:42, Michael S. Tsirkin wrote: > > On Tue, Oct 10, 2023 at 07:09:08PM +0300, Yishai Hadas wrote: > > > > > Assuming that we'll put each command inside virtio as the generic layer, we > > > > > won't be able to call/use this API internally to get the PF as of cyclic > > > > > dependencies between the modules, link will fail. > > I just mean: > > virtio_admin_legacy_io_write(sruct pci_device *, ....) > > > > > > internally it starts from vf gets the pf (or vf itself or whatever > > the transport is) sends command gets status returns. > > > > what is cyclic here? > > > virtio-pci depends on virtio [1]. > > If we put the commands in the generic layer as we expect it to be (i.e. > virtio), then trying to call internally call for virtio_pci_vf_get_pf_dev() > to get the PF from the VF will end-up by a linker cyclic error as of below > [2]. > > As of that, someone can suggest to put the commands in virtio-pci, however > this will fully bypass the generic layer of virtio and future clients won't > be able to use it. virtio_pci would get pci device. virtio pci convers that to virtio device of owner + group member id and calls virtio. no cycles and minimal transport specific code, right? > In addition, passing in the VF PCI pointer instead of the VF group member ID > + the VIRTIO PF device, will require in the future to duplicate each command > once we'll use SIOV devices. I don't think anyone knows how will SIOV look. But shuffling APIs around is not a big deal. We'll see. > Instead, we suggest the below API for the above example. > > virtio_admin_legacy_io_write(virtio_device *virtio_dev, u64 > group_member_id, ....) > > [1] > [yishaih@reg-l-vrt-209 linux]$ modinfo virtio-pci > filename: /lib/modules/6.6.0-rc2+/kernel/drivers/virtio/virtio_pci.ko > version: 1 > license: GPL > description: virtio-pci > author: Anthony Liguori <aliguori@us.ibm.com> > srcversion: 7355EAC9408D38891938391 > alias: pci:v00001AF4d*sv*sd*bc*sc*i* > depends: virtio_pci_modern_dev,virtio,virtio_ring,virtio_pci_legacy_dev > retpoline: Y > intree: Y > name: virtio_pci > vermagic: 6.6.0-rc2+ SMP preempt mod_unload modversions > parm: force_legacy:Force legacy mode for transitional virtio 1 > devices (bool) > > [2] > > depmod: ERROR: Cycle detected: virtio -> virtio_pci -> virtio > depmod: ERROR: Found 2 modules in dependency cycles! > make[2]: *** [scripts/Makefile.modinst:128: depmod] Error 1 > make[1]: *** [/images/yishaih/src/kernel/linux/Makefile:1821: > modules_install] Error 2 > > Yishai virtio absolutely must not depend on virtio pci, it is used on systems without pci at all.
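Michael's layering can be sketched as a thin virtio-pci entry point that resolves the owner device and group member id before handing off to the generic layer. This is only an illustration: virtio_pci_vf_get_pf_dev() and virtio_admin_cmd_exec() come from this series, pci_iov_vf_id() is an existing PCI helper, and the command/scatterlist building is left abstract.

/* Sketch only: a transport-level wrapper in virtio-pci, per Michael's
 * suggestion above. */
static int virtio_pci_admin_legacy_io_write(struct pci_dev *vf_pdev, u16 opcode,
					    u8 offset, u8 size, u8 *buf)
{
	struct virtio_device *owner = virtio_pci_vf_get_pf_dev(vf_pdev);
	int vf_id = pci_iov_vf_id(vf_pdev);
	struct virtio_admin_cmd cmd = {};

	if (!owner || vf_id < 0)
		return -ENODEV;

	cmd.opcode = opcode;
	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
	cmd.group_member_id = vf_id + 1;
	/* data_sg setup from offset/size/buf elided for brevity */

	/* the generic layer only ever sees (owner virtio_device, command) */
	return virtio_admin_cmd_exec(owner, &cmd);
}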
On Wed, Oct 11, 2023 at 08:00:57AM +0000, Parav Pandit wrote: > Hi Christoph, > > > From: Christoph Hellwig <hch@infradead.org> > > Sent: Wednesday, October 11, 2023 12:29 PM > > > > On Wed, Oct 11, 2023 at 02:43:37AM -0400, Michael S. Tsirkin wrote: > > > > Btw, what is that intel thing everyone is talking about? And why > > > > would the virtio core support vendor specific behavior like that? > > > > > > It's not a thing it's Zhu Lingshan :) intel is just one of the vendors > > > that implemented vdpa support and so Zhu Lingshan from intel is > > > working on vdpa and has also proposed virtio spec extensions for migration. > > > intel's driver is called ifcvf. vdpa composes all this stuff that is > > > added to vfio in userspace, so it's a different approach. > > > > Well, so let's call it virtio live migration instead of intel. > > > > And please work all together in the virtio committee that you have one way of > > communication between controlling and controlled functions. > > If one extension does it one way and the other a different way that's just > > creating a giant mess. > > We in virtio committee are working on VF device migration where: > VF = controlled function > PF = controlling function > > The second proposal is what Michael mentioned from Intel that somehow combine controlled and controlling function as single entity on VF. > > The main reasons I find it weird are: > 1. it must always need to do mediation to do fake the device reset, and flr flows > 2. dma cannot work as you explained for complex device state > 3. it needs constant knowledge of each tiny things for each virtio device type > > Such single entity appears a bit very weird to me but maybe it is just me. Yea it appears to include everyone from nvidia. Others are used to it - this is exactly what happens with virtio generally. E.g. vhost processes fast path in the kernel and control path is in userspace. vdpa has been largely modeled after that, for better or worse.
On Tue, Oct 10, 2023 at 11:59:26PM -0700, Christoph Hellwig wrote: > On Wed, Oct 11, 2023 at 02:43:37AM -0400, Michael S. Tsirkin wrote: > > > Btw, what is that intel thing everyone is talking about? And why > > > would the virtio core support vendor specific behavior like that? > > > > It's not a thing it's Zhu Lingshan :) intel is just one of the vendors > > that implemented vdpa support and so Zhu Lingshan from intel is working > > on vdpa and has also proposed virtio spec extensions for migration. > > intel's driver is called ifcvf. vdpa composes all this stuff that is > > added to vfio in userspace, so it's a different approach. > > Well, so let's call it virtio live migration instead of intel. > > And please work all together in the virtio committee that you have > one way of communication between controlling and controlled functions. > If one extension does it one way and the other a different way that's > just creating a giant mess. Absolutely, this is exactly what I keep suggesting. Thanks for bringing this up, will help me drive the point home!
On 11/10/2023 11:02, Michael S. Tsirkin wrote: > On Wed, Oct 11, 2023 at 10:44:49AM +0300, Yishai Hadas wrote: >> On 10/10/2023 23:42, Michael S. Tsirkin wrote: >>> On Tue, Oct 10, 2023 at 07:09:08PM +0300, Yishai Hadas wrote: >>>>>> Assuming that we'll put each command inside virtio as the generic layer, we >>>>>> won't be able to call/use this API internally to get the PF as of cyclic >>>>>> dependencies between the modules, link will fail. >>> I just mean: >>> virtio_admin_legacy_io_write(sruct pci_device *, ....) >>> >>> >>> internally it starts from vf gets the pf (or vf itself or whatever >>> the transport is) sends command gets status returns. >>> >>> what is cyclic here? >>> >> virtio-pci depends on virtio [1]. >> >> If we put the commands in the generic layer as we expect it to be (i.e. >> virtio), then trying to call internally call for virtio_pci_vf_get_pf_dev() >> to get the PF from the VF will end-up by a linker cyclic error as of below >> [2]. >> >> As of that, someone can suggest to put the commands in virtio-pci, however >> this will fully bypass the generic layer of virtio and future clients won't >> be able to use it. > virtio_pci would get pci device. > virtio pci convers that to virtio device of owner + group member id and calls virtio. Do you suggest another set of exported symbols (i.e per command ) in virtio which will get the owner device + group member + the extra specific command parameters ? This will end-up duplicating the number of export symbols per command. > no cycles and minimal transport specific code, right? See my above note, if we may just call virtio without any further work on the command's input, than YES. If so, virtio will prepare the command by setting the relevant SG lists and other data and finally will call: vdev->config->exec_admin_cmd(vdev, cmd); Was that your plan ? > >> In addition, passing in the VF PCI pointer instead of the VF group member ID >> + the VIRTIO PF device, will require in the future to duplicate each command >> once we'll use SIOV devices. > I don't think anyone knows how will SIOV look. But shuffling > APIs around is not a big deal. We'll see. As you are the maintainer it's up-to-you, just need to consider another further duplication here. Yishai > >> Instead, we suggest the below API for the above example. >> >> virtio_admin_legacy_io_write(virtio_device *virtio_dev, u64 >> group_member_id, ....) >> >> [1] >> [yishaih@reg-l-vrt-209 linux]$ modinfo virtio-pci >> filename: /lib/modules/6.6.0-rc2+/kernel/drivers/virtio/virtio_pci.ko >> version: 1 >> license: GPL >> description: virtio-pci >> author: Anthony Liguori <aliguori@us.ibm.com> >> srcversion: 7355EAC9408D38891938391 >> alias: pci:v00001AF4d*sv*sd*bc*sc*i* >> depends: virtio_pci_modern_dev,virtio,virtio_ring,virtio_pci_legacy_dev >> retpoline: Y >> intree: Y >> name: virtio_pci >> vermagic: 6.6.0-rc2+ SMP preempt mod_unload modversions >> parm: force_legacy:Force legacy mode for transitional virtio 1 >> devices (bool) >> >> [2] >> >> depmod: ERROR: Cycle detected: virtio -> virtio_pci -> virtio >> depmod: ERROR: Found 2 modules in dependency cycles! >> make[2]: *** [scripts/Makefile.modinst:128: depmod] Error 1 >> make[1]: *** [/images/yishaih/src/kernel/linux/Makefile:1821: >> modules_install] Error 2 >> >> Yishai > virtio absolutely must not depend on virtio pci, it is used on > systems without pci at all. >
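The dispatch Yishai asks about could look roughly like the sketch below. To be clear, struct virtio_config_ops has no exec_admin_cmd member today; the op and its wiring are purely hypothetical, shown only to make the question concrete.

/* Hypothetical: generic virtio core hands a fully built command to a
 * transport-provided op, if the transport registered one. */
int virtio_admin_cmd_exec(struct virtio_device *vdev,
			  struct virtio_admin_cmd *cmd)
{
	if (!vdev->config->exec_admin_cmd)	/* op does not exist upstream */
		return -EOPNOTSUPP;

	return vdev->config->exec_admin_cmd(vdev, cmd);
}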
On Wed, Oct 11, 2023 at 11:58:11AM +0300, Yishai Hadas wrote: > On 11/10/2023 11:02, Michael S. Tsirkin wrote: > > On Wed, Oct 11, 2023 at 10:44:49AM +0300, Yishai Hadas wrote: > > > On 10/10/2023 23:42, Michael S. Tsirkin wrote: > > > > On Tue, Oct 10, 2023 at 07:09:08PM +0300, Yishai Hadas wrote: > > > > > > > Assuming that we'll put each command inside virtio as the generic layer, we > > > > > > > won't be able to call/use this API internally to get the PF as of cyclic > > > > > > > dependencies between the modules, link will fail. > > > > I just mean: > > > > virtio_admin_legacy_io_write(sruct pci_device *, ....) > > > > > > > > > > > > internally it starts from vf gets the pf (or vf itself or whatever > > > > the transport is) sends command gets status returns. > > > > > > > > what is cyclic here? > > > > > > > virtio-pci depends on virtio [1]. > > > > > > If we put the commands in the generic layer as we expect it to be (i.e. > > > virtio), then trying to call internally call for virtio_pci_vf_get_pf_dev() > > > to get the PF from the VF will end-up by a linker cyclic error as of below > > > [2]. > > > > > > As of that, someone can suggest to put the commands in virtio-pci, however > > > this will fully bypass the generic layer of virtio and future clients won't > > > be able to use it. > > virtio_pci would get pci device. > > virtio pci convers that to virtio device of owner + group member id and calls virtio. > > Do you suggest another set of exported symbols (i.e per command ) in virtio > which will get the owner device + group member + the extra specific command > parameters ? > > This will end-up duplicating the number of export symbols per command. Or make them inline. Or maybe actually even the specific commands should live inside virtio pci they are pci specific after all. > > no cycles and minimal transport specific code, right? > > See my above note, if we may just call virtio without any further work on > the command's input, than YES. > > If so, virtio will prepare the command by setting the relevant SG lists and > other data and finally will call: > > vdev->config->exec_admin_cmd(vdev, cmd); > > Was that your plan ? is vdev the pf? then it won't support the transport where commands are submitted through bar0 of vf itself. > > > > > In addition, passing in the VF PCI pointer instead of the VF group member ID > > > + the VIRTIO PF device, will require in the future to duplicate each command > > > once we'll use SIOV devices. > > I don't think anyone knows how will SIOV look. But shuffling > > APIs around is not a big deal. We'll see. > > As you are the maintainer it's up-to-you, just need to consider another > further duplication here. > > Yishai > > > > > > Instead, we suggest the below API for the above example. > > > > > > virtio_admin_legacy_io_write(virtio_device *virtio_dev, u64 > > > group_member_id, ....) 
> > > > > > [1] > > > [yishaih@reg-l-vrt-209 linux]$ modinfo virtio-pci > > > filename: /lib/modules/6.6.0-rc2+/kernel/drivers/virtio/virtio_pci.ko > > > version: 1 > > > license: GPL > > > description: virtio-pci > > > author: Anthony Liguori <aliguori@us.ibm.com> > > > srcversion: 7355EAC9408D38891938391 > > > alias: pci:v00001AF4d*sv*sd*bc*sc*i* > > > depends: virtio_pci_modern_dev,virtio,virtio_ring,virtio_pci_legacy_dev > > > retpoline: Y > > > intree: Y > > > name: virtio_pci > > > vermagic: 6.6.0-rc2+ SMP preempt mod_unload modversions > > > parm: force_legacy:Force legacy mode for transitional virtio 1 > > > devices (bool) > > > > > > [2] > > > > > > depmod: ERROR: Cycle detected: virtio -> virtio_pci -> virtio > > > depmod: ERROR: Found 2 modules in dependency cycles! > > > make[2]: *** [scripts/Makefile.modinst:128: depmod] Error 1 > > > make[1]: *** [/images/yishaih/src/kernel/linux/Makefile:1821: > > > modules_install] Error 2 > > > > > > Yishai > > virtio absolutely must not depend on virtio pci, it is used on > > systems without pci at all. > >
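The "make them inline" option Michael floats would keep the per-command helpers out of the export table entirely, e.g. as static inlines in a header, so only the generic executor remains an exported symbol. The wrapper name and the elided scatterlist setup below are assumptions, not code from the series.

/* Sketch of an inline per-command wrapper; no extra EXPORT_SYMBOL needed. */
static inline int
virtio_admin_legacy_io_write_member(struct virtio_device *owner, u64 member_id,
				    u16 opcode, u8 offset, u8 size, u8 *buf)
{
	struct virtio_admin_cmd cmd = {
		.opcode		 = opcode,
		.group_type	 = VIRTIO_ADMIN_GROUP_TYPE_SRIOV,
		.group_member_id = member_id,
	};

	/* build data_sg from offset/size/buf here (elided) ... */
	return virtio_admin_cmd_exec(owner, &cmd);
}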
On 11/10/2023 12:03, Michael S. Tsirkin wrote:
> On Wed, Oct 11, 2023 at 11:58:11AM +0300, Yishai Hadas wrote:
>> On 11/10/2023 11:02, Michael S. Tsirkin wrote:
>>> On Wed, Oct 11, 2023 at 10:44:49AM +0300, Yishai Hadas wrote:
>>>> On 10/10/2023 23:42, Michael S. Tsirkin wrote:
>>>>> On Tue, Oct 10, 2023 at 07:09:08PM +0300, Yishai Hadas wrote:
>>>>>>>> Assuming that we'll put each command inside virtio as the generic layer, we
>>>>>>>> won't be able to call/use this API internally to get the PF as of cyclic
>>>>>>>> dependencies between the modules, link will fail.
>>>>> I just mean:
>>>>> virtio_admin_legacy_io_write(sruct pci_device *, ....)
>>>>>
>>>>> internally it starts from vf gets the pf (or vf itself or whatever
>>>>> the transport is) sends command gets status returns.
>>>>>
>>>>> what is cyclic here?
>>>>>
>>>> virtio-pci depends on virtio [1].
>>>>
>>>> If we put the commands in the generic layer as we expect it to be (i.e.
>>>> virtio), then trying to call internally call for virtio_pci_vf_get_pf_dev()
>>>> to get the PF from the VF will end-up by a linker cyclic error as of below
>>>> [2].
>>>>
>>>> As of that, someone can suggest to put the commands in virtio-pci, however
>>>> this will fully bypass the generic layer of virtio and future clients won't
>>>> be able to use it.
>>> virtio_pci would get pci device.
>>> virtio pci convers that to virtio device of owner + group member id and calls virtio.
>> Do you suggest another set of exported symbols (i.e per command ) in virtio
>> which will get the owner device + group member + the extra specific command
>> parameters ?
>>
>> This will end-up duplicating the number of export symbols per command.
> Or make them inline.
> Or maybe actually even the specific commands should live inside virtio pci
> they are pci specific after all.

OK, let's leave them in virtio-pci as you suggested here.

You can see below [1] a scheme of how a specific command will look.

A few notes:
- virtio_pci_vf_get_pf_dev() will become a static function.
- The commands will be placed inside virtio_pci_common.c and will use that
  static function locally to get the owner PF.
- After preparing the command we may call vp_avq_cmd_exec() directly, which is
  part of virtio-pci, rather than going through virtio.
- vp_avq_cmd_exec() will be part of virtio_pci_modern.c, as you asked in the ML.
- The AQ creation/destruction will still be done upon probing virtio, as it was
  in V0; it will use the underlying config->create/destroy_avq() ops if they exist.
- virtio_admin_cmd_exec() won't be exported outside virtio any more; we'll have
  an exported symbol in virtio-pci per command.

Is the above fine for you?

By the way, from an API namespace POV, are you fine with
virtio_admin_legacy_io_write(), or maybe let's have '_pci' as part of the name?
(i.e. virtio_pci_admin_legacy_io_write)

[1]
int virtio_admin_legacy_io_write(struct pci_dev *pdev, u16 opcode,
				 u8 offset, u8 size, u8 *buf)
{
	struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev);
	struct virtio_admin_cmd_legacy_wr_data *in;
	struct virtio_admin_cmd cmd = {};
	struct scatterlist in_sg;
	int ret;
	int vf_id;

	if (!virtio_dev)
		return -ENODEV;

	vf_id = pci_iov_vf_id(pdev);
	if (vf_id < 0)
		return -EINVAL;

	in = kzalloc(sizeof(*in) + size, GFP_KERNEL);
	if (!in)
		return -ENOMEM;

	in->offset = offset;
	memcpy(in->registers, buf, size);
	sg_init_one(&in_sg, in, sizeof(*in) + size);
	cmd.opcode = opcode;
	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
	cmd.group_member_id = vf_id + 1;
	cmd.data_sg = &in_sg;
	ret = vp_avq_cmd_exec(virtio_dev, &cmd);

	kfree(in);
	return ret;
}
EXPORT_SYMBOL_GPL(virtio_admin_legacy_io_write);

>>> no cycles and minimal transport specific code, right?
>> See my above note, if we may just call virtio without any further work on
>> the command's input, than YES.
>>
>> If so, virtio will prepare the command by setting the relevant SG lists and
>> other data and finally will call:
>>
>> vdev->config->exec_admin_cmd(vdev, cmd);
>>
>> Was that your plan ?
> is vdev the pf? then it won't support the transport where commands
> are submitted through bar0 of vf itself.

Yes, it's a PF. Based on the current spec, the existing admin commands are
issued only on the PF.

In any case, moving to the above suggested scheme, where we handle things per
command and take the VF PCI device as the first argument, we now have full
control over any future command.

Yishai

>>>> In addition, passing in the VF PCI pointer instead of the VF group member ID
>>>> + the VIRTIO PF device, will require in the future to duplicate each command
>>>> once we'll use SIOV devices.
>>> I don't think anyone knows how will SIOV look. But shuffling
>>> APIs around is not a big deal. We'll see.
>> As you are the maintainer it's up-to-you, just need to consider another
>> further duplication here.
>>
>> Yishai
>>
>>>> Instead, we suggest the below API for the above example.
>>>>
>>>> virtio_admin_legacy_io_write(virtio_device *virtio_dev, u64
>>>> group_member_id, ....)
>>>>
>>>> [1]
>>>> [yishaih@reg-l-vrt-209 linux]$ modinfo virtio-pci
>>>> filename: /lib/modules/6.6.0-rc2+/kernel/drivers/virtio/virtio_pci.ko
>>>> version: 1
>>>> license: GPL
>>>> description: virtio-pci
>>>> author: Anthony Liguori <aliguori@us.ibm.com>
>>>> srcversion: 7355EAC9408D38891938391
>>>> alias: pci:v00001AF4d*sv*sd*bc*sc*i*
>>>> depends: virtio_pci_modern_dev,virtio,virtio_ring,virtio_pci_legacy_dev
>>>> retpoline: Y
>>>> intree: Y
>>>> name: virtio_pci
>>>> vermagic: 6.6.0-rc2+ SMP preempt mod_unload modversions
>>>> parm: force_legacy:Force legacy mode for transitional virtio 1
>>>> devices (bool)
>>>>
>>>> [2]
>>>>
>>>> depmod: ERROR: Cycle detected: virtio -> virtio_pci -> virtio
>>>> depmod: ERROR: Found 2 modules in dependency cycles!
>>>> make[2]: *** [scripts/Makefile.modinst:128: depmod] Error 1
>>>> make[1]: *** [/images/yishaih/src/kernel/linux/Makefile:1821:
>>>> modules_install] Error 2
>>>>
>>>> Yishai
>>> virtio absolutely must not depend on virtio pci, it is used on
>>> systems without pci at all.
>>>
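For context on how the agreed entry point would be consumed, here is a hedged sketch of a caller in the vfio-virtio variant driver trapping a guest write to the legacy common config, using the '_pci' naming Yishai proposes just above. The surrounding function, buffer handling and the opcode constant are assumptions based on the rest of this series and the spec proposal, not code from this patch.

/* Hypothetical caller-side sketch in the vfio-virtio driver. */
#include <linux/uaccess.h>

static int virtiovf_legacy_common_cfg_write(struct virtiovf_pci_core_device *virtvdev,
					    u8 offset, u8 size,
					    const char __user *ubuf)
{
	u8 data[64];

	if (size > sizeof(data))
		return -EINVAL;
	if (copy_from_user(data, ubuf, size))
		return -EFAULT;

	/* proxy the trapped access to the PF through the admin queue */
	return virtio_pci_admin_legacy_io_write(virtvdev->core_device.pdev,
						VIRTIO_ADMIN_CMD_LEGACY_COMMON_CFG_WRITE,
						offset, size, data);
}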
On Wed, Oct 11, 2023 at 04:10:58AM -0400, Michael S. Tsirkin wrote: > On Wed, Oct 11, 2023 at 08:00:57AM +0000, Parav Pandit wrote: > > Hi Christoph, > > > > > From: Christoph Hellwig <hch@infradead.org> > > > Sent: Wednesday, October 11, 2023 12:29 PM > > > > > > On Wed, Oct 11, 2023 at 02:43:37AM -0400, Michael S. Tsirkin wrote: > > > > > Btw, what is that intel thing everyone is talking about? And why > > > > > would the virtio core support vendor specific behavior like that? > > > > > > > > It's not a thing it's Zhu Lingshan :) intel is just one of the vendors > > > > that implemented vdpa support and so Zhu Lingshan from intel is > > > > working on vdpa and has also proposed virtio spec extensions for migration. > > > > intel's driver is called ifcvf. vdpa composes all this stuff that is > > > > added to vfio in userspace, so it's a different approach. > > > > > > Well, so let's call it virtio live migration instead of intel. > > > > > > And please work all together in the virtio committee that you have one way of > > > communication between controlling and controlled functions. > > > If one extension does it one way and the other a different way that's just > > > creating a giant mess. > > > > We in virtio committee are working on VF device migration where: > > VF = controlled function > > PF = controlling function > > > > The second proposal is what Michael mentioned from Intel that somehow combine controlled and controlling function as single entity on VF. > > > > The main reasons I find it weird are: > > 1. it must always need to do mediation to do fake the device reset, and flr flows > > 2. dma cannot work as you explained for complex device state > > 3. it needs constant knowledge of each tiny things for each virtio device type > > > > Such single entity appears a bit very weird to me but maybe it is just me. > > Yea it appears to include everyone from nvidia. Others are used to it - > this is exactly what happens with virtio generally. E.g. vhost > processes fast path in the kernel and control path is in userspace. > vdpa has been largely modeled after that, for better or worse. As Parav says, you can't use DMA for any migration flows, and you open a single VF scheme up to PCI P2P attacks from the VM. It is a pretty bad design. vfio reviewers will reject things like this that are not secure - we just did for Intel E800, for instance. With VDPA doing the same stuff as vfio I'm not sure who is auditing it for security. The simple way to be sure is to never touch the PCI function that has DMA assigned to a VM from the hypervisor, except through config space. Beyond that.. Well, think carefully about security. IMHO the single-VF approach is not suitable for standardization. Jason
On Tue, Oct 10, 2023 at 11:26:42PM -0700, Christoph Hellwig wrote: > On Tue, Oct 10, 2023 at 10:10:31AM -0300, Jason Gunthorpe wrote: > > We've talked around ideas like allowing the VF config space to do some > > of the work. For simple devices we could get away with 1 VF config > > space register. (VF config space is owned by the hypervisor, not the > > guest) > > Which assumes you're actually using VFs and not multiple PFs, which > is a very limiting assumption. ? It doesn't matter VF/PF, the same functions config space could do simple migration. > It also limits your from actually > using DMA during the live migration process, which again is major > limitation once you have a non-tivial amount of state. Yes, this is a dealbreaker for big cases. But we do see several smaller/simpler devices that don't use DMA in their migration. > > SIOVr2 is discussing more a flexible RID mapping - there is a possible > > route where a "VF" could actually have two RIDs, a hypervisor RID and a > > guest RID. > > Well, then you go down the SIOV route, which requires a complex driver > actually presenting the guest visible device anyway. Yep > Independent of my above points on the doubts on VF-controlled live > migration for PCe device I absolutely agree with your that the Linux > abstraction and user interface should be VF based. Which further > reinforeces my point that the VFIO driver for the controlled function > (PF or VF) and the Linux driver for the controlling function (better > be a PF in practice) must be very tightly integrated. And the best > way to do that is to export the vfio nodes from the Linux driver > that knowns the hardware and not split out into a separate one. I'm not sure how we get to "very tightly integrated". We have many examples of live migration vfio drivers now and they do not seem to require tight integration. The PF driver only has to provide a way to execute a small number of proxied operations. Regardless, I'm not too fussed about what directory the implementation lives in, though I do prefer the current arrangement where VFIO only stuff is in drivers/vfio. I like the process we have where subsystems are responsible for the code that implements the subsystem ops. > > However - the Intel GPU VFIO driver is such a bad experiance I don't > > want to encourage people to make VFIO drivers, or code that is only > > used by VFIO drivers, that are not under drivers/vfio review. > > We can and should require vfio review for users of the vfio API. > But to be honest code placement was not the problem with i915. The > problem was that the mdev APIs (under drivers/vfio) were a complete > trainwreck when it was written, and that the driver had a horrible > hypervisor API abstraction. E800 also made some significant security mistakes that VFIO side caught. I think would have been missed if it went into a netdev tree. Even unrelated to mdev, Intel GPU is still not using the vfio side properly, and the way it hacked into KVM to try to get page tracking is totally logically wrong (but Works For Me (tm)) Aside from technical concerns, I do have a big process worry here. vfio is responsible for the security side of the review of things implementing its ops. > > Be aware, there is a significant performance concern here. If you want > > to create 1000 VFIO devices (this is a real thing), we *can't* probe a > > normal driver first, it is too slow. We need a path that goes directly > > from creating the RIDs to turning those RIDs into VFIO. 
> > And by calling the vfio funtions from mlx5 you get this easily. "easily" I don't know about that :) > For mdev/SIOV like flows you must call vfio APIs from the main > driver anyway, as there is no pci_dev to probe on anyway. That's > what i915 does btw. IMHO i915 is not an good example to copy. mlx5 is already much closer to your ideal, and I would hold up as the right general direction for SIOV/mdev/etc, as we basically already do a lot of SIOV ideas. mlx5 is a multi-subsystem device. It has driver components in net, VDPA and infiniband. It can create non-PCI "functions". It is not feasible, process wise, for all of this to live under one directory. We *want* the driver split up by subystem and subsystem maintainer. So, we created the auxiliary_device stuff to manage this. It can do what you are imagining, I think. The core PF/VF driver is in charge of what to carve off to a sub system driver. IIRC mlx5 uses netlink to deliver commands to trigger this (eg create a VDPA device). An auxilary_device is created and the target subsystem driver probes to that and autoloads. eg see drivers/vdpa/mlx5/net/mlx5_vnet.c They are not 'tightly coupled', the opposite really. The auxilary_device comes along with a mlx5 API that allows all the subsystem to do what they need on the HW mostly independently. For mlx5 this is mostly a way to execute FW RPC commands. So, if you want to turn the VFIO stuff inside out, I'd still suggest to have the VFIO driver part under drivers/vfio and probe to an auxilary_device that represents the aspect of the HW to turn into VFIO (or VPDA, or whatever). The 'core' driver can provide an appropriate API between its VFIO part and its core part. We lack a common uAPI to trigger this creation, but otherwise the infrastructure exists and works well now. It allows subsystems to remain together and complex devices to spread their functionality to multiple subsystems. The current pci_iov_get_pf_drvdata() hack in VFIO is really a short cut to doing the auxilary_device stuff. (actually we tried to build this with auxilary_device first, it did not work out, needs more driver core infastructure). I can easially imagine all the current VFIO drivers probing to an auxilary_device and obtinaing the VF pci_device and the handle for the core functionalitty directly without the pci_iov_get_pf_drvdata() approach. > For "classic" vfio that requires a pci_dev (or $otherbus_dev) we need > to have a similar flow. And I think the best way is to have the > bus-level attribute on the device and/or a device-specific side band > protocol to device how new functions are probed. With that you > avoid all the duplicate PCI IDs for the binding, and actually allow to > sanely establush a communication channel between the functions. > Because without that there is no way to know how any two functions > related. The driver might think they know, but there's all kinds of > whacky PCI passthough schemes that will break such a logic. Yes, if things are not simple PF/VF then Linux struggles at the driver core level. auxilary_devices are a way out of that since one spot can figure out how to assemble the multi-component device and then delegate portions of the HW to other subsystems. If something wants to probe its own driver to a PF/VF to assemble the components it can do that and then bundle it up into an aux device and trigger a VFIO/etc driver to run on that bundle of resources. We don't *need* to put all the VFIO code someplace else to put the control over slicing the HW into a shared core driver. 
mlx5 and several other drivers now already demonstrate all of this. Jason
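For readers unfamiliar with the auxiliary bus pattern Jason describes, here is a minimal sketch of a core PF driver publishing a slice of the device and a VFIO-side driver probing it. The auxiliary bus calls are the real in-kernel API; the "acme" driver, its structures and the split of responsibilities are invented for illustration.

#include <linux/auxiliary_bus.h>
#include <linux/slab.h>

struct acme_core_dev;			/* core PF driver's private state */

struct acme_adev {
	struct auxiliary_device adev;
	struct acme_core_dev *core;	/* handle back into the core driver's API */
};

static void acme_adev_release(struct device *dev)
{
	kfree(container_of(dev, struct acme_adev, adev.dev));
}

/* Core PF driver side: publish the carved-out function on the auxiliary bus. */
static int acme_publish_vfio_part(struct acme_core_dev *core,
				  struct device *parent, u32 idx)
{
	struct acme_adev *part = kzalloc(sizeof(*part), GFP_KERNEL);
	int ret;

	if (!part)
		return -ENOMEM;
	part->core = core;
	part->adev.name = "vfio_part";
	part->adev.id = idx;
	part->adev.dev.parent = parent;
	part->adev.dev.release = acme_adev_release;

	ret = auxiliary_device_init(&part->adev);
	if (ret) {
		kfree(part);
		return ret;
	}
	ret = auxiliary_device_add(&part->adev);	/* matches "acme.vfio_part" */
	if (ret)
		auxiliary_device_uninit(&part->adev);
	return ret;
}

/* VFIO variant driver side (would live under drivers/vfio/): */
static int acme_vfio_probe(struct auxiliary_device *adev,
			   const struct auxiliary_device_id *id)
{
	struct acme_adev *part = container_of(adev, struct acme_adev, adev);

	/* register a vfio_device backed by part->core here */
	return 0;
}

static const struct auxiliary_device_id acme_vfio_id_table[] = {
	{ .name = "acme.vfio_part" },
	{}
};
MODULE_DEVICE_TABLE(auxiliary, acme_vfio_id_table);

static struct auxiliary_driver acme_vfio_driver = {
	.probe = acme_vfio_probe,
	.id_table = acme_vfio_id_table,
};
module_auxiliary_driver(acme_vfio_driver);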
On Wed, Oct 11, 2023 at 10:57:09AM -0300, Jason Gunthorpe wrote:
> > Independent of my above points on the doubts on VF-controlled live
> > migration for PCe device I absolutely agree with your that the Linux
> > abstraction and user interface should be VF based. Which further
> > reinforeces my point that the VFIO driver for the controlled function
> > (PF or VF) and the Linux driver for the controlling function (better
> > be a PF in practice) must be very tightly integrated. And the best
> > way to do that is to export the vfio nodes from the Linux driver
> > that knowns the hardware and not split out into a separate one.
>
> I'm not sure how we get to "very tightly integrated". We have many
> examples of live migration vfio drivers now and they do not seem to
> require tight integration. The PF driver only has to provide a way to
> execute a small number of proxied operations.

Yes. And for that I need to know what VF it actually is dealing
with. Which is tight integration in my book.

> Regardless, I'm not too fussed about what directory the implementation
> lives in, though I do prefer the current arrangement where VFIO only
> stuff is in drivers/vfio. I like the process we have where subsystems
> are responsible for the code that implements the subsystem ops.

I really don't care about where the code lives (in the directory tree)
either. But as you see with virtio, trying to split it out into an
arbitrary module causes all kinds of pain.

> E800 also made some significant security mistakes that VFIO side
> caught. I think would have been missed if it went into a netdev
> tree.
>
> Even unrelated to mdev, Intel GPU is still not using the vfio side
> properly, and the way it hacked into KVM to try to get page tracking
> is totally logically wrong (but Works For Me (tm))
>
> Aside from technical concerns, I do have a big process worry
> here. vfio is responsible for the security side of the review of
> things implementing its ops.

Yes, anything exposing a vfio node needs vfio review, period.

And I don't think where the code lived was the i915 problem. The problem
was that they were the first open user of the mdev API, which was just a
badly designed hook for never-published code at that time, and they then
shoehorned it into a weird hypervisor abstraction. There's no good way to
succeed with that.
On Wed, Oct 11, 2023 at 07:17:25AM -0700, Christoph Hellwig wrote: > On Wed, Oct 11, 2023 at 10:57:09AM -0300, Jason Gunthorpe wrote: > > > Independent of my above points on the doubts on VF-controlled live > > > migration for PCe device I absolutely agree with your that the Linux > > > abstraction and user interface should be VF based. Which further > > > reinforeces my point that the VFIO driver for the controlled function > > > (PF or VF) and the Linux driver for the controlling function (better > > > be a PF in practice) must be very tightly integrated. And the best > > > way to do that is to export the vfio nodes from the Linux driver > > > that knowns the hardware and not split out into a separate one. > > > > I'm not sure how we get to "very tightly integrated". We have many > > examples of live migration vfio drivers now and they do not seem to > > require tight integration. The PF driver only has to provide a way to > > execute a small number of proxied operations. > > Yes. And for that I need to know what VF it actually is dealing > with. Which is tight integration in my book. Well, I see two modalities here Simple devices with a fixed PF/VF relationship use a VF pci_device for VFIO and pci_iov_get_pf_drvdata()/related APIs to assemble their parts. This is very limited (and kind of hacky). Complex devices can use an auxiliary_device for VFIO and assemble their parts however they like. After probe is done the VFIO code operates effectively identically regardless of how the components were found. Intel is going to submit their IDXD SIOV driver "soon" and I'd like to pause there and have a real discussion about how to manage VFIO lifecycle and dynamic "function" creation in this brave new world. Ideally we can get a lifecycle API that works uniformly for PCI VFs too. Then maybe this gets more resolved. In my mind at least, definately no mdevs and that sysfs GUID junk. :( > I really don't care about where the code lives (in the directory tree) > either. But as you see with virtio trying to split it out into > an arbitrary module causes all kinds of pain. Trying to put VFIO-only code in virtio is what causes all the issues. If you mis-design the API boundary everything will be painful, no matter where you put the code. Jason
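The "simple fixed PF/VF" modality Jason mentions rests on two existing PCI core helpers; here is a small sketch of a VFIO variant driver using them at probe time. The acme_* names are placeholders, and error handling beyond the basics is omitted.

#include <linux/pci.h>

struct acme_migration_ctx;			/* placeholder: PF driver's drvdata type */
extern struct pci_driver acme_core_driver;	/* placeholder: the PF driver */

static int acme_vfio_setup_migration(struct pci_dev *vf_pdev)
{
	struct acme_migration_ctx *pf_ctx;
	int vf_id;

	/* which VF of its PF is this device? */
	vf_id = pci_iov_vf_id(vf_pdev);
	if (vf_id < 0)
		return vf_id;

	/*
	 * Fetch the PF driver's drvdata; this only succeeds when the PF is
	 * actually bound to acme_core_driver, which is the safety property
	 * pci_iov_get_pf_drvdata() exists to provide.
	 */
	pf_ctx = pci_iov_get_pf_drvdata(vf_pdev, &acme_core_driver);
	if (IS_ERR(pf_ctx))
		return PTR_ERR(pf_ctx);

	/* proxy migration commands for vf_id through pf_ctx from here on */
	return 0;
}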
On Wed, Oct 11, 2023 at 11:58:10AM -0300, Jason Gunthorpe wrote: > Trying to put VFIO-only code in virtio is what causes all the > issues. If you mis-design the API boundary everything will be painful, > no matter where you put the code. Are you implying the whole idea of adding these legacy virtio admin commands to virtio spec was a design mistake? It was nvidia guys who proposed it, so I'm surprised to hear you say this.
On Wed, Oct 11, 2023 at 09:18:49AM -0300, Jason Gunthorpe wrote: > The simple way to be sure is to never touch the PCI function that has > DMA assigned to a VM from the hypervisor, except through config space. What makes config space different that it's safe though? Isn't this more of a "we can't avoid touching config space" than that it's safe? The line doesn't look that bright to me - if there's e.g. a memory area designed explicitly for hypervisor to poke at, that seems fine.
On Wed, Oct 11, 2023 at 09:18:49AM -0300, Jason Gunthorpe wrote: > With VDPA doing the same stuff as vfio I'm not sure who is auditing it > for security. Check the signed off tags and who sends the pull requests if you want to know.
On Wed, Oct 11, 2023 at 12:59:30PM -0400, Michael S. Tsirkin wrote: > On Wed, Oct 11, 2023 at 11:58:10AM -0300, Jason Gunthorpe wrote: > > Trying to put VFIO-only code in virtio is what causes all the > > issues. If you mis-design the API boundary everything will be painful, > > no matter where you put the code. > > Are you implying the whole idea of adding these legacy virtio admin > commands to virtio spec was a design mistake? No, I'm saying again that trying to relocate all the vfio code into drivers/virtio is a mistake Jason
On Wed, Oct 11, 2023 at 01:03:09PM -0400, Michael S. Tsirkin wrote: > On Wed, Oct 11, 2023 at 09:18:49AM -0300, Jason Gunthorpe wrote: > > The simple way to be sure is to never touch the PCI function that has > > DMA assigned to a VM from the hypervisor, except through config space. > > What makes config space different that it's safe though? Hypervisor fully mediates it and it is not accessible to P2P attacks. > Isn't this more of a "we can't avoid touching config space" than > that it's safe? The line doesn't look that bright to me - > if there's e.g. a memory area designed explicitly for > hypervisor to poke at, that seems fine. It is not. Jason
On Wed, Oct 11, 2023 at 02:19:44PM -0300, Jason Gunthorpe wrote: > On Wed, Oct 11, 2023 at 12:59:30PM -0400, Michael S. Tsirkin wrote: > > On Wed, Oct 11, 2023 at 11:58:10AM -0300, Jason Gunthorpe wrote: > > > Trying to put VFIO-only code in virtio is what causes all the > > > issues. If you mis-design the API boundary everything will be painful, > > > no matter where you put the code. > > > > Are you implying the whole idea of adding these legacy virtio admin > > commands to virtio spec was a design mistake? > > No, I'm saying again that trying to relocate all the vfio code into > drivers/virtio is a mistake > > Jason Yea please don't. And by the same token, please do not put implementations of virtio spec under drivers/vfio.
On 10/11/2023 4:00 PM, Parav Pandit via Virtualization wrote:
> Hi Christoph,
>
>> From: Christoph Hellwig <hch@infradead.org>
>> Sent: Wednesday, October 11, 2023 12:29 PM
>>
>> On Wed, Oct 11, 2023 at 02:43:37AM -0400, Michael S. Tsirkin wrote:
>>>> Btw, what is that intel thing everyone is talking about? And why
>>>> would the virtio core support vendor specific behavior like that?
>>> It's not a thing it's Zhu Lingshan :) intel is just one of the vendors
>>> that implemented vdpa support and so Zhu Lingshan from intel is
>>> working on vdpa and has also proposed virtio spec extensions for migration.
>>> intel's driver is called ifcvf. vdpa composes all this stuff that is
>>> added to vfio in userspace, so it's a different approach.
>> Well, so let's call it virtio live migration instead of intel.
>>
>> And please work all together in the virtio committee that you have one way of
>> communication between controlling and controlled functions.
>> If one extension does it one way and the other a different way that's just
>> creating a giant mess.
> We in virtio committee are working on VF device migration where:
> VF = controlled function
> PF = controlling function
>
> The second proposal is what Michael mentioned from Intel that somehow combine controlled and controlling function as single entity on VF.
>
> The main reasons I find it weird are:
> 1. it must always need to do mediation to do fake the device reset, and flr flows
> 2. dma cannot work as you explained for complex device state
> 3. it needs constant knowledge of each tiny things for each virtio device type
>
> Such single entity appears a bit very weird to me but maybe it is just me.

Sorry for the late reply; we have discussed this for weeks on the virtio
mailing list. I have proposed a live migration solution which is a config
space solution. We (me, Jason and Eugenio) have been working on this
solution for more than two years and we are implementing the basic virtio
live migration facilities.

The implementation is transport specific, e.g., for PCI we implement new
registers, or extend existing ones, which work as other config space
registers do.

The reason we are arguing is that I am not sure the admin vq based live
migration solution is a good choice, because:
1) it does not work for nested
2) it does not work for bare metal
3) QOS problem
4) security leaks

Sorry to spread the discussion here.

Thanks,
Zhu Lingshan
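Purely to illustrate what "new or extended registers that work as other config space registers do" might mean, here is a hypothetical register block in the style of the existing virtio PCI config structures. It is not taken from Zhu Lingshan's actual proposal or from any ratified spec text; all field names and semantics are invented.

/* Hypothetical, illustration only: a migration control block exposed
 * through the PCI transport like the other config structures. */
struct virtio_pci_migration_cfg_sketch {
	u8 device_state;		/* driver writes: e.g. RUNNING/STOPPED */
	u8 state_pending;		/* device sets while a transition is in flight */
	__le16 reserved;
	__le32 dirty_bitmap_len;	/* bytes */
	__le32 dirty_bitmap_addr_lo;	/* host memory the device DMAs dirty bits into */
	__le32 dirty_bitmap_addr_hi;
};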
On 10/11/2023 2:59 PM, Christoph Hellwig wrote:
> On Wed, Oct 11, 2023 at 02:43:37AM -0400, Michael S. Tsirkin wrote:
>>> Btw, what is that intel thing everyone is talking about? And why
>>> would the virtio core support vendor specific behavior like that?
>> It's not a thing it's Zhu Lingshan :) intel is just one of the vendors
>> that implemented vdpa support and so Zhu Lingshan from intel is working
>> on vdpa and has also proposed virtio spec extensions for migration.
>> intel's driver is called ifcvf. vdpa composes all this stuff that is
>> added to vfio in userspace, so it's a different approach.
> Well, so let's call it virtio live migration instead of intel.
>
> And please work all together in the virtio committee that you have
> one way of communication between controlling and controlled functions.
> If one extension does it one way and the other a different way that's
> just creating a giant mess.

I hope so. Jason Wang has proposed a solution to cooperate, but it was
sadly rejected...
On Thu, Oct 12, 2023 at 06:29:47PM +0800, Zhu, Lingshan wrote: > sorry for the late reply, we have discussed this for weeks in virtio mailing > list. I have proposed a live migration solution which is a config space solution. I'm sorry that can't be a serious proposal - config space can't do DMA, it is not suitable. Jason
On Fri, Oct 13, 2023 at 06:28:34PM +0800, Zhu, Lingshan wrote: > > > On 10/12/2023 9:27 PM, Jason Gunthorpe wrote: > > On Thu, Oct 12, 2023 at 06:29:47PM +0800, Zhu, Lingshan wrote: > > > sorry for the late reply, we have discussed this for weeks in virtio mailing > list. I have proposed a live migration solution which is a config space solution. > > I'm sorry that can't be a serious proposal - config space can't do > DMA, it is not suitable. > > config space only controls the live migration process and config the related > facilities. > We don't use config space to transfer data. > > The new added registers work like queue_enable or features. > > For example, we use DMA to report dirty pages and MMIO to fetch the dirty data. > > I remember in another thread you said:"you can't use DMA for any migration > flows" > > And I agree to that statement, so we use config space registers to control the > flow. > > Thanks, > Zhu Lingshan > > > Jason > If you are using dma then I don't see what's wrong with admin vq. dma is all it does.
On 10/13/2023 9:50 PM, Michael S. Tsirkin wrote: > On Fri, Oct 13, 2023 at 06:28:34PM +0800, Zhu, Lingshan wrote: >> >> On 10/12/2023 9:27 PM, Jason Gunthorpe wrote: >> >> On Thu, Oct 12, 2023 at 06:29:47PM +0800, Zhu, Lingshan wrote: >> >> >> sorry for the late reply, we have discussed this for weeks in virtio mailing >> list. I have proposed a live migration solution which is a config space solution. >> >> I'm sorry that can't be a serious proposal - config space can't do >> DMA, it is not suitable. >> >> config space only controls the live migration process and config the related >> facilities. >> We don't use config space to transfer data. >> >> The new added registers work like queue_enable or features. >> >> For example, we use DMA to report dirty pages and MMIO to fetch the dirty data. >> >> I remember in another thread you said:"you can't use DMA for any migration >> flows" >> >> And I agree to that statement, so we use config space registers to control the >> flow. >> >> Thanks, >> Zhu Lingshan >> >> >> Jason >> > If you are using dma then I don't see what's wrong with admin vq. > dma is all it does. dma != admin vq, and I think we have discussed many details in pros and cons in admin vq live migration proposal in virtio-comment. I am not sure we should span the discussions here, repeat them over again. Thanks >
On Mon, Oct 16, 2023 at 04:33:10PM +0800, Zhu, Lingshan wrote: > > > On 10/13/2023 9:50 PM, Michael S. Tsirkin wrote: > > On Fri, Oct 13, 2023 at 06:28:34PM +0800, Zhu, Lingshan wrote: > > > > > > On 10/12/2023 9:27 PM, Jason Gunthorpe wrote: > > > > > > On Thu, Oct 12, 2023 at 06:29:47PM +0800, Zhu, Lingshan wrote: > > > > > > > > > sorry for the late reply, we have discussed this for weeks in virtio mailing > > > list. I have proposed a live migration solution which is a config space solution. > > > > > > I'm sorry that can't be a serious proposal - config space can't do > > > DMA, it is not suitable. > > > > > > config space only controls the live migration process and config the related > > > facilities. > > > We don't use config space to transfer data. > > > > > > The new added registers work like queue_enable or features. > > > > > > For example, we use DMA to report dirty pages and MMIO to fetch the dirty data. > > > > > > I remember in another thread you said:"you can't use DMA for any migration > > > flows" > > > > > > And I agree to that statement, so we use config space registers to control the > > > flow. > > > > > > Thanks, > > > Zhu Lingshan > > > > > > > > > Jason > > > > > If you are using dma then I don't see what's wrong with admin vq. > > dma is all it does. > dma != admin vq, Well they share the same issue that they don't work for nesting because DMA can not be intercepted. > and I think we have discussed many details in pros and cons > in admin vq live migration proposal in virtio-comment. > I am not sure we should span the discussions here, repeat them over again. > > Thanks > > Yea let's not.
On 10/16/2023 4:52 PM, Michael S. Tsirkin wrote: > On Mon, Oct 16, 2023 at 04:33:10PM +0800, Zhu, Lingshan wrote: >> >> On 10/13/2023 9:50 PM, Michael S. Tsirkin wrote: >>> On Fri, Oct 13, 2023 at 06:28:34PM +0800, Zhu, Lingshan wrote: >>>> On 10/12/2023 9:27 PM, Jason Gunthorpe wrote: >>>> >>>> On Thu, Oct 12, 2023 at 06:29:47PM +0800, Zhu, Lingshan wrote: >>>> >>>> >>>> sorry for the late reply, we have discussed this for weeks in virtio mailing >>>> list. I have proposed a live migration solution which is a config space solution. >>>> >>>> I'm sorry that can't be a serious proposal - config space can't do >>>> DMA, it is not suitable. >>>> >>>> config space only controls the live migration process and config the related >>>> facilities. >>>> We don't use config space to transfer data. >>>> >>>> The new added registers work like queue_enable or features. >>>> >>>> For example, we use DMA to report dirty pages and MMIO to fetch the dirty data. >>>> >>>> I remember in another thread you said:"you can't use DMA for any migration >>>> flows" >>>> >>>> And I agree to that statement, so we use config space registers to control the >>>> flow. >>>> >>>> Thanks, >>>> Zhu Lingshan >>>> >>>> >>>> Jason >>>> >>> If you are using dma then I don't see what's wrong with admin vq. >>> dma is all it does. >> dma != admin vq, > Well they share the same issue that they don't work for nesting > because DMA can not be intercepted. (hope this is not a spam to virtualization list and I try to keep this short) only use dma for host memory access, e.g., dirty page bitmap, no need to intercepted. > >> and I think we have discussed many details in pros and cons >> in admin vq live migration proposal in virtio-comment. >> I am not sure we should span the discussions here, repeat them over again. >> >> Thanks > Yea let's not. >
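The dirty-page-bitmap point can be made concrete with a small driver-side sketch: the bitmap lives in host memory that the device fills via DMA, and the hypervisor only reads it. How the bitmap's address reaches the device (admin queue, config registers, ...) is exactly what is being debated, so that step is left abstract; everything below is illustrative and taken from neither proposal.

#include <linux/dma-mapping.h>
#include <linux/bitmap.h>

struct dirty_tracker {
	unsigned long *bitmap;	/* one bit per tracked guest page */
	dma_addr_t dma;
	size_t nbits;
};

static int dirty_tracker_alloc(struct device *dev, struct dirty_tracker *t,
			       size_t nr_pages)
{
	size_t bytes = BITS_TO_LONGS(nr_pages) * sizeof(unsigned long);

	t->bitmap = dma_alloc_coherent(dev, bytes, &t->dma, GFP_KERNEL);
	if (!t->bitmap)
		return -ENOMEM;
	t->nbits = nr_pages;
	/* hand (t->dma, bytes) to the device via the chosen control channel */
	return 0;
}

/* hypervisor side: walk the bitmap the device has been filling in */
static void dirty_tracker_collect(struct dirty_tracker *t,
				  void (*report)(unsigned long page_idx))
{
	unsigned long i;

	for_each_set_bit(i, t->bitmap, t->nbits)
		report(i);
}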
diff --git a/drivers/vfio/pci/virtio/cmd.c b/drivers/vfio/pci/virtio/cmd.c
new file mode 100644
index 000000000000..f068239cdbb0
--- /dev/null
+++ b/drivers/vfio/pci/virtio/cmd.c
@@ -0,0 +1,146 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+
+#include "cmd.h"
+
+int virtiovf_cmd_list_query(struct pci_dev *pdev, u8 *buf, int buf_size)
+{
+	struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev);
+	struct scatterlist out_sg;
+	struct virtio_admin_cmd cmd = {};
+
+	if (!virtio_dev)
+		return -ENOTCONN;
+
+	sg_init_one(&out_sg, buf, buf_size);
+	cmd.opcode = VIRTIO_ADMIN_CMD_LIST_QUERY;
+	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
+	cmd.result_sg = &out_sg;
+
+	return virtio_admin_cmd_exec(virtio_dev, &cmd);
+}
+
+int virtiovf_cmd_list_use(struct pci_dev *pdev, u8 *buf, int buf_size)
+{
+	struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev);
+	struct scatterlist in_sg;
+	struct virtio_admin_cmd cmd = {};
+
+	if (!virtio_dev)
+		return -ENOTCONN;
+
+	sg_init_one(&in_sg, buf, buf_size);
+	cmd.opcode = VIRTIO_ADMIN_CMD_LIST_USE;
+	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
+	cmd.data_sg = &in_sg;
+
+	return virtio_admin_cmd_exec(virtio_dev, &cmd);
+}
+
+int virtiovf_cmd_lr_write(struct virtiovf_pci_core_device *virtvdev, u16 opcode,
+			  u8 offset, u8 size, u8 *buf)
+{
+	struct virtio_device *virtio_dev =
+		virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev);
+	struct virtio_admin_cmd_data_lr_write *in;
+	struct scatterlist in_sg;
+	struct virtio_admin_cmd cmd = {};
+	int ret;
+
+	if (!virtio_dev)
+		return -ENOTCONN;
+
+	in = kzalloc(sizeof(*in) + size, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	in->offset = offset;
+	memcpy(in->registers, buf, size);
+	sg_init_one(&in_sg, in, sizeof(*in) + size);
+	cmd.opcode = opcode;
+	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
+	cmd.group_member_id = virtvdev->vf_id + 1;
+	cmd.data_sg = &in_sg;
+	ret = virtio_admin_cmd_exec(virtio_dev, &cmd);
+
+	kfree(in);
+	return ret;
+}
+
+int virtiovf_cmd_lr_read(struct virtiovf_pci_core_device *virtvdev, u16 opcode,
+			 u8 offset, u8 size, u8 *buf)
+{
+	struct virtio_device *virtio_dev =
+		virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev);
+	struct virtio_admin_cmd_data_lr_read *in;
+	struct scatterlist in_sg, out_sg;
+	struct virtio_admin_cmd cmd = {};
+	int ret;
+
+	if (!virtio_dev)
+		return -ENOTCONN;
+
+	in = kzalloc(sizeof(*in), GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	in->offset = offset;
+	sg_init_one(&in_sg, in, sizeof(*in));
+	sg_init_one(&out_sg, buf, size);
+	cmd.opcode = opcode;
+	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
+	cmd.data_sg = &in_sg;
+	cmd.result_sg = &out_sg;
+	cmd.group_member_id = virtvdev->vf_id + 1;
+	ret = virtio_admin_cmd_exec(virtio_dev, &cmd);
+
+	kfree(in);
+	return ret;
+}
+
+int virtiovf_cmd_lq_read_notify(struct virtiovf_pci_core_device *virtvdev,
+				u8 req_bar_flags, u8 *bar, u64 *bar_offset)
+{
+	struct virtio_device *virtio_dev =
+		virtio_pci_vf_get_pf_dev(virtvdev->core_device.pdev);
+	struct virtio_admin_cmd_notify_info_result *out;
+	struct scatterlist out_sg;
+	struct virtio_admin_cmd cmd = {};
+	int ret;
+
+	if (!virtio_dev)
+		return -ENOTCONN;
+
+	out = kzalloc(sizeof(*out), GFP_KERNEL);
+	if (!out)
+		return -ENOMEM;
+
+	sg_init_one(&out_sg, out, sizeof(*out));
+	cmd.opcode = VIRTIO_ADMIN_CMD_LEGACY_NOTIFY_INFO;
+	cmd.group_type = VIRTIO_ADMIN_GROUP_TYPE_SRIOV;
+	cmd.result_sg = &out_sg;
+	cmd.group_member_id = virtvdev->vf_id + 1;
+	ret = virtio_admin_cmd_exec(virtio_dev, &cmd);
+	if (!ret) {
+		struct virtio_admin_cmd_notify_info_data *entry;
+		int i;
+
+		ret = -ENOENT;
+		for (i = 0; i < VIRTIO_ADMIN_CMD_MAX_NOTIFY_INFO; i++) {
+			entry = &out->entries[i];
+			if (entry->flags == VIRTIO_ADMIN_CMD_NOTIFY_INFO_FLAGS_END)
+				break;
+			if (entry->flags != req_bar_flags)
+				continue;
+			*bar = entry->bar;
+			*bar_offset = le64_to_cpu(entry->offset);
+			ret = 0;
+			break;
+		}
+	}
+
+	kfree(out);
+	return ret;
+}
diff --git a/drivers/vfio/pci/virtio/cmd.h b/drivers/vfio/pci/virtio/cmd.h
new file mode 100644
index 000000000000..c2a3645f4b90
--- /dev/null
+++ b/drivers/vfio/pci/virtio/cmd.h
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ */
+
+#ifndef VIRTIO_VFIO_CMD_H
+#define VIRTIO_VFIO_CMD_H
+
+#include <linux/kernel.h>
+#include <linux/virtio.h>
+#include <linux/vfio_pci_core.h>
+#include <linux/virtio_pci.h>
+
+struct virtiovf_pci_core_device {
+	struct vfio_pci_core_device core_device;
+	int vf_id;
+};
+
+int virtiovf_cmd_list_query(struct pci_dev *pdev, u8 *buf, int buf_size);
+int virtiovf_cmd_list_use(struct pci_dev *pdev, u8 *buf, int buf_size);
+int virtiovf_cmd_lr_write(struct virtiovf_pci_core_device *virtvdev, u16 opcode,
+			  u8 offset, u8 size, u8 *buf);
+int virtiovf_cmd_lr_read(struct virtiovf_pci_core_device *virtvdev, u16 opcode,
+			 u8 offset, u8 size, u8 *buf);
+int virtiovf_cmd_lq_read_notify(struct virtiovf_pci_core_device *virtvdev,
+				u8 req_bar_flags, u8 *bar, u64 *bar_offset);
+#endif /* VIRTIO_VFIO_CMD_H */
Expose admin commands over the virtio device, to be used by the
vfio-virtio driver in the next patches.

It includes: list query/use, legacy write/read, read notify_info.

Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
---
 drivers/vfio/pci/virtio/cmd.c | 146 ++++++++++++++++++++++++++++++++++
 drivers/vfio/pci/virtio/cmd.h |  27 +++++++
 2 files changed, 173 insertions(+)
 create mode 100644 drivers/vfio/pci/virtio/cmd.c
 create mode 100644 drivers/vfio/pci/virtio/cmd.h
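As a closing illustration, this is roughly how the vfio-virtio driver introduced later in the series might consume the helpers above during setup: query the PF's supported admin opcodes, acknowledge the ones it will use, and look up where legacy notifications should be issued. The flow, buffer size and flag handling are assumptions; only the virtiovf_cmd_*() helpers and struct virtiovf_pci_core_device come from this patch.

/* Hedged sketch of a probe-time consumer of cmd.h; not part of the patch. */
static int virtiovf_init_admin_caps(struct virtiovf_pci_core_device *virtvdev,
				    u8 req_notify_flags)
{
	struct pci_dev *pdev = virtvdev->core_device.pdev;
	u8 opcodes[8] = {};	/* admin opcode bitmap, size assumed */
	u64 notify_offset;
	u8 notify_bar;
	int ret;

	ret = virtiovf_cmd_list_query(pdev, opcodes, sizeof(opcodes));
	if (ret)
		return ret;

	/* tell the device which of the advertised opcodes will be used */
	ret = virtiovf_cmd_list_use(pdev, opcodes, sizeof(opcodes));
	if (ret)
		return ret;

	/* cache where legacy queue notifications should be written */
	ret = virtiovf_cmd_lq_read_notify(virtvdev, req_notify_flags,
					  &notify_bar, &notify_offset);
	if (ret)
		return ret;

	/* stash notify_bar/notify_offset in virtvdev for the fast path */
	return 0;
}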