@@ -5,6 +5,7 @@ iommufd-y := \
io_pagetable.o \
ioas.o \
main.o \
- pages.o
+ pages.o \
+ vfio_compat.o
obj-$(CONFIG_IOMMUFD) += iommufd.o
@@ -18,6 +18,7 @@ struct iommufd_ctx {
struct xarray objects;
u8 account_mode;
+ struct iommufd_ioas *vfio_ioas;
};
/*
@@ -91,6 +92,9 @@ struct iommufd_ucmd {
void *cmd;
};
+int iommufd_vfio_ioctl(struct iommufd_ctx *ictx, unsigned int cmd,
+ unsigned long arg);
+
/* Copy the response in ucmd->cmd back to userspace. */
static inline int iommufd_ucmd_respond(struct iommufd_ucmd *ucmd,
size_t cmd_len)
@@ -221,6 +225,8 @@ int iommufd_ioas_option(struct iommufd_ucmd *ucmd);
int iommufd_option_rlimit_mode(struct iommu_option *cmd,
struct iommufd_ctx *ictx);
+int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd);
+
/*
* A HW pagetable is called an iommu_domain inside the kernel. This user object
* allows directly creating and inspecting the domains. Domains that have kernel
@@ -134,6 +134,8 @@ bool iommufd_object_destroy_user(struct iommufd_ctx *ictx,
return false;
}
__xa_erase(&ictx->objects, obj->id);
+ if (ictx->vfio_ioas && &ictx->vfio_ioas->obj == obj)
+ ictx->vfio_ioas = NULL;
xa_unlock(&ictx->objects);
up_write(&obj->destroy_rwsem);
@@ -266,27 +268,31 @@ static struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
length),
IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option,
val64),
+ IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct iommu_vfio_ioas,
+ __reserved),
};
static long iommufd_fops_ioctl(struct file *filp, unsigned int cmd,
unsigned long arg)
{
+ struct iommufd_ctx *ictx = filp->private_data;
struct iommufd_ucmd ucmd = {};
struct iommufd_ioctl_op *op;
union ucmd_buffer buf;
unsigned int nr;
int ret;
- ucmd.ictx = filp->private_data;
+ nr = _IOC_NR(cmd);
+ if (nr < IOMMUFD_CMD_BASE ||
+ (nr - IOMMUFD_CMD_BASE) >= ARRAY_SIZE(iommufd_ioctl_ops))
+ return iommufd_vfio_ioctl(ictx, cmd, arg);
+
+ ucmd.ictx = ictx;
ucmd.ubuffer = (void __user *)arg;
ret = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer);
if (ret)
return ret;
- nr = _IOC_NR(cmd);
- if (nr < IOMMUFD_CMD_BASE ||
- (nr - IOMMUFD_CMD_BASE) >= ARRAY_SIZE(iommufd_ioctl_ops))
- return -ENOIOCTLCMD;
op = &iommufd_ioctl_ops[nr - IOMMUFD_CMD_BASE];
if (op->ioctl_num != cmd)
return -ENOIOCTLCMD;
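
With the dispatch change above, an ioctl number below IOMMUFD_CMD_BASE (or past the end of iommufd_ioctl_ops) is no longer rejected outright but handed to iommufd_vfio_ioctl(). A minimal userspace sketch of the effect (hypothetical example, not part of the patch; error handling mostly elided):

/* Hypothetical userspace sketch, not part of the patch: a legacy VFIO
 * ioctl issued directly on an iommufd.
 */
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

int open_compat_iommufd(void)
{
	int fd = open("/dev/iommu", O_RDWR);

	if (fd < 0)
		return -1;
	/* _IOC_NR(VFIO_GET_API_VERSION) is below IOMMUFD_CMD_BASE, so
	 * iommufd_fops_ioctl() routes it to iommufd_vfio_ioctl().
	 */
	if (ioctl(fd, VFIO_GET_API_VERSION) != VFIO_API_VERSION) {
		close(fd);
		return -1;
	}
	return fd;
}
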
new file mode 100644
@@ -0,0 +1,450 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
+ */
+#include <linux/file.h>
+#include <linux/interval_tree.h>
+#include <linux/iommu.h>
+#include <linux/iommufd.h>
+#include <linux/slab.h>
+#include <linux/vfio.h>
+#include <uapi/linux/vfio.h>
+#include <uapi/linux/iommufd.h>
+
+#include "iommufd_private.h"
+
+static struct iommufd_ioas *get_compat_ioas(struct iommufd_ctx *ictx)
+{
+ struct iommufd_ioas *ioas = ERR_PTR(-ENODEV);
+
+ xa_lock(&ictx->objects);
+ if (!ictx->vfio_ioas || !iommufd_lock_obj(&ictx->vfio_ioas->obj))
+ goto out_unlock;
+ ioas = ictx->vfio_ioas;
+out_unlock:
+ xa_unlock(&ictx->objects);
+ return ioas;
+}
+
+/**
+ * iommufd_vfio_compat_ioas_id - Return the IOAS ID that vfio should use
+ * @ictx: Context to operate on
+ * @out_ioas_id: The IOAS ID the caller should use
+ *
+ * The compatibility IOAS is the IOAS that the vfio compatibility ioctls operate
+ * on since they do not have an IOAS ID input in their ABI. Only attaching a
+ * group should cause a default creation of the internal ioas; this returns the
+ * existing ioas if one has already been assigned.
+ */
+int iommufd_vfio_compat_ioas_id(struct iommufd_ctx *ictx, u32 *out_ioas_id)
+{
+ struct iommufd_ioas *ioas = NULL;
+ struct iommufd_ioas *out_ioas;
+
+ ioas = iommufd_ioas_alloc(ictx);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+
+ xa_lock(&ictx->objects);
+ if (ictx->vfio_ioas && iommufd_lock_obj(&ictx->vfio_ioas->obj)) {
+ out_ioas = ictx->vfio_ioas;
+ } else {
+ out_ioas = ioas;
+ ictx->vfio_ioas = ioas;
+ }
+ xa_unlock(&ictx->objects);
+
+ *out_ioas_id = out_ioas->obj.id;
+ if (out_ioas != ioas) {
+ iommufd_put_object(&out_ioas->obj);
+ iommufd_object_abort(ictx, &ioas->obj);
+ return 0;
+ }
+ iommufd_object_finalize(ictx, &ioas->obj);
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_id, IOMMUFD_VFIO);
+
+int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_vfio_ioas *cmd = ucmd->cmd;
+ struct iommufd_ioas *ioas;
+
+ if (cmd->__reserved)
+ return -EOPNOTSUPP;
+ switch (cmd->op) {
+ case IOMMU_VFIO_IOAS_GET:
+ ioas = get_compat_ioas(ucmd->ictx);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+ cmd->ioas_id = ioas->obj.id;
+ iommufd_put_object(&ioas->obj);
+ return iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+
+ case IOMMU_VFIO_IOAS_SET:
+ ioas = iommufd_get_ioas(ucmd, cmd->ioas_id);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+ xa_lock(&ucmd->ictx->objects);
+ ucmd->ictx->vfio_ioas = ioas;
+ xa_unlock(&ucmd->ictx->objects);
+ iommufd_put_object(&ioas->obj);
+ return 0;
+
+ case IOMMU_VFIO_IOAS_CLEAR:
+ xa_lock(&ucmd->ictx->objects);
+ ucmd->ictx->vfio_ioas = NULL;
+ xa_unlock(&ucmd->ictx->objects);
+ return 0;
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+static int iommufd_vfio_map_dma(struct iommufd_ctx *ictx, unsigned int cmd,
+ void __user *arg)
+{
+ u32 supported_flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+ size_t minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
+ struct vfio_iommu_type1_dma_map map;
+ int iommu_prot = IOMMU_CACHE;
+ struct iommufd_ioas *ioas;
+ unsigned long iova;
+ int rc;
+
+ if (copy_from_user(&map, arg, minsz))
+ return -EFAULT;
+
+ if (map.argsz < minsz || map.flags & ~supported_flags)
+ return -EINVAL;
+
+ if (map.flags & VFIO_DMA_MAP_FLAG_READ)
+ iommu_prot |= IOMMU_READ;
+ if (map.flags & VFIO_DMA_MAP_FLAG_WRITE)
+ iommu_prot |= IOMMU_WRITE;
+
+ ioas = get_compat_ioas(ictx);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+
+ /*
+ * Maps created through the legacy interface always use VFIO-compatible
+ * rlimit accounting. If the user wishes to use the faster user-based
+ * rlimit accounting then they must use the new interface.
+ */
+ iova = map.iova;
+ rc = iopt_map_user_pages(ictx, &ioas->iopt, &iova,
+ u64_to_user_ptr(map.vaddr), map.size,
+ iommu_prot, 0);
+ iommufd_put_object(&ioas->obj);
+ return rc;
+}
+
+static int iommufd_vfio_unmap_dma(struct iommufd_ctx *ictx, unsigned int cmd,
+ void __user *arg)
+{
+ size_t minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
+ /*
+ * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP is obsoleted by the new
+ * dirty tracking direction:
+ * https://lore.kernel.org/kvm/20220731125503.142683-1-yishaih@nvidia.com/
+ * https://lore.kernel.org/kvm/20220428210933.3583-1-joao.m.martins@oracle.com/
+ */
+ u32 supported_flags = VFIO_DMA_UNMAP_FLAG_ALL;
+ struct vfio_iommu_type1_dma_unmap unmap;
+ struct iommufd_ioas *ioas;
+ unsigned long unmapped;
+ int rc;
+
+ if (copy_from_user(&unmap, arg, minsz))
+ return -EFAULT;
+
+ if (unmap.argsz < minsz || unmap.flags & ~supported_flags)
+ return -EINVAL;
+
+ ioas = get_compat_ioas(ictx);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+
+ if (unmap.flags & VFIO_DMA_UNMAP_FLAG_ALL) {
+ if (unmap.iova != 0 || unmap.size != 0) {
+ rc = -EINVAL;
+ goto err_put;
+ }
+ rc = iopt_unmap_all(&ioas->iopt, &unmapped);
+ } else {
+ if (READ_ONCE(ioas->iopt.disable_large_pages)) {
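+ /*
+ * Create cuts at the start and last of the requested
+ * range. If the start IOVA is 0 then it doesn't need to
+ * be cut.
+ */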
+ unsigned long iovas[] = { unmap.iova + unmap.size - 1,
+ unmap.iova - 1 };
+
+ rc = iopt_cut_iova(&ioas->iopt, iovas,
+ unmap.iova ? 2 : 1);
+ if (rc)
+ goto err_put;
+ }
+ rc = iopt_unmap_iova(&ioas->iopt, unmap.iova, unmap.size,
+ &unmapped);
+ }
+ unmap.size = unmapped;
+ if (copy_to_user(arg, &unmap, minsz))
+ rc = -EFAULT;
+
+err_put:
+ iommufd_put_object(&ioas->obj);
+ return rc;
+}
+
+static int iommufd_vfio_cc_iommu(struct iommufd_ctx *ictx)
+{
+ struct iommufd_hw_pagetable *hwpt;
+ struct iommufd_ioas *ioas;
+ int rc = 1;
+
+ ioas = get_compat_ioas(ictx);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+
+ mutex_lock(&ioas->mutex);
+ list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) {
+ if (!hwpt->enforce_cache_coherency) {
+ rc = 0;
+ break;
+ }
+ }
+ mutex_unlock(&ioas->mutex);
+
+ iommufd_put_object(&ioas->obj);
+ return rc;
+}
+
+static int iommufd_vfio_check_extension(struct iommufd_ctx *ictx,
+ unsigned long type)
+{
+ switch (type) {
+ case VFIO_TYPE1_IOMMU:
+ case VFIO_TYPE1v2_IOMMU:
+ case VFIO_UNMAP_ALL:
+ return 1;
+
+ case VFIO_DMA_CC_IOMMU:
+ return iommufd_vfio_cc_iommu(ictx);
+
+ /*
+ * This is obsolete, and to be removed from VFIO. It was an incomplete
+ * idea that got merged.
+ * https://lore.kernel.org/kvm/0-v1-0093c9b0e345+19-vfio_no_nesting_jgg@nvidia.com/
+ */
+ case VFIO_TYPE1_NESTING_IOMMU:
+ return 0;
+
+ /*
+ * VFIO_DMA_MAP_FLAG_VADDR
+ * https://lore.kernel.org/kvm/1611939252-7240-1-git-send-email-steven.sistare@oracle.com/
+ * https://lore.kernel.org/all/Yz777bJZjTyLrHEQ@nvidia.com/
+ *
+ * It is hard to see how this could be implemented safely.
+ */
+ case VFIO_UPDATE_VADDR:
+ default:
+ return 0;
+ }
+}
+
+static int iommufd_vfio_set_iommu(struct iommufd_ctx *ictx, unsigned long type)
+{
+ struct iommufd_ioas *ioas = NULL;
+ int rc = 0;
+
+ if (type != VFIO_TYPE1_IOMMU && type != VFIO_TYPE1v2_IOMMU)
+ return -EINVAL;
+
+ /* VFIO fails the set_iommu if there is no group */
+ ioas = get_compat_ioas(ictx);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+ if (type == VFIO_TYPE1_IOMMU)
+ rc = iopt_disable_large_pages(&ioas->iopt);
+ iommufd_put_object(&ioas->obj);
+ return rc;
+}
+
+static unsigned long iommufd_get_pagesizes(struct iommufd_ioas *ioas)
+{
+ struct io_pagetable *iopt = &ioas->iopt;
+ unsigned long pgsize_bitmap = ULONG_MAX;
+ struct iommu_domain *domain;
+ unsigned long index;
+
+ down_read(&iopt->domains_rwsem);
+ xa_for_each(&iopt->domains, index, domain)
+ pgsize_bitmap &= domain->pgsize_bitmap;
+
+ /* See vfio_update_pgsize_bitmap() */
+ if (pgsize_bitmap & ~PAGE_MASK) {
+ pgsize_bitmap &= PAGE_MASK;
+ pgsize_bitmap |= PAGE_SIZE;
+ }
+ pgsize_bitmap = max(pgsize_bitmap, ioas->iopt.iova_alignment);
+ up_read(&iopt->domains_rwsem);
+ return pgsize_bitmap;
+}
+
+static int iommufd_fill_cap_iova(struct iommufd_ioas *ioas,
+ struct vfio_info_cap_header __user *cur,
+ size_t avail)
+{
+ struct vfio_iommu_type1_info_cap_iova_range __user *ucap_iovas =
+ container_of(cur,
+ struct vfio_iommu_type1_info_cap_iova_range __user,
+ header);
+ struct vfio_iommu_type1_info_cap_iova_range cap_iovas = {
+ .header = {
+ .id = VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE,
+ .version = 1,
+ },
+ };
+ struct interval_tree_span_iter span;
+
+ interval_tree_for_each_span(&span, &ioas->iopt.reserved_itree, 0,
+ ULONG_MAX) {
+ struct vfio_iova_range range;
+
+ if (!span.is_hole)
+ continue;
+ range.start = span.start_hole;
+ range.end = span.last_hole;
+ if (avail >= struct_size(&cap_iovas, iova_ranges,
+ cap_iovas.nr_iovas + 1) &&
+ copy_to_user(&ucap_iovas->iova_ranges[cap_iovas.nr_iovas],
+ &range, sizeof(range)))
+ return -EFAULT;
+ cap_iovas.nr_iovas++;
+ }
+ if (avail >= struct_size(&cap_iovas, iova_ranges, cap_iovas.nr_iovas) &&
+ copy_to_user(ucap_iovas, &cap_iovas, sizeof(cap_iovas)))
+ return -EFAULT;
+ return struct_size(&cap_iovas, iova_ranges, cap_iovas.nr_iovas);
+}
+
+static int iommufd_fill_cap_dma_avail(struct iommufd_ioas *ioas,
+ struct vfio_info_cap_header __user *cur,
+ size_t avail)
+{
+ struct vfio_iommu_type1_info_dma_avail cap_dma = {
+ .header = {
+ .id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL,
+ .version = 1,
+ },
+ /* iommufd has no limit; return the same value as VFIO. */
+ .avail = U16_MAX,
+ };
+
+ if (avail >= sizeof(cap_dma) &&
+ copy_to_user(cur, &cap_dma, sizeof(cap_dma)))
+ return -EFAULT;
+ return sizeof(cap_dma);
+}
+
+static int iommufd_vfio_iommu_get_info(struct iommufd_ctx *ictx,
+ void __user *arg)
+{
+ typedef int (*fill_cap_fn)(struct iommufd_ioas *ioas,
+ struct vfio_info_cap_header __user *cur,
+ size_t avail);
+ static const fill_cap_fn fill_fns[] = {
+ iommufd_fill_cap_iova,
+ iommufd_fill_cap_dma_avail,
+ };
+ size_t minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
+ struct vfio_info_cap_header __user *last_cap = NULL;
+ struct vfio_iommu_type1_info info;
+ struct iommufd_ioas *ioas;
+ size_t total_cap_size;
+ int rc;
+ int i;
+
+ if (copy_from_user(&info, arg, minsz))
+ return -EFAULT;
+
+ if (info.argsz < minsz)
+ return -EINVAL;
+ minsz = min_t(size_t, info.argsz, sizeof(info));
+
+ ioas = get_compat_ioas(ictx);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+
+ down_read(&ioas->iopt.iova_rwsem);
+ info.flags = VFIO_IOMMU_INFO_PGSIZES;
+ info.iova_pgsizes = iommufd_get_pagesizes(ioas);
+ info.cap_offset = 0;
+
+ total_cap_size = sizeof(info);
+ for (i = 0; i != ARRAY_SIZE(fill_fns); i++) {
+ int cap_size;
+
+ if (info.argsz > total_cap_size)
+ cap_size = fill_fns[i](ioas, arg + total_cap_size,
+ info.argsz - total_cap_size);
+ else
+ cap_size = fill_fns[i](ioas, NULL, 0);
+ if (cap_size < 0) {
+ rc = cap_size;
+ goto out_put;
+ }
+ if (last_cap && info.argsz >= total_cap_size &&
+ put_user(total_cap_size, &last_cap->next)) {
+ rc = -EFAULT;
+ goto out_put;
+ }
+ last_cap = arg + total_cap_size;
+ total_cap_size += cap_size;
+ }
+
+ /*
+ * If the user did not provide enough space then only some caps are
+ * returned and the argsz will be updated to the correct amount to get
+ * all caps.
+ */
+ if (info.argsz >= total_cap_size)
+ info.cap_offset = sizeof(info);
+ info.argsz = total_cap_size;
+ info.flags |= VFIO_IOMMU_INFO_CAPS;
+ if (copy_to_user(arg, &info, minsz))
+ rc = -EFAULT;
+ else
+ rc = 0;
+
+out_put:
+ up_read(&ioas->iopt.iova_rwsem);
+ iommufd_put_object(&ioas->obj);
+ return rc;
+}
+
+int iommufd_vfio_ioctl(struct iommufd_ctx *ictx, unsigned int cmd,
+ unsigned long arg)
+{
+ void __user *uarg = (void __user *)arg;
+
+ switch (cmd) {
+ case VFIO_GET_API_VERSION:
+ return VFIO_API_VERSION;
+ case VFIO_SET_IOMMU:
+ return iommufd_vfio_set_iommu(ictx, arg);
+ case VFIO_CHECK_EXTENSION:
+ return iommufd_vfio_check_extension(ictx, arg);
+ case VFIO_IOMMU_GET_INFO:
+ return iommufd_vfio_iommu_get_info(ictx, uarg);
+ case VFIO_IOMMU_MAP_DMA:
+ return iommufd_vfio_map_dma(ictx, cmd, uarg);
+ case VFIO_IOMMU_UNMAP_DMA:
+ return iommufd_vfio_unmap_dma(ictx, cmd, uarg);
+ case VFIO_IOMMU_DIRTY_PAGES:
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
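
Taken together these handlers let an unmodified type1 user drive an iommufd. A hedged userspace sketch of the legacy map flow (illustrative function name; assumes the compatibility IOAS already exists, e.g. because a group was attached or IOMMU_VFIO_IOAS_SET was issued, since iommufd_vfio_set_iommu() otherwise fails with -ENODEV):

/* Hedged userspace sketch of the legacy type1 flow over iommufd.
 * Names are illustrative, not part of either ABI.
 */
#include <stddef.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int compat_map(int iommufd, void *buf, size_t len, __u64 iova)
{
	struct vfio_iommu_type1_dma_map map = {
		.argsz = sizeof(map),
		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
		.vaddr = (uintptr_t)buf,
		.iova = iova,
		.size = len,
	};

	/* TYPE1v2 keeps large pages enabled; plain TYPE1 disables them
	 * via iopt_disable_large_pages() in iommufd_vfio_set_iommu().
	 */
	if (ioctl(iommufd, VFIO_SET_IOMMU, VFIO_TYPE1v2_IOMMU))
		return -1;
	return ioctl(iommufd, VFIO_IOMMU_MAP_DMA, &map);
}

Unmap mirrors this through struct vfio_iommu_type1_dma_unmap, with VFIO_DMA_UNMAP_FLAG_ALL mapping to iopt_unmap_all().
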
@@ -57,6 +57,7 @@ void iommufd_access_unpin_pages(struct iommufd_access *access,
unsigned long iova, unsigned long length);
int iommufd_access_rw(struct iommufd_access *access, unsigned long iova,
void *data, size_t len, unsigned int flags);
+int iommufd_vfio_compat_ioas_id(struct iommufd_ctx *ictx, u32 *out_ioas_id);
#else /* !CONFIG_IOMMUFD */
static inline struct iommufd_ctx *iommufd_ctx_from_file(struct file *file)
{
@@ -87,5 +88,11 @@ static inline int iommufd_access_rw(struct iommufd_access *access, unsigned long
{
return -EOPNOTSUPP;
}
+
+static inline int iommufd_vfio_compat_ioas_id(struct iommufd_ctx *ictx,
+ u32 *out_ioas_id)
+{
+ return -EOPNOTSUPP;
+}
#endif /* CONFIG_IOMMUFD */
#endif
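
The stubs above keep CONFIG_IOMMUFD=n builds working. For the enabled case, a hedged sketch of how a kernel-side caller might use the export (the function below is illustrative, not an existing symbol; the real caller is expected to be VFIO's container compatibility code):

/* Hedged kernel-side sketch, loosely modeled on what a container
 * compatibility path might do when a group is attached.
 */
#include <linux/iommufd.h>

static int example_get_compat_ioas(struct iommufd_ctx *ictx)
{
	u32 ioas_id;
	int rc;

	/* Creates the compat IOAS on first use, or returns the one
	 * previously pinned with IOMMU_VFIO_IOAS_SET.
	 */
	rc = iommufd_vfio_compat_ioas_id(ictx, &ioas_id);
	if (rc)
		return rc;

	/* ... attach the group's devices using ioas_id ... */
	return 0;
}
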
@@ -44,6 +44,7 @@ enum {
IOMMUFD_CMD_IOAS_MAP,
IOMMUFD_CMD_IOAS_UNMAP,
IOMMUFD_CMD_OPTION,
+ IOMMUFD_CMD_VFIO_IOAS,
};
/**
@@ -291,4 +292,39 @@ struct iommu_option {
__aligned_u64 val64;
};
#define IOMMU_OPTION _IO(IOMMUFD_TYPE, IOMMUFD_CMD_OPTION)
+
+/**
+ * enum iommufd_vfio_ioas_op - IOMMU_VFIO_IOAS_* ioctls
+ * @IOMMU_VFIO_IOAS_GET: Get the current compatibility IOAS
+ * @IOMMU_VFIO_IOAS_SET: Change the current compatibility IOAS
+ * @IOMMU_VFIO_IOAS_CLEAR: Disable VFIO compatibility
+ */
+enum iommufd_vfio_ioas_op {
+ IOMMU_VFIO_IOAS_GET = 0,
+ IOMMU_VFIO_IOAS_SET = 1,
+ IOMMU_VFIO_IOAS_CLEAR = 2,
+};
+
+/**
+ * struct iommu_vfio_ioas - ioctl(IOMMU_VFIO_IOAS)
+ * @size: sizeof(struct iommu_vfio_ioas)
+ * @ioas_id: For IOMMU_VFIO_IOAS_SET the input IOAS ID to set
+ * For IOMMU_VFIO_IOAS_GET will output the IOAS ID
+ * @op: One of enum iommufd_vfio_ioas_op
+ * @__reserved: Must be 0
+ *
+ * The VFIO compatibility support uses a single ioas because VFIO APIs do not
+ * support the ID field. Set or Get the IOAS that VFIO compatibility will use.
+ * When VFIO_GROUP_SET_CONTAINER is used on an iommufd it will get the
+ * compatibility ioas, either by taking what is already set, or auto creating
+ * one. From then on VFIO will continue to use that ioas and is not affected by
+ * this ioctl. SET or CLEAR does not destroy any auto-created IOAS.
+ */
+struct iommu_vfio_ioas {
+ __u32 size;
+ __u32 ioas_id;
+ __u16 op;
+ __u16 __reserved;
+};
+#define IOMMU_VFIO_IOAS _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VFIO_IOAS)
#endif
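
To close, a hypothetical userspace sketch of the new ioctl itself (illustrative name and flow; assumes an open iommufd and a previously created IOAS; error handling elided):

/* Hypothetical userspace sketch: pin the compatibility IOAS to one the
 * caller created, then read it back.
 */
#include <sys/ioctl.h>
#include <linux/iommufd.h>

static int pin_compat_ioas(int iommufd, __u32 ioas_id)
{
	struct iommu_vfio_ioas cmd = {
		.size = sizeof(cmd),	/* checked by iommufd_fops_ioctl() */
		.ioas_id = ioas_id,
		.op = IOMMU_VFIO_IOAS_SET,
		/* .__reserved stays 0 or the kernel returns -EOPNOTSUPP */
	};

	if (ioctl(iommufd, IOMMU_VFIO_IOAS, &cmd))
		return -1;

	cmd.op = IOMMU_VFIO_IOAS_GET;
	if (ioctl(iommufd, IOMMU_VFIO_IOAS, &cmd))
		return -1;
	return cmd.ioas_id == ioas_id ? 0 : -1;
}
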