@@ -6304,6 +6304,13 @@ state. At VM creation time, all memory is shared, i.e. the PRIVATE attribute
is '0' for all gfns. Userspace can control whether memory is shared/private by
toggling KVM_MEMORY_ATTRIBUTE_PRIVATE via KVM_SET_MEMORY_ATTRIBUTES as needed.
+Userspace can set KVM_MEM_VFIO_DMABUF in flags to indicate that the memory
+region is backed by a userspace-unmappable dma-buf exported by VFIO. The
+backing resource is a single MMIO region of the device. Because the slot is
+not mappable by userspace, it may be converted to private. KVM binds the
+memory region to the dma-buf fd range [0, memory_size). For now, the dma-buf
+fd is passed in the 'guest_memfd' field, and guest_memfd_offset must be 0.
+
S390:
^^^^^
@@ -606,6 +606,10 @@ struct kvm_memory_slot {
pgoff_t pgoff;
} gmem;
#endif
+
+#ifdef CONFIG_KVM_VFIO_DMABUF
+ struct dma_buf_attachment *dmabuf_attach;
+#endif
};
static inline bool kvm_slot_can_be_private(const struct kvm_memory_slot *slot)
@@ -2568,4 +2572,18 @@ static inline int kvm_enable_virtualization(void) { return 0; }
static inline void kvm_disable_virtualization(void) { }
#endif
+#ifdef CONFIG_KVM_VFIO_DMABUF
+int kvm_vfio_dmabuf_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
+ gfn_t gfn, kvm_pfn_t *pfn, int *max_order);
+#else
+static inline int kvm_vfio_dmabuf_get_pfn(struct kvm *kvm,
+					   struct kvm_memory_slot *slot,
+					   gfn_t gfn, kvm_pfn_t *pfn,
+					   int *max_order)
+{
+ KVM_BUG_ON(1, kvm);
+ return -EIO;
+}
+#endif
+
#endif
@@ -51,6 +51,7 @@ struct kvm_userspace_memory_region2 {
#define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0)
#define KVM_MEM_READONLY (1UL << 1)
#define KVM_MEM_GUEST_MEMFD (1UL << 2)
+#define KVM_MEM_VFIO_DMABUF (1UL << 3)
/* for KVM_IRQ_LINE */
struct kvm_irq_level {
@@ -115,6 +115,7 @@ config KVM_PRIVATE_MEM
config KVM_GENERIC_PRIVATE_MEM
select KVM_GENERIC_MEMORY_ATTRIBUTES
select KVM_PRIVATE_MEM
+ select KVM_VFIO_DMABUF
bool
config HAVE_KVM_ARCH_GMEM_PREPARE
@@ -124,3 +125,8 @@ config HAVE_KVM_ARCH_GMEM_PREPARE
config HAVE_KVM_ARCH_GMEM_INVALIDATE
bool
depends on KVM_PRIVATE_MEM
+
+config KVM_VFIO_DMABUF
+ bool
+ select DMA_SHARED_BUFFER
+ select DMABUF_MOVE_NOTIFY
@@ -13,3 +13,4 @@ kvm-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o
kvm-$(CONFIG_HAVE_KVM_DIRTY_RING) += $(KVM)/dirty_ring.o
kvm-$(CONFIG_HAVE_KVM_PFNCACHE) += $(KVM)/pfncache.o
kvm-$(CONFIG_KVM_PRIVATE_MEM) += $(KVM)/guest_memfd.o
+kvm-$(CONFIG_KVM_VFIO_DMABUF) += $(KVM)/vfio_dmabuf.o
@@ -938,6 +938,8 @@ static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
if (slot->flags & KVM_MEM_GUEST_MEMFD)
kvm_gmem_unbind(slot);
+ else if (slot->flags & KVM_MEM_VFIO_DMABUF)
+ kvm_vfio_dmabuf_unbind(slot);
kvm_destroy_dirty_bitmap(slot);
@@ -1526,13 +1528,19 @@ static void kvm_replace_memslot(struct kvm *kvm,
static int check_memory_region_flags(struct kvm *kvm,
const struct kvm_userspace_memory_region2 *mem)
{
+ u32 private_mask = KVM_MEM_GUEST_MEMFD | KVM_MEM_VFIO_DMABUF;
+ u32 private_flag = mem->flags & private_mask;
u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
+ /* private flags are mutually exclusive. */
+ if (private_flag & (private_flag - 1))
+ return -EINVAL;
+
if (kvm_arch_has_private_mem(kvm))
- valid_flags |= KVM_MEM_GUEST_MEMFD;
+ valid_flags |= private_flag;
/* Dirty logging private memory is not currently supported. */
- if (mem->flags & KVM_MEM_GUEST_MEMFD)
+ if (private_flag)
valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
/*
@@ -1540,8 +1548,7 @@ static int check_memory_region_flags(struct kvm *kvm,
* read-only memslots have emulated MMIO, not page fault, semantics,
* and KVM doesn't allow emulated MMIO for private memory.
*/
- if (kvm_arch_has_readonly_mem(kvm) &&
- !(mem->flags & KVM_MEM_GUEST_MEMFD))
+ if (kvm_arch_has_readonly_mem(kvm) && !private_flag)
valid_flags |= KVM_MEM_READONLY;
if (mem->flags & ~valid_flags)
@@ -2044,6 +2051,21 @@ int __kvm_set_memory_region(struct kvm *kvm,
r = kvm_gmem_bind(kvm, new, mem->guest_memfd, mem->guest_memfd_offset);
if (r)
goto out;
+ } else if (mem->flags & KVM_MEM_VFIO_DMABUF) {
+ if (mem->guest_memfd_offset) {
+ r = -EINVAL;
+ goto out;
+ }
+
+ /*
+		 * Open: Storing the dmabuf fd parameter in
+		 * kvm_userspace_memory_region2::guest_memfd may be confusing,
+		 * but it avoids introducing another format for
+		 * IOCTL(KVM_SET_USER_MEMORY_REGIONX).
+ */
+ r = kvm_vfio_dmabuf_bind(kvm, new, mem->guest_memfd);
+ if (r)
+ goto out;
}
r = kvm_set_memslot(kvm, old, new, change);
@@ -2055,6 +2077,8 @@ int __kvm_set_memory_region(struct kvm *kvm,
out_unbind:
if (mem->flags & KVM_MEM_GUEST_MEMFD)
kvm_gmem_unbind(new);
+ else if (mem->flags & KVM_MEM_VFIO_DMABUF)
+ kvm_vfio_dmabuf_unbind(new);
out:
kfree(new);
return r;
@@ -93,4 +93,23 @@ static inline void kvm_gmem_unbind(struct kvm_memory_slot *slot)
}
#endif /* CONFIG_KVM_PRIVATE_MEM */
+#ifdef CONFIG_KVM_VFIO_DMABUF
+int kvm_vfio_dmabuf_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
+ unsigned int fd);
+void kvm_vfio_dmabuf_unbind(struct kvm_memory_slot *slot);
+#else
+static inline int kvm_vfio_dmabuf_bind(struct kvm *kvm,
+				       struct kvm_memory_slot *slot,
+				       unsigned int fd)
+{
+ WARN_ON_ONCE(1);
+ return -EIO;
+}
+
+static inline void kvm_vfio_dmabuf_unbind(struct kvm_memory_slot *slot)
+{
+ WARN_ON_ONCE(1);
+}
+#endif /* CONFIG_KVM_VFIO_DMABUF */
+
#endif /* __KVM_MM_H__ */
new file mode 100644
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/dma-buf.h>
+#include <linux/kvm_host.h>
+#include <linux/vfio.h>
+
+#include "kvm_mm.h"
+
+MODULE_IMPORT_NS("DMA_BUF");
+
+struct kvm_vfio_dmabuf {
+ struct kvm *kvm;
+ struct kvm_memory_slot *slot;
+};
+
+static void kv_dmabuf_move_notify(struct dma_buf_attachment *attach)
+{
+ struct kvm_vfio_dmabuf *kv_dmabuf = attach->importer_priv;
+ struct kvm_memory_slot *slot = kv_dmabuf->slot;
+ struct kvm *kvm = kv_dmabuf->kvm;
+ bool flush = false;
+
+ struct kvm_gfn_range gfn_range = {
+ .start = slot->base_gfn,
+ .end = slot->base_gfn + slot->npages,
+ .slot = slot,
+ .may_block = true,
+ .attr_filter = KVM_FILTER_PRIVATE | KVM_FILTER_SHARED,
+ };
+
+ KVM_MMU_LOCK(kvm);
+ kvm_mmu_invalidate_begin(kvm);
+ flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
+ if (flush)
+ kvm_flush_remote_tlbs(kvm);
+
+ kvm_mmu_invalidate_end(kvm);
+ KVM_MMU_UNLOCK(kvm);
+}
+
+static const struct dma_buf_attach_ops kv_dmabuf_attach_ops = {
+ .allow_peer2peer = true,
+ .move_notify = kv_dmabuf_move_notify,
+};
+
+int kvm_vfio_dmabuf_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
+ unsigned int fd)
+{
+ size_t size = slot->npages << PAGE_SHIFT;
+ struct dma_buf_attachment *attach;
+ struct kvm_vfio_dmabuf *kv_dmabuf;
+ struct dma_buf *dmabuf;
+ int ret;
+
+ dmabuf = dma_buf_get(fd);
+ if (IS_ERR(dmabuf))
+ return PTR_ERR(dmabuf);
+
+ if (size != dmabuf->size) {
+ ret = -EINVAL;
+ goto err_dmabuf;
+ }
+
+ kv_dmabuf = kzalloc(sizeof(*kv_dmabuf), GFP_KERNEL);
+ if (!kv_dmabuf) {
+ ret = -ENOMEM;
+ goto err_dmabuf;
+ }
+
+ kv_dmabuf->kvm = kvm;
+ kv_dmabuf->slot = slot;
+ attach = dma_buf_dynamic_attach(dmabuf, NULL, &kv_dmabuf_attach_ops,
+ kv_dmabuf);
+ if (IS_ERR(attach)) {
+ ret = PTR_ERR(attach);
+ goto err_kv_dmabuf;
+ }
+
+ slot->dmabuf_attach = attach;
+
+ return 0;
+
+err_kv_dmabuf:
+ kfree(kv_dmabuf);
+err_dmabuf:
+ dma_buf_put(dmabuf);
+ return ret;
+}
+
+void kvm_vfio_dmabuf_unbind(struct kvm_memory_slot *slot)
+{
+ struct dma_buf_attachment *attach = slot->dmabuf_attach;
+ struct kvm_vfio_dmabuf *kv_dmabuf;
+ struct dma_buf *dmabuf;
+
+ if (WARN_ON_ONCE(!attach))
+ return;
+
+ kv_dmabuf = attach->importer_priv;
+ dmabuf = attach->dmabuf;
+ dma_buf_detach(dmabuf, attach);
+ kfree(kv_dmabuf);
+ dma_buf_put(dmabuf);
+}
+
+/*
+ * The return value matters: if -EFAULT is returned, userspace will try a
+ * page attribute (shared <-> private) conversion.
+ */
+int kvm_vfio_dmabuf_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
+ gfn_t gfn, kvm_pfn_t *pfn, int *max_order)
+{
+ struct dma_buf_attachment *attach = slot->dmabuf_attach;
+ pgoff_t pgoff = gfn - slot->base_gfn;
+ int ret;
+
+ if (WARN_ON_ONCE(!attach))
+ return -EFAULT;
+
+ ret = dma_buf_get_pfn_unlocked(attach, pgoff, pfn, max_order);
+ if (ret)
+ return -EIO;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_vfio_dmabuf_get_pfn);
Extend KVM_SET_USER_MEMORY_REGION2 to support mapping a vfio_dmabuf-backed MMIO region into a guest.

The main purpose of this change is for KVM to map MMIO resources without first mapping them into the host, similar to what is done in guest_memfd. The immediate use case is for CoCo VMs to support private MMIO.

Similar to private guest memory, private MMIO is not intended to be accessed by the host. Host accesses to private MMIO would be rejected by private devices (known as TDIs in the TDISP spec) and cause the TDI to exit the secure state. The further impact on the system may vary with the device implementation. The TDISP spec doesn't mandate any error reporting or logging; the TLP may be handled as an Unsupported Request, or simply dropped. In my test environment, an AER NonFatalErr is reported and there is no further impact. So from the HW perspective, disallowing host access to private MMIO is not that critical, but nice to have.

However, sticking to finding the pfn via the userspace mapping while allowing that pfn to be privately mapped conflicts with the private mapping concept. It effectively allows userspace to map any address as private: before fault-in, KVM cannot tell whether a userspace address refers to private MMIO and is safe for host access. Relying on the userspace mapping also means the private MMIO mapping would have to follow userspace mapping changes via mmu_notifier. This conflicts with the current design that mmu_notifier never impacts private mappings. It also makes no sense to support mmu_notifier just for private MMIO; the private MMIO mapping should be fixed once the CoCo VM accepts the private MMIO, and any subsequent mapping change without guest permission should be invalid. So the choice here is to eliminate the userspace mapping and switch to FD-based MMIO resources.

There is still a need to switch the memory attribute (shared <-> private) for private MMIO when the guest switches the device attribute between shared and private. Unlike memory, an MMIO region has only one physical backend, so this is a bit like in-place conversion, which for private memory requires much effort on invalidating the user mapping when converting to private. But for MMIO, the VMM is expected to never need to access assigned MMIO for feature emulation, so always disallow userspace MMIO mapping and use FD-based MMIO resources for 'private capable' MMIO regions.

dma-buf is chosen as the FD-based backend. It meets KVM's need to acquire non-struct-page memory whose lifetime is still controlled by VFIO. It provides the option to disallow userspace mmap() as long as the exporter doesn't provide the dma_buf_ops.mmap() callback. The concern is that it currently only supports mapping into the device's default_domain via the DMA APIs. There are some clues on extending the dma-buf APIs for subsystems like IOMMUFD [1] or KVM; the addition of dma_buf_get_pfn_unlocked() in this series is for this purpose.

An alternative is for VFIO to provide a dedicated FD for KVM. But considering that IOMMUFD may use dma-buf for MMIO mapping [2], it is better to have a unified export mechanism in VFIO for the same purpose.

Open: The dmabuf fd parameter is currently stored in kvm_userspace_memory_region2::guest_memfd. It may be confusing, but it avoids introducing another API format for IOCTL(KVM_SET_USER_MEMORY_REGION3).
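For illustration only, below is a minimal userspace sketch of registering such a slot and later flipping it to private. It assumes the dma-buf fd has already been exported by VFIO for one MMIO region of the assigned device (that export path is outside this patch), and the map_private_mmio()/make_mmio_private() helper names are made up:

  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /*
   * Register a VFIO-exported dma-buf as a private-capable MMIO slot.
   * 'dmabuf_fd' must cover exactly 'size' bytes; the fd is passed in the
   * guest_memfd field and guest_memfd_offset must be 0.
   */
  static int map_private_mmio(int vm_fd, int dmabuf_fd, __u32 slot,
			      __u64 gpa, __u64 size)
  {
	struct kvm_userspace_memory_region2 region = {
		.slot = slot,
		.flags = KVM_MEM_VFIO_DMABUF,
		.guest_phys_addr = gpa,
		.memory_size = size,
		.userspace_addr = 0,		/* no host mapping is used */
		.guest_memfd = dmabuf_fd,
		.guest_memfd_offset = 0,
	};

	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION2, &region);
  }

  /* Flip the same GPA range to private once the guest accepts the TDI. */
  static int make_mmio_private(int vm_fd, __u64 gpa, __u64 size)
  {
	struct kvm_memory_attributes attrs = {
		.address = gpa,
		.size = size,
		.attributes = KVM_MEMORY_ATTRIBUTE_PRIVATE,
	};

	return ioctl(vm_fd, KVM_SET_MEMORY_ATTRIBUTES, &attrs);
  }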
[1] https://lore.kernel.org/all/YwywgciH6BiWz4H1@nvidia.com/
[2] https://lore.kernel.org/kvm/14-v4-0de2f6c78ed0+9d1-iommufd_jgg@nvidia.com/

Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
---
 Documentation/virt/kvm/api.rst |   7 ++
 include/linux/kvm_host.h       |  18 +++++
 include/uapi/linux/kvm.h       |   1 +
 virt/kvm/Kconfig               |   6 ++
 virt/kvm/Makefile.kvm          |   1 +
 virt/kvm/kvm_main.c            |  32 +++++++--
 virt/kvm/kvm_mm.h              |  19 +++++
 virt/kvm/vfio_dmabuf.c         | 125 +++++++++++++++++++++++++++++++++
 8 files changed, 205 insertions(+), 4 deletions(-)
 create mode 100644 virt/kvm/vfio_dmabuf.c