From patchwork Wed May 29 16:22:54 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: John Levon X-Patchwork-Id: 13679207 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from lists.gnu.org (lists.gnu.org [209.51.188.17]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.lore.kernel.org (Postfix) with ESMTPS id 0A2E3C25B75 for ; Wed, 29 May 2024 16:26:34 +0000 (UTC) Received: from localhost ([::1] helo=lists1p.gnu.org) by lists.gnu.org with esmtp (Exim 4.90_1) (envelope-from ) id 1sCM6u-00018C-G2; Wed, 29 May 2024 12:25:08 -0400 Received: from eggs.gnu.org ([2001:470:142:3::10]) by lists.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM6o-00015b-TJ for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:05 -0400 Received: from ssh.movementarian.org ([139.162.205.133] helo=movementarian.org) by eggs.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM6l-0006KV-6k for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:02 -0400 Received: from movement by movementarian.org with local (Exim 4.95) (envelope-from ) id 1sCM6i-006COY-QZ; Wed, 29 May 2024 17:24:56 +0100 From: John Levon To: qemu-devel@nongnu.org Cc: alex.williamson@redhat.com, clg@redhat.com, jag.raman@oracle.com, thanos.makatos@nutanix.com, John Johnson , Elena Ufimtseva , John Levon Subject: [PATCH 01/26] vfio/container: pass MemoryRegion to DMA operations Date: Wed, 29 May 2024 17:22:54 +0100 Message-Id: <20240529162319.1476680-2-levon@movementarian.org> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20240529162319.1476680-1-levon@movementarian.org> References: <20240529162319.1476680-1-levon@movementarian.org> MIME-Version: 1.0 Received-SPF: pass client-ip=139.162.205.133; envelope-from=movement@movementarian.org; helo=movementarian.org X-Spam_score_int: -18 X-Spam_score: -1.9 X-Spam_bar: - X-Spam_report: (-1.9 / 5.0 requ) BAYES_00=-1.9, SPF_HELO_PASS=-0.001, SPF_PASS=-0.001, T_SCC_BODY_TEXT_LINE=-0.01 autolearn=ham autolearn_force=no X-Spam_action: no action X-BeenThere: qemu-devel@nongnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org Sender: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org From: Jagannathan Raman Pass through the MemoryRegion to DMA operation handlers of vfio containers. The vfio-user container will need this later. Originally-by: John Johnson Signed-off-by: Jagannathan Raman Signed-off-by: Elena Ufimtseva Signed-off-by: John Levon --- hw/vfio/common.c | 17 ++++++++++------- hw/vfio/container-base.c | 5 +++-- hw/vfio/container.c | 3 ++- hw/vfio/iommufd.c | 3 ++- hw/virtio/vhost-vdpa.c | 2 +- include/exec/memory.h | 4 +++- include/hw/vfio/vfio-container-base.h | 4 ++-- system/memory.c | 7 ++++++- 8 files changed, 29 insertions(+), 16 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index f9619a1dfb..8eb2ed50dd 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -254,12 +254,12 @@ static bool vfio_listener_skipped_section(MemoryRegionSection *section) /* Called with rcu_read_lock held. 
*/ static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, ram_addr_t *ram_addr, bool *read_only, - Error **errp) + MemoryRegion **mrp, Error **errp) { bool ret, mr_has_discard_manager; ret = memory_get_xlat_addr(iotlb, vaddr, ram_addr, read_only, - &mr_has_discard_manager, errp); + &mr_has_discard_manager, mrp, errp); if (ret && mr_has_discard_manager) { /* * Malicious VMs might trigger discarding of IOMMU-mapped memory. The @@ -287,6 +287,7 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); VFIOContainerBase *bcontainer = giommu->bcontainer; hwaddr iova = iotlb->iova + giommu->iommu_offset; + MemoryRegion *mrp; void *vaddr; int ret; Error *local_err = NULL; @@ -306,7 +307,8 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) { bool read_only; - if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, &local_err)) { + if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, &mrp, + &local_err)) { error_report_err(local_err); goto out; } @@ -319,7 +321,7 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) */ ret = vfio_container_dma_map(bcontainer, iova, iotlb->addr_mask + 1, vaddr, - read_only); + read_only, mrp); if (ret) { error_report("vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx", %p) = %d (%s)", @@ -384,7 +386,7 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl, vaddr = memory_region_get_ram_ptr(section->mr) + start; ret = vfio_container_dma_map(bcontainer, iova, next - start, - vaddr, section->readonly); + vaddr, section->readonly, section->mr); if (ret) { /* Rollback */ vfio_ram_discard_notify_discard(rdl, section); @@ -686,7 +688,7 @@ static void vfio_listener_region_add(MemoryListener *listener, } ret = vfio_container_dma_map(bcontainer, iova, int128_get64(llsize), - vaddr, section->readonly); + vaddr, section->readonly, section->mr); if (ret) { error_setg(&err, "vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx", %p) = %d (%s)", @@ -1231,7 +1233,8 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) } rcu_read_lock(); - if (!vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL, &local_err)) { + if (!vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL, NULL, + &local_err)) { error_report_err(local_err); goto out_unlock; } diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c index 760d9d0622..f16766f490 100644 --- a/hw/vfio/container-base.c +++ b/hw/vfio/container-base.c @@ -17,10 +17,11 @@ int vfio_container_dma_map(VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, - void *vaddr, bool readonly) + void *vaddr, bool readonly, MemoryRegion *mrp) { g_assert(bcontainer->ops->dma_map); - return bcontainer->ops->dma_map(bcontainer, iova, size, vaddr, readonly); + return bcontainer->ops->dma_map(bcontainer, iova, size, vaddr, + readonly, mrp); } int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 096cc97258..6e6308382f 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -177,7 +177,8 @@ static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer, } static int vfio_legacy_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova, - ram_addr_t size, void *vaddr, bool readonly) + ram_addr_t size, void *vaddr, bool readonly, + MemoryRegion *mrp) { const VFIOContainer *container = container_of(bcontainer, 
VFIOContainer, bcontainer); diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 554f9a6292..3d08a4f2dc 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -27,7 +27,8 @@ #include "pci.h" static int iommufd_cdev_map(const VFIOContainerBase *bcontainer, hwaddr iova, - ram_addr_t size, void *vaddr, bool readonly) + ram_addr_t size, void *vaddr, bool readonly, + MemoryRegion *mrp) { const VFIOIOMMUFDContainer *container = container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index ed99ab8745..9b5fb384dd 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -228,7 +228,7 @@ static void vhost_vdpa_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) { bool read_only; - if (!memory_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, NULL, + if (!memory_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, NULL, NULL, &local_err)) { error_report_err(local_err); return; diff --git a/include/exec/memory.h b/include/exec/memory.h index 9cdd64e9c6..81c6d1488a 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -783,13 +783,15 @@ void ram_discard_manager_unregister_listener(RamDiscardManager *rdm, * @read_only: indicates if writes are allowed * @mr_has_discard_manager: indicates memory is controlled by a * RamDiscardManager + * @mrp: if non-NULL, fill in with MemoryRegion * @errp: pointer to Error*, to store an error if it happens. * * Return: true on success, else false setting @errp with error. */ bool memory_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, ram_addr_t *ram_addr, bool *read_only, - bool *mr_has_discard_manager, Error **errp); + bool *mr_has_discard_manager, MemoryRegion **mrp, + Error **errp); typedef struct CoalescedMemoryRange CoalescedMemoryRange; typedef struct MemoryRegionIoeventfd MemoryRegionIoeventfd; diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index 2776481fc9..129e742643 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -72,7 +72,7 @@ typedef struct VFIORamDiscardListener { int vfio_container_dma_map(VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, - void *vaddr, bool readonly); + void *vaddr, bool readonly, MemoryRegion *mrp); int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, IOMMUTLBEntry *iotlb); @@ -113,7 +113,7 @@ struct VFIOIOMMUClass { bool (*setup)(VFIOContainerBase *bcontainer, Error **errp); int (*dma_map)(const VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, - void *vaddr, bool readonly); + void *vaddr, bool readonly, MemoryRegion *mrp); int (*dma_unmap)(const VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, IOMMUTLBEntry *iotlb); diff --git a/system/memory.c b/system/memory.c index 9540caa8a1..d6574a88c1 100644 --- a/system/memory.c +++ b/system/memory.c @@ -2179,7 +2179,8 @@ void ram_discard_manager_unregister_listener(RamDiscardManager *rdm, /* Called with rcu_read_lock held. 
*/ bool memory_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, ram_addr_t *ram_addr, bool *read_only, - bool *mr_has_discard_manager, Error **errp) + bool *mr_has_discard_manager, MemoryRegion **mrp, + Error **errp) { MemoryRegion *mr; hwaddr xlat; @@ -2244,6 +2245,10 @@ bool memory_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, *read_only = !writable || mr->readonly; } + if (mrp != NULL) { + *mrp = mr; + } + return true; } From patchwork Wed May 29 16:22:55 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: John Levon X-Patchwork-Id: 13679241 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from lists.gnu.org (lists.gnu.org [209.51.188.17]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.lore.kernel.org (Postfix) with ESMTPS id 04B8EC25B7C for ; Wed, 29 May 2024 16:30:13 +0000 (UTC) Received: from localhost ([::1] helo=lists1p.gnu.org) by lists.gnu.org with esmtp (Exim 4.90_1) (envelope-from ) id 1sCM7W-00020q-QM; Wed, 29 May 2024 12:25:46 -0400 Received: from eggs.gnu.org ([2001:470:142:3::10]) by lists.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM77-0001K7-Lu for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:23 -0400 Received: from ssh.movementarian.org ([139.162.205.133] helo=movementarian.org) by eggs.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM76-0006O0-6D for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:21 -0400 Received: from movement by movementarian.org with local (Exim 4.95) (envelope-from ) id 1sCM6i-006COb-Ri; Wed, 29 May 2024 17:24:56 +0100 From: John Levon To: qemu-devel@nongnu.org Cc: alex.williamson@redhat.com, clg@redhat.com, jag.raman@oracle.com, thanos.makatos@nutanix.com, John Levon , John Levon Subject: [PATCH 02/26] vfio/container: pass listener_begin/commit callbacks Date: Wed, 29 May 2024 17:22:55 +0100 Message-Id: <20240529162319.1476680-3-levon@movementarian.org> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20240529162319.1476680-1-levon@movementarian.org> References: <20240529162319.1476680-1-levon@movementarian.org> MIME-Version: 1.0 Received-SPF: pass client-ip=139.162.205.133; envelope-from=movement@movementarian.org; helo=movementarian.org X-Spam_score_int: -18 X-Spam_score: -1.9 X-Spam_bar: - X-Spam_report: (-1.9 / 5.0 requ) BAYES_00=-1.9, SPF_HELO_PASS=-0.001, SPF_PASS=-0.001, T_SCC_BODY_TEXT_LINE=-0.01 autolearn=ham autolearn_force=no X-Spam_action: no action X-BeenThere: qemu-devel@nongnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org Sender: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org The vfio-user container will later need to hook into these callbacks; set up vfio to use them, and optionally pass them through to the container. 
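(For illustration only: a minimal sketch of how a container backend could implement these optional callbacks to batch the DMA maps/unmaps generated within a single memory transaction. The VFIOUserContainer type, its "batching" field and vfio_user_flush_dma_queue() are assumed names for a future vfio-user backend, not code in this series.)

static void vfio_user_listener_begin(VFIOContainerBase *bcontainer)
{
    VFIOUserContainer *container = container_of(bcontainer,
                                                VFIOUserContainer, bcontainer);

    /* Defer sending individual DMA map/unmap messages until commit. */
    container->batching = true;
}

static void vfio_user_listener_commit(VFIOContainerBase *bcontainer)
{
    VFIOUserContainer *container = container_of(bcontainer,
                                                VFIOUserContainer, bcontainer);

    /* Flush everything queued up since listener_begin. */
    vfio_user_flush_dma_queue(container);
    container->batching = false;
}

A backend opts in simply by filling the new VFIOIOMMUClass members, e.g. .listener_begin = vfio_user_listener_begin; backends that leave them NULL are unaffected, since the wrappers below only call them when set.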
Signed-off-by: John Levon --- hw/vfio/common.c | 22 ++++++++++++++++++++++ include/hw/vfio/vfio-container-base.h | 2 ++ 2 files changed, 24 insertions(+) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 8eb2ed50dd..a18af7a94f 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -563,6 +563,26 @@ static bool vfio_get_section_iova_range(VFIOContainerBase *bcontainer, return true; } +static void vfio_listener_begin(MemoryListener *listener) +{ + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, + listener); + + if (bcontainer->ops->listener_begin) { + (bcontainer->ops->listener_begin)(bcontainer); + } +} + +static void vfio_listener_commit(MemoryListener *listener) +{ + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, + listener); + + if (bcontainer->ops->listener_commit) { + (bcontainer->ops->listener_commit)(bcontainer); + } +} + static void vfio_listener_region_add(MemoryListener *listener, MemoryRegionSection *section) { @@ -1382,6 +1402,8 @@ static void vfio_listener_log_sync(MemoryListener *listener, const MemoryListener vfio_memory_listener = { .name = "vfio", + .begin = vfio_listener_begin, + .commit = vfio_listener_commit, .region_add = vfio_listener_region_add, .region_del = vfio_listener_region_del, .log_global_start = vfio_listener_log_global_start, diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index 129e742643..e9b59c179e 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -111,6 +111,8 @@ struct VFIOIOMMUClass { /* basic feature */ bool (*setup)(VFIOContainerBase *bcontainer, Error **errp); + void (*listener_begin)(VFIOContainerBase *bcontainer); + void (*listener_commit)(VFIOContainerBase *bcontainer); int (*dma_map)(const VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, void *vaddr, bool readonly, MemoryRegion *mrp); From patchwork Wed May 29 16:22:56 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: John Levon X-Patchwork-Id: 13679202 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from lists.gnu.org (lists.gnu.org [209.51.188.17]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.lore.kernel.org (Postfix) with ESMTPS id 70889C41513 for ; Wed, 29 May 2024 16:25:57 +0000 (UTC) Received: from localhost ([::1] helo=lists1p.gnu.org) by lists.gnu.org with esmtp (Exim 4.90_1) (envelope-from ) id 1sCM72-0001HG-Kh; Wed, 29 May 2024 12:25:16 -0400 Received: from eggs.gnu.org ([2001:470:142:3::10]) by lists.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM6x-00019t-QR for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:11 -0400 Received: from ssh.movementarian.org ([139.162.205.133] helo=movementarian.org) by eggs.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM6v-0006Ki-C9 for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:11 -0400 Received: from movement by movementarian.org with local (Exim 4.95) (envelope-from ) id 1sCM6i-006COe-St; Wed, 29 May 2024 17:24:56 +0100 From: John Levon To: qemu-devel@nongnu.org Cc: alex.williamson@redhat.com, clg@redhat.com, jag.raman@oracle.com, thanos.makatos@nutanix.com, John Levon , John Johnson , Elena Ufimtseva Subject: [PATCH 03/26] vfio/container: support VFIO_DMA_UNMAP_FLAG_ALL 
Date: Wed, 29 May 2024 17:22:56 +0100 Message-Id: <20240529162319.1476680-4-levon@movementarian.org> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20240529162319.1476680-1-levon@movementarian.org> References: <20240529162319.1476680-1-levon@movementarian.org> MIME-Version: 1.0 Received-SPF: pass client-ip=139.162.205.133; envelope-from=movement@movementarian.org; helo=movementarian.org X-Spam_score_int: -18 X-Spam_score: -1.9 X-Spam_bar: - X-Spam_report: (-1.9 / 5.0 requ) BAYES_00=-1.9, SPF_HELO_PASS=-0.001, SPF_PASS=-0.001, T_SCC_BODY_TEXT_LINE=-0.01 autolearn=ham autolearn_force=no X-Spam_action: no action X-BeenThere: qemu-devel@nongnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org Sender: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org From: John Levon Some containers can directly implement unmapping all regions; add a new flag to support this. Originally-by: John Johnson Signed-off-by: Elena Ufimtseva Signed-off-by: Jagannathan Raman Signed-off-by: John Levon --- hw/vfio/common.c | 24 +++++++---------- hw/vfio/container-base.c | 4 +-- hw/vfio/container.c | 38 +++++++++++++++++++++++++-- hw/vfio/iommufd.c | 19 +++++++++++++- include/hw/vfio/vfio-common.h | 1 + include/hw/vfio/vfio-container-base.h | 4 +-- 6 files changed, 68 insertions(+), 22 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index a18af7a94f..ac89bc4b02 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -330,7 +330,7 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) } } else { ret = vfio_container_dma_unmap(bcontainer, iova, - iotlb->addr_mask + 1, iotlb); + iotlb->addr_mask + 1, iotlb, 0); if (ret) { error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx") = %d (%s)", @@ -354,7 +354,7 @@ static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl, int ret; /* Unmap with a single call. */ - ret = vfio_container_dma_unmap(bcontainer, iova, size , NULL); + ret = vfio_container_dma_unmap(bcontainer, iova, size, NULL, 0); if (ret) { error_report("%s: vfio_container_dma_unmap() failed: %s", __func__, strerror(-ret)); @@ -806,21 +806,15 @@ static void vfio_listener_region_del(MemoryListener *listener, } if (try_unmap) { + int flags = 0; + if (int128_eq(llsize, int128_2_64())) { - /* The unmap ioctl doesn't accept a full 64-bit span. 
*/ - llsize = int128_rshift(llsize, 1); - ret = vfio_container_dma_unmap(bcontainer, iova, - int128_get64(llsize), NULL); - if (ret) { - error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " - "0x%"HWADDR_PRIx") = %d (%s)", - bcontainer, iova, int128_get64(llsize), ret, - strerror(-ret)); - } - iova += int128_get64(llsize); + flags = VFIO_DMA_UNMAP_FLAG_ALL; } - ret = vfio_container_dma_unmap(bcontainer, iova, - int128_get64(llsize), NULL); + + ret = vfio_container_dma_unmap(bcontainer, iova, int128_get64(llsize), + NULL, flags); + if (ret) { error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx") = %d (%s)", diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c index f16766f490..3b2d3f38d5 100644 --- a/hw/vfio/container-base.c +++ b/hw/vfio/container-base.c @@ -26,10 +26,10 @@ int vfio_container_dma_map(VFIOContainerBase *bcontainer, int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, - IOMMUTLBEntry *iotlb) + IOMMUTLBEntry *iotlb, int flags) { g_assert(bcontainer->ops->dma_unmap); - return bcontainer->ops->dma_unmap(bcontainer, iova, size, iotlb); + return bcontainer->ops->dma_unmap(bcontainer, iova, size, iotlb, flags); } bool vfio_container_add_section_window(VFIOContainerBase *bcontainer, diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 6e6308382f..27ddb894e9 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -118,7 +118,7 @@ unmap_exit: */ static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, - IOMMUTLBEntry *iotlb) + IOMMUTLBEntry *iotlb, int flags) { const VFIOContainer *container = container_of(bcontainer, VFIOContainer, bcontainer); @@ -141,6 +141,34 @@ static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer, need_dirty_sync = true; } + /* use unmap all if supported */ + if (flags & VFIO_DMA_UNMAP_FLAG_ALL) { + unmap.iova = 0; + unmap.size = 0; + if (container->unmap_all_supported) { + ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap); + } else { + /* unmap in halves */ + Int128 llsize = int128_rshift(int128_2_64(), 1); + + unmap.size = int128_get64(llsize); + + ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap); + + if (ret == 0) { + unmap.iova += int128_get64(llsize); + + ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap); + } + } + + if (ret != 0) { + return -errno; + } + + goto out; + } + while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) { /* * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c @@ -164,6 +192,7 @@ static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer, return -errno; } +out: if (need_dirty_sync) { ret = vfio_get_dirty_bitmap(bcontainer, iova, size, iotlb->translated_addr, &local_err); @@ -201,7 +230,7 @@ static int vfio_legacy_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova, */ if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 || (errno == EBUSY && - vfio_legacy_dma_unmap(bcontainer, iova, size, NULL) == 0 && + vfio_legacy_dma_unmap(bcontainer, iova, size, NULL, 0) == 0 && ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) { return 0; } @@ -533,6 +562,11 @@ static bool vfio_legacy_setup(VFIOContainerBase *bcontainer, Error **errp) vfio_get_info_iova_range(info, bcontainer); vfio_get_iommu_info_migration(container, info); + + ret = ioctl(container->fd, VFIO_CHECK_EXTENSION, VFIO_UNMAP_ALL); + + container->unmap_all_supported = (ret != 0); + return true; } diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 
3d08a4f2dc..7bcadacd0b 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -40,11 +40,28 @@ static int iommufd_cdev_map(const VFIOContainerBase *bcontainer, hwaddr iova, static int iommufd_cdev_unmap(const VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, - IOMMUTLBEntry *iotlb) + IOMMUTLBEntry *iotlb, int flags) { const VFIOIOMMUFDContainer *container = container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); + /* unmap in halves */ + if (flags & VFIO_DMA_UNMAP_FLAG_ALL) { + Int128 llsize = int128_rshift(int128_2_64(), 1); + int ret; + + ret = iommufd_backend_unmap_dma(container->be, container->ioas_id, + iova, int128_get64(llsize)); + iova += int128_get64(llsize); + + if (ret == 0) { + ret = iommufd_backend_unmap_dma(container->be, container->ioas_id, + iova, int128_get64(llsize)); + } + + return ret; + } + /* TODO: Handle dma_unmap_bitmap with iotlb args (migration) */ return iommufd_backend_unmap_dma(container->be, container->ioas_id, iova, size); diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 4cb1ab8645..d112c9b72f 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -79,6 +79,7 @@ typedef struct VFIOContainer { VFIOContainerBase bcontainer; int fd; /* /dev/vfio/vfio, empowered by the attached groups */ unsigned iommu_type; + bool unmap_all_supported; QLIST_HEAD(, VFIOGroup) group_list; } VFIOContainer; diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index e9b59c179e..c2c1af70ac 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -75,7 +75,7 @@ int vfio_container_dma_map(VFIOContainerBase *bcontainer, void *vaddr, bool readonly, MemoryRegion *mrp); int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, - IOMMUTLBEntry *iotlb); + IOMMUTLBEntry *iotlb, int flags); bool vfio_container_add_section_window(VFIOContainerBase *bcontainer, MemoryRegionSection *section, Error **errp); @@ -118,7 +118,7 @@ struct VFIOIOMMUClass { void *vaddr, bool readonly, MemoryRegion *mrp); int (*dma_unmap)(const VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, - IOMMUTLBEntry *iotlb); + IOMMUTLBEntry *iotlb, int flags); bool (*attach_device)(const char *name, VFIODevice *vbasedev, AddressSpace *as, Error **errp); void (*detach_device)(VFIODevice *vbasedev); From patchwork Wed May 29 16:22:57 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: John Levon X-Patchwork-Id: 13679201 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from lists.gnu.org (lists.gnu.org [209.51.188.17]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.lore.kernel.org (Postfix) with ESMTPS id 50247C25B75 for ; Wed, 29 May 2024 16:25:57 +0000 (UTC) Received: from localhost ([::1] helo=lists1p.gnu.org) by lists.gnu.org with esmtp (Exim 4.90_1) (envelope-from ) id 1sCM6y-0001AS-6J; Wed, 29 May 2024 12:25:12 -0400 Received: from eggs.gnu.org ([2001:470:142:3::10]) by lists.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM6o-00015X-F3 for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:05 -0400 Received: from ssh.movementarian.org ([139.162.205.133] helo=movementarian.org) by eggs.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) 
id 1sCM6l-0006KW-9I for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:02 -0400 Received: from movement by movementarian.org with local (Exim 4.95) (envelope-from ) id 1sCM6i-006COh-U8; Wed, 29 May 2024 17:24:56 +0100 From: John Levon To: qemu-devel@nongnu.org Cc: alex.williamson@redhat.com, clg@redhat.com, jag.raman@oracle.com, thanos.makatos@nutanix.com, John Levon Subject: [PATCH 04/26] vfio: add vfio_attach_device_by_iommu_type() Date: Wed, 29 May 2024 17:22:57 +0100 Message-Id: <20240529162319.1476680-5-levon@movementarian.org> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20240529162319.1476680-1-levon@movementarian.org> References: <20240529162319.1476680-1-levon@movementarian.org> MIME-Version: 1.0 Received-SPF: pass client-ip=139.162.205.133; envelope-from=movement@movementarian.org; helo=movementarian.org X-Spam_score_int: -18 X-Spam_score: -1.9 X-Spam_bar: - X-Spam_report: (-1.9 / 5.0 requ) BAYES_00=-1.9, SPF_HELO_PASS=-0.001, SPF_PASS=-0.001, T_SCC_BODY_TEXT_LINE=-0.01 autolearn=ham autolearn_force=no X-Spam_action: no action X-BeenThere: qemu-devel@nongnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org Sender: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org From: John Levon Allow attachment by explicitly passing a TYPE_VFIO_IOMMU_* string; vfio-user will use this later. Signed-off-by: John Levon --- hw/vfio/common.c | 25 ++++++++++++++++++------- include/hw/vfio/vfio-common.h | 3 +++ 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index ac89bc4b02..8c3b6ad75a 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -1542,21 +1542,32 @@ retry: return info; } -bool vfio_attach_device(char *name, VFIODevice *vbasedev, - AddressSpace *as, Error **errp) +bool vfio_attach_device_by_iommu_type(const char *iommu_type, char *name, + VFIODevice *vbasedev, AddressSpace *as, + Error **errp) { - const VFIOIOMMUClass *ops = - VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY)); + const VFIOIOMMUClass *ops; - if (vbasedev->iommufd) { - ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD)); - } + ops = VFIO_IOMMU_CLASS(object_class_by_name(iommu_type)); assert(ops); return ops->attach_device(name, vbasedev, as, errp); } +bool vfio_attach_device(char *name, VFIODevice *vbasedev, + AddressSpace *as, Error **errp) +{ + const char *iommu_type = TYPE_VFIO_IOMMU_LEGACY; + + if (vbasedev->iommufd) { + iommu_type = TYPE_VFIO_IOMMU_IOMMUFD; + } + + return vfio_attach_device_by_iommu_type(iommu_type, name, vbasedev, + as, errp); +} + void vfio_detach_device(VFIODevice *vbasedev) { if (!vbasedev->bcontainer) { diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index d112c9b72f..8e13d30d2c 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -225,6 +225,9 @@ void vfio_reset_handler(void *opaque); struct vfio_device_info *vfio_get_device_info(int fd); bool vfio_attach_device(char *name, VFIODevice *vbasedev, AddressSpace *as, Error **errp); +bool vfio_attach_device_by_iommu_type(const char *iommu_type, char *name, + VFIODevice *vbasedev, AddressSpace *as, + Error **errp); void vfio_detach_device(VFIODevice *vbasedev); int vfio_kvm_device_add_fd(int fd, Error **errp); From patchwork Wed May 29 16:22:58 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit 
X-Patchwork-Submitter: John Levon X-Patchwork-Id: 13679240 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from lists.gnu.org (lists.gnu.org [209.51.188.17]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.lore.kernel.org (Postfix) with ESMTPS id 1A209C27C44 for ; Wed, 29 May 2024 16:29:28 +0000 (UTC) Received: from localhost ([::1] helo=lists1p.gnu.org) by lists.gnu.org with esmtp (Exim 4.90_1) (envelope-from ) id 1sCM7V-0001wn-4V; Wed, 29 May 2024 12:25:45 -0400 Received: from eggs.gnu.org ([2001:470:142:3::10]) by lists.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM6y-0001BQ-IV for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:14 -0400 Received: from ssh.movementarian.org ([139.162.205.133] helo=movementarian.org) by eggs.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM6v-0006KZ-D3 for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:12 -0400 Received: from movement by movementarian.org with local (Exim 4.95) (envelope-from ) id 1sCM6i-006COk-V7; Wed, 29 May 2024 17:24:56 +0100 From: John Levon To: qemu-devel@nongnu.org Cc: alex.williamson@redhat.com, clg@redhat.com, jag.raman@oracle.com, thanos.makatos@nutanix.com, John Levon Subject: [PATCH 05/26] vfio: add vfio_prepare_device() Date: Wed, 29 May 2024 17:22:58 +0100 Message-Id: <20240529162319.1476680-6-levon@movementarian.org> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20240529162319.1476680-1-levon@movementarian.org> References: <20240529162319.1476680-1-levon@movementarian.org> MIME-Version: 1.0 Received-SPF: pass client-ip=139.162.205.133; envelope-from=movement@movementarian.org; helo=movementarian.org X-Spam_score_int: -18 X-Spam_score: -1.9 X-Spam_bar: - X-Spam_report: (-1.9 / 5.0 requ) BAYES_00=-1.9, SPF_HELO_PASS=-0.001, SPF_PASS=-0.001, T_SCC_BODY_TEXT_LINE=-0.01 autolearn=ham autolearn_force=no X-Spam_action: no action X-BeenThere: qemu-devel@nongnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org Sender: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org From: John Levon Commonize some initialization code shared by the legacy and iommufd vfio implementations (and later by vfio-user). 
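(Sketch only: how a group-less backend, such as the planned vfio-user container, might call the new helper. vfio_user_get_device_info() and vfio_user_get_container() are placeholder names, not functions introduced by this series; the point is that passing group == NULL skips the group list insertion while still performing the rest of the common setup.)

static bool vfio_user_attach_device(const char *name, VFIODevice *vbasedev,
                                    AddressSpace *as, Error **errp)
{
    struct vfio_device_info info = { .argsz = sizeof(info) };
    VFIOContainerBase *bcontainer;

    /* Placeholder: query the device over the vfio-user socket. */
    if (!vfio_user_get_device_info(vbasedev, &info, errp)) {
        return false;
    }

    /* Placeholder: find or create the container for this AddressSpace. */
    bcontainer = vfio_user_get_container(vbasedev, as, errp);
    if (!bcontainer) {
        return false;
    }

    /* No kernel VFIO group exists for a socket-backed device. */
    vfio_prepare_device(vbasedev, bcontainer, NULL, &info);

    return true;
}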
Signed-off-by: John Levon --- hw/vfio/common.c | 19 +++++++++++++++++++ hw/vfio/container.c | 14 +------------- hw/vfio/iommufd.c | 9 +-------- include/hw/vfio/vfio-common.h | 2 ++ 4 files changed, 23 insertions(+), 21 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 8c3b6ad75a..81f4c88f2d 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -1542,6 +1542,25 @@ retry: return info; } +void vfio_prepare_device(VFIODevice *vbasedev, VFIOContainerBase *bcontainer, + VFIOGroup *group, struct vfio_device_info *info) +{ + vbasedev->group = group; + + vbasedev->num_irqs = info->num_irqs; + vbasedev->num_regions = info->num_regions; + vbasedev->flags = info->flags; + vbasedev->reset_works = !!(info->flags & VFIO_DEVICE_FLAGS_RESET); + + vbasedev->bcontainer = bcontainer; + QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next); + if (group) { + QLIST_INSERT_HEAD(&group->device_list, vbasedev, next); + } + + QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next); +} + bool vfio_attach_device_by_iommu_type(const char *iommu_type, char *name, VFIODevice *vbasedev, AddressSpace *as, Error **errp) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 27ddb894e9..1bbcbaa874 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -881,17 +881,11 @@ static bool vfio_get_device(VFIOGroup *group, const char *name, } vbasedev->fd = fd; - vbasedev->group = group; - QLIST_INSERT_HEAD(&group->device_list, vbasedev, next); - vbasedev->num_irqs = info->num_irqs; - vbasedev->num_regions = info->num_regions; - vbasedev->flags = info->flags; + vfio_prepare_device(vbasedev, &group->container->bcontainer, group, info); trace_vfio_get_device(name, info->flags, info->num_regions, info->num_irqs); - vbasedev->reset_works = !!(info->flags & VFIO_DEVICE_FLAGS_RESET); - return true; } @@ -944,7 +938,6 @@ static bool vfio_legacy_attach_device(const char *name, VFIODevice *vbasedev, int groupid = vfio_device_groupid(vbasedev, errp); VFIODevice *vbasedev_iter; VFIOGroup *group; - VFIOContainerBase *bcontainer; if (groupid < 0) { return false; @@ -969,11 +962,6 @@ static bool vfio_legacy_attach_device(const char *name, VFIODevice *vbasedev, return false; } - bcontainer = &group->container->bcontainer; - vbasedev->bcontainer = bcontainer; - QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next); - QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next); - return true; } diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 7bcadacd0b..dad83e45d7 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -425,14 +425,7 @@ found_container: iommufd_cdev_ram_block_discard_disable(false); } - vbasedev->group = 0; - vbasedev->num_irqs = dev_info.num_irqs; - vbasedev->num_regions = dev_info.num_regions; - vbasedev->flags = dev_info.flags; - vbasedev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET); - vbasedev->bcontainer = bcontainer; - QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next); - QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next); + vfio_prepare_device(vbasedev, bcontainer, NULL, &dev_info); trace_iommufd_cdev_device_info(vbasedev->name, devfd, vbasedev->num_irqs, vbasedev->num_regions, vbasedev->flags); diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 8e13d30d2c..ee022c9cbd 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -223,6 +223,8 @@ void vfio_region_exit(VFIORegion *region); void vfio_region_finalize(VFIORegion *region); void 
vfio_reset_handler(void *opaque); struct vfio_device_info *vfio_get_device_info(int fd); +void vfio_prepare_device(VFIODevice *vbasedev, VFIOContainerBase *bcontainer, + VFIOGroup *group, struct vfio_device_info *info); bool vfio_attach_device(char *name, VFIODevice *vbasedev, AddressSpace *as, Error **errp); bool vfio_attach_device_by_iommu_type(const char *iommu_type, char *name, From patchwork Wed May 29 16:22:59 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: John Levon X-Patchwork-Id: 13679265 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from lists.gnu.org (lists.gnu.org [209.51.188.17]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.lore.kernel.org (Postfix) with ESMTPS id 229BEC27C44 for ; Wed, 29 May 2024 16:32:54 +0000 (UTC) Received: from localhost ([::1] helo=lists1p.gnu.org) by lists.gnu.org with esmtp (Exim 4.90_1) (envelope-from ) id 1sCM6r-00015d-F2; Wed, 29 May 2024 12:25:05 -0400 Received: from eggs.gnu.org ([2001:470:142:3::10]) by lists.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM6n-00013e-1N for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:01 -0400 Received: from ssh.movementarian.org ([139.162.205.133] helo=movementarian.org) by eggs.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM6l-0006KY-45 for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:00 -0400 Received: from movement by movementarian.org with local (Exim 4.95) (envelope-from ) id 1sCM6i-006COn-W9; Wed, 29 May 2024 17:24:56 +0100 From: John Levon To: qemu-devel@nongnu.org Cc: alex.williamson@redhat.com, clg@redhat.com, jag.raman@oracle.com, thanos.makatos@nutanix.com, John Johnson , Elena Ufimtseva , John Levon Subject: [PATCH 06/26] vfio: add region cache Date: Wed, 29 May 2024 17:22:59 +0100 Message-Id: <20240529162319.1476680-7-levon@movementarian.org> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20240529162319.1476680-1-levon@movementarian.org> References: <20240529162319.1476680-1-levon@movementarian.org> MIME-Version: 1.0 Received-SPF: pass client-ip=139.162.205.133; envelope-from=movement@movementarian.org; helo=movementarian.org X-Spam_score_int: -18 X-Spam_score: -1.9 X-Spam_bar: - X-Spam_report: (-1.9 / 5.0 requ) BAYES_00=-1.9, SPF_HELO_PASS=-0.001, SPF_PASS=-0.001, T_SCC_BODY_TEXT_LINE=-0.01 autolearn=ham autolearn_force=no X-Spam_action: no action X-BeenThere: qemu-devel@nongnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org Sender: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org From: Jagannathan Raman cache VFIO_DEVICE_GET_REGION_INFO results to reduce memory alloc/free cycles and as prep work for vfio-user Originally-by: John Johnson Signed-off-by: Elena Ufimtseva Signed-off-by: Jagannathan Raman Signed-off-by: John Levon --- hw/vfio/ccw.c | 5 ----- hw/vfio/common.c | 12 ++++++++++++ hw/vfio/container.c | 10 ++++++++++ hw/vfio/helpers.c | 21 ++++++++++++++++----- hw/vfio/igd.c | 8 ++++---- hw/vfio/pci.c | 8 ++++---- include/hw/vfio/vfio-common.h | 1 + 7 files changed, 47 insertions(+), 18 deletions(-) diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c index 2600e62e37..315449c1b1 100644 --- a/hw/vfio/ccw.c +++ 
b/hw/vfio/ccw.c @@ -510,7 +510,6 @@ static bool vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp) vcdev->io_region_offset = info->offset; vcdev->io_region = g_malloc0(info->size); - g_free(info); /* check for the optional async command region */ ret = vfio_get_dev_region_info(vdev, VFIO_REGION_TYPE_CCW, @@ -523,7 +522,6 @@ static bool vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp) } vcdev->async_cmd_region_offset = info->offset; vcdev->async_cmd_region = g_malloc0(info->size); - g_free(info); } ret = vfio_get_dev_region_info(vdev, VFIO_REGION_TYPE_CCW, @@ -536,7 +534,6 @@ static bool vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp) } vcdev->schib_region_offset = info->offset; vcdev->schib_region = g_malloc(info->size); - g_free(info); } ret = vfio_get_dev_region_info(vdev, VFIO_REGION_TYPE_CCW, @@ -550,7 +547,6 @@ static bool vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp) } vcdev->crw_region_offset = info->offset; vcdev->crw_region = g_malloc(info->size); - g_free(info); } return true; @@ -560,7 +556,6 @@ out_err: g_free(vcdev->schib_region); g_free(vcdev->async_cmd_region); g_free(vcdev->io_region); - g_free(info); return false; } diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 81f4c88f2d..1a71c3be05 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -1542,6 +1542,16 @@ retry: return info; } +static void vfio_get_all_regions(VFIODevice *vbasedev) +{ + struct vfio_region_info *info; + int i; + + for (i = 0; i < vbasedev->num_regions; i++) { + vfio_get_region_info(vbasedev, i, &info); + } +} + void vfio_prepare_device(VFIODevice *vbasedev, VFIOContainerBase *bcontainer, VFIOGroup *group, struct vfio_device_info *info) { @@ -1559,6 +1569,8 @@ void vfio_prepare_device(VFIODevice *vbasedev, VFIOContainerBase *bcontainer, } QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next); + + vfio_get_all_regions(vbasedev); } bool vfio_attach_device_by_iommu_type(const char *iommu_type, char *name, diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 1bbcbaa874..6c70f95f04 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -891,6 +891,16 @@ static bool vfio_get_device(VFIOGroup *group, const char *name, static void vfio_put_base_device(VFIODevice *vbasedev) { + if (vbasedev->regions != NULL) { + int i; + + for (i = 0; i < vbasedev->num_regions; i++) { + g_free(vbasedev->regions[i]); + } + g_free(vbasedev->regions); + vbasedev->regions = NULL; + } + if (!vbasedev->group) { return; } diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c index 27ea26aa48..cdc7eb5bd5 100644 --- a/hw/vfio/helpers.c +++ b/hw/vfio/helpers.c @@ -343,7 +343,7 @@ static int vfio_setup_region_sparse_mmaps(VFIORegion *region, int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region, int index, const char *name) { - g_autofree struct vfio_region_info *info = NULL; + struct vfio_region_info *info = NULL; int ret; ret = vfio_get_region_info(vbasedev, index, &info); @@ -533,6 +533,17 @@ int vfio_get_region_info(VFIODevice *vbasedev, int index, { size_t argsz = sizeof(struct vfio_region_info); + /* create region cache */ + if (vbasedev->regions == NULL) { + vbasedev->regions = g_new0(struct vfio_region_info *, + vbasedev->num_regions); + } + /* check cache */ + if (vbasedev->regions[index] != NULL) { + *info = vbasedev->regions[index]; + return 0; + } + *info = g_malloc0(argsz); (*info)->index = index; @@ -552,6 +563,9 @@ retry: goto retry; } + /* fill cache */ + vbasedev->regions[index] = *info; + return 0; } @@ -570,7 +584,6 @@ int 
vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type, hdr = vfio_get_region_info_cap(*info, VFIO_REGION_INFO_CAP_TYPE); if (!hdr) { - g_free(*info); continue; } @@ -582,8 +595,6 @@ int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type, if (cap_type->type == type && cap_type->subtype == subtype) { return 0; } - - g_free(*info); } *info = NULL; @@ -592,7 +603,7 @@ int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type, bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type) { - g_autofree struct vfio_region_info *info = NULL; + struct vfio_region_info *info = NULL; bool ret = false; if (!vfio_get_region_info(vbasedev, region, &info)) { diff --git a/hw/vfio/igd.c b/hw/vfio/igd.c index d320d032a7..34d8a41dd3 100644 --- a/hw/vfio/igd.c +++ b/hw/vfio/igd.c @@ -367,10 +367,10 @@ static const MemoryRegionOps vfio_igd_index_quirk = { void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr) { - g_autofree struct vfio_region_info *rom = NULL; - g_autofree struct vfio_region_info *opregion = NULL; - g_autofree struct vfio_region_info *host = NULL; - g_autofree struct vfio_region_info *lpc = NULL; + struct vfio_region_info *rom = NULL; + struct vfio_region_info *opregion = NULL; + struct vfio_region_info *host = NULL; + struct vfio_region_info *lpc = NULL; VFIOQuirk *quirk; VFIOIGDQuirk *igd; PCIDevice *lpc_bridge; diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 74a79bdf61..470fb3502a 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -879,7 +879,7 @@ static void vfio_update_msi(VFIOPCIDevice *vdev) static void vfio_pci_load_rom(VFIOPCIDevice *vdev) { - g_autofree struct vfio_region_info *reg_info = NULL; + struct vfio_region_info *reg_info = NULL; uint64_t size; off_t off = 0; ssize_t bytes; @@ -2666,7 +2666,7 @@ static VFIODeviceOps vfio_pci_ops = { bool vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp) { VFIODevice *vbasedev = &vdev->vbasedev; - g_autofree struct vfio_region_info *reg_info = NULL; + struct vfio_region_info *reg_info = NULL; int ret; ret = vfio_get_region_info(vbasedev, VFIO_PCI_VGA_REGION_INDEX, ®_info); @@ -2731,7 +2731,7 @@ bool vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp) static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) { VFIODevice *vbasedev = &vdev->vbasedev; - g_autofree struct vfio_region_info *reg_info = NULL; + struct vfio_region_info *reg_info = NULL; struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) }; int i, ret = -1; @@ -3135,7 +3135,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) if (!vdev->igd_opregion && vdev->features & VFIO_FEATURE_ENABLE_IGD_OPREGION) { - g_autofree struct vfio_region_info *opregion = NULL; + struct vfio_region_info *opregion = NULL; if (vdev->pdev.qdev.hotplugged) { error_setg(errp, diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index ee022c9cbd..2428c7d80c 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -128,6 +128,7 @@ typedef struct VFIODevice { bool dirty_tracking; int devid; IOMMUFDBackend *iommufd; + struct vfio_region_info **regions; } VFIODevice; struct VFIODeviceOps { From patchwork Wed May 29 16:23:00 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: John Levon X-Patchwork-Id: 13679206 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from lists.gnu.org (lists.gnu.org [209.51.188.17]) (using TLSv1.2 with cipher 
ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.lore.kernel.org (Postfix) with ESMTPS id C8548C25B75 for ; Wed, 29 May 2024 16:26:24 +0000 (UTC) Received: from localhost ([::1] helo=lists1p.gnu.org) by lists.gnu.org with esmtp (Exim 4.90_1) (envelope-from ) id 1sCM7R-0001VF-4m; Wed, 29 May 2024 12:25:41 -0400 Received: from eggs.gnu.org ([2001:470:142:3::10]) by lists.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM6y-0001AY-6k for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:12 -0400 Received: from ssh.movementarian.org ([139.162.205.133] helo=movementarian.org) by eggs.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM6v-0006Kh-C8 for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:11 -0400 Received: from movement by movementarian.org with local (Exim 4.95) (envelope-from ) id 1sCM6j-006COq-1A; Wed, 29 May 2024 17:24:57 +0100 From: John Levon To: qemu-devel@nongnu.org Cc: alex.williamson@redhat.com, clg@redhat.com, jag.raman@oracle.com, thanos.makatos@nutanix.com, John Johnson , Elena Ufimtseva , John Levon Subject: [PATCH 07/26] vfio: add VFIO base abstract class Date: Wed, 29 May 2024 17:23:00 +0100 Message-Id: <20240529162319.1476680-8-levon@movementarian.org> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20240529162319.1476680-1-levon@movementarian.org> References: <20240529162319.1476680-1-levon@movementarian.org> MIME-Version: 1.0 Received-SPF: pass client-ip=139.162.205.133; envelope-from=movement@movementarian.org; helo=movementarian.org X-Spam_score_int: -18 X-Spam_score: -1.9 X-Spam_bar: - X-Spam_report: (-1.9 / 5.0 requ) BAYES_00=-1.9, SPF_HELO_PASS=-0.001, SPF_PASS=-0.001, T_SCC_BODY_TEXT_LINE=-0.01 autolearn=ham autolearn_force=no X-Spam_action: no action X-BeenThere: qemu-devel@nongnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org Sender: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org From: Jagannathan Raman Add an abstract base class both the kernel driver and user socket implementations can use to share code. 
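(Illustrative sketch, not part of this patch: how a socket-based implementation could later derive from the new abstract base type. TYPE_VFIO_USER_PCI, VFIOUserPCIDevice and vfio_user_pci_realize() are assumed names. The base class carries the shared config-space accessors and exit path, so a subclass mainly supplies its own realize and transport-specific properties.)

#define TYPE_VFIO_USER_PCI "vfio-user-pci"
OBJECT_DECLARE_SIMPLE_TYPE(VFIOUserPCIDevice, VFIO_USER_PCI)

struct VFIOUserPCIDevice {
    VFIOPCIDevice device;
    char *sock_name;    /* path to the vfio-user server socket */
};

static void vfio_user_pci_dev_class_init(ObjectClass *klass, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(klass);
    PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass);

    dc->desc = "VFIO over socket PCI device assignment";
    pdc->realize = vfio_user_pci_realize;
    /* config_read/config_write/exit are inherited from TYPE_VFIO_PCI_BASE */
}

static const TypeInfo vfio_user_pci_dev_info = {
    .name = TYPE_VFIO_USER_PCI,
    .parent = TYPE_VFIO_PCI_BASE,
    .instance_size = sizeof(VFIOUserPCIDevice),
    .class_init = vfio_user_pci_dev_class_init,
};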
Originally-by: John Johnson Signed-off-by: Elena Ufimtseva Signed-off-by: Jagannathan Raman Signed-off-by: John Levon --- hw/vfio/pci.c | 108 ++++++++++++++++++++++++++++++-------------------- hw/vfio/pci.h | 16 +++++++- 2 files changed, 80 insertions(+), 44 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 470fb3502a..99783d3cd4 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -239,7 +239,7 @@ static void vfio_intx_update(VFIOPCIDevice *vdev, PCIINTxRoute *route) static void vfio_intx_routing_notifier(PCIDevice *pdev) { - VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); PCIINTxRoute route; if (vdev->interrupt != VFIO_INT_INTx) { @@ -514,7 +514,7 @@ static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg, static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, MSIMessage *msg, IOHandler *handler) { - VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); VFIOMSIVector *vector; int ret; bool resizing = !!(vdev->nr_vectors < nr + 1); @@ -619,7 +619,7 @@ static int vfio_msix_vector_use(PCIDevice *pdev, static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr) { - VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); VFIOMSIVector *vector = &vdev->msi_vectors[nr]; trace_vfio_msix_vector_release(vdev->vbasedev.name, nr); @@ -1168,7 +1168,7 @@ static const MemoryRegionOps vfio_vga_ops = { */ static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar) { - VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); VFIORegion *region = &vdev->bars[bar].region; MemoryRegion *mmap_mr, *region_mr, *base_mr; PCIIORegion *r; @@ -1214,7 +1214,7 @@ static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar) */ uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len) { - VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val; memcpy(&emu_bits, vdev->emulated_config_bits + addr, len); @@ -1247,7 +1247,7 @@ uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len) void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr, uint32_t val, int len) { - VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); uint32_t val_le = cpu_to_le32(val); trace_vfio_pci_write_config(vdev->vbasedev.name, addr, val, len); @@ -2961,7 +2961,7 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev) static void vfio_realize(PCIDevice *pdev, Error **errp) { ERRP_GUARD(); - VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); VFIODevice *vbasedev = &vdev->vbasedev; char *subsys; int i, ret; @@ -3247,7 +3247,7 @@ error: static void vfio_instance_finalize(Object *obj) { - VFIOPCIDevice *vdev = VFIO_PCI(obj); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj); vfio_display_finalize(vdev); vfio_bars_finalize(vdev); @@ -3265,7 +3265,7 @@ static void vfio_instance_finalize(Object *obj) static void vfio_exitfn(PCIDevice *pdev) { - VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); vfio_unregister_req_notifier(vdev); vfio_unregister_err_notifier(vdev); @@ -3285,7 +3285,7 @@ static void vfio_exitfn(PCIDevice *pdev) static void vfio_pci_reset(DeviceState *dev) { - VFIOPCIDevice *vdev = VFIO_PCI(dev); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(dev); trace_vfio_pci_reset(vdev->vbasedev.name); @@ -3325,7 +3325,7 @@ post_reset: static void vfio_instance_init(Object 
*obj) { PCIDevice *pci_dev = PCI_DEVICE(obj); - VFIOPCIDevice *vdev = VFIO_PCI(obj); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj); VFIODevice *vbasedev = &vdev->vbasedev; device_add_bootindex_property(obj, &vdev->bootindex, @@ -3346,25 +3346,12 @@ static void vfio_instance_init(Object *obj) pci_dev->cap_present |= QEMU_PCI_CAP_EXPRESS; } -static Property vfio_pci_dev_properties[] = { - DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host), - DEFINE_PROP_UUID_NODEFAULT("vf-token", VFIOPCIDevice, vf_token), - DEFINE_PROP_STRING("sysfsdev", VFIOPCIDevice, vbasedev.sysfsdev), +static Property vfio_pci_base_dev_properties[] = { DEFINE_PROP_ON_OFF_AUTO("x-pre-copy-dirty-page-tracking", VFIOPCIDevice, vbasedev.pre_copy_dirty_page_tracking, ON_OFF_AUTO_ON), - DEFINE_PROP_ON_OFF_AUTO("display", VFIOPCIDevice, - display, ON_OFF_AUTO_OFF), - DEFINE_PROP_UINT32("xres", VFIOPCIDevice, display_xres, 0), - DEFINE_PROP_UINT32("yres", VFIOPCIDevice, display_yres, 0), DEFINE_PROP_UINT32("x-intx-mmap-timeout-ms", VFIOPCIDevice, intx.mmap_timeout, 1100), - DEFINE_PROP_BIT("x-vga", VFIOPCIDevice, features, - VFIO_FEATURE_ENABLE_VGA_BIT, false), - DEFINE_PROP_BIT("x-req", VFIOPCIDevice, features, - VFIO_FEATURE_ENABLE_REQ_BIT, true), - DEFINE_PROP_BIT("x-igd-opregion", VFIOPCIDevice, features, - VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false), DEFINE_PROP_ON_OFF_AUTO("enable-migration", VFIOPCIDevice, vbasedev.enable_migration, ON_OFF_AUTO_AUTO), DEFINE_PROP_BOOL("migration-events", VFIOPCIDevice, @@ -3375,8 +3362,6 @@ static Property vfio_pci_dev_properties[] = { DEFINE_PROP_BOOL("x-no-kvm-intx", VFIOPCIDevice, no_kvm_intx, false), DEFINE_PROP_BOOL("x-no-kvm-msi", VFIOPCIDevice, no_kvm_msi, false), DEFINE_PROP_BOOL("x-no-kvm-msix", VFIOPCIDevice, no_kvm_msix, false), - DEFINE_PROP_BOOL("x-no-geforce-quirks", VFIOPCIDevice, - no_geforce_quirks, false), DEFINE_PROP_BOOL("x-no-kvm-ioeventfd", VFIOPCIDevice, no_kvm_ioeventfd, false), DEFINE_PROP_BOOL("x-no-vfio-ioeventfd", VFIOPCIDevice, no_vfio_ioeventfd, @@ -3387,12 +3372,58 @@ static Property vfio_pci_dev_properties[] = { sub_vendor_id, PCI_ANY_ID), DEFINE_PROP_UINT32("x-pci-sub-device-id", VFIOPCIDevice, sub_device_id, PCI_ANY_ID), + DEFINE_PROP_OFF_AUTO_PCIBAR("x-msix-relocation", VFIOPCIDevice, msix_relo, + OFF_AUTOPCIBAR_OFF), + DEFINE_PROP_END_OF_LIST(), +}; + + +static void vfio_pci_base_dev_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass); + + device_class_set_props(dc, vfio_pci_base_dev_properties); + dc->desc = "VFIO PCI base device"; + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + pdc->exit = vfio_exitfn; + pdc->config_read = vfio_pci_read_config; + pdc->config_write = vfio_pci_write_config; +} + +static const TypeInfo vfio_pci_base_dev_info = { + .name = TYPE_VFIO_PCI_BASE, + .parent = TYPE_PCI_DEVICE, + .instance_size = 0, + .abstract = true, + .class_init = vfio_pci_base_dev_class_init, + .interfaces = (InterfaceInfo[]) { + { INTERFACE_PCIE_DEVICE }, + { INTERFACE_CONVENTIONAL_PCI_DEVICE }, + { } + }, +}; + +static Property vfio_pci_dev_properties[] = { + DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host), + DEFINE_PROP_UUID_NODEFAULT("vf-token", VFIOPCIDevice, vf_token), + DEFINE_PROP_STRING("sysfsdev", VFIOPCIDevice, vbasedev.sysfsdev), + DEFINE_PROP_ON_OFF_AUTO("display", VFIOPCIDevice, + display, ON_OFF_AUTO_OFF), + DEFINE_PROP_UINT32("xres", VFIOPCIDevice, display_xres, 0), + DEFINE_PROP_UINT32("yres", VFIOPCIDevice, display_yres, 0), + 
DEFINE_PROP_BIT("x-vga", VFIOPCIDevice, features, + VFIO_FEATURE_ENABLE_VGA_BIT, false), + DEFINE_PROP_BIT("x-req", VFIOPCIDevice, features, + VFIO_FEATURE_ENABLE_REQ_BIT, true), + DEFINE_PROP_BIT("x-igd-opregion", VFIOPCIDevice, features, + VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false), + DEFINE_PROP_BOOL("x-no-geforce-quirks", VFIOPCIDevice, + no_geforce_quirks, false), DEFINE_PROP_UINT32("x-igd-gms", VFIOPCIDevice, igd_gms, 0), DEFINE_PROP_UNSIGNED_NODEFAULT("x-nv-gpudirect-clique", VFIOPCIDevice, nv_gpudirect_clique, qdev_prop_nv_gpudirect_clique, uint8_t), - DEFINE_PROP_OFF_AUTO_PCIBAR("x-msix-relocation", VFIOPCIDevice, msix_relo, - OFF_AUTOPCIBAR_OFF), #ifdef CONFIG_IOMMUFD DEFINE_PROP_LINK("iommufd", VFIOPCIDevice, vbasedev.iommufd, TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *), @@ -3404,7 +3435,8 @@ static Property vfio_pci_dev_properties[] = { #ifdef CONFIG_IOMMUFD static void vfio_pci_set_fd(Object *obj, const char *str, Error **errp) { - vfio_device_set_fd(&VFIO_PCI(obj)->vbasedev, str, errp); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj); + vfio_device_set_fd(&vdev->vbasedev, str, errp); } #endif @@ -3419,25 +3451,16 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) object_class_property_add_str(klass, "fd", NULL, vfio_pci_set_fd); #endif dc->desc = "VFIO-based PCI device assignment"; - set_bit(DEVICE_CATEGORY_MISC, dc->categories); pdc->realize = vfio_realize; - pdc->exit = vfio_exitfn; - pdc->config_read = vfio_pci_read_config; - pdc->config_write = vfio_pci_write_config; } static const TypeInfo vfio_pci_dev_info = { .name = TYPE_VFIO_PCI, - .parent = TYPE_PCI_DEVICE, - .instance_size = sizeof(VFIOPCIDevice), + .parent = TYPE_VFIO_PCI_BASE, + .instance_size = sizeof(VFIOKernelPCIDevice), .class_init = vfio_pci_dev_class_init, .instance_init = vfio_instance_init, .instance_finalize = vfio_instance_finalize, - .interfaces = (InterfaceInfo[]) { - { INTERFACE_PCIE_DEVICE }, - { INTERFACE_CONVENTIONAL_PCI_DEVICE }, - { } - }, }; static Property vfio_pci_dev_nohotplug_properties[] = { @@ -3458,12 +3481,13 @@ static void vfio_pci_nohotplug_dev_class_init(ObjectClass *klass, void *data) static const TypeInfo vfio_pci_nohotplug_dev_info = { .name = TYPE_VFIO_PCI_NOHOTPLUG, .parent = TYPE_VFIO_PCI, - .instance_size = sizeof(VFIOPCIDevice), + .instance_size = sizeof(VFIOKernelPCIDevice), .class_init = vfio_pci_nohotplug_dev_class_init, }; static void register_vfio_pci_dev_type(void) { + type_register_static(&vfio_pci_base_dev_info); type_register_static(&vfio_pci_dev_info); type_register_static(&vfio_pci_nohotplug_dev_info); } diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index bf67df2fbc..770448155e 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -116,8 +116,13 @@ typedef struct VFIOMSIXInfo { bool noresize; } VFIOMSIXInfo; -#define TYPE_VFIO_PCI "vfio-pci" -OBJECT_DECLARE_SIMPLE_TYPE(VFIOPCIDevice, VFIO_PCI) +/* + * TYPE_VFIO_PCI_BASE is an abstract type used to share code + * between VFIO implementations that use a kernel driver + * with those that use user sockets. 
+ */ +#define TYPE_VFIO_PCI_BASE "vfio-pci-base" +OBJECT_DECLARE_SIMPLE_TYPE(VFIOPCIDevice, VFIO_PCI_BASE) struct VFIOPCIDevice { PCIDevice pdev; @@ -182,6 +187,13 @@ struct VFIOPCIDevice { Notifier irqchip_change_notifier; }; +#define TYPE_VFIO_PCI "vfio-pci" +OBJECT_DECLARE_SIMPLE_TYPE(VFIOKernelPCIDevice, VFIO_PCI) + +struct VFIOKernelPCIDevice { + VFIOPCIDevice device; +}; + /* Use uin32_t for vendor & device so PCI_ANY_ID expands and cannot match hw */ static inline bool vfio_pci_is(VFIOPCIDevice *vdev, uint32_t vendor, uint32_t device) { From patchwork Wed May 29 16:23:01 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: John Levon X-Patchwork-Id: 13679239 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from lists.gnu.org (lists.gnu.org [209.51.188.17]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.lore.kernel.org (Postfix) with ESMTPS id 0DDECC25B7C for ; Wed, 29 May 2024 16:29:26 +0000 (UTC) Received: from localhost ([::1] helo=lists1p.gnu.org) by lists.gnu.org with esmtp (Exim 4.90_1) (envelope-from ) id 1sCM6u-00018J-Kb; Wed, 29 May 2024 12:25:08 -0400 Received: from eggs.gnu.org ([2001:470:142:3::10]) by lists.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM6p-00015r-Nf for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:05 -0400 Received: from ssh.movementarian.org ([139.162.205.133] helo=movementarian.org) by eggs.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM6l-0006Kc-3g for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:03 -0400 Received: from movement by movementarian.org with local (Exim 4.95) (envelope-from ) id 1sCM6j-006COt-2R; Wed, 29 May 2024 17:24:57 +0100 From: John Levon To: qemu-devel@nongnu.org Cc: alex.williamson@redhat.com, clg@redhat.com, jag.raman@oracle.com, thanos.makatos@nutanix.com, John Johnson , Elena Ufimtseva , John Levon Subject: [PATCH 08/26] vfio: add device IO ops vector Date: Wed, 29 May 2024 17:23:01 +0100 Message-Id: <20240529162319.1476680-9-levon@movementarian.org> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20240529162319.1476680-1-levon@movementarian.org> References: <20240529162319.1476680-1-levon@movementarian.org> MIME-Version: 1.0 Received-SPF: pass client-ip=139.162.205.133; envelope-from=movement@movementarian.org; helo=movementarian.org X-Spam_score_int: -18 X-Spam_score: -1.9 X-Spam_bar: - X-Spam_report: (-1.9 / 5.0 requ) BAYES_00=-1.9, SPF_HELO_PASS=-0.001, SPF_PASS=-0.001, T_SCC_BODY_TEXT_LINE=-0.01 autolearn=ham autolearn_force=no X-Spam_action: no action X-BeenThere: qemu-devel@nongnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org Sender: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org From: Jagannathan Raman Used for communication with VFIO driver (prep work for vfio-user, which will communicate over a socket) Originally-by: John Johnson Signed-off-by: Elena Ufimtseva Signed-off-by: Jagannathan Raman Signed-off-by: John Levon --- hw/vfio/ap.c | 2 +- hw/vfio/ccw.c | 2 +- hw/vfio/common.c | 7 +- hw/vfio/helpers.c | 100 +++++++++++++++++++++++-- hw/vfio/pci.c | 137 +++++++++++++++++++++------------- hw/vfio/platform.c | 2 +- 
include/hw/vfio/vfio-common.h | 27 ++++++- 7 files changed, 211 insertions(+), 66 deletions(-) diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c index c12531a788..23d700e67a 100644 --- a/hw/vfio/ap.c +++ b/hw/vfio/ap.c @@ -229,7 +229,7 @@ static void vfio_ap_instance_init(Object *obj) * handle ram_block_discard_disable(). */ vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_AP, &vfio_ap_ops, - DEVICE(vapdev), true); + &vfio_dev_io_ioctl, DEVICE(vapdev), true); } #ifdef CONFIG_IOMMUFD diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c index 315449c1b1..b4139c8aef 100644 --- a/hw/vfio/ccw.c +++ b/hw/vfio/ccw.c @@ -681,7 +681,7 @@ static void vfio_ccw_instance_init(Object *obj) * ram_block_discard_disable(). */ vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_CCW, &vfio_ccw_ops, - DEVICE(vcdev), true); + &vfio_dev_io_ioctl, DEVICE(vcdev), true); } #ifdef CONFIG_IOMMUFD diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 1a71c3be05..03c5424938 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -964,7 +964,7 @@ static void vfio_devices_dma_logging_stop(VFIOContainerBase *bcontainer) continue; } - if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) { + if (vbasedev->io->device_feature(vbasedev, feature)) { warn_report("%s: Failed to stop DMA logging, err %d (%s)", vbasedev->name, -errno, strerror(errno)); } @@ -1067,9 +1067,8 @@ static int vfio_devices_dma_logging_start(VFIOContainerBase *bcontainer, continue; } - ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature); + ret = vbasedev->io->device_feature(vbasedev, feature); if (ret) { - ret = -errno; error_setg_errno(errp, errno, "%s: Failed to start DMA logging", vbasedev->name); goto out; @@ -1148,7 +1147,7 @@ static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova, feature->flags = VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT; - if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) { + if (vbasedev->io->device_feature(vbasedev, feature)) { return -errno; } diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c index cdc7eb5bd5..2cd8fbe70c 100644 --- a/hw/vfio/helpers.c +++ b/hw/vfio/helpers.c @@ -42,7 +42,7 @@ void vfio_disable_irqindex(VFIODevice *vbasedev, int index) .count = 0, }; - ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); + vbasedev->io->set_irqs(vbasedev, &irq_set); } void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index) @@ -55,7 +55,7 @@ void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index) .count = 1, }; - ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); + vbasedev->io->set_irqs(vbasedev, &irq_set); } void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index) @@ -68,7 +68,7 @@ void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index) .count = 1, }; - ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); + vbasedev->io->set_irqs(vbasedev, &irq_set); } static inline const char *action_to_str(int action) @@ -115,6 +115,7 @@ bool vfio_set_irq_signaling(VFIODevice *vbasedev, int index, int subindex, int argsz; const char *name; int32_t *pfd; + int ret; argsz = sizeof(*irq_set) + sizeof(*pfd); @@ -127,7 +128,9 @@ bool vfio_set_irq_signaling(VFIODevice *vbasedev, int index, int subindex, pfd = (int32_t *)&irq_set->data; *pfd = fd; - if (!ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irq_set)) { + ret = vbasedev->io->set_irqs(vbasedev, irq_set); + + if (!ret) { return true; } @@ -159,6 +162,7 @@ void vfio_region_write(void *opaque, hwaddr addr, uint32_t dword; uint64_t qword; } buf; + int ret; switch (size) { case 1: @@ -178,7 +182,8 @@ void vfio_region_write(void *opaque, 
hwaddr addr, break; } - if (pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) { + ret = vbasedev->io->region_write(vbasedev, region->nr, addr, size, &buf); + if (ret != size) { error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64 ",%d) failed: %m", __func__, vbasedev->name, region->nr, @@ -210,8 +215,10 @@ uint64_t vfio_region_read(void *opaque, uint64_t qword; } buf; uint64_t data = 0; + int ret; - if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) { + ret = vbasedev->io->region_read(vbasedev, region->nr, addr, size, &buf); + if (ret != size) { error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m", __func__, vbasedev->name, region->nr, addr, size); @@ -532,6 +539,7 @@ int vfio_get_region_info(VFIODevice *vbasedev, int index, struct vfio_region_info **info) { size_t argsz = sizeof(struct vfio_region_info); + int ret; /* create region cache */ if (vbasedev->regions == NULL) { @@ -550,7 +558,8 @@ int vfio_get_region_info(VFIODevice *vbasedev, int index, retry: (*info)->argsz = argsz; - if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, *info)) { + ret = vbasedev->io->get_region_info(vbasedev, *info); + if (ret != 0) { g_free(*info); *info = NULL; return -errno; @@ -660,12 +669,87 @@ void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp) } void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops, - DeviceState *dev, bool ram_discard) + VFIODeviceIO *io, DeviceState *dev, bool ram_discard) { vbasedev->type = type; vbasedev->ops = ops; vbasedev->dev = dev; + vbasedev->io = io; vbasedev->fd = -1; vbasedev->ram_block_discard_allowed = ram_discard; } + +/* + * Traditional ioctl() based io + */ + +static int vfio_io_device_feature(VFIODevice *vbasedev, + struct vfio_device_feature *feature) +{ + int ret; + + ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature); + + return ret < 0 ? -errno : ret; +} + +static int vfio_io_get_region_info(VFIODevice *vbasedev, + struct vfio_region_info *info) +{ + int ret; + + ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, info); + + return ret < 0 ? -errno : ret; +} + +static int vfio_io_get_irq_info(VFIODevice *vbasedev, + struct vfio_irq_info *info) +{ + int ret; + + ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, info); + + return ret < 0 ? -errno : ret; +} + +static int vfio_io_set_irqs(VFIODevice *vbasedev, struct vfio_irq_set *irqs) +{ + int ret; + + ret = ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irqs); + + return ret < 0 ? -errno : ret; +} + +static int vfio_io_region_read(VFIODevice *vbasedev, uint8_t index, off_t off, + uint32_t size, void *data) +{ + struct vfio_region_info *info = vbasedev->regions[index]; + int ret; + + ret = pread(vbasedev->fd, data, size, info->offset + off); + + return ret < 0 ? -errno : ret; +} + +static int vfio_io_region_write(VFIODevice *vbasedev, uint8_t index, off_t off, + uint32_t size, void *data) +{ + struct vfio_region_info *info = vbasedev->regions[index]; + int ret; + + ret = pwrite(vbasedev->fd, data, size, info->offset + off); + + return ret < 0 ? 
-errno : ret; +} + +VFIODeviceIO vfio_dev_io_ioctl = { + .device_feature = vfio_io_device_feature, + .get_region_info = vfio_io_get_region_info, + .get_irq_info = vfio_io_get_irq_info, + .set_irqs = vfio_io_set_irqs, + .region_read = vfio_io_region_read, + .region_write = vfio_io_region_write, +}; diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 99783d3cd4..2ad0dbe342 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -45,6 +45,14 @@ #include "migration/qemu-file.h" #include "sysemu/iommufd.h" +/* convenience macros for PCI config space */ +#define VDEV_CONFIG_READ(vbasedev, off, size, data) \ + ((vbasedev)->io->region_read((vbasedev), VFIO_PCI_CONFIG_REGION_INDEX, \ + (off), (size), (data))) +#define VDEV_CONFIG_WRITE(vbasedev, off, size, data) \ + ((vbasedev)->io->region_write((vbasedev), VFIO_PCI_CONFIG_REGION_INDEX, \ + (off), (size), (data))) + #define TYPE_VFIO_PCI_NOHOTPLUG "vfio-pci-nohotplug" /* Protected by BQL */ @@ -379,6 +387,7 @@ static void vfio_msi_interrupt(void *opaque) static int vfio_enable_msix_no_vec(VFIOPCIDevice *vdev) { g_autofree struct vfio_irq_set *irq_set = NULL; + VFIODevice *vbasedev = &vdev->vbasedev; int ret = 0, argsz; int32_t *fd; @@ -394,7 +403,7 @@ static int vfio_enable_msix_no_vec(VFIOPCIDevice *vdev) fd = (int32_t *)&irq_set->data; *fd = -1; - ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set); + ret = vbasedev->io->set_irqs(vbasedev, irq_set); return ret; } @@ -453,7 +462,7 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) fds[i] = fd; } - ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set); + ret = vdev->vbasedev.io->set_irqs(&vdev->vbasedev, irq_set); g_free(irq_set); @@ -879,13 +888,14 @@ static void vfio_update_msi(VFIOPCIDevice *vdev) static void vfio_pci_load_rom(VFIOPCIDevice *vdev) { + VFIODevice *vbasedev = &vdev->vbasedev; struct vfio_region_info *reg_info = NULL; uint64_t size; off_t off = 0; ssize_t bytes; - if (vfio_get_region_info(&vdev->vbasedev, - VFIO_PCI_ROM_REGION_INDEX, ®_info)) { + if (!vfio_get_region_info(vbasedev, + VFIO_PCI_ROM_REGION_INDEX, ®_info)) { error_report("vfio: Error getting ROM info: %m"); return; } @@ -911,18 +921,19 @@ static void vfio_pci_load_rom(VFIOPCIDevice *vdev) memset(vdev->rom, 0xff, size); while (size) { - bytes = pread(vdev->vbasedev.fd, vdev->rom + off, - size, vdev->rom_offset + off); + bytes = vbasedev->io->region_read(vbasedev, VFIO_PCI_ROM_REGION_INDEX, + off, size, vdev->rom + off); if (bytes == 0) { break; } else if (bytes > 0) { off += bytes; size -= bytes; } else { - if (errno == EINTR || errno == EAGAIN) { + if (bytes == -EINTR || bytes == -EAGAIN) { continue; } - error_report("vfio: Error reading device ROM: %m"); + error_report("vfio: Error reading device ROM: %s", + strerror(-bytes)); break; } } @@ -1010,11 +1021,10 @@ static const MemoryRegionOps vfio_rom_ops = { static void vfio_pci_size_rom(VFIOPCIDevice *vdev) { + VFIODevice *vbasedev = &vdev->vbasedev; uint32_t orig, size = cpu_to_le32((uint32_t)PCI_ROM_ADDRESS_MASK); - off_t offset = vdev->config_offset + PCI_ROM_ADDRESS; DeviceState *dev = DEVICE(vdev); char *name; - int fd = vdev->vbasedev.fd; if (vdev->pdev.romfile || !vdev->pdev.rom_bar) { /* Since pci handles romfile, just print a message and return */ @@ -1031,11 +1041,12 @@ static void vfio_pci_size_rom(VFIOPCIDevice *vdev) * Use the same size ROM BAR as the physical device. The contents * will get filled in later when the guest tries to read it. 
*/ - if (pread(fd, &orig, 4, offset) != 4 || - pwrite(fd, &size, 4, offset) != 4 || - pread(fd, &size, 4, offset) != 4 || - pwrite(fd, &orig, 4, offset) != 4) { - error_report("%s(%s) failed: %m", __func__, vdev->vbasedev.name); + if (VDEV_CONFIG_READ(vbasedev, PCI_ROM_ADDRESS, 4, &orig) != 4 || + VDEV_CONFIG_WRITE(vbasedev, PCI_ROM_ADDRESS, 4, &size) != 4 || + VDEV_CONFIG_READ(vbasedev, PCI_ROM_ADDRESS, 4, &size) != 4 || + VDEV_CONFIG_WRITE(vbasedev, PCI_ROM_ADDRESS, 4, &orig) != 4) { + + error_report("%s(%s) ROM access failed", __func__, vbasedev->name); return; } @@ -1215,6 +1226,7 @@ static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar) uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len) { VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); + VFIODevice *vbasedev = &vdev->vbasedev; uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val; memcpy(&emu_bits, vdev->emulated_config_bits + addr, len); @@ -1227,12 +1239,13 @@ uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len) if (~emu_bits & (0xffffffffU >> (32 - len * 8))) { ssize_t ret; - ret = pread(vdev->vbasedev.fd, &phys_val, len, - vdev->config_offset + addr); + ret = VDEV_CONFIG_READ(vbasedev, addr, len, &phys_val); if (ret != len) { - error_report("%s(%s, 0x%x, 0x%x) failed: %m", - __func__, vdev->vbasedev.name, addr, len); - return -errno; + const char *err = ret < 0 ? strerror(-ret) : "short read"; + + error_report("%s(%s, 0x%x, 0x%x) failed: %s", + __func__, vbasedev->name, addr, len, err); + return -1; } phys_val = le32_to_cpu(phys_val); } @@ -1248,15 +1261,19 @@ void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr, uint32_t val, int len) { VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); + VFIODevice *vbasedev = &vdev->vbasedev; uint32_t val_le = cpu_to_le32(val); + int ret; trace_vfio_pci_write_config(vdev->vbasedev.name, addr, val, len); /* Write everything to VFIO, let it filter out what we can't write */ - if (pwrite(vdev->vbasedev.fd, &val_le, len, vdev->config_offset + addr) - != len) { - error_report("%s(%s, 0x%x, 0x%x, 0x%x) failed: %m", - __func__, vdev->vbasedev.name, addr, val, len); + ret = VDEV_CONFIG_WRITE(vbasedev, addr, len, &val_le); + if (ret != len) { + const char *err = ret < 0 ? strerror(-ret) : "short write"; + + error_report("%s(%s, 0x%x, 0x%x, 0x%x) failed: %s", + __func__, vbasedev->name, addr, val, len, err); } /* MSI/MSI-X Enabling/Disabling */ @@ -1344,9 +1361,12 @@ static bool vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp) int ret, entries; Error *err = NULL; - if (pread(vdev->vbasedev.fd, &ctrl, sizeof(ctrl), - vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) { - error_setg_errno(errp, errno, "failed reading MSI PCI_CAP_FLAGS"); + ret = VDEV_CONFIG_READ(&vdev->vbasedev, pos + PCI_CAP_FLAGS, + sizeof(ctrl), &ctrl); + if (ret != sizeof(ctrl)) { + const char *errmsg = ret < 0 ? 
strerror(-ret) : "short read"; + + error_setg(errp, "failed reading MSI PCI_CAP_FLAGS %s", errmsg); return false; } ctrl = le16_to_cpu(ctrl); @@ -1550,34 +1570,43 @@ static bool vfio_pci_relocate_msix(VFIOPCIDevice *vdev, Error **errp) */ static bool vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp) { + VFIODevice *vbasedev = &vdev->vbasedev; uint8_t pos; uint16_t ctrl; uint32_t table, pba; - int ret, fd = vdev->vbasedev.fd; struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info), .index = VFIO_PCI_MSIX_IRQ_INDEX }; VFIOMSIXInfo *msix; + int ret; pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX); if (!pos) { return true; } - if (pread(fd, &ctrl, sizeof(ctrl), - vdev->config_offset + pos + PCI_MSIX_FLAGS) != sizeof(ctrl)) { - error_setg_errno(errp, errno, "failed to read PCI MSIX FLAGS"); + ret = VDEV_CONFIG_READ(vbasedev, pos + PCI_MSIX_FLAGS, + sizeof(ctrl), &ctrl); + if (ret != sizeof(ctrl)) { + const char *err = ret < 0 ? strerror(-ret) : "short read"; + + error_setg(errp, "failed to read PCI MSIX FLAGS %s", err); return false; } - if (pread(fd, &table, sizeof(table), - vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) { - error_setg_errno(errp, errno, "failed to read PCI MSIX TABLE"); + ret = VDEV_CONFIG_READ(vbasedev, pos + PCI_MSIX_TABLE, + sizeof(table), &table); + if (ret != sizeof(table)) { + const char *err = ret < 0 ? strerror(-ret) : "short read"; + + error_setg(errp, "failed to read PCI MSIX TABLE %s", err); return false; } - if (pread(fd, &pba, sizeof(pba), - vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) { - error_setg_errno(errp, errno, "failed to read PCI MSIX PBA"); + ret = VDEV_CONFIG_READ(vbasedev, pos + PCI_MSIX_PBA, sizeof(pba), &pba); + if (ret != sizeof(pba)) { + const char *err = ret < 0 ? strerror(-ret) : "short read"; + + error_setg(errp, "failed to read PCI MSIX PBA %s", err); return false; } @@ -1592,7 +1621,7 @@ static bool vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp) msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK; msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1; - ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); + ret = vdev->vbasedev.io->get_irq_info(&vdev->vbasedev, &irq_info); if (ret < 0) { error_setg_errno(errp, -ret, "failed to get MSI-X irq info"); g_free(msix); @@ -1736,10 +1765,12 @@ static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr) } /* Determine what type of BAR this is for registration */ - ret = pread(vdev->vbasedev.fd, &pci_bar, sizeof(pci_bar), - vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr)); + ret = VDEV_CONFIG_READ(&vdev->vbasedev, PCI_BASE_ADDRESS_0 + (4 * nr), + sizeof(pci_bar), &pci_bar); if (ret != sizeof(pci_bar)) { - error_report("vfio: Failed to read BAR %d (%m)", nr); + const char *err = ret < 0 ? 
strerror(-ret) : "short read"; + + error_report("vfio: Failed to read BAR %d (%s)", nr, err); return; } @@ -2439,21 +2470,25 @@ void vfio_pci_pre_reset(VFIOPCIDevice *vdev) void vfio_pci_post_reset(VFIOPCIDevice *vdev) { + VFIODevice *vbasedev = &vdev->vbasedev; Error *err = NULL; - int nr; + int ret, nr; if (!vfio_intx_enable(vdev, &err)) { error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); } for (nr = 0; nr < PCI_NUM_REGIONS - 1; ++nr) { - off_t addr = vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr); + off_t addr = PCI_BASE_ADDRESS_0 + (4 * nr); uint32_t val = 0; uint32_t len = sizeof(val); - if (pwrite(vdev->vbasedev.fd, &val, len, addr) != len) { - error_report("%s(%s) reset bar %d failed: %m", __func__, - vdev->vbasedev.name, nr); + ret = VDEV_CONFIG_WRITE(vbasedev, addr, len, &val); + if (ret != len) { + const char *errmsg = ret < 0 ? strerror(-ret) : "short write"; + + error_report("%s(%s) reset bar %d failed: %s", __func__, + vbasedev->name, nr, errmsg); } } @@ -2795,7 +2830,7 @@ static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) irq_info.index = VFIO_PCI_ERR_IRQ_INDEX; - ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); + ret = vbasedev->io->get_irq_info(vbasedev, &irq_info); if (ret) { /* This can fail for an old kernel or legacy PCI dev */ trace_vfio_populate_device_get_irq_info_failure(strerror(errno)); @@ -2916,8 +2951,10 @@ static void vfio_register_req_notifier(VFIOPCIDevice *vdev) return; } - if (ioctl(vdev->vbasedev.fd, - VFIO_DEVICE_GET_IRQ_INFO, &irq_info) < 0 || irq_info.count < 1) { + if (vdev->vbasedev.io->get_irq_info(&vdev->vbasedev, &irq_info) < 0) { + return; + } + if (irq_info.count < 1) { return; } @@ -3337,7 +3374,7 @@ static void vfio_instance_init(Object *obj) vdev->host.function = ~0U; vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_PCI, &vfio_pci_ops, - DEVICE(vdev), false); + &vfio_dev_io_ioctl, DEVICE(vdev), false); vdev->nv_gpudirect_clique = 0xFF; diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c index a85c199c76..86ecd97fde 100644 --- a/hw/vfio/platform.c +++ b/hw/vfio/platform.c @@ -649,7 +649,7 @@ static void vfio_platform_instance_init(Object *obj) VFIODevice *vbasedev = &vdev->vbasedev; vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_PLATFORM, &vfio_platform_ops, - DEVICE(vdev), false); + &vfio_dev_io_ioctl, DEVICE(vdev), false); } #ifdef CONFIG_IOMMUFD diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 2428c7d80c..689cafe28b 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -99,6 +99,7 @@ typedef struct VFIOIOMMUFDContainer { } VFIOIOMMUFDContainer; typedef struct VFIODeviceOps VFIODeviceOps; +typedef struct VFIODeviceIO VFIODeviceIO; typedef struct VFIODevice { QLIST_ENTRY(VFIODevice) next; @@ -118,6 +119,7 @@ typedef struct VFIODevice { OnOffAuto enable_migration; bool migration_events; VFIODeviceOps *ops; + VFIODeviceIO *io; unsigned int num_irqs; unsigned int num_regions; unsigned int flags; @@ -163,6 +165,29 @@ struct VFIODeviceOps { int (*vfio_load_config)(VFIODevice *vdev, QEMUFile *f); }; +#ifdef CONFIG_LINUX + +/* + * How devices communicate with the server. The default option is through + * ioctl() to the kernel VFIO driver, but vfio-user can use a socket to a remote + * process. 
+ */ +struct VFIODeviceIO { + int (*device_feature)(VFIODevice *vdev, struct vfio_device_feature *); + int (*get_region_info)(VFIODevice *vdev, + struct vfio_region_info *info); + int (*get_irq_info)(VFIODevice *vdev, struct vfio_irq_info *irq); + int (*set_irqs)(VFIODevice *vdev, struct vfio_irq_set *irqs); + int (*region_read)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t size, + void *data); + int (*region_write)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t size, + void *data); +}; + +extern VFIODeviceIO vfio_dev_io_ioctl; + +#endif /* CONFIG_LINUX */ + typedef struct VFIOGroup { int fd; int groupid; @@ -289,5 +314,5 @@ int vfio_get_dirty_bitmap(const VFIOContainerBase *bcontainer, uint64_t iova, bool vfio_device_get_name(VFIODevice *vbasedev, Error **errp); void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp); void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops, - DeviceState *dev, bool ram_discard); + VFIODeviceIO *io, DeviceState *dev, bool ram_discard); #endif /* HW_VFIO_VFIO_COMMON_H */ From patchwork Wed May 29 16:23:02 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: John Levon X-Patchwork-Id: 13679237 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from lists.gnu.org (lists.gnu.org [209.51.188.17]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.lore.kernel.org (Postfix) with ESMTPS id EC484C25B75 for ; Wed, 29 May 2024 16:29:11 +0000 (UTC) Received: from localhost ([::1] helo=lists1p.gnu.org) by lists.gnu.org with esmtp (Exim 4.90_1) (envelope-from ) id 1sCM7U-0001rO-3f; Wed, 29 May 2024 12:25:44 -0400 Received: from eggs.gnu.org ([2001:470:142:3::10]) by lists.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM6u-00018I-HL for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:08 -0400 Received: from ssh.movementarian.org ([139.162.205.133] helo=movementarian.org) by eggs.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM6l-0006Ke-45 for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:07 -0400 Received: from movement by movementarian.org with local (Exim 4.95) (envelope-from ) id 1sCM6j-006COw-3f; Wed, 29 May 2024 17:24:57 +0100 From: John Levon To: qemu-devel@nongnu.org Cc: alex.williamson@redhat.com, clg@redhat.com, jag.raman@oracle.com, thanos.makatos@nutanix.com, John Levon Subject: [PATCH 09/26] vfio-user: introduce vfio-user protocol specification Date: Wed, 29 May 2024 17:23:02 +0100 Message-Id: <20240529162319.1476680-10-levon@movementarian.org> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20240529162319.1476680-1-levon@movementarian.org> References: <20240529162319.1476680-1-levon@movementarian.org> MIME-Version: 1.0 Received-SPF: pass client-ip=139.162.205.133; envelope-from=movement@movementarian.org; helo=movementarian.org X-Spam_score_int: -18 X-Spam_score: -1.9 X-Spam_bar: - X-Spam_report: (-1.9 / 5.0 requ) BAYES_00=-1.9, SPF_HELO_PASS=-0.001, SPF_PASS=-0.001, T_SCC_BODY_TEXT_LINE=-0.01, WEIRD_QUOTING=0.001 autolearn=ham autolearn_force=no X-Spam_action: no action X-BeenThere: qemu-devel@nongnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org Sender: 
qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org From: Thanos Makatos This patch introduces the vfio-user protocol specification (formerly known as VFIO-over-socket), which is designed to allow devices to be emulated outside QEMU, in a separate process. vfio-user reuses the existing VFIO defines, structs and concepts. It has been earlier discussed as an RFC in: "RFC: use VFIO over a UNIX domain socket to implement device offloading" Signed-off-by: Thanos Makatos Signed-off-by: John Levon --- MAINTAINERS | 4 +- docs/devel/index-internals.rst | 1 + docs/devel/vfio-user.rst | 1522 ++++++++++++++++++++++++++++++++ 3 files changed, 1526 insertions(+), 1 deletion(-) create mode 100644 docs/devel/vfio-user.rst diff --git a/MAINTAINERS b/MAINTAINERS index 448dc951c5..8744b2d76e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4058,11 +4058,13 @@ F: include/semihosting/ F: tests/tcg/multiarch/arm-compat-semi/ F: tests/tcg/aarch64/system/semiheap.c -Multi-process QEMU +Multi-process QEMU / vfio-user M: Elena Ufimtseva M: Jagannathan Raman +M: Thanos Makatos S: Maintained F: docs/devel/multi-process.rst +F: docs/devel/vfio-user.rst F: docs/system/multi-process.rst F: hw/pci-host/remote.c F: include/hw/pci-host/remote.h diff --git a/docs/devel/index-internals.rst b/docs/devel/index-internals.rst index 5636e9cf1d..44edd9b149 100644 --- a/docs/devel/index-internals.rst +++ b/docs/devel/index-internals.rst @@ -18,5 +18,6 @@ Details about QEMU's various subsystems including how to add features to them. s390-dasd-ipl tracing vfio-iommufd + vfio-user writing-monitor-commands virtio-backends diff --git a/docs/devel/vfio-user.rst b/docs/devel/vfio-user.rst new file mode 100644 index 0000000000..0d96477a68 --- /dev/null +++ b/docs/devel/vfio-user.rst @@ -0,0 +1,1522 @@ +.. include:: +******************************** +vfio-user Protocol Specification +******************************** + +-------------- +Version_ 0.9.1 +-------------- + +.. contents:: Table of Contents + +Introduction +============ +vfio-user is a protocol that allows a device to be emulated in a separate +process outside of a Virtual Machine Monitor (VMM). vfio-user devices consist +of a generic VFIO device type, living inside the VMM, which we call the client, +and the core device implementation, living outside the VMM, which we call the +server. + +The vfio-user specification is partly based on the +`Linux VFIO ioctl interface `_. + +VFIO is a mature and stable API, backed by an extensively used framework. The +existing VFIO client implementation in QEMU (``qemu/hw/vfio/``) can be largely +re-used, though there is nothing in this specification that requires that +particular implementation. None of the VFIO kernel modules are required for +supporting the protocol, on either the client or server side. Some source +definitions in VFIO are re-used for vfio-user. + +The main idea is to allow a virtual device to function in a separate process in +the same host over a UNIX domain socket. A UNIX domain socket (``AF_UNIX``) is +chosen because file descriptors can be trivially sent over it, which in turn +allows: + +* Sharing of client memory for DMA with the server. +* Sharing of server memory with the client for fast MMIO. +* Efficient sharing of eventfd's for triggering interrupts. + +Other socket types could be used which allow the server to run in a separate +guest in the same host (``AF_VSOCK``) or remotely (``AF_INET``). 
Theoretically, +the underlying transport does not have to be a socket; however, we do +not examine such alternatives. In this protocol version we focus on using a UNIX +domain socket and introduce basic support for the other two types of sockets +without considering performance implications. + +While passing of file descriptors is desirable for performance reasons, support +is not necessary for either the client or the server in order to implement the +protocol. There is always an in-band, message-passing fallback mechanism. + +Overview +======== + +VFIO is a framework that allows a physical device to be securely passed through +to a user space process; the device-specific kernel driver does not drive the +device at all. Typically, the user space process is a VMM and the device is +passed through to it in order to achieve high performance. VFIO provides an API +and the required functionality in the kernel. QEMU has adopted VFIO to allow a +guest to directly access physical devices, instead of emulating them in +software. + +vfio-user reuses the core VFIO concepts defined in its API, but implements them +as messages to be sent over a socket. It does not change the kernel-based VFIO +in any way; in fact, none of the VFIO kernel modules need to be loaded to use +vfio-user. It is also possible for the client to concurrently use the current +kernel-based VFIO for one device, and vfio-user for another device. + +VFIO Device Model +----------------- + +A device under VFIO presents a standard interface to the user process. Many of +the VFIO operations in the existing interface use the ``ioctl()`` system call, and +references to the existing interface are called the ``ioctl()`` implementation in +this document. + +The following sections describe the set of messages that implement the vfio-user +interface over a socket. In many cases, the messages are analogous to data +structures used in the ``ioctl()`` implementation. Messages derived from the +``ioctl()`` will have a name derived from the ``ioctl()`` command name. E.g., the +``VFIO_DEVICE_GET_INFO`` ``ioctl()`` command becomes a +``VFIO_USER_DEVICE_GET_INFO`` message. The purpose of this reuse is to share as +much code as feasible with the ``ioctl()`` implementation. + +Connection Initiation +^^^^^^^^^^^^^^^^^^^^^ + +After the client connects to the server, the initial client message is +``VFIO_USER_VERSION`` to propose a protocol version and set of capabilities to +apply to the session. The server replies with a compatible version and set of +capabilities it supports, or closes the connection if it cannot support the +advertised version. + +Device Information +^^^^^^^^^^^^^^^^^^ + +The client uses a ``VFIO_USER_DEVICE_GET_INFO`` message to query the server for +information about the device. This information includes: + +* The device type and whether it supports reset (``VFIO_DEVICE_FLAGS_``), +* the number of device regions, and +* the number of interrupt types the device supports. + +Region Information +^^^^^^^^^^^^^^^^^^ + +The client uses ``VFIO_USER_DEVICE_GET_REGION_INFO`` messages to query the +server for information about the device's regions. This information describes: + +* Read and write permissions, whether the region can be memory mapped, and whether it + supports additional capabilities (``VFIO_REGION_INFO_CAP_``). +* Region index, size, and offset. + +When a device region can be mapped by the client, the server provides a file +descriptor which the client can ``mmap()``.
The server is responsible for +polling for client updates to memory mapped regions. + +Region Capabilities +""""""""""""""""""" + +Some regions have additional capabilities that cannot be described adequately +by the region info data structure. These capabilities are returned in the +region info reply in a list similar to PCI capabilities in a PCI device's +configuration space. + +Sparse Regions +"""""""""""""" +A region can be memory-mappable in whole or in part. When only a subset of a +region can be mapped by the client, a ``VFIO_REGION_INFO_CAP_SPARSE_MMAP`` +capability is included in the region info reply. This capability describes +which portions can be mapped by the client. + +.. Note:: + For example, in a virtual NVMe controller, sparse regions can be used so + that accesses to the NVMe registers (found in the beginning of BAR0) are + trapped (an infrequent event), while allowing direct access to the doorbells + (an extremely frequent event as every I/O submission requires a write to + BAR0), found in the next page after the NVMe registers in BAR0. + +Device-Specific Regions +""""""""""""""""""""""" + +A device can define regions additional to the standard ones (e.g. PCI indexes +0-8). This is achieved by including a ``VFIO_REGION_INFO_CAP_TYPE`` capability +in the region info reply of a device-specific region. Such regions are reflected +in ``struct vfio_user_device_info.num_regions``. Thus, for PCI devices this +value can be equal to, or higher than, ``VFIO_PCI_NUM_REGIONS``. + +Region I/O via file descriptors +------------------------------- + +For unmapped regions, region I/O from the client is done via +``VFIO_USER_REGION_READ/WRITE``. As an optimization, ioeventfds or ioregionfds +may be configured for sub-regions of some regions. A client may request +information on these sub-regions via ``VFIO_USER_DEVICE_GET_REGION_IO_FDS``; by +configuring the returned file descriptors as ioeventfds or ioregionfds, the +server can be directly notified of I/O (for example, by KVM) without taking a +trip through the client. + +Interrupts +^^^^^^^^^^ + +The client uses ``VFIO_USER_DEVICE_GET_IRQ_INFO`` messages to query the server +for the device's interrupt types. The interrupt types are specific to the bus +the device is attached to, and the client is expected to know the capabilities +of each interrupt type. The server can signal an interrupt by directly injecting +interrupts into the guest via an event file descriptor. The client configures +how the server signals an interrupt with ``VFIO_USER_SET_IRQS`` messages. + +Device Read and Write +^^^^^^^^^^^^^^^^^^^^^ + +When the guest executes load or store operations to an unmapped device region, +the client forwards these operations to the server with +``VFIO_USER_REGION_READ`` or ``VFIO_USER_REGION_WRITE`` messages. The server +will reply with data from the device on read operations or an acknowledgement on +write operations. See `Read and Write Operations`_. + +Client memory access +-------------------- + +The client uses ``VFIO_USER_DMA_MAP`` and ``VFIO_USER_DMA_UNMAP`` messages to +inform the server of the valid DMA ranges that the server can access on behalf +of a device (typically, VM guest memory). DMA memory may be accessed by the +server via ``VFIO_USER_DMA_READ`` and ``VFIO_USER_DMA_WRITE`` messages over the +socket. In this case, the "DMA" part of the naming is a misnomer. + +Actual direct memory access of client memory from the server is possible if the +client provides file descriptors the server can ``mmap()``. 
Note that ``mmap()`` +privileges cannot be revoked by the client, therefore file descriptors should +only be exported in environments where the client trusts the server not to +corrupt guest memory. + +See `Read and Write Operations`_. + +Client/server interactions +========================== + +Socket +------ + +A server can serve: + +1) one or more clients, and/or +2) one or more virtual devices, belonging to one or more clients. + +The current protocol specification requires a dedicated socket per +client/server connection. It is a server-side implementation detail whether a +single server handles multiple virtual devices from the same or multiple +clients. The location of the socket is implementation-specific. Multiplexing +clients, devices, and servers over the same socket is not supported in this +version of the protocol. + +Authentication +-------------- + +For ``AF_UNIX``, we rely on OS mandatory access controls on the socket files, +therefore it is up to the management layer to set up the socket as required. +Socket types that span guests or hosts will require a proper authentication +mechanism. Defining that mechanism is deferred to a future version of the +protocol. + +Command Concurrency +------------------- + +A client may pipeline multiple commands without waiting for previous command +replies. The server will process commands in the order they are received. A +consequence of this is if a client issues a command with the *No_reply* bit, +then subsequently issues a command without *No_reply*, the older command will +have been processed before the reply to the younger command is sent by the +server. The client must be aware of the device's capability to process +concurrent commands if pipelining is used. For example, pipelining allows +multiple client threads to concurrently access device regions; the client must +ensure these accesses obey device semantics. + +An example is a frame buffer device, where the device may allow concurrent +access to different areas of video memory, but may have indeterminate behavior +if concurrent accesses are performed to command or status registers. + +Note that unrelated messages sent from the server to the client can appear in +between a client to server request/reply and vice versa. + +Implementers should be prepared for certain commands to exhibit potentially +unbounded latencies. For example, ``VFIO_USER_DEVICE_RESET`` may take an +arbitrarily long time to complete; clients should take care not to block +unnecessarily. + +Socket Disconnection Behavior +----------------------------- +The server and the client can disconnect from each other, either intentionally +or unexpectedly. Both the client and the server need to know how to handle such +events. + +Server Disconnection +^^^^^^^^^^^^^^^^^^^^ +A server disconnecting from the client may indicate that: + +1) A virtual device has been restarted, either intentionally (e.g. because of a + device update) or unintentionally (e.g. because of a crash). +2) A virtual device has been shut down with no intention to be restarted. + +It is impossible for the client to know whether or not a failure is +intermittent or innocuous and should be retried, therefore the client should +reset the VFIO device when it detects the socket has been disconnected. +Error recovery will be driven by the guest's device error handling +behavior. + +Client Disconnection +^^^^^^^^^^^^^^^^^^^^ +The client disconnecting from the server primarily means that the client +has exited. 
Currently, this means that the guest is shut down, so the device is +no longer needed and the server can therefore exit automatically. However, there +can be cases where a client disconnection should not result in a server exit: + +1) A single server serving multiple clients. +2) A multi-process QEMU upgrading itself step by step, which is not yet + implemented. + +Therefore, in order for the protocol to be forward compatible, the server should +respond to a client disconnection as follows: + + - all client memory regions are unmapped and cleaned up (including closing any + passed file descriptors) + - all IRQ file descriptors passed from the old client are closed + - the device state should otherwise be retained + +The expectation is that when a client reconnects, it will re-establish IRQ and +client memory mappings. + +If anything happens to the client (for example, if QEMU really did exit), the control +stack will know about it and can clean up resources accordingly. + +Security Considerations +----------------------- + +Speaking generally, vfio-user clients should not trust servers, and vice versa. +Standard tools and mechanisms should be used on both sides to validate input and +protect against denial of service scenarios, buffer overflows, etc. + +Request Retry and Response Timeout +---------------------------------- +A failed command is a command that has been successfully sent and has been +responded to with an error code. Failure to send the command in the first place +(e.g. because the socket is disconnected) is a different type of error, examined +earlier in the disconnect section. + +.. Note:: + QEMU's VFIO retries certain operations if they fail. While this makes sense + for real HW, we don't know for sure whether it makes sense for virtual + devices. + +Defining a retry and timeout scheme is deferred to a future version of the +protocol. + +Message sizes +------------- + +Some requests have an ``argsz`` field. In a request, it defines the maximum +expected reply payload size, which should be at least the size of the fixed +reply payload headers defined here. The *request* payload size is defined by the +usual ``msg_size`` field in the header, not the ``argsz`` field. + +In a reply, the server sets the ``argsz`` field to the size needed for the full +payload. This may be less than the requested maximum size. It may also be +larger than the requested maximum size: in that case, the full payload is not +included in the reply, but the ``argsz`` field in the reply indicates the needed +size, allowing a client to allocate a larger buffer for holding the reply before +trying again. + +In addition, during negotiation (see `Version`_), the client and server may +each specify a ``max_data_xfer_size`` value; this defines the maximum data that +may be read or written via one of the ``VFIO_USER_DMA/REGION_READ/WRITE`` +messages; see `Read and Write Operations`_. + +Protocol Specification +====================== + +To distinguish from the base VFIO symbols, all vfio-user symbols are prefixed +with ``vfio_user`` or ``VFIO_USER``. In this revision, all data is in the +endianness of the host system, although this may be relaxed in future +revisions in cases where the client and server run on different hosts +with different endianness. + +Unless otherwise specified, all sizes should be presumed to be in bytes. + +.. _Commands: + +Commands +-------- +The following table lists the VFIO message command IDs, and whether the +message command is sent from the client or the server.
+ +====================================== ========= ================= +Name Command Request Direction +====================================== ========= ================= +``VFIO_USER_VERSION`` 1 client -> server +``VFIO_USER_DMA_MAP`` 2 client -> server +``VFIO_USER_DMA_UNMAP`` 3 client -> server +``VFIO_USER_DEVICE_GET_INFO`` 4 client -> server +``VFIO_USER_DEVICE_GET_REGION_INFO`` 5 client -> server +``VFIO_USER_DEVICE_GET_REGION_IO_FDS`` 6 client -> server +``VFIO_USER_DEVICE_GET_IRQ_INFO`` 7 client -> server +``VFIO_USER_DEVICE_SET_IRQS`` 8 client -> server +``VFIO_USER_REGION_READ`` 9 client -> server +``VFIO_USER_REGION_WRITE`` 10 client -> server +``VFIO_USER_DMA_READ`` 11 server -> client +``VFIO_USER_DMA_WRITE`` 12 server -> client +``VFIO_USER_DEVICE_RESET`` 13 client -> server +``VFIO_USER_REGION_WRITE_MULTI`` 15 client -> server +====================================== ========= ================= + +Header +------ + +All messages, both command messages and reply messages, are preceded by a +16-byte header that contains basic information about the message. The header is +followed by message-specific data described in the sections below. + ++----------------+--------+-------------+ +| Name | Offset | Size | ++================+========+=============+ +| Message ID | 0 | 2 | ++----------------+--------+-------------+ +| Command | 2 | 2 | ++----------------+--------+-------------+ +| Message size | 4 | 4 | ++----------------+--------+-------------+ +| Flags | 8 | 4 | ++----------------+--------+-------------+ +| | +-----+------------+ | +| | | Bit | Definition | | +| | +=====+============+ | +| | | 0-3 | Type | | +| | +-----+------------+ | +| | | 4 | No_reply | | +| | +-----+------------+ | +| | | 5 | Error | | +| | +-----+------------+ | ++----------------+--------+-------------+ +| Error | 12 | 4 | ++----------------+--------+-------------+ +| | 16 | variable | ++----------------+--------+-------------+ + +* *Message ID* identifies the message, and is echoed in the command's reply + message. Message IDs belong entirely to the sender, can be re-used (even + concurrently) and the receiver must not make any assumptions about their + uniqueness. +* *Command* specifies the command to be executed, listed in Commands_. It is + also set in the reply header. +* *Message size* contains the size of the entire message, including the header. +* *Flags* contains attributes of the message: + + * The *Type* bits indicate the message type. + + * *Command* (value 0x0) indicates a command message. + * *Reply* (value 0x1) indicates a reply message acknowledging a previous + command with the same message ID. + * *No_reply* in a command message indicates that no reply is needed for this + command. This is commonly used when multiple commands are sent, and only + the last needs acknowledgement. + * *Error* in a reply message indicates the command being acknowledged had + an error. In this case, the *Error* field will be valid. + +* *Error* in a reply message is an optional UNIX errno value. It may be zero + even if the Error bit is set in Flags. It is reserved in a command message. + +Each command message in Commands_ must be replied to with a reply message, +unless the message sets the *No_Reply* bit. The reply consists of the header +with the *Reply* bit set, plus any additional data. + +If an error occurs, the reply message must only include the reply header. 
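For orientation, the 16-byte header described above can be sketched in C roughly
as follows. The specification only defines the wire layout, so the struct and
macro names here are illustrative placeholders, not identifiers defined by
vfio-user or QEMU::

    #include <stdint.h>

    /* Wire layout of the 16-byte vfio-user message header (host endianness). */
    typedef struct {
        uint16_t msg_id;    /* Message ID, echoed back in the reply           */
        uint16_t command;   /* command number from the Commands_ table        */
        uint32_t msg_size;  /* total message size, including this header      */
        uint32_t flags;     /* Type in bits 0-3, No_reply bit 4, Error bit 5  */
        uint32_t error;     /* optional errno value, valid when Error is set  */
    } vfio_user_header_t;

    /* Flag values implied by the table above (names made up for this sketch). */
    #define VFIO_USER_TYPE_COMMAND   0x0u       /* Type bits: command message  */
    #define VFIO_USER_TYPE_REPLY     0x1u       /* Type bits: reply message    */
    #define VFIO_USER_FLAG_NO_REPLY  (1u << 4)  /* no reply is needed          */
    #define VFIO_USER_FLAG_ERROR     (1u << 5)  /* reply acknowledges an error */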
+ +As the header is standard in both requests and replies, it is not included in +the command-specific specifications below; each message definition should be +appended to the standard header, and the offsets are given from the end of the +standard header. + +``VFIO_USER_VERSION`` +--------------------- + +.. _Version: + +This is the initial message sent by the client after the socket connection is +established; the same format is used for the server's reply. + +Upon establishing a connection, the client must send a ``VFIO_USER_VERSION`` +message proposing a protocol version and a set of capabilities. The server +compares these with the versions and capabilities it supports and sends a +``VFIO_USER_VERSION`` reply according to the following rules. + +* The major version in the reply must be the same as proposed. If the client + does not support the proposed major, it closes the connection. +* The minor version in the reply must be equal to or less than the minor + version proposed. +* The capability list must be a subset of those proposed. If the server + requires a capability the client did not include, it closes the connection. + +The protocol major version will only change when incompatible protocol changes +are made, such as changing the message format. The minor version may change +when compatible changes are made, such as adding new messages or capabilities, +Both the client and server must support all minor versions less than the +maximum minor version it supports. E.g., an implementation that supports +version 1.3 must also support 1.0 through 1.2. + +When making a change to this specification, the protocol version number must +be included in the form "added in version X.Y" + +Request +^^^^^^^ + +============== ====== ==== +Name Offset Size +============== ====== ==== +version major 0 2 +version minor 2 2 +version data 4 variable (including terminating NUL). Optional. +============== ====== ==== + +The version data is an optional UTF-8 encoded JSON byte array with the following +format: + ++--------------+--------+-----------------------------------+ +| Name | Type | Description | ++==============+========+===================================+ +| capabilities | object | Contains common capabilities that | +| | | the sender supports. Optional. | ++--------------+--------+-----------------------------------+ + +Capabilities: + ++--------------------+---------+------------------------------------------------+ +| Name | Type | Description | ++====================+=========+================================================+ +| max_msg_fds | number | Maximum number of file descriptors that can be | +| | | received by the sender in one message. | +| | | Optional. If not specified then the receiver | +| | | must assume a value of ``1``. | ++--------------------+---------+------------------------------------------------+ +| max_data_xfer_size | number | Maximum ``count`` for data transfer messages; | +| | | see `Read and Write Operations`_. Optional, | +| | | with a default value of 1048576 bytes. | ++--------------------+---------+------------------------------------------------+ +| pgsizes | number | Page sizes supported in DMA map operations | +| | | or'ed together. Optional, with a default value | +| | | of supporting only 4k pages. | ++--------------------+---------+------------------------------------------------+ +| max_dma_maps | number | Maximum number DMA map windows that can be | +| | | valid simultaneously. Optional, with a | +| | | value of 65535 (64k-1). 
| ++--------------------+---------+------------------------------------------------+ +| migration | object | Migration capability parameters. If missing | +| | | then migration is not supported by the sender. | ++--------------------+---------+------------------------------------------------+ +| write_multiple | boolean | ``VFIO_USER_REGION_WRITE_MULTI`` messages | +| | | are supported if the value is ``true``. | ++--------------------+---------+------------------------------------------------+ + +The migration capability contains the following name/value pairs: + ++-----------------+--------+--------------------------------------------------+ +| Name | Type | Description | ++=================+========+==================================================+ +| pgsize | number | Page size of dirty pages bitmap. The smallest | +| | | between the client and the server is used. | ++-----------------+--------+--------------------------------------------------+ +| max_bitmap_size | number | Maximum bitmap size in ``VFIO_USER_DIRTY_PAGES`` | +| | | and ``VFIO_DMA_UNMAP`` messages. Optional, | +| | | with a default value of 256MB. | ++-----------------+--------+--------------------------------------------------+ + +Reply +^^^^^ + +The same message format is used in the server's reply with the semantics +described above. + +``VFIO_USER_DMA_MAP`` +--------------------- + +This command message is sent by the client to the server to inform it of the +memory regions the server can access. It must be sent before the server can +perform any DMA to the client. It is normally sent directly after the version +handshake is completed, but may also occur when memory is added to the client, +or if the client uses a vIOMMU. + +Request +^^^^^^^ + +The request payload for this message is a structure of the following format: + ++-------------+--------+-------------+ +| Name | Offset | Size | ++=============+========+=============+ +| argsz | 0 | 4 | ++-------------+--------+-------------+ +| flags | 4 | 4 | ++-------------+--------+-------------+ +| | +-----+------------+ | +| | | Bit | Definition | | +| | +=====+============+ | +| | | 0 | readable | | +| | +-----+------------+ | +| | | 1 | writeable | | +| | +-----+------------+ | ++-------------+--------+-------------+ +| offset | 8 | 8 | ++-------------+--------+-------------+ +| address | 16 | 8 | ++-------------+--------+-------------+ +| size | 24 | 8 | ++-------------+--------+-------------+ + +* *argsz* is the size of the above structure. Note there is no reply payload, + so this field differs from other message types. +* *flags* contains the following region attributes: + + * *readable* indicates that the region can be read from. + + * *writeable* indicates that the region can be written to. + +* *offset* is the file offset of the region with respect to the associated file + descriptor, or zero if the region is not mappable +* *address* is the base DMA address of the region. +* *size* is the size of the region. + +This structure is 32 bytes in size, so the message size is 16 + 32 bytes. + +If the DMA region being added can be directly mapped by the server, a file +descriptor must be sent as part of the message meta-data. The region can be +mapped via the mmap() system call. On ``AF_UNIX`` sockets, the file descriptor +must be passed as ``SCM_RIGHTS`` type ancillary data. 
Otherwise, if the DMA +region cannot be directly mapped by the server, no file descriptor must be sent +as part of the message meta-data and the DMA region can be accessed by the +server using ``VFIO_USER_DMA_READ`` and ``VFIO_USER_DMA_WRITE`` messages, +explained in `Read and Write Operations`_. A command to map over an existing +region must be failed by the server with ``EEXIST`` set in error field in the +reply. + +Reply +^^^^^ + +There is no payload in the reply message. + +``VFIO_USER_DMA_UNMAP`` +----------------------- + +This command message is sent by the client to the server to inform it that a +DMA region, previously made available via a ``VFIO_USER_DMA_MAP`` command +message, is no longer available for DMA. It typically occurs when memory is +subtracted from the client or if the client uses a vIOMMU. The DMA region is +described by the following structure: + +Request +^^^^^^^ + +The request payload for this message is a structure of the following format: + ++--------------+--------+------------------------+ +| Name | Offset | Size | ++==============+========+========================+ +| argsz | 0 | 4 | ++--------------+--------+------------------------+ +| flags | 4 | 4 | ++--------------+--------+------------------------+ +| address | 8 | 8 | ++--------------+--------+------------------------+ +| size | 16 | 8 | ++--------------+--------+------------------------+ + +* *argsz* is the maximum size of the reply payload. +* *flags* is unused in this version. +* *address* is the base DMA address of the DMA region. +* *size* is the size of the DMA region. + +The address and size of the DMA region being unmapped must match exactly a +previous mapping. + +Reply +^^^^^ + +Upon receiving a ``VFIO_USER_DMA_UNMAP`` command, if the file descriptor is +mapped then the server must release all references to that DMA region before +replying, which potentially includes in-flight DMA transactions. + +The server responds with the original DMA entry in the request. + + +``VFIO_USER_DEVICE_GET_INFO`` +----------------------------- + +This command message is sent by the client to the server to query for basic +information about the device. + +Request +^^^^^^^ + ++-------------+--------+--------------------------+ +| Name | Offset | Size | ++=============+========+==========================+ +| argsz | 0 | 4 | ++-------------+--------+--------------------------+ +| flags | 4 | 4 | ++-------------+--------+--------------------------+ +| | +-----+-------------------------+ | +| | | Bit | Definition | | +| | +=====+=========================+ | +| | | 0 | VFIO_DEVICE_FLAGS_RESET | | +| | +-----+-------------------------+ | +| | | 1 | VFIO_DEVICE_FLAGS_PCI | | +| | +-----+-------------------------+ | ++-------------+--------+--------------------------+ +| num_regions | 8 | 4 | ++-------------+--------+--------------------------+ +| num_irqs | 12 | 4 | ++-------------+--------+--------------------------+ + +* *argsz* is the maximum size of the reply payload +* all other fields must be zero. 
+ +Reply +^^^^^ + ++-------------+--------+--------------------------+ +| Name | Offset | Size | ++=============+========+==========================+ +| argsz | 0 | 4 | ++-------------+--------+--------------------------+ +| flags | 4 | 4 | ++-------------+--------+--------------------------+ +| | +-----+-------------------------+ | +| | | Bit | Definition | | +| | +=====+=========================+ | +| | | 0 | VFIO_DEVICE_FLAGS_RESET | | +| | +-----+-------------------------+ | +| | | 1 | VFIO_DEVICE_FLAGS_PCI | | +| | +-----+-------------------------+ | ++-------------+--------+--------------------------+ +| num_regions | 8 | 4 | ++-------------+--------+--------------------------+ +| num_irqs | 12 | 4 | ++-------------+--------+--------------------------+ + +* *argsz* is the size required for the full reply payload (16 bytes today) +* *flags* contains the following device attributes. + + * ``VFIO_DEVICE_FLAGS_RESET`` indicates that the device supports the + ``VFIO_USER_DEVICE_RESET`` message. + * ``VFIO_DEVICE_FLAGS_PCI`` indicates that the device is a PCI device. + +* *num_regions* is the number of memory regions that the device exposes. +* *num_irqs* is the number of distinct interrupt types that the device supports. + +This version of the protocol only supports PCI devices. Additional devices may +be supported in future versions. + +``VFIO_USER_DEVICE_GET_REGION_INFO`` +------------------------------------ + +This command message is sent by the client to the server to query for +information about device regions. The VFIO region info structure is defined in +```` (``struct vfio_region_info``). + +Request +^^^^^^^ + ++------------+--------+------------------------------+ +| Name | Offset | Size | ++============+========+==============================+ +| argsz | 0 | 4 | ++------------+--------+------------------------------+ +| flags | 4 | 4 | ++------------+--------+------------------------------+ +| index | 8 | 4 | ++------------+--------+------------------------------+ +| cap_offset | 12 | 4 | ++------------+--------+------------------------------+ +| size | 16 | 8 | ++------------+--------+------------------------------+ +| offset | 24 | 8 | ++------------+--------+------------------------------+ + +* *argsz* the maximum size of the reply payload +* *index* is the index of memory region being queried, it is the only field + that is required to be set in the command message. +* all other fields must be zero. 
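
For illustration, the 32-byte region info payload can be modelled as the
following C structure, which mirrors ``struct vfio_region_info`` from
``<linux/vfio.h>``; the structure name used here is hypothetical::

    #include <stdint.h>

    /* sketch of the VFIO_USER_DEVICE_GET_REGION_INFO payload layout */
    struct vfio_user_region_info {
        uint32_t argsz;      /* offset 0:  maximum reply payload size */
        uint32_t flags;      /* offset 4:  zero in the request        */
        uint32_t index;      /* offset 8:  region being queried       */
        uint32_t cap_offset; /* offset 12: zero in the request        */
        uint64_t size;       /* offset 16: zero in the request        */
        uint64_t offset;     /* offset 24: zero in the request        */
    };

In the reply, described next, the server fills in the remaining fields and may
append a capability chain after the fixed fields.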
+ +Reply +^^^^^ + ++------------+--------+------------------------------+ +| Name | Offset | Size | ++============+========+==============================+ +| argsz | 0 | 4 | ++------------+--------+------------------------------+ +| flags | 4 | 4 | ++------------+--------+------------------------------+ +| | +-----+-----------------------------+ | +| | | Bit | Definition | | +| | +=====+=============================+ | +| | | 0 | VFIO_REGION_INFO_FLAG_READ | | +| | +-----+-----------------------------+ | +| | | 1 | VFIO_REGION_INFO_FLAG_WRITE | | +| | +-----+-----------------------------+ | +| | | 2 | VFIO_REGION_INFO_FLAG_MMAP | | +| | +-----+-----------------------------+ | +| | | 3 | VFIO_REGION_INFO_FLAG_CAPS | | +| | +-----+-----------------------------+ | ++------------+--------+------------------------------+ ++------------+--------+------------------------------+ +| index | 8 | 4 | ++------------+--------+------------------------------+ +| cap_offset | 12 | 4 | ++------------+--------+------------------------------+ +| size | 16 | 8 | ++------------+--------+------------------------------+ +| offset | 24 | 8 | ++------------+--------+------------------------------+ + +* *argsz* is the size required for the full reply payload (region info structure + plus the size of any region capabilities) +* *flags* are attributes of the region: + + * ``VFIO_REGION_INFO_FLAG_READ`` allows client read access to the region. + * ``VFIO_REGION_INFO_FLAG_WRITE`` allows client write access to the region. + * ``VFIO_REGION_INFO_FLAG_MMAP`` specifies the client can mmap() the region. + When this flag is set, the reply will include a file descriptor in its + meta-data. On ``AF_UNIX`` sockets, the file descriptors will be passed as + ``SCM_RIGHTS`` type ancillary data. + * ``VFIO_REGION_INFO_FLAG_CAPS`` indicates additional capabilities found in the + reply. + +* *index* is the index of memory region being queried, it is the only field + that is required to be set in the command message. +* *cap_offset* describes where additional region capabilities can be found. + cap_offset is relative to the beginning of the VFIO region info structure. + The data structure it points is a VFIO cap header defined in + ````. +* *size* is the size of the region. +* *offset* is the offset that should be given to the mmap() system call for + regions with the MMAP attribute. It is also used as the base offset when + mapping a VFIO sparse mmap area, described below. + +VFIO region capabilities +"""""""""""""""""""""""" + +The VFIO region information can also include a capabilities list. This list is +similar to a PCI capability list - each entry has a common header that +identifies a capability and where the next capability in the list can be found. +The VFIO capability header format is defined in ```` (``struct +vfio_info_cap_header``). + +VFIO cap header format +"""""""""""""""""""""" + ++---------+--------+------+ +| Name | Offset | Size | ++=========+========+======+ +| id | 0 | 2 | ++---------+--------+------+ +| version | 2 | 2 | ++---------+--------+------+ +| next | 4 | 4 | ++---------+--------+------+ + +* *id* is the capability identity. +* *version* is a capability-specific version number. +* *next* specifies the offset of the next capability in the capability list. It + is relative to the beginning of the VFIO region info structure. 
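
The following sketch (not part of the specification) shows how a client might
walk such a capability chain in a ``VFIO_USER_DEVICE_GET_REGION_INFO`` reply;
the structure and helper names are illustrative only::

    #include <stddef.h>
    #include <stdint.h>

    /* mirrors the VFIO cap header format described above */
    struct vfio_user_cap_header {
        uint16_t id;      /* capability identity                      */
        uint16_t version; /* capability-specific version              */
        uint32_t next;    /* offset of the next capability, 0 if last */
    };

    /*
     * 'reply' points at the start of the VFIO region info structure;
     * 'cap_offset' and every 'next' value are relative to that start.
     */
    static const struct vfio_user_cap_header *
    find_region_cap(const void *reply, uint32_t cap_offset, uint16_t wanted_id)
    {
        while (cap_offset != 0) {
            const struct vfio_user_cap_header *hdr =
                (const void *)((const uint8_t *)reply + cap_offset);

            if (hdr->id == wanted_id) {
                return hdr;
            }
            cap_offset = hdr->next;
        }
        return NULL; /* capability not present */
    }

A robust client would additionally bounds-check each offset against the
*argsz* value it received before dereferencing it.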
+ +VFIO sparse mmap cap header +""""""""""""""""""""""""""" + ++------------------+----------------------------------+ +| Name | Value | ++==================+==================================+ +| id | VFIO_REGION_INFO_CAP_SPARSE_MMAP | ++------------------+----------------------------------+ +| version | 0x1 | ++------------------+----------------------------------+ +| next | | ++------------------+----------------------------------+ +| sparse mmap info | VFIO region info sparse mmap | ++------------------+----------------------------------+ + +This capability is defined when only a subrange of the region supports +direct access by the client via mmap(). The VFIO sparse mmap area is defined in +```` (``struct vfio_region_sparse_mmap_area`` and ``struct +vfio_region_info_cap_sparse_mmap``). + +VFIO region info cap sparse mmap +"""""""""""""""""""""""""""""""" + ++----------+--------+------+ +| Name | Offset | Size | ++==========+========+======+ +| nr_areas | 0 | 4 | ++----------+--------+------+ +| reserved | 4 | 4 | ++----------+--------+------+ +| offset | 8 | 8 | ++----------+--------+------+ +| size | 16 | 8 | ++----------+--------+------+ +| ... | | | ++----------+--------+------+ + +* *nr_areas* is the number of sparse mmap areas in the region. +* *offset* and size describe a single area that can be mapped by the client. + There will be *nr_areas* pairs of offset and size. The offset will be added to + the base offset given in the ``VFIO_USER_DEVICE_GET_REGION_INFO`` to form the + offset argument of the subsequent mmap() call. + +The VFIO sparse mmap area is defined in ```` (``struct +vfio_region_info_cap_sparse_mmap``). + + +``VFIO_USER_DEVICE_GET_REGION_IO_FDS`` +-------------------------------------- + +Clients can access regions via ``VFIO_USER_REGION_READ/WRITE`` or, if provided, by +``mmap()`` of a file descriptor provided by the server. + +``VFIO_USER_DEVICE_GET_REGION_IO_FDS`` provides an alternative access mechanism via +file descriptors. This is an optional feature intended for performance +improvements where an underlying sub-system (such as KVM) supports communication +across such file descriptors to the vfio-user server, without needing to +round-trip through the client. + +The server returns an array of sub-regions for the requested region. Each +sub-region describes a span (offset and size) of a region, along with the +requested file descriptor notification mechanism to use. Each sub-region in the +response message may choose to use a different method, as defined below. The +two mechanisms supported in this specification are ioeventfds and ioregionfds. + +The server in addition returns a file descriptor in the ancillary data; clients +are expected to configure each sub-region's file descriptor with the requested +notification method. For example, a client could configure KVM with the +requested ioeventfd via a ``KVM_IOEVENTFD`` ``ioctl()``. + +Request +^^^^^^^ + ++-------------+--------+------+ +| Name | Offset | Size | ++=============+========+======+ +| argsz | 0 | 4 | ++-------------+--------+------+ +| flags | 4 | 4 | ++-------------+--------+------+ +| index | 8 | 4 | ++-------------+--------+------+ +| count | 12 | 4 | ++-------------+--------+------+ + +* *argsz* the maximum size of the reply payload +* *index* is the index of memory region being queried +* all other fields must be zero + +The client must set ``flags`` to zero and specify the region being queried in +the ``index``. 
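
For illustration only, the 16-byte request payload can be modelled as the
following C structure (the name is hypothetical)::

    #include <stdint.h>

    /* sketch of the VFIO_USER_DEVICE_GET_REGION_IO_FDS request payload */
    struct vfio_user_region_io_fds_request {
        uint32_t argsz; /* offset 0:  maximum reply payload size */
        uint32_t flags; /* offset 4:  must be zero               */
        uint32_t index; /* offset 8:  region being queried       */
        uint32_t count; /* offset 12: must be zero               */
    };

The reply, described next, reuses the first 16 bytes of this layout and
appends the sub-region array after them.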
+ +Reply +^^^^^ + ++-------------+--------+------+ +| Name | Offset | Size | ++=============+========+======+ +| argsz | 0 | 4 | ++-------------+--------+------+ +| flags | 4 | 4 | ++-------------+--------+------+ +| index | 8 | 4 | ++-------------+--------+------+ +| count | 12 | 4 | ++-------------+--------+------+ +| sub-regions | 16 | ... | ++-------------+--------+------+ + +* *argsz* is the size of the region IO FD info structure plus the + total size of the sub-region array. Thus, each array entry "i" is at offset + i * ((argsz - 32) / count). Note that currently this is 40 bytes for both IO + FD types, but this is not to be relied on. As elsewhere, this indicates the + full reply payload size needed. +* *flags* must be zero +* *index* is the index of memory region being queried +* *count* is the number of sub-regions in the array +* *sub-regions* is the array of Sub-Region IO FD info structures + +The reply message will additionally include at least one file descriptor in the +ancillary data. Note that more than one sub-region may share the same file +descriptor. + +Note that it is the client's responsibility to verify the requested values (for +example, that the requested offset does not exceed the region's bounds). + +Each sub-region given in the response has one of two possible structures, +depending whether *type* is ``VFIO_USER_IO_FD_TYPE_IOEVENTFD`` or +``VFIO_USER_IO_FD_TYPE_IOREGIONFD``: + +Sub-Region IO FD info format (ioeventfd) +"""""""""""""""""""""""""""""""""""""""" + ++-----------+--------+------+ +| Name | Offset | Size | ++===========+========+======+ +| offset | 0 | 8 | ++-----------+--------+------+ +| size | 8 | 8 | ++-----------+--------+------+ +| fd_index | 16 | 4 | ++-----------+--------+------+ +| type | 20 | 4 | ++-----------+--------+------+ +| flags | 24 | 4 | ++-----------+--------+------+ +| padding | 28 | 4 | ++-----------+--------+------+ +| datamatch | 32 | 8 | ++-----------+--------+------+ + +* *offset* is the offset of the start of the sub-region within the region + requested ("physical address offset" for the region) +* *size* is the length of the sub-region. This may be zero if the access size is + not relevant, which may allow for optimizations +* *fd_index* is the index in the ancillary data of the FD to use for ioeventfd + notification; it may be shared. +* *type* is ``VFIO_USER_IO_FD_TYPE_IOEVENTFD`` +* *flags* is any of: + + * ``KVM_IOEVENTFD_FLAG_DATAMATCH`` + * ``KVM_IOEVENTFD_FLAG_PIO`` + * ``KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY`` (FIXME: makes sense?) + +* *datamatch* is the datamatch value if needed + +See https://www.kernel.org/doc/Documentation/virtual/kvm/api.txt, *4.59 +KVM_IOEVENTFD* for further context on the ioeventfd-specific fields. + +Sub-Region IO FD info format (ioregionfd) +""""""""""""""""""""""""""""""""""""""""" + ++-----------+--------+------+ +| Name | Offset | Size | ++===========+========+======+ +| offset | 0 | 8 | ++-----------+--------+------+ +| size | 8 | 8 | ++-----------+--------+------+ +| fd_index | 16 | 4 | ++-----------+--------+------+ +| type | 20 | 4 | ++-----------+--------+------+ +| flags | 24 | 4 | ++-----------+--------+------+ +| padding | 28 | 4 | ++-----------+--------+------+ +| user_data | 32 | 8 | ++-----------+--------+------+ + +* *offset* is the offset of the start of the sub-region within the region + requested ("physical address offset" for the region) +* *size* is the length of the sub-region. 
This may be zero if the access size is + not relevant, which may allow for optimizations; ``KVM_IOREGION_POSTED_WRITES`` + must be set in *flags* in this case +* *fd_index* is the index in the ancillary data of the FD to use for ioregionfd + messages; it may be shared +* *type* is ``VFIO_USER_IO_FD_TYPE_IOREGIONFD`` +* *flags* is any of: + + * ``KVM_IOREGION_PIO`` + * ``KVM_IOREGION_POSTED_WRITES`` + +* *user_data* is an opaque value passed back to the server via a message on the + file descriptor + +For further information on the ioregionfd-specific fields, see: +https://lore.kernel.org/kvm/cover.1613828726.git.eafanasova@gmail.com/ + +(FIXME: update with final API docs.) + +``VFIO_USER_DEVICE_GET_IRQ_INFO`` +--------------------------------- + +This command message is sent by the client to the server to query for +information about device interrupt types. The VFIO IRQ info structure is +defined in ```` (``struct vfio_irq_info``). + +Request +^^^^^^^ + ++-------+--------+---------------------------+ +| Name | Offset | Size | ++=======+========+===========================+ +| argsz | 0 | 4 | ++-------+--------+---------------------------+ +| flags | 4 | 4 | ++-------+--------+---------------------------+ +| | +-----+--------------------------+ | +| | | Bit | Definition | | +| | +=====+==========================+ | +| | | 0 | VFIO_IRQ_INFO_EVENTFD | | +| | +-----+--------------------------+ | +| | | 1 | VFIO_IRQ_INFO_MASKABLE | | +| | +-----+--------------------------+ | +| | | 2 | VFIO_IRQ_INFO_AUTOMASKED | | +| | +-----+--------------------------+ | +| | | 3 | VFIO_IRQ_INFO_NORESIZE | | +| | +-----+--------------------------+ | ++-------+--------+---------------------------+ +| index | 8 | 4 | ++-------+--------+---------------------------+ +| count | 12 | 4 | ++-------+--------+---------------------------+ + +* *argsz* is the maximum size of the reply payload (16 bytes today) +* index is the index of IRQ type being queried (e.g. ``VFIO_PCI_MSIX_IRQ_INDEX``) +* all other fields must be zero + +Reply +^^^^^ + ++-------+--------+---------------------------+ +| Name | Offset | Size | ++=======+========+===========================+ +| argsz | 0 | 4 | ++-------+--------+---------------------------+ +| flags | 4 | 4 | ++-------+--------+---------------------------+ +| | +-----+--------------------------+ | +| | | Bit | Definition | | +| | +=====+==========================+ | +| | | 0 | VFIO_IRQ_INFO_EVENTFD | | +| | +-----+--------------------------+ | +| | | 1 | VFIO_IRQ_INFO_MASKABLE | | +| | +-----+--------------------------+ | +| | | 2 | VFIO_IRQ_INFO_AUTOMASKED | | +| | +-----+--------------------------+ | +| | | 3 | VFIO_IRQ_INFO_NORESIZE | | +| | +-----+--------------------------+ | ++-------+--------+---------------------------+ +| index | 8 | 4 | ++-------+--------+---------------------------+ +| count | 12 | 4 | ++-------+--------+---------------------------+ + +* *argsz* is the size required for the full reply payload (16 bytes today) +* *flags* defines IRQ attributes: + + * ``VFIO_IRQ_INFO_EVENTFD`` indicates the IRQ type can support server eventfd + signalling. + * ``VFIO_IRQ_INFO_MASKABLE`` indicates that the IRQ type supports the ``MASK`` + and ``UNMASK`` actions in a ``VFIO_USER_DEVICE_SET_IRQS`` message. + * ``VFIO_IRQ_INFO_AUTOMASKED`` indicates the IRQ type masks itself after being + triggered, and the client must send an ``UNMASK`` action to receive new + interrupts. 
+ * ``VFIO_IRQ_INFO_NORESIZE`` indicates ``VFIO_USER_SET_IRQS`` operations setup + interrupts as a set, and new sub-indexes cannot be enabled without disabling + the entire type. +* index is the index of IRQ type being queried +* count describes the number of interrupts of the queried type. + +``VFIO_USER_DEVICE_SET_IRQS`` +----------------------------- + +This command message is sent by the client to the server to set actions for +device interrupt types. The VFIO IRQ set structure is defined in +```` (``struct vfio_irq_set``). + +Request +^^^^^^^ + ++-------+--------+------------------------------+ +| Name | Offset | Size | ++=======+========+==============================+ +| argsz | 0 | 4 | ++-------+--------+------------------------------+ +| flags | 4 | 4 | ++-------+--------+------------------------------+ +| | +-----+-----------------------------+ | +| | | Bit | Definition | | +| | +=====+=============================+ | +| | | 0 | VFIO_IRQ_SET_DATA_NONE | | +| | +-----+-----------------------------+ | +| | | 1 | VFIO_IRQ_SET_DATA_BOOL | | +| | +-----+-----------------------------+ | +| | | 2 | VFIO_IRQ_SET_DATA_EVENTFD | | +| | +-----+-----------------------------+ | +| | | 3 | VFIO_IRQ_SET_ACTION_MASK | | +| | +-----+-----------------------------+ | +| | | 4 | VFIO_IRQ_SET_ACTION_UNMASK | | +| | +-----+-----------------------------+ | +| | | 5 | VFIO_IRQ_SET_ACTION_TRIGGER | | +| | +-----+-----------------------------+ | ++-------+--------+------------------------------+ +| index | 8 | 4 | ++-------+--------+------------------------------+ +| start | 12 | 4 | ++-------+--------+------------------------------+ +| count | 16 | 4 | ++-------+--------+------------------------------+ +| data | 20 | variable | ++-------+--------+------------------------------+ + +* *argsz* is the size of the VFIO IRQ set request payload, including any *data* + field. Note there is no reply payload, so this field differs from other + message types. +* *flags* defines the action performed on the interrupt range. The ``DATA`` + flags describe the data field sent in the message; the ``ACTION`` flags + describe the action to be performed. The flags are mutually exclusive for + both sets. + + * ``VFIO_IRQ_SET_DATA_NONE`` indicates there is no data field in the command. + The action is performed unconditionally. + * ``VFIO_IRQ_SET_DATA_BOOL`` indicates the data field is an array of boolean + bytes. The action is performed if the corresponding boolean is true. + * ``VFIO_IRQ_SET_DATA_EVENTFD`` indicates an array of event file descriptors + was sent in the message meta-data. These descriptors will be signalled when + the action defined by the action flags occurs. In ``AF_UNIX`` sockets, the + descriptors are sent as ``SCM_RIGHTS`` type ancillary data. + If no file descriptors are provided, this de-assigns the specified + previously configured interrupts. + * ``VFIO_IRQ_SET_ACTION_MASK`` indicates a masking event. It can be used with + ``VFIO_IRQ_SET_DATA_BOOL`` or ``VFIO_IRQ_SET_DATA_NONE`` to mask an interrupt, + or with ``VFIO_IRQ_SET_DATA_EVENTFD`` to generate an event when the guest masks + the interrupt. + * ``VFIO_IRQ_SET_ACTION_UNMASK`` indicates an unmasking event. It can be used + with ``VFIO_IRQ_SET_DATA_BOOL`` or ``VFIO_IRQ_SET_DATA_NONE`` to unmask an + interrupt, or with ``VFIO_IRQ_SET_DATA_EVENTFD`` to generate an event when the + guest unmasks the interrupt. + * ``VFIO_IRQ_SET_ACTION_TRIGGER`` indicates a triggering event. 
It can be used + with ``VFIO_IRQ_SET_DATA_BOOL`` or ``VFIO_IRQ_SET_DATA_NONE`` to trigger an + interrupt, or with ``VFIO_IRQ_SET_DATA_EVENTFD`` to generate an event when the + server triggers the interrupt. + +* *index* is the index of IRQ type being setup. +* *start* is the start of the sub-index being set. +* *count* describes the number of sub-indexes being set. As a special case, a + count (and start) of 0, with data flags of ``VFIO_IRQ_SET_DATA_NONE`` disables + all interrupts of the index. +* *data* is an optional field included when the + ``VFIO_IRQ_SET_DATA_BOOL`` flag is present. It contains an array of booleans + that specify whether the action is to be performed on the corresponding + index. It's used when the action is only performed on a subset of the range + specified. + +Not all interrupt types support every combination of data and action flags. +The client must know the capabilities of the device and IRQ index before it +sends a ``VFIO_USER_DEVICE_SET_IRQ`` message. + +In typical operation, a specific IRQ may operate as follows: + +1. The client sends a ``VFIO_USER_DEVICE_SET_IRQ`` message with + ``flags=(VFIO_IRQ_SET_DATA_EVENTFD|VFIO_IRQ_SET_ACTION_TRIGGER)`` along + with an eventfd. This associates the IRQ with a particular eventfd on the + server side. + +#. The client may send a ``VFIO_USER_DEVICE_SET_IRQ`` message with + ``flags=(VFIO_IRQ_SET_DATA_EVENTFD|VFIO_IRQ_SET_ACTION_MASK/UNMASK)`` along + with another eventfd. This associates the given eventfd with the + mask/unmask state on the server side. + +#. The server may trigger the IRQ by writing 1 to the eventfd. + +#. The server may mask/unmask an IRQ which will write 1 to the corresponding + mask/unmask eventfd, if there is one. + +5. A client may trigger a device IRQ itself, by sending a + ``VFIO_USER_DEVICE_SET_IRQ`` message with + ``flags=(VFIO_IRQ_SET_DATA_NONE/BOOL|VFIO_IRQ_SET_ACTION_TRIGGER)``. + +6. A client may mask or unmask the IRQ, by sending a + ``VFIO_USER_DEVICE_SET_IRQ`` message with + ``flags=(VFIO_IRQ_SET_DATA_NONE/BOOL|VFIO_IRQ_SET_ACTION_MASK/UNMASK)``. + +Reply +^^^^^ + +There is no payload in the reply. + +.. _Read and Write Operations: + +Note that all of these operations must be supported by the client and/or server, +even if the corresponding memory or device region has been shared as mappable. + +The ``count`` field must not exceed the value of ``max_data_xfer_size`` of the +peer, for both reads and writes. + +``VFIO_USER_REGION_READ`` +------------------------- + +If a device region is not mappable, it's not directly accessible by the client +via ``mmap()`` of the underlying file descriptor. In this case, a client can +read from a device region with this message. + +Request +^^^^^^^ + ++--------+--------+----------+ +| Name | Offset | Size | ++========+========+==========+ +| offset | 0 | 8 | ++--------+--------+----------+ +| region | 8 | 4 | ++--------+--------+----------+ +| count | 12 | 4 | ++--------+--------+----------+ + +* *offset* into the region being accessed. +* *region* is the index of the region being accessed. +* *count* is the size of the data to be transferred. + +Reply +^^^^^ + ++--------+--------+----------+ +| Name | Offset | Size | ++========+========+==========+ +| offset | 0 | 8 | ++--------+--------+----------+ +| region | 8 | 4 | ++--------+--------+----------+ +| count | 12 | 4 | ++--------+--------+----------+ +| data | 16 | variable | ++--------+--------+----------+ + +* *offset* into the region accessed. +* *region* is the index of the region accessed. 
+* *count* is the size of the data transferred. +* *data* is the data that was read from the device region. + +``VFIO_USER_REGION_WRITE`` +-------------------------- + +If a device region is not mappable, it's not directly accessible by the client +via mmap() of the underlying fd. In this case, a client can write to a device +region with this message. + +Request +^^^^^^^ + ++--------+--------+----------+ +| Name | Offset | Size | ++========+========+==========+ +| offset | 0 | 8 | ++--------+--------+----------+ +| region | 8 | 4 | ++--------+--------+----------+ +| count | 12 | 4 | ++--------+--------+----------+ +| data | 16 | variable | ++--------+--------+----------+ + +* *offset* into the region being accessed. +* *region* is the index of the region being accessed. +* *count* is the size of the data to be transferred. +* *data* is the data to write + +Reply +^^^^^ + ++--------+--------+----------+ +| Name | Offset | Size | ++========+========+==========+ +| offset | 0 | 8 | ++--------+--------+----------+ +| region | 8 | 4 | ++--------+--------+----------+ +| count | 12 | 4 | ++--------+--------+----------+ + +* *offset* into the region accessed. +* *region* is the index of the region accessed. +* *count* is the size of the data transferred. + +``VFIO_USER_DMA_READ`` +----------------------- + +If the client has not shared mappable memory, the server can use this message to +read from guest memory. + +Request +^^^^^^^ + ++---------+--------+----------+ +| Name | Offset | Size | ++=========+========+==========+ +| address | 0 | 8 | ++---------+--------+----------+ +| count | 8 | 8 | ++---------+--------+----------+ + +* *address* is the client DMA memory address being accessed. This address must have + been previously exported to the server with a ``VFIO_USER_DMA_MAP`` message. +* *count* is the size of the data to be transferred. + +Reply +^^^^^ + ++---------+--------+----------+ +| Name | Offset | Size | ++=========+========+==========+ +| address | 0 | 8 | ++---------+--------+----------+ +| count | 8 | 8 | ++---------+--------+----------+ +| data | 16 | variable | ++---------+--------+----------+ + +* *address* is the client DMA memory address being accessed. +* *count* is the size of the data transferred. +* *data* is the data read. + +``VFIO_USER_DMA_WRITE`` +----------------------- + +If the client has not shared mappable memory, the server can use this message to +write to guest memory. + +Request +^^^^^^^ + ++---------+--------+----------+ +| Name | Offset | Size | ++=========+========+==========+ +| address | 0 | 8 | ++---------+--------+----------+ +| count | 8 | 8 | ++---------+--------+----------+ +| data | 16 | variable | ++---------+--------+----------+ + +* *address* is the client DMA memory address being accessed. This address must have + been previously exported to the server with a ``VFIO_USER_DMA_MAP`` message. +* *count* is the size of the data to be transferred. +* *data* is the data to write + +Reply +^^^^^ + ++---------+--------+----------+ +| Name | Offset | Size | ++=========+========+==========+ +| address | 0 | 8 | ++---------+--------+----------+ +| count | 8 | 4 | ++---------+--------+----------+ + +* *address* is the client DMA memory address being accessed. +* *count* is the size of the data transferred. + +``VFIO_USER_DEVICE_RESET`` +-------------------------- + +This command message is sent from the client to the server to reset the device. +Neither the request or reply have a payload. 
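
As a rough, non-normative illustration of the ``VFIO_USER_DMA_READ`` and
``VFIO_USER_DMA_WRITE`` payloads described above (structure and helper names
are hypothetical)::

    #include <stdint.h>

    /*
     * Fixed part of a DMA read/write payload; 'count' bytes of data follow
     * these fields in a VFIO_USER_DMA_WRITE request and in a
     * VFIO_USER_DMA_READ reply.
     */
    struct vfio_user_dma_rw {
        uint64_t address; /* client DMA address, previously made available
                             with VFIO_USER_DMA_MAP */
        uint64_t count;   /* bytes to transfer; must not exceed the peer's
                             max_data_xfer_size */
    };

    /* sketch: total payload size of a DMA write carrying 'count' bytes */
    static inline uint64_t vfio_user_dma_write_payload_size(uint64_t count)
    {
        return sizeof(struct vfio_user_dma_rw) + count;
    }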
+ +``VFIO_USER_REGION_WRITE_MULTI`` +-------------------------------- + +This message can be used to coalesce multiple device write operations +into a single messgage. It is only used as an optimization when the +outgoing message queue is relatively full. + +Request +^^^^^^^ + ++---------+--------+----------+ +| Name | Offset | Size | ++=========+========+==========+ +| wr_cnt | 0 | 8 | ++---------+--------+----------+ +| wrs | 8 | variable | ++---------+--------+----------+ + +* *wr_cnt* is the number of device writes coalesced in the message +* *wrs* is an array of device writes defined below + +Single Device Write Format +"""""""""""""""""""""""""" + ++--------+--------+----------+ +| Name | Offset | Size | ++========+========+==========+ +| offset | 0 | 8 | ++--------+--------+----------+ +| region | 8 | 4 | ++--------+--------+----------+ +| count | 12 | 4 | ++--------+--------+----------+ +| data | 16 | 8 | ++--------+--------+----------+ + +* *offset* into the region being accessed. +* *region* is the index of the region being accessed. +* *count* is the size of the data to be transferred. This format can + only describe writes of 8 bytes or less. +* *data* is the data to write. + +Reply +^^^^^ + ++---------+--------+----------+ +| Name | Offset | Size | ++=========+========+==========+ +| wr_cnt | 0 | 8 | ++---------+--------+----------+ + +* *wr_cnt* is the number of device writes completed. + + +Appendices +========== + +Unused VFIO ``ioctl()`` commands +-------------------------------- + +The following VFIO commands do not have an equivalent vfio-user command: + +* ``VFIO_GET_API_VERSION`` +* ``VFIO_CHECK_EXTENSION`` +* ``VFIO_SET_IOMMU`` +* ``VFIO_GROUP_GET_STATUS`` +* ``VFIO_GROUP_SET_CONTAINER`` +* ``VFIO_GROUP_UNSET_CONTAINER`` +* ``VFIO_GROUP_GET_DEVICE_FD`` +* ``VFIO_IOMMU_GET_INFO`` + +However, once support for live migration for VFIO devices is finalized some +of the above commands may have to be handled by the client in their +corresponding vfio-user form. This will be addressed in a future protocol +version. + +VFIO groups and containers +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The current VFIO implementation includes group and container idioms that +describe how a device relates to the host IOMMU. In the vfio-user +implementation, the IOMMU is implemented in SW by the client, and is not +visible to the server. The simplest idea would be that the client put each +device into its own group and container. + +Backend Program Conventions +--------------------------- + +vfio-user backend program conventions are based on the vhost-user ones. + +* The backend program must not daemonize itself. +* No assumptions must be made as to what access the backend program has on the + system. +* File descriptors 0, 1 and 2 must exist, must have regular + stdin/stdout/stderr semantics, and can be redirected. +* The backend program must honor the SIGTERM signal. +* The backend program must accept the following commands line options: + + * ``--socket-path=PATH``: path to UNIX domain socket, + * ``--fd=FDNUM``: file descriptor for UNIX domain socket, incompatible with + ``--socket-path`` +* The backend program must be accompanied with a JSON file stored under + ``/usr/share/vfio-user``. + +TODO add schema similar to docs/interop/vhost-user.json. 
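
The following is a minimal, non-normative sketch of the command line handling
a backend program needs to satisfy the conventions above (option parsing and
SIGTERM handling only); all names and messages are illustrative, and the
sketch additionally assumes that exactly one of the two socket options is
required::

    #include <getopt.h>
    #include <signal.h>
    #include <stdio.h>
    #include <stdlib.h>

    static volatile sig_atomic_t terminated;

    static void handle_sigterm(int sig)
    {
        (void)sig;
        terminated = 1; /* the main loop should notice this and exit cleanly */
    }

    int main(int argc, char **argv)
    {
        const char *socket_path = NULL;
        int sock_fd = -1;
        static const struct option opts[] = {
            { "socket-path", required_argument, NULL, 's' },
            { "fd",          required_argument, NULL, 'f' },
            { NULL, 0, NULL, 0 },
        };
        int c;

        while ((c = getopt_long(argc, argv, "", opts, NULL)) != -1) {
            switch (c) {
            case 's':
                socket_path = optarg;
                break;
            case 'f':
                sock_fd = atoi(optarg);
                break;
            default:
                return EXIT_FAILURE;
            }
        }

        /* --socket-path and --fd are incompatible; this sketch requires one */
        if ((socket_path != NULL) == (sock_fd >= 0)) {
            fprintf(stderr, "need exactly one of --socket-path or --fd\n");
            return EXIT_FAILURE;
        }

        signal(SIGTERM, handle_sigterm);

        /* ... create or adopt the listening socket and serve vfio-user ... */

        return EXIT_SUCCESS;
    }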
From patchwork Wed May 29 16:23:03 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-Patchwork-Submitter: John Levon X-Patchwork-Id: 13679233 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from lists.gnu.org (lists.gnu.org [209.51.188.17]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.lore.kernel.org (Postfix) with ESMTPS id B0ADEC25B7C for ; Wed, 29 May 2024 16:27:51 +0000 (UTC) Received: from localhost ([::1] helo=lists1p.gnu.org) by lists.gnu.org with esmtp (Exim 4.90_1) (envelope-from ) id 1sCM7R-0001VY-5p; Wed, 29 May 2024 12:25:41 -0400 Received: from eggs.gnu.org ([2001:470:142:3::10]) by lists.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM6y-0001BW-O8 for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:14 -0400 Received: from ssh.movementarian.org ([139.162.205.133] helo=movementarian.org) by eggs.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM6v-0006Kd-CB for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:12 -0400 Received: from movement by movementarian.org with local (Exim 4.95) (envelope-from ) id 1sCM6j-006COz-5O; Wed, 29 May 2024 17:24:57 +0100 From: John Levon To: qemu-devel@nongnu.org Cc: alex.williamson@redhat.com, clg@redhat.com, jag.raman@oracle.com, thanos.makatos@nutanix.com, John Johnson , Elena Ufimtseva , John Levon Subject: [PATCH 10/26] vfio-user: add vfio-user class and container Date: Wed, 29 May 2024 17:23:03 +0100 Message-Id: <20240529162319.1476680-11-levon@movementarian.org> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20240529162319.1476680-1-levon@movementarian.org> References: <20240529162319.1476680-1-levon@movementarian.org> MIME-Version: 1.0 Received-SPF: pass client-ip=139.162.205.133; envelope-from=movement@movementarian.org; helo=movementarian.org X-Spam_score_int: -18 X-Spam_score: -1.9 X-Spam_bar: - X-Spam_report: (-1.9 / 5.0 requ) BAYES_00=-1.9, SPF_HELO_PASS=-0.001, SPF_PASS=-0.001, T_SCC_BODY_TEXT_LINE=-0.01 autolearn=ham autolearn_force=no X-Spam_action: no action X-BeenThere: qemu-devel@nongnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org Sender: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org From: Jagannathan Raman Add a new class for vfio-user with its class and instance constructors and destructors, and its pci ops. Introduce VFIOUserContainer for handling container operations for such classes. 
Originally-by: John Johnson Signed-off-by: Elena Ufimtseva Signed-off-by: Jagannathan Raman Signed-off-by: John Levon --- MAINTAINERS | 1 + hw/vfio/container.c | 2 +- hw/vfio/meson.build | 5 + hw/vfio/pci.c | 12 +- hw/vfio/pci.h | 7 + hw/vfio/user-container.c | 210 ++++++++++++++++++++++++++ hw/vfio/user-pci.c | 155 +++++++++++++++++++ include/hw/vfio/vfio-common.h | 8 + include/hw/vfio/vfio-container-base.h | 1 + meson_options.txt | 2 + scripts/meson-buildoptions.sh | 4 + 11 files changed, 400 insertions(+), 7 deletions(-) create mode 100644 hw/vfio/user-container.c create mode 100644 hw/vfio/user-pci.c diff --git a/MAINTAINERS b/MAINTAINERS index 8744b2d76e..4ca266bb31 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4087,6 +4087,7 @@ F: hw/remote/vfio-user-obj.c F: include/hw/remote/vfio-user-obj.h F: hw/remote/iommu.c F: include/hw/remote/iommu.h +F: hw/vfio/user* EBPF: M: Jason Wang diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 6c70f95f04..4ff09e277f 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -889,7 +889,7 @@ static bool vfio_get_device(VFIOGroup *group, const char *name, return true; } -static void vfio_put_base_device(VFIODevice *vbasedev) +void vfio_put_base_device(VFIODevice *vbasedev) { if (vbasedev->regions != NULL) { int i; diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build index bba776f75c..f897c5b81a 100644 --- a/hw/vfio/meson.build +++ b/hw/vfio/meson.build @@ -16,6 +16,11 @@ vfio_ss.add(when: 'CONFIG_VFIO_PCI', if_true: files( 'pci-quirks.c', 'pci.c', )) + +if get_option('vfio_user_client').enabled() + vfio_ss.add(files('user-container.c', 'user-pci.c')) +endif + vfio_ss.add(when: 'CONFIG_VFIO_CCW', if_true: files('ccw.c')) vfio_ss.add(when: 'CONFIG_VFIO_PLATFORM', if_true: files('platform.c')) vfio_ss.add(when: 'CONFIG_VFIO_XGMAC', if_true: files('calxeda-xgmac.c')) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 2ad0dbe342..2e334c0c38 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -109,7 +109,7 @@ static void vfio_intx_interrupt(void *opaque) } } -static void vfio_intx_eoi(VFIODevice *vbasedev) +void vfio_intx_eoi(VFIODevice *vbasedev) { VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); @@ -2585,7 +2585,7 @@ static void vfio_pci_compute_needs_reset(VFIODevice *vbasedev) } } -static Object *vfio_pci_get_object(VFIODevice *vbasedev) +Object *vfio_pci_get_object(VFIODevice *vbasedev) { VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); @@ -2641,7 +2641,7 @@ static const VMStateDescription vmstate_vfio_pci_config = { } }; -static int vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f, Error **errp) +int vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f, Error **errp) { VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); @@ -2649,7 +2649,7 @@ static int vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f, Error **errp) errp); } -static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f) +int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f) { VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); PCIDevice *pdev = &vdev->pdev; @@ -2845,7 +2845,7 @@ static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) return true; } -static void vfio_pci_put_device(VFIOPCIDevice *vdev) +void vfio_pci_put_device(VFIOPCIDevice *vdev) { vfio_detach_device(&vdev->vbasedev); @@ -3359,7 +3359,7 @@ post_reset: vfio_pci_post_reset(vdev); } -static void vfio_instance_init(Object *obj) +void vfio_instance_init(Object *obj) { PCIDevice *pci_dev = 
PCI_DEVICE(obj); VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj); diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index 770448155e..040f4995b5 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -213,6 +213,13 @@ uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len); void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr, uint32_t val, int len); +void vfio_intx_eoi(VFIODevice *vbasedev); +Object *vfio_pci_get_object(VFIODevice *vbasedev); +int vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f, Error **errp); +int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f); +void vfio_pci_put_device(VFIOPCIDevice *vdev); +void vfio_instance_init(Object *obj); + uint64_t vfio_vga_read(void *opaque, hwaddr addr, unsigned size); void vfio_vga_write(void *opaque, hwaddr addr, uint64_t data, unsigned size); diff --git a/hw/vfio/user-container.c b/hw/vfio/user-container.c new file mode 100644 index 0000000000..1fd42e9bfd --- /dev/null +++ b/hw/vfio/user-container.c @@ -0,0 +1,210 @@ +/* + * Container for vfio-user IOMMU type: rather than communicating with the kernel + * vfio driver, we communicate over a socket to a server using the vfio-user + * protocol. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include +#include + +#include "hw/vfio/vfio-common.h" +#include "exec/address-spaces.h" +#include "exec/memory.h" +#include "exec/ram_addr.h" +#include "hw/hw.h" +#include "qemu/error-report.h" +#include "qemu/range.h" +#include "sysemu/reset.h" +#include "trace.h" +#include "qapi/error.h" +#include "pci.h" + +static int vfio_user_dma_unmap(const VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb, int flags) +{ + return -ENOTSUP; +} + +static int vfio_user_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova, + ram_addr_t size, void *vaddr, bool readonly, + MemoryRegion *mrp) +{ + return -ENOTSUP; +} + +static int +vfio_user_set_dirty_page_tracking(const VFIOContainerBase *bcontainer, + bool start, Error **errp) +{ + error_setg_errno(errp, ENOTSUP, "Not supported"); + return -ENOTSUP; +} + +static int vfio_user_query_dirty_bitmap(const VFIOContainerBase *bcontainer, + VFIOBitmap *vbmap, hwaddr iova, + hwaddr size, Error **errp) +{ + error_setg_errno(errp, ENOTSUP, "Not supported"); + return -ENOTSUP; +} + +static bool vfio_user_setup(VFIOContainerBase *bcontainer, Error **errp) +{ + error_setg_errno(errp, ENOTSUP, "Not supported"); + return -ENOTSUP; +} + +/* + * Try to mirror vfio_connect_container() as much as possible. 
+ */ +static VFIOUserContainer * +vfio_connect_user_container(AddressSpace *as, Error **errp) +{ + VFIOAddressSpace *space; + VFIOUserContainer *container; + VFIOContainerBase *bcontainer; + + space = vfio_get_address_space(as); + + container = g_malloc0(sizeof(*container)); + + bcontainer = &container->bcontainer; + + if (!vfio_cpr_register_container(bcontainer, errp)) { + goto free_container_exit; + } + + assert(bcontainer->ops->setup); + + if (!bcontainer->ops->setup(bcontainer, errp)) { + goto unregister_container_exit; + } + + QLIST_INSERT_HEAD(&space->containers, bcontainer, next); + + bcontainer->listener = vfio_memory_listener; + memory_listener_register(&bcontainer->listener, bcontainer->space->as); + + if (bcontainer->error) { + errno = EINVAL; + error_propagate_prepend(errp, bcontainer->error, + "memory listener initialization failed: "); + goto listener_release_exit; + } + + bcontainer->initialized = true; + + return container; + +listener_release_exit: + QLIST_REMOVE(bcontainer, next); + memory_listener_unregister(&bcontainer->listener); + if (bcontainer->ops->release) { + bcontainer->ops->release(bcontainer); + } + +unregister_container_exit: + vfio_cpr_unregister_container(bcontainer); + +free_container_exit: + g_free(container); + + vfio_put_address_space(space); + + return NULL; +} + +static void vfio_disconnect_user_container(VFIOUserContainer *container) +{ + VFIOContainerBase *bcontainer = &container->bcontainer; + + memory_listener_unregister(&bcontainer->listener); + if (bcontainer->ops->release) { + bcontainer->ops->release(bcontainer); + } + + VFIOAddressSpace *space = bcontainer->space; + + vfio_container_destroy(bcontainer); + + vfio_cpr_unregister_container(bcontainer); + g_free(container); + + vfio_put_address_space(space); +} + +static bool vfio_user_get_device(VFIOUserContainer *container, + VFIODevice *vbasedev, Error **errp) +{ + struct vfio_device_info info = { 0 }; + + vbasedev->fd = -1; + + vfio_prepare_device(vbasedev, &container->bcontainer, NULL, &info); + + return true; +} + +/* + * vfio_user_attach_device: attach a device to a new container. + */ +static bool vfio_user_attach_device(const char *name, VFIODevice *vbasedev, + AddressSpace *as, Error **errp) +{ + VFIOUserContainer *container; + + container = vfio_connect_user_container(as, errp); + if (container == NULL) { + error_prepend(errp, "failed to connect proxy"); + return false; + } + + return vfio_user_get_device(container, vbasedev, errp); +} + +static void vfio_user_detach_device(VFIODevice *vbasedev) +{ + VFIOUserContainer *container = container_of(vbasedev->bcontainer, + VFIOUserContainer, bcontainer); + + QLIST_REMOVE(vbasedev, global_next); + QLIST_REMOVE(vbasedev, container_next); + vbasedev->bcontainer = NULL; + vfio_put_base_device(vbasedev); + vfio_disconnect_user_container(container); +} + +static int vfio_user_pci_hot_reset(VFIODevice *vbasedev, bool single) +{ + /* ->needs_reset is always false for vfio-user. 
*/ + return 0; +} + +static void vfio_iommu_user_class_init(ObjectClass *klass, void *data) +{ + VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass); + + vioc->setup = vfio_user_setup; + vioc->dma_map = vfio_user_dma_map; + vioc->dma_unmap = vfio_user_dma_unmap; + vioc->attach_device = vfio_user_attach_device; + vioc->detach_device = vfio_user_detach_device; + vioc->set_dirty_page_tracking = vfio_user_set_dirty_page_tracking; + vioc->query_dirty_bitmap = vfio_user_query_dirty_bitmap; + vioc->pci_hot_reset = vfio_user_pci_hot_reset; +}; + +static const TypeInfo types[] = { + { + .name = TYPE_VFIO_IOMMU_USER, + .parent = TYPE_VFIO_IOMMU, + .class_init = vfio_iommu_user_class_init, + }, +}; + +DEFINE_TYPES(types) diff --git a/hw/vfio/user-pci.c b/hw/vfio/user-pci.c new file mode 100644 index 0000000000..fd0e534f0f --- /dev/null +++ b/hw/vfio/user-pci.c @@ -0,0 +1,155 @@ +/* + * vfio PCI device over a UNIX socket. + * + * Copyright © 2018, 2021 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" +#include +#include + +#include "hw/hw.h" +#include "hw/pci/msi.h" +#include "hw/pci/msix.h" +#include "hw/pci/pci_bridge.h" +#include "hw/qdev-properties.h" +#include "hw/qdev-properties-system.h" +#include "migration/vmstate.h" +#include "qapi/qmp/qdict.h" +#include "qemu/error-report.h" +#include "qemu/main-loop.h" +#include "qemu/module.h" +#include "qemu/range.h" +#include "qemu/units.h" +#include "sysemu/kvm.h" +#include "sysemu/runstate.h" +#include "pci.h" +#include "trace.h" +#include "qapi/error.h" +#include "migration/blocker.h" +#include "migration/qemu-file.h" + +#define TYPE_VFIO_USER_PCI "vfio-user-pci" +OBJECT_DECLARE_SIMPLE_TYPE(VFIOUserPCIDevice, VFIO_USER_PCI) + +struct VFIOUserPCIDevice { + VFIOPCIDevice device; + char *sock_name; +}; + +/* + * Emulated devices don't use host hot reset + */ +static void vfio_user_compute_needs_reset(VFIODevice *vbasedev) +{ + vbasedev->needs_reset = false; +} + +static VFIODeviceOps vfio_user_pci_ops = { + .vfio_compute_needs_reset = vfio_user_compute_needs_reset, + .vfio_eoi = vfio_intx_eoi, + .vfio_get_object = vfio_pci_get_object, + .vfio_save_config = vfio_pci_save_config, + .vfio_load_config = vfio_pci_load_config, +}; + +static void vfio_user_pci_realize(PCIDevice *pdev, Error **errp) +{ + ERRP_GUARD(); + VFIOUserPCIDevice *udev = VFIO_USER_PCI(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); + VFIODevice *vbasedev = &vdev->vbasedev; + AddressSpace *as; + + /* + * TODO: make option parser understand SocketAddress + * and use that instead of having scalar options + * for each socket type. 
+ */ + if (!udev->sock_name) { + error_setg(errp, "No socket specified"); + error_append_hint(errp, "Use -device vfio-user-pci,socket=\n"); + return; + } + + vbasedev->name = g_strdup_printf("VFIO user <%s>", udev->sock_name); + vbasedev->ops = &vfio_user_pci_ops; + vbasedev->type = VFIO_DEVICE_TYPE_PCI; + vbasedev->dev = DEVICE(vdev); + + as = pci_device_iommu_address_space(pdev); + if (!vfio_attach_device_by_iommu_type(TYPE_VFIO_IOMMU_USER, + vbasedev->name, vbasedev, + as, errp)) { + error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->name); + return; + } +} + +static void vfio_user_instance_init(Object *obj) +{ + PCIDevice *pci_dev = PCI_DEVICE(obj); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj); + VFIODevice *vbasedev = &vdev->vbasedev; + + device_add_bootindex_property(obj, &vdev->bootindex, + "bootindex", NULL, + &pci_dev->qdev); + vdev->host.domain = ~0U; + vdev->host.bus = ~0U; + vdev->host.slot = ~0U; + vdev->host.function = ~0U; + + vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_PCI, &vfio_user_pci_ops, + &vfio_dev_io_ioctl, DEVICE(vdev), false); + + vdev->nv_gpudirect_clique = 0xFF; + + /* + * QEMU_PCI_CAP_EXPRESS initialization does not depend on QEMU command + * line, therefore, no need to wait to realize like other devices. + */ + pci_dev->cap_present |= QEMU_PCI_CAP_EXPRESS; +} + +static void vfio_user_instance_finalize(Object *obj) +{ + VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj); + + vfio_pci_put_device(vdev); +} + +static Property vfio_user_pci_dev_properties[] = { + DEFINE_PROP_STRING("socket", VFIOUserPCIDevice, sock_name), + DEFINE_PROP_END_OF_LIST(), +}; + +static void vfio_user_pci_dev_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass); + + device_class_set_props(dc, vfio_user_pci_dev_properties); + dc->desc = "VFIO over socket PCI device assignment"; + pdc->realize = vfio_user_pci_realize; +} + +static const TypeInfo vfio_user_pci_dev_info = { + .name = TYPE_VFIO_USER_PCI, + .parent = TYPE_VFIO_PCI_BASE, + .instance_size = sizeof(VFIOUserPCIDevice), + .class_init = vfio_user_pci_dev_class_init, + .instance_init = vfio_user_instance_init, + .instance_finalize = vfio_user_instance_finalize, +}; + +static void register_vfio_user_dev_type(void) +{ + type_register_static(&vfio_user_pci_dev_info); +} + +type_init(register_vfio_user_dev_type) diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 689cafe28b..c12fd1e2f0 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -75,6 +75,7 @@ typedef struct VFIOMigration { struct VFIOGroup; +/* MMU container sub-class for legacy vfio implementation. */ typedef struct VFIOContainer { VFIOContainerBase bcontainer; int fd; /* /dev/vfio/vfio, empowered by the attached groups */ @@ -83,6 +84,11 @@ typedef struct VFIOContainer { QLIST_HEAD(, VFIOGroup) group_list; } VFIOContainer; +/* MMU container sub-class for vfio-user. */ +typedef struct VFIOUserContainer { + VFIOContainerBase bcontainer; +} VFIOUserContainer; + typedef struct VFIOHostDMAWindow { hwaddr min_iova; hwaddr max_iova; @@ -92,6 +98,7 @@ typedef struct VFIOHostDMAWindow { typedef struct IOMMUFDBackend IOMMUFDBackend; +/* MMU container sub-class for vfio iommufd implementation. 
*/ typedef struct VFIOIOMMUFDContainer { VFIOContainerBase bcontainer; IOMMUFDBackend *be; @@ -257,6 +264,7 @@ bool vfio_attach_device_by_iommu_type(const char *iommu_type, char *name, VFIODevice *vbasedev, AddressSpace *as, Error **errp); void vfio_detach_device(VFIODevice *vbasedev); +void vfio_put_base_device(VFIODevice *vbasedev); int vfio_kvm_device_add_fd(int fd, Error **errp); int vfio_kvm_device_del_fd(int fd, Error **errp); diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index c2c1af70ac..82472e4b3c 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -96,6 +96,7 @@ void vfio_container_destroy(VFIOContainerBase *bcontainer); #define TYPE_VFIO_IOMMU_LEGACY TYPE_VFIO_IOMMU "-legacy" #define TYPE_VFIO_IOMMU_SPAPR TYPE_VFIO_IOMMU "-spapr" #define TYPE_VFIO_IOMMU_IOMMUFD TYPE_VFIO_IOMMU "-iommufd" +#define TYPE_VFIO_IOMMU_USER TYPE_VFIO_IOMMU "-user" /* * VFIOContainerBase is not an abstract QOM object because it felt diff --git a/meson_options.txt b/meson_options.txt index 4c1583eb40..239c69b2f4 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -105,6 +105,8 @@ option('multiprocess', type: 'feature', value: 'auto', description: 'Out of process device emulation support') option('relocatable', type : 'boolean', value : true, description: 'toggle relocatable install') +option('vfio_user_client', type: 'feature', value: 'disabled', + description: 'vfio-user client support') option('vfio_user_server', type: 'feature', value: 'disabled', description: 'vfio-user server support') option('dbus_display', type: 'feature', value: 'auto', diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh index 6ce5a8b72a..51be621296 100644 --- a/scripts/meson-buildoptions.sh +++ b/scripts/meson-buildoptions.sh @@ -191,6 +191,8 @@ meson_options_help() { printf "%s\n" ' vdi vdi image format support' printf "%s\n" ' vduse-blk-export' printf "%s\n" ' VDUSE block export support' + printf "%s\n" ' vfio-user-client' + printf "%s\n" ' vfio-user client support' printf "%s\n" ' vfio-user-server' printf "%s\n" ' vfio-user server support' printf "%s\n" ' vhdx vhdx image format support' @@ -508,6 +510,8 @@ _meson_option_parse() { --disable-vdi) printf "%s" -Dvdi=disabled ;; --enable-vduse-blk-export) printf "%s" -Dvduse_blk_export=enabled ;; --disable-vduse-blk-export) printf "%s" -Dvduse_blk_export=disabled ;; + --enable-vfio-user-client) printf "%s" -Dvfio_user_client=enabled ;; + --disable-vfio-user-client) printf "%s" -Dvfio_user_client=disabled ;; --enable-vfio-user-server) printf "%s" -Dvfio_user_server=enabled ;; --disable-vfio-user-server) printf "%s" -Dvfio_user_server=disabled ;; --enable-vhdx) printf "%s" -Dvhdx=enabled ;; From patchwork Wed May 29 16:23:04 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-Patchwork-Submitter: John Levon X-Patchwork-Id: 13679234 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from lists.gnu.org (lists.gnu.org [209.51.188.17]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.lore.kernel.org (Postfix) with ESMTPS id 3D66AC25B75 for ; Wed, 29 May 2024 16:27:53 +0000 (UTC) Received: from localhost ([::1] helo=lists1p.gnu.org) by lists.gnu.org with esmtp (Exim 4.90_1) (envelope-from ) id 1sCM6u-00018Z-UQ; Wed, 29 May 2024 12:25:09 -0400 Received: from eggs.gnu.org 
([2001:470:142:3::10]) by lists.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM6p-00015c-2E for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:05 -0400 Received: from ssh.movementarian.org ([139.162.205.133] helo=movementarian.org) by eggs.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM6l-0006Kj-7d for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:02 -0400 Received: from movement by movementarian.org with local (Exim 4.95) (envelope-from ) id 1sCM6j-006CP2-6s; Wed, 29 May 2024 17:24:57 +0100 From: John Levon To: qemu-devel@nongnu.org Cc: alex.williamson@redhat.com, clg@redhat.com, jag.raman@oracle.com, thanos.makatos@nutanix.com, John Johnson , Elena Ufimtseva , John Levon Subject: [PATCH 11/26] vfio-user: connect vfio proxy to remote server Date: Wed, 29 May 2024 17:23:04 +0100 Message-Id: <20240529162319.1476680-12-levon@movementarian.org> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20240529162319.1476680-1-levon@movementarian.org> References: <20240529162319.1476680-1-levon@movementarian.org> MIME-Version: 1.0 Received-SPF: pass client-ip=139.162.205.133; envelope-from=movement@movementarian.org; helo=movementarian.org X-Spam_score_int: -18 X-Spam_score: -1.9 X-Spam_bar: - X-Spam_report: (-1.9 / 5.0 requ) BAYES_00=-1.9, SPF_HELO_PASS=-0.001, SPF_PASS=-0.001, T_SCC_BODY_TEXT_LINE=-0.01 autolearn=ham autolearn_force=no X-Spam_action: no action X-BeenThere: qemu-devel@nongnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org Sender: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org From: Jagannathan Raman add user.c & user.h files for vfio-user code add proxy struct to handle comms with remote server Originally-by: John Johnson Signed-off-by: Elena Ufimtseva Signed-off-by: Jagannathan Raman Signed-off-by: John Levon --- hw/vfio/meson.build | 2 +- hw/vfio/user-pci.c | 17 ++++ hw/vfio/user.c | 171 ++++++++++++++++++++++++++++++++++ hw/vfio/user.h | 78 ++++++++++++++++ include/hw/vfio/vfio-common.h | 2 + 5 files changed, 269 insertions(+), 1 deletion(-) create mode 100644 hw/vfio/user.c create mode 100644 hw/vfio/user.h diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build index f897c5b81a..32ad5ca6b7 100644 --- a/hw/vfio/meson.build +++ b/hw/vfio/meson.build @@ -18,7 +18,7 @@ vfio_ss.add(when: 'CONFIG_VFIO_PCI', if_true: files( )) if get_option('vfio_user_client').enabled() - vfio_ss.add(files('user-container.c', 'user-pci.c')) + vfio_ss.add(files('user.c', 'user-container.c', 'user-pci.c')) endif vfio_ss.add(when: 'CONFIG_VFIO_CCW', if_true: files('ccw.c')) diff --git a/hw/vfio/user-pci.c b/hw/vfio/user-pci.c index fd0e534f0f..8c66833118 100644 --- a/hw/vfio/user-pci.c +++ b/hw/vfio/user-pci.c @@ -18,6 +18,7 @@ #include "hw/pci/pci_bridge.h" #include "hw/qdev-properties.h" #include "hw/qdev-properties-system.h" +#include "hw/vfio/user.h" #include "migration/vmstate.h" #include "qapi/qmp/qdict.h" #include "qemu/error-report.h" @@ -64,6 +65,8 @@ static void vfio_user_pci_realize(PCIDevice *pdev, Error **errp) VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); VFIODevice *vbasedev = &vdev->vbasedev; AddressSpace *as; + SocketAddress addr; + VFIOUserProxy *proxy; /* * TODO: make option parser understand SocketAddress @@ -76,6 +79,15 @@ static void vfio_user_pci_realize(PCIDevice *pdev, Error **errp) return; } + memset(&addr, 0, 
sizeof(addr)); + addr.type = SOCKET_ADDRESS_TYPE_UNIX; + addr.u.q_unix.path = udev->sock_name; + proxy = vfio_user_connect_dev(&addr, errp); + if (!proxy) { + return; + } + vbasedev->proxy = proxy; + vbasedev->name = g_strdup_printf("VFIO user <%s>", udev->sock_name); vbasedev->ops = &vfio_user_pci_ops; vbasedev->type = VFIO_DEVICE_TYPE_PCI; @@ -119,8 +131,13 @@ static void vfio_user_instance_init(Object *obj) static void vfio_user_instance_finalize(Object *obj) { VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj); + VFIODevice *vbasedev = &vdev->vbasedev; vfio_pci_put_device(vdev); + + if (vbasedev->proxy != NULL) { + vfio_user_disconnect(vbasedev->proxy); + } } static Property vfio_user_pci_dev_properties[] = { diff --git a/hw/vfio/user.c b/hw/vfio/user.c new file mode 100644 index 0000000000..91501f625c --- /dev/null +++ b/hw/vfio/user.c @@ -0,0 +1,171 @@ +/* + * vfio protocol over a UNIX socket. + * + * Copyright © 2018, 2021 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" +#include +#include + +#include "qemu/error-report.h" +#include "qapi/error.h" +#include "qemu/main-loop.h" +#include "qemu/lockable.h" +#include "hw/hw.h" +#include "hw/vfio/vfio-common.h" +#include "qemu/sockets.h" +#include "io/channel.h" +#include "io/channel-socket.h" +#include "io/channel-util.h" +#include "sysemu/iothread.h" +#include "user.h" + +static IOThread *vfio_user_iothread; + +static void vfio_user_shutdown(VFIOUserProxy *proxy); + + +/* + * Functions called by main, CPU, or iothread threads + */ + +static void vfio_user_shutdown(VFIOUserProxy *proxy) +{ + qio_channel_shutdown(proxy->ioc, QIO_CHANNEL_SHUTDOWN_READ, NULL); + qio_channel_set_aio_fd_handler(proxy->ioc, proxy->ctx, NULL, + proxy->ctx, NULL, NULL); +} + +/* + * Functions only called by iothread + */ + +static void vfio_user_cb(void *opaque) +{ + VFIOUserProxy *proxy = opaque; + + QEMU_LOCK_GUARD(&proxy->lock); + + proxy->state = VFIO_PROXY_CLOSED; + qemu_cond_signal(&proxy->close_cv); +} + + +/* + * Functions called by main or CPU threads + */ + +static QLIST_HEAD(, VFIOUserProxy) vfio_user_sockets = + QLIST_HEAD_INITIALIZER(vfio_user_sockets); + +VFIOUserProxy *vfio_user_connect_dev(SocketAddress *addr, Error **errp) +{ + VFIOUserProxy *proxy; + QIOChannelSocket *sioc; + QIOChannel *ioc; + char *sockname; + + if (addr->type != SOCKET_ADDRESS_TYPE_UNIX) { + error_setg(errp, "vfio_user_connect - bad address family"); + return NULL; + } + sockname = addr->u.q_unix.path; + + sioc = qio_channel_socket_new(); + ioc = QIO_CHANNEL(sioc); + if (qio_channel_socket_connect_sync(sioc, addr, errp)) { + object_unref(OBJECT(ioc)); + return NULL; + } + qio_channel_set_blocking(ioc, false, NULL); + + proxy = g_malloc0(sizeof(VFIOUserProxy)); + proxy->sockname = g_strdup_printf("unix:%s", sockname); + proxy->ioc = ioc; + proxy->flags = VFIO_PROXY_CLIENT; + proxy->state = VFIO_PROXY_CONNECTED; + + qemu_mutex_init(&proxy->lock); + qemu_cond_init(&proxy->close_cv); + + if (vfio_user_iothread == NULL) { + vfio_user_iothread = iothread_create("VFIO user", errp); + } + + proxy->ctx = iothread_get_aio_context(vfio_user_iothread); + + QTAILQ_INIT(&proxy->outgoing); + QTAILQ_INIT(&proxy->incoming); + QTAILQ_INIT(&proxy->free); + QTAILQ_INIT(&proxy->pending); + QLIST_INSERT_HEAD(&vfio_user_sockets, proxy, next); + + return proxy; +} + +void vfio_user_disconnect(VFIOUserProxy *proxy) +{ + VFIOUserMsg *r1, *r2; + + 
qemu_mutex_lock(&proxy->lock); + + /* our side is quitting */ + if (proxy->state == VFIO_PROXY_CONNECTED) { + vfio_user_shutdown(proxy); + if (!QTAILQ_EMPTY(&proxy->pending)) { + error_printf("vfio_user_disconnect: outstanding requests\n"); + } + } + object_unref(OBJECT(proxy->ioc)); + proxy->ioc = NULL; + + proxy->state = VFIO_PROXY_CLOSING; + QTAILQ_FOREACH_SAFE(r1, &proxy->outgoing, next, r2) { + qemu_cond_destroy(&r1->cv); + QTAILQ_REMOVE(&proxy->outgoing, r1, next); + g_free(r1); + } + QTAILQ_FOREACH_SAFE(r1, &proxy->incoming, next, r2) { + qemu_cond_destroy(&r1->cv); + QTAILQ_REMOVE(&proxy->incoming, r1, next); + g_free(r1); + } + QTAILQ_FOREACH_SAFE(r1, &proxy->pending, next, r2) { + qemu_cond_destroy(&r1->cv); + QTAILQ_REMOVE(&proxy->pending, r1, next); + g_free(r1); + } + QTAILQ_FOREACH_SAFE(r1, &proxy->free, next, r2) { + qemu_cond_destroy(&r1->cv); + QTAILQ_REMOVE(&proxy->free, r1, next); + g_free(r1); + } + + /* + * Make sure the iothread isn't blocking anywhere + * with a ref to this proxy by waiting for a BH + * handler to run after the proxy fd handlers were + * deleted above. + */ + aio_bh_schedule_oneshot(proxy->ctx, vfio_user_cb, proxy); + qemu_cond_wait(&proxy->close_cv, &proxy->lock); + + /* we now hold the only ref to proxy */ + qemu_mutex_unlock(&proxy->lock); + qemu_cond_destroy(&proxy->close_cv); + qemu_mutex_destroy(&proxy->lock); + + QLIST_REMOVE(proxy, next); + if (QLIST_EMPTY(&vfio_user_sockets)) { + iothread_destroy(vfio_user_iothread); + vfio_user_iothread = NULL; + } + + g_free(proxy->sockname); + g_free(proxy); +} diff --git a/hw/vfio/user.h b/hw/vfio/user.h new file mode 100644 index 0000000000..ac7d15dfa8 --- /dev/null +++ b/hw/vfio/user.h @@ -0,0 +1,78 @@ +#ifndef VFIO_USER_H +#define VFIO_USER_H + +/* + * vfio protocol over a UNIX socket. + * + * Copyright © 2018, 2021 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. 
+ * + */ + +typedef struct { + int send_fds; + int recv_fds; + int *fds; +} VFIOUserFDs; + +enum msg_type { + VFIO_MSG_NONE, + VFIO_MSG_ASYNC, + VFIO_MSG_WAIT, + VFIO_MSG_NOWAIT, + VFIO_MSG_REQ, +}; + +typedef struct VFIOUserMsg { + QTAILQ_ENTRY(VFIOUserMsg) next; + VFIOUserFDs *fds; + uint32_t rsize; + uint32_t id; + QemuCond cv; + bool complete; + enum msg_type type; +} VFIOUserMsg; + + +enum proxy_state { + VFIO_PROXY_CONNECTED = 1, + VFIO_PROXY_ERROR = 2, + VFIO_PROXY_CLOSING = 3, + VFIO_PROXY_CLOSED = 4, +}; + +typedef QTAILQ_HEAD(VFIOUserMsgQ, VFIOUserMsg) VFIOUserMsgQ; + +typedef struct VFIOUserProxy { + QLIST_ENTRY(VFIOUserProxy) next; + char *sockname; + struct QIOChannel *ioc; + void (*request)(void *opaque, VFIOUserMsg *msg); + void *req_arg; + int flags; + QemuCond close_cv; + AioContext *ctx; + QEMUBH *req_bh; + + /* + * above only changed when BQL is held + * below are protected by per-proxy lock + */ + QemuMutex lock; + VFIOUserMsgQ free; + VFIOUserMsgQ pending; + VFIOUserMsgQ incoming; + VFIOUserMsgQ outgoing; + VFIOUserMsg *last_nowait; + enum proxy_state state; +} VFIOUserProxy; + +/* VFIOProxy flags */ +#define VFIO_PROXY_CLIENT 0x1 + +VFIOUserProxy *vfio_user_connect_dev(SocketAddress *addr, Error **errp); +void vfio_user_disconnect(VFIOUserProxy *proxy); + +#endif /* VFIO_USER_H */ diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index c12fd1e2f0..6bbe0218e9 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -74,6 +74,7 @@ typedef struct VFIOMigration { } VFIOMigration; struct VFIOGroup; +typedef struct VFIOUserProxy VFIOUserProxy; /* MMU container sub-class for legacy vfio implementation. */ typedef struct VFIOContainer { @@ -137,6 +138,7 @@ typedef struct VFIODevice { bool dirty_tracking; int devid; IOMMUFDBackend *iommufd; + VFIOUserProxy *proxy; struct vfio_region_info **regions; } VFIODevice; From patchwork Wed May 29 16:23:05 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-Patchwork-Submitter: John Levon X-Patchwork-Id: 13679204 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from lists.gnu.org (lists.gnu.org [209.51.188.17]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.lore.kernel.org (Postfix) with ESMTPS id 367A0C25B75 for ; Wed, 29 May 2024 16:26:02 +0000 (UTC) Received: from localhost ([::1] helo=lists1p.gnu.org) by lists.gnu.org with esmtp (Exim 4.90_1) (envelope-from ) id 1sCM6x-00019b-TC; Wed, 29 May 2024 12:25:11 -0400 Received: from eggs.gnu.org ([2001:470:142:3::10]) by lists.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM6s-00017P-Mn for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:07 -0400 Received: from ssh.movementarian.org ([139.162.205.133] helo=movementarian.org) by eggs.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM6l-0006Kb-6u for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:06 -0400 Received: from movement by movementarian.org with local (Exim 4.95) (envelope-from ) id 1sCM6j-006CP6-80; Wed, 29 May 2024 17:24:57 +0100 From: John Levon To: qemu-devel@nongnu.org Cc: alex.williamson@redhat.com, clg@redhat.com, jag.raman@oracle.com, thanos.makatos@nutanix.com, John Johnson , Elena Ufimtseva , John Levon Subject: [PATCH 12/26] vfio-user: define socket receive functions Date: Wed, 29 May 
2024 17:23:05 +0100 Message-Id: <20240529162319.1476680-13-levon@movementarian.org> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20240529162319.1476680-1-levon@movementarian.org> References: <20240529162319.1476680-1-levon@movementarian.org> MIME-Version: 1.0 Received-SPF: pass client-ip=139.162.205.133; envelope-from=movement@movementarian.org; helo=movementarian.org X-Spam_score_int: -18 X-Spam_score: -1.9 X-Spam_bar: - X-Spam_report: (-1.9 / 5.0 requ) BAYES_00=-1.9, SPF_HELO_PASS=-0.001, SPF_PASS=-0.001, T_SCC_BODY_TEXT_LINE=-0.01 autolearn=ham autolearn_force=no X-Spam_action: no action X-BeenThere: qemu-devel@nongnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org Sender: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org From: Jagannathan Raman Add infrastructure needed to receive incoming messages Originally-by: John Johnson Signed-off-by: Elena Ufimtseva Signed-off-by: Jagannathan Raman Signed-off-by: John Levon --- hw/vfio/trace-events | 5 + hw/vfio/user-pci.c | 11 ++ hw/vfio/user-protocol.h | 54 ++++++ hw/vfio/user.c | 408 ++++++++++++++++++++++++++++++++++++++++ hw/vfio/user.h | 10 + 5 files changed, 488 insertions(+) create mode 100644 hw/vfio/user-protocol.h diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 64161bf6f4..a5a5a34198 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -176,3 +176,8 @@ iommufd_cdev_fail_attach_existing_container(const char *msg) " %s" iommufd_cdev_alloc_ioas(int iommufd, int ioas_id) " [iommufd=%d] new IOMMUFD container with ioasid=%d" iommufd_cdev_device_info(char *name, int devfd, int num_irqs, int num_regions, int flags) " %s (%d) num_irqs=%d num_regions=%d flags=%d" iommufd_cdev_pci_hot_reset_dep_devices(int domain, int bus, int slot, int function, int dev_id) "\t%04x:%02x:%02x.%x devid %d" + +# user.c +vfio_user_recv_hdr(const char *name, uint16_t id, uint16_t cmd, uint32_t size, uint32_t flags) " (%s) id 0x%x cmd 0x%x size 0x%x flags 0x%x" +vfio_user_recv_read(uint16_t id, int read) " id 0x%x read 0x%x" +vfio_user_recv_request(uint16_t cmd) " command 0x%x" diff --git a/hw/vfio/user-pci.c b/hw/vfio/user-pci.c index 8c66833118..350e957132 100644 --- a/hw/vfio/user-pci.c +++ b/hw/vfio/user-pci.c @@ -42,6 +42,16 @@ struct VFIOUserPCIDevice { char *sock_name; }; +/* + * Incoming request message callback. + * + * Runs off main loop, so BQL held. + */ +static void vfio_user_pci_process_req(void *opaque, VFIOUserMsg *msg) +{ + +} + /* * Emulated devices don't use host hot reset */ @@ -87,6 +97,7 @@ static void vfio_user_pci_realize(PCIDevice *pdev, Error **errp) return; } vbasedev->proxy = proxy; + vfio_user_set_handler(vbasedev, vfio_user_pci_process_req, vdev); vbasedev->name = g_strdup_printf("VFIO user <%s>", udev->sock_name); vbasedev->ops = &vfio_user_pci_ops; diff --git a/hw/vfio/user-protocol.h b/hw/vfio/user-protocol.h new file mode 100644 index 0000000000..d23877c958 --- /dev/null +++ b/hw/vfio/user-protocol.h @@ -0,0 +1,54 @@ +#ifndef VFIO_USER_PROTOCOL_H +#define VFIO_USER_PROTOCOL_H + +/* + * vfio protocol over a UNIX socket. + * + * Copyright © 2018, 2021 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Each message has a standard header that describes the command + * being sent, which is almost always a VFIO ioctl(). 
+ * + * The header may be followed by command-specific data, such as the + * region and offset info for read and write commands. + */ + +typedef struct { + uint16_t id; + uint16_t command; + uint32_t size; + uint32_t flags; + uint32_t error_reply; +} VFIOUserHdr; + +/* VFIOUserHdr commands */ +enum vfio_user_command { + VFIO_USER_VERSION = 1, + VFIO_USER_DMA_MAP = 2, + VFIO_USER_DMA_UNMAP = 3, + VFIO_USER_DEVICE_GET_INFO = 4, + VFIO_USER_DEVICE_GET_REGION_INFO = 5, + VFIO_USER_DEVICE_GET_REGION_IO_FDS = 6, + VFIO_USER_DEVICE_GET_IRQ_INFO = 7, + VFIO_USER_DEVICE_SET_IRQS = 8, + VFIO_USER_REGION_READ = 9, + VFIO_USER_REGION_WRITE = 10, + VFIO_USER_DMA_READ = 11, + VFIO_USER_DMA_WRITE = 12, + VFIO_USER_DEVICE_RESET = 13, + VFIO_USER_DIRTY_PAGES = 14, + VFIO_USER_MAX, +}; + +/* VFIOUserHdr flags */ +#define VFIO_USER_REQUEST 0x0 +#define VFIO_USER_REPLY 0x1 +#define VFIO_USER_TYPE 0xF + +#define VFIO_USER_NO_REPLY 0x10 +#define VFIO_USER_ERROR 0x20 + +#endif /* VFIO_USER_PROTOCOL_H */ diff --git a/hw/vfio/user.c b/hw/vfio/user.c index 91501f625c..998816b97d 100644 --- a/hw/vfio/user.c +++ b/hw/vfio/user.c @@ -24,11 +24,27 @@ #include "io/channel-util.h" #include "sysemu/iothread.h" #include "user.h" +#include "trace.h" static IOThread *vfio_user_iothread; static void vfio_user_shutdown(VFIOUserProxy *proxy); +static VFIOUserMsg *vfio_user_getmsg(VFIOUserProxy *proxy, VFIOUserHdr *hdr, + VFIOUserFDs *fds); +static VFIOUserFDs *vfio_user_getfds(int numfds); +static void vfio_user_recycle(VFIOUserProxy *proxy, VFIOUserMsg *msg); +static void vfio_user_recv(void *opaque); +static int vfio_user_recv_one(VFIOUserProxy *proxy); +static void vfio_user_cb(void *opaque); + +static void vfio_user_request(void *opaque); + +static inline void vfio_user_set_error(VFIOUserHdr *hdr, uint32_t err) +{ + hdr->flags |= VFIO_USER_ERROR; + hdr->error_reply = err; +} /* * Functions called by main, CPU, or iothread threads @@ -41,10 +57,340 @@ static void vfio_user_shutdown(VFIOUserProxy *proxy) proxy->ctx, NULL, NULL); } +static VFIOUserMsg *vfio_user_getmsg(VFIOUserProxy *proxy, VFIOUserHdr *hdr, + VFIOUserFDs *fds) +{ + VFIOUserMsg *msg; + + msg = QTAILQ_FIRST(&proxy->free); + if (msg != NULL) { + QTAILQ_REMOVE(&proxy->free, msg, next); + } else { + msg = g_malloc0(sizeof(*msg)); + qemu_cond_init(&msg->cv); + } + + msg->hdr = hdr; + msg->fds = fds; + return msg; +} + +/* + * Recycle a message list entry to the free list. + */ +static void vfio_user_recycle(VFIOUserProxy *proxy, VFIOUserMsg *msg) +{ + if (msg->type == VFIO_MSG_NONE) { + error_printf("vfio_user_recycle - freeing free msg\n"); + return; + } + + /* free msg buffer if no one is waiting to consume the reply */ + if (msg->type == VFIO_MSG_NOWAIT || msg->type == VFIO_MSG_ASYNC) { + g_free(msg->hdr); + if (msg->fds != NULL) { + g_free(msg->fds); + } + } + + msg->type = VFIO_MSG_NONE; + msg->hdr = NULL; + msg->fds = NULL; + msg->complete = false; + QTAILQ_INSERT_HEAD(&proxy->free, msg, next); +} + +static VFIOUserFDs *vfio_user_getfds(int numfds) +{ + VFIOUserFDs *fds = g_malloc0(sizeof(*fds) + (numfds * sizeof(int))); + + fds->fds = (int *)((char *)fds + sizeof(*fds)); + + return fds; +} + /* * Functions only called by iothread */ +/* + * Process a received message. + */ +static void vfio_user_process(VFIOUserProxy *proxy, VFIOUserMsg *msg, + bool isreply) +{ + + /* + * Replies signal a waiter, if none just check for errors + * and free the message buffer. + * + * Requests get queued for the BH. 
+ */ + if (isreply) { + msg->complete = true; + if (msg->type == VFIO_MSG_WAIT) { + qemu_cond_signal(&msg->cv); + } else { + if (msg->hdr->flags & VFIO_USER_ERROR) { + error_printf("vfio_user_process: error reply on async "); + error_printf("request command %x error %s\n", + msg->hdr->command, + strerror(msg->hdr->error_reply)); + } + /* youngest nowait msg has been ack'd */ + if (proxy->last_nowait == msg) { + proxy->last_nowait = NULL; + } + vfio_user_recycle(proxy, msg); + } + } else { + QTAILQ_INSERT_TAIL(&proxy->incoming, msg, next); + qemu_bh_schedule(proxy->req_bh); + } +} + +/* + * Complete a partial message read + */ +static int vfio_user_complete(VFIOUserProxy *proxy, Error **errp) +{ + VFIOUserMsg *msg = proxy->part_recv; + size_t msgleft = proxy->recv_left; + bool isreply; + char *data; + int ret; + + data = (char *)msg->hdr + (msg->hdr->size - msgleft); + while (msgleft > 0) { + ret = qio_channel_read(proxy->ioc, data, msgleft, errp); + + /* error or would block */ + if (ret <= 0) { + /* try for rest on next iteration */ + if (ret == QIO_CHANNEL_ERR_BLOCK) { + proxy->recv_left = msgleft; + } + return ret; + } + trace_vfio_user_recv_read(msg->hdr->id, ret); + + msgleft -= ret; + data += ret; + } + + /* + * Read complete message, process it. + */ + proxy->part_recv = NULL; + proxy->recv_left = 0; + isreply = (msg->hdr->flags & VFIO_USER_TYPE) == VFIO_USER_REPLY; + vfio_user_process(proxy, msg, isreply); + + /* return positive value */ + return 1; +} + +static void vfio_user_recv(void *opaque) +{ + VFIOUserProxy *proxy = opaque; + + QEMU_LOCK_GUARD(&proxy->lock); + + if (proxy->state == VFIO_PROXY_CONNECTED) { + while (vfio_user_recv_one(proxy) == 0) { + ; + } + } +} + +/* + * Receive and process one incoming message. + * + * For replies, find matching outgoing request and wake any waiters. + * For requests, queue in incoming list and run request BH. + */ +static int vfio_user_recv_one(VFIOUserProxy *proxy) +{ + VFIOUserMsg *msg = NULL; + g_autofree int *fdp = NULL; + VFIOUserFDs *reqfds; + VFIOUserHdr hdr; + struct iovec iov = { + .iov_base = &hdr, + .iov_len = sizeof(hdr), + }; + bool isreply = false; + int i, ret; + size_t msgleft, numfds = 0; + char *data = NULL; + char *buf = NULL; + Error *local_err = NULL; + + /* + * Complete any partial reads + */ + if (proxy->part_recv != NULL) { + ret = vfio_user_complete(proxy, &local_err); + + /* still not complete, try later */ + if (ret == QIO_CHANNEL_ERR_BLOCK) { + return ret; + } + + if (ret <= 0) { + goto fatal; + } + /* else fall into reading another msg */ + } + + /* + * Read header + */ + ret = qio_channel_readv_full(proxy->ioc, &iov, 1, &fdp, &numfds, 0, + &local_err); + if (ret == QIO_CHANNEL_ERR_BLOCK) { + return ret; + } + + /* read error or other side closed connection */ + if (ret <= 0) { + goto fatal; + } + + if (ret < sizeof(hdr)) { + error_setg(&local_err, "short read of header"); + goto fatal; + } + + /* + * Validate header + */ + if (hdr.size < sizeof(VFIOUserHdr)) { + error_setg(&local_err, "bad header size"); + goto fatal; + } + switch (hdr.flags & VFIO_USER_TYPE) { + case VFIO_USER_REQUEST: + isreply = false; + break; + case VFIO_USER_REPLY: + isreply = true; + break; + default: + error_setg(&local_err, "unknown message type"); + goto fatal; + } + trace_vfio_user_recv_hdr(proxy->sockname, hdr.id, hdr.command, hdr.size, + hdr.flags); + + /* + * For replies, find the matching pending request. + * For requests, reap incoming FDs. 
+ */ + if (isreply) { + QTAILQ_FOREACH(msg, &proxy->pending, next) { + if (hdr.id == msg->id) { + break; + } + } + if (msg == NULL) { + error_setg(&local_err, "unexpected reply"); + goto err; + } + QTAILQ_REMOVE(&proxy->pending, msg, next); + + /* + * Process any received FDs + */ + if (numfds != 0) { + if (msg->fds == NULL || msg->fds->recv_fds < numfds) { + error_setg(&local_err, "unexpected FDs"); + goto err; + } + msg->fds->recv_fds = numfds; + memcpy(msg->fds->fds, fdp, numfds * sizeof(int)); + } + } else { + if (numfds != 0) { + reqfds = vfio_user_getfds(numfds); + memcpy(reqfds->fds, fdp, numfds * sizeof(int)); + } else { + reqfds = NULL; + } + } + + /* + * Put the whole message into a single buffer. + */ + if (isreply) { + if (hdr.size > msg->rsize) { + error_setg(&local_err, "reply larger than recv buffer"); + goto err; + } + *msg->hdr = hdr; + data = (char *)msg->hdr + sizeof(hdr); + } else { + buf = g_malloc0(hdr.size); + memcpy(buf, &hdr, sizeof(hdr)); + data = buf + sizeof(hdr); + msg = vfio_user_getmsg(proxy, (VFIOUserHdr *)buf, reqfds); + msg->type = VFIO_MSG_REQ; + } + + /* + * Read rest of message. + */ + msgleft = hdr.size - sizeof(hdr); + while (msgleft > 0) { + ret = qio_channel_read(proxy->ioc, data, msgleft, &local_err); + + /* prepare to complete read on next iteration */ + if (ret == QIO_CHANNEL_ERR_BLOCK) { + proxy->part_recv = msg; + proxy->recv_left = msgleft; + return ret; + } + + if (ret <= 0) { + goto fatal; + } + trace_vfio_user_recv_read(hdr.id, ret); + + msgleft -= ret; + data += ret; + } + + vfio_user_process(proxy, msg, isreply); + return 0; + + /* + * fatal means the other side closed or we don't trust the stream + * err means this message is corrupt + */ +fatal: + vfio_user_shutdown(proxy); + proxy->state = VFIO_PROXY_ERROR; + + /* set error if server side closed */ + if (ret == 0) { + error_setg(&local_err, "server closed socket"); + } + +err: + for (i = 0; i < numfds; i++) { + close(fdp[i]); + } + if (isreply && msg != NULL) { + /* force an error to keep sending thread from hanging */ + vfio_user_set_error(msg->hdr, EINVAL); + msg->complete = true; + qemu_cond_signal(&msg->cv); + } + error_prepend(&local_err, "vfio_user_recv_one: "); + error_report_err(local_err); + return -1; +} + static void vfio_user_cb(void *opaque) { VFIOUserProxy *proxy = opaque; @@ -60,6 +406,53 @@ static void vfio_user_cb(void *opaque) * Functions called by main or CPU threads */ +/* + * Process incoming requests. + * + * The bus-specific callback has the form: + * request(opaque, msg) + * where 'opaque' was specified in vfio_user_set_handler + * and 'msg' is the inbound message. + * + * The callback is responsible for disposing of the message buffer, + * usually by re-using it when calling vfio_send_reply or vfio_send_error, + * both of which free their message buffer when the reply is sent. + * + * If the callback uses a new buffer, it needs to free the old one. 
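+ * + * For example, user-pci.c registers vfio_user_pci_process_req() as this callback via vfio_user_set_handler(vbasedev, vfio_user_pci_process_req, vdev), so its 'opaque' is the VFIOPCIDevice being realized. 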
+ */ +static void vfio_user_request(void *opaque) +{ + VFIOUserProxy *proxy = opaque; + VFIOUserMsgQ new, free; + VFIOUserMsg *msg, *m1; + + /* reap all incoming */ + QTAILQ_INIT(&new); + WITH_QEMU_LOCK_GUARD(&proxy->lock) { + QTAILQ_FOREACH_SAFE(msg, &proxy->incoming, next, m1) { + QTAILQ_REMOVE(&proxy->incoming, msg, next); + QTAILQ_INSERT_TAIL(&new, msg, next); + } + } + + /* process list */ + QTAILQ_INIT(&free); + QTAILQ_FOREACH_SAFE(msg, &new, next, m1) { + QTAILQ_REMOVE(&new, msg, next); + trace_vfio_user_recv_request(msg->hdr->command); + proxy->request(proxy->req_arg, msg); + QTAILQ_INSERT_HEAD(&free, msg, next); + } + + /* free list */ + WITH_QEMU_LOCK_GUARD(&proxy->lock) { + QTAILQ_FOREACH_SAFE(msg, &free, next, m1) { + vfio_user_recycle(proxy, msg); + } + } +} + + static QLIST_HEAD(, VFIOUserProxy) vfio_user_sockets = QLIST_HEAD_INITIALIZER(vfio_user_sockets); @@ -98,6 +491,7 @@ VFIOUserProxy *vfio_user_connect_dev(SocketAddress *addr, Error **errp) } proxy->ctx = iothread_get_aio_context(vfio_user_iothread); + proxy->req_bh = qemu_bh_new(vfio_user_request, proxy); QTAILQ_INIT(&proxy->outgoing); QTAILQ_INIT(&proxy->incoming); @@ -108,6 +502,18 @@ VFIOUserProxy *vfio_user_connect_dev(SocketAddress *addr, Error **errp) return proxy; } +void vfio_user_set_handler(VFIODevice *vbasedev, + void (*handler)(void *opaque, VFIOUserMsg *msg), + void *req_arg) +{ + VFIOUserProxy *proxy = vbasedev->proxy; + + proxy->request = handler; + proxy->req_arg = req_arg; + qio_channel_set_aio_fd_handler(proxy->ioc, proxy->ctx, + vfio_user_recv, NULL, NULL, proxy); +} + void vfio_user_disconnect(VFIOUserProxy *proxy) { VFIOUserMsg *r1, *r2; @@ -123,6 +529,8 @@ void vfio_user_disconnect(VFIOUserProxy *proxy) } object_unref(OBJECT(proxy->ioc)); proxy->ioc = NULL; + qemu_bh_delete(proxy->req_bh); + proxy->req_bh = NULL; proxy->state = VFIO_PROXY_CLOSING; QTAILQ_FOREACH_SAFE(r1, &proxy->outgoing, next, r2) { diff --git a/hw/vfio/user.h b/hw/vfio/user.h index ac7d15dfa8..30cf35d3e4 100644 --- a/hw/vfio/user.h +++ b/hw/vfio/user.h @@ -11,6 +11,8 @@ * */ +#include "user-protocol.h" + typedef struct { int send_fds; int recv_fds; @@ -27,6 +29,7 @@ enum msg_type { typedef struct VFIOUserMsg { QTAILQ_ENTRY(VFIOUserMsg) next; + VFIOUserHdr *hdr; VFIOUserFDs *fds; uint32_t rsize; uint32_t id; @@ -66,13 +69,20 @@ typedef struct VFIOUserProxy { VFIOUserMsgQ incoming; VFIOUserMsgQ outgoing; VFIOUserMsg *last_nowait; + VFIOUserMsg *part_recv; + size_t recv_left; enum proxy_state state; } VFIOUserProxy; /* VFIOProxy flags */ #define VFIO_PROXY_CLIENT 0x1 +typedef struct VFIODevice VFIODevice; + VFIOUserProxy *vfio_user_connect_dev(SocketAddress *addr, Error **errp); void vfio_user_disconnect(VFIOUserProxy *proxy); +void vfio_user_set_handler(VFIODevice *vbasedev, + void (*handler)(void *opaque, VFIOUserMsg *msg), + void *reqarg); #endif /* VFIO_USER_H */ From patchwork Wed May 29 16:23:06 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: John Levon X-Patchwork-Id: 13679203 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from lists.gnu.org (lists.gnu.org [209.51.188.17]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.lore.kernel.org (Postfix) with ESMTPS id 8835CC25B7C for ; Wed, 29 May 2024 16:25:57 +0000 (UTC) Received: from localhost ([::1] helo=lists1p.gnu.org) by lists.gnu.org with esmtp (Exim 4.90_1) 
(envelope-from ) id 1sCM6v-000199-Pw; Wed, 29 May 2024 12:25:10 -0400 Received: from eggs.gnu.org ([2001:470:142:3::10]) by lists.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM6s-00017G-48 for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:06 -0400 Received: from ssh.movementarian.org ([139.162.205.133] helo=movementarian.org) by eggs.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM6l-0006Kk-7G for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:05 -0400 Received: from movement by movementarian.org with local (Exim 4.95) (envelope-from ) id 1sCM6j-006CP9-9I; Wed, 29 May 2024 17:24:57 +0100 From: John Levon To: qemu-devel@nongnu.org Cc: alex.williamson@redhat.com, clg@redhat.com, jag.raman@oracle.com, thanos.makatos@nutanix.com, John Johnson , Elena Ufimtseva , John Levon Subject: [PATCH 13/26] vfio-user: define socket send functions Date: Wed, 29 May 2024 17:23:06 +0100 Message-Id: <20240529162319.1476680-14-levon@movementarian.org> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20240529162319.1476680-1-levon@movementarian.org> References: <20240529162319.1476680-1-levon@movementarian.org> MIME-Version: 1.0 Received-SPF: pass client-ip=139.162.205.133; envelope-from=movement@movementarian.org; helo=movementarian.org X-Spam_score_int: -18 X-Spam_score: -1.9 X-Spam_bar: - X-Spam_report: (-1.9 / 5.0 requ) BAYES_00=-1.9, SPF_HELO_PASS=-0.001, SPF_PASS=-0.001, T_SCC_BODY_TEXT_LINE=-0.01 autolearn=ham autolearn_force=no X-Spam_action: no action X-BeenThere: qemu-devel@nongnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org Sender: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org From: Jagannathan Raman Also negotiate protocol version with remote server Originally-by: John Johnson Signed-off-by: Jagannathan Raman Signed-off-by: Elena Ufimtseva Signed-off-by: John Levon --- hw/vfio/trace-events | 2 + hw/vfio/user-pci.c | 18 +- hw/vfio/user-protocol.h | 62 +++++ hw/vfio/user.c | 511 ++++++++++++++++++++++++++++++++++++++++ hw/vfio/user.h | 9 + 5 files changed, 600 insertions(+), 2 deletions(-) diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index a5a5a34198..68fe6f5689 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -181,3 +181,5 @@ iommufd_cdev_pci_hot_reset_dep_devices(int domain, int bus, int slot, int functi vfio_user_recv_hdr(const char *name, uint16_t id, uint16_t cmd, uint32_t size, uint32_t flags) " (%s) id 0x%x cmd 0x%x size 0x%x flags 0x%x" vfio_user_recv_read(uint16_t id, int read) " id 0x%x read 0x%x" vfio_user_recv_request(uint16_t cmd) " command 0x%x" +vfio_user_send_write(uint16_t id, int wrote) " id 0x%x wrote 0x%x" +vfio_user_version(uint16_t major, uint16_t minor, const char *caps) " major %d minor %d caps: %s" diff --git a/hw/vfio/user-pci.c b/hw/vfio/user-pci.c index 350e957132..fe98048aad 100644 --- a/hw/vfio/user-pci.c +++ b/hw/vfio/user-pci.c @@ -40,6 +40,7 @@ OBJECT_DECLARE_SIMPLE_TYPE(VFIOUserPCIDevice, VFIO_USER_PCI) struct VFIOUserPCIDevice { VFIOPCIDevice device; char *sock_name; + bool send_queued; /* all sends are queued */ }; /* @@ -99,6 +100,14 @@ static void vfio_user_pci_realize(PCIDevice *pdev, Error **errp) vbasedev->proxy = proxy; vfio_user_set_handler(vbasedev, vfio_user_pci_process_req, vdev); + if (udev->send_queued) { + proxy->flags |= VFIO_PROXY_FORCE_QUEUED; + } + + if 
(!vfio_user_validate_version(proxy, errp)) { + goto error; + } + vbasedev->name = g_strdup_printf("VFIO user <%s>", udev->sock_name); vbasedev->ops = &vfio_user_pci_ops; vbasedev->type = VFIO_DEVICE_TYPE_PCI; @@ -108,9 +117,13 @@ static void vfio_user_pci_realize(PCIDevice *pdev, Error **errp) if (!vfio_attach_device_by_iommu_type(TYPE_VFIO_IOMMU_USER, vbasedev->name, vbasedev, as, errp)) { - error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->name); - return; + goto error; } + + return; + +error: + error_prepend(errp, VFIO_MSG_PREFIX, vdev->vbasedev.name); } static void vfio_user_instance_init(Object *obj) @@ -153,6 +166,7 @@ static void vfio_user_instance_finalize(Object *obj) static Property vfio_user_pci_dev_properties[] = { DEFINE_PROP_STRING("socket", VFIOUserPCIDevice, sock_name), + DEFINE_PROP_BOOL("x-send-queued", VFIOUserPCIDevice, send_queued, false), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/vfio/user-protocol.h b/hw/vfio/user-protocol.h index d23877c958..5de5b2030c 100644 --- a/hw/vfio/user-protocol.h +++ b/hw/vfio/user-protocol.h @@ -51,4 +51,66 @@ enum vfio_user_command { #define VFIO_USER_NO_REPLY 0x10 #define VFIO_USER_ERROR 0x20 + +/* + * VFIO_USER_VERSION + */ +typedef struct { + VFIOUserHdr hdr; + uint16_t major; + uint16_t minor; + char capabilities[]; +} VFIOUserVersion; + +#define VFIO_USER_MAJOR_VER 0 +#define VFIO_USER_MINOR_VER 0 + +#define VFIO_USER_CAP "capabilities" + +/* "capabilities" members */ +#define VFIO_USER_CAP_MAX_FDS "max_msg_fds" +#define VFIO_USER_CAP_MAX_XFER "max_data_xfer_size" +#define VFIO_USER_CAP_PGSIZES "pgsizes" +#define VFIO_USER_CAP_MAP_MAX "max_dma_maps" +#define VFIO_USER_CAP_MIGR "migration" + +/* "migration" members */ +#define VFIO_USER_CAP_PGSIZE "pgsize" +#define VFIO_USER_CAP_MAX_BITMAP "max_bitmap_size" + +/* + * Max FDs mainly comes into play when a device supports multiple interrupts + * where each one uses an eventfd to inject it into the guest. + * It is clamped by the number of FDs the qio channel supports in a + * single message. + */ +#define VFIO_USER_DEF_MAX_FDS 8 +#define VFIO_USER_MAX_MAX_FDS 16 + +/* + * Max transfer limits the amount of data in region and DMA messages. + * Region R/W will be very small (limited by how much a single instruction + * can process) so just use a reasonable limit here. + */ +#define VFIO_USER_DEF_MAX_XFER (1024 * 1024) +#define VFIO_USER_MAX_MAX_XFER (64 * 1024 * 1024) + +/* + * Default page size supported is 4k. + */ +#define VFIO_USER_DEF_PGSIZE 4096 + +/* + * Default max number of DMA mappings is stolen from the + * Linux kernel "dma_entry_limit" + */ +#define VFIO_USER_DEF_MAP_MAX 65535 + +/* + * Default max bitmap size is also taken from the Linux kernel, + * where usage of signed ints limits the VA range to 2^31 bytes. 
+ * Dividing that by the number of bits per byte yields 256MB + */ +#define VFIO_USER_DEF_MAX_BITMAP (256 * 1024 * 1024) + #endif /* VFIO_USER_PROTOCOL_H */ diff --git a/hw/vfio/user.c b/hw/vfio/user.c index 998816b97d..aa561fdba3 100644 --- a/hw/vfio/user.c +++ b/hw/vfio/user.c @@ -23,12 +23,20 @@ #include "io/channel-socket.h" #include "io/channel-util.h" #include "sysemu/iothread.h" +#include "qapi/qmp/qdict.h" +#include "qapi/qmp/qjson.h" +#include "qapi/qmp/qnull.h" +#include "qapi/qmp/qstring.h" +#include "qapi/qmp/qnum.h" +#include "qapi/qmp/qbool.h" #include "user.h" #include "trace.h" +static int wait_time = 5000; /* wait up to 5 sec for busy servers */ static IOThread *vfio_user_iothread; static void vfio_user_shutdown(VFIOUserProxy *proxy); +static int vfio_user_send_qio(VFIOUserProxy *proxy, VFIOUserMsg *msg); static VFIOUserMsg *vfio_user_getmsg(VFIOUserProxy *proxy, VFIOUserHdr *hdr, VFIOUserFDs *fds); static VFIOUserFDs *vfio_user_getfds(int numfds); @@ -36,9 +44,16 @@ static void vfio_user_recycle(VFIOUserProxy *proxy, VFIOUserMsg *msg); static void vfio_user_recv(void *opaque); static int vfio_user_recv_one(VFIOUserProxy *proxy); +static void vfio_user_send(void *opaque); +static int vfio_user_send_one(VFIOUserProxy *proxy); static void vfio_user_cb(void *opaque); static void vfio_user_request(void *opaque); +static int vfio_user_send_queued(VFIOUserProxy *proxy, VFIOUserMsg *msg); +static void vfio_user_send_wait(VFIOUserProxy *proxy, VFIOUserHdr *hdr, + VFIOUserFDs *fds, int rsize); +static void vfio_user_request_msg(VFIOUserHdr *hdr, uint16_t cmd, + uint32_t size, uint32_t flags); static inline void vfio_user_set_error(VFIOUserHdr *hdr, uint32_t err) { @@ -57,6 +72,35 @@ static void vfio_user_shutdown(VFIOUserProxy *proxy) proxy->ctx, NULL, NULL); } +static int vfio_user_send_qio(VFIOUserProxy *proxy, VFIOUserMsg *msg) +{ + VFIOUserFDs *fds = msg->fds; + struct iovec iov = { + .iov_base = msg->hdr, + .iov_len = msg->hdr->size, + }; + size_t numfds = 0; + int ret, *fdp = NULL; + Error *local_err = NULL; + + if (fds != NULL && fds->send_fds != 0) { + numfds = fds->send_fds; + fdp = fds->fds; + } + + ret = qio_channel_writev_full(proxy->ioc, &iov, 1, fdp, numfds, 0, + &local_err); + + if (ret == -1) { + vfio_user_set_error(msg->hdr, EIO); + vfio_user_shutdown(proxy); + error_report_err(local_err); + } + trace_vfio_user_send_write(msg->hdr->id, ret); + + return ret; +} + static VFIOUserMsg *vfio_user_getmsg(VFIOUserProxy *proxy, VFIOUserHdr *hdr, VFIOUserFDs *fds) { @@ -97,6 +141,7 @@ static void vfio_user_recycle(VFIOUserProxy *proxy, VFIOUserMsg *msg) msg->hdr = NULL; msg->fds = NULL; msg->complete = false; + msg->pending = false; QTAILQ_INSERT_HEAD(&proxy->free, msg, next); } @@ -391,6 +436,54 @@ err: return -1; } +/* + * Send messages from outgoing queue when the socket buffer has space. + * If we deplete 'outgoing', remove ourselves from the poll list. + */ +static void vfio_user_send(void *opaque) +{ + VFIOUserProxy *proxy = opaque; + + QEMU_LOCK_GUARD(&proxy->lock); + + if (proxy->state == VFIO_PROXY_CONNECTED) { + while (!QTAILQ_EMPTY(&proxy->outgoing)) { + if (vfio_user_send_one(proxy) < 0) { + return; + } + } + qio_channel_set_aio_fd_handler(proxy->ioc, proxy->ctx, + vfio_user_recv, NULL, NULL, proxy); + } +} + +/* + * Send a single message. + * + * Sent async messages are freed, others are moved to pending queue. 
+ */ +static int vfio_user_send_one(VFIOUserProxy *proxy) +{ + VFIOUserMsg *msg; + int ret; + + msg = QTAILQ_FIRST(&proxy->outgoing); + ret = vfio_user_send_qio(proxy, msg); + if (ret < 0) { + return ret; + } + + QTAILQ_REMOVE(&proxy->outgoing, msg, next); + if (msg->type == VFIO_MSG_ASYNC) { + vfio_user_recycle(proxy, msg); + } else { + QTAILQ_INSERT_TAIL(&proxy->pending, msg, next); + msg->pending = true; + } + + return 0; +} + static void vfio_user_cb(void *opaque) { VFIOUserProxy *proxy = opaque; @@ -452,6 +545,133 @@ static void vfio_user_request(void *opaque) } } +/* + * Messages are queued onto the proxy's outgoing list. + * + * It handles 3 types of messages: + * + * async messages - replies and posted writes + * + * There will be no reply from the server, so message + * buffers are freed after they're sent. + * + * nowait messages - map/unmap during address space transactions + * + * These are also sent async, but a reply is expected so that + * vfio_wait_reqs() can wait for the youngest nowait request. + * They transition from the outgoing list to the pending list + * when sent, and are freed when the reply is received. + * + * wait messages - all other requests + * + * The reply to these messages is waited for by their caller. + * They also transition from outgoing to pending when sent, but + * the message buffer is returned to the caller with the reply + * contents. The caller is responsible for freeing these messages. + * + * As an optimization, if the outgoing list and the socket send + * buffer are empty, the message is sent inline instead of being + * added to the outgoing list. The rest of the transitions are + * unchanged. + * + * returns 0 if the message was sent or queued + * returns -1 on send error + */ +static int vfio_user_send_queued(VFIOUserProxy *proxy, VFIOUserMsg *msg) +{ + int ret; + + /* + * Unsent outgoing msgs - add to tail + */ + if (!QTAILQ_EMPTY(&proxy->outgoing)) { + QTAILQ_INSERT_TAIL(&proxy->outgoing, msg, next); + return 0; + } + + /* + * Try inline - if blocked, queue it and kick send poller + */ + if (proxy->flags & VFIO_PROXY_FORCE_QUEUED) { + ret = QIO_CHANNEL_ERR_BLOCK; + } else { + ret = vfio_user_send_qio(proxy, msg); + } + if (ret == QIO_CHANNEL_ERR_BLOCK) { + QTAILQ_INSERT_HEAD(&proxy->outgoing, msg, next); + qio_channel_set_aio_fd_handler(proxy->ioc, proxy->ctx, + vfio_user_recv, proxy->ctx, + vfio_user_send, proxy); + return 0; + } + if (ret == -1) { + return ret; + } + + /* + * Sent - free async, add others to pending + */ + if (msg->type == VFIO_MSG_ASYNC) { + vfio_user_recycle(proxy, msg); + } else { + QTAILQ_INSERT_TAIL(&proxy->pending, msg, next); + msg->pending = true; + } + + return 0; +} + +static void vfio_user_send_wait(VFIOUserProxy *proxy, VFIOUserHdr *hdr, + VFIOUserFDs *fds, int rsize) +{ + VFIOUserMsg *msg; + bool iolock = false; + int ret; + + if (hdr->flags & VFIO_USER_NO_REPLY) { + error_printf("vfio_user_send_wait on async message\n"); + vfio_user_set_error(hdr, EINVAL); + return; + } + + /* + * We may block later, so use a per-proxy lock and drop + * BQL while we sleep. + */ + qemu_mutex_lock(&proxy->lock); + + iolock = bql_locked(); + if (iolock) { + bql_unlock(); + } + + msg = vfio_user_getmsg(proxy, hdr, fds); + msg->id = hdr->id; + msg->rsize = rsize ? rsize : hdr->size; + msg->type = VFIO_MSG_WAIT; + + ret = vfio_user_send_queued(proxy, msg); + + if (ret == 0) { + while (!msg->complete) { + if (!qemu_cond_timedwait(&msg->cv, &proxy->lock, wait_time)) { + VFIOUserMsgQ *list; + + list = msg->pending ? 
&proxy->pending : &proxy->outgoing; + QTAILQ_REMOVE(list, msg, next); + vfio_user_set_error(hdr, ETIMEDOUT); + break; + } + } + } + vfio_user_recycle(proxy, msg); + + /* lock order is BQL->proxy - don't hold proxy when getting BQL */ + qemu_mutex_unlock(&proxy->lock); + if (iolock) { + bql_lock(); + } +} static QLIST_HEAD(, VFIOUserProxy) vfio_user_sockets = QLIST_HEAD_INITIALIZER(vfio_user_sockets); @@ -480,6 +700,15 @@ VFIOUserProxy *vfio_user_connect_dev(SocketAddress *addr, Error **errp) proxy = g_malloc0(sizeof(VFIOUserProxy)); proxy->sockname = g_strdup_printf("unix:%s", sockname); proxy->ioc = ioc; + + /* init defaults */ + proxy->max_xfer_size = VFIO_USER_DEF_MAX_XFER; + proxy->max_send_fds = VFIO_USER_DEF_MAX_FDS; + proxy->max_dma = VFIO_USER_DEF_MAP_MAX; + proxy->dma_pgsizes = VFIO_USER_DEF_PGSIZE; + proxy->max_bitmap = VFIO_USER_DEF_MAX_BITMAP; + proxy->migr_pgsize = VFIO_USER_DEF_PGSIZE; + proxy->flags = VFIO_PROXY_CLIENT; proxy->state = VFIO_PROXY_CONNECTED; @@ -577,3 +806,285 @@ void vfio_user_disconnect(VFIOUserProxy *proxy) g_free(proxy->sockname); g_free(proxy); } + +static void vfio_user_request_msg(VFIOUserHdr *hdr, uint16_t cmd, + uint32_t size, uint32_t flags) +{ + static uint16_t next_id; + + hdr->id = qatomic_fetch_inc(&next_id); + hdr->command = cmd; + hdr->size = size; + hdr->flags = (flags & ~VFIO_USER_TYPE) | VFIO_USER_REQUEST; + hdr->error_reply = 0; +} + +struct cap_entry { + const char *name; + bool (*check)(VFIOUserProxy *proxy, QObject *qobj, Error **errp); +}; + +static bool caps_parse(VFIOUserProxy *proxy, QDict *qdict, + struct cap_entry caps[], Error **errp) +{ + QObject *qobj; + struct cap_entry *p; + + for (p = caps; p->name != NULL; p++) { + qobj = qdict_get(qdict, p->name); + if (qobj != NULL) { + if (!p->check(proxy, qobj, errp)) { + return false; + } + qdict_del(qdict, p->name); + } + } + + /* warning, for now */ + if (qdict_size(qdict) != 0) { + warn_report("spurious capabilities"); + } + return true; +} + +static bool check_migr_pgsize(VFIOUserProxy *proxy, QObject *qobj, Error **errp) +{ + QNum *qn = qobject_to(QNum, qobj); + uint64_t pgsize; + + if (qn == NULL || !qnum_get_try_uint(qn, &pgsize)) { + error_setg(errp, "malformed %s", VFIO_USER_CAP_PGSIZE); + return false; + } + + /* must be larger than default */ + if (pgsize & (VFIO_USER_DEF_PGSIZE - 1)) { + error_setg(errp, "pgsize 0x%"PRIx64" too small", pgsize); + return false; + } + + proxy->migr_pgsize = pgsize; + return true; +} + +static bool check_bitmap(VFIOUserProxy *proxy, QObject *qobj, Error **errp) +{ + QNum *qn = qobject_to(QNum, qobj); + uint64_t bitmap_size; + + if (qn == NULL || !qnum_get_try_uint(qn, &bitmap_size)) { + error_setg(errp, "malformed %s", VFIO_USER_CAP_MAX_BITMAP); + return false; + } + + /* can only lower it */ + if (bitmap_size > VFIO_USER_DEF_MAX_BITMAP) { + error_setg(errp, "%s too large", VFIO_USER_CAP_MAX_BITMAP); + return false; + } + + proxy->max_bitmap = bitmap_size; + return true; +} + +static struct cap_entry caps_migr[] = { + { VFIO_USER_CAP_PGSIZE, check_migr_pgsize }, + { VFIO_USER_CAP_MAX_BITMAP, check_bitmap }, + { NULL } +}; + +static bool check_max_fds(VFIOUserProxy *proxy, QObject *qobj, Error **errp) +{ + QNum *qn = qobject_to(QNum, qobj); + uint64_t max_send_fds; + + if (qn == NULL || !qnum_get_try_uint(qn, &max_send_fds) || + max_send_fds > VFIO_USER_MAX_MAX_FDS) { + error_setg(errp, "malformed %s", VFIO_USER_CAP_MAX_FDS); + return false; + } + proxy->max_send_fds = max_send_fds; + return true; +} + +static bool check_max_xfer(VFIOUserProxy 
*proxy, QObject *qobj, Error **errp) +{ + QNum *qn = qobject_to(QNum, qobj); + uint64_t max_xfer_size; + + if (qn == NULL || !qnum_get_try_uint(qn, &max_xfer_size) || + max_xfer_size > VFIO_USER_MAX_MAX_XFER) { + error_setg(errp, "malformed %s", VFIO_USER_CAP_MAX_XFER); + return false; + } + proxy->max_xfer_size = max_xfer_size; + return true; +} + +static bool check_pgsizes(VFIOUserProxy *proxy, QObject *qobj, Error **errp) +{ + QNum *qn = qobject_to(QNum, qobj); + uint64_t pgsizes; + + if (qn == NULL || !qnum_get_try_uint(qn, &pgsizes)) { + error_setg(errp, "malformed %s", VFIO_USER_CAP_PGSIZES); + return false; + } + + /* must be larger than default */ + if (pgsizes & (VFIO_USER_DEF_PGSIZE - 1)) { + error_setg(errp, "pgsize 0x%"PRIx64" too small", pgsizes); + return false; + } + + proxy->dma_pgsizes = pgsizes; + return true; +} + +static bool check_max_dma(VFIOUserProxy *proxy, QObject *qobj, Error **errp) +{ + QNum *qn = qobject_to(QNum, qobj); + uint64_t max_dma; + + if (qn == NULL || !qnum_get_try_uint(qn, &max_dma)) { + error_setg(errp, "malformed %s", VFIO_USER_CAP_MAP_MAX); + return false; + } + + /* can only lower it */ + if (max_dma > VFIO_USER_DEF_MAP_MAX) { + error_setg(errp, "%s too large", VFIO_USER_CAP_MAP_MAX); + return false; + } + + proxy->max_dma = max_dma; + return true; +} + +static bool check_migr(VFIOUserProxy *proxy, QObject *qobj, Error **errp) +{ + QDict *qdict = qobject_to(QDict, qobj); + + if (qdict == NULL) { + error_setg(errp, "malformed %s", VFIO_USER_CAP_MAX_FDS); + return true; + } + return caps_parse(proxy, qdict, caps_migr, errp); +} + +static struct cap_entry caps_cap[] = { + { VFIO_USER_CAP_MAX_FDS, check_max_fds }, + { VFIO_USER_CAP_MAX_XFER, check_max_xfer }, + { VFIO_USER_CAP_PGSIZES, check_pgsizes }, + { VFIO_USER_CAP_MAP_MAX, check_max_dma }, + { VFIO_USER_CAP_MIGR, check_migr }, + { NULL } +}; + +static bool check_cap(VFIOUserProxy *proxy, QObject *qobj, Error **errp) +{ + QDict *qdict = qobject_to(QDict, qobj); + + if (qdict == NULL) { + error_setg(errp, "malformed %s", VFIO_USER_CAP); + return false; + } + return caps_parse(proxy, qdict, caps_cap, errp); +} + +static struct cap_entry ver_0_0[] = { + { VFIO_USER_CAP, check_cap }, + { NULL } +}; + +static bool caps_check(VFIOUserProxy *proxy, int minor, const char *caps, + Error **errp) +{ + QObject *qobj; + QDict *qdict; + bool ret; + + qobj = qobject_from_json(caps, NULL); + if (qobj == NULL) { + error_setg(errp, "malformed capabilities %s", caps); + return false; + } + qdict = qobject_to(QDict, qobj); + if (qdict == NULL) { + error_setg(errp, "capabilities %s not an object", caps); + qobject_unref(qobj); + return false; + } + ret = caps_parse(proxy, qdict, ver_0_0, errp); + + qobject_unref(qobj); + return ret; +} + +static GString *caps_json(void) +{ + QDict *dict = qdict_new(); + QDict *capdict = qdict_new(); + QDict *migdict = qdict_new(); + GString *str; + + qdict_put_int(migdict, VFIO_USER_CAP_PGSIZE, VFIO_USER_DEF_PGSIZE); + qdict_put_int(migdict, VFIO_USER_CAP_MAX_BITMAP, VFIO_USER_DEF_MAX_BITMAP); + qdict_put_obj(capdict, VFIO_USER_CAP_MIGR, QOBJECT(migdict)); + + qdict_put_int(capdict, VFIO_USER_CAP_MAX_FDS, VFIO_USER_MAX_MAX_FDS); + qdict_put_int(capdict, VFIO_USER_CAP_MAX_XFER, VFIO_USER_DEF_MAX_XFER); + qdict_put_int(capdict, VFIO_USER_CAP_PGSIZES, VFIO_USER_DEF_PGSIZE); + qdict_put_int(capdict, VFIO_USER_CAP_MAP_MAX, VFIO_USER_DEF_MAP_MAX); + + qdict_put_obj(dict, VFIO_USER_CAP, QOBJECT(capdict)); + + str = qobject_to_json(QOBJECT(dict)); + qobject_unref(dict); + return str; +} + 
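+/* + * Illustrative example: with the default values defined in user-protocol.h, caps_json() yields a capability string along the lines of {"capabilities": {"migration": {"pgsize": 4096, "max_bitmap_size": 268435456}, "max_msg_fds": 16, "max_data_xfer_size": 1048576, "pgsizes": 4096, "max_dma_maps": 65535}} (exact key order and spacing depend on the QDict/JSON writer). + */ + 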
+bool vfio_user_validate_version(VFIOUserProxy *proxy, Error **errp) +{ + g_autofree VFIOUserVersion *msgp = NULL; + GString *caps; + char *reply; + int size, caplen; + + caps = caps_json(); + caplen = caps->len + 1; + size = sizeof(*msgp) + caplen; + msgp = g_malloc0(size); + + vfio_user_request_msg(&msgp->hdr, VFIO_USER_VERSION, size, 0); + msgp->major = VFIO_USER_MAJOR_VER; + msgp->minor = VFIO_USER_MINOR_VER; + memcpy(&msgp->capabilities, caps->str, caplen); + g_string_free(caps, true); + trace_vfio_user_version(msgp->major, msgp->minor, msgp->capabilities); + + vfio_user_send_wait(proxy, &msgp->hdr, NULL, 0); + if (msgp->hdr.flags & VFIO_USER_ERROR) { + error_setg_errno(errp, msgp->hdr.error_reply, "version reply"); + return false; + } + + if (msgp->major != VFIO_USER_MAJOR_VER || + msgp->minor > VFIO_USER_MINOR_VER) { + error_setg(errp, "incompatible server version"); + return false; + } + + reply = msgp->capabilities; + if (reply[msgp->hdr.size - sizeof(*msgp) - 1] != '\0') { + error_setg(errp, "corrupt version reply"); + return false; + } + + if (!caps_check(proxy, msgp->minor, reply, errp)) { + return false; + } + + trace_vfio_user_version(msgp->major, msgp->minor, msgp->capabilities); + return true; +} diff --git a/hw/vfio/user.h b/hw/vfio/user.h index 30cf35d3e4..9c3b279839 100644 --- a/hw/vfio/user.h +++ b/hw/vfio/user.h @@ -35,6 +35,7 @@ typedef struct VFIOUserMsg { uint32_t id; QemuCond cv; bool complete; + bool pending; enum msg_type type; } VFIOUserMsg; @@ -54,6 +55,12 @@ typedef struct VFIOUserProxy { struct QIOChannel *ioc; void (*request)(void *opaque, VFIOUserMsg *msg); void *req_arg; + uint64_t max_xfer_size; + uint64_t max_send_fds; + uint64_t max_dma; + uint64_t dma_pgsizes; + uint64_t max_bitmap; + uint64_t migr_pgsize; int flags; QemuCond close_cv; AioContext *ctx; @@ -76,6 +83,7 @@ typedef struct VFIOUserProxy { /* VFIOProxy flags */ #define VFIO_PROXY_CLIENT 0x1 +#define VFIO_PROXY_FORCE_QUEUED 0x4 typedef struct VFIODevice VFIODevice; @@ -84,5 +92,6 @@ void vfio_user_disconnect(VFIOUserProxy *proxy); void vfio_user_set_handler(VFIODevice *vbasedev, void (*handler)(void *opaque, VFIOUserMsg *msg), void *reqarg); +bool vfio_user_validate_version(VFIOUserProxy *proxy, Error **errp); #endif /* VFIO_USER_H */ From patchwork Wed May 29 16:23:07 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: John Levon X-Patchwork-Id: 13679232 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from lists.gnu.org (lists.gnu.org [209.51.188.17]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.lore.kernel.org (Postfix) with ESMTPS id 00314C25B75 for ; Wed, 29 May 2024 16:27:23 +0000 (UTC) Received: from localhost ([::1] helo=lists1p.gnu.org) by lists.gnu.org with esmtp (Exim 4.90_1) (envelope-from ) id 1sCM70-0001FL-EO; Wed, 29 May 2024 12:25:14 -0400 Received: from eggs.gnu.org ([2001:470:142:3::10]) by lists.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM6p-000161-UP for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:05 -0400 Received: from ssh.movementarian.org ([139.162.205.133] helo=movementarian.org) by eggs.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM6l-0006Kn-Ad for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:03 -0400 Received: from movement by movementarian.org 
with local (Exim 4.95) (envelope-from ) id 1sCM6j-006CPg-ED; Wed, 29 May 2024 17:24:57 +0100 From: John Levon To: qemu-devel@nongnu.org Cc: alex.williamson@redhat.com, clg@redhat.com, jag.raman@oracle.com, thanos.makatos@nutanix.com, John Johnson , Elena Ufimtseva , John Levon Subject: [PATCH 14/26] vfio-user: get device info Date: Wed, 29 May 2024 17:23:07 +0100 Message-Id: <20240529162319.1476680-15-levon@movementarian.org> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20240529162319.1476680-1-levon@movementarian.org> References: <20240529162319.1476680-1-levon@movementarian.org> MIME-Version: 1.0 Received-SPF: pass client-ip=139.162.205.133; envelope-from=movement@movementarian.org; helo=movementarian.org X-Spam_score_int: -18 X-Spam_score: -1.9 X-Spam_bar: - X-Spam_report: (-1.9 / 5.0 requ) BAYES_00=-1.9, SPF_HELO_PASS=-0.001, SPF_PASS=-0.001, T_SCC_BODY_TEXT_LINE=-0.01 autolearn=ham autolearn_force=no X-Spam_action: no action X-BeenThere: qemu-devel@nongnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org Sender: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org From: Jagannathan Raman Originally-by: John Johnson Signed-off-by: Elena Ufimtseva Signed-off-by: Jagannathan Raman Signed-off-by: John Levon --- hw/vfio/trace-events | 1 + hw/vfio/user-container.c | 10 +++++++++- hw/vfio/user-protocol.h | 12 ++++++++++++ hw/vfio/user.c | 34 ++++++++++++++++++++++++++++++++++ hw/vfio/user.h | 1 + 5 files changed, 57 insertions(+), 1 deletion(-) diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 68fe6f5689..0f2e338194 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -183,3 +183,4 @@ vfio_user_recv_read(uint16_t id, int read) " id 0x%x read 0x%x" vfio_user_recv_request(uint16_t cmd) " command 0x%x" vfio_user_send_write(uint16_t id, int wrote) " id 0x%x wrote 0x%x" vfio_user_version(uint16_t major, uint16_t minor, const char *caps) " major %d minor %d caps: %s" +vfio_user_get_info(uint32_t nregions, uint32_t nirqs) " #regions %d #irqs %d" diff --git a/hw/vfio/user-container.c b/hw/vfio/user-container.c index 1fd42e9bfd..f1218560a4 100644 --- a/hw/vfio/user-container.c +++ b/hw/vfio/user-container.c @@ -12,6 +12,7 @@ #include #include "hw/vfio/vfio-common.h" +#include "hw/vfio/user.h" #include "exec/address-spaces.h" #include "exec/memory.h" #include "exec/ram_addr.h" @@ -141,7 +142,14 @@ static void vfio_disconnect_user_container(VFIOUserContainer *container) static bool vfio_user_get_device(VFIOUserContainer *container, VFIODevice *vbasedev, Error **errp) { - struct vfio_device_info info = { 0 }; + struct vfio_device_info info = { .argsz = sizeof(info) }; + int ret; + + ret = vfio_user_get_info(vbasedev->proxy, &info); + if (ret) { + error_setg_errno(errp, -ret, "get info failure"); + return ret; + } vbasedev->fd = -1; diff --git a/hw/vfio/user-protocol.h b/hw/vfio/user-protocol.h index 5de5b2030c..5f9ef1768f 100644 --- a/hw/vfio/user-protocol.h +++ b/hw/vfio/user-protocol.h @@ -113,4 +113,16 @@ typedef struct { */ #define VFIO_USER_DEF_MAX_BITMAP (256 * 1024 * 1024) +/* + * VFIO_USER_DEVICE_GET_INFO + * imported from struct vfio_device_info + */ +typedef struct { + VFIOUserHdr hdr; + uint32_t argsz; + uint32_t flags; + uint32_t num_regions; + uint32_t num_irqs; +} VFIOUserDeviceInfo; + #endif /* VFIO_USER_PROTOCOL_H */ diff --git a/hw/vfio/user.c b/hw/vfio/user.c index aa561fdba3..645b927f97 100644 --- 
a/hw/vfio/user.c +++ b/hw/vfio/user.c @@ -32,6 +32,13 @@ #include "user.h" #include "trace.h" +/* + * These are to defend against a malign server trying + * to force us to run out of memory. + */ +#define VFIO_USER_MAX_REGIONS 100 +#define VFIO_USER_MAX_IRQS 50 + static int wait_time = 5000; /* wait up to 5 sec for busy servers */ static IOThread *vfio_user_iothread; @@ -1088,3 +1095,30 @@ bool vfio_user_validate_version(VFIOUserProxy *proxy, Error **errp) trace_vfio_user_version(msgp->major, msgp->minor, msgp->capabilities); return true; } + +int vfio_user_get_info(VFIOUserProxy *proxy, struct vfio_device_info *info) +{ + VFIOUserDeviceInfo msg; + uint32_t argsz = sizeof(msg) - sizeof(msg.hdr); + + memset(&msg, 0, sizeof(msg)); + vfio_user_request_msg(&msg.hdr, VFIO_USER_DEVICE_GET_INFO, sizeof(msg), 0); + msg.argsz = argsz; + + vfio_user_send_wait(proxy, &msg.hdr, NULL, 0); + if (msg.hdr.flags & VFIO_USER_ERROR) { + return -msg.hdr.error_reply; + } + trace_vfio_user_get_info(msg.num_regions, msg.num_irqs); + + memcpy(info, &msg.argsz, argsz); + + /* defend against a malicious server */ + if (info->num_regions > VFIO_USER_MAX_REGIONS || + info->num_irqs > VFIO_USER_MAX_IRQS) { + error_printf("%s: invalid reply\n", __func__); + return -EINVAL; + } + + return 0; +} diff --git a/hw/vfio/user.h b/hw/vfio/user.h index 9c3b279839..18a5a40073 100644 --- a/hw/vfio/user.h +++ b/hw/vfio/user.h @@ -93,5 +93,6 @@ void vfio_user_set_handler(VFIODevice *vbasedev, void (*handler)(void *opaque, VFIOUserMsg *msg), void *reqarg); bool vfio_user_validate_version(VFIOUserProxy *proxy, Error **errp); +int vfio_user_get_info(VFIOUserProxy *proxy, struct vfio_device_info *info); #endif /* VFIO_USER_H */ From patchwork Wed May 29 16:23:08 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: John Levon X-Patchwork-Id: 13679238 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from lists.gnu.org (lists.gnu.org [209.51.188.17]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.lore.kernel.org (Postfix) with ESMTPS id 5305FC25B75 for ; Wed, 29 May 2024 16:29:25 +0000 (UTC) Received: from localhost ([::1] helo=lists1p.gnu.org) by lists.gnu.org with esmtp (Exim 4.90_1) (envelope-from ) id 1sCM6v-00019A-PU; Wed, 29 May 2024 12:25:09 -0400 Received: from eggs.gnu.org ([2001:470:142:3::10]) by lists.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM6q-00016D-NJ for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:05 -0400 Received: from ssh.movementarian.org ([139.162.205.133] helo=movementarian.org) by eggs.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM6l-0006Kq-8F for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:04 -0400 Received: from movement by movementarian.org with local (Exim 4.95) (envelope-from ) id 1sCM6j-006CPl-H9; Wed, 29 May 2024 17:24:57 +0100 From: John Levon To: qemu-devel@nongnu.org Cc: alex.williamson@redhat.com, clg@redhat.com, jag.raman@oracle.com, thanos.makatos@nutanix.com, John Johnson , Elena Ufimtseva , John Levon Subject: [PATCH 15/26] vfio-user: get region info Date: Wed, 29 May 2024 17:23:08 +0100 Message-Id: <20240529162319.1476680-16-levon@movementarian.org> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20240529162319.1476680-1-levon@movementarian.org> References: 
<20240529162319.1476680-1-levon@movementarian.org> MIME-Version: 1.0 Received-SPF: pass client-ip=139.162.205.133; envelope-from=movement@movementarian.org; helo=movementarian.org X-Spam_score_int: -18 X-Spam_score: -1.9 X-Spam_bar: - X-Spam_report: (-1.9 / 5.0 requ) BAYES_00=-1.9, SPF_HELO_PASS=-0.001, SPF_PASS=-0.001, T_SCC_BODY_TEXT_LINE=-0.01 autolearn=ham autolearn_force=no X-Spam_action: no action X-BeenThere: qemu-devel@nongnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org Sender: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org From: Jagannathan Raman Add per-region FD to support mmap() of remote device regions Originally-by: John Johnson Signed-off-by: Elena Ufimtseva Signed-off-by: Jagannathan Raman Signed-off-by: John Levon --- hw/vfio/ap.c | 2 ++ hw/vfio/ccw.c | 2 ++ hw/vfio/container.c | 7 ++++ hw/vfio/helpers.c | 26 ++++++++++++-- hw/vfio/pci.c | 2 ++ hw/vfio/platform.c | 5 +++ hw/vfio/trace-events | 1 + hw/vfio/user-pci.c | 2 ++ hw/vfio/user-protocol.h | 14 ++++++++ hw/vfio/user.c | 68 +++++++++++++++++++++++++++++++++++ include/hw/vfio/vfio-common.h | 6 +++- 11 files changed, 132 insertions(+), 3 deletions(-) diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c index 23d700e67a..2736ad03d9 100644 --- a/hw/vfio/ap.c +++ b/hw/vfio/ap.c @@ -162,6 +162,8 @@ static void vfio_ap_realize(DeviceState *dev, Error **errp) return; } + vbasedev->use_regfds = false; + if (!vfio_attach_device(vbasedev->name, vbasedev, &address_space_memory, errp)) { goto error; diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c index b4139c8aef..34cba01a68 100644 --- a/hw/vfio/ccw.c +++ b/hw/vfio/ccw.c @@ -587,6 +587,8 @@ static void vfio_ccw_realize(DeviceState *dev, Error **errp) return; } + vbasedev->use_regfds = false; + if (!vfio_attach_device(cdev->mdevid, vbasedev, &address_space_memory, errp)) { goto out_attach_dev_err; diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 4ff09e277f..7e1a1a010a 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -895,10 +895,17 @@ void vfio_put_base_device(VFIODevice *vbasedev) int i; for (i = 0; i < vbasedev->num_regions; i++) { + if (vbasedev->regfds != NULL && vbasedev->regfds[i] != -1) { + close(vbasedev->regfds[i]); + } g_free(vbasedev->regions[i]); } g_free(vbasedev->regions); vbasedev->regions = NULL; + if (vbasedev->regfds != NULL) { + g_free(vbasedev->regfds); + vbasedev->regfds = NULL; + } } if (!vbasedev->group) { diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c index 2cd8fbe70c..d0f1db30da 100644 --- a/hw/vfio/helpers.c +++ b/hw/vfio/helpers.c @@ -363,6 +363,12 @@ int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region, region->size = info->size; region->fd_offset = info->offset; region->nr = index; + if (vbasedev->regfds != NULL) { + region->fd = vbasedev->regfds[index]; + } else { + region->fd = vbasedev->fd; + } + if (region->size) { region->mem = g_new0(MemoryRegion, 1); @@ -539,12 +545,16 @@ int vfio_get_region_info(VFIODevice *vbasedev, int index, struct vfio_region_info **info) { size_t argsz = sizeof(struct vfio_region_info); + int fd = -1; int ret; /* create region cache */ if (vbasedev->regions == NULL) { vbasedev->regions = g_new0(struct vfio_region_info *, vbasedev->num_regions); + if (vbasedev->use_regfds) { + vbasedev->regfds = g_new0(int, vbasedev->num_regions); + } } /* check cache */ if (vbasedev->regions[index] != NULL) { @@ -558,22 +568,33 @@ int 
vfio_get_region_info(VFIODevice *vbasedev, int index, retry: (*info)->argsz = argsz; - ret = vbasedev->io->get_region_info(vbasedev, *info); + ret = vbasedev->io->get_region_info(vbasedev, *info, &fd); if (ret != 0) { g_free(*info); *info = NULL; + if (vbasedev->regfds != NULL) { + vbasedev->regfds[index] = -1; + } + return -errno; } if ((*info)->argsz > argsz) { argsz = (*info)->argsz; *info = g_realloc(*info, argsz); + if (fd != -1) { + close(fd); + fd = -1; + } goto retry; } /* fill cache */ vbasedev->regions[index] = *info; + if (vbasedev->regfds != NULL) { + vbasedev->regfds[index] = fd; + } return 0; } @@ -695,10 +716,11 @@ static int vfio_io_device_feature(VFIODevice *vbasedev, } static int vfio_io_get_region_info(VFIODevice *vbasedev, - struct vfio_region_info *info) + struct vfio_region_info *info, int *fd) { int ret; + *fd = -1; ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, info); return ret < 0 ? -errno : ret; diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 2e334c0c38..50a22b6986 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -3054,6 +3054,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) name = g_strdup(vbasedev->name); } + vbasedev->use_regfds = false; + if (!vfio_attach_device(name, vbasedev, pci_device_iommu_address_space(pdev), errp)) { goto error; diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c index 86ecd97fde..7885ccbb97 100644 --- a/hw/vfio/platform.c +++ b/hw/vfio/platform.c @@ -575,6 +575,11 @@ static void vfio_platform_realize(DeviceState *dev, Error **errp) VFIODevice *vbasedev = &vdev->vbasedev; int i; + vbasedev->type = VFIO_DEVICE_TYPE_PLATFORM; + vbasedev->dev = dev; + vbasedev->ops = &vfio_platform_ops; + vbasedev->use_regfds = false; + qemu_mutex_init(&vdev->intp_mutex); trace_vfio_platform_realize(vbasedev->sysfsdev ? 
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 0f2e338194..3bfb625ee3 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -184,3 +184,4 @@ vfio_user_recv_request(uint16_t cmd) " command 0x%x" vfio_user_send_write(uint16_t id, int wrote) " id 0x%x wrote 0x%x" vfio_user_version(uint16_t major, uint16_t minor, const char *caps) " major %d minor %d caps: %s" vfio_user_get_info(uint32_t nregions, uint32_t nirqs) " #regions %d #irqs %d" +vfio_user_get_region_info(uint32_t index, uint32_t flags, uint64_t size) " index %d flags 0x%x size 0x%"PRIx64 diff --git a/hw/vfio/user-pci.c b/hw/vfio/user-pci.c index fe98048aad..3c2d145812 100644 --- a/hw/vfio/user-pci.c +++ b/hw/vfio/user-pci.c @@ -112,6 +112,8 @@ static void vfio_user_pci_realize(PCIDevice *pdev, Error **errp) vbasedev->ops = &vfio_user_pci_ops; vbasedev->type = VFIO_DEVICE_TYPE_PCI; vbasedev->dev = DEVICE(vdev); + vbasedev->io = &vfio_dev_io_sock; + vbasedev->use_regfds = true; as = pci_device_iommu_address_space(pdev); if (!vfio_attach_device_by_iommu_type(TYPE_VFIO_IOMMU_USER, diff --git a/hw/vfio/user-protocol.h b/hw/vfio/user-protocol.h index 5f9ef1768f..6f70a48905 100644 --- a/hw/vfio/user-protocol.h +++ b/hw/vfio/user-protocol.h @@ -125,4 +125,18 @@ typedef struct { uint32_t num_irqs; } VFIOUserDeviceInfo; +/* + * VFIO_USER_DEVICE_GET_REGION_INFO + * imported from struct vfio_region_info + */ +typedef struct { + VFIOUserHdr hdr; + uint32_t argsz; + uint32_t flags; + uint32_t index; + uint32_t cap_offset; + uint64_t size; + uint64_t offset; +} VFIOUserRegionInfo; + #endif /* VFIO_USER_PROTOCOL_H */ diff --git a/hw/vfio/user.c b/hw/vfio/user.c index 645b927f97..e9a89df82a 100644 --- a/hw/vfio/user.c +++ b/hw/vfio/user.c @@ -1122,3 +1122,71 @@ int vfio_user_get_info(VFIOUserProxy *proxy, struct vfio_device_info *info) return 0; } + +static int vfio_user_get_region_info(VFIOUserProxy *proxy, + struct vfio_region_info *info, + VFIOUserFDs *fds) +{ + g_autofree VFIOUserRegionInfo *msgp = NULL; + uint32_t size; + + /* data returned can be larger than vfio_region_info */ + if (info->argsz < sizeof(*info)) { + error_printf("vfio_user_get_region_info argsz too small\n"); + return -E2BIG; + } + if (fds != NULL && fds->send_fds != 0) { + error_printf("vfio_user_get_region_info can't send FDs\n"); + return -EINVAL; + } + + size = info->argsz + sizeof(VFIOUserHdr); + msgp = g_malloc0(size); + + vfio_user_request_msg(&msgp->hdr, VFIO_USER_DEVICE_GET_REGION_INFO, + sizeof(*msgp), 0); + msgp->argsz = info->argsz; + msgp->index = info->index; + + vfio_user_send_wait(proxy, &msgp->hdr, fds, size); + if (msgp->hdr.flags & VFIO_USER_ERROR) { + return -msgp->hdr.error_reply; + } + trace_vfio_user_get_region_info(msgp->index, msgp->flags, msgp->size); + + memcpy(info, &msgp->argsz, info->argsz); + return 0; +} + + +/* + * Socket-based io_ops + */ + +static int vfio_user_io_get_region_info(VFIODevice *vbasedev, + struct vfio_region_info *info, + int *fd) +{ + int ret; + VFIOUserFDs fds = { 0, 1, fd}; + + ret = vfio_user_get_region_info(vbasedev->proxy, info, &fds); + if (ret) { + return ret; + } + + if (info->index > vbasedev->num_regions) { + return -EINVAL; + } + /* cap_offset in valid area */ + if ((info->flags & VFIO_REGION_INFO_FLAG_CAPS) && + (info->cap_offset < sizeof(*info) || info->cap_offset > info->argsz)) { + return -EINVAL; + } + + return 0; +} + +VFIODeviceIO vfio_dev_io_sock = { + .get_region_info = vfio_user_io_get_region_info, +}; diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 
6bbe0218e9..85c3fec3b9 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -57,6 +57,7 @@ typedef struct VFIORegion { uint32_t nr_mmaps; VFIOMmap *mmaps; uint8_t nr; /* cache the region number for debug */ + int fd; /* fd to mmap() region */ } VFIORegion; typedef struct VFIOMigration { @@ -126,6 +127,7 @@ typedef struct VFIODevice { bool ram_block_discard_allowed; OnOffAuto enable_migration; bool migration_events; + bool use_regfds; VFIODeviceOps *ops; VFIODeviceIO *io; unsigned int num_irqs; @@ -140,6 +142,7 @@ typedef struct VFIODevice { IOMMUFDBackend *iommufd; VFIOUserProxy *proxy; struct vfio_region_info **regions; + int *regfds; } VFIODevice; struct VFIODeviceOps { @@ -184,7 +187,7 @@ struct VFIODeviceOps { struct VFIODeviceIO { int (*device_feature)(VFIODevice *vdev, struct vfio_device_feature *); int (*get_region_info)(VFIODevice *vdev, - struct vfio_region_info *info); + struct vfio_region_info *info, int *fd); int (*get_irq_info)(VFIODevice *vdev, struct vfio_irq_info *irq); int (*set_irqs)(VFIODevice *vdev, struct vfio_irq_set *irqs); int (*region_read)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t size, @@ -194,6 +197,7 @@ struct VFIODeviceIO { }; extern VFIODeviceIO vfio_dev_io_ioctl; +extern VFIODeviceIO vfio_dev_io_sock; #endif /* CONFIG_LINUX */ From patchwork Wed May 29 16:23:09 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: John Levon X-Patchwork-Id: 13679205 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from lists.gnu.org (lists.gnu.org [209.51.188.17]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.lore.kernel.org (Postfix) with ESMTPS id B2D2FC25B7C for ; Wed, 29 May 2024 16:26:02 +0000 (UTC) Received: from localhost ([::1] helo=lists1p.gnu.org) by lists.gnu.org with esmtp (Exim 4.90_1) (envelope-from ) id 1sCM7S-0001cy-L0; Wed, 29 May 2024 12:25:42 -0400 Received: from eggs.gnu.org ([2001:470:142:3::10]) by lists.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM6y-0001A9-1r for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:12 -0400 Received: from ssh.movementarian.org ([139.162.205.133] helo=movementarian.org) by eggs.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM6v-0006Kx-Dj for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:11 -0400 Received: from movement by movementarian.org with local (Exim 4.95) (envelope-from ) id 1sCM6j-006CPq-JC; Wed, 29 May 2024 17:24:57 +0100 From: John Levon To: qemu-devel@nongnu.org Cc: alex.williamson@redhat.com, clg@redhat.com, jag.raman@oracle.com, thanos.makatos@nutanix.com, John Johnson , Elena Ufimtseva , John Levon Subject: [PATCH 16/26] vfio-user: region read/write Date: Wed, 29 May 2024 17:23:09 +0100 Message-Id: <20240529162319.1476680-17-levon@movementarian.org> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20240529162319.1476680-1-levon@movementarian.org> References: <20240529162319.1476680-1-levon@movementarian.org> MIME-Version: 1.0 Received-SPF: pass client-ip=139.162.205.133; envelope-from=movement@movementarian.org; helo=movementarian.org X-Spam_score_int: -18 X-Spam_score: -1.9 X-Spam_bar: - X-Spam_report: (-1.9 / 5.0 requ) BAYES_00=-1.9, SPF_HELO_PASS=-0.001, SPF_PASS=-0.001, T_SCC_BODY_TEXT_LINE=-0.01 autolearn=ham autolearn_force=no X-Spam_action: no action 
X-BeenThere: qemu-devel@nongnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org Sender: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org From: Jagannathan Raman Add support for posted writes on remote devices Originally-by: John Johnson Signed-off-by: Elena Ufimtseva Signed-off-by: Jagannathan Raman Signed-off-by: John Levon --- hw/vfio/helpers.c | 20 ++++-- hw/vfio/pci.c | 5 +- hw/vfio/trace-events | 1 + hw/vfio/user-pci.c | 5 ++ hw/vfio/user-protocol.h | 12 ++++ hw/vfio/user.c | 120 ++++++++++++++++++++++++++++++++++ hw/vfio/user.h | 1 + include/hw/vfio/vfio-common.h | 3 +- 8 files changed, 158 insertions(+), 9 deletions(-) diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c index d0f1db30da..b2a5d7d1ba 100644 --- a/hw/vfio/helpers.c +++ b/hw/vfio/helpers.c @@ -182,12 +182,15 @@ void vfio_region_write(void *opaque, hwaddr addr, break; } - ret = vbasedev->io->region_write(vbasedev, region->nr, addr, size, &buf); + ret = vbasedev->io->region_write(vbasedev, region->nr, addr, size, &buf, + region->post_wr); if (ret != size) { + const char *errmsg = ret < 0 ? strerror(-ret) : "short write"; + error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64 - ",%d) failed: %m", + ",%d) failed: %s", __func__, vbasedev->name, region->nr, - addr, data, size); + addr, data, size, errmsg); } trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size); @@ -219,9 +222,11 @@ uint64_t vfio_region_read(void *opaque, ret = vbasedev->io->region_read(vbasedev, region->nr, addr, size, &buf); if (ret != size) { - error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m", + const char *errmsg = ret < 0 ? strerror(-ret) : "short read"; + + error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %s", __func__, vbasedev->name, region->nr, - addr, size); + addr, size, errmsg); return (uint64_t)-1; } switch (size) { @@ -363,13 +368,14 @@ int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region, region->size = info->size; region->fd_offset = info->offset; region->nr = index; + region->post_wr = false; + if (vbasedev->regfds != NULL) { region->fd = vbasedev->regfds[index]; } else { region->fd = vbasedev->fd; } - if (region->size) { region->mem = g_new0(MemoryRegion, 1); memory_region_init_io(region->mem, obj, &vfio_region_ops, @@ -757,7 +763,7 @@ static int vfio_io_region_read(VFIODevice *vbasedev, uint8_t index, off_t off, } static int vfio_io_region_write(VFIODevice *vbasedev, uint8_t index, off_t off, - uint32_t size, void *data) + uint32_t size, void *data, bool post) { struct vfio_region_info *info = vbasedev->regions[index]; int ret; diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 50a22b6986..c69716f2df 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -51,7 +51,7 @@ (off), (size), (data))) #define VDEV_CONFIG_WRITE(vbasedev, off, size, data) \ ((vbasedev)->io->region_write((vbasedev), VFIO_PCI_CONFIG_REGION_INDEX, \ - (off), (size), (data))) + (off), (size), (data), false)) #define TYPE_VFIO_PCI_NOHOTPLUG "vfio-pci-nohotplug" @@ -1780,6 +1780,9 @@ static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr) bar->type = pci_bar & (bar->ioport ? 
~PCI_BASE_ADDRESS_IO_MASK : ~PCI_BASE_ADDRESS_MEM_MASK); bar->size = bar->region.size; + + /* IO regions are sync, memory can be async */ + bar->region.post_wr = (bar->ioport == 0); } static void vfio_bars_prepare(VFIOPCIDevice *vdev) diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 3bfb625ee3..a65d530557 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -185,3 +185,4 @@ vfio_user_send_write(uint16_t id, int wrote) " id 0x%x wrote 0x%x" vfio_user_version(uint16_t major, uint16_t minor, const char *caps) " major %d minor %d caps: %s" vfio_user_get_info(uint32_t nregions, uint32_t nirqs) " #regions %d #irqs %d" vfio_user_get_region_info(uint32_t index, uint32_t flags, uint64_t size) " index %d flags 0x%x size 0x%"PRIx64 +vfio_user_region_rw(uint32_t region, uint64_t off, uint32_t count) " region %d offset 0x%"PRIx64" count %d" diff --git a/hw/vfio/user-pci.c b/hw/vfio/user-pci.c index 3c2d145812..54b19b68d0 100644 --- a/hw/vfio/user-pci.c +++ b/hw/vfio/user-pci.c @@ -41,6 +41,7 @@ struct VFIOUserPCIDevice { VFIOPCIDevice device; char *sock_name; bool send_queued; /* all sends are queued */ + bool no_post; /* all regions write are sync */ }; /* @@ -103,6 +104,9 @@ static void vfio_user_pci_realize(PCIDevice *pdev, Error **errp) if (udev->send_queued) { proxy->flags |= VFIO_PROXY_FORCE_QUEUED; } + if (udev->no_post) { + proxy->flags |= VFIO_PROXY_NO_POST; + } if (!vfio_user_validate_version(proxy, errp)) { goto error; @@ -169,6 +173,7 @@ static void vfio_user_instance_finalize(Object *obj) static Property vfio_user_pci_dev_properties[] = { DEFINE_PROP_STRING("socket", VFIOUserPCIDevice, sock_name), DEFINE_PROP_BOOL("x-send-queued", VFIOUserPCIDevice, send_queued, false), + DEFINE_PROP_BOOL("x-no-posted-writes", VFIOUserPCIDevice, no_post, false), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/vfio/user-protocol.h b/hw/vfio/user-protocol.h index 6f70a48905..6987435e96 100644 --- a/hw/vfio/user-protocol.h +++ b/hw/vfio/user-protocol.h @@ -139,4 +139,16 @@ typedef struct { uint64_t offset; } VFIOUserRegionInfo; +/* + * VFIO_USER_REGION_READ + * VFIO_USER_REGION_WRITE + */ +typedef struct { + VFIOUserHdr hdr; + uint64_t offset; + uint32_t region; + uint32_t count; + char data[]; +} VFIOUserRegionRW; + #endif /* VFIO_USER_PROTOCOL_H */ diff --git a/hw/vfio/user.c b/hw/vfio/user.c index e9a89df82a..9bb1ee8880 100644 --- a/hw/vfio/user.c +++ b/hw/vfio/user.c @@ -57,6 +57,8 @@ static void vfio_user_cb(void *opaque); static void vfio_user_request(void *opaque); static int vfio_user_send_queued(VFIOUserProxy *proxy, VFIOUserMsg *msg); +static void vfio_user_send_async(VFIOUserProxy *proxy, VFIOUserHdr *hdr, + VFIOUserFDs *fds); static void vfio_user_send_wait(VFIOUserProxy *proxy, VFIOUserHdr *hdr, VFIOUserFDs *fds, int rsize); static void vfio_user_request_msg(VFIOUserHdr *hdr, uint16_t cmd, @@ -628,6 +630,33 @@ static int vfio_user_send_queued(VFIOUserProxy *proxy, VFIOUserMsg *msg) return 0; } +/* + * async send - msg can be queued, but will be freed when sent + */ +static void vfio_user_send_async(VFIOUserProxy *proxy, VFIOUserHdr *hdr, + VFIOUserFDs *fds) +{ + VFIOUserMsg *msg; + int ret; + + if (!(hdr->flags & (VFIO_USER_NO_REPLY | VFIO_USER_REPLY))) { + error_printf("vfio_user_send_async on sync message\n"); + return; + } + + QEMU_LOCK_GUARD(&proxy->lock); + + msg = vfio_user_getmsg(proxy, hdr, fds); + msg->id = hdr->id; + msg->rsize = 0; + msg->type = VFIO_MSG_ASYNC; + + ret = vfio_user_send_queued(proxy, msg); + if (ret < 0) { + vfio_user_recycle(proxy, msg); + } +} + 
static void vfio_user_send_wait(VFIOUserProxy *proxy, VFIOUserHdr *hdr, VFIOUserFDs *fds, int rsize) { @@ -1155,9 +1184,84 @@ static int vfio_user_get_region_info(VFIOUserProxy *proxy, trace_vfio_user_get_region_info(msgp->index, msgp->flags, msgp->size); memcpy(info, &msgp->argsz, info->argsz); + + /* read-after-write hazard if guest can directly access region */ + if (info->flags & VFIO_REGION_INFO_FLAG_MMAP) { + WITH_QEMU_LOCK_GUARD(&proxy->lock) { + proxy->flags |= VFIO_PROXY_NO_POST; + } + } + return 0; } +static int vfio_user_region_read(VFIOUserProxy *proxy, uint8_t index, + off_t offset, uint32_t count, void *data) +{ + g_autofree VFIOUserRegionRW *msgp = NULL; + int size = sizeof(*msgp) + count; + + if (count > proxy->max_xfer_size) { + return -EINVAL; + } + + msgp = g_malloc0(size); + vfio_user_request_msg(&msgp->hdr, VFIO_USER_REGION_READ, sizeof(*msgp), 0); + msgp->offset = offset; + msgp->region = index; + msgp->count = count; + trace_vfio_user_region_rw(msgp->region, msgp->offset, msgp->count); + + vfio_user_send_wait(proxy, &msgp->hdr, NULL, size); + if (msgp->hdr.flags & VFIO_USER_ERROR) { + return -msgp->hdr.error_reply; + } else if (msgp->count > count) { + return -E2BIG; + } else { + memcpy(data, &msgp->data, msgp->count); + } + + return msgp->count; +} + +static int vfio_user_region_write(VFIOUserProxy *proxy, uint8_t index, + off_t offset, uint32_t count, void *data, + bool post) +{ + VFIOUserRegionRW *msgp = NULL; + int flags = post ? VFIO_USER_NO_REPLY : 0; + int size = sizeof(*msgp) + count; + int ret; + + if (count > proxy->max_xfer_size) { + return -EINVAL; + } + + msgp = g_malloc0(size); + vfio_user_request_msg(&msgp->hdr, VFIO_USER_REGION_WRITE, size, flags); + msgp->offset = offset; + msgp->region = index; + msgp->count = count; + memcpy(&msgp->data, data, count); + trace_vfio_user_region_rw(msgp->region, msgp->offset, msgp->count); + + /* async send will free msg after it's sent */ + if (post && !(proxy->flags & VFIO_PROXY_NO_POST)) { + vfio_user_send_async(proxy, &msgp->hdr, NULL); + return count; + } + + vfio_user_send_wait(proxy, &msgp->hdr, NULL, 0); + if (msgp->hdr.flags & VFIO_USER_ERROR) { + ret = -msgp->hdr.error_reply; + } else { + ret = count; + } + + g_free(msgp); + return ret; +} + /* * Socket-based io_ops @@ -1187,6 +1291,22 @@ static int vfio_user_io_get_region_info(VFIODevice *vbasedev, return 0; } +static int vfio_user_io_region_read(VFIODevice *vbasedev, uint8_t index, + off_t off, uint32_t size, void *data) +{ + return vfio_user_region_read(vbasedev->proxy, index, off, size, data); +} + +static int vfio_user_io_region_write(VFIODevice *vbasedev, uint8_t index, + off_t off, unsigned size, void *data, + bool post) +{ + return vfio_user_region_write(vbasedev->proxy, index, off, size, data, + post); +} + VFIODeviceIO vfio_dev_io_sock = { .get_region_info = vfio_user_io_get_region_info, + .region_read = vfio_user_io_region_read, + .region_write = vfio_user_io_region_write, }; diff --git a/hw/vfio/user.h b/hw/vfio/user.h index 18a5a40073..1f99a976d6 100644 --- a/hw/vfio/user.h +++ b/hw/vfio/user.h @@ -84,6 +84,7 @@ typedef struct VFIOUserProxy { /* VFIOProxy flags */ #define VFIO_PROXY_CLIENT 0x1 #define VFIO_PROXY_FORCE_QUEUED 0x4 +#define VFIO_PROXY_NO_POST 0x8 typedef struct VFIODevice VFIODevice; diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 85c3fec3b9..617891c4fe 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -58,6 +58,7 @@ typedef struct VFIORegion { VFIOMmap *mmaps; 
uint8_t nr; /* cache the region number for debug */ int fd; /* fd to mmap() region */ + bool post_wr; /* writes can be posted */ } VFIORegion; typedef struct VFIOMigration { @@ -193,7 +194,7 @@ struct VFIODeviceIO { int (*region_read)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t size, void *data); int (*region_write)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t size, - void *data); + void *data, bool post); }; extern VFIODeviceIO vfio_dev_io_ioctl; From patchwork Wed May 29 16:23:10 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: John Levon X-Patchwork-Id: 13679244 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from lists.gnu.org (lists.gnu.org [209.51.188.17]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.lore.kernel.org (Postfix) with ESMTPS id 4CC9AC25B7C for ; Wed, 29 May 2024 16:30:37 +0000 (UTC) Received: from localhost ([::1] helo=lists1p.gnu.org) by lists.gnu.org with esmtp (Exim 4.90_1) (envelope-from ) id 1sCM7X-000217-Bw; Wed, 29 May 2024 12:25:47 -0400 Received: from eggs.gnu.org ([2001:470:142:3::10]) by lists.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM76-0001IY-Gc for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:23 -0400 Received: from ssh.movementarian.org ([139.162.205.133] helo=movementarian.org) by eggs.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM72-0006LQ-FR for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:19 -0400 Received: from movement by movementarian.org with local (Exim 4.95) (envelope-from ) id 1sCM6k-006CPv-EX; Wed, 29 May 2024 17:24:58 +0100 From: John Levon To: qemu-devel@nongnu.org Cc: alex.williamson@redhat.com, clg@redhat.com, jag.raman@oracle.com, thanos.makatos@nutanix.com, John Johnson , Elena Ufimtseva , John Levon Subject: [PATCH 17/26] vfio-user: pci_user_realize PCI setup Date: Wed, 29 May 2024 17:23:10 +0100 Message-Id: <20240529162319.1476680-18-levon@movementarian.org> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20240529162319.1476680-1-levon@movementarian.org> References: <20240529162319.1476680-1-levon@movementarian.org> MIME-Version: 1.0 Received-SPF: pass client-ip=139.162.205.133; envelope-from=movement@movementarian.org; helo=movementarian.org X-Spam_score_int: -18 X-Spam_score: -1.9 X-Spam_bar: - X-Spam_report: (-1.9 / 5.0 requ) BAYES_00=-1.9, SPF_HELO_PASS=-0.001, SPF_PASS=-0.001, T_SCC_BODY_TEXT_LINE=-0.01 autolearn=ham autolearn_force=no X-Spam_action: no action X-BeenThere: qemu-devel@nongnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org Sender: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org From: Jagannathan Raman PCI BARs are read from the remote device; PCI config space reads and writes are forwarded to the remote server. Originally-by: John Johnson Signed-off-by: Elena Ufimtseva Signed-off-by: Jagannathan Raman Signed-off-by: John Levon --- hw/vfio/pci.c | 249 ++++++++++++++++++++++++++------------------- hw/vfio/pci.h | 10 ++ hw/vfio/user-pci.c | 42 ++++++++ 3 files changed, 194 insertions(+), 107 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index c69716f2df..d1da64383e 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -1728,7 +1728,7 @@ static bool
vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp) return true; } -static void vfio_teardown_msi(VFIOPCIDevice *vdev) +void vfio_teardown_msi(VFIOPCIDevice *vdev) { msi_uninit(&vdev->pdev); @@ -1829,7 +1829,7 @@ static void vfio_bars_register(VFIOPCIDevice *vdev) } } -static void vfio_bars_exit(VFIOPCIDevice *vdev) +void vfio_bars_exit(VFIOPCIDevice *vdev) { int i; @@ -1849,7 +1849,7 @@ static void vfio_bars_exit(VFIOPCIDevice *vdev) } } -static void vfio_bars_finalize(VFIOPCIDevice *vdev) +void vfio_bars_finalize(VFIOPCIDevice *vdev) { int i; @@ -2417,7 +2417,7 @@ static void vfio_add_ext_cap(VFIOPCIDevice *vdev) return; } -static bool vfio_add_capabilities(VFIOPCIDevice *vdev, Error **errp) +bool vfio_add_capabilities(VFIOPCIDevice *vdev, Error **errp) { PCIDevice *pdev = &vdev->pdev; @@ -2766,7 +2766,7 @@ bool vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp) return true; } -static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) +bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) { VFIODevice *vbasedev = &vdev->vbasedev; struct vfio_region_info *reg_info = NULL; @@ -2884,7 +2884,7 @@ static void vfio_err_notifier_handler(void *opaque) * and continue after disabling error recovery support for the * device. */ -static void vfio_register_err_notifier(VFIOPCIDevice *vdev) +void vfio_register_err_notifier(VFIOPCIDevice *vdev) { Error *err = NULL; int32_t fd; @@ -2943,7 +2943,7 @@ static void vfio_req_notifier_handler(void *opaque) } } -static void vfio_register_req_notifier(VFIOPCIDevice *vdev) +void vfio_register_req_notifier(VFIOPCIDevice *vdev) { struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info), .index = VFIO_PCI_REQ_IRQ_INDEX }; @@ -2998,6 +2998,130 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev) vdev->req_enabled = false; } +bool vfio_pci_config_setup(VFIOPCIDevice *vdev, Error **errp) +{ + PCIDevice *pdev = &vdev->pdev; + VFIODevice *vbasedev = &vdev->vbasedev; + + /* vfio emulates a lot for us, but some bits need extra love */ + vdev->emulated_config_bits = g_malloc0(vdev->config_size); + + /* QEMU can choose to expose the ROM or not */ + memset(vdev->emulated_config_bits + PCI_ROM_ADDRESS, 0xff, 4); + /* QEMU can also add or extend BARs */ + memset(vdev->emulated_config_bits + PCI_BASE_ADDRESS_0, 0xff, 6 * 4); + + /* + * The PCI spec reserves vendor ID 0xffff as an invalid value. The + * device ID is managed by the vendor and need only be a 16-bit value. + * Allow any 16-bit value for subsystem so they can be hidden or changed. 
+ */ + if (vdev->vendor_id != PCI_ANY_ID) { + if (vdev->vendor_id >= 0xffff) { + error_setg(errp, "invalid PCI vendor ID provided"); + return false; + } + vfio_add_emulated_word(vdev, PCI_VENDOR_ID, vdev->vendor_id, ~0); + trace_vfio_pci_emulated_vendor_id(vdev->vbasedev.name, vdev->vendor_id); + } else { + vdev->vendor_id = pci_get_word(pdev->config + PCI_VENDOR_ID); + } + + if (vdev->device_id != PCI_ANY_ID) { + if (vdev->device_id > 0xffff) { + error_setg(errp, "invalid PCI device ID provided"); + return false; + } + vfio_add_emulated_word(vdev, PCI_DEVICE_ID, vdev->device_id, ~0); + trace_vfio_pci_emulated_device_id(vbasedev->name, vdev->device_id); + } else { + vdev->device_id = pci_get_word(pdev->config + PCI_DEVICE_ID); + } + + if (vdev->sub_vendor_id != PCI_ANY_ID) { + if (vdev->sub_vendor_id > 0xffff) { + error_setg(errp, "invalid PCI subsystem vendor ID provided"); + return false; + } + vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_VENDOR_ID, + vdev->sub_vendor_id, ~0); + trace_vfio_pci_emulated_sub_vendor_id(vbasedev->name, + vdev->sub_vendor_id); + } + + if (vdev->sub_device_id != PCI_ANY_ID) { + if (vdev->sub_device_id > 0xffff) { + error_setg(errp, "invalid PCI subsystem device ID provided"); + return false; + } + vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_ID, vdev->sub_device_id, ~0); + trace_vfio_pci_emulated_sub_device_id(vbasedev->name, + vdev->sub_device_id); + } + + /* QEMU can change multi-function devices to single function, or reverse */ + vdev->emulated_config_bits[PCI_HEADER_TYPE] = + PCI_HEADER_TYPE_MULTI_FUNCTION; + + /* Restore or clear multifunction, this is always controlled by QEMU */ + if (vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) { + vdev->pdev.config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION; + } else { + vdev->pdev.config[PCI_HEADER_TYPE] &= ~PCI_HEADER_TYPE_MULTI_FUNCTION; + } + + /* + * Clear host resource mapping info. If we choose not to register a + * BAR, such as might be the case with the option ROM, we can get + * confusing, unwritable, residual addresses from the host here. 
+ */ + memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24); + memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4); + + vfio_pci_size_rom(vdev); + + vfio_bars_prepare(vdev); + + if (!vfio_msix_early_setup(vdev, errp)) { + return false; + } + + vfio_bars_register(vdev); + + return true; +} + +bool vfio_interrupt_setup(VFIOPCIDevice *vdev, Error **errp) +{ + PCIDevice *pdev = &vdev->pdev; + + /* QEMU emulates all of MSI & MSIX */ + if (pdev->cap_present & QEMU_PCI_CAP_MSIX) { + memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff, + MSIX_CAP_LENGTH); + } + + if (pdev->cap_present & QEMU_PCI_CAP_MSI) { + memset(vdev->emulated_config_bits + pdev->msi_cap, 0xff, + vdev->msi_cap_size); + } + + if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) { + vdev->intx.mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL, + vfio_intx_mmap_enable, vdev); + pci_device_set_intx_routing_notifier(&vdev->pdev, + vfio_intx_routing_notifier); + vdev->irqchip_change_notifier.notify = vfio_irqchip_change; + kvm_irqchip_add_change_notifier(&vdev->irqchip_change_notifier); + if (!vfio_intx_enable(vdev, errp)) { + pci_device_set_intx_routing_notifier(&vdev->pdev, NULL); + kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier); + return false; + } + } + return true; +} + static void vfio_realize(PCIDevice *pdev, Error **errp) { ERRP_GUARD(); @@ -3078,90 +3202,15 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) goto error; } - /* vfio emulates a lot for us, but some bits need extra love */ - vdev->emulated_config_bits = g_malloc0(vdev->config_size); - - /* QEMU can choose to expose the ROM or not */ - memset(vdev->emulated_config_bits + PCI_ROM_ADDRESS, 0xff, 4); - /* QEMU can also add or extend BARs */ - memset(vdev->emulated_config_bits + PCI_BASE_ADDRESS_0, 0xff, 6 * 4); - - /* - * The PCI spec reserves vendor ID 0xffff as an invalid value. The - * device ID is managed by the vendor and need only be a 16-bit value. - * Allow any 16-bit value for subsystem so they can be hidden or changed. 
- */ - if (vdev->vendor_id != PCI_ANY_ID) { - if (vdev->vendor_id >= 0xffff) { - error_setg(errp, "invalid PCI vendor ID provided"); - goto error; - } - vfio_add_emulated_word(vdev, PCI_VENDOR_ID, vdev->vendor_id, ~0); - trace_vfio_pci_emulated_vendor_id(vbasedev->name, vdev->vendor_id); - } else { - vdev->vendor_id = pci_get_word(pdev->config + PCI_VENDOR_ID); - } - - if (vdev->device_id != PCI_ANY_ID) { - if (vdev->device_id > 0xffff) { - error_setg(errp, "invalid PCI device ID provided"); - goto error; - } - vfio_add_emulated_word(vdev, PCI_DEVICE_ID, vdev->device_id, ~0); - trace_vfio_pci_emulated_device_id(vbasedev->name, vdev->device_id); - } else { - vdev->device_id = pci_get_word(pdev->config + PCI_DEVICE_ID); - } - - if (vdev->sub_vendor_id != PCI_ANY_ID) { - if (vdev->sub_vendor_id > 0xffff) { - error_setg(errp, "invalid PCI subsystem vendor ID provided"); - goto error; - } - vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_VENDOR_ID, - vdev->sub_vendor_id, ~0); - trace_vfio_pci_emulated_sub_vendor_id(vbasedev->name, - vdev->sub_vendor_id); - } - - if (vdev->sub_device_id != PCI_ANY_ID) { - if (vdev->sub_device_id > 0xffff) { - error_setg(errp, "invalid PCI subsystem device ID provided"); - goto error; - } - vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_ID, vdev->sub_device_id, ~0); - trace_vfio_pci_emulated_sub_device_id(vbasedev->name, - vdev->sub_device_id); - } - - /* QEMU can change multi-function devices to single function, or reverse */ - vdev->emulated_config_bits[PCI_HEADER_TYPE] = - PCI_HEADER_TYPE_MULTI_FUNCTION; - - /* Restore or clear multifunction, this is always controlled by QEMU */ - if (vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) { - vdev->pdev.config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION; - } else { - vdev->pdev.config[PCI_HEADER_TYPE] &= ~PCI_HEADER_TYPE_MULTI_FUNCTION; + if (!vfio_pci_config_setup(vdev, errp)) { + goto error; } /* - * Clear host resource mapping info. If we choose not to register a - * BAR, such as might be the case with the option ROM, we can get - * confusing, unwritable, residual addresses from the host here. 
+ * vfio_pci_config_setup will have registered the device's BARs + * and setup any MSIX BARs, so errors after it succeeds must + * use out_teardown */ - memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24); - memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4); - - vfio_pci_size_rom(vdev); - - vfio_bars_prepare(vdev); - - if (!vfio_msix_early_setup(vdev, errp)) { - goto error; - } - - vfio_bars_register(vdev); if (!vfio_add_capabilities(vdev, errp)) { goto out_teardown; @@ -3200,28 +3249,14 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) } } - /* QEMU emulates all of MSI & MSIX */ - if (pdev->cap_present & QEMU_PCI_CAP_MSIX) { - memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff, - MSIX_CAP_LENGTH); - } - - if (pdev->cap_present & QEMU_PCI_CAP_MSI) { - memset(vdev->emulated_config_bits + pdev->msi_cap, 0xff, - vdev->msi_cap_size); + if (!vfio_interrupt_setup(vdev, errp)) { + goto out_teardown; } - if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) { - vdev->intx.mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL, - vfio_intx_mmap_enable, vdev); - pci_device_set_intx_routing_notifier(&vdev->pdev, - vfio_intx_routing_notifier); - vdev->irqchip_change_notifier.notify = vfio_irqchip_change; - kvm_irqchip_add_change_notifier(&vdev->irqchip_change_notifier); - if (!vfio_intx_enable(vdev, errp)) { - goto out_deregister; - } - } + /* + * vfio_interrupt_setup will have setup INTx's KVM routing + * so errors after it succeeds must use out_deregister + */ if (vdev->display != ON_OFF_AUTO_OFF) { if (!vfio_display_probe(vdev, errp)) { diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index 040f4995b5..1eeb67bb2e 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -218,6 +218,16 @@ Object *vfio_pci_get_object(VFIODevice *vbasedev); int vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f, Error **errp); int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f); void vfio_pci_put_device(VFIOPCIDevice *vdev); +bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp); +void vfio_teardown_msi(VFIOPCIDevice *vdev); +void vfio_bars_exit(VFIOPCIDevice *vdev); +void vfio_bars_finalize(VFIOPCIDevice *vdev); +bool vfio_add_capabilities(VFIOPCIDevice *vdev, Error **errp); +void vfio_put_device(VFIOPCIDevice *vdev); +void vfio_register_err_notifier(VFIOPCIDevice *vdev); +void vfio_register_req_notifier(VFIOPCIDevice *vdev); +bool vfio_pci_config_setup(VFIOPCIDevice *vdev, Error **errp); +bool vfio_interrupt_setup(VFIOPCIDevice *vdev, Error **errp); void vfio_instance_init(Object *obj); uint64_t vfio_vga_read(void *opaque, hwaddr addr, unsigned size); diff --git a/hw/vfio/user-pci.c b/hw/vfio/user-pci.c index 54b19b68d0..b8a9f7a7fb 100644 --- a/hw/vfio/user-pci.c +++ b/hw/vfio/user-pci.c @@ -79,6 +79,7 @@ static void vfio_user_pci_realize(PCIDevice *pdev, Error **errp) AddressSpace *as; SocketAddress addr; VFIOUserProxy *proxy; + int ret; /* * TODO: make option parser understand SocketAddress @@ -126,8 +127,45 @@ static void vfio_user_pci_realize(PCIDevice *pdev, Error **errp) goto error; } + if (!vfio_populate_device(vdev, errp)) { + goto error; + } + + /* Get a copy of config space */ + ret = vbasedev->io->region_read(vbasedev, VFIO_PCI_CONFIG_REGION_INDEX, 0, + MIN(pci_config_size(pdev), vdev->config_size), + pdev->config); + if (ret < (int)MIN(pci_config_size(&vdev->pdev), vdev->config_size)) { + error_setg_errno(errp, -ret, "failed to read device config space"); + goto error; + } + + if (!vfio_pci_config_setup(vdev, errp)) { + goto error; + } + + /* + * vfio_pci_config_setup will have 
registered the device's BARs + * and setup any MSIX BARs, so errors after it succeeds must + * use out_teardown + */ + + if (!vfio_add_capabilities(vdev, errp)) { + goto out_teardown; + } + + if (!vfio_interrupt_setup(vdev, errp)) { + goto out_teardown; + } + + vfio_register_err_notifier(vdev); + vfio_register_req_notifier(vdev); + return; +out_teardown: + vfio_teardown_msi(vdev); + vfio_bars_exit(vdev); error: error_prepend(errp, VFIO_MSG_PREFIX, vdev->vbasedev.name); } @@ -163,6 +201,10 @@ static void vfio_user_instance_finalize(Object *obj) VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj); VFIODevice *vbasedev = &vdev->vbasedev; + vfio_bars_finalize(vdev); + g_free(vdev->emulated_config_bits); + g_free(vdev->rom); + vfio_pci_put_device(vdev); if (vbasedev->proxy != NULL) { From patchwork Wed May 29 16:23:11 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: John Levon X-Patchwork-Id: 13679263 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from lists.gnu.org (lists.gnu.org [209.51.188.17]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.lore.kernel.org (Postfix) with ESMTPS id 6D1DAC25B75 for ; Wed, 29 May 2024 16:32:25 +0000 (UTC) Received: from localhost ([::1] helo=lists1p.gnu.org) by lists.gnu.org with esmtp (Exim 4.90_1) (envelope-from ) id 1sCM7T-0001mb-JY; Wed, 29 May 2024 12:25:43 -0400 Received: from eggs.gnu.org ([2001:470:142:3::10]) by lists.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM76-0001IX-Fe for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:23 -0400 Received: from ssh.movementarian.org ([139.162.205.133] helo=movementarian.org) by eggs.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM73-0006LT-Pw for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:19 -0400 Received: from movement by movementarian.org with local (Exim 4.95) (envelope-from ) id 1sCM6k-006CPy-PO; Wed, 29 May 2024 17:24:58 +0100 From: John Levon To: qemu-devel@nongnu.org Cc: alex.williamson@redhat.com, clg@redhat.com, jag.raman@oracle.com, thanos.makatos@nutanix.com, John Johnson , Elena Ufimtseva , John Levon Subject: [PATCH 18/26] vfio-user: get and set IRQs Date: Wed, 29 May 2024 17:23:11 +0100 Message-Id: <20240529162319.1476680-19-levon@movementarian.org> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20240529162319.1476680-1-levon@movementarian.org> References: <20240529162319.1476680-1-levon@movementarian.org> MIME-Version: 1.0 Received-SPF: pass client-ip=139.162.205.133; envelope-from=movement@movementarian.org; helo=movementarian.org X-Spam_score_int: -18 X-Spam_score: -1.9 X-Spam_bar: - X-Spam_report: (-1.9 / 5.0 requ) BAYES_00=-1.9, SPF_HELO_PASS=-0.001, SPF_PASS=-0.001, T_SCC_BODY_TEXT_LINE=-0.01 autolearn=ham autolearn_force=no X-Spam_action: no action X-BeenThere: qemu-devel@nongnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org Sender: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org From: Jagannathan Raman Originally-by: John Johnson Signed-off-by: Elena Ufimtseva Signed-off-by: Jagannathan Raman Signed-off-by: John Levon --- hw/vfio/pci.c | 3 +- hw/vfio/trace-events | 2 + hw/vfio/user-protocol.h | 25 +++++++ hw/vfio/user.c 
| 140 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 169 insertions(+), 1 deletion(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index d1da64383e..56bfb17eec 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -772,7 +772,8 @@ retry: ret = vfio_enable_vectors(vdev, false); if (ret) { if (ret < 0) { - error_report("vfio: Error: Failed to setup MSI fds: %m"); + error_report("vfio: Error: Failed to setup MSI fds: %s", + strerror(-ret)); } else { error_report("vfio: Error: Failed to enable %d " "MSI vectors, retry with %d", vdev->nr_vectors, ret); diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index a65d530557..73661db9d9 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -186,3 +186,5 @@ vfio_user_version(uint16_t major, uint16_t minor, const char *caps) " major %d m vfio_user_get_info(uint32_t nregions, uint32_t nirqs) " #regions %d #irqs %d" vfio_user_get_region_info(uint32_t index, uint32_t flags, uint64_t size) " index %d flags 0x%x size 0x%"PRIx64 vfio_user_region_rw(uint32_t region, uint64_t off, uint32_t count) " region %d offset 0x%"PRIx64" count %d" +vfio_user_get_irq_info(uint32_t index, uint32_t flags, uint32_t count) " index %d flags 0x%x count %d" +vfio_user_set_irqs(uint32_t index, uint32_t start, uint32_t count, uint32_t flags) " index %d start %d count %d flags 0x%x" diff --git a/hw/vfio/user-protocol.h b/hw/vfio/user-protocol.h index 6987435e96..48dd475ab3 100644 --- a/hw/vfio/user-protocol.h +++ b/hw/vfio/user-protocol.h @@ -139,6 +139,31 @@ typedef struct { uint64_t offset; } VFIOUserRegionInfo; +/* + * VFIO_USER_DEVICE_GET_IRQ_INFO + * imported from struct vfio_irq_info + */ +typedef struct { + VFIOUserHdr hdr; + uint32_t argsz; + uint32_t flags; + uint32_t index; + uint32_t count; +} VFIOUserIRQInfo; + +/* + * VFIO_USER_DEVICE_SET_IRQS + * imported from struct vfio_irq_set + */ +typedef struct { + VFIOUserHdr hdr; + uint32_t argsz; + uint32_t flags; + uint32_t index; + uint32_t start; + uint32_t count; +} VFIOUserIRQSet; + /* * VFIO_USER_REGION_READ * VFIO_USER_REGION_WRITE diff --git a/hw/vfio/user.c b/hw/vfio/user.c index 9bb1ee8880..352c10d37b 100644 --- a/hw/vfio/user.c +++ b/hw/vfio/user.c @@ -1195,6 +1195,122 @@ static int vfio_user_get_region_info(VFIOUserProxy *proxy, return 0; } +static int vfio_user_get_irq_info(VFIOUserProxy *proxy, + struct vfio_irq_info *info) +{ + VFIOUserIRQInfo msg; + + memset(&msg, 0, sizeof(msg)); + vfio_user_request_msg(&msg.hdr, VFIO_USER_DEVICE_GET_IRQ_INFO, + sizeof(msg), 0); + msg.argsz = info->argsz; + msg.index = info->index; + + vfio_user_send_wait(proxy, &msg.hdr, NULL, 0); + if (msg.hdr.flags & VFIO_USER_ERROR) { + return -msg.hdr.error_reply; + } + trace_vfio_user_get_irq_info(msg.index, msg.flags, msg.count); + + memcpy(info, &msg.argsz, sizeof(*info)); + return 0; +} + +static int irq_howmany(int *fdp, uint32_t cur, uint32_t max) +{ + int n = 0; + + if (fdp[cur] != -1) { + do { + n++; + } while (n < max && fdp[cur + n] != -1); + } else { + do { + n++; + } while (n < max && fdp[cur + n] == -1); + } + + return n; +} + +static int vfio_user_set_irqs(VFIOUserProxy *proxy, struct vfio_irq_set *irq) +{ + g_autofree VFIOUserIRQSet *msgp = NULL; + uint32_t size, nfds, send_fds, sent_fds, max; + + if (irq->argsz < sizeof(*irq)) { + error_printf("vfio_user_set_irqs argsz too small\n"); + return -EINVAL; + } + + /* + * Handle simple case + */ + if ((irq->flags & VFIO_IRQ_SET_DATA_EVENTFD) == 0) { + size = sizeof(VFIOUserHdr) + irq->argsz; + msgp = g_malloc0(size); + + vfio_user_request_msg(&msgp->hdr, 
VFIO_USER_DEVICE_SET_IRQS, size, 0); + msgp->argsz = irq->argsz; + msgp->flags = irq->flags; + msgp->index = irq->index; + msgp->start = irq->start; + msgp->count = irq->count; + trace_vfio_user_set_irqs(msgp->index, msgp->start, msgp->count, + msgp->flags); + + vfio_user_send_wait(proxy, &msgp->hdr, NULL, 0); + if (msgp->hdr.flags & VFIO_USER_ERROR) { + return -msgp->hdr.error_reply; + } + + return 0; + } + + /* + * Calculate the number of FDs to send + * and adjust argsz + */ + nfds = (irq->argsz - sizeof(*irq)) / sizeof(int); + irq->argsz = sizeof(*irq); + msgp = g_malloc0(sizeof(*msgp)); + /* + * Send in chunks if over max_send_fds + */ + for (sent_fds = 0; nfds > sent_fds; sent_fds += send_fds) { + VFIOUserFDs *arg_fds, loop_fds; + + /* must send all valid FDs or all invalid FDs in single msg */ + max = nfds - sent_fds; + if (max > proxy->max_send_fds) { + max = proxy->max_send_fds; + } + send_fds = irq_howmany((int *)irq->data, sent_fds, max); + + vfio_user_request_msg(&msgp->hdr, VFIO_USER_DEVICE_SET_IRQS, + sizeof(*msgp), 0); + msgp->argsz = irq->argsz; + msgp->flags = irq->flags; + msgp->index = irq->index; + msgp->start = irq->start + sent_fds; + msgp->count = send_fds; + trace_vfio_user_set_irqs(msgp->index, msgp->start, msgp->count, + msgp->flags); + + loop_fds.send_fds = send_fds; + loop_fds.recv_fds = 0; + loop_fds.fds = (int *)irq->data + sent_fds; + arg_fds = loop_fds.fds[0] != -1 ? &loop_fds : NULL; + + vfio_user_send_wait(proxy, &msgp->hdr, arg_fds, 0); + if (msgp->hdr.flags & VFIO_USER_ERROR) { + return -msgp->hdr.error_reply; + } + } + + return 0; +} + static int vfio_user_region_read(VFIOUserProxy *proxy, uint8_t index, off_t offset, uint32_t count, void *data) { @@ -1291,6 +1407,28 @@ static int vfio_user_io_get_region_info(VFIODevice *vbasedev, return 0; } +static int vfio_user_io_get_irq_info(VFIODevice *vbasedev, + struct vfio_irq_info *irq) +{ + int ret; + + ret = vfio_user_get_irq_info(vbasedev->proxy, irq); + if (ret) { + return ret; + } + + if (irq->index > vbasedev->num_irqs) { + return -EINVAL; + } + return 0; +} + +static int vfio_user_io_set_irqs(VFIODevice *vbasedev, + struct vfio_irq_set *irqs) +{ + return vfio_user_set_irqs(vbasedev->proxy, irqs); +} + static int vfio_user_io_region_read(VFIODevice *vbasedev, uint8_t index, off_t off, uint32_t size, void *data) { @@ -1307,6 +1445,8 @@ static int vfio_user_io_region_write(VFIODevice *vbasedev, uint8_t index, VFIODeviceIO vfio_dev_io_sock = { .get_region_info = vfio_user_io_get_region_info, + .get_irq_info = vfio_user_io_get_irq_info, + .set_irqs = vfio_user_io_set_irqs, .region_read = vfio_user_io_region_read, .region_write = vfio_user_io_region_write, }; From patchwork Wed May 29 16:23:12 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: John Levon X-Patchwork-Id: 13679236 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from lists.gnu.org (lists.gnu.org [209.51.188.17]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.lore.kernel.org (Postfix) with ESMTPS id 59A3BC27C44 for ; Wed, 29 May 2024 16:29:13 +0000 (UTC) Received: from localhost ([::1] helo=lists1p.gnu.org) by lists.gnu.org with esmtp (Exim 4.90_1) (envelope-from ) id 1sCM7Y-00022k-4k; Wed, 29 May 2024 12:25:48 -0400 Received: from eggs.gnu.org ([2001:470:142:3::10]) by lists.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) 
(Exim 4.90_1) (envelope-from ) id 1sCM74-0001Ht-1L for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:23 -0400 Received: from ssh.movementarian.org ([139.162.205.133] helo=movementarian.org) by eggs.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM71-0006La-RT for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:17 -0400 Received: from movement by movementarian.org with local (Exim 4.95) (envelope-from ) id 1sCM6k-006CQ5-Vf; Wed, 29 May 2024 17:24:58 +0100 From: John Levon To: qemu-devel@nongnu.org Cc: alex.williamson@redhat.com, clg@redhat.com, jag.raman@oracle.com, thanos.makatos@nutanix.com, John Johnson , Elena Ufimtseva , John Levon Subject: [PATCH 19/26] vfio-user: forward msix BAR accesses to server Date: Wed, 29 May 2024 17:23:12 +0100 Message-Id: <20240529162319.1476680-20-levon@movementarian.org> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20240529162319.1476680-1-levon@movementarian.org> References: <20240529162319.1476680-1-levon@movementarian.org> MIME-Version: 1.0 Received-SPF: pass client-ip=139.162.205.133; envelope-from=movement@movementarian.org; helo=movementarian.org X-Spam_score_int: -18 X-Spam_score: -1.9 X-Spam_bar: - X-Spam_report: (-1.9 / 5.0 requ) BAYES_00=-1.9, SPF_HELO_PASS=-0.001, SPF_PASS=-0.001, T_SCC_BODY_TEXT_LINE=-0.01 autolearn=ham autolearn_force=no X-Spam_action: no action X-BeenThere: qemu-devel@nongnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org Sender: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org From: Jagannathan Raman The server holds the device's current pending interrupt state. Use IRQ masking commands in the socket case. Originally-by: John Johnson Signed-off-by: Elena Ufimtseva Signed-off-by: Jagannathan Raman Signed-off-by: John Levon --- hw/vfio/helpers.c | 26 +++++++++++ hw/vfio/pci.c | 86 +++++++++++++++++++++++++---------- hw/vfio/pci.h | 2 + hw/vfio/user-pci.c | 63 +++++++++++++++++++++++++ include/hw/vfio/vfio-common.h | 2 + 5 files changed, 156 insertions(+), 23 deletions(-) diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c index b2a5d7d1ba..54c8c9b03e 100644 --- a/hw/vfio/helpers.c +++ b/hw/vfio/helpers.c @@ -71,6 +71,32 @@ void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index) vbasedev->io->set_irqs(vbasedev, &irq_set); } +void vfio_mask_single_irq(VFIODevice *vbasedev, int index, int irq) +{ + struct vfio_irq_set irq_set = { + .argsz = sizeof(irq_set), + .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK, + .index = index, + .start = irq, + .count = 1, + }; + + vbasedev->io->set_irqs(vbasedev, &irq_set); +} + +void vfio_unmask_single_irq(VFIODevice *vbasedev, int index, int irq) +{ + struct vfio_irq_set irq_set = { + .argsz = sizeof(irq_set), + .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK, + .index = index, + .start = irq, + .count = 1, + }; + + vbasedev->io->set_irqs(vbasedev, &irq_set); +} + static inline const char *action_to_str(int action) { switch (action) { diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 56bfb17eec..8ff69ff6ec 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -520,11 +520,30 @@ static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg, kvm_irqchip_commit_routes(kvm_state); } +static void set_irq_signalling(VFIODevice *vbasedev, VFIOMSIVector *vector, + unsigned int nr) +{ + Error *err = NULL; + int32_t fd; + + if (vector->virq >= 0) { + fd =
event_notifier_get_fd(&vector->kvm_interrupt); + } else { + fd = event_notifier_get_fd(&vector->interrupt); + } + + if (!vfio_set_irq_signaling(vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, nr, + VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) { + error_reportf_err(err, VFIO_MSG_PREFIX, vbasedev->name); + } +} + static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, MSIMessage *msg, IOHandler *handler) { VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); VFIOMSIVector *vector; + bool new_vec = false; int ret; bool resizing = !!(vdev->nr_vectors < nr + 1); @@ -539,6 +558,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, error_report("vfio: Error: event_notifier_init failed"); } vector->use = true; + new_vec = true; msix_vector_use(pdev, nr); } @@ -565,6 +585,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, kvm_irqchip_commit_route_changes(&vfio_route_change); vfio_connect_kvm_msi_virq(vector); } + new_vec = true; } } @@ -574,38 +595,35 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, * in use, so we shutdown and incrementally increase them as needed. * nr_vectors represents the total number of vectors allocated. * + * Otherwise, unmask the vector if the vector is already setup (and we can + * do so) or send the fd if not. + * * When dynamic allocation is supported, let the host only allocate * and enable a vector when it is in use in guest. nr_vectors represents * the upper bound of vectors being enabled (but not all of the ranges * is allocated or enabled). */ + if (resizing) { vdev->nr_vectors = nr + 1; } if (!vdev->defer_kvm_irq_routing) { - if (vdev->msix->noresize && resizing) { - vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX); - ret = vfio_enable_vectors(vdev, true); - if (ret) { - error_report("vfio: failed to enable vectors, %d", ret); - } - } else { - Error *err = NULL; - int32_t fd; - - if (vector->virq >= 0) { - fd = event_notifier_get_fd(&vector->kvm_interrupt); + if (resizing) { + if (vdev->msix->noresize) { + vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX); + ret = vfio_enable_vectors(vdev, true); + if (ret) { + error_report("vfio: failed to enable vectors, %d", ret); + } } else { - fd = event_notifier_get_fd(&vector->interrupt); - } - - if (!vfio_set_irq_signaling(&vdev->vbasedev, - VFIO_PCI_MSIX_IRQ_INDEX, nr, - VFIO_IRQ_SET_ACTION_TRIGGER, fd, - &err)) { - error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); + set_irq_signalling(&vdev->vbasedev, vector, nr); } + } else if (vdev->can_mask_msix && !new_vec) { + vfio_unmask_single_irq(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, + nr); + } else { + set_irq_signalling(&vdev->vbasedev, vector, nr); } } @@ -633,6 +651,12 @@ static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr) trace_vfio_msix_vector_release(vdev->vbasedev.name, nr); + /* just mask vector if peer supports it */ + if (vdev->can_mask_msix) { + vfio_mask_single_irq(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, nr); + return; + } + /* * There are still old guests that mask and unmask vectors on every * interrupt. 
If we're using QEMU bypass with a KVM irqfd, leave all of @@ -704,7 +728,7 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev) if (ret) { error_report("vfio: failed to enable vectors, %d", ret); } - } else { + } else if (!vdev->can_mask_msix) { /* * Some communication channels between VF & PF or PF & fw rely on the * physical state of the device and expect that enabling MSI-X from the @@ -721,6 +745,13 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev) if (ret) { error_report("vfio: failed to enable MSI-X, %d", ret); } + } else { + /* + * If we can use irq masking, send an invalid fd on vector 0 + * to enable MSI-X without any vectors enabled. + */ + vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, 0, + VFIO_IRQ_SET_ACTION_TRIGGER, -1, NULL); } trace_vfio_msix_enable(vdev->vbasedev.name); @@ -2771,7 +2802,7 @@ bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) { VFIODevice *vbasedev = &vdev->vbasedev; struct vfio_region_info *reg_info = NULL; - struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) }; + struct vfio_irq_info irq_info; int i, ret = -1; /* Sanity check device */ @@ -2832,8 +2863,17 @@ bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) } } - irq_info.index = VFIO_PCI_ERR_IRQ_INDEX; + irq_info.index = VFIO_PCI_MSIX_IRQ_INDEX; + irq_info.argsz = sizeof(irq_info); + ret = vbasedev->io->get_irq_info(vbasedev, &irq_info); + if (ret == 0 && (irq_info.flags & VFIO_IRQ_INFO_MASKABLE)) { + vdev->can_mask_msix = true; + } else { + vdev->can_mask_msix = false; + } + irq_info.index = VFIO_PCI_ERR_IRQ_INDEX; + irq_info.argsz = sizeof(irq_info); ret = vbasedev->io->get_irq_info(vbasedev, &irq_info); if (ret) { /* This can fail for an old kernel or legacy PCI dev */ diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index 1eeb67bb2e..d678b84191 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -114,6 +114,7 @@ typedef struct VFIOMSIXInfo { uint32_t pba_offset; unsigned long *pending; bool noresize; + MemoryRegion *pba_region; } VFIOMSIXInfo; /* @@ -183,6 +184,7 @@ struct VFIOPCIDevice { bool defer_kvm_irq_routing; bool clear_parent_atomics_on_exit; bool skip_vsc_check; + bool can_mask_msix; VFIODisplay *dpy; Notifier irqchip_change_notifier; }; diff --git a/hw/vfio/user-pci.c b/hw/vfio/user-pci.c index b8a9f7a7fb..9514f6e2fa 100644 --- a/hw/vfio/user-pci.c +++ b/hw/vfio/user-pci.c @@ -44,6 +44,62 @@ struct VFIOUserPCIDevice { bool no_post; /* all regions write are sync */ }; +/* + * The server maintains the device's pending interrupts, + * via its MSIX table and PBA, so we treat these acceses + * like PCI config space and forward them. 
+ */ +static uint64_t vfio_user_pba_read(void *opaque, hwaddr addr, + unsigned size) +{ + VFIOPCIDevice *vdev = opaque; + VFIORegion *region = &vdev->bars[vdev->msix->pba_bar].region; + uint64_t data; + + /* server copy is what matters */ + data = vfio_region_read(region, addr + vdev->msix->pba_offset, size); + return data; +} + +static void vfio_user_pba_write(void *opaque, hwaddr addr, + uint64_t data, unsigned size) +{ + /* dropped */ +} + +static const MemoryRegionOps vfio_user_pba_ops = { + .read = vfio_user_pba_read, + .write = vfio_user_pba_write, + .endianness = DEVICE_LITTLE_ENDIAN, +}; + +static void vfio_user_msix_setup(VFIOPCIDevice *vdev) +{ + MemoryRegion *vfio_reg, *msix_reg, *pba_reg; + + pba_reg = g_new0(MemoryRegion, 1); + vdev->msix->pba_region = pba_reg; + + vfio_reg = vdev->bars[vdev->msix->pba_bar].mr; + msix_reg = &vdev->pdev.msix_pba_mmio; + memory_region_init_io(pba_reg, OBJECT(vdev), &vfio_user_pba_ops, vdev, + "VFIO MSIX PBA", int128_get64(msix_reg->size)); + memory_region_add_subregion_overlap(vfio_reg, vdev->msix->pba_offset, + pba_reg, 1); +} + +static void vfio_user_msix_teardown(VFIOPCIDevice *vdev) +{ + MemoryRegion *mr, *sub; + + mr = vdev->bars[vdev->msix->pba_bar].mr; + sub = vdev->msix->pba_region; + memory_region_del_subregion(mr, sub); + + g_free(vdev->msix->pba_region); + vdev->msix->pba_region = NULL; +} + /* * Incoming request message callback. * @@ -153,6 +209,9 @@ static void vfio_user_pci_realize(PCIDevice *pdev, Error **errp) if (!vfio_add_capabilities(vdev, errp)) { goto out_teardown; } + if (vdev->msix != NULL) { + vfio_user_msix_setup(vdev); + } if (!vfio_interrupt_setup(vdev, errp)) { goto out_teardown; @@ -205,6 +264,10 @@ static void vfio_user_instance_finalize(Object *obj) g_free(vdev->emulated_config_bits); g_free(vdev->rom); + if (vdev->msix != NULL) { + vfio_user_msix_teardown(vdev); + } + vfio_pci_put_device(vdev); if (vbasedev->proxy != NULL) { diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 617891c4fe..747b343596 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -248,6 +248,8 @@ void vfio_spapr_container_deinit(VFIOContainer *container); void vfio_disable_irqindex(VFIODevice *vbasedev, int index); void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index); void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index); +void vfio_unmask_single_irq(VFIODevice *vbasedev, int index, int irq); +void vfio_mask_single_irq(VFIODevice *vbasedev, int index, int irq); bool vfio_set_irq_signaling(VFIODevice *vbasedev, int index, int subindex, int action, int fd, Error **errp); void vfio_region_write(void *opaque, hwaddr addr, From patchwork Wed May 29 16:23:13 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: John Levon X-Patchwork-Id: 13679261 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from lists.gnu.org (lists.gnu.org [209.51.188.17]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.lore.kernel.org (Postfix) with ESMTPS id 15B27C25B75 for ; Wed, 29 May 2024 16:32:19 +0000 (UTC) Received: from localhost ([::1] helo=lists1p.gnu.org) by lists.gnu.org with esmtp (Exim 4.90_1) (envelope-from ) id 1sCM7X-00020x-4D; Wed, 29 May 2024 12:25:47 -0400 Received: from eggs.gnu.org ([2001:470:142:3::10]) by lists.gnu.org with esmtps 
(TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM74-0001Hu-1N for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:23 -0400 Received: from ssh.movementarian.org ([139.162.205.133] helo=movementarian.org) by eggs.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM71-0006Mo-Up for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:17 -0400 Received: from movement by movementarian.org with local (Exim 4.95) (envelope-from ) id 1sCM6l-006CQA-3Z; Wed, 29 May 2024 17:24:59 +0100 From: John Levon To: qemu-devel@nongnu.org Cc: alex.williamson@redhat.com, clg@redhat.com, jag.raman@oracle.com, thanos.makatos@nutanix.com, John Johnson , Elena Ufimtseva , John Levon Subject: [PATCH 20/26] vfio-user: proxy container connect/disconnect Date: Wed, 29 May 2024 17:23:13 +0100 Message-Id: <20240529162319.1476680-21-levon@movementarian.org> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20240529162319.1476680-1-levon@movementarian.org> References: <20240529162319.1476680-1-levon@movementarian.org> MIME-Version: 1.0 Received-SPF: pass client-ip=139.162.205.133; envelope-from=movement@movementarian.org; helo=movementarian.org X-Spam_score_int: -18 X-Spam_score: -1.9 X-Spam_bar: - X-Spam_report: (-1.9 / 5.0 requ) BAYES_00=-1.9, SPF_HELO_PASS=-0.001, SPF_PASS=-0.001, T_SCC_BODY_TEXT_LINE=-0.01 autolearn=ham autolearn_force=no X-Spam_action: no action X-BeenThere: qemu-devel@nongnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org Sender: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org From: Jagannathan Raman Originally-by: John Johnson Signed-off-by: Elena Ufimtseva Signed-off-by: Jagannathan Raman Signed-off-by: John Levon --- hw/vfio/container.c | 4 ++- hw/vfio/user-container.c | 52 ++++++++++++++++++++++++++++------- hw/vfio/user-protocol.h | 2 ++ hw/vfio/user.c | 4 +++ hw/vfio/user.h | 10 +++++++ include/hw/vfio/vfio-common.h | 1 + 6 files changed, 62 insertions(+), 11 deletions(-) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 7e1a1a010a..e60e92066d 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -914,7 +914,9 @@ void vfio_put_base_device(VFIODevice *vbasedev) QLIST_REMOVE(vbasedev, next); vbasedev->group = NULL; trace_vfio_put_base_device(vbasedev->fd); - close(vbasedev->fd); + if (vbasedev->fd != -1) { + close(vbasedev->fd); + } } static int vfio_device_groupid(VFIODevice *vbasedev, Error **errp) diff --git a/hw/vfio/user-container.c b/hw/vfio/user-container.c index f1218560a4..f0414509d5 100644 --- a/hw/vfio/user-container.c +++ b/hw/vfio/user-container.c @@ -56,34 +56,62 @@ static int vfio_user_query_dirty_bitmap(const VFIOContainerBase *bcontainer, static bool vfio_user_setup(VFIOContainerBase *bcontainer, Error **errp) { - error_setg_errno(errp, ENOTSUP, "Not supported"); - return -ENOTSUP; + VFIOUserContainer *container = container_of(bcontainer, VFIOUserContainer, + bcontainer); + + assert(container->proxy->dma_pgsizes != 0); + bcontainer->pgsizes = container->proxy->dma_pgsizes; + bcontainer->dma_max_mappings = container->proxy->max_dma; + + /* No live migration support yet. */ + bcontainer->dirty_pages_supported = false; + bcontainer->max_dirty_bitmap_size = container->proxy->max_bitmap; + bcontainer->dirty_pgsizes = container->proxy->migr_pgsize; + + return true; } /* * Try to mirror vfio_connect_container() as much as possible. 
*/ static VFIOUserContainer * -vfio_connect_user_container(AddressSpace *as, Error **errp) +vfio_connect_user_container(AddressSpace *as, VFIODevice *vbasedev, + Error **errp) { - VFIOAddressSpace *space; - VFIOUserContainer *container; VFIOContainerBase *bcontainer; + VFIOUserContainer *container; + const VFIOIOMMUClass *ops; + VFIOAddressSpace *space; + int ret; space = vfio_get_address_space(as); container = g_malloc0(sizeof(*container)); - + container->proxy = vbasedev->proxy; bcontainer = &container->bcontainer; + ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_USER)); + + vfio_container_init(&container->bcontainer, space, ops); + if (!vfio_cpr_register_container(bcontainer, errp)) { goto free_container_exit; } + /* + * VFIO user allows the device server to map guest memory so it has the same + * issue with discards as a local IOMMU has. + */ + ret = ram_block_uncoordinated_discard_disable(true); + if (ret) { + error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken"); + goto unregister_container_exit; + } + assert(bcontainer->ops->setup); if (!bcontainer->ops->setup(bcontainer, errp)) { - goto unregister_container_exit; + goto enable_discards_exit; } QLIST_INSERT_HEAD(&space->containers, bcontainer, next); @@ -109,6 +137,9 @@ listener_release_exit: bcontainer->ops->release(bcontainer); } +enable_discards_exit: + ram_block_uncoordinated_discard_disable(false); + unregister_container_exit: vfio_cpr_unregister_container(bcontainer); @@ -123,14 +154,15 @@ free_container_exit: static void vfio_disconnect_user_container(VFIOUserContainer *container) { VFIOContainerBase *bcontainer = &container->bcontainer; + VFIOAddressSpace *space = bcontainer->space; + + ram_block_uncoordinated_discard_disable(false); memory_listener_unregister(&bcontainer->listener); if (bcontainer->ops->release) { bcontainer->ops->release(bcontainer); } - VFIOAddressSpace *space = bcontainer->space; - vfio_container_destroy(bcontainer); vfio_cpr_unregister_container(bcontainer); @@ -166,7 +198,7 @@ static bool vfio_user_attach_device(const char *name, VFIODevice *vbasedev, { VFIOUserContainer *container; - container = vfio_connect_user_container(as, errp); + container = vfio_connect_user_container(as, vbasedev, errp); if (container == NULL) { error_prepend(errp, "failed to connect proxy"); return false; diff --git a/hw/vfio/user-protocol.h b/hw/vfio/user-protocol.h index 48dd475ab3..87e43ddc72 100644 --- a/hw/vfio/user-protocol.h +++ b/hw/vfio/user-protocol.h @@ -16,6 +16,8 @@ * region and offset info for read and write commands. 
*/ +#include + typedef struct { uint16_t id; uint16_t command; diff --git a/hw/vfio/user.c b/hw/vfio/user.c index 352c10d37b..224b5febd8 100644 --- a/hw/vfio/user.c +++ b/hw/vfio/user.c @@ -18,10 +18,14 @@ #include "qemu/lockable.h" #include "hw/hw.h" #include "hw/vfio/vfio-common.h" +#include "exec/address-spaces.h" +#include "exec/memory.h" +#include "exec/ram_addr.h" #include "qemu/sockets.h" #include "io/channel.h" #include "io/channel-socket.h" #include "io/channel-util.h" +#include "sysemu/reset.h" #include "sysemu/iothread.h" #include "qapi/qmp/qdict.h" #include "qapi/qmp/qjson.h" diff --git a/hw/vfio/user.h b/hw/vfio/user.h index 1f99a976d6..9039e96069 100644 --- a/hw/vfio/user.h +++ b/hw/vfio/user.h @@ -11,7 +11,17 @@ * */ +#include + +#include "glib-compat.h" #include "user-protocol.h" +#include "qemu/osdep.h" +#include "qemu/typedefs.h" +#include "qemu/queue.h" +#include "qemu/sockets.h" +#include "qemu/thread.h" + +typedef struct VFIODevice VFIODevice; typedef struct { int send_fds; diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 747b343596..c06099edce 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -90,6 +90,7 @@ typedef struct VFIOContainer { /* MMU container sub-class for vfio-user. */ typedef struct VFIOUserContainer { VFIOContainerBase bcontainer; + VFIOUserProxy *proxy; } VFIOUserContainer; typedef struct VFIOHostDMAWindow { From patchwork Wed May 29 16:23:14 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: John Levon X-Patchwork-Id: 13679262 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from lists.gnu.org (lists.gnu.org [209.51.188.17]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.lore.kernel.org (Postfix) with ESMTPS id 4B28CC25B7C for ; Wed, 29 May 2024 16:32:19 +0000 (UTC) Received: from localhost ([::1] helo=lists1p.gnu.org) by lists.gnu.org with esmtp (Exim 4.90_1) (envelope-from ) id 1sCM7W-0001zV-3r; Wed, 29 May 2024 12:25:46 -0400 Received: from eggs.gnu.org ([2001:470:142:3::10]) by lists.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM7P-0001Ud-MU for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:39 -0400 Received: from ssh.movementarian.org ([139.162.205.133] helo=movementarian.org) by eggs.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM7J-0006S2-LX for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:35 -0400 Received: from movement by movementarian.org with local (Exim 4.95) (envelope-from ) id 1sCM6r-006CQF-29; Wed, 29 May 2024 17:25:05 +0100 From: John Levon To: qemu-devel@nongnu.org Cc: alex.williamson@redhat.com, clg@redhat.com, jag.raman@oracle.com, thanos.makatos@nutanix.com, John Levon , John Johnson , Elena Ufimtseva , John Levon Subject: [PATCH 21/26] vfio-user: dma map/unmap operations Date: Wed, 29 May 2024 17:23:14 +0100 Message-Id: <20240529162319.1476680-22-levon@movementarian.org> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20240529162319.1476680-1-levon@movementarian.org> References: <20240529162319.1476680-1-levon@movementarian.org> MIME-Version: 1.0 Received-SPF: pass client-ip=139.162.205.133; envelope-from=movement@movementarian.org; helo=movementarian.org X-Spam_score_int: -18 X-Spam_score: -1.9 X-Spam_bar: - X-Spam_report: (-1.9 / 5.0 requ) 
BAYES_00=-1.9, SPF_HELO_PASS=-0.001, SPF_PASS=-0.001, T_SCC_BODY_TEXT_LINE=-0.01 autolearn=ham autolearn_force=no X-Spam_action: no action X-BeenThere: qemu-devel@nongnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org Sender: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org Implement DMA map/unmap for the vfio-user container. Add ability to do async operations during memory transactions. Originally-by: John Johnson Signed-off-by: Jagannathan Raman Signed-off-by: Elena Ufimtseva Signed-off-by: John Levon --- hw/vfio/trace-events | 4 ++ hw/vfio/user-container.c | 114 ++++++++++++++++++++++++++++++++++++++- hw/vfio/user-protocol.h | 32 +++++++++++ hw/vfio/user.c | 99 ++++++++++++++++++++++++++++++---- hw/vfio/user.h | 10 ++++ 5 files changed, 247 insertions(+), 12 deletions(-) diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 73661db9d9..387751bd7f 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -188,3 +188,7 @@ vfio_user_get_region_info(uint32_t index, uint32_t flags, uint64_t size) " index vfio_user_region_rw(uint32_t region, uint64_t off, uint32_t count) " region %d offset 0x%"PRIx64" count %d" vfio_user_get_irq_info(uint32_t index, uint32_t flags, uint32_t count) " index %d flags 0x%x count %d" vfio_user_set_irqs(uint32_t index, uint32_t start, uint32_t count, uint32_t flags) " index %d start %d count %d flags 0x%x" + +# user-container.c +vfio_user_dma_map(uint64_t iova, uint64_t size, uint64_t off, uint32_t flags, bool async_ops) " iova 0x%"PRIx64" size 0x%"PRIx64" off 0x%"PRIx64" flags 0x%x async_ops %d" +vfio_user_dma_unmap(uint64_t iova, uint64_t size, uint32_t flags, bool async_ops) " iova 0x%"PRIx64" size 0x%"PRIx64" flags 0x%x async_ops %d" diff --git a/hw/vfio/user-container.c b/hw/vfio/user-container.c index f0414509d5..213861fb5d 100644 --- a/hw/vfio/user-container.c +++ b/hw/vfio/user-container.c @@ -24,18 +24,126 @@ #include "qapi/error.h" #include "pci.h" +/* + * When DMA space is the physical address space, the region add/del listeners + * will fire during memory update transactions. These depend on BQL being held, + * so do any resulting map/demap ops async while keeping BQL. 
+ */ +static void vfio_user_listener_begin(VFIOContainerBase *bcontainer) +{ + VFIOUserContainer *container = container_of(bcontainer, VFIOUserContainer, + bcontainer); + + container->proxy->async_ops = true; +} + +static void vfio_user_listener_commit(VFIOContainerBase *bcontainer) +{ + VFIOUserContainer *container = container_of(bcontainer, VFIOUserContainer, + bcontainer); + + /* wait here for any async requests sent during the transaction */ + container->proxy->async_ops = false; + vfio_user_wait_reqs(container->proxy); +} + static int vfio_user_dma_unmap(const VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, IOMMUTLBEntry *iotlb, int flags) { - return -ENOTSUP; + VFIOUserContainer *container = container_of(bcontainer, VFIOUserContainer, + bcontainer); + + struct { + VFIOUserDMAUnmap msg; + VFIOUserBitmap bitmap; + } *msgp = NULL; + int msize, rsize; + + msize = rsize = sizeof(VFIOUserDMAUnmap); + msgp = g_malloc0(rsize); + + vfio_user_request_msg(&msgp->msg.hdr, VFIO_USER_DMA_UNMAP, msize, 0); + msgp->msg.argsz = rsize - sizeof(VFIOUserHdr); + msgp->msg.flags = flags; + msgp->msg.iova = iova; + msgp->msg.size = size; + trace_vfio_user_dma_unmap(msgp->msg.iova, msgp->msg.size, msgp->msg.flags, + container->proxy->async_ops); + + if (container->proxy->async_ops) { + vfio_user_send_nowait(container->proxy, &msgp->msg.hdr, NULL, rsize); + return 0; + } + + vfio_user_send_wait(container->proxy, &msgp->msg.hdr, NULL, rsize); + if (msgp->msg.hdr.flags & VFIO_USER_ERROR) { + return -msgp->msg.hdr.error_reply; + } + + g_free(msgp); + return 0; } static int vfio_user_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, void *vaddr, bool readonly, MemoryRegion *mrp) { - return -ENOTSUP; + VFIOUserContainer *container = container_of(bcontainer, VFIOUserContainer, + bcontainer); + + VFIOUserProxy *proxy = container->proxy; + int fd = memory_region_get_fd(mrp); + int ret; + + VFIOUserFDs *fds = NULL; + VFIOUserDMAMap *msgp = g_malloc0(sizeof(*msgp)); + + vfio_user_request_msg(&msgp->hdr, VFIO_USER_DMA_MAP, sizeof(*msgp), 0); + msgp->argsz = sizeof(struct vfio_iommu_type1_dma_map); + msgp->flags = VFIO_DMA_MAP_FLAG_READ; + msgp->offset = 0; + msgp->iova = iova; + msgp->size = size; + + /* + * vaddr enters as a QEMU process address; make it either a file offset + * for mapped areas or leave as 0. + */ + if (fd != -1) { + msgp->offset = qemu_ram_block_host_offset(mrp->ram_block, vaddr); + } + + if (!readonly) { + msgp->flags |= VFIO_DMA_MAP_FLAG_WRITE; + } + + trace_vfio_user_dma_map(msgp->iova, msgp->size, msgp->offset, msgp->flags, + container->proxy->async_ops); + + /* + * The async_ops case sends without blocking or dropping BQL. + * They're later waited for in vfio_send_wait_reqs. + */ + if (container->proxy->async_ops) { + /* can't use auto variable since we don't block */ + if (fd != -1) { + fds = vfio_user_getfds(1); + fds->send_fds = 1; + fds->fds[0] = fd; + } + vfio_user_send_nowait(proxy, &msgp->hdr, fds, 0); + ret = 0; + } else { + VFIOUserFDs local_fds = { 1, 0, &fd }; + + fds = fd != -1 ? &local_fds : NULL; + vfio_user_send_wait(proxy, &msgp->hdr, fds, 0); + ret = (msgp->hdr.flags & VFIO_USER_ERROR) ? 
-msgp->hdr.error_reply : 0; + g_free(msgp); + } + + return ret; } static int @@ -230,6 +338,8 @@ static void vfio_iommu_user_class_init(ObjectClass *klass, void *data) VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass); vioc->setup = vfio_user_setup; + vioc->listener_begin = vfio_user_listener_begin, + vioc->listener_commit = vfio_user_listener_commit, vioc->dma_map = vfio_user_dma_map; vioc->dma_unmap = vfio_user_dma_unmap; vioc->attach_device = vfio_user_attach_device; diff --git a/hw/vfio/user-protocol.h b/hw/vfio/user-protocol.h index 87e43ddc72..9b569156fa 100644 --- a/hw/vfio/user-protocol.h +++ b/hw/vfio/user-protocol.h @@ -115,6 +115,31 @@ typedef struct { */ #define VFIO_USER_DEF_MAX_BITMAP (256 * 1024 * 1024) +/* + * VFIO_USER_DMA_MAP + * imported from struct vfio_iommu_type1_dma_map + */ +typedef struct { + VFIOUserHdr hdr; + uint32_t argsz; + uint32_t flags; + uint64_t offset; /* FD offset */ + uint64_t iova; + uint64_t size; +} VFIOUserDMAMap; + +/* + * VFIO_USER_DMA_UNMAP + * imported from struct vfio_iommu_type1_dma_unmap + */ +typedef struct { + VFIOUserHdr hdr; + uint32_t argsz; + uint32_t flags; + uint64_t iova; + uint64_t size; +} VFIOUserDMAUnmap; + /* * VFIO_USER_DEVICE_GET_INFO * imported from struct vfio_device_info @@ -178,4 +203,11 @@ typedef struct { char data[]; } VFIOUserRegionRW; +/*imported from struct vfio_bitmap */ +typedef struct { + uint64_t pgsize; + uint64_t size; + char data[]; +} VFIOUserBitmap; + #endif /* VFIO_USER_PROTOCOL_H */ diff --git a/hw/vfio/user.c b/hw/vfio/user.c index 224b5febd8..adecd9a68a 100644 --- a/hw/vfio/user.c +++ b/hw/vfio/user.c @@ -50,7 +50,6 @@ static void vfio_user_shutdown(VFIOUserProxy *proxy); static int vfio_user_send_qio(VFIOUserProxy *proxy, VFIOUserMsg *msg); static VFIOUserMsg *vfio_user_getmsg(VFIOUserProxy *proxy, VFIOUserHdr *hdr, VFIOUserFDs *fds); -static VFIOUserFDs *vfio_user_getfds(int numfds); static void vfio_user_recycle(VFIOUserProxy *proxy, VFIOUserMsg *msg); static void vfio_user_recv(void *opaque); @@ -63,10 +62,6 @@ static void vfio_user_request(void *opaque); static int vfio_user_send_queued(VFIOUserProxy *proxy, VFIOUserMsg *msg); static void vfio_user_send_async(VFIOUserProxy *proxy, VFIOUserHdr *hdr, VFIOUserFDs *fds); -static void vfio_user_send_wait(VFIOUserProxy *proxy, VFIOUserHdr *hdr, - VFIOUserFDs *fds, int rsize); -static void vfio_user_request_msg(VFIOUserHdr *hdr, uint16_t cmd, - uint32_t size, uint32_t flags); static inline void vfio_user_set_error(VFIOUserHdr *hdr, uint32_t err) { @@ -158,7 +153,7 @@ static void vfio_user_recycle(VFIOUserProxy *proxy, VFIOUserMsg *msg) QTAILQ_INSERT_HEAD(&proxy->free, msg, next); } -static VFIOUserFDs *vfio_user_getfds(int numfds) +VFIOUserFDs *vfio_user_getfds(int numfds) { VFIOUserFDs *fds = g_malloc0(sizeof(*fds) + (numfds * sizeof(int))); @@ -661,8 +656,38 @@ static void vfio_user_send_async(VFIOUserProxy *proxy, VFIOUserHdr *hdr, } } -static void vfio_user_send_wait(VFIOUserProxy *proxy, VFIOUserHdr *hdr, - VFIOUserFDs *fds, int rsize) +/* + * nowait send - vfio_wait_reqs() can wait for it later + */ +void vfio_user_send_nowait(VFIOUserProxy *proxy, VFIOUserHdr *hdr, + VFIOUserFDs *fds, int rsize) +{ + VFIOUserMsg *msg; + int ret; + + if (hdr->flags & VFIO_USER_NO_REPLY) { + error_printf("vfio_user_send_nowait on async message\n"); + return; + } + + QEMU_LOCK_GUARD(&proxy->lock); + + msg = vfio_user_getmsg(proxy, hdr, fds); + msg->id = hdr->id; + msg->rsize = rsize ? 
rsize : hdr->size; + msg->type = VFIO_MSG_NOWAIT; + + ret = vfio_user_send_queued(proxy, msg); + if (ret < 0) { + vfio_user_recycle(proxy, msg); + return; + } + + proxy->last_nowait = msg; +} + +void vfio_user_send_wait(VFIOUserProxy *proxy, VFIOUserHdr *hdr, + VFIOUserFDs *fds, int rsize) { VFIOUserMsg *msg; bool iolock = false; @@ -713,6 +738,60 @@ static void vfio_user_send_wait(VFIOUserProxy *proxy, VFIOUserHdr *hdr, } } +void vfio_user_wait_reqs(VFIOUserProxy *proxy) +{ + VFIOUserMsg *msg; + bool iolock = false; + + /* + * Any DMA map/unmap requests sent in the middle + * of a memory region transaction were sent nowait. + * Wait for them here. + */ + qemu_mutex_lock(&proxy->lock); + if (proxy->last_nowait != NULL) { + iolock = bql_locked(); + if (iolock) { + bql_unlock(); + } + + /* + * Change type to WAIT to wait for reply + */ + msg = proxy->last_nowait; + msg->type = VFIO_MSG_WAIT; + proxy->last_nowait = NULL; + while (!msg->complete) { + if (!qemu_cond_timedwait(&msg->cv, &proxy->lock, wait_time)) { + VFIOUserMsgQ *list; + + list = msg->pending ? &proxy->pending : &proxy->outgoing; + QTAILQ_REMOVE(list, msg, next); + error_printf("vfio_wait_reqs - timed out\n"); + break; + } + } + + if (msg->hdr->flags & VFIO_USER_ERROR) { + error_printf("vfio_user_wait_reqs - error reply on async "); + error_printf("request: command %x error %s\n", msg->hdr->command, + strerror(msg->hdr->error_reply)); + } + + /* + * Change type back to NOWAIT to free + */ + msg->type = VFIO_MSG_NOWAIT; + vfio_user_recycle(proxy, msg); + } + + /* lock order is BQL->proxy - don't hold proxy when getting BQL */ + qemu_mutex_unlock(&proxy->lock); + if (iolock) { + bql_lock(); + } +} + static QLIST_HEAD(, VFIOUserProxy) vfio_user_sockets = QLIST_HEAD_INITIALIZER(vfio_user_sockets); @@ -847,8 +926,8 @@ void vfio_user_disconnect(VFIOUserProxy *proxy) g_free(proxy); } -static void vfio_user_request_msg(VFIOUserHdr *hdr, uint16_t cmd, - uint32_t size, uint32_t flags) +void vfio_user_request_msg(VFIOUserHdr *hdr, uint16_t cmd, + uint32_t size, uint32_t flags) { static uint16_t next_id; diff --git a/hw/vfio/user.h b/hw/vfio/user.h index 9039e96069..31d2c5abd9 100644 --- a/hw/vfio/user.h +++ b/hw/vfio/user.h @@ -75,6 +75,7 @@ typedef struct VFIOUserProxy { QemuCond close_cv; AioContext *ctx; QEMUBH *req_bh; + bool async_ops; /* * above only changed when BQL is held @@ -106,4 +107,13 @@ void vfio_user_set_handler(VFIODevice *vbasedev, bool vfio_user_validate_version(VFIOUserProxy *proxy, Error **errp); int vfio_user_get_info(VFIOUserProxy *proxy, struct vfio_device_info *info); +VFIOUserFDs *vfio_user_getfds(int numfds); +void vfio_user_request_msg(VFIOUserHdr *hdr, uint16_t cmd, + uint32_t size, uint32_t flags); +void vfio_user_wait_reqs(VFIOUserProxy *proxy); +void vfio_user_send_nowait(VFIOUserProxy *proxy, VFIOUserHdr *hdr, + VFIOUserFDs *fds, int rsize); +void vfio_user_send_wait(VFIOUserProxy *proxy, VFIOUserHdr *hdr, + VFIOUserFDs *fds, int rsize); + #endif /* VFIO_USER_H */ From patchwork Wed May 29 16:23:15 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: John Levon X-Patchwork-Id: 13679242 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from lists.gnu.org (lists.gnu.org [209.51.188.17]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.lore.kernel.org (Postfix) with ESMTPS id 624F3C25B7C for ; Wed, 29 May 
2024 16:30:27 +0000 (UTC) Received: from localhost ([::1] helo=lists1p.gnu.org) by lists.gnu.org with esmtp (Exim 4.90_1) (envelope-from ) id 1sCM7T-0001o1-PF; Wed, 29 May 2024 12:25:43 -0400 Received: from eggs.gnu.org ([2001:470:142:3::10]) by lists.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM75-0001IB-4I for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:23 -0400 Received: from ssh.movementarian.org ([139.162.205.133] helo=movementarian.org) by eggs.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM72-0006N6-Nt for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:18 -0400 Received: from movement by movementarian.org with local (Exim 4.95) (envelope-from ) id 1sCM6r-006CR0-9r; Wed, 29 May 2024 17:25:05 +0100 From: John Levon To: qemu-devel@nongnu.org Cc: alex.williamson@redhat.com, clg@redhat.com, jag.raman@oracle.com, thanos.makatos@nutanix.com, John Johnson , Elena Ufimtseva , John Levon Subject: [PATCH 22/26] vfio-user: no-mmap DMA support Date: Wed, 29 May 2024 17:23:15 +0100 Message-Id: <20240529162319.1476680-23-levon@movementarian.org> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20240529162319.1476680-1-levon@movementarian.org> References: <20240529162319.1476680-1-levon@movementarian.org> MIME-Version: 1.0 Received-SPF: pass client-ip=139.162.205.133; envelope-from=movement@movementarian.org; helo=movementarian.org X-Spam_score_int: -18 X-Spam_score: -1.9 X-Spam_bar: - X-Spam_report: (-1.9 / 5.0 requ) BAYES_00=-1.9, SPF_HELO_PASS=-0.001, SPF_PASS=-0.001, T_SCC_BODY_TEXT_LINE=-0.01 autolearn=ham autolearn_force=no X-Spam_action: no action X-BeenThere: qemu-devel@nongnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org Sender: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org From: Jagannathan Raman Force remote process to use DMA r/w messages instead of directly mapping guest memory. Originally-by: John Johnson Signed-off-by: Elena Ufimtseva Signed-off-by: Jagannathan Raman Signed-off-by: John Levon --- hw/vfio/user-container.c | 2 +- hw/vfio/user-pci.c | 5 +++++ hw/vfio/user.h | 1 + 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/hw/vfio/user-container.c b/hw/vfio/user-container.c index 213861fb5d..1f6710e0b7 100644 --- a/hw/vfio/user-container.c +++ b/hw/vfio/user-container.c @@ -110,7 +110,7 @@ static int vfio_user_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova, * vaddr enters as a QEMU process address; make it either a file offset * for mapped areas or leave as 0. 
*/ - if (fd != -1) { + if (fd != -1 && !(container->proxy->flags & VFIO_PROXY_NO_MMAP)) { msgp->offset = qemu_ram_block_host_offset(mrp->ram_block, vaddr); } diff --git a/hw/vfio/user-pci.c b/hw/vfio/user-pci.c index 9514f6e2fa..19fbcf77ab 100644 --- a/hw/vfio/user-pci.c +++ b/hw/vfio/user-pci.c @@ -40,6 +40,7 @@ OBJECT_DECLARE_SIMPLE_TYPE(VFIOUserPCIDevice, VFIO_USER_PCI) struct VFIOUserPCIDevice { VFIOPCIDevice device; char *sock_name; + bool no_direct_dma; /* disable shared mem for DMA */ bool send_queued; /* all sends are queued */ bool no_post; /* all regions write are sync */ }; @@ -158,6 +159,9 @@ static void vfio_user_pci_realize(PCIDevice *pdev, Error **errp) vbasedev->proxy = proxy; vfio_user_set_handler(vbasedev, vfio_user_pci_process_req, vdev); + if (udev->no_direct_dma) { + proxy->flags |= VFIO_PROXY_NO_MMAP; + } if (udev->send_queued) { proxy->flags |= VFIO_PROXY_FORCE_QUEUED; } @@ -277,6 +281,7 @@ static void vfio_user_instance_finalize(Object *obj) static Property vfio_user_pci_dev_properties[] = { DEFINE_PROP_STRING("socket", VFIOUserPCIDevice, sock_name), + DEFINE_PROP_BOOL("no-direct-dma", VFIOUserPCIDevice, no_direct_dma, false), DEFINE_PROP_BOOL("x-send-queued", VFIOUserPCIDevice, send_queued, false), DEFINE_PROP_BOOL("x-no-posted-writes", VFIOUserPCIDevice, no_post, false), DEFINE_PROP_END_OF_LIST(), diff --git a/hw/vfio/user.h b/hw/vfio/user.h index 31d2c5abd9..fe24a881f2 100644 --- a/hw/vfio/user.h +++ b/hw/vfio/user.h @@ -94,6 +94,7 @@ typedef struct VFIOUserProxy { /* VFIOProxy flags */ #define VFIO_PROXY_CLIENT 0x1 +#define VFIO_PROXY_NO_MMAP 0x2 #define VFIO_PROXY_FORCE_QUEUED 0x4 #define VFIO_PROXY_NO_POST 0x8 From patchwork Wed May 29 16:23:16 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: John Levon X-Patchwork-Id: 13679264 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from lists.gnu.org (lists.gnu.org [209.51.188.17]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.lore.kernel.org (Postfix) with ESMTPS id C56A4C25B7C for ; Wed, 29 May 2024 16:32:52 +0000 (UTC) Received: from localhost ([::1] helo=lists1p.gnu.org) by lists.gnu.org with esmtp (Exim 4.90_1) (envelope-from ) id 1sCM7W-00020K-Gg; Wed, 29 May 2024 12:25:46 -0400 Received: from eggs.gnu.org ([2001:470:142:3::10]) by lists.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM77-0001JU-Gm for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:23 -0400 Received: from ssh.movementarian.org ([139.162.205.133] helo=movementarian.org) by eggs.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM74-0006NI-JN for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:21 -0400 Received: from movement by movementarian.org with local (Exim 4.95) (envelope-from ) id 1sCM6r-006CR5-DS; Wed, 29 May 2024 17:25:05 +0100 From: John Levon To: qemu-devel@nongnu.org Cc: alex.williamson@redhat.com, clg@redhat.com, jag.raman@oracle.com, thanos.makatos@nutanix.com, John Johnson , Elena Ufimtseva , John Levon Subject: [PATCH 23/26] vfio-user: dma read/write operations Date: Wed, 29 May 2024 17:23:16 +0100 Message-Id: <20240529162319.1476680-24-levon@movementarian.org> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20240529162319.1476680-1-levon@movementarian.org> References: <20240529162319.1476680-1-levon@movementarian.org> 
MIME-Version: 1.0 Received-SPF: pass client-ip=139.162.205.133; envelope-from=movement@movementarian.org; helo=movementarian.org X-Spam_score_int: -18 X-Spam_score: -1.9 X-Spam_bar: - X-Spam_report: (-1.9 / 5.0 requ) BAYES_00=-1.9, SPF_HELO_PASS=-0.001, SPF_PASS=-0.001, T_SCC_BODY_TEXT_LINE=-0.01 autolearn=ham autolearn_force=no X-Spam_action: no action X-BeenThere: qemu-devel@nongnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org Sender: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org From: Jagannathan Raman Messages from server to client that perform device DMA. Originally-by: John Johnson Signed-off-by: Elena Ufimtseva Signed-off-by: Jagannathan Raman Signed-off-by: John Levon --- hw/vfio/user-pci.c | 110 ++++++++++++++++++++++++++++++++++++++++ hw/vfio/user-protocol.h | 13 ++++- hw/vfio/user.c | 57 +++++++++++++++++++++ hw/vfio/user.h | 3 ++ 4 files changed, 182 insertions(+), 1 deletion(-) diff --git a/hw/vfio/user-pci.c b/hw/vfio/user-pci.c index 19fbcf77ab..9fb6ae5142 100644 --- a/hw/vfio/user-pci.c +++ b/hw/vfio/user-pci.c @@ -101,6 +101,95 @@ static void vfio_user_msix_teardown(VFIOPCIDevice *vdev) vdev->msix->pba_region = NULL; } +static void vfio_user_dma_read(VFIOPCIDevice *vdev, VFIOUserDMARW *msg) +{ + PCIDevice *pdev = &vdev->pdev; + VFIOUserProxy *proxy = vdev->vbasedev.proxy; + VFIOUserDMARW *res; + MemTxResult r; + size_t size; + + if (msg->hdr.size < sizeof(*msg)) { + vfio_user_send_error(proxy, &msg->hdr, EINVAL); + return; + } + if (msg->count > proxy->max_xfer_size) { + vfio_user_send_error(proxy, &msg->hdr, E2BIG); + return; + } + + /* switch to our own message buffer */ + size = msg->count + sizeof(VFIOUserDMARW); + res = g_malloc0(size); + memcpy(res, msg, sizeof(*res)); + g_free(msg); + + r = pci_dma_read(pdev, res->offset, &res->data, res->count); + + switch (r) { + case MEMTX_OK: + if (res->hdr.flags & VFIO_USER_NO_REPLY) { + g_free(res); + return; + } + vfio_user_send_reply(proxy, &res->hdr, size); + break; + case MEMTX_ERROR: + vfio_user_send_error(proxy, &res->hdr, EFAULT); + break; + case MEMTX_DECODE_ERROR: + vfio_user_send_error(proxy, &res->hdr, ENODEV); + break; + case MEMTX_ACCESS_ERROR: + vfio_user_send_error(proxy, &res->hdr, EPERM); + break; + default: + error_printf("vfio_user_dma_read unknown error %d\n", r); + vfio_user_send_error(vdev->vbasedev.proxy, &res->hdr, EINVAL); + } +} + +static void vfio_user_dma_write(VFIOPCIDevice *vdev, VFIOUserDMARW *msg) +{ + PCIDevice *pdev = &vdev->pdev; + VFIOUserProxy *proxy = vdev->vbasedev.proxy; + MemTxResult r; + + if (msg->hdr.size < sizeof(*msg)) { + vfio_user_send_error(proxy, &msg->hdr, EINVAL); + return; + } + /* make sure transfer count isn't larger than the message data */ + if (msg->count > msg->hdr.size - sizeof(*msg)) { + vfio_user_send_error(proxy, &msg->hdr, E2BIG); + return; + } + + r = pci_dma_write(pdev, msg->offset, &msg->data, msg->count); + + switch (r) { + case MEMTX_OK: + if ((msg->hdr.flags & VFIO_USER_NO_REPLY) == 0) { + vfio_user_send_reply(proxy, &msg->hdr, sizeof(msg->hdr)); + } else { + g_free(msg); + } + break; + case MEMTX_ERROR: + vfio_user_send_error(proxy, &msg->hdr, EFAULT); + break; + case MEMTX_DECODE_ERROR: + vfio_user_send_error(proxy, &msg->hdr, ENODEV); + break; + case MEMTX_ACCESS_ERROR: + vfio_user_send_error(proxy, &msg->hdr, EPERM); + break; + default: + error_printf("vfio_user_dma_write unknown error 
%d\n", r); + vfio_user_send_error(vdev->vbasedev.proxy, &msg->hdr, EINVAL); + } +} + /* * Incoming request message callback. * @@ -108,7 +197,28 @@ static void vfio_user_msix_teardown(VFIOPCIDevice *vdev) */ static void vfio_user_pci_process_req(void *opaque, VFIOUserMsg *msg) { + VFIOPCIDevice *vdev = opaque; + VFIOUserHdr *hdr = msg->hdr; + + /* no incoming PCI requests pass FDs */ + if (msg->fds != NULL) { + vfio_user_send_error(vdev->vbasedev.proxy, hdr, EINVAL); + vfio_user_putfds(msg); + return; + } + switch (hdr->command) { + case VFIO_USER_DMA_READ: + vfio_user_dma_read(vdev, (VFIOUserDMARW *)hdr); + break; + case VFIO_USER_DMA_WRITE: + vfio_user_dma_write(vdev, (VFIOUserDMARW *)hdr); + break; + default: + error_printf("vfio_user_pci_process_req unknown cmd %d\n", + hdr->command); + vfio_user_send_error(vdev->vbasedev.proxy, hdr, ENOSYS); + } } /* diff --git a/hw/vfio/user-protocol.h b/hw/vfio/user-protocol.h index 9b569156fa..607e0f4b7f 100644 --- a/hw/vfio/user-protocol.h +++ b/hw/vfio/user-protocol.h @@ -203,7 +203,18 @@ typedef struct { char data[]; } VFIOUserRegionRW; -/*imported from struct vfio_bitmap */ +/* + * VFIO_USER_DMA_READ + * VFIO_USER_DMA_WRITE + */ +typedef struct { + VFIOUserHdr hdr; + uint64_t offset; + uint32_t count; + char data[]; +} VFIOUserDMARW; + +/* imported from struct vfio_bitmap */ typedef struct { uint64_t pgsize; uint64_t size; diff --git a/hw/vfio/user.c b/hw/vfio/user.c index adecd9a68a..a383922601 100644 --- a/hw/vfio/user.c +++ b/hw/vfio/user.c @@ -383,6 +383,10 @@ static int vfio_user_recv_one(VFIOUserProxy *proxy) *msg->hdr = hdr; data = (char *)msg->hdr + sizeof(hdr); } else { + if (hdr.size > proxy->max_xfer_size + sizeof(VFIOUserDMARW)) { + error_setg(&local_err, "vfio_user_recv request larger than max"); + goto err; + } buf = g_malloc0(hdr.size); memcpy(buf, &hdr, sizeof(hdr)); data = buf + sizeof(hdr); @@ -792,6 +796,59 @@ void vfio_user_wait_reqs(VFIOUserProxy *proxy) } } +/* + * Reply to an incoming request. + */ +void vfio_user_send_reply(VFIOUserProxy *proxy, VFIOUserHdr *hdr, int size) +{ + + if (size < sizeof(VFIOUserHdr)) { + error_printf("vfio_user_send_reply - size too small\n"); + g_free(hdr); + return; + } + + /* + * convert header to associated reply + */ + hdr->flags = VFIO_USER_REPLY; + hdr->size = size; + + vfio_user_send_async(proxy, hdr, NULL); +} + +/* + * Send an error reply to an incoming request. + */ +void vfio_user_send_error(VFIOUserProxy *proxy, VFIOUserHdr *hdr, int error) +{ + + /* + * convert header to associated reply + */ + hdr->flags = VFIO_USER_REPLY; + hdr->flags |= VFIO_USER_ERROR; + hdr->error_reply = error; + hdr->size = sizeof(*hdr); + + vfio_user_send_async(proxy, hdr, NULL); +} + +/* + * Close FDs erroneously received in an incoming request. 
+ */ +void vfio_user_putfds(VFIOUserMsg *msg) +{ + VFIOUserFDs *fds = msg->fds; + int i; + + for (i = 0; i < fds->recv_fds; i++) { + close(fds->fds[i]); + } + g_free(fds); + msg->fds = NULL; +} + static QLIST_HEAD(, VFIOUserProxy) vfio_user_sockets = QLIST_HEAD_INITIALIZER(vfio_user_sockets); diff --git a/hw/vfio/user.h b/hw/vfio/user.h index fe24a881f2..fa6bc9a9d6 100644 --- a/hw/vfio/user.h +++ b/hw/vfio/user.h @@ -116,5 +116,8 @@ void vfio_user_send_nowait(VFIOUserProxy *proxy, VFIOUserHdr *hdr, VFIOUserFDs *fds, int rsize); void vfio_user_send_wait(VFIOUserProxy *proxy, VFIOUserHdr *hdr, VFIOUserFDs *fds, int rsize); +void vfio_user_send_reply(VFIOUserProxy *proxy, VFIOUserHdr *hdr, int size); +void vfio_user_send_error(VFIOUserProxy *proxy, VFIOUserHdr *hdr, int error); +void vfio_user_putfds(VFIOUserMsg *msg); #endif /* VFIO_USER_H */ From patchwork Wed May 29 16:23:17 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: John Levon X-Patchwork-Id: 13679243 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from lists.gnu.org (lists.gnu.org [209.51.188.17]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.lore.kernel.org (Postfix) with ESMTPS id 72150C41513 for ; Wed, 29 May 2024 16:30:37 +0000 (UTC) Received: from localhost ([::1] helo=lists1p.gnu.org) by lists.gnu.org with esmtp (Exim 4.90_1) (envelope-from ) id 1sCM7U-0001t4-Cf; Wed, 29 May 2024 12:25:44 -0400 Received: from eggs.gnu.org ([2001:470:142:3::10]) by lists.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM77-0001Id-6r for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:23 -0400 Received: from ssh.movementarian.org ([139.162.205.133] helo=movementarian.org) by eggs.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM75-0006NW-0v for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:20 -0400 Received: from movement by movementarian.org with local (Exim 4.95) (envelope-from ) id 1sCM6r-006CR8-TE; Wed, 29 May 2024 17:25:05 +0100 From: John Levon To: qemu-devel@nongnu.org Cc: alex.williamson@redhat.com, clg@redhat.com, jag.raman@oracle.com, thanos.makatos@nutanix.com, John Johnson , Elena Ufimtseva , John Levon Subject: [PATCH 24/26] vfio-user: pci reset Date: Wed, 29 May 2024 17:23:17 +0100 Message-Id: <20240529162319.1476680-25-levon@movementarian.org> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20240529162319.1476680-1-levon@movementarian.org> References: <20240529162319.1476680-1-levon@movementarian.org> MIME-Version: 1.0 Received-SPF: pass client-ip=139.162.205.133; envelope-from=movement@movementarian.org; helo=movementarian.org X-Spam_score_int: -18 X-Spam_score: -1.9 X-Spam_bar: - X-Spam_report: (-1.9 / 5.0 requ) BAYES_00=-1.9, SPF_HELO_PASS=-0.001, SPF_PASS=-0.001, T_SCC_BODY_TEXT_LINE=-0.01 autolearn=ham autolearn_force=no X-Spam_action: no action X-BeenThere: qemu-devel@nongnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org Sender: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org From: Jagannathan Raman Message to tell the server to reset the device. 
Originally-by: John Johnson Signed-off-by: Elena Ufimtseva Signed-off-by: Jagannathan Raman Signed-off-by: John Levon --- hw/vfio/user-pci.c | 15 +++++++++++++++ hw/vfio/user.c | 12 ++++++++++++ hw/vfio/user.h | 1 + 3 files changed, 28 insertions(+) diff --git a/hw/vfio/user-pci.c b/hw/vfio/user-pci.c index 9fb6ae5142..b473cbc404 100644 --- a/hw/vfio/user-pci.c +++ b/hw/vfio/user-pci.c @@ -389,6 +389,20 @@ static void vfio_user_instance_finalize(Object *obj) } } +static void vfio_user_pci_reset(DeviceState *dev) +{ + VFIOPCIDevice *vdev = VFIO_PCI_BASE(dev); + VFIODevice *vbasedev = &vdev->vbasedev; + + vfio_pci_pre_reset(vdev); + + if (vbasedev->reset_works) { + vfio_user_reset(vbasedev->proxy); + } + + vfio_pci_post_reset(vdev); +} + static Property vfio_user_pci_dev_properties[] = { DEFINE_PROP_STRING("socket", VFIOUserPCIDevice, sock_name), DEFINE_PROP_BOOL("no-direct-dma", VFIOUserPCIDevice, no_direct_dma, false), @@ -402,6 +416,7 @@ static void vfio_user_pci_dev_class_init(ObjectClass *klass, void *data) DeviceClass *dc = DEVICE_CLASS(klass); PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass); + dc->reset = vfio_user_pci_reset; device_class_set_props(dc, vfio_user_pci_dev_properties); dc->desc = "VFIO over socket PCI device assignment"; pdc->realize = vfio_user_pci_realize; diff --git a/hw/vfio/user.c b/hw/vfio/user.c index a383922601..3a38aa5543 100644 --- a/hw/vfio/user.c +++ b/hw/vfio/user.c @@ -1518,6 +1518,18 @@ static int vfio_user_region_write(VFIOUserProxy *proxy, uint8_t index, return ret; } +void vfio_user_reset(VFIOUserProxy *proxy) +{ + VFIOUserHdr msg; + + vfio_user_request_msg(&msg, VFIO_USER_DEVICE_RESET, sizeof(msg), 0); + + vfio_user_send_wait(proxy, &msg, NULL, 0); + if (msg.flags & VFIO_USER_ERROR) { + error_printf("reset reply error %d\n", msg.error_reply); + } +} + /* * Socket-based io_ops diff --git a/hw/vfio/user.h b/hw/vfio/user.h index fa6bc9a9d6..d9aa1759df 100644 --- a/hw/vfio/user.h +++ b/hw/vfio/user.h @@ -119,5 +119,6 @@ void vfio_user_send_wait(VFIOUserProxy *proxy, VFIOUserHdr *hdr, void vfio_user_send_reply(VFIOUserProxy *proxy, VFIOUserHdr *hdr, int size); void vfio_user_send_error(VFIOUserProxy *proxy, VFIOUserHdr *hdr, int error); void vfio_user_putfds(VFIOUserMsg *msg); +void vfio_user_reset(VFIOUserProxy *proxy); #endif /* VFIO_USER_H */ From patchwork Wed May 29 16:23:18 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: John Levon X-Patchwork-Id: 13679235 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from lists.gnu.org (lists.gnu.org [209.51.188.17]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.lore.kernel.org (Postfix) with ESMTPS id 78779C41513 for ; Wed, 29 May 2024 16:27:53 +0000 (UTC) Received: from localhost ([::1] helo=lists1p.gnu.org) by lists.gnu.org with esmtp (Exim 4.90_1) (envelope-from ) id 1sCM7Z-00024e-BM; Wed, 29 May 2024 12:25:49 -0400 Received: from eggs.gnu.org ([2001:470:142:3::10]) by lists.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM76-0001IZ-GW for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:23 -0400 Received: from ssh.movementarian.org ([139.162.205.133] helo=movementarian.org) by eggs.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM74-0006Np-EL for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:19 -0400 
Received: from movement by movementarian.org with local (Exim 4.95) (envelope-from ) id 1sCM6t-006CRE-4c; Wed, 29 May 2024 17:25:07 +0100 From: John Levon To: qemu-devel@nongnu.org Cc: alex.williamson@redhat.com, clg@redhat.com, jag.raman@oracle.com, thanos.makatos@nutanix.com, John Johnson , Elena Ufimtseva , John Levon Subject: [PATCH 25/26] vfio-user: add 'x-msg-timeout' option that specifies msg wait times Date: Wed, 29 May 2024 17:23:18 +0100 Message-Id: <20240529162319.1476680-26-levon@movementarian.org> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20240529162319.1476680-1-levon@movementarian.org> References: <20240529162319.1476680-1-levon@movementarian.org> MIME-Version: 1.0 Received-SPF: pass client-ip=139.162.205.133; envelope-from=movement@movementarian.org; helo=movementarian.org X-Spam_score_int: -18 X-Spam_score: -1.9 X-Spam_bar: - X-Spam_report: (-1.9 / 5.0 requ) BAYES_00=-1.9, SPF_HELO_PASS=-0.001, SPF_PASS=-0.001, T_SCC_BODY_TEXT_LINE=-0.01 autolearn=ham autolearn_force=no X-Spam_action: no action X-BeenThere: qemu-devel@nongnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org Sender: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org From: Jagannathan Raman Originally-by: John Johnson Signed-off-by: Elena Ufimtseva Signed-off-by: Jagannathan Raman Signed-off-by: John Levon --- hw/vfio/user-pci.c | 4 ++++ hw/vfio/user.c | 7 ++++--- hw/vfio/user.h | 1 + 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/hw/vfio/user-pci.c b/hw/vfio/user-pci.c index b473cbc404..b70aeb6ee3 100644 --- a/hw/vfio/user-pci.c +++ b/hw/vfio/user-pci.c @@ -43,6 +43,7 @@ struct VFIOUserPCIDevice { bool no_direct_dma; /* disable shared mem for DMA */ bool send_queued; /* all sends are queued */ bool no_post; /* all regions write are sync */ + uint32_t wait_time; /* timeout for message replies */ }; /* @@ -278,6 +279,8 @@ static void vfio_user_pci_realize(PCIDevice *pdev, Error **errp) if (udev->no_post) { proxy->flags |= VFIO_PROXY_NO_POST; } + /* user specified or 5 sec default */ + proxy->wait_time = udev->wait_time; if (!vfio_user_validate_version(proxy, errp)) { goto error; @@ -408,6 +411,7 @@ static Property vfio_user_pci_dev_properties[] = { DEFINE_PROP_BOOL("no-direct-dma", VFIOUserPCIDevice, no_direct_dma, false), DEFINE_PROP_BOOL("x-send-queued", VFIOUserPCIDevice, send_queued, false), DEFINE_PROP_BOOL("x-no-posted-writes", VFIOUserPCIDevice, no_post, false), + DEFINE_PROP_UINT32("x-msg-timeout", VFIOUserPCIDevice, wait_time, 5000), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/vfio/user.c b/hw/vfio/user.c index 3a38aa5543..f13381f310 100644 --- a/hw/vfio/user.c +++ b/hw/vfio/user.c @@ -43,7 +43,6 @@ #define VFIO_USER_MAX_REGIONS 100 #define VFIO_USER_MAX_IRQS 50 -static int wait_time = 5000; /* wait up to 5 sec for busy servers */ static IOThread *vfio_user_iothread; static void vfio_user_shutdown(VFIOUserProxy *proxy); @@ -723,7 +722,8 @@ void vfio_user_send_wait(VFIOUserProxy *proxy, VFIOUserHdr *hdr, if (ret == 0) { while (!msg->complete) { - if (!qemu_cond_timedwait(&msg->cv, &proxy->lock, wait_time)) { + if (!qemu_cond_timedwait(&msg->cv, &proxy->lock, + proxy->wait_time)) { VFIOUserMsgQ *list; list = msg->pending ? 
&proxy->pending : &proxy->outgoing; @@ -766,7 +766,8 @@ void vfio_user_wait_reqs(VFIOUserProxy *proxy) msg->type = VFIO_MSG_WAIT; proxy->last_nowait = NULL; while (!msg->complete) { - if (!qemu_cond_timedwait(&msg->cv, &proxy->lock, wait_time)) { + if (!qemu_cond_timedwait(&msg->cv, &proxy->lock, + proxy->wait_time)) { VFIOUserMsgQ *list; list = msg->pending ? &proxy->pending : &proxy->outgoing; diff --git a/hw/vfio/user.h b/hw/vfio/user.h index d9aa1759df..ff2aa005eb 100644 --- a/hw/vfio/user.h +++ b/hw/vfio/user.h @@ -72,6 +72,7 @@ typedef struct VFIOUserProxy { uint64_t max_bitmap; uint64_t migr_pgsize; int flags; + uint32_t wait_time; QemuCond close_cv; AioContext *ctx; QEMUBH *req_bh; From patchwork Wed May 29 16:23:19 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: John Levon X-Patchwork-Id: 13679245 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from lists.gnu.org (lists.gnu.org [209.51.188.17]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.lore.kernel.org (Postfix) with ESMTPS id 880FBC25B75 for ; Wed, 29 May 2024 16:31:10 +0000 (UTC) Received: from localhost ([::1] helo=lists1p.gnu.org) by lists.gnu.org with esmtp (Exim 4.90_1) (envelope-from ) id 1sCM7U-0001ud-Oc; Wed, 29 May 2024 12:25:44 -0400 Received: from eggs.gnu.org ([2001:470:142:3::10]) by lists.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM7A-0001Ku-IX for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:25 -0400 Received: from ssh.movementarian.org ([139.162.205.133] helo=movementarian.org) by eggs.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1sCM74-0006Nr-GT for qemu-devel@nongnu.org; Wed, 29 May 2024 12:25:24 -0400 Received: from movement by movementarian.org with local (Exim 4.95) (envelope-from ) id 1sCM6t-006CRU-Ct; Wed, 29 May 2024 17:25:07 +0100 From: John Levon To: qemu-devel@nongnu.org Cc: alex.williamson@redhat.com, clg@redhat.com, jag.raman@oracle.com, thanos.makatos@nutanix.com, John Johnson , Elena Ufimtseva , John Levon Subject: [PATCH 26/26] vfio-user: add coalesced posted writes Date: Wed, 29 May 2024 17:23:19 +0100 Message-Id: <20240529162319.1476680-27-levon@movementarian.org> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20240529162319.1476680-1-levon@movementarian.org> References: <20240529162319.1476680-1-levon@movementarian.org> MIME-Version: 1.0 Received-SPF: pass client-ip=139.162.205.133; envelope-from=movement@movementarian.org; helo=movementarian.org X-Spam_score_int: -18 X-Spam_score: -1.9 X-Spam_bar: - X-Spam_report: (-1.9 / 5.0 requ) BAYES_00=-1.9, SPF_HELO_PASS=-0.001, SPF_PASS=-0.001, T_SCC_BODY_TEXT_LINE=-0.01 autolearn=ham autolearn_force=no X-Spam_action: no action X-BeenThere: qemu-devel@nongnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org Sender: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org From: Jagannathan Raman Add new message to send multiple writes to server. Prevents the outgoing queue from overflowing when a long latency operation is followed by a series of posted writes. 
Originally-by: John Johnson Signed-off-by: Elena Ufimtseva Signed-off-by: Jagannathan Raman Signed-off-by: John Levon --- hw/vfio/trace-events | 1 + hw/vfio/user-protocol.h | 21 +++++++ hw/vfio/user.c | 130 +++++++++++++++++++++++++++++++++++++++- hw/vfio/user.h | 7 +++ 4 files changed, 157 insertions(+), 2 deletions(-) diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 387751bd7f..2e6f0e6240 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -188,6 +188,7 @@ vfio_user_get_region_info(uint32_t index, uint32_t flags, uint64_t size) " index vfio_user_region_rw(uint32_t region, uint64_t off, uint32_t count) " region %d offset 0x%"PRIx64" count %d" vfio_user_get_irq_info(uint32_t index, uint32_t flags, uint32_t count) " index %d flags 0x%x count %d" vfio_user_set_irqs(uint32_t index, uint32_t start, uint32_t count, uint32_t flags) " index %d start %d count %d flags 0x%x" +vfio_user_wrmulti(const char *s, uint64_t wr_cnt) " %s count 0x%"PRIx64 # user-container.c vfio_user_dma_map(uint64_t iova, uint64_t size, uint64_t off, uint32_t flags, bool async_ops) " iova 0x%"PRIx64" size 0x%"PRIx64" off 0x%"PRIx64" flags 0x%x async_ops %d" diff --git a/hw/vfio/user-protocol.h b/hw/vfio/user-protocol.h index 607e0f4b7f..22e3265f58 100644 --- a/hw/vfio/user-protocol.h +++ b/hw/vfio/user-protocol.h @@ -42,6 +42,7 @@ enum vfio_user_command { VFIO_USER_DMA_WRITE = 12, VFIO_USER_DEVICE_RESET = 13, VFIO_USER_DIRTY_PAGES = 14, + VFIO_USER_REGION_WRITE_MULTI = 15, VFIO_USER_MAX, }; @@ -75,6 +76,7 @@ typedef struct { #define VFIO_USER_CAP_PGSIZES "pgsizes" #define VFIO_USER_CAP_MAP_MAX "max_dma_maps" #define VFIO_USER_CAP_MIGR "migration" +#define VFIO_USER_CAP_MULTI "write_multiple" /* "migration" members */ #define VFIO_USER_CAP_PGSIZE "pgsize" @@ -221,4 +223,23 @@ typedef struct { char data[]; } VFIOUserBitmap; +/* + * VFIO_USER_REGION_WRITE_MULTI + */ +#define VFIO_USER_MULTI_DATA 8 +#define VFIO_USER_MULTI_MAX 200 + +typedef struct { + uint64_t offset; + uint32_t region; + uint32_t count; + char data[VFIO_USER_MULTI_DATA]; +} VFIOUserWROne; + +typedef struct { + VFIOUserHdr hdr; + uint64_t wr_cnt; + VFIOUserWROne wrs[VFIO_USER_MULTI_MAX]; +} VFIOUserWRMulti; + #endif /* VFIO_USER_PROTOCOL_H */ diff --git a/hw/vfio/user.c b/hw/vfio/user.c index f13381f310..b866c84b4c 100644 --- a/hw/vfio/user.c +++ b/hw/vfio/user.c @@ -61,6 +61,7 @@ static void vfio_user_request(void *opaque); static int vfio_user_send_queued(VFIOUserProxy *proxy, VFIOUserMsg *msg); static void vfio_user_send_async(VFIOUserProxy *proxy, VFIOUserHdr *hdr, VFIOUserFDs *fds); +static void vfio_user_flush_multi(VFIOUserProxy *proxy); static inline void vfio_user_set_error(VFIOUserHdr *hdr, uint32_t err) { @@ -465,6 +466,11 @@ static void vfio_user_send(void *opaque) } qio_channel_set_aio_fd_handler(proxy->ioc, proxy->ctx, vfio_user_recv, NULL, NULL, proxy); + + /* queue empty - send any pending multi write msgs */ + if (proxy->wr_multi != NULL) { + vfio_user_flush_multi(proxy); + } } } @@ -485,6 +491,7 @@ static int vfio_user_send_one(VFIOUserProxy *proxy) } QTAILQ_REMOVE(&proxy->outgoing, msg, next); + proxy->num_outgoing--; if (msg->type == VFIO_MSG_ASYNC) { vfio_user_recycle(proxy, msg); } else { @@ -592,11 +599,18 @@ static int vfio_user_send_queued(VFIOUserProxy *proxy, VFIOUserMsg *msg) { int ret; + /* older coalesced writes go first */ + if (proxy->wr_multi != NULL && + ((msg->hdr->flags & VFIO_USER_TYPE) == VFIO_USER_REQUEST)) { + vfio_user_flush_multi(proxy); + } + /* * Unsent outgoing msgs - add to tail */ if 
(!QTAILQ_EMPTY(&proxy->outgoing)) { QTAILQ_INSERT_TAIL(&proxy->outgoing, msg, next); + proxy->num_outgoing++; return 0; } @@ -610,6 +624,7 @@ static int vfio_user_send_queued(VFIOUserProxy *proxy, VFIOUserMsg *msg) } if (ret == QIO_CHANNEL_ERR_BLOCK) { QTAILQ_INSERT_HEAD(&proxy->outgoing, msg, next); + proxy->num_outgoing = 1; qio_channel_set_aio_fd_handler(proxy->ioc, proxy->ctx, vfio_user_recv, proxy->ctx, vfio_user_send, proxy); @@ -1149,12 +1164,27 @@ static bool check_migr(VFIOUserProxy *proxy, QObject *qobj, Error **errp) return caps_parse(proxy, qdict, caps_migr, errp); } +static bool check_multi(VFIOUserProxy *proxy, QObject *qobj, Error **errp) +{ + QBool *qb = qobject_to(QBool, qobj); + + if (qb == NULL) { + error_setg(errp, "malformed %s", VFIO_USER_CAP_MULTI); + return false; + } + if (qbool_get_bool(qb)) { + proxy->flags |= VFIO_PROXY_USE_MULTI; + } + return true; +} + static struct cap_entry caps_cap[] = { { VFIO_USER_CAP_MAX_FDS, check_max_fds }, { VFIO_USER_CAP_MAX_XFER, check_max_xfer }, { VFIO_USER_CAP_PGSIZES, check_pgsizes }, { VFIO_USER_CAP_MAP_MAX, check_max_dma }, { VFIO_USER_CAP_MIGR, check_migr }, + { VFIO_USER_CAP_MULTI, check_multi }, { NULL } }; @@ -1213,6 +1243,7 @@ static GString *caps_json(void) qdict_put_int(capdict, VFIO_USER_CAP_MAX_XFER, VFIO_USER_DEF_MAX_XFER); qdict_put_int(capdict, VFIO_USER_CAP_PGSIZES, VFIO_USER_DEF_PGSIZE); qdict_put_int(capdict, VFIO_USER_CAP_MAP_MAX, VFIO_USER_DEF_MAP_MAX); + qdict_put_bool(capdict, VFIO_USER_CAP_MULTI, true); qdict_put_obj(dict, VFIO_USER_CAP, QOBJECT(capdict)); @@ -1481,19 +1512,114 @@ static int vfio_user_region_read(VFIOUserProxy *proxy, uint8_t index, return msgp->count; } +static void vfio_user_flush_multi(VFIOUserProxy *proxy) +{ + VFIOUserMsg *msg; + VFIOUserWRMulti *wm = proxy->wr_multi; + int ret; + + proxy->wr_multi = NULL; + + /* adjust size for actual # of writes */ + wm->hdr.size -= (VFIO_USER_MULTI_MAX - wm->wr_cnt) * sizeof(VFIOUserWROne); + + msg = vfio_user_getmsg(proxy, &wm->hdr, NULL); + msg->id = wm->hdr.id; + msg->rsize = 0; + msg->type = VFIO_MSG_ASYNC; + trace_vfio_user_wrmulti("flush", wm->wr_cnt); + + ret = vfio_user_send_queued(proxy, msg); + if (ret < 0) { + vfio_user_recycle(proxy, msg); + } +} + +static void vfio_user_create_multi(VFIOUserProxy *proxy) +{ + VFIOUserWRMulti *wm; + + wm = g_malloc0(sizeof(*wm)); + vfio_user_request_msg(&wm->hdr, VFIO_USER_REGION_WRITE_MULTI, + sizeof(*wm), VFIO_USER_NO_REPLY); + proxy->wr_multi = wm; +} + +static void vfio_user_add_multi(VFIOUserProxy *proxy, uint8_t index, + off_t offset, uint32_t count, void *data) +{ + VFIOUserWRMulti *wm = proxy->wr_multi; + VFIOUserWROne *w1 = &wm->wrs[wm->wr_cnt]; + + w1->offset = offset; + w1->region = index; + w1->count = count; + memcpy(&w1->data, data, count); + + wm->wr_cnt++; + trace_vfio_user_wrmulti("add", wm->wr_cnt); + if (wm->wr_cnt == VFIO_USER_MULTI_MAX || + proxy->num_outgoing < VFIO_USER_OUT_LOW) { + vfio_user_flush_multi(proxy); + } +} + static int vfio_user_region_write(VFIOUserProxy *proxy, uint8_t index, off_t offset, uint32_t count, void *data, bool post) { VFIOUserRegionRW *msgp = NULL; - int flags = post ? VFIO_USER_NO_REPLY : 0; + int flags; int size = sizeof(*msgp) + count; + bool can_multi; int ret; if (count > proxy->max_xfer_size) { return -EINVAL; } + if (proxy->flags & VFIO_PROXY_NO_POST) { + post = false; + } + + /* write eligible to be in a WRITE_MULTI msg ? 
*/ + can_multi = (proxy->flags & VFIO_PROXY_USE_MULTI) && post && + count <= VFIO_USER_MULTI_DATA; + + /* + * This should be a rare case, so first check without the lock, + * if we're wrong, vfio_send_queued() will flush any posted writes + * we missed here + */ + if (proxy->wr_multi != NULL || + (proxy->num_outgoing > VFIO_USER_OUT_HIGH && can_multi)) { + + /* + * re-check with lock + * + * if already building a WRITE_MULTI msg, + * add this one if possible else flush pending before + * sending the current one + * + * else if outgoing queue is over the highwater, + * start a new WRITE_MULTI message + */ + WITH_QEMU_LOCK_GUARD(&proxy->lock) { + if (proxy->wr_multi != NULL) { + if (can_multi) { + vfio_user_add_multi(proxy, index, offset, count, data); + return count; + } + vfio_user_flush_multi(proxy); + } else if (proxy->num_outgoing > VFIO_USER_OUT_HIGH && can_multi) { + vfio_user_create_multi(proxy); + vfio_user_add_multi(proxy, index, offset, count, data); + return count; + } + } + } + + flags = post ? VFIO_USER_NO_REPLY : 0; msgp = g_malloc0(size); vfio_user_request_msg(&msgp->hdr, VFIO_USER_REGION_WRITE, size, flags); msgp->offset = offset; @@ -1503,7 +1629,7 @@ static int vfio_user_region_write(VFIOUserProxy *proxy, uint8_t index, trace_vfio_user_region_rw(msgp->region, msgp->offset, msgp->count); /* async send will free msg after it's sent */ - if (post && !(proxy->flags & VFIO_PROXY_NO_POST)) { + if (post) { vfio_user_send_async(proxy, &msgp->hdr, NULL); return count; } diff --git a/hw/vfio/user.h b/hw/vfio/user.h index ff2aa005eb..dc4d41cc0e 100644 --- a/hw/vfio/user.h +++ b/hw/vfio/user.h @@ -90,6 +90,8 @@ typedef struct VFIOUserProxy { VFIOUserMsg *last_nowait; VFIOUserMsg *part_recv; size_t recv_left; + VFIOUserWRMulti *wr_multi; + int num_outgoing; enum proxy_state state; } VFIOUserProxy; @@ -98,6 +100,11 @@ typedef struct VFIOUserProxy { #define VFIO_PROXY_NO_MMAP 0x2 #define VFIO_PROXY_FORCE_QUEUED 0x4 #define VFIO_PROXY_NO_POST 0x8 +#define VFIO_PROXY_USE_MULTI 0x10 + +/* coalescing high and low water marks for VFIOProxy num_outgoing */ +#define VFIO_USER_OUT_HIGH 1024 +#define VFIO_USER_OUT_LOW 128 typedef struct VFIODevice VFIODevice;
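[Editor's note] As a reading aid for the watermark-based coalescing introduced by the last patch above: posted region writes are batched into a single VFIO_USER_REGION_WRITE_MULTI message once the outgoing queue grows past VFIO_USER_OUT_HIGH, and the batch is flushed either when it reaches VFIO_USER_MULTI_MAX entries or once the queue has drained back below VFIO_USER_OUT_LOW. Below is a minimal, self-contained sketch of that decision logic only; it is not the QEMU implementation, and the names queue_depth, batch_count, and post_write() are invented for illustration.

/*
 * Simplified model of the posted-write coalescing decision described in
 * "vfio-user: add coalesced posted writes".  Illustration only: it models
 * the high/low watermark behaviour, not the real VFIOUserProxy state.
 */
#include <stdbool.h>
#include <stdio.h>

#define OUT_HIGH   1024   /* start coalescing above this queue depth */
#define OUT_LOW     128   /* flush once the queue drains below this */
#define MULTI_MAX   200   /* max writes carried by one WRITE_MULTI */

struct coalescer {
    int queue_depth;      /* outstanding outgoing messages */
    int batch_count;      /* writes currently held in the batch */
    bool batching;        /* a WRITE_MULTI message is being built */
};

/* Called for each small posted write; returns true if it was batched. */
static bool post_write(struct coalescer *c)
{
    if (!c->batching && c->queue_depth > OUT_HIGH) {
        c->batching = true;           /* queue is deep: start a batch */
        c->batch_count = 0;
    }
    if (!c->batching) {
        c->queue_depth++;             /* send as an ordinary posted write */
        return false;
    }
    c->batch_count++;
    if (c->batch_count == MULTI_MAX || c->queue_depth < OUT_LOW) {
        printf("flush WRITE_MULTI with %d writes\n", c->batch_count);
        c->queue_depth++;             /* the flushed batch is one message */
        c->batch_count = 0;
        c->batching = false;
    }
    return true;
}

int main(void)
{
    /* pretend the queue is already deep, as after a long-latency op */
    struct coalescer c = { .queue_depth = 1500 };
    int i;

    for (i = 0; i < 300; i++) {
        post_write(&c);
    }
    return 0;
}

Under these assumptions, a burst of 300 posted writes behind a deep queue produces one WRITE_MULTI of 200 entries plus a partially filled batch, instead of 300 individual messages, which is the overflow-avoidance effect the commit message describes.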