| Message ID | 20241012024524.1377836-5-vivek.kasireddy@intel.com (mailing list archive) |
|---|---|
| State | New, archived |
| Series | drm/xe/sriov: Don't migrate dmabuf BO to System RAM while running in VM |
On Fri, Oct 11, 2024 at 07:40:26PM -0700, Vivek Kasireddy wrote: > For BOs of type ttm_bo_type_sg, that are backed by PCI BAR addresses > associated with a VF, we need to adjust and translate these addresses > to LMEM addresses to make the BOs usable by the PF. Otherwise, the BOs > (i.e, PCI BAR addresses) are only accessible by the CPU and not by > the GPU. > > In order to do the above, we first need to identify if the DMA addresses > associated with an imported BO (type ttm_bo_type_sg) belong to System > RAM or a VF or other PCI device. After we confirm that they belong to > a VF, we convert the DMA addresses (IOVAs in this case) to DPAs and > create a new sg and populate it with the new addresses. I think using a SG list is a no-go. We have received pushback before [1] about using a SG list as structure to hold DPA rather than dma-address. The consensus was a SG list is not a generic structure to hold any address [2], rather a specific structure for dma addressess. I'm pretty sure we will have define a new BO type (ttm_bo_type_devmem?) and structure that can iterated on if we want to do something like this unless we want to ignore the above feedback. [1] https://patchwork.freedesktop.org/patch/574894/?series=128910&rev=1 [2] https://patchwork.freedesktop.org/patch/574894/?series=128910&rev=1#comment_1070889 > > Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com> > --- > drivers/gpu/drm/xe/xe_bo.c | 108 ++++++++++++++++++++++++++++++- > drivers/gpu/drm/xe/xe_bo_types.h | 6 ++ > 2 files changed, 113 insertions(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c > index c74c121ea7bb..64efe1b21f19 100644 > --- a/drivers/gpu/drm/xe/xe_bo.c > +++ b/drivers/gpu/drm/xe/xe_bo.c > @@ -6,6 +6,7 @@ > #include "xe_bo.h" > > #include <linux/dma-buf.h> > +#include <linux/iommu.h> > > #include <drm/drm_drv.h> > #include <drm/drm_gem_ttm_helper.h> > @@ -15,16 +16,19 @@ > #include <drm/ttm/ttm_tt.h> > #include <uapi/drm/xe_drm.h> > > +#include "regs/xe_bars.h" > #include "xe_device.h" > #include "xe_dma_buf.h" > #include "xe_drm_client.h" > #include "xe_ggtt.h" > #include "xe_gt.h" > +#include "xe_gt_sriov_pf_config.h" > #include "xe_map.h" > #include "xe_migrate.h" > #include "xe_pm.h" > #include "xe_preempt_fence.h" > #include "xe_res_cursor.h" > +#include "xe_sriov_pf_helpers.h" > #include "xe_trace_bo.h" > #include "xe_ttm_stolen_mgr.h" > #include "xe_vm.h" > @@ -543,6 +547,102 @@ static int xe_bo_trigger_rebind(struct xe_device *xe, struct xe_bo *bo, > return ret; > } > > +static struct pci_dev *xe_find_vf_dev(struct xe_device *xe, > + phys_addr_t phys) > +{ > + struct pci_dev *pdev, *pf_pdev = to_pci_dev(xe->drm.dev); > + resource_size_t io_start, io_size; > + > + list_for_each_entry(pdev, &pf_pdev->bus->devices, bus_list) { > + if (pdev->is_physfn) > + continue; > + > + io_start = pci_resource_start(pdev, LMEM_BAR); > + io_size = pci_resource_len(pdev, LMEM_BAR); > + > + if (phys >= io_start && > + phys < (io_start + io_size - PAGE_SIZE)) > + return pdev; > + } > + return NULL; > +} > + > + > +static void xe_bo_translate_iova_to_dpa(struct xe_device *xe, > + struct sg_table *sg, > + struct sg_table *new_sg, > + struct pci_dev *pdev) > +{ > + resource_size_t io_start = pci_resource_start(pdev, LMEM_BAR); > + struct xe_gt *gt = xe_root_mmio_gt(xe); > + struct scatterlist *sgl, *new_sgl; > + int i, vfid = pci_iov_vf_id(pdev); > + dma_addr_t new_addr, bo_addr; > + struct iommu_domain *domain; > + phys_addr_t phys; > + u64 offset; > + > + bo_addr = 
xe_gt_sriov_pf_config_get_lmem_addr(gt, ++vfid); > + domain = iommu_get_domain_for_dev(xe->drm.dev); > + > + new_sgl = new_sg->sgl; > + for_each_sgtable_dma_sg(sg, sgl, i) { I'm pretty sure this doesn't work if a single dma address of the input 'sg' resolves to a non-contiguous physical DPA. In most cases this is going to be contiguous though unless there is memory pressure or unaligned allocations sizes. Assuming your testing didn't blow up, you may not have hit a memory pressure situation where VRAM gets fragmented. I think only iommu_iova_to_phys is accurate for exactly 1 page unless I'm missing something. See [3]. [3] https://elixir.bootlin.com/linux/v6.11.3/source/drivers/iommu/iommu.c#L2376 Matt > + phys = domain ? iommu_iova_to_phys(domain, sg_dma_address(sgl)) : > + sg_dma_address(sgl); > + offset = phys - io_start; > + new_addr = bo_addr + offset; > + > + sg_set_page(new_sgl, NULL, sg_dma_len(sgl), 0); > + sg_dma_address(new_sgl) = new_addr; > + sg_dma_len(new_sgl) = sg_dma_len(sgl); > + > + new_sgl = sg_next(new_sgl); > + } > +} > + > +static struct sg_table *xe_bo_create_new_sg(struct sg_table *sg, > + struct xe_bo *bo) > +{ > + struct xe_device *xe = xe_bo_device(bo); > + struct iommu_domain *domain; > + struct sg_table *new_sg; > + struct pci_dev *pdev; > + phys_addr_t phys; > + int vfid; > + > + if (!IS_SRIOV_PF(xe)) > + return sg; > + > + domain = iommu_get_domain_for_dev(xe->drm.dev); > + phys = domain ? iommu_iova_to_phys(domain, sg_dma_address(sg->sgl)) : > + sg_dma_address(sg->sgl); > + > + if (page_is_ram(PFN_DOWN(phys))) > + return sg; > + > + pdev = xe_find_vf_dev(xe, phys); > + if (!pdev) > + return sg; > + > + vfid = pci_iov_vf_id(pdev); > + if (vfid < 0) > + return sg; > + > + new_sg = kzalloc(sizeof(*new_sg), GFP_KERNEL); > + if (!new_sg) > + return sg; > + > + if (sg_alloc_table(new_sg, sg->nents, GFP_KERNEL)) { > + kfree(new_sg); > + return sg; > + } > + > + bo->is_devmem_external = true; > + xe_bo_translate_iova_to_dpa(xe, sg, new_sg, pdev); > + > + return new_sg; > +} > + > /* > * The dma-buf map_attachment() / unmap_attachment() is hooked up here. > * Note that unmapping the attachment is deferred to the next > @@ -577,7 +677,7 @@ static int xe_bo_move_dmabuf(struct ttm_buffer_object *ttm_bo, > return PTR_ERR(sg); > > ttm_bo->sg = sg; > - xe_tt->sg = sg; > + xe_tt->sg = xe_bo_create_new_sg(sg, ttm_to_xe_bo(ttm_bo)); > > out: > ttm_bo_move_null(ttm_bo, new_res); > @@ -1066,6 +1166,8 @@ static void xe_ttm_bo_release_notify(struct ttm_buffer_object *ttm_bo) > > static void xe_ttm_bo_delete_mem_notify(struct ttm_buffer_object *ttm_bo) > { > + struct xe_bo *bo = ttm_to_xe_bo(ttm_bo); > + > if (!xe_bo_is_xe_bo(ttm_bo)) > return; > > @@ -1079,6 +1181,10 @@ static void xe_ttm_bo_delete_mem_notify(struct ttm_buffer_object *ttm_bo) > > dma_buf_unmap_attachment(ttm_bo->base.import_attach, ttm_bo->sg, > DMA_BIDIRECTIONAL); > + if (bo->is_devmem_external && xe_tt->sg != ttm_bo->sg) { > + sg_free_table(xe_tt->sg); > + kfree(xe_tt->sg); > + } > ttm_bo->sg = NULL; > xe_tt->sg = NULL; > } > diff --git a/drivers/gpu/drm/xe/xe_bo_types.h b/drivers/gpu/drm/xe/xe_bo_types.h > index 8b9201775081..0fe619bc436d 100644 > --- a/drivers/gpu/drm/xe/xe_bo_types.h > +++ b/drivers/gpu/drm/xe/xe_bo_types.h > @@ -67,6 +67,12 @@ struct xe_bo { > /** @ccs_cleared */ > bool ccs_cleared; > > + /** > + * @is_devmem_external: Whether this BO is an imported dma-buf that > + * has a backing store in VRAM. > + */ > + bool is_devmem_external; > + > /** > * @cpu_caching: CPU caching mode. 
Currently only used for userspace > * objects. Exceptions are system memory on DGFX, which is always > -- > 2.45.1 >
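To make the single-page accuracy point above concrete, a minimal sketch (illustrative only, not part of the patch) that resolves one DMA segment to DPAs a page at a time could look like the following; lmem_base and the dpas[] output array are hypothetical stand-ins for however the VF's LMEM base and the result end up being represented, and the BAR-offset to DPA mapping is still assumed to be linear:

/* Assumes <linux/iommu.h> and <linux/scatterlist.h>; segment lengths are
 * assumed to be page multiples, as they are for VRAM dma-buf mappings. */
static int segment_to_dpas(struct iommu_domain *domain,
			   struct scatterlist *sgl,
			   resource_size_t io_start, u64 lmem_base,
			   u64 *dpas, unsigned int max_pages)
{
	dma_addr_t iova = sg_dma_address(sgl);
	unsigned int npages = sg_dma_len(sgl) >> PAGE_SHIFT;
	unsigned int i;

	if (npages > max_pages)
		return -EINVAL;

	for (i = 0; i < npages; i++) {
		/* Only trusted for the one page containing this IOVA */
		phys_addr_t phys = domain ?
			iommu_iova_to_phys(domain, iova + (u64)i * PAGE_SIZE) :
			iova + (u64)i * PAGE_SIZE;

		if (phys < io_start)
			return -EINVAL;

		dpas[i] = lmem_base + (phys - io_start);
	}

	return 0;
}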
Hi Matt, > > On Fri, Oct 11, 2024 at 07:40:26PM -0700, Vivek Kasireddy wrote: > > For BOs of type ttm_bo_type_sg, that are backed by PCI BAR addresses > > associated with a VF, we need to adjust and translate these addresses > > to LMEM addresses to make the BOs usable by the PF. Otherwise, the BOs > > (i.e, PCI BAR addresses) are only accessible by the CPU and not by > > the GPU. > > > > In order to do the above, we first need to identify if the DMA addresses > > associated with an imported BO (type ttm_bo_type_sg) belong to System > > RAM or a VF or other PCI device. After we confirm that they belong to > > a VF, we convert the DMA addresses (IOVAs in this case) to DPAs and > > create a new sg and populate it with the new addresses. > > I think using a SG list is a no-go. We have received pushback before [1] > about using a SG list as structure to hold DPA rather than dma-address. > The consensus was a SG list is not a generic structure to hold any > address [2], rather a specific structure for dma addressess. AFAIU, that would make sense if the SG list was exposed outside of the Xe driver but the SG list that is created by this patch is only used internally by the Xe driver. Would this still not be acceptable? > > I'm pretty sure we will have define a new BO type (ttm_bo_type_devmem?) > and structure that can iterated on if we want to do something like this > unless we want to ignore the above feedback. Right, if using SG list is a no-go, then using some form of xe_res_cursor to iterate seems like the only other option. > > [1] https://patchwork.freedesktop.org/patch/574894/?series=128910&rev=1 > [2] > https://patchwork.freedesktop.org/patch/574894/?series=128910&rev=1#co > mment_1070889 > > > > > Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com> > > --- > > drivers/gpu/drm/xe/xe_bo.c | 108 > ++++++++++++++++++++++++++++++- > > drivers/gpu/drm/xe/xe_bo_types.h | 6 ++ > > 2 files changed, 113 insertions(+), 1 deletion(-) > > > > diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c > > index c74c121ea7bb..64efe1b21f19 100644 > > --- a/drivers/gpu/drm/xe/xe_bo.c > > +++ b/drivers/gpu/drm/xe/xe_bo.c > > @@ -6,6 +6,7 @@ > > #include "xe_bo.h" > > > > #include <linux/dma-buf.h> > > +#include <linux/iommu.h> > > > > #include <drm/drm_drv.h> > > #include <drm/drm_gem_ttm_helper.h> > > @@ -15,16 +16,19 @@ > > #include <drm/ttm/ttm_tt.h> > > #include <uapi/drm/xe_drm.h> > > > > +#include "regs/xe_bars.h" > > #include "xe_device.h" > > #include "xe_dma_buf.h" > > #include "xe_drm_client.h" > > #include "xe_ggtt.h" > > #include "xe_gt.h" > > +#include "xe_gt_sriov_pf_config.h" > > #include "xe_map.h" > > #include "xe_migrate.h" > > #include "xe_pm.h" > > #include "xe_preempt_fence.h" > > #include "xe_res_cursor.h" > > +#include "xe_sriov_pf_helpers.h" > > #include "xe_trace_bo.h" > > #include "xe_ttm_stolen_mgr.h" > > #include "xe_vm.h" > > @@ -543,6 +547,102 @@ static int xe_bo_trigger_rebind(struct xe_device > *xe, struct xe_bo *bo, > > return ret; > > } > > > > +static struct pci_dev *xe_find_vf_dev(struct xe_device *xe, > > + phys_addr_t phys) > > +{ > > + struct pci_dev *pdev, *pf_pdev = to_pci_dev(xe->drm.dev); > > + resource_size_t io_start, io_size; > > + > > + list_for_each_entry(pdev, &pf_pdev->bus->devices, bus_list) { > > + if (pdev->is_physfn) > > + continue; > > + > > + io_start = pci_resource_start(pdev, LMEM_BAR); > > + io_size = pci_resource_len(pdev, LMEM_BAR); > > + > > + if (phys >= io_start && > > + phys < (io_start + io_size - PAGE_SIZE)) > > + return 
pdev; > > + } > > + return NULL; > > +} > > + > > + > > +static void xe_bo_translate_iova_to_dpa(struct xe_device *xe, > > + struct sg_table *sg, > > + struct sg_table *new_sg, > > + struct pci_dev *pdev) > > +{ > > + resource_size_t io_start = pci_resource_start(pdev, LMEM_BAR); > > + struct xe_gt *gt = xe_root_mmio_gt(xe); > > + struct scatterlist *sgl, *new_sgl; > > + int i, vfid = pci_iov_vf_id(pdev); > > + dma_addr_t new_addr, bo_addr; > > + struct iommu_domain *domain; > > + phys_addr_t phys; > > + u64 offset; > > + > > + bo_addr = xe_gt_sriov_pf_config_get_lmem_addr(gt, ++vfid); > > + domain = iommu_get_domain_for_dev(xe->drm.dev); > > + > > + new_sgl = new_sg->sgl; > > + for_each_sgtable_dma_sg(sg, sgl, i) { > > I'm pretty sure this doesn't work if a single dma address of the input > 'sg' resolves to a non-contiguous physical DPA. In most cases this is > going to be contiguous though unless there is memory pressure or > unaligned allocations sizes. Assuming your testing didn't blow up, you > may not have hit a memory pressure situation where VRAM gets > fragmented. Yeah, I did not test exhaustively and also did not realize that that the BO (config->lmem_obj) backing the VF's lmem quota may not be contiguous. I'll try to figure out a way to test this scenario. However, for each SG entry, if I do something like: offset = phys - io_start; new_addr = xe_bo_addr(config->lmem_obj, offset, sg_dma_len(sgl)); instead of bo_addr = xe_bo_addr(config->lmem_obj, 0, PAGE_SIZE); offset = phys - io_start; new_addr = bo_addr + offset; I believe this would probably work as expected even when lmem_obj is not contiguous (as it probes the buddy blocks) or if input SG has non-contiguous physical (DPA) ranges. > > I think only iommu_iova_to_phys is accurate for exactly 1 page unless > I'm missing something. See [3]. IIUC, it seems to be valid for the segment of size sg_dma_len(sgl), for a given SG entry. Thanks, Vivek > > [3] > https://elixir.bootlin.com/linux/v6.11.3/source/drivers/iommu/iommu.c#L2 > 376 > > Matt > > > + phys = domain ? iommu_iova_to_phys(domain, > sg_dma_address(sgl)) : > > + sg_dma_address(sgl); > > + offset = phys - io_start; > > + new_addr = bo_addr + offset; > > + > > + sg_set_page(new_sgl, NULL, sg_dma_len(sgl), 0); > > + sg_dma_address(new_sgl) = new_addr; > > + sg_dma_len(new_sgl) = sg_dma_len(sgl); > > + > > + new_sgl = sg_next(new_sgl); > > + } > > +} > > + > > +static struct sg_table *xe_bo_create_new_sg(struct sg_table *sg, > > + struct xe_bo *bo) > > +{ > > + struct xe_device *xe = xe_bo_device(bo); > > + struct iommu_domain *domain; > > + struct sg_table *new_sg; > > + struct pci_dev *pdev; > > + phys_addr_t phys; > > + int vfid; > > + > > + if (!IS_SRIOV_PF(xe)) > > + return sg; > > + > > + domain = iommu_get_domain_for_dev(xe->drm.dev); > > + phys = domain ? 
iommu_iova_to_phys(domain, sg_dma_address(sg- > >sgl)) : > > + sg_dma_address(sg->sgl); > > + > > + if (page_is_ram(PFN_DOWN(phys))) > > + return sg; > > + > > + pdev = xe_find_vf_dev(xe, phys); > > + if (!pdev) > > + return sg; > > + > > + vfid = pci_iov_vf_id(pdev); > > + if (vfid < 0) > > + return sg; > > + > > + new_sg = kzalloc(sizeof(*new_sg), GFP_KERNEL); > > + if (!new_sg) > > + return sg; > > + > > + if (sg_alloc_table(new_sg, sg->nents, GFP_KERNEL)) { > > + kfree(new_sg); > > + return sg; > > + } > > + > > + bo->is_devmem_external = true; > > + xe_bo_translate_iova_to_dpa(xe, sg, new_sg, pdev); > > + > > + return new_sg; > > +} > > + > > /* > > * The dma-buf map_attachment() / unmap_attachment() is hooked up > here. > > * Note that unmapping the attachment is deferred to the next > > @@ -577,7 +677,7 @@ static int xe_bo_move_dmabuf(struct > ttm_buffer_object *ttm_bo, > > return PTR_ERR(sg); > > > > ttm_bo->sg = sg; > > - xe_tt->sg = sg; > > + xe_tt->sg = xe_bo_create_new_sg(sg, ttm_to_xe_bo(ttm_bo)); > > > > out: > > ttm_bo_move_null(ttm_bo, new_res); > > @@ -1066,6 +1166,8 @@ static void xe_ttm_bo_release_notify(struct > ttm_buffer_object *ttm_bo) > > > > static void xe_ttm_bo_delete_mem_notify(struct ttm_buffer_object > *ttm_bo) > > { > > + struct xe_bo *bo = ttm_to_xe_bo(ttm_bo); > > + > > if (!xe_bo_is_xe_bo(ttm_bo)) > > return; > > > > @@ -1079,6 +1181,10 @@ static void > xe_ttm_bo_delete_mem_notify(struct ttm_buffer_object *ttm_bo) > > > > dma_buf_unmap_attachment(ttm_bo->base.import_attach, > ttm_bo->sg, > > DMA_BIDIRECTIONAL); > > + if (bo->is_devmem_external && xe_tt->sg != ttm_bo->sg) { > > + sg_free_table(xe_tt->sg); > > + kfree(xe_tt->sg); > > + } > > ttm_bo->sg = NULL; > > xe_tt->sg = NULL; > > } > > diff --git a/drivers/gpu/drm/xe/xe_bo_types.h > b/drivers/gpu/drm/xe/xe_bo_types.h > > index 8b9201775081..0fe619bc436d 100644 > > --- a/drivers/gpu/drm/xe/xe_bo_types.h > > +++ b/drivers/gpu/drm/xe/xe_bo_types.h > > @@ -67,6 +67,12 @@ struct xe_bo { > > /** @ccs_cleared */ > > bool ccs_cleared; > > > > + /** > > + * @is_devmem_external: Whether this BO is an imported dma-buf > that > > + * has a backing store in VRAM. > > + */ > > + bool is_devmem_external; > > + > > /** > > * @cpu_caching: CPU caching mode. Currently only used for > userspace > > * objects. Exceptions are system memory on DGFX, which is always > > -- > > 2.45.1 > >
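A rough sketch of the xe_bo_addr() based idea above (illustrative only; lmem_obj stands in for config->lmem_obj, and the lookup is kept at page granularity so a fragmented lmem_obj is still resolved through its buddy blocks):

static u64 vf_bar_offset_to_dpa(struct xe_bo *lmem_obj, u64 bar_offset)
{
	/* Resolve the page containing bar_offset via the BO's VRAM blocks */
	u64 page_dpa = xe_bo_addr(lmem_obj, bar_offset & PAGE_MASK, PAGE_SIZE);

	return page_dpa + (bar_offset & ~PAGE_MASK);
}

Keeping each xe_bo_addr() call to a single page avoids assuming anything about how much of lmem_obj is contiguous around that offset.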
On Tue, Oct 15, 2024 at 11:41:56PM -0600, Kasireddy, Vivek wrote: > Hi Matt, > > > > > On Fri, Oct 11, 2024 at 07:40:26PM -0700, Vivek Kasireddy wrote: > > > For BOs of type ttm_bo_type_sg, that are backed by PCI BAR addresses > > > associated with a VF, we need to adjust and translate these addresses > > > to LMEM addresses to make the BOs usable by the PF. Otherwise, the BOs > > > (i.e, PCI BAR addresses) are only accessible by the CPU and not by > > > the GPU. > > > > > > In order to do the above, we first need to identify if the DMA addresses > > > associated with an imported BO (type ttm_bo_type_sg) belong to System > > > RAM or a VF or other PCI device. After we confirm that they belong to > > > a VF, we convert the DMA addresses (IOVAs in this case) to DPAs and > > > create a new sg and populate it with the new addresses. > > > > I think using a SG list is a no-go. We have received pushback before [1] > > about using a SG list as structure to hold DPA rather than dma-address. > > The consensus was a SG list is not a generic structure to hold any > > address [2], rather a specific structure for dma addressess. > AFAIU, that would make sense if the SG list was exposed outside of the Xe > driver but the SG list that is created by this patch is only used internally by > the Xe driver. Would this still not be acceptable? > I'd check with Thomas on this. I'm not hugely opposed, but if I remember correctly, Thomas agrees with Jason in the links below that SG lists shouldn't be used like this. Certainly, making this internal to Xe makes it safer, though. > > > > I'm pretty sure we will have define a new BO type (ttm_bo_type_devmem?) > > and structure that can iterated on if we want to do something like this > > unless we want to ignore the above feedback. > Right, if using SG list is a no-go, then using some form of xe_res_cursor to > iterate seems like the only other option. > Ah, yes. Since this is internal to Xe, it may be easy enough to use a different cursor type here rather than touching TTM. 
> > > > [1] https://patchwork.freedesktop.org/patch/574894/?series=128910&rev=1 > > [2] > > https://patchwork.freedesktop.org/patch/574894/?series=128910&rev=1#co > > mment_1070889 > > > > > > > > Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com> > > > --- > > > drivers/gpu/drm/xe/xe_bo.c | 108 > > ++++++++++++++++++++++++++++++- > > > drivers/gpu/drm/xe/xe_bo_types.h | 6 ++ > > > 2 files changed, 113 insertions(+), 1 deletion(-) > > > > > > diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c > > > index c74c121ea7bb..64efe1b21f19 100644 > > > --- a/drivers/gpu/drm/xe/xe_bo.c > > > +++ b/drivers/gpu/drm/xe/xe_bo.c > > > @@ -6,6 +6,7 @@ > > > #include "xe_bo.h" > > > > > > #include <linux/dma-buf.h> > > > +#include <linux/iommu.h> > > > > > > #include <drm/drm_drv.h> > > > #include <drm/drm_gem_ttm_helper.h> > > > @@ -15,16 +16,19 @@ > > > #include <drm/ttm/ttm_tt.h> > > > #include <uapi/drm/xe_drm.h> > > > > > > +#include "regs/xe_bars.h" > > > #include "xe_device.h" > > > #include "xe_dma_buf.h" > > > #include "xe_drm_client.h" > > > #include "xe_ggtt.h" > > > #include "xe_gt.h" > > > +#include "xe_gt_sriov_pf_config.h" > > > #include "xe_map.h" > > > #include "xe_migrate.h" > > > #include "xe_pm.h" > > > #include "xe_preempt_fence.h" > > > #include "xe_res_cursor.h" > > > +#include "xe_sriov_pf_helpers.h" > > > #include "xe_trace_bo.h" > > > #include "xe_ttm_stolen_mgr.h" > > > #include "xe_vm.h" > > > @@ -543,6 +547,102 @@ static int xe_bo_trigger_rebind(struct xe_device > > *xe, struct xe_bo *bo, > > > return ret; > > > } > > > > > > +static struct pci_dev *xe_find_vf_dev(struct xe_device *xe, > > > + phys_addr_t phys) > > > +{ > > > + struct pci_dev *pdev, *pf_pdev = to_pci_dev(xe->drm.dev); > > > + resource_size_t io_start, io_size; > > > + > > > + list_for_each_entry(pdev, &pf_pdev->bus->devices, bus_list) { > > > + if (pdev->is_physfn) > > > + continue; > > > + > > > + io_start = pci_resource_start(pdev, LMEM_BAR); > > > + io_size = pci_resource_len(pdev, LMEM_BAR); > > > + > > > + if (phys >= io_start && > > > + phys < (io_start + io_size - PAGE_SIZE)) > > > + return pdev; > > > + } > > > + return NULL; > > > +} > > > + > > > + > > > +static void xe_bo_translate_iova_to_dpa(struct xe_device *xe, > > > + struct sg_table *sg, > > > + struct sg_table *new_sg, > > > + struct pci_dev *pdev) > > > +{ > > > + resource_size_t io_start = pci_resource_start(pdev, LMEM_BAR); > > > + struct xe_gt *gt = xe_root_mmio_gt(xe); > > > + struct scatterlist *sgl, *new_sgl; > > > + int i, vfid = pci_iov_vf_id(pdev); > > > + dma_addr_t new_addr, bo_addr; > > > + struct iommu_domain *domain; > > > + phys_addr_t phys; > > > + u64 offset; > > > + > > > + bo_addr = xe_gt_sriov_pf_config_get_lmem_addr(gt, ++vfid); > > > + domain = iommu_get_domain_for_dev(xe->drm.dev); > > > + > > > + new_sgl = new_sg->sgl; > > > + for_each_sgtable_dma_sg(sg, sgl, i) { > > > > I'm pretty sure this doesn't work if a single dma address of the input > > 'sg' resolves to a non-contiguous physical DPA. In most cases this is > > going to be contiguous though unless there is memory pressure or > > unaligned allocations sizes. Assuming your testing didn't blow up, you > > may not have hit a memory pressure situation where VRAM gets > > fragmented. > Yeah, I did not test exhaustively and also did not realize that that the BO > (config->lmem_obj) backing the VF's lmem quota may not be contiguous. > I'll try to figure out a way to test this scenario. 
However, for each SG entry, > if I do something like: > > offset = phys - io_start; > new_addr = xe_bo_addr(config->lmem_obj, offset, sg_dma_len(sgl)); > > instead of > > bo_addr = xe_bo_addr(config->lmem_obj, 0, PAGE_SIZE); > offset = phys - io_start; > new_addr = bo_addr + offset; > > I believe this would probably work as expected even when lmem_obj is not > contiguous (as it probes the buddy blocks) or if input SG has non-contiguous > physical (DPA) ranges. > > > > > I think only iommu_iova_to_phys is accurate for exactly 1 page unless > > I'm missing something. See [3]. > IIUC, it seems to be valid for the segment of size sg_dma_len(sgl), for a given > SG entry. > So the SG list you are remapping here is set up in xe_dma_buf_map, specifically for the VRAM case of xe_ttm_vram_mgr_alloc_sgt, right? Yes, it appears each segment (sg_dma_len(sgl)) is a buddy block, so it is contiguous. This does work, but it makes a lot of assumptions about the lower layers, which is not ideal. Each segment is still a DMA address, so it’s possible that iommu_iova_to_phys is only accurate for exactly one page within the segment. I'd prefer to code for that behavior, which will always work, rather than making assumptions about a lower layer. With all of the above, can we add a page-by-page DPA cursor? Thomas has written one for SVM [4] [5]. That code will take a while to land, but I think it can give you a template to build a cursor like this. Once that lands, maybe we can combine these cursors. Matt [4] https://patchwork.freedesktop.org/patch/619812/?series=137870&rev=2 [5] https://patchwork.freedesktop.org/patch/619845/?series=137870&rev=2 > Thanks, > Vivek > > > > > [3] > > https://elixir.bootlin.com/linux/v6.11.3/source/drivers/iommu/iommu.c#L2 > > 376 > > > > Matt > > > > > + phys = domain ? iommu_iova_to_phys(domain, > > sg_dma_address(sgl)) : > > > + sg_dma_address(sgl); > > > + offset = phys - io_start; > > > + new_addr = bo_addr + offset; > > > + > > > + sg_set_page(new_sgl, NULL, sg_dma_len(sgl), 0); > > > + sg_dma_address(new_sgl) = new_addr; > > > + sg_dma_len(new_sgl) = sg_dma_len(sgl); > > > + > > > + new_sgl = sg_next(new_sgl); > > > + } > > > +} > > > + > > > +static struct sg_table *xe_bo_create_new_sg(struct sg_table *sg, > > > + struct xe_bo *bo) > > > +{ > > > + struct xe_device *xe = xe_bo_device(bo); > > > + struct iommu_domain *domain; > > > + struct sg_table *new_sg; > > > + struct pci_dev *pdev; > > > + phys_addr_t phys; > > > + int vfid; > > > + > > > + if (!IS_SRIOV_PF(xe)) > > > + return sg; > > > + > > > + domain = iommu_get_domain_for_dev(xe->drm.dev); > > > + phys = domain ? iommu_iova_to_phys(domain, sg_dma_address(sg- > > >sgl)) : > > > + sg_dma_address(sg->sgl); > > > + > > > + if (page_is_ram(PFN_DOWN(phys))) > > > + return sg; > > > + > > > + pdev = xe_find_vf_dev(xe, phys); > > > + if (!pdev) > > > + return sg; > > > + > > > + vfid = pci_iov_vf_id(pdev); > > > + if (vfid < 0) > > > + return sg; > > > + > > > + new_sg = kzalloc(sizeof(*new_sg), GFP_KERNEL); > > > + if (!new_sg) > > > + return sg; > > > + > > > + if (sg_alloc_table(new_sg, sg->nents, GFP_KERNEL)) { > > > + kfree(new_sg); > > > + return sg; > > > + } > > > + > > > + bo->is_devmem_external = true; > > > + xe_bo_translate_iova_to_dpa(xe, sg, new_sg, pdev); > > > + > > > + return new_sg; > > > +} > > > + > > > /* > > > * The dma-buf map_attachment() / unmap_attachment() is hooked up > > here. 
> > > * Note that unmapping the attachment is deferred to the next > > > @@ -577,7 +677,7 @@ static int xe_bo_move_dmabuf(struct > > ttm_buffer_object *ttm_bo, > > > return PTR_ERR(sg); > > > > > > ttm_bo->sg = sg; > > > - xe_tt->sg = sg; > > > + xe_tt->sg = xe_bo_create_new_sg(sg, ttm_to_xe_bo(ttm_bo)); > > > > > > out: > > > ttm_bo_move_null(ttm_bo, new_res); > > > @@ -1066,6 +1166,8 @@ static void xe_ttm_bo_release_notify(struct > > ttm_buffer_object *ttm_bo) > > > > > > static void xe_ttm_bo_delete_mem_notify(struct ttm_buffer_object > > *ttm_bo) > > > { > > > + struct xe_bo *bo = ttm_to_xe_bo(ttm_bo); > > > + > > > if (!xe_bo_is_xe_bo(ttm_bo)) > > > return; > > > > > > @@ -1079,6 +1181,10 @@ static void > > xe_ttm_bo_delete_mem_notify(struct ttm_buffer_object *ttm_bo) > > > > > > dma_buf_unmap_attachment(ttm_bo->base.import_attach, > > ttm_bo->sg, > > > DMA_BIDIRECTIONAL); > > > + if (bo->is_devmem_external && xe_tt->sg != ttm_bo->sg) { > > > + sg_free_table(xe_tt->sg); > > > + kfree(xe_tt->sg); > > > + } > > > ttm_bo->sg = NULL; > > > xe_tt->sg = NULL; > > > } > > > diff --git a/drivers/gpu/drm/xe/xe_bo_types.h > > b/drivers/gpu/drm/xe/xe_bo_types.h > > > index 8b9201775081..0fe619bc436d 100644 > > > --- a/drivers/gpu/drm/xe/xe_bo_types.h > > > +++ b/drivers/gpu/drm/xe/xe_bo_types.h > > > @@ -67,6 +67,12 @@ struct xe_bo { > > > /** @ccs_cleared */ > > > bool ccs_cleared; > > > > > > + /** > > > + * @is_devmem_external: Whether this BO is an imported dma-buf > > that > > > + * has a backing store in VRAM. > > > + */ > > > + bool is_devmem_external; > > > + > > > /** > > > * @cpu_caching: CPU caching mode. Currently only used for > > userspace > > > * objects. Exceptions are system memory on DGFX, which is always > > > -- > > > 2.45.1 > > >
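A possible shape for the page-by-page DPA cursor suggested above, with invented names purely for illustration (the real version would presumably follow the SVM cursor in [4][5] and sit alongside xe_res_cursor); it hands out at most one page per step, so nothing beyond PAGE_SIZE contiguity is ever assumed:

/* Assumes <linux/scatterlist.h>, <linux/iommu.h>, <linux/minmax.h>. */
struct xe_dpa_cursor {
	struct iommu_domain *domain;	/* may be NULL if no IOMMU */
	struct scatterlist *sgl;	/* current DMA segment */
	u64 seg_offset;			/* byte offset within the segment */
	u64 remaining;			/* bytes left in the mapping */
};

static void xe_dpa_cursor_init(struct xe_dpa_cursor *cur,
			       struct iommu_domain *domain,
			       struct sg_table *sgt, u64 size)
{
	cur->domain = domain;
	cur->sgl = sgt->sgl;
	cur->seg_offset = 0;
	cur->remaining = size;
}

static phys_addr_t xe_dpa_cursor_phys(const struct xe_dpa_cursor *cur)
{
	dma_addr_t iova = sg_dma_address(cur->sgl) + cur->seg_offset;

	/* Valid for exactly the page containing this IOVA */
	return cur->domain ? iommu_iova_to_phys(cur->domain, iova) : iova;
}

static void xe_dpa_cursor_next(struct xe_dpa_cursor *cur)
{
	cur->seg_offset += PAGE_SIZE;
	cur->remaining -= min_t(u64, PAGE_SIZE, cur->remaining);

	if (cur->seg_offset >= sg_dma_len(cur->sgl) && cur->remaining) {
		cur->sgl = sg_next(cur->sgl);
		cur->seg_offset = 0;
	}
}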
diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
index c74c121ea7bb..64efe1b21f19 100644
--- a/drivers/gpu/drm/xe/xe_bo.c
+++ b/drivers/gpu/drm/xe/xe_bo.c
@@ -6,6 +6,7 @@
 #include "xe_bo.h"

 #include <linux/dma-buf.h>
+#include <linux/iommu.h>

 #include <drm/drm_drv.h>
 #include <drm/drm_gem_ttm_helper.h>
@@ -15,16 +16,19 @@
 #include <drm/ttm/ttm_tt.h>
 #include <uapi/drm/xe_drm.h>

+#include "regs/xe_bars.h"
 #include "xe_device.h"
 #include "xe_dma_buf.h"
 #include "xe_drm_client.h"
 #include "xe_ggtt.h"
 #include "xe_gt.h"
+#include "xe_gt_sriov_pf_config.h"
 #include "xe_map.h"
 #include "xe_migrate.h"
 #include "xe_pm.h"
 #include "xe_preempt_fence.h"
 #include "xe_res_cursor.h"
+#include "xe_sriov_pf_helpers.h"
 #include "xe_trace_bo.h"
 #include "xe_ttm_stolen_mgr.h"
 #include "xe_vm.h"
@@ -543,6 +547,102 @@ static int xe_bo_trigger_rebind(struct xe_device *xe, struct xe_bo *bo,
 	return ret;
 }

+static struct pci_dev *xe_find_vf_dev(struct xe_device *xe,
+				      phys_addr_t phys)
+{
+	struct pci_dev *pdev, *pf_pdev = to_pci_dev(xe->drm.dev);
+	resource_size_t io_start, io_size;
+
+	list_for_each_entry(pdev, &pf_pdev->bus->devices, bus_list) {
+		if (pdev->is_physfn)
+			continue;
+
+		io_start = pci_resource_start(pdev, LMEM_BAR);
+		io_size = pci_resource_len(pdev, LMEM_BAR);
+
+		if (phys >= io_start &&
+		    phys < (io_start + io_size - PAGE_SIZE))
+			return pdev;
+	}
+	return NULL;
+}
+
+
+static void xe_bo_translate_iova_to_dpa(struct xe_device *xe,
+					struct sg_table *sg,
+					struct sg_table *new_sg,
+					struct pci_dev *pdev)
+{
+	resource_size_t io_start = pci_resource_start(pdev, LMEM_BAR);
+	struct xe_gt *gt = xe_root_mmio_gt(xe);
+	struct scatterlist *sgl, *new_sgl;
+	int i, vfid = pci_iov_vf_id(pdev);
+	dma_addr_t new_addr, bo_addr;
+	struct iommu_domain *domain;
+	phys_addr_t phys;
+	u64 offset;
+
+	bo_addr = xe_gt_sriov_pf_config_get_lmem_addr(gt, ++vfid);
+	domain = iommu_get_domain_for_dev(xe->drm.dev);
+
+	new_sgl = new_sg->sgl;
+	for_each_sgtable_dma_sg(sg, sgl, i) {
+		phys = domain ? iommu_iova_to_phys(domain, sg_dma_address(sgl)) :
+				sg_dma_address(sgl);
+		offset = phys - io_start;
+		new_addr = bo_addr + offset;
+
+		sg_set_page(new_sgl, NULL, sg_dma_len(sgl), 0);
+		sg_dma_address(new_sgl) = new_addr;
+		sg_dma_len(new_sgl) = sg_dma_len(sgl);
+
+		new_sgl = sg_next(new_sgl);
+	}
+}
+
+static struct sg_table *xe_bo_create_new_sg(struct sg_table *sg,
+					    struct xe_bo *bo)
+{
+	struct xe_device *xe = xe_bo_device(bo);
+	struct iommu_domain *domain;
+	struct sg_table *new_sg;
+	struct pci_dev *pdev;
+	phys_addr_t phys;
+	int vfid;
+
+	if (!IS_SRIOV_PF(xe))
+		return sg;
+
+	domain = iommu_get_domain_for_dev(xe->drm.dev);
+	phys = domain ? iommu_iova_to_phys(domain, sg_dma_address(sg->sgl)) :
+			sg_dma_address(sg->sgl);
+
+	if (page_is_ram(PFN_DOWN(phys)))
+		return sg;
+
+	pdev = xe_find_vf_dev(xe, phys);
+	if (!pdev)
+		return sg;
+
+	vfid = pci_iov_vf_id(pdev);
+	if (vfid < 0)
+		return sg;
+
+	new_sg = kzalloc(sizeof(*new_sg), GFP_KERNEL);
+	if (!new_sg)
+		return sg;
+
+	if (sg_alloc_table(new_sg, sg->nents, GFP_KERNEL)) {
+		kfree(new_sg);
+		return sg;
+	}
+
+	bo->is_devmem_external = true;
+	xe_bo_translate_iova_to_dpa(xe, sg, new_sg, pdev);
+
+	return new_sg;
+}
+
 /*
  * The dma-buf map_attachment() / unmap_attachment() is hooked up here.
  * Note that unmapping the attachment is deferred to the next
@@ -577,7 +677,7 @@ static int xe_bo_move_dmabuf(struct ttm_buffer_object *ttm_bo,
 		return PTR_ERR(sg);

 	ttm_bo->sg = sg;
-	xe_tt->sg = sg;
+	xe_tt->sg = xe_bo_create_new_sg(sg, ttm_to_xe_bo(ttm_bo));

 out:
 	ttm_bo_move_null(ttm_bo, new_res);
@@ -1066,6 +1166,8 @@ static void xe_ttm_bo_release_notify(struct ttm_buffer_object *ttm_bo)

 static void xe_ttm_bo_delete_mem_notify(struct ttm_buffer_object *ttm_bo)
 {
+	struct xe_bo *bo = ttm_to_xe_bo(ttm_bo);
+
 	if (!xe_bo_is_xe_bo(ttm_bo))
 		return;

@@ -1079,6 +1181,10 @@ static void xe_ttm_bo_delete_mem_notify(struct ttm_buffer_object *ttm_bo)

 	dma_buf_unmap_attachment(ttm_bo->base.import_attach, ttm_bo->sg,
 				 DMA_BIDIRECTIONAL);
+	if (bo->is_devmem_external && xe_tt->sg != ttm_bo->sg) {
+		sg_free_table(xe_tt->sg);
+		kfree(xe_tt->sg);
+	}
 	ttm_bo->sg = NULL;
 	xe_tt->sg = NULL;
 }
diff --git a/drivers/gpu/drm/xe/xe_bo_types.h b/drivers/gpu/drm/xe/xe_bo_types.h
index 8b9201775081..0fe619bc436d 100644
--- a/drivers/gpu/drm/xe/xe_bo_types.h
+++ b/drivers/gpu/drm/xe/xe_bo_types.h
@@ -67,6 +67,12 @@ struct xe_bo {
 	/** @ccs_cleared */
 	bool ccs_cleared;

+	/**
+	 * @is_devmem_external: Whether this BO is an imported dma-buf that
+	 * has a backing store in VRAM.
+	 */
+	bool is_devmem_external;
+
 	/**
 	 * @cpu_caching: CPU caching mode. Currently only used for userspace
 	 * objects. Exceptions are system memory on DGFX, which is always
For BOs of type ttm_bo_type_sg that are backed by PCI BAR addresses associated
with a VF, we need to adjust and translate these addresses to LMEM addresses
to make the BOs usable by the PF. Otherwise, the BOs (i.e., PCI BAR addresses)
are only accessible by the CPU and not by the GPU.

In order to do the above, we first need to identify whether the DMA addresses
associated with an imported BO (type ttm_bo_type_sg) belong to System RAM, a
VF, or another PCI device. Once we confirm that they belong to a VF, we
convert the DMA addresses (IOVAs in this case) to DPAs, create a new SG table,
and populate it with the new addresses.

Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
---
 drivers/gpu/drm/xe/xe_bo.c       | 108 ++++++++++++++++++++++++++++++-
 drivers/gpu/drm/xe/xe_bo_types.h |   6 ++
 2 files changed, 113 insertions(+), 1 deletion(-)