Message ID: 20210912165309.98695-3-ogabbay@kernel.org
State: Superseded
Series: Add p2p via dmabuf to habanalabs
On Sun, Sep 12, 2021 at 07:53:09PM +0300, Oded Gabbay wrote:
> From: Tomer Tayar <ttayar@habana.ai>
>
> Implement the calls to the dma-buf kernel api to create a dma-buf
> object backed by FD.
>
> We block the option to mmap the DMA-BUF object because we don't support
> DIRECT_IO and implicit P2P.

This statement doesn't make sense, you can mmap your dmabuf if you
like. All dmabuf mmaps are supposed to set the special bit/etc to
exclude them from get_user_pages() anyhow - and since this is BAR
memory not struct page memory this driver would be doing it anyhow.

> We check the p2p distance using pci_p2pdma_distance_many() and refusing
> to map dmabuf in case the distance doesn't allow p2p.

Does this actually allow the p2p transfer for your intended use cases?

> diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c
> index 33986933aa9e..8cf5437c0390 100644
> +++ b/drivers/misc/habanalabs/common/memory.c
> @@ -1,7 +1,7 @@
>  // SPDX-License-Identifier: GPL-2.0
>
>  /*
> - * Copyright 2016-2019 HabanaLabs, Ltd.
> + * Copyright 2016-2021 HabanaLabs, Ltd.
>   * All Rights Reserved.
>   */
>
> @@ -11,11 +11,13 @@
>
>  #include <linux/uaccess.h>
>  #include <linux/slab.h>
> +#include <linux/pci-p2pdma.h>
>
>  #define HL_MMU_DEBUG	0
>
>  /* use small pages for supporting non-pow2 (32M/40M/48M) DRAM phys page sizes */
> -#define DRAM_POOL_PAGE_SIZE SZ_8M
> +#define DRAM_POOL_PAGE_SIZE	SZ_8M
> +

??

> /*
>  * The va ranges in context object contain a list with the available chunks of
> @@ -347,6 +349,13 @@ static int free_device_memory(struct hl_ctx *ctx, struct hl_mem_in *args)
>  		return -EINVAL;
>  	}
>
> +	if (phys_pg_pack->exporting_cnt) {
> +		dev_err(hdev->dev,
> +			"handle %u is exported, cannot free\n", handle);
> +		spin_unlock(&vm->idr_lock);

Don't write to the kernel log from user space triggered actions

> +static int alloc_sgt_from_device_pages(struct hl_device *hdev,
> +					struct sg_table **sgt, u64 *pages,
> +					u64 npages, u64 page_size,
> +					struct device *dev,
> +					enum dma_data_direction dir)

Why doesn't this return a sg_table * and an ERR_PTR?

> +{
> +	u64 chunk_size, bar_address, dma_max_seg_size;
> +	struct asic_fixed_properties *prop;
> +	int rc, i, j, nents, cur_page;
> +	struct scatterlist *sg;
> +
> +	prop = &hdev->asic_prop;
> +
> +	dma_max_seg_size = dma_get_max_seg_size(dev);
> +
> +	/* We would like to align the max segment size to PAGE_SIZE, so the
> +	 * SGL will contain aligned addresses that can be easily mapped to
> +	 * an MMU
> +	 */
> +	dma_max_seg_size = ALIGN_DOWN(dma_max_seg_size, PAGE_SIZE);
> +	if (dma_max_seg_size < PAGE_SIZE) {
> +		dev_err_ratelimited(hdev->dev,
> +			"dma_max_seg_size %llu can't be smaller than PAGE_SIZE\n",
> +			dma_max_seg_size);
> +		return -EINVAL;
> +	}
> +
> +	*sgt = kzalloc(sizeof(**sgt), GFP_KERNEL);
> +	if (!*sgt)
> +		return -ENOMEM;
> +
> +	/* If the size of each page is larger than the dma max segment size,
> +	 * then we can't combine pages and the number of entries in the SGL
> +	 * will just be the
> +	 * <number of pages> * <chunks of max segment size in each page>
> +	 */
> +	if (page_size > dma_max_seg_size)
> +		nents = npages * DIV_ROUND_UP_ULL(page_size, dma_max_seg_size);
> +	else
> +		/* Get number of non-contiguous chunks */
> +		for (i = 1, nents = 1, chunk_size = page_size ; i < npages ; i++) {
> +			if (pages[i - 1] + page_size != pages[i] ||
> +					chunk_size + page_size > dma_max_seg_size) {
> +				nents++;
> +				chunk_size = page_size;
> +				continue;
> +			}
> +
> +			chunk_size += page_size;
> +		}
> +
> +	rc = sg_alloc_table(*sgt, nents, GFP_KERNEL | __GFP_ZERO);
> +	if (rc)
> +		goto error_free;
> +
> +	/* Because we are not going to include a CPU list we want to have some
> +	 * chance that other users will detect this by setting the orig_nents
> +	 * to 0 and using only nents (length of DMA list) when going over the
> +	 * sgl
> +	 */
> +	(*sgt)->orig_nents = 0;

Maybe do this at the end so you wouldn't have to undo it on the error
path?

> +	cur_page = 0;
> +
> +	if (page_size > dma_max_seg_size) {
> +		u64 size_left, cur_device_address = 0;
> +
> +		size_left = page_size;
> +
> +		/* Need to split each page into the number of chunks of
> +		 * dma_max_seg_size
> +		 */
> +		for_each_sgtable_dma_sg((*sgt), sg, i) {
> +			if (size_left == page_size)
> +				cur_device_address =
> +					pages[cur_page] - prop->dram_base_address;
> +			else
> +				cur_device_address += dma_max_seg_size;
> +
> +			chunk_size = min(size_left, dma_max_seg_size);
> +
> +			bar_address = hdev->dram_pci_bar_start + cur_device_address;
> +
> +			rc = set_dma_sg(sg, bar_address, chunk_size, dev, dir);
> +			if (rc)
> +				goto error_unmap;
> +
> +			if (size_left > dma_max_seg_size) {
> +				size_left -= dma_max_seg_size;
> +			} else {
> +				cur_page++;
> +				size_left = page_size;
> +			}
> +		}
> +	} else {
> +		/* Merge pages and put them into the scatterlist */
> +		for_each_sgtable_dma_sg((*sgt), sg, i) {
> +			chunk_size = page_size;
> +			for (j = cur_page + 1 ; j < npages ; j++) {
> +				if (pages[j - 1] + page_size != pages[j] ||
> +						chunk_size + page_size > dma_max_seg_size)
> +					break;
> +
> +				chunk_size += page_size;
> +			}
> +
> +			bar_address = hdev->dram_pci_bar_start +
> +					(pages[cur_page] - prop->dram_base_address);
> +
> +			rc = set_dma_sg(sg, bar_address, chunk_size, dev, dir);
> +			if (rc)
> +				goto error_unmap;
> +
> +			cur_page = j;
> +		}
> +	}

We have this sg_append stuff now that is intended to help building
these things. It can only build CPU page lists, not these DMA lists,
but I do wonder if open coding in drivers is slipping back a
bit. Especially since AMD seems to be doing something different.

Could the DMABUF layer gain some helpers styled after the sg_append to
simplify building these things? and convert the AMD driver of course.

> +static int hl_dmabuf_attach(struct dma_buf *dmabuf,
> +				struct dma_buf_attachment *attachment)
> +{
> +	struct hl_dmabuf_wrapper *hl_dmabuf;
> +	struct hl_device *hdev;
> +	int rc;
> +
> +	hl_dmabuf = dmabuf->priv;
> +	hdev = hl_dmabuf->ctx->hdev;
> +
> +	rc = pci_p2pdma_distance_many(hdev->pdev, &attachment->dev, 1, true);
> +
> +	if (rc < 0)
> +		attachment->peer2peer = false;

Extra blank line

> +
> +	return 0;
> +}
> +
> +static struct sg_table *hl_map_dmabuf(struct dma_buf_attachment *attachment,
> +					enum dma_data_direction dir)
> +{
> +	struct dma_buf *dma_buf = attachment->dmabuf;
> +	struct hl_vm_phys_pg_pack *phys_pg_pack;
> +	struct hl_dmabuf_wrapper *hl_dmabuf;
> +	struct hl_device *hdev;
> +	struct sg_table *sgt;
> +	int rc;
> +
> +	hl_dmabuf = dma_buf->priv;
> +	hdev = hl_dmabuf->ctx->hdev;
> +	phys_pg_pack = hl_dmabuf->phys_pg_pack;
> +
> +	if (!attachment->peer2peer) {
> +		dev_err(hdev->dev,
> +			"Failed to map dmabuf because p2p is disabled\n");
> +		return ERR_PTR(-EPERM);

User triggered printk again?

> +static void hl_unmap_dmabuf(struct dma_buf_attachment *attachment,
> +				struct sg_table *sgt,
> +				enum dma_data_direction dir)
> +{
> +	struct scatterlist *sg;
> +	int i;
> +
> +	for_each_sgtable_dma_sg(sgt, sg, i)
> +		dma_unmap_resource(attachment->dev, sg_dma_address(sg),
> +					sg_dma_len(sg), dir,
> +					DMA_ATTR_SKIP_CPU_SYNC);

Why can we skip the CPU_SYNC? Seems like a comment is needed

Something has to do a CPU_SYNC before recycling this memory for
another purpose, where is it?

> +static void hl_release_dmabuf(struct dma_buf *dmabuf)
> +{
> +	struct hl_dmabuf_wrapper *hl_dmabuf = dmabuf->priv;

Maybe hl_dmabuf_wrapper should be hl_dmabuf_priv

> + * export_dmabuf_from_addr() - export a dma-buf object for the given memory
> + *                             address and size.
> + * @ctx: pointer to the context structure.
> + * @device_addr: device memory physical address.
> + * @size: size of device memory.
> + * @flags: DMA-BUF file/FD flags.
> + * @dmabuf_fd: pointer to result FD that represents the dma-buf object.
> + *
> + * Create and export a dma-buf object for an existing memory allocation inside
> + * the device memory, and return a FD which is associated with the dma-buf
> + * object.
> + *
> + * Return: 0 on success, non-zero for failure.
> + */
> +static int export_dmabuf_from_addr(struct hl_ctx *ctx, u64 device_addr,
> +					u64 size, int flags, int *dmabuf_fd)
> +{
> +	struct hl_dmabuf_wrapper *hl_dmabuf;
> +	struct hl_device *hdev = ctx->hdev;
> +	struct asic_fixed_properties *prop;
> +	u64 bar_address;
> +	int rc;
> +
> +	prop = &hdev->asic_prop;
> +
> +	if (!IS_ALIGNED(device_addr, PAGE_SIZE)) {
> +		dev_err_ratelimited(hdev->dev,
> +			"address of exported device memory should be aligned to 0x%lx, address 0x%llx\n",
> +			PAGE_SIZE, device_addr);
> +		return -EINVAL;
> +	}
> +
> +	if (size < PAGE_SIZE) {
> +		dev_err_ratelimited(hdev->dev,
> +			"size %llu of exported device memory should be equal to or greater than %lu\n",
> +			size, PAGE_SIZE);
> +		return -EINVAL;
> +	}
> +
> +	if (device_addr < prop->dram_user_base_address ||
> +			device_addr + size > prop->dram_end_address ||
> +			device_addr + size < device_addr) {
> +		dev_err_ratelimited(hdev->dev,
> +			"DRAM memory range is outside of DRAM boundaries, address 0x%llx, size 0x%llx\n",
> +			device_addr, size);
> +		return -EINVAL;
> +	}
> +
> +	bar_address = hdev->dram_pci_bar_start +
> +			(device_addr - prop->dram_base_address);
> +
> +	if (bar_address + size >
> +			hdev->dram_pci_bar_start + prop->dram_pci_bar_size ||
> +			bar_address + size < bar_address) {
> +		dev_err_ratelimited(hdev->dev,
> +			"DRAM memory range is outside of PCI BAR boundaries, address 0x%llx, size 0x%llx\n",
> +			device_addr, size);
> +		return -EINVAL;
> +	}

More prints from userspace

> +static int export_dmabuf_from_handle(struct hl_ctx *ctx, u64 handle, int flags,
> +					int *dmabuf_fd)
> +{
> +	struct hl_vm_phys_pg_pack *phys_pg_pack;
> +	struct hl_dmabuf_wrapper *hl_dmabuf;
> +	struct hl_device *hdev = ctx->hdev;
> +	struct asic_fixed_properties *prop;
> +	struct hl_vm *vm = &hdev->vm;
> +	u64 bar_address;
> +	u32 idr_handle;
> +	int rc, i;
> +
> +	prop = &hdev->asic_prop;
> +
> +	idr_handle = lower_32_bits(handle);

Why silent truncation? Shouldn't setting the upper 32 bits be an
error?

> +	case HL_MEM_OP_EXPORT_DMABUF_FD:
> +		rc = export_dmabuf_from_addr(ctx,
> +				args->in.export_dmabuf_fd.handle,
> +				args->in.export_dmabuf_fd.mem_size,
> +				args->in.flags,
> +				&dmabuf_fd);
> +		memset(args, 0, sizeof(*args));
> +		args->out.fd = dmabuf_fd;

Would expect the installed fd to be the positive return, not a pointer

Jason
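[For reference on the mmap point above: a driver exporting BAR memory could wire up the dmabuf mmap callback along the lines of the sketch below. This is only illustrative and not part of the patch; it assumes the address-export path (phys_pg_pack == NULL) and reuses the patch's hl_dmabuf_wrapper fields. remap_pfn_range() itself marks the VMA VM_IO | VM_PFNMAP, which is the "special bit" that keeps get_user_pages() away from the mapping.]

static int hl_mmap_dmabuf(struct dma_buf *dmabuf, struct vm_area_struct *vma)
{
	struct hl_dmabuf_wrapper *hl_dmabuf = dmabuf->priv;
	struct hl_device *hdev = hl_dmabuf->ctx->hdev;
	u64 bar_address = hdev->dram_pci_bar_start +
			(hl_dmabuf->device_address -
			 hdev->asic_prop.dram_base_address);

	if (vma->vm_end - vma->vm_start > dmabuf->size)
		return -EINVAL;

	/* BAR memory has no struct pages; map it as uncached IO memory.
	 * remap_pfn_range() sets VM_IO | VM_PFNMAP on the VMA, which
	 * excludes the mapping from get_user_pages().
	 */
	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);

	return remap_pfn_range(vma, vma->vm_start, bar_address >> PAGE_SHIFT,
				vma->vm_end - vma->vm_start,
				vma->vm_page_prot);
}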
On Tue, Sep 28, 2021 at 8:36 PM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>
> On Sun, Sep 12, 2021 at 07:53:09PM +0300, Oded Gabbay wrote:
> > We block the option to mmap the DMA-BUF object because we don't support
> > DIRECT_IO and implicit P2P.
>
> This statement doesn't make sense, you can mmap your dmabuf if you
> like. All dmabuf mmaps are supposed to set the special bit/etc to
> exclude them from get_user_pages() anyhow - and since this is BAR
> memory not struct page memory this driver would be doing it anyhow.
>
But we block mmap the dmabuf fd from user-space.
If you try to do it, you will get MAP_FAILED.
That's because we don't supply a function to the mmap callback in dmabuf.
We did that per Christian's advice. It is in one of the long email
threads on previous versions of this patch.

> > We check the p2p distance using pci_p2pdma_distance_many() and refusing
> > to map dmabuf in case the distance doesn't allow p2p.
>
> Does this actually allow the p2p transfer for your intended use cases?
>
It depends on the system. If we are working bare-metal, then yes, it allows.
If inside a VM, then no. The virtualized root complex is not
white-listed and the kernel can't know the distance.
But I remember you asked me to add this check, in v3 of the review IIRC.
I don't mind removing this check if you don't object.

> > -#define DRAM_POOL_PAGE_SIZE SZ_8M
> > +#define DRAM_POOL_PAGE_SIZE	SZ_8M
> > +
>
> ??
ok, I'll remove

> > +	if (phys_pg_pack->exporting_cnt) {
> > +		dev_err(hdev->dev,
> > +			"handle %u is exported, cannot free\n", handle);
> > +		spin_unlock(&vm->idr_lock);
>
> Don't write to the kernel log from user space triggered actions
at all ?
It's the first time I hear about this limitation...
How do you tell the user it has done something wrong ?
I agree it might be better to rate limit it, but why not give the
information to the user ?

> > +static int alloc_sgt_from_device_pages(struct hl_device *hdev,
> > +					struct sg_table **sgt, u64 *pages,
> > [...]
>
> Why doesn't this return a sg_table * and an ERR_PTR?
Basically I modeled this function after amdgpu_vram_mgr_alloc_sgt()
And in that function they also return int and pass the sg_table as **
If it's critical I can change.

> > +	(*sgt)->orig_nents = 0;
>
> Maybe do this at the end so you wouldn't have to undo it on the error
> path?
Agreed, less code

> We have this sg_append stuff now that is intended to help building
> these things. It can only build CPU page lists, not these DMA lists,
> but I do wonder if open coding in drivers is slipping back a
> bit. Especially since AMD seems to be doing something different.
>
> Could the DMABUF layer gain some helpers styled after the sg_append to
> simplify building these things? and convert the AMD driver of course.
>
I will take it as a task to do in the near future, but I prefer to
first upstream these patches as-is, because I don't have a way to
check AMD devices and I guess converting them might take some time.

> > +	if (rc < 0)
> > +		attachment->peer2peer = false;
>
> Extra blank line

> > +	if (!attachment->peer2peer) {
> > +		dev_err(hdev->dev,
> > +			"Failed to map dmabuf because p2p is disabled\n");
> > +		return ERR_PTR(-EPERM);
>
> User triggered printk again?
But how will the user know what the reason for the failure is ?

> > +	for_each_sgtable_dma_sg(sgt, sg, i)
> > +		dma_unmap_resource(attachment->dev, sg_dma_address(sg),
> > +					sg_dma_len(sg), dir,
> > +					DMA_ATTR_SKIP_CPU_SYNC);
>
> Why can we skip the CPU_SYNC? Seems like a comment is needed
>
> Something has to do a CPU_SYNC before recycling this memory for
> another purpose, where is it?
I have to ask for further explanation here.
Same as before, this function is modeled after the amdgpu one -
amdgpu_vram_mgr_free_sgt()
They also use DMA_ATTR_SKIP_CPU_SYNC in that function.
Christian, maybe you know ?

Maybe it is not relevant to our case as it represents a PCI bar ?

> > +static void hl_release_dmabuf(struct dma_buf *dmabuf)
> > +{
> > +	struct hl_dmabuf_wrapper *hl_dmabuf = dmabuf->priv;
>
> Maybe hl_dmabuf_wrapper should be hl_dmabuf_priv
Agreed.

> > [...]  (range checks in export_dmabuf_from_addr())
>
> More prints from userspace

> > +	idr_handle = lower_32_bits(handle);
>
> Why silent truncation? Shouldn't setting the upper 32 bits be an
> error?
Yes, you are correct. I will fix it.

> > +		memset(args, 0, sizeof(*args));
> > +		args->out.fd = dmabuf_fd;
>
> Would expect the installed fd to be the positive return, not a pointer
yeah, changed it to s32 as you suggested.

> Jason
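[A minimal sketch of the truncation fix Oded agrees to above, assuming it lands at the top of export_dmabuf_from_handle() before the IDR lookup; hypothetical placement, not from the posted patch:]

	/* IDR handles are 32-bit; a handle with any upper bits set cannot
	 * match anything, so reject it instead of silently truncating.
	 */
	if (upper_32_bits(handle))
		return -EINVAL;

	idr_handle = lower_32_bits(handle);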
On Wed, Sep 29, 2021 at 12:17 AM Oded Gabbay <ogabbay@kernel.org> wrote:
> [...]
> > > +	for_each_sgtable_dma_sg(sgt, sg, i)
> > > +		dma_unmap_resource(attachment->dev, sg_dma_address(sg),
> > > +					sg_dma_len(sg), dir,
> > > +					DMA_ATTR_SKIP_CPU_SYNC);
> >
> > Why can we skip the CPU_SYNC? Seems like a comment is needed
> >
> > Something has to do a CPU_SYNC before recycling this memory for
> > another purpose, where is it?
> I have to ask for further explanation here.
> [...]
> Maybe it is not relevant to our case as it represents a PCI bar ?
>
Hi Jason,
After reading the kernel iommu code, I think this is not relevant
here, and I'll add a comment appropriately but I'll also write it
here, and please correct me if my understanding is wrong.

The memory behind this specific dma-buf has *always* resided on the
device itself, i.e. it lives only in the 'device' domain (after all,
it maps a PCI bar address which points to the device memory).
Therefore, it was never in the 'CPU' domain and hence, there is no
need to perform a sync of the memory to the CPU's cache, as it was
never inside that cache to begin with.

This is not the same case as with regular memory which is dma-mapped
and then copied into the device using a dma engine. In that case,
the memory started in the 'CPU' domain and moved to the 'device'
domain. When it is unmapped it will indeed be recycled to be used
for another purpose and therefore we need to sync the CPU cache.

Is my understanding correct ?

Thanks,
Oded

> [...]
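[On Jason's earlier style question (returning the table rather than an int plus a ** out-parameter), the usual kernel convention would look roughly like the sketch below; illustrative only, with the table-building middle elided, assuming the helper is otherwise unchanged:]

static struct sg_table *alloc_sgt_from_device_pages(struct hl_device *hdev,
						u64 *pages, u64 npages,
						u64 page_size,
						struct device *dev,
						enum dma_data_direction dir)
{
	struct sg_table *sgt;
	int rc;

	sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
	if (!sgt)
		return ERR_PTR(-ENOMEM);

	rc = sg_alloc_table(sgt, 1, GFP_KERNEL | __GFP_ZERO);
	if (rc)
		goto error_free;

	/* ... fill the DMA list exactly as the patch does, jumping to
	 * error_free with rc set on any failure ...
	 */

	return sgt;

error_free:
	kfree(sgt);
	return ERR_PTR(rc);
}

[The caller in hl_map_dmabuf() then simply checks IS_ERR(sgt) and returns the pointer, since its own return type is already struct sg_table *.]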
On Wed, Sep 29, 2021 at 12:17:35AM +0300, Oded Gabbay wrote:
> On Tue, Sep 28, 2021 at 8:36 PM Jason Gunthorpe <jgg@ziepe.ca> wrote:
> >
> > On Sun, Sep 12, 2021 at 07:53:09PM +0300, Oded Gabbay wrote:
> > > From: Tomer Tayar <ttayar@habana.ai>
> > >
> > > Implement the calls to the dma-buf kernel api to create a dma-buf
> > > object backed by FD.
> > >
> > > We block the option to mmap the DMA-BUF object because we don't support
> > > DIRECT_IO and implicit P2P.
> >
> > This statement doesn't make sense, you can mmap your dmabuf if you
> > like. All dmabuf mmaps are supposed to set the special bit/etc to
> > exclude them from get_user_pages() anyhow - and since this is BAR
> > memory not struct page memory this driver would be doing it anyhow.
>
> But we block mmap the dmabuf fd from user-space.
> If you try to do it, you will get MAP_FAILED.

You do, I'm saying the above paragraph explaining *why* that was done
is not correct.

> > > We check the p2p distance using pci_p2pdma_distance_many() and refusing
> > > to map dmabuf in case the distance doesn't allow p2p.
> >
> > Does this actually allow the p2p transfer for your intended use cases?
>
> It depends on the system. If we are working bare-metal, then yes, it allows.
> If inside a VM, then no. The virtualized root complex is not
> white-listed and the kernel can't know the distance.
> But I remember you asked me to add this check, in v3 of the review IIRC.
> I don't mind removing this check if you don't object.

Yes, it is the right code, I was curious how far along things have
gotten

> > Don't write to the kernel log from user space triggered actions
>
> at all ?

At all.

> It's the first time I hear about this limitation...

Oh? It is a security issue, we don't want to allow userspace to DOS
the kernel logging.

> How do you tell the user it has done something wrong ?

dev_dbg is the usual way and then users doing debugging can opt in to
the logging.

> > Why doesn't this return a sg_table * and an ERR_PTR?
>
> Basically I modeled this function after amdgpu_vram_mgr_alloc_sgt()
> And in that function they also return int and pass the sg_table as **
>
> If it's critical I can change.

Please follow the normal kernel style

Jason
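[Applied to the free_device_memory() hunk above, the dev_dbg suggestion would look something like this sketch: same check and message, only the log level changes so the print is opt-in, and the print moves outside the spinlock:]

	if (phys_pg_pack->exporting_cnt) {
		spin_unlock(&vm->idr_lock);
		/* user-triggered failure: report it via the errno; log it
		 * only when debugging is enabled instead of letting
		 * userspace flood the kernel log
		 */
		dev_dbg(hdev->dev, "handle %u is exported, cannot free\n",
			handle);
		return -EINVAL;
	}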
On Thu, Sep 30, 2021 at 03:46:35PM +0300, Oded Gabbay wrote:
> After reading the kernel iommu code, I think this is not relevant
> here, and I'll add a comment appropriately but I'll also write it
> here, and please correct me if my understanding is wrong.
>
> The memory behind this specific dma-buf has *always* resided on the
> device itself, i.e. it lives only in the 'device' domain (after all,
> it maps a PCI bar address which points to the device memory).
> Therefore, it was never in the 'CPU' domain and hence, there is no
> need to perform a sync of the memory to the CPU's cache, as it was
> never inside that cache to begin with.
>
> This is not the same case as with regular memory which is dma-mapped
> and then copied into the device using a dma engine. In that case,
> the memory started in the 'CPU' domain and moved to the 'device'
> domain. When it is unmapped it will indeed be recycled to be used
> for another purpose and therefore we need to sync the CPU cache.
>
> Is my understanding correct ?

It makes sense to me

Jason
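[Folding the explanation just acked into the patch's hl_unmap_dmabuf() could look like this; the code is the patch's own, only the comment is new:]

static void hl_unmap_dmabuf(struct dma_buf_attachment *attachment,
				struct sg_table *sgt,
				enum dma_data_direction dir)
{
	struct scatterlist *sg;
	int i;

	/* The memory behind this dma-buf was never in the 'CPU' domain: it
	 * is device memory reached through a PCI BAR, so there is nothing
	 * in the CPU caches to sync and DMA_ATTR_SKIP_CPU_SYNC is safe
	 * both at map and at unmap time.
	 */
	for_each_sgtable_dma_sg(sgt, sg, i)
		dma_unmap_resource(attachment->dev, sg_dma_address(sg),
					sg_dma_len(sg), dir,
					DMA_ATTR_SKIP_CPU_SYNC);

	/* Need to restore orig_nents because sg_free_table uses that field */
	sgt->orig_nents = sgt->nents;
	sg_free_table(sgt);
	kfree(sgt);
}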
diff --git a/drivers/misc/habanalabs/Kconfig b/drivers/misc/habanalabs/Kconfig
index 293d79811372..c82d2e7b2035 100644
--- a/drivers/misc/habanalabs/Kconfig
+++ b/drivers/misc/habanalabs/Kconfig
@@ -8,6 +8,7 @@ config HABANA_AI
 	depends on PCI && HAS_IOMEM
 	select GENERIC_ALLOCATOR
 	select HWMON
+	select DMA_SHARED_BUFFER
 	help
 	  Enables PCIe card driver for Habana's AI Processors (AIP) that are
 	  designed to accelerate Deep Learning inference and training workloads.
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index bebebcb163ee..df49376a0880 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -26,6 +26,7 @@
 #include <linux/sched/signal.h>
 #include <linux/io-64-nonatomic-lo-hi.h>
 #include <linux/coresight.h>
+#include <linux/dma-buf.h>
 
 #define HL_NAME				"habanalabs"
 
@@ -1352,6 +1353,23 @@ struct hl_cs_counters_atomic {
 	atomic64_t validation_drop_cnt;
 };
 
+/**
+ * struct hl_dmabuf_wrapper - a dma-buf wrapper object.
+ * @dmabuf: pointer to dma-buf object.
+ * @ctx: pointer to the dma-buf owner's context.
+ * @phys_pg_pack: pointer to physical page pack if the dma-buf was exported for
+ *                memory allocation handle.
+ * @device_address: physical address of the device's memory. Relevant only
+ *                  if phys_pg_pack is NULL (dma-buf was exported from address).
+ *                  The total size can be taken from the dmabuf object.
+ */
+struct hl_dmabuf_wrapper {
+	struct dma_buf *dmabuf;
+	struct hl_ctx *ctx;
+	struct hl_vm_phys_pg_pack *phys_pg_pack;
+	uint64_t device_address;
+};
+
 /**
  * struct hl_ctx - user/kernel context.
  * @mem_hash: holds mapping from virtual address to virtual memory area
@@ -1662,6 +1680,7 @@ struct hl_vm_hw_block_list_node {
  * @npages: num physical pages in the pack.
  * @total_size: total size of all the pages in this list.
  * @mapping_cnt: number of shared mappings.
+ * @exporting_cnt: number of dma-buf exporting.
  * @asid: the context related to this list.
  * @page_size: size of each page in the pack.
  * @flags: HL_MEM_* flags related to this list.
@@ -1676,6 +1695,7 @@ struct hl_vm_phys_pg_pack {
 	u64 npages;
 	u64 total_size;
 	atomic_t mapping_cnt;
+	u32 exporting_cnt;
 	u32 asid;
 	u32 page_size;
 	u32 flags;
@@ -2396,6 +2416,7 @@ struct multi_cs_data {
  *                           the error will be ignored by the driver during
  *                           device initialization. Mainly used to debug and
  *                           workaround firmware bugs
+ * @dram_pci_bar_start: start bus address of PCIe bar towards DRAM.
  * @last_successful_open_jif: timestamp (jiffies) of the last successful
  *                            device open.
  * @last_open_session_duration_jif: duration (jiffies) of the last device open
@@ -2537,6 +2558,7 @@ struct hl_device {
 	u64				max_power;
 	u64				clock_gating_mask;
 	u64				boot_error_status_mask;
+	u64				dram_pci_bar_start;
 	u64				last_successful_open_jif;
 	u64				last_open_session_duration_jif;
 	u64				open_counter;
diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c
index 33986933aa9e..8cf5437c0390 100644
--- a/drivers/misc/habanalabs/common/memory.c
+++ b/drivers/misc/habanalabs/common/memory.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 /*
- * Copyright 2016-2019 HabanaLabs, Ltd.
+ * Copyright 2016-2021 HabanaLabs, Ltd.
  * All Rights Reserved.
  */
 
@@ -11,11 +11,13 @@
 
 #include <linux/uaccess.h>
 #include <linux/slab.h>
+#include <linux/pci-p2pdma.h>
 
 #define HL_MMU_DEBUG	0
 
 /* use small pages for supporting non-pow2 (32M/40M/48M) DRAM phys page sizes */
-#define DRAM_POOL_PAGE_SIZE SZ_8M
+#define DRAM_POOL_PAGE_SIZE	SZ_8M
+
 
 /*
  * The va ranges in context object contain a list with the available chunks of
@@ -347,6 +349,13 @@ static int free_device_memory(struct hl_ctx *ctx, struct hl_mem_in *args)
 		return -EINVAL;
 	}
 
+	if (phys_pg_pack->exporting_cnt) {
+		dev_err(hdev->dev,
+			"handle %u is exported, cannot free\n", handle);
+		spin_unlock(&vm->idr_lock);
+		return -EINVAL;
+	}
+
 	/*
 	 * must remove from idr before the freeing of the physical
 	 * pages as the refcount of the pool is also the trigger of the
@@ -1487,13 +1496,492 @@ int hl_hw_block_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma)
 	return 0;
 }
 
+static int set_dma_sg(struct scatterlist *sg, u64 bar_address, u64 chunk_size,
+			struct device *dev, enum dma_data_direction dir)
+{
+	dma_addr_t addr;
+	int rc;
+
+	addr = dma_map_resource(dev, bar_address, chunk_size, dir,
+				DMA_ATTR_SKIP_CPU_SYNC);
+	rc = dma_mapping_error(dev, addr);
+	if (rc)
+		return rc;
+
+	sg_set_page(sg, NULL, chunk_size, 0);
+	sg_dma_address(sg) = addr;
+	sg_dma_len(sg) = chunk_size;
+
+	return 0;
+}
+
+static int alloc_sgt_from_device_pages(struct hl_device *hdev,
+					struct sg_table **sgt, u64 *pages,
+					u64 npages, u64 page_size,
+					struct device *dev,
+					enum dma_data_direction dir)
+{
+	u64 chunk_size, bar_address, dma_max_seg_size;
+	struct asic_fixed_properties *prop;
+	int rc, i, j, nents, cur_page;
+	struct scatterlist *sg;
+
+	prop = &hdev->asic_prop;
+
+	dma_max_seg_size = dma_get_max_seg_size(dev);
+
+	/* We would like to align the max segment size to PAGE_SIZE, so the
+	 * SGL will contain aligned addresses that can be easily mapped to
+	 * an MMU
+	 */
+	dma_max_seg_size = ALIGN_DOWN(dma_max_seg_size, PAGE_SIZE);
+	if (dma_max_seg_size < PAGE_SIZE) {
+		dev_err_ratelimited(hdev->dev,
+			"dma_max_seg_size %llu can't be smaller than PAGE_SIZE\n",
+			dma_max_seg_size);
+		return -EINVAL;
+	}
+
+	*sgt = kzalloc(sizeof(**sgt), GFP_KERNEL);
+	if (!*sgt)
+		return -ENOMEM;
+
+	/* If the size of each page is larger than the dma max segment size,
+	 * then we can't combine pages and the number of entries in the SGL
+	 * will just be the
+	 * <number of pages> * <chunks of max segment size in each page>
+	 */
+	if (page_size > dma_max_seg_size)
+		nents = npages * DIV_ROUND_UP_ULL(page_size, dma_max_seg_size);
+	else
+		/* Get number of non-contiguous chunks */
+		for (i = 1, nents = 1, chunk_size = page_size ; i < npages ; i++) {
+			if (pages[i - 1] + page_size != pages[i] ||
+					chunk_size + page_size > dma_max_seg_size) {
+				nents++;
+				chunk_size = page_size;
+				continue;
+			}
+
+			chunk_size += page_size;
+		}
+
+	rc = sg_alloc_table(*sgt, nents, GFP_KERNEL | __GFP_ZERO);
+	if (rc)
+		goto error_free;
+
+	/* Because we are not going to include a CPU list we want to have some
+	 * chance that other users will detect this by setting the orig_nents
+	 * to 0 and using only nents (length of DMA list) when going over the
+	 * sgl
+	 */
+	(*sgt)->orig_nents = 0;
+
+	cur_page = 0;
+
+	if (page_size > dma_max_seg_size) {
+		u64 size_left, cur_device_address = 0;
+
+		size_left = page_size;
+
+		/* Need to split each page into the number of chunks of
+		 * dma_max_seg_size
+		 */
+		for_each_sgtable_dma_sg((*sgt), sg, i) {
+			if (size_left == page_size)
+				cur_device_address =
+					pages[cur_page] - prop->dram_base_address;
+			else
+				cur_device_address += dma_max_seg_size;
+
+			chunk_size = min(size_left, dma_max_seg_size);
+
+			bar_address = hdev->dram_pci_bar_start + cur_device_address;
+
+			rc = set_dma_sg(sg, bar_address, chunk_size, dev, dir);
+			if (rc)
+				goto error_unmap;
+
+			if (size_left > dma_max_seg_size) {
+				size_left -= dma_max_seg_size;
+			} else {
+				cur_page++;
+				size_left = page_size;
+			}
+		}
+	} else {
+		/* Merge pages and put them into the scatterlist */
+		for_each_sgtable_dma_sg((*sgt), sg, i) {
+			chunk_size = page_size;
+			for (j = cur_page + 1 ; j < npages ; j++) {
+				if (pages[j - 1] + page_size != pages[j] ||
+						chunk_size + page_size > dma_max_seg_size)
+					break;
+
+				chunk_size += page_size;
+			}
+
+			bar_address = hdev->dram_pci_bar_start +
+					(pages[cur_page] - prop->dram_base_address);
+
+			rc = set_dma_sg(sg, bar_address, chunk_size, dev, dir);
+			if (rc)
+				goto error_unmap;
+
+			cur_page = j;
+		}
+	}
+
+	return 0;
+
+error_unmap:
+	for_each_sgtable_dma_sg((*sgt), sg, i) {
+		if (!sg_dma_len(sg))
+			continue;
+
+		dma_unmap_resource(dev, sg_dma_address(sg),
+					sg_dma_len(sg), dir,
+					DMA_ATTR_SKIP_CPU_SYNC);
+	}
+
+	/* Need to restore orig_nents because sg_free_table use that field */
+	(*sgt)->orig_nents = nents;
+	sg_free_table(*sgt);
+
+error_free:
+	kfree(*sgt);
+	return rc;
+}
+
+static int hl_dmabuf_attach(struct dma_buf *dmabuf,
+				struct dma_buf_attachment *attachment)
+{
+	struct hl_dmabuf_wrapper *hl_dmabuf;
+	struct hl_device *hdev;
+	int rc;
+
+	hl_dmabuf = dmabuf->priv;
+	hdev = hl_dmabuf->ctx->hdev;
+
+	rc = pci_p2pdma_distance_many(hdev->pdev, &attachment->dev, 1, true);
+
+	if (rc < 0)
+		attachment->peer2peer = false;
+
+	return 0;
+}
+
+static struct sg_table *hl_map_dmabuf(struct dma_buf_attachment *attachment,
+					enum dma_data_direction dir)
+{
+	struct dma_buf *dma_buf = attachment->dmabuf;
+	struct hl_vm_phys_pg_pack *phys_pg_pack;
+	struct hl_dmabuf_wrapper *hl_dmabuf;
+	struct hl_device *hdev;
+	struct sg_table *sgt;
+	int rc;
+
+	hl_dmabuf = dma_buf->priv;
+	hdev = hl_dmabuf->ctx->hdev;
+	phys_pg_pack = hl_dmabuf->phys_pg_pack;
+
+	if (!attachment->peer2peer) {
+		dev_err(hdev->dev,
+			"Failed to map dmabuf because p2p is disabled\n");
+		return ERR_PTR(-EPERM);
+	}
+
+	if (phys_pg_pack)
+		rc = alloc_sgt_from_device_pages(hdev, &sgt,
+						phys_pg_pack->pages,
+						phys_pg_pack->npages,
+						phys_pg_pack->page_size,
+						attachment->dev,
+						dir);
+	else
+		rc = alloc_sgt_from_device_pages(hdev, &sgt,
+						&hl_dmabuf->device_address,
+						1,
+						hl_dmabuf->dmabuf->size,
+						attachment->dev,
+						dir);
+
+	if (rc) {
+		dev_err(hdev->dev,
+			"failed (%d) to initialize sgt for dmabuf\n",
+			rc);
+		return ERR_PTR(rc);
+	}
+
+	return sgt;
+}
+
+static void hl_unmap_dmabuf(struct dma_buf_attachment *attachment,
+				struct sg_table *sgt,
+				enum dma_data_direction dir)
+{
+	struct scatterlist *sg;
+	int i;
+
+	for_each_sgtable_dma_sg(sgt, sg, i)
+		dma_unmap_resource(attachment->dev, sg_dma_address(sg),
+					sg_dma_len(sg), dir,
+					DMA_ATTR_SKIP_CPU_SYNC);
+
+	/* Need to restore orig_nents because sg_free_table use that field */
+	sgt->orig_nents = sgt->nents;
+	sg_free_table(sgt);
+	kfree(sgt);
+}
+
+static void hl_release_dmabuf(struct dma_buf *dmabuf)
+{
+	struct hl_dmabuf_wrapper *hl_dmabuf = dmabuf->priv;
+	struct hl_ctx *ctx = hl_dmabuf->ctx;
+	struct hl_device *hdev = ctx->hdev;
+	struct hl_vm *vm = &hdev->vm;
+
+	if (hl_dmabuf->phys_pg_pack) {
+		spin_lock(&vm->idr_lock);
+		hl_dmabuf->phys_pg_pack->exporting_cnt--;
+		spin_unlock(&vm->idr_lock);
+	}
+
+	hl_ctx_put(hl_dmabuf->ctx);
+
+	kfree(hl_dmabuf);
+}
+
+static const struct dma_buf_ops habanalabs_dmabuf_ops = {
+	.attach = hl_dmabuf_attach,
+	.map_dma_buf = hl_map_dmabuf,
+	.unmap_dma_buf = hl_unmap_dmabuf,
+	.release = hl_release_dmabuf,
+};
+
+static int export_dmabuf_common(struct hl_ctx *ctx,
+				struct hl_dmabuf_wrapper *hl_dmabuf,
+				u64 total_size, int flags, int *dmabuf_fd)
+{
+	DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
+	struct hl_device *hdev = ctx->hdev;
+	int rc, fd;
+
+	exp_info.ops = &habanalabs_dmabuf_ops;
+	exp_info.size = total_size;
+	exp_info.flags = flags;
+	exp_info.priv = hl_dmabuf;
+
+	hl_dmabuf->dmabuf = dma_buf_export(&exp_info);
+	if (IS_ERR(hl_dmabuf->dmabuf)) {
+		dev_err(hdev->dev, "failed to export dma-buf\n");
+		return PTR_ERR(hl_dmabuf->dmabuf);
+	}
+
+	fd = dma_buf_fd(hl_dmabuf->dmabuf, flags);
+	if (fd < 0) {
+		dev_err(hdev->dev,
+			"failed to get a file descriptor for a dma-buf\n");
+		rc = fd;
+		goto err_dma_buf_put;
+	}
+
+	hl_dmabuf->ctx = ctx;
+	hl_ctx_get(hdev, hl_dmabuf->ctx);
+
+	*dmabuf_fd = fd;
+
+	return 0;
+
+err_dma_buf_put:
+	dma_buf_put(hl_dmabuf->dmabuf);
+	return rc;
+}
+
+/**
+ * export_dmabuf_from_addr() - export a dma-buf object for the given memory
+ *                             address and size.
+ * @ctx: pointer to the context structure.
+ * @device_addr: device memory physical address.
+ * @size: size of device memory.
+ * @flags: DMA-BUF file/FD flags.
+ * @dmabuf_fd: pointer to result FD that represents the dma-buf object.
+ *
+ * Create and export a dma-buf object for an existing memory allocation inside
+ * the device memory, and return a FD which is associated with the dma-buf
+ * object.
+ *
+ * Return: 0 on success, non-zero for failure.
+ */
+static int export_dmabuf_from_addr(struct hl_ctx *ctx, u64 device_addr,
+					u64 size, int flags, int *dmabuf_fd)
+{
+	struct hl_dmabuf_wrapper *hl_dmabuf;
+	struct hl_device *hdev = ctx->hdev;
+	struct asic_fixed_properties *prop;
+	u64 bar_address;
+	int rc;
+
+	prop = &hdev->asic_prop;
+
+	if (!IS_ALIGNED(device_addr, PAGE_SIZE)) {
+		dev_err_ratelimited(hdev->dev,
+			"address of exported device memory should be aligned to 0x%lx, address 0x%llx\n",
+			PAGE_SIZE, device_addr);
+		return -EINVAL;
+	}
+
+	if (size < PAGE_SIZE) {
+		dev_err_ratelimited(hdev->dev,
+			"size %llu of exported device memory should be equal to or greater than %lu\n",
+			size, PAGE_SIZE);
+		return -EINVAL;
+	}
+
+	if (device_addr < prop->dram_user_base_address ||
+			device_addr + size > prop->dram_end_address ||
+			device_addr + size < device_addr) {
+		dev_err_ratelimited(hdev->dev,
+			"DRAM memory range is outside of DRAM boundaries, address 0x%llx, size 0x%llx\n",
+			device_addr, size);
+		return -EINVAL;
+	}
+
+	bar_address = hdev->dram_pci_bar_start +
+			(device_addr - prop->dram_base_address);
+
+	if (bar_address + size >
+			hdev->dram_pci_bar_start + prop->dram_pci_bar_size ||
+			bar_address + size < bar_address) {
+		dev_err_ratelimited(hdev->dev,
+			"DRAM memory range is outside of PCI BAR boundaries, address 0x%llx, size 0x%llx\n",
+			device_addr, size);
+		return -EINVAL;
+	}
+
+	hl_dmabuf = kzalloc(sizeof(*hl_dmabuf), GFP_KERNEL);
+	if (!hl_dmabuf)
+		return -ENOMEM;
+
+	hl_dmabuf->device_address = device_addr;
+
+	rc = export_dmabuf_common(ctx, hl_dmabuf, size, flags, dmabuf_fd);
+	if (rc)
+		goto err_free_dmabuf_wrapper;
+
+	return 0;
+
+err_free_dmabuf_wrapper:
+	kfree(hl_dmabuf);
+	return rc;
+}
+
+/**
+ * export_dmabuf_from_handle() - export a dma-buf object for the given memory
+ *                               handle.
+ * @ctx: pointer to the context structure.
+ * @handle: device memory allocation handle.
+ * @flags: DMA-BUF file/FD flags.
+ * @dmabuf_fd: pointer to result FD that represents the dma-buf object.
+ *
+ * Create and export a dma-buf object for an existing memory allocation inside
+ * the device memory, and return a FD which is associated with the dma-buf
+ * object.
+ *
+ * Return: 0 on success, non-zero for failure.
+ */
+static int export_dmabuf_from_handle(struct hl_ctx *ctx, u64 handle, int flags,
+					int *dmabuf_fd)
+{
+	struct hl_vm_phys_pg_pack *phys_pg_pack;
+	struct hl_dmabuf_wrapper *hl_dmabuf;
+	struct hl_device *hdev = ctx->hdev;
+	struct asic_fixed_properties *prop;
+	struct hl_vm *vm = &hdev->vm;
+	u64 bar_address;
+	u32 idr_handle;
+	int rc, i;
+
+	prop = &hdev->asic_prop;
+
+	idr_handle = lower_32_bits(handle);
+
+	spin_lock(&vm->idr_lock);
+
+	phys_pg_pack = idr_find(&vm->phys_pg_pack_handles, idr_handle);
+	if (!phys_pg_pack) {
+		spin_unlock(&vm->idr_lock);
+		dev_err_ratelimited(hdev->dev, "no match for handle 0x%x\n",
+					idr_handle);
+		return -EINVAL;
+	}
+
+	/* increment now to avoid freeing device memory while exporting */
+	phys_pg_pack->exporting_cnt++;
+
+	spin_unlock(&vm->idr_lock);
+
+	if (phys_pg_pack->vm_type != VM_TYPE_PHYS_PACK) {
+		dev_err_ratelimited(hdev->dev,
+				"handle 0x%llx is not for DRAM memory\n",
+				handle);
+		rc = -EINVAL;
+		goto err_dec_exporting_cnt;
+	}
+
+	for (i = 0 ; i < phys_pg_pack->npages ; i++) {
+
+		bar_address = hdev->dram_pci_bar_start +
+				(phys_pg_pack->pages[i] -
+					prop->dram_base_address);
+
+		if (bar_address + phys_pg_pack->page_size >
+				hdev->dram_pci_bar_start + prop->dram_pci_bar_size ||
+				bar_address + phys_pg_pack->page_size < bar_address) {
+
+			dev_err_ratelimited(hdev->dev,
+				"DRAM memory range is outside of PCI BAR boundaries, address 0x%llx, size 0x%x\n",
+				phys_pg_pack->pages[i],
+				phys_pg_pack->page_size);
+
+			rc = -EINVAL;
+			goto err_dec_exporting_cnt;
+		}
+	}
+
+	hl_dmabuf = kzalloc(sizeof(*hl_dmabuf), GFP_KERNEL);
+	if (!hl_dmabuf) {
+		rc = -ENOMEM;
+		goto err_dec_exporting_cnt;
+	}
+
+	hl_dmabuf->phys_pg_pack = phys_pg_pack;
+
+	rc = export_dmabuf_common(ctx, hl_dmabuf, phys_pg_pack->total_size,
+				flags, dmabuf_fd);
+	if (rc)
+		goto err_free_dmabuf_wrapper;
+
+	return 0;
+
+err_free_dmabuf_wrapper:
+	kfree(hl_dmabuf);
+
+err_dec_exporting_cnt:
+	spin_lock(&vm->idr_lock);
+	phys_pg_pack->exporting_cnt--;
+	spin_unlock(&vm->idr_lock);
+
+	return rc;
+}
+
 static int mem_ioctl_no_mmu(struct hl_fpriv *hpriv, union hl_mem_args *args)
 {
 	struct hl_device *hdev = hpriv->hdev;
 	struct hl_ctx *ctx = hpriv->ctx;
 	u64 block_handle, device_addr = 0;
 	u32 handle = 0, block_size;
-	int rc;
+	int rc, dmabuf_fd = -EBADF;
 
 	switch (args->in.op) {
 	case HL_MEM_OP_ALLOC:
@@ -1542,6 +2030,16 @@ static int mem_ioctl_no_mmu(struct hl_fpriv *hpriv, union hl_mem_args *args)
 		args->out.block_size = block_size;
 		break;
 
+	case HL_MEM_OP_EXPORT_DMABUF_FD:
+		rc = export_dmabuf_from_addr(ctx,
+				args->in.export_dmabuf_fd.handle,
+				args->in.export_dmabuf_fd.mem_size,
+				args->in.flags,
+				&dmabuf_fd);
+		memset(args, 0, sizeof(*args));
+		args->out.fd = dmabuf_fd;
+		break;
+
 	default:
 		dev_err(hdev->dev, "Unknown opcode for memory IOCTL\n");
 		rc = -ENOTTY;
@@ -1560,7 +2058,7 @@ int hl_mem_ioctl(struct hl_fpriv *hpriv, void *data)
 	struct hl_ctx *ctx = hpriv->ctx;
 	u64 block_handle, device_addr = 0;
 	u32 handle = 0, block_size;
-	int rc;
+	int rc, dmabuf_fd = -EBADF;
 
 	if (!hl_device_operational(hdev, &status)) {
 		dev_warn_ratelimited(hdev->dev,
@@ -1651,6 +2149,22 @@ int hl_mem_ioctl(struct hl_fpriv *hpriv, void *data)
 		args->out.block_size = block_size;
 		break;
 
+	case HL_MEM_OP_EXPORT_DMABUF_FD:
+		if (hdev->asic_prop.dram_supports_virtual_memory)
+			rc = export_dmabuf_from_handle(ctx,
+					args->in.export_dmabuf_fd.handle,
+					args->in.flags,
+					&dmabuf_fd);
+		else
+			rc = export_dmabuf_from_addr(ctx,
+					args->in.export_dmabuf_fd.handle,
+					args->in.export_dmabuf_fd.mem_size,
+					args->in.flags,
+					&dmabuf_fd);
+		memset(args, 0, sizeof(*args));
+		args->out.fd = dmabuf_fd;
+		break;
+
 	default:
 		dev_err(hdev->dev, "Unknown opcode for memory IOCTL\n");
 		rc = -ENOTTY;
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 383865be3c2c..48cea845624e 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -795,6 +795,7 @@ static int gaudi_early_init(struct hl_device *hdev)
 	}
 
 	prop->dram_pci_bar_size = pci_resource_len(pdev, HBM_BAR_ID);
+	hdev->dram_pci_bar_start = pci_resource_start(pdev, HBM_BAR_ID);
 
 	/* If FW security is enabled at this point it means no access to ELBI */
 	if (hdev->asic_prop.fw_security_enabled) {
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 031c1849da14..65e4a4bb9323 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -622,6 +622,7 @@ static int goya_early_init(struct hl_device *hdev)
 	}
 
 	prop->dram_pci_bar_size = pci_resource_len(pdev, DDR_BAR_ID);
+	hdev->dram_pci_bar_start = pci_resource_start(pdev, DDR_BAR_ID);
 
 	/* If FW security is enabled at this point it means no access to ELBI */
 	if (hdev->asic_prop.fw_security_enabled) {
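[For completeness, the intended userspace flow with this uAPI would be roughly the following; a hedged sketch that assumes the header location and the HL_IOCTL_MEMORY ioctl number from the existing habanalabs uAPI, with the export_dmabuf_fd fields taken from this series:]

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <misc/habanalabs.h>	/* assumed uAPI header location */

/* Export device DRAM at 'device_addr' as a dma-buf FD that a peer
 * device (e.g. an RDMA NIC) can import for p2p transfers.
 */
static int export_dram_dmabuf(int hl_fd, __u64 device_addr, __u64 size)
{
	union hl_mem_args args;

	memset(&args, 0, sizeof(args));
	args.in.op = HL_MEM_OP_EXPORT_DMABUF_FD;
	args.in.flags = O_RDWR | O_CLOEXEC;
	args.in.export_dmabuf_fd.handle = device_addr;
	args.in.export_dmabuf_fd.mem_size = size;

	if (ioctl(hl_fd, HL_IOCTL_MEMORY, &args))
		return -1;

	return args.out.fd;	/* ready to hand to the importing driver */
}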