Message ID | 1610706138-4219-2-git-send-email-liweihang@huawei.com
---|---
State | Superseded
Series | RDMA/hns: Add support for Dynamic Context Attachment
On Fri, Jan 15, 2021 at 06:22:12PM +0800, Weihang Li wrote:
> From: Xi Wang <wangxi11@huawei.com>
>
> The hip09 introduces the DCA (Dynamic Context Attachment) feature, which
> allows many RC QPs to share WQE buffers in a memory pool. This reduces
> memory consumption when many QPs are inactive.
>
> If a QP enables the DCA feature, its WQE buffer is not allocated at
> creation time. When the user starts to post WRs, the hns driver allocates
> a buffer from the memory pool and fills in WQEs tagged with this QP's
> number.
>
> The hns ROCEE stops accessing the WQE buffer once the user has polled all
> of the CQEs for a DCA QP, and the driver then recycles the WQE buffer back
> to the memory pool.
>
> This patch adds a group of methods that let user space register buffers to
> a memory pool owned by the user context. The hns kernel driver updates the
> page states in this pool when the user calls the post/poll methods, and
> the user driver can get a QP's WQE buffer address from the key and offset
> queried from the kernel.
>
> Signed-off-by: Xi Wang <wangxi11@huawei.com>
> Signed-off-by: Weihang Li <liweihang@huawei.com>
> ---
>  drivers/infiniband/hw/hns/Makefile          |   2 +-
>  drivers/infiniband/hw/hns/hns_roce_dca.c    | 381 ++++++++++++++++++++++++++++
>  drivers/infiniband/hw/hns/hns_roce_dca.h    |  22 ++
>  drivers/infiniband/hw/hns/hns_roce_device.h |  10 +
>  drivers/infiniband/hw/hns/hns_roce_main.c   |  27 +-
>  include/uapi/rdma/hns-abi.h                 |  23 ++
>  6 files changed, 462 insertions(+), 3 deletions(-)
>  create mode 100644 drivers/infiniband/hw/hns/hns_roce_dca.c
>  create mode 100644 drivers/infiniband/hw/hns/hns_roce_dca.h

<...>

> +static struct dca_mem *alloc_dca_mem(struct hns_roce_dca_ctx *ctx)
> +{
> +	struct dca_mem *mem, *tmp, *found = NULL;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&ctx->pool_lock, flags);
> +	list_for_each_entry_safe(mem, tmp, &ctx->pool, list) {
> +		spin_lock(&mem->lock);
> +		if (dca_mem_is_free(mem)) {
> +			found = mem;
> +			set_dca_mem_alloced(mem);
> +			spin_unlock(&mem->lock);
> +			goto done;
> +		}
> +		spin_unlock(&mem->lock);
> +	}
> +
> +done:
> +	spin_unlock_irqrestore(&ctx->pool_lock, flags);
> +
> +	if (found)
> +		return found;
> +
> +	mem = kzalloc(sizeof(*mem), GFP_ATOMIC);

Should it be ATOMIC?

> +	if (!mem)
> +		return NULL;
> +
> +	spin_lock_init(&mem->lock);
> +	INIT_LIST_HEAD(&mem->list);
> +
> +	set_dca_mem_alloced(mem);
> +
> +	spin_lock_irqsave(&ctx->pool_lock, flags);
> +	list_add(&mem->list, &ctx->pool);
> +	spin_unlock_irqrestore(&ctx->pool_lock, flags);
> +	return mem;
> +}

<...>

>  /**
>   * hns_get_gid_index - Get gid index.
> @@ -306,15 +308,16 @@ static int hns_roce_modify_device(struct ib_device *ib_dev, int mask,
>  static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx,
>  				   struct ib_udata *udata)
>  {
> -	int ret;
>  	struct hns_roce_ucontext *context = to_hr_ucontext(uctx);
> -	struct hns_roce_ib_alloc_ucontext_resp resp = {};
>  	struct hns_roce_dev *hr_dev = to_hr_dev(uctx->device);
> +	struct hns_roce_ib_alloc_ucontext_resp resp = {};
> +	int ret;
>
>  	if (!hr_dev->active)
>  		return -EAGAIN;
>
>  	resp.qp_tab_size = hr_dev->caps.num_qps;
> +	resp.cap_flags = (u32)hr_dev->caps.flags;

This is prone to errors, flags is u64.

<...>

> diff --git a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h
> index 90b739d..f59abc4 100644
> --- a/include/uapi/rdma/hns-abi.h
> +++ b/include/uapi/rdma/hns-abi.h
> @@ -86,10 +86,33 @@ struct hns_roce_ib_create_qp_resp {
>  struct hns_roce_ib_alloc_ucontext_resp {
>  	__u32	qp_tab_size;
>  	__u32	cqe_size;
> +	__u32	cap_flags;
>  };

This struct should be padded to 64 bits,

Thanks
On 2021/1/20 16:10, Leon Romanovsky wrote:
> On Fri, Jan 15, 2021 at 06:22:12PM +0800, Weihang Li wrote:
>> From: Xi Wang <wangxi11@huawei.com>

<...>

>> +	mem = kzalloc(sizeof(*mem), GFP_ATOMIC);
>
> Should it be ATOMIC?
>

Hi Leon,

The current DCA interfaces can be invoked from user space through
ibv_xx_cmd(), but they are expected to also work in ib_post_xx() in the
kernel in the future. Since that may run in spinlock context, we use
GFP_ATOMIC.

<...>

>>  	resp.qp_tab_size = hr_dev->caps.num_qps;
>> +	resp.cap_flags = (u32)hr_dev->caps.flags;
>
> This is prone to errors, flags is u64.
>

OK, we plan to change the type of resp.cap_flags to u64.

<...>

>>  struct hns_roce_ib_alloc_ucontext_resp {
>>  	__u32	qp_tab_size;
>>  	__u32	cqe_size;
>> +	__u32	cap_flags;
>>  };
>
> This struct should be padded to 64 bits,
>
> Thanks
>

Thanks, I will fix it.

Weihang
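(A possible shape of the agreed fix, shown only as a sketch: widening cap_flags to 64 bits also pads the response struct to a 64-bit boundary. The exact layout below is an assumption, not the final ABI.)

/* include/uapi/rdma/hns-abi.h -- illustrative layout only */
struct hns_roce_ib_alloc_ucontext_resp {
	__u32		qp_tab_size;
	__u32		cqe_size;
	__aligned_u64	cap_flags; /* wide enough for hr_dev->caps.flags (u64) */
};

The copy in hns_roce_alloc_ucontext() would then become "resp.cap_flags = hr_dev->caps.flags;" with no narrowing cast.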
On Thu, Jan 21, 2021 at 07:01:50AM +0000, liweihang wrote:
> On 2021/1/20 16:10, Leon Romanovsky wrote:
> > On Fri, Jan 15, 2021 at 06:22:12PM +0800, Weihang Li wrote:

<...>

> >> +	mem = kzalloc(sizeof(*mem), GFP_ATOMIC);
> >
> > Should it be ATOMIC?
> >
>
> Hi Leon,
>
> The current DCA interfaces can be invoked from user space through
> ibv_xx_cmd(), but they are expected to also work in ib_post_xx() in the
> kernel in the future. Since that may run in spinlock context, we use
> GFP_ATOMIC.

Are you planning to invoke kzalloc() in the data path?

GFP_ATOMIC causes use of a special allocation pool that is seen as a
precious resource, because allocations from it must succeed. It is better
to avoid this flag if you don't need it.

Thanks
On 2021/1/21 16:53, Leon Romanovsky wrote:
> On Thu, Jan 21, 2021 at 07:01:50AM +0000, liweihang wrote:
>> On 2021/1/20 16:10, Leon Romanovsky wrote:

<...>

>> The current DCA interfaces can be invoked from user space through
>> ibv_xx_cmd(), but they are expected to also work in ib_post_xx() in the
>> kernel in the future. Since that may run in spinlock context, we use
>> GFP_ATOMIC.
>
> Are you planning to invoke kzalloc() in the data path?
>
> GFP_ATOMIC causes use of a special allocation pool that is seen as a
> precious resource, because allocations from it must succeed. It is better
> to avoid this flag if you don't need it.
>
> Thanks

We need to allocate memory while a spinlock is held; how about using
GFP_KERNEL or GFP_NOWAIT?

Thanks
Weihang
On Thu, Jan 21, 2021 at 01:33:42PM +0000, liweihang wrote:

> We need to allocate memory while a spinlock is held; how about using
> GFP_KERNEL or GFP_NOWAIT?

You should try hard not to do that. Convert the spinlock to a mutex,
for instance.

Jason
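(A minimal sketch of the suggested conversion, assuming ctx->pool_lock becomes a mutex, renamed pool_mutex here purely for illustration, so the allocation can sleep and use GFP_KERNEL. This is not part of the posted patch.)

static struct dca_mem *alloc_dca_mem(struct hns_roce_dca_ctx *ctx)
{
	struct dca_mem *mem, *found = NULL;

	/* pool_mutex is a hypothetical replacement for the pool_lock spinlock;
	 * holding a mutex allows the sleeping GFP_KERNEL allocation below.
	 */
	mutex_lock(&ctx->pool_mutex);
	list_for_each_entry(mem, &ctx->pool, list) {
		spin_lock(&mem->lock);
		if (dca_mem_is_free(mem)) {
			found = mem;
			set_dca_mem_alloced(mem);
			spin_unlock(&mem->lock);
			break;
		}
		spin_unlock(&mem->lock);
	}

	if (!found) {
		/* No free object in the pool: create a new one. */
		mem = kzalloc(sizeof(*mem), GFP_KERNEL);
		if (mem) {
			spin_lock_init(&mem->lock);
			set_dca_mem_alloced(mem);
			list_add(&mem->list, &ctx->pool);
			found = mem;
		}
	}
	mutex_unlock(&ctx->pool_mutex);

	return found;
}

This only works if no caller allocates DCA memory from atomic context, which is exactly the open question in the thread.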
On 2021/1/21 21:36, Jason Gunthorpe wrote:
> On Thu, Jan 21, 2021 at 01:33:42PM +0000, liweihang wrote:
>
>> We need to allocate memory while a spinlock is held; how about using
>> GFP_KERNEL or GFP_NOWAIT?
>
> You should try hard not to do that. Convert the spinlock to a mutex,
> for instance.
>
> Jason
>

But what if some kernel users call ib_post_send() when holding a spinlock?

Thanks
Weihang
On Thu, Jan 21, 2021 at 01:48:56PM +0000, liweihang wrote:
> On 2021/1/21 21:36, Jason Gunthorpe wrote:
> > On Thu, Jan 21, 2021 at 01:33:42PM +0000, liweihang wrote:
> >
> >> We need to allocate memory while a spinlock is held; how about using
> >> GFP_KERNEL or GFP_NOWAIT?
> >
> > You should try hard not to do that. Convert the spinlock to a mutex,
> > for instance.
> >
> > Jason
>
> But what if some kernel users call ib_post_send() when holding a spinlock?

I doubt extensions like this would be part of kernel verbs..

Does any ULP call ib_post_send() under lock? I'm not sure that is valid.

Jason
On 2021/1/21 21:52, Jason Gunthorpe wrote:
> On Thu, Jan 21, 2021 at 01:48:56PM +0000, liweihang wrote:
>> On 2021/1/21 21:36, Jason Gunthorpe wrote:
>>> On Thu, Jan 21, 2021 at 01:33:42PM +0000, liweihang wrote:
>>>
>>>> We need to allocate memory while a spinlock is held; how about using
>>>> GFP_KERNEL or GFP_NOWAIT?
>>>
>>> You should try hard not to do that. Convert the spinlock to a mutex,
>>> for instance.
>>>
>>> Jason
>>>
>>
>> But what if some kernel users call ib_post_send() when holding a spinlock?
>
> I doubt extensions like this would be part of kernel verbs..
>
> Does any ULP call ib_post_send() under lock? I'm not sure that is valid.
>
> Jason
>

I didn't find such a ULP calling ib_post_send() in a spinlock either.
Anyway, I will use GFP_NOWAIT instead of GFP_ATOMIC.

Thanks
Weihang
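(For reference, the agreed change only swaps the allocation flag; a minimal sketch, assuming the rest of alloc_dca_mem() stays as posted below.)

	/* GFP_NOWAIT fails fast under memory pressure instead of dipping into
	 * the emergency reserves that GFP_ATOMIC may use; the caller already
	 * handles a NULL return.
	 */
	mem = kzalloc(sizeof(*mem), GFP_NOWAIT);
	if (!mem)
		return NULL;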
diff --git a/drivers/infiniband/hw/hns/Makefile b/drivers/infiniband/hw/hns/Makefile
index e105945..9962b23 100644
--- a/drivers/infiniband/hw/hns/Makefile
+++ b/drivers/infiniband/hw/hns/Makefile
@@ -6,7 +6,7 @@
 ccflags-y := -I $(srctree)/drivers/net/ethernet/hisilicon/hns3

 hns-roce-objs := hns_roce_main.o hns_roce_cmd.o hns_roce_pd.o \
-	hns_roce_ah.o hns_roce_hem.o hns_roce_mr.o hns_roce_qp.o \
+	hns_roce_ah.o hns_roce_hem.o hns_roce_mr.o hns_roce_qp.o hns_roce_dca.o \
 	hns_roce_cq.o hns_roce_alloc.o hns_roce_db.o hns_roce_srq.o hns_roce_restrack.o

 ifdef CONFIG_INFINIBAND_HNS_HIP06
diff --git a/drivers/infiniband/hw/hns/hns_roce_dca.c b/drivers/infiniband/hw/hns/hns_roce_dca.c
new file mode 100644
index 0000000..872e51a
--- /dev/null
+++ b/drivers/infiniband/hw/hns/hns_roce_dca.c
@@ -0,0 +1,381 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2021 Hisilicon Limited. All rights reserved.
+ */
+
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/uverbs_types.h>
+#include <rdma/uverbs_ioctl.h>
+#include <rdma/uverbs_std_types.h>
+#include <rdma/ib_umem.h>
+#include "hns_roce_device.h"
+#include "hns_roce_dca.h"
+
+#define UVERBS_MODULE_NAME hns_ib
+#include <rdma/uverbs_named_ioctl.h>
+
+/* DCA memory */
+struct dca_mem {
+#define DCA_MEM_FLAGS_ALLOCED BIT(0)
+#define DCA_MEM_FLAGS_REGISTERED BIT(1)
+	u32 flags;
+	struct list_head list; /* link to mem list in dca context */
+	spinlock_t lock; /* protect the @flags and @list */
+	int page_count; /* page count in this mem obj */
+	u64 key; /* register by caller */
+	u32 size; /* bytes in this mem object */
+	struct hns_dca_page_state *states; /* record each page's state */
+	void *pages; /* memory handle for getting dma address */
+};
+
+struct dca_mem_attr {
+	u64 key;
+	u64 addr;
+	u32 size;
+};
+
+static inline bool dca_mem_is_free(struct dca_mem *mem)
+{
+	return mem->flags == 0;
+}
+
+static inline void set_dca_mem_free(struct dca_mem *mem)
+{
+	mem->flags = 0;
+}
+
+static inline void set_dca_mem_alloced(struct dca_mem *mem)
+{
+	mem->flags |= DCA_MEM_FLAGS_ALLOCED;
+}
+
+static inline void set_dca_mem_registered(struct dca_mem *mem)
+{
+	mem->flags |= DCA_MEM_FLAGS_REGISTERED;
+}
+
+static inline void clr_dca_mem_registered(struct dca_mem *mem)
+{
+	mem->flags &= ~DCA_MEM_FLAGS_REGISTERED;
+}
+
+static void free_dca_pages(void *pages)
+{
+	ib_umem_release(pages);
+}
+
+static void *alloc_dca_pages(struct hns_roce_dev *hr_dev, struct dca_mem *mem,
+			     struct dca_mem_attr *attr)
+{
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	struct ib_umem *umem;
+
+	umem = ib_umem_get(ibdev, attr->addr, attr->size, 0);
+	if (IS_ERR(umem)) {
+		ibdev_err(ibdev, "failed to get uDCA pages, ret = %ld.\n",
+			  PTR_ERR(umem));
+		return NULL;
+	}
+
+	mem->page_count = ib_umem_num_dma_blocks(umem, HNS_HW_PAGE_SIZE);
+
+	return umem;
+}
+
+static void free_mem_states(struct hns_dca_page_state *states)
+{
+	kfree(states);
+}
+
+static void init_dca_umem_states(struct hns_dca_page_state *states, int count,
+				 struct ib_umem *umem)
+{
+	struct ib_block_iter biter;
+	dma_addr_t cur_addr;
+	dma_addr_t pre_addr;
+	int i = 0;
+
+	pre_addr = 0;
+	rdma_for_each_block(umem->sg_head.sgl, &biter, umem->nmap,
+			    HNS_HW_PAGE_SIZE) {
+		cur_addr = rdma_block_iter_dma_address(&biter);
+		if (i < count) {
+			if (cur_addr - pre_addr != HNS_HW_PAGE_SIZE)
+				states[i].head = 1;
+		}
+
+		pre_addr = cur_addr;
+		i++;
+	}
+}
+
+static struct hns_dca_page_state *alloc_dca_states(void *pages, int count)
+{
+	struct hns_dca_page_state *states;
+
+	states = kcalloc(count, sizeof(*states), GFP_ATOMIC);
+	if (!states)
+		return NULL;
+
+	init_dca_umem_states(states, count, pages);
+
+	return states;
+}
+
+/* user DCA is managed by ucontext */
+static inline struct hns_roce_dca_ctx *
+to_hr_dca_ctx(struct hns_roce_ucontext *uctx)
+{
+	return &uctx->dca_ctx;
+}
+
+static void unregister_dca_mem(struct hns_roce_ucontext *uctx,
+			       struct dca_mem *mem)
+{
+	struct hns_roce_dca_ctx *ctx = to_hr_dca_ctx(uctx);
+	unsigned long flags;
+	void *states, *pages;
+
+	spin_lock_irqsave(&ctx->pool_lock, flags);
+
+	spin_lock(&mem->lock);
+	clr_dca_mem_registered(mem);
+	mem->page_count = 0;
+	pages = mem->pages;
+	mem->pages = NULL;
+	states = mem->states;
+	mem->states = NULL;
+	spin_unlock(&mem->lock);
+
+	ctx->free_mems--;
+	ctx->free_size -= mem->size;
+
+	ctx->total_size -= mem->size;
+	spin_unlock_irqrestore(&ctx->pool_lock, flags);
+
+	free_mem_states(states);
+	free_dca_pages(pages);
+}
+
+static int register_dca_mem(struct hns_roce_dev *hr_dev,
+			    struct hns_roce_ucontext *uctx,
+			    struct dca_mem *mem, struct dca_mem_attr *attr)
+{
+	struct hns_roce_dca_ctx *ctx = to_hr_dca_ctx(uctx);
+	void *states, *pages;
+	unsigned long flags;
+
+	pages = alloc_dca_pages(hr_dev, mem, attr);
+	if (!pages)
+		return -ENOMEM;
+
+	states = alloc_dca_states(pages, mem->page_count);
+	if (!states) {
+		free_dca_pages(pages);
+		return -ENOMEM;
+	}
+
+	spin_lock_irqsave(&ctx->pool_lock, flags);
+
+	spin_lock(&mem->lock);
+	mem->pages = pages;
+	mem->states = states;
+	mem->key = attr->key;
+	mem->size = attr->size;
+	set_dca_mem_registered(mem);
+	spin_unlock(&mem->lock);
+
+	ctx->free_mems++;
+	ctx->free_size += attr->size;
+	ctx->total_size += attr->size;
+	spin_unlock_irqrestore(&ctx->pool_lock, flags);
+
+	return 0;
+}
+
+static void init_dca_context(struct hns_roce_dca_ctx *ctx)
+{
+	INIT_LIST_HEAD(&ctx->pool);
+	spin_lock_init(&ctx->pool_lock);
+	ctx->total_size = 0;
+}
+
+static void cleanup_dca_context(struct hns_roce_dev *hr_dev,
+				struct hns_roce_dca_ctx *ctx)
+{
+	struct dca_mem *mem, *tmp;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ctx->pool_lock, flags);
+	list_for_each_entry_safe(mem, tmp, &ctx->pool, list) {
+		list_del(&mem->list);
+		set_dca_mem_free(mem);
+		spin_unlock_irqrestore(&ctx->pool_lock, flags);
+
+		free_mem_states(mem->states);
+		free_dca_pages(mem->pages);
+		kfree(mem);
+
+		spin_lock_irqsave(&ctx->pool_lock, flags);
+	}
+	ctx->total_size = 0;
+	spin_unlock_irqrestore(&ctx->pool_lock, flags);
+}
+
+void hns_roce_register_udca(struct hns_roce_dev *hr_dev,
+			    struct hns_roce_ucontext *uctx)
+{
+	init_dca_context(&uctx->dca_ctx);
+}
+
+void hns_roce_unregister_udca(struct hns_roce_dev *hr_dev,
+			      struct hns_roce_ucontext *uctx)
+{
+	cleanup_dca_context(hr_dev, &uctx->dca_ctx);
+}
+
+static struct dca_mem *alloc_dca_mem(struct hns_roce_dca_ctx *ctx)
+{
+	struct dca_mem *mem, *tmp, *found = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ctx->pool_lock, flags);
+	list_for_each_entry_safe(mem, tmp, &ctx->pool, list) {
+		spin_lock(&mem->lock);
+		if (dca_mem_is_free(mem)) {
+			found = mem;
+			set_dca_mem_alloced(mem);
+			spin_unlock(&mem->lock);
+			goto done;
+		}
+		spin_unlock(&mem->lock);
+	}
+
+done:
+	spin_unlock_irqrestore(&ctx->pool_lock, flags);
+
+	if (found)
+		return found;
+
+	mem = kzalloc(sizeof(*mem), GFP_ATOMIC);
+	if (!mem)
+		return NULL;
+
+	spin_lock_init(&mem->lock);
+	INIT_LIST_HEAD(&mem->list);
+
+	set_dca_mem_alloced(mem);
+
+	spin_lock_irqsave(&ctx->pool_lock, flags);
+	list_add(&mem->list, &ctx->pool);
+	spin_unlock_irqrestore(&ctx->pool_lock, flags);
+	return mem;
+}
+
+static void free_dca_mem(struct dca_mem *mem)
+{
+	/* We cannot hold the whole pool's lock during the DCA is working
+	 * until cleanup the context in cleanup_dca_context(), so we just
+	 * set the DCA mem state as free when destroying DCA mem object.
+	 */
+	spin_lock(&mem->lock);
+	set_dca_mem_free(mem);
+	spin_unlock(&mem->lock);
+}
+
+static inline struct hns_roce_ucontext *
+uverbs_attr_to_hr_uctx(struct uverbs_attr_bundle *attrs)
+{
+	return rdma_udata_to_drv_context(&attrs->driver_udata,
+					 struct hns_roce_ucontext, ibucontext);
+}
+
+static int UVERBS_HANDLER(HNS_IB_METHOD_DCA_MEM_REG)(
+	struct uverbs_attr_bundle *attrs)
+{
+	struct hns_roce_ucontext *uctx = uverbs_attr_to_hr_uctx(attrs);
+	struct hns_roce_dev *hr_dev = to_hr_dev(uctx->ibucontext.device);
+	struct ib_uobject *uobj =
+		uverbs_attr_get_uobject(attrs, HNS_IB_ATTR_DCA_MEM_REG_HANDLE);
+	struct dca_mem_attr init_attr = {};
+	struct dca_mem *mem;
+	int ret;
+
+	if (uverbs_copy_from(&init_attr.addr, attrs,
+			     HNS_IB_ATTR_DCA_MEM_REG_ADDR) ||
+	    uverbs_copy_from(&init_attr.size, attrs,
+			     HNS_IB_ATTR_DCA_MEM_REG_LEN) ||
+	    uverbs_copy_from(&init_attr.key, attrs,
+			     HNS_IB_ATTR_DCA_MEM_REG_KEY))
+		return -EFAULT;
+
+	mem = alloc_dca_mem(to_hr_dca_ctx(uctx));
+	if (!mem)
+		return -ENOMEM;
+
+	ret = register_dca_mem(hr_dev, uctx, mem, &init_attr);
+	if (ret) {
+		free_dca_mem(mem);
+		return ret;
+	}
+
+	uobj->object = mem;
+
+	return 0;
+}
+
+static int dca_cleanup(struct ib_uobject *uobject, enum rdma_remove_reason why,
+		       struct uverbs_attr_bundle *attrs)
+{
+	struct hns_roce_ucontext *uctx = uverbs_attr_to_hr_uctx(attrs);
+	struct dca_mem *mem;
+
+	/* One DCA MEM maybe shared by many QPs, so the DCA mem uobject must
+	 * be destroyed before all QP uobjects, and we will destroy the DCA
+	 * uobjects when cleanup DCA context by calling hns_roce_cleanup_dca().
+	 */
+	if (why == RDMA_REMOVE_CLOSE || why == RDMA_REMOVE_DRIVER_REMOVE)
+		return 0;
+
+	mem = uobject->object;
+	unregister_dca_mem(uctx, mem);
+	free_dca_mem(mem);
+
+	return 0;
+}
+
+DECLARE_UVERBS_NAMED_METHOD(
+	HNS_IB_METHOD_DCA_MEM_REG,
+	UVERBS_ATTR_IDR(HNS_IB_ATTR_DCA_MEM_REG_HANDLE, HNS_IB_OBJECT_DCA_MEM,
+			UVERBS_ACCESS_NEW, UA_MANDATORY),
+	UVERBS_ATTR_PTR_IN(HNS_IB_ATTR_DCA_MEM_REG_LEN, UVERBS_ATTR_TYPE(u32),
+			   UA_MANDATORY),
+	UVERBS_ATTR_PTR_IN(HNS_IB_ATTR_DCA_MEM_REG_ADDR, UVERBS_ATTR_TYPE(u64),
+			   UA_MANDATORY),
+	UVERBS_ATTR_PTR_IN(HNS_IB_ATTR_DCA_MEM_REG_KEY, UVERBS_ATTR_TYPE(u64),
+			   UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+	HNS_IB_METHOD_DCA_MEM_DEREG,
+	UVERBS_ATTR_IDR(HNS_IB_ATTR_DCA_MEM_DEREG_HANDLE, HNS_IB_OBJECT_DCA_MEM,
+			UVERBS_ACCESS_DESTROY, UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_OBJECT(HNS_IB_OBJECT_DCA_MEM,
+			    UVERBS_TYPE_ALLOC_IDR(dca_cleanup),
+			    &UVERBS_METHOD(HNS_IB_METHOD_DCA_MEM_REG),
+			    &UVERBS_METHOD(HNS_IB_METHOD_DCA_MEM_DEREG));
+
+static bool dca_is_supported(struct ib_device *device)
+{
+	struct hns_roce_dev *dev = to_hr_dev(device);
+
+	return dev->caps.flags & HNS_ROCE_CAP_FLAG_DCA_MODE;
+}
+
+const struct uapi_definition hns_roce_dca_uapi_defs[] = {
+	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(
+		HNS_IB_OBJECT_DCA_MEM,
+		UAPI_DEF_IS_OBJ_SUPPORTED(dca_is_supported)),
+	{}
+};
diff --git a/drivers/infiniband/hw/hns/hns_roce_dca.h b/drivers/infiniband/hw/hns/hns_roce_dca.h
new file mode 100644
index 0000000..cb3481f
--- /dev/null
+++ b/drivers/infiniband/hw/hns/hns_roce_dca.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2021 Hisilicon Limited. All rights reserved.
+ */
+
+#ifndef __HNS_ROCE_DCA_H
+#define __HNS_ROCE_DCA_H
+
+/* DCA page state (32 bit) */
+struct hns_dca_page_state {
+	u32 buf_id : 29; /* If zero, means page can be used by any buffer. */
+	u32 lock : 1; /* @buf_id locked this page to prepare access. */
+	u32 active : 1; /* @buf_id is accessing this page. */
+	u32 head : 1; /* This page is the head in a continuous address range.
+		       */
+};
+
+void hns_roce_register_udca(struct hns_roce_dev *hr_dev,
+			    struct hns_roce_ucontext *uctx);
+void hns_roce_unregister_udca(struct hns_roce_dev *hr_dev,
+			      struct hns_roce_ucontext *uctx);
+
+#endif
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index 55d5386..5524d72 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -215,6 +215,7 @@ enum {
 	HNS_ROCE_CAP_FLAG_QP_FLOW_CTRL = BIT(9),
 	HNS_ROCE_CAP_FLAG_ATOMIC = BIT(10),
 	HNS_ROCE_CAP_FLAG_SDI_MODE = BIT(14),
+	HNS_ROCE_CAP_FLAG_DCA_MODE = BIT(15),
 	HNS_ROCE_CAP_FLAG_STASH = BIT(17),
 };

@@ -266,11 +267,20 @@ struct hns_roce_uar {
 	unsigned long logic_idx;
 };

+struct hns_roce_dca_ctx {
+	struct list_head pool; /* all DCA mems link to @pool */
+	spinlock_t pool_lock; /* protect @pool */
+	unsigned int free_mems; /* free mem num in pool */
+	size_t free_size; /* free mem size in pool */
+	size_t total_size; /* total size in pool */
+};
+
 struct hns_roce_ucontext {
 	struct ib_ucontext ibucontext;
 	struct hns_roce_uar uar;
 	struct list_head page_list;
 	struct mutex page_mutex;
+	struct hns_roce_dca_ctx dca_ctx;
 };

 struct hns_roce_pd {
diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
index d9179ba..66d0d02d 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -37,10 +37,12 @@
 #include <rdma/ib_addr.h>
 #include <rdma/ib_smi.h>
 #include <rdma/ib_user_verbs.h>
+#include <rdma/uverbs_ioctl.h>
 #include <rdma/ib_cache.h>
 #include "hns_roce_common.h"
 #include "hns_roce_device.h"
 #include "hns_roce_hem.h"
+#include "hns_roce_dca.h"

 /**
  * hns_get_gid_index - Get gid index.
@@ -306,15 +308,16 @@ static int hns_roce_modify_device(struct ib_device *ib_dev, int mask,
 static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx,
 				   struct ib_udata *udata)
 {
-	int ret;
 	struct hns_roce_ucontext *context = to_hr_ucontext(uctx);
-	struct hns_roce_ib_alloc_ucontext_resp resp = {};
 	struct hns_roce_dev *hr_dev = to_hr_dev(uctx->device);
+	struct hns_roce_ib_alloc_ucontext_resp resp = {};
+	int ret;

 	if (!hr_dev->active)
 		return -EAGAIN;

 	resp.qp_tab_size = hr_dev->caps.num_qps;
+	resp.cap_flags = (u32)hr_dev->caps.flags;

 	ret = hns_roce_uar_alloc(hr_dev, &context->uar);
 	if (ret)
@@ -325,6 +328,9 @@ static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx,
 		mutex_init(&context->page_mutex);
 	}

+	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_DCA_MODE)
+		hns_roce_register_udca(hr_dev, context);
+
 	resp.cqe_size = hr_dev->caps.cqe_sz;

 	ret = ib_copy_to_udata(udata, &resp,
@@ -335,6 +341,9 @@ static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx,
 	return 0;

 error_fail_copy_to_udata:
+	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_DCA_MODE)
+		hns_roce_unregister_udca(hr_dev, context);
+
 	hns_roce_uar_free(hr_dev, &context->uar);

 error_fail_uar_alloc:
@@ -344,8 +353,12 @@ static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx,
 static void hns_roce_dealloc_ucontext(struct ib_ucontext *ibcontext)
 {
 	struct hns_roce_ucontext *context = to_hr_ucontext(ibcontext);
+	struct hns_roce_dev *hr_dev = to_hr_dev(ibcontext->device);

 	hns_roce_uar_free(to_hr_dev(ibcontext->device), &context->uar);
+
+	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_DCA_MODE)
+		hns_roce_unregister_udca(hr_dev, context);
 }

 static int hns_roce_mmap(struct ib_ucontext *context,
@@ -414,6 +427,12 @@ static void hns_roce_unregister_device(struct hns_roce_dev *hr_dev)
 	ib_unregister_device(&hr_dev->ib_dev);
 }

+extern const struct uapi_definition hns_roce_dca_uapi_defs[];
+static const struct uapi_definition hns_roce_uapi_defs[] = {
+	UAPI_DEF_CHAIN(hns_roce_dca_uapi_defs),
+	{}
+};
+
 static const struct ib_device_ops hns_roce_dev_ops = {
 	.owner = THIS_MODULE,
 	.driver_id = RDMA_DRIVER_HNS,
@@ -515,6 +534,10 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev)
 	ib_set_device_ops(ib_dev, hr_dev->hw->hns_roce_dev_ops);
 	ib_set_device_ops(ib_dev, &hns_roce_dev_ops);
+
+	if (IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS))
+		ib_dev->driver_def = hns_roce_uapi_defs;
+
 	for (i = 0; i < hr_dev->caps.num_ports; i++) {
 		if (!hr_dev->iboe.netdevs[i])
 			continue;
diff --git a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h
index 90b739d..f59abc4 100644
--- a/include/uapi/rdma/hns-abi.h
+++ b/include/uapi/rdma/hns-abi.h
@@ -86,10 +86,33 @@ struct hns_roce_ib_create_qp_resp {
 struct hns_roce_ib_alloc_ucontext_resp {
 	__u32	qp_tab_size;
 	__u32	cqe_size;
+	__u32	cap_flags;
 };

 struct hns_roce_ib_alloc_pd_resp {
 	__u32 pdn;
 };

+#define UVERBS_ID_NS_MASK 0xF000
+#define UVERBS_ID_NS_SHIFT 12
+
+enum hns_ib_objects {
+	HNS_IB_OBJECT_DCA_MEM = (1U << UVERBS_ID_NS_SHIFT),
+};
+
+enum hns_ib_dca_mem_methods {
+	HNS_IB_METHOD_DCA_MEM_REG = (1U << UVERBS_ID_NS_SHIFT),
+	HNS_IB_METHOD_DCA_MEM_DEREG,
+};
+
+enum hns_ib_dca_mem_reg_attrs {
+	HNS_IB_ATTR_DCA_MEM_REG_HANDLE = (1U << UVERBS_ID_NS_SHIFT),
+	HNS_IB_ATTR_DCA_MEM_REG_LEN,
+	HNS_IB_ATTR_DCA_MEM_REG_ADDR,
+	HNS_IB_ATTR_DCA_MEM_REG_KEY,
+};
+
+enum hns_ib_dca_mem_dereg_attrs {
+	HNS_IB_ATTR_DCA_MEM_DEREG_HANDLE = (1U << UVERBS_ID_NS_SHIFT),
+};
 #endif /* HNS_ABI_USER_H */