Message ID | 20181108191017.21891-9-leon@kernel.org (mailing list archive) |
---|---|
State | Changes Requested |
Headers | show |
Series | Collection of ODP fixes | expand |
On Thu, Nov 08, 2018 at 09:10:15PM +0200, Leon Romanovsky wrote: > From: Moni Shoua <monis@mellanox.com> > > Telling the HCA that page fault handling is done and QP can resume > its flow is done in the context of the page fault handler. This blocks > the handling of the next work in queue without a need. > Call the PAGE_FAULT_RESUME command in an asynchronous manner and free > the workqueue to pick the next work item for handling. All tasks that > were executed after PAGE_FAULT_RESUME need to be done now > in the callback of the asynchronous command mechanism. > > Signed-off-by: Moni Shoua <monis@mellanox.com> > Signed-off-by: Leon Romanovsky <leonro@mellanox.com> > drivers/infiniband/hw/mlx5/odp.c | 110 +++++++++++++++++++++++++------ > include/linux/mlx5/driver.h | 3 + > 2 files changed, 94 insertions(+), 19 deletions(-) > > diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c > index abce55b8b9ba..0c4f469cdd5b 100644 > +++ b/drivers/infiniband/hw/mlx5/odp.c > @@ -298,20 +298,78 @@ void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev) > return; > } > > +struct pfault_resume_cb_ctx { > + struct mlx5_ib_dev *dev; > + struct mlx5_core_rsc_common *res; > + struct mlx5_pagefault *pfault; > +}; > + > +static void page_fault_resume_callback(int status, void *context) > +{ > + struct pfault_resume_cb_ctx *ctx = context; > + struct mlx5_pagefault *pfault = ctx->pfault; > + > + if (status) > + mlx5_ib_err(ctx->dev, "Resolve the page fault failed with status %d\n", > + status); > + > + if (ctx->res) > + mlx5_core_res_put(ctx->res); > + kfree(pfault); > + kfree(ctx); > +} > + > static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev, > + struct mlx5_core_rsc_common *res, > struct mlx5_pagefault *pfault, > - int error) > + int error, > + bool async) > { > + int ret = 0; > + u32 *out = pfault->out_pf_resume; > + u32 *in = pfault->in_pf_resume; > + u32 token = pfault->token; > int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE 
? > - pfault->wqe.wq_num : pfault->token; > - int ret = mlx5_core_page_fault_resume(dev->mdev, > - pfault->token, > - wq_num, > - pfault->type, > - error); > - if (ret) > - mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x\n", > - wq_num); > + pfault->wqe.wq_num : pfault->token; > + u8 type = pfault->type; > + struct pfault_resume_cb_ctx *ctx = NULL; > + > + if (async) > + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); Why not allocate this ctx as part of the mlx5_pagefault and avoid this allocation failure strategy? Jason
On Thu, Nov 08, 2018 at 07:49:03PM +0000, Jason Gunthorpe wrote: > On Thu, Nov 08, 2018 at 09:10:15PM +0200, Leon Romanovsky wrote: > > From: Moni Shoua <monis@mellanox.com> > > > > Telling the HCA that page fault handling is done and QP can resume > > its flow is done in the context of the page fault handler. This blocks > > the handling of the next work in queue without a need. > > Call the PAGE_FAULT_RESUME command in an asynchronous manner and free > > the workqueue to pick the next work item for handling. All tasks that > > were executed after PAGE_FAULT_RESUME need to be done now > > in the callback of the asynchronous command mechanism. > > > > Signed-off-by: Moni Shoua <monis@mellanox.com> > > Signed-off-by: Leon Romanovsky <leonro@mellanox.com> > > drivers/infiniband/hw/mlx5/odp.c | 110 +++++++++++++++++++++++++------ > > include/linux/mlx5/driver.h | 3 + > > 2 files changed, 94 insertions(+), 19 deletions(-) > > > > diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c > > index abce55b8b9ba..0c4f469cdd5b 100644 > > +++ b/drivers/infiniband/hw/mlx5/odp.c > > @@ -298,20 +298,78 @@ void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev) > > return; > > } > > > > +struct pfault_resume_cb_ctx { > > + struct mlx5_ib_dev *dev; > > + struct mlx5_core_rsc_common *res; > > + struct mlx5_pagefault *pfault; > > +}; > > + > > +static void page_fault_resume_callback(int status, void *context) > > +{ > > + struct pfault_resume_cb_ctx *ctx = context; > > + struct mlx5_pagefault *pfault = ctx->pfault; > > + > > + if (status) > > + mlx5_ib_err(ctx->dev, "Resolve the page fault failed with status %d\n", > > + status); > > + > > + if (ctx->res) > > + mlx5_core_res_put(ctx->res); > > + kfree(pfault); > > + kfree(ctx); > > +} > > + > > static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev, > > + struct mlx5_core_rsc_common *res, > > struct mlx5_pagefault *pfault, > > - int error) > > + int error, > > + bool async) > > { > > + int ret = 
0; > > + u32 *out = pfault->out_pf_resume; > > + u32 *in = pfault->in_pf_resume; > > + u32 token = pfault->token; > > int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ? > > - pfault->wqe.wq_num : pfault->token; > > - int ret = mlx5_core_page_fault_resume(dev->mdev, > > - pfault->token, > > - wq_num, > > - pfault->type, > > - error); > > - if (ret) > > - mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x\n", > > - wq_num); > > + pfault->wqe.wq_num : pfault->token; > > + u8 type = pfault->type; > > + struct pfault_resume_cb_ctx *ctx = NULL; > > + > > + if (async) > > + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); > > Why not allocate this ctx ast part of the mlx5_pagefault and avoid > this allocation failure strategy? It is another way to implement it, both of them are correct. Can I assume that we can progress with patches except patch #2? Thanks > > Jason
On Fri, Nov 09, 2018 at 06:26:22PM +0200, Leon Romanovsky wrote: > On Thu, Nov 08, 2018 at 07:49:03PM +0000, Jason Gunthorpe wrote: > > On Thu, Nov 08, 2018 at 09:10:15PM +0200, Leon Romanovsky wrote: > > > From: Moni Shoua <monis@mellanox.com> > > > > > > Telling the HCA that page fault handling is done and QP can resume > > > its flow is done in the context of the page fault handler. This blocks > > > the handling of the next work in queue without a need. > > > Call the PAGE_FAULT_RESUME command in an asynchronous manner and free > > > the workqueue to pick the next work item for handling. All tasks that > > > were executed after PAGE_FAULT_RESUME need to be done now > > > in the callback of the asynchronous command mechanism. > > > > > > Signed-off-by: Moni Shoua <monis@mellanox.com> > > > Signed-off-by: Leon Romanovsky <leonro@mellanox.com> > > > drivers/infiniband/hw/mlx5/odp.c | 110 +++++++++++++++++++++++++------ > > > include/linux/mlx5/driver.h | 3 + > > > 2 files changed, 94 insertions(+), 19 deletions(-) > > > > > > diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c > > > index abce55b8b9ba..0c4f469cdd5b 100644 > > > +++ b/drivers/infiniband/hw/mlx5/odp.c > > > @@ -298,20 +298,78 @@ void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev) > > > return; > > > } > > > > > > +struct pfault_resume_cb_ctx { > > > + struct mlx5_ib_dev *dev; > > > + struct mlx5_core_rsc_common *res; > > > + struct mlx5_pagefault *pfault; > > > +}; > > > + > > > +static void page_fault_resume_callback(int status, void *context) > > > +{ > > > + struct pfault_resume_cb_ctx *ctx = context; > > > + struct mlx5_pagefault *pfault = ctx->pfault; > > > + > > > + if (status) > > > + mlx5_ib_err(ctx->dev, "Resolve the page fault failed with status %d\n", > > > + status); > > > + > > > + if (ctx->res) > > > + mlx5_core_res_put(ctx->res); > > > + kfree(pfault); > > > + kfree(ctx); > > > +} > > > + > > > static void mlx5_ib_page_fault_resume(struct 
mlx5_ib_dev *dev, > > > + struct mlx5_core_rsc_common *res, > > > struct mlx5_pagefault *pfault, > > > - int error) > > > + int error, > > > + bool async) > > > { > > > + int ret = 0; > > > + u32 *out = pfault->out_pf_resume; > > > + u32 *in = pfault->in_pf_resume; > > > + u32 token = pfault->token; > > > int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ? > > > - pfault->wqe.wq_num : pfault->token; > > > - int ret = mlx5_core_page_fault_resume(dev->mdev, > > > - pfault->token, > > > - wq_num, > > > - pfault->type, > > > - error); > > > - if (ret) > > > - mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x\n", > > > - wq_num); > > > + pfault->wqe.wq_num : pfault->token; > > > + u8 type = pfault->type; > > > + struct pfault_resume_cb_ctx *ctx = NULL; > > > + > > > + if (async) > > > + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); > > > > Why not allocate this ctx ast part of the mlx5_pagefault and avoid > > this allocation failure strategy? > > It is another way to implement it, both of them are correct. .. I think it is alot better to move this allocation, it gets rid of this ugly duplicated code > Can I assume that we can progress with patches except patch #2? Lets drop this one too.. Jason
On Fri, Nov 09, 2018 at 09:59:53AM -0700, Jason Gunthorpe wrote: > On Fri, Nov 09, 2018 at 06:26:22PM +0200, Leon Romanovsky wrote: > > On Thu, Nov 08, 2018 at 07:49:03PM +0000, Jason Gunthorpe wrote: > > > On Thu, Nov 08, 2018 at 09:10:15PM +0200, Leon Romanovsky wrote: > > > > From: Moni Shoua <monis@mellanox.com> > > > > > > > > Telling the HCA that page fault handling is done and QP can resume > > > > its flow is done in the context of the page fault handler. This blocks > > > > the handling of the next work in queue without a need. > > > > Call the PAGE_FAULT_RESUME command in an asynchronous manner and free > > > > the workqueue to pick the next work item for handling. All tasks that > > > > were executed after PAGE_FAULT_RESUME need to be done now > > > > in the callback of the asynchronous command mechanism. > > > > > > > > Signed-off-by: Moni Shoua <monis@mellanox.com> > > > > Signed-off-by: Leon Romanovsky <leonro@mellanox.com> > > > > drivers/infiniband/hw/mlx5/odp.c | 110 +++++++++++++++++++++++++------ > > > > include/linux/mlx5/driver.h | 3 + > > > > 2 files changed, 94 insertions(+), 19 deletions(-) > > > > > > > > diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c > > > > index abce55b8b9ba..0c4f469cdd5b 100644 > > > > +++ b/drivers/infiniband/hw/mlx5/odp.c > > > > @@ -298,20 +298,78 @@ void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev) > > > > return; > > > > } > > > > > > > > +struct pfault_resume_cb_ctx { > > > > + struct mlx5_ib_dev *dev; > > > > + struct mlx5_core_rsc_common *res; > > > > + struct mlx5_pagefault *pfault; > > > > +}; > > > > + > > > > +static void page_fault_resume_callback(int status, void *context) > > > > +{ > > > > + struct pfault_resume_cb_ctx *ctx = context; > > > > + struct mlx5_pagefault *pfault = ctx->pfault; > > > > + > > > > + if (status) > > > > + mlx5_ib_err(ctx->dev, "Resolve the page fault failed with status %d\n", > > > > + status); > > > > + > > > > + if (ctx->res) > > > > 
+ mlx5_core_res_put(ctx->res); > > > > + kfree(pfault); > > > > + kfree(ctx); > > > > +} > > > > + > > > > static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev, > > > > + struct mlx5_core_rsc_common *res, > > > > struct mlx5_pagefault *pfault, > > > > - int error) > > > > + int error, > > > > + bool async) > > > > { > > > > + int ret = 0; > > > > + u32 *out = pfault->out_pf_resume; > > > > + u32 *in = pfault->in_pf_resume; > > > > + u32 token = pfault->token; > > > > int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ? > > > > - pfault->wqe.wq_num : pfault->token; > > > > - int ret = mlx5_core_page_fault_resume(dev->mdev, > > > > - pfault->token, > > > > - wq_num, > > > > - pfault->type, > > > > - error); > > > > - if (ret) > > > > - mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x\n", > > > > - wq_num); > > > > + pfault->wqe.wq_num : pfault->token; > > > > + u8 type = pfault->type; > > > > + struct pfault_resume_cb_ctx *ctx = NULL; > > > > + > > > > + if (async) > > > > + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); > > > > > > Why not allocate this ctx ast part of the mlx5_pagefault and avoid > > > this allocation failure strategy? > > > > It is another way to implement it, both of them are correct. > > .. I think it is alot better to move this allocation, it gets rid of > this ugly duplicated code > > > Can I assume that we can progress with patches except patch #2? > > Lets drop this one too.. Sure, thanks > > Jason
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index abce55b8b9ba..0c4f469cdd5b 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -298,20 +298,78 @@ void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev) return; } +struct pfault_resume_cb_ctx { + struct mlx5_ib_dev *dev; + struct mlx5_core_rsc_common *res; + struct mlx5_pagefault *pfault; +}; + +static void page_fault_resume_callback(int status, void *context) +{ + struct pfault_resume_cb_ctx *ctx = context; + struct mlx5_pagefault *pfault = ctx->pfault; + + if (status) + mlx5_ib_err(ctx->dev, "Resolve the page fault failed with status %d\n", + status); + + if (ctx->res) + mlx5_core_res_put(ctx->res); + kfree(pfault); + kfree(ctx); +} + static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev, + struct mlx5_core_rsc_common *res, struct mlx5_pagefault *pfault, - int error) + int error, + bool async) { + int ret = 0; + u32 *out = pfault->out_pf_resume; + u32 *in = pfault->in_pf_resume; + u32 token = pfault->token; int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ? 
- pfault->wqe.wq_num : pfault->token; - int ret = mlx5_core_page_fault_resume(dev->mdev, - pfault->token, - wq_num, - pfault->type, - error); - if (ret) - mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x\n", - wq_num); + pfault->wqe.wq_num : pfault->token; + u8 type = pfault->type; + struct pfault_resume_cb_ctx *ctx = NULL; + + if (async) + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); + + if (ctx) { + ctx->dev = dev; + ctx->res = res; + ctx->pfault = pfault; + } + + memset(out, 0, MLX5_ST_SZ_BYTES(page_fault_resume_out)); + memset(in, 0, MLX5_ST_SZ_BYTES(page_fault_resume_in)); + + MLX5_SET(page_fault_resume_in, in, opcode, MLX5_CMD_OP_PAGE_FAULT_RESUME); + MLX5_SET(page_fault_resume_in, in, error, !!error); + MLX5_SET(page_fault_resume_in, in, page_fault_type, type); + MLX5_SET(page_fault_resume_in, in, wq_number, wq_num); + MLX5_SET(page_fault_resume_in, in, token, token); + + if (ctx) + ret = mlx5_cmd_exec_cb(dev->mdev, + in, MLX5_ST_SZ_BYTES(page_fault_resume_in), + out, MLX5_ST_SZ_BYTES(page_fault_resume_out), + page_fault_resume_callback, ctx); + else + ret = mlx5_cmd_exec(dev->mdev, + in, MLX5_ST_SZ_BYTES(page_fault_resume_in), + out, MLX5_ST_SZ_BYTES(page_fault_resume_out)); + if (ret || !ctx) { + if (ret) + mlx5_ib_err(dev, "Failed to resume the page fault (%d)\n", ret); + if (res) + mlx5_core_res_put(res); + if (async) + kfree(pfault); + kfree(ctx); + } } static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd, @@ -1045,7 +1103,8 @@ static inline struct mlx5_ib_qp *res_to_qp(struct mlx5_core_rsc_common *res) } static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev, - struct mlx5_pagefault *pfault) + struct mlx5_pagefault *pfault, + bool async_resume) { int ret; void *wqe, *wqe_end; @@ -1113,11 +1172,10 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev, resume_with_error = 0; resolve_page_fault: - mlx5_ib_page_fault_resume(dev, pfault, resume_with_error); + mlx5_ib_page_fault_resume(dev, res, pfault, 
resume_with_error, async_resume); mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, type: 0x%x\n", pfault->wqe.wq_num, resume_with_error, pfault->type); - mlx5_core_res_put(res); free_page((unsigned long)buffer); } @@ -1128,7 +1186,8 @@ static int pages_in_range(u64 address, u32 length) } static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev, - struct mlx5_pagefault *pfault) + struct mlx5_pagefault *pfault, + bool async_resume) { u64 address; u32 length; @@ -1166,14 +1225,14 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev, /* We're racing with an invalidation, don't prefetch */ prefetch_activated = 0; } else if (ret < 0 || pages_in_range(address, length) > ret) { - mlx5_ib_page_fault_resume(dev, pfault, 1); + mlx5_ib_page_fault_resume(dev, NULL, pfault, 1, async_resume); if (ret != -ENOENT) mlx5_ib_dbg(dev, "PAGE FAULT error %d. QP 0x%x, type: 0x%x\n", ret, pfault->token, pfault->type); return; } - mlx5_ib_page_fault_resume(dev, pfault, 0); + mlx5_ib_page_fault_resume(dev, NULL, pfault, 0, async_resume); mlx5_ib_dbg(dev, "PAGE FAULT completed. 
QP 0x%x, type: 0x%x, prefetch_activated: %d\n", pfault->token, pfault->type, prefetch_activated); @@ -1201,18 +1260,31 @@ void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context, { struct mlx5_ib_dev *dev = context; u8 event_subtype = pfault->event_subtype; + struct mlx5_pagefault *pfault_copy; + struct mlx5_pagefault *pfault_arg; + bool async_resume; + + pfault_copy = kmalloc(sizeof(*pfault_copy), GFP_KERNEL); + if (pfault_copy) { + memcpy(pfault_copy, pfault, sizeof(*pfault_copy)); + async_resume = true; + pfault_arg = pfault_copy; + } else { + async_resume = false; + pfault_arg = pfault; + } switch (event_subtype) { case MLX5_PFAULT_SUBTYPE_WQE: - mlx5_ib_mr_wqe_pfault_handler(dev, pfault); + mlx5_ib_mr_wqe_pfault_handler(dev, pfault_arg, async_resume); break; case MLX5_PFAULT_SUBTYPE_RDMA: - mlx5_ib_mr_rdma_pfault_handler(dev, pfault); + mlx5_ib_mr_rdma_pfault_handler(dev, pfault_arg, async_resume); break; default: mlx5_ib_err(dev, "Invalid page fault event subtype: 0x%x\n", event_subtype); - mlx5_ib_page_fault_resume(dev, pfault, 1); + mlx5_ib_page_fault_resume(dev, NULL, pfault, 1, false); } } diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index aa5963b5d38e..85275612a473 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -738,6 +738,9 @@ enum mlx5_pagefault_type_flags { /* Contains the details of a pagefault. */ struct mlx5_pagefault { + u32 out_pf_resume[MLX5_ST_SZ_DW(page_fault_resume_out)]; + u32 in_pf_resume[MLX5_ST_SZ_DW(page_fault_resume_in)]; + u32 bytes_committed; u32 token; u8 event_subtype;