Message ID | 165603880411.551046.9204694225111844300.stgit@dwillia2-xfh |
---|---|
State | Superseded |
Headers | show |
Series | CXL PMEM Region Provisioning | expand |
On Thu, 23 Jun 2022 19:46:44 -0700 Dan Williams <dan.j.williams@intel.com> wrote: > In preparation for provisioining CXL regions, add accounting for the DPA > space consumed by existing regions / decoders. Recall, a CXL region is a > memory range comrpised from one or more endpoint devices contributing a > mapping of their DPA into HPA space through a decoder. > > Record the DPA ranges covered by committed decoders at initial probe of > endpoint ports relative to a per-device resource tree of the DPA type > (pmem or volaltile-ram). > > The cxl_dpa_rwsem semaphore is introduced to globally synchronize DPA > state across all endpoints and their decoders at once. The vast majority > of DPA operations are reads as region creation is expected to be as rare > as disk partitioning and volume creation. The device_lock() for this > synchronization is specifically avoided for concern of entangling with > sysfs attribute removal. > > Co-developed-by: Ben Widawsky <bwidawsk@kernel.org> > Signed-off-by: Ben Widawsky <bwidawsk@kernel.org> > Signed-off-by: Dan Williams <dan.j.williams@intel.com> > --- > drivers/cxl/core/hdm.c | 148 ++++++++++++++++++++++++++++++++++++++++++++---- > drivers/cxl/cxl.h | 2 + > drivers/cxl/cxlmem.h | 13 ++++ > 3 files changed, 152 insertions(+), 11 deletions(-) > > diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c > index c940a4911fee..daae6e533146 100644 > --- a/drivers/cxl/core/hdm.c > +++ b/drivers/cxl/core/hdm.c > @@ -7,6 +7,8 @@ > #include "cxlmem.h" > #include "core.h" > > +static DECLARE_RWSEM(cxl_dpa_rwsem); I've not checked many files, but pci.c has equivalent static defines after the DOC: entry so for consistency move this below that? > + > /** > * DOC: cxl core hdm > * > @@ -128,10 +130,108 @@ struct cxl_hdm *devm_cxl_setup_hdm(struct cxl_port *port) > } > EXPORT_SYMBOL_NS_GPL(devm_cxl_setup_hdm, CXL); > > +/* > + * Must be called in a context that synchronizes against this decoder's > + * port ->remove() callback (like an endpoint decoder sysfs attribute) > + */ > +static void cxl_dpa_release(void *cxled); > +static void __cxl_dpa_release(struct cxl_endpoint_decoder *cxled, bool remove_action) > +{ > + struct cxl_port *port = cxled_to_port(cxled); > + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); > + struct cxl_dev_state *cxlds = cxlmd->cxlds; > + struct resource *res = cxled->dpa_res; > + > + lockdep_assert_held_write(&cxl_dpa_rwsem); > + > + if (remove_action) > + devm_remove_action(&port->dev, cxl_dpa_release, cxled); This code organization is more surprising than I'd like. Why not move this to a wrapper that is like devm_kfree() and similar which do the free now and remove from the devm list? static void __cxl_dpa_release(struct cxl_endpoint_decoder *cxled) { struct cxl_port *port = cxled_to_port(cxled); struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); struct cxl_dev_state *cxlds = cxlmd->cxlds; struct resource *res = cxled->dpa_res; if (cxled->skip) __release_region(&cxlds->dpa_res, res->start - cxled->skip, cxled->skip); cxled->skip = 0; __release_region(&cxlds->dpa_res, res->start, resource_size(res)); cxled->dpa_res = NULL; } /* possibly add some underscores to this name to indicate it's special in when you can safely call it */ static void devm_cxl_dpa_release(struct cxl_endpoint_decoder *cxled) { struct cxl_port *port = cxled_to_port(cxled); lockdep_assert_held_write(&cxl_dpa_rwsem); devm_remove_action(&port->dev, cxl_dpa_release, cxled); __cxl_dpa_release(cxled); } static void cxl_dpa_release(void *cxled) { down_write(&cxl_dpa_rwsem); __cxl_dpa_release(cxled, false); up_write(&cxl_dpa_rwsem); } > + > + if (cxled->skip) > + __release_region(&cxlds->dpa_res, res->start - cxled->skip, > + cxled->skip); > + cxled->skip = 0; > + __release_region(&cxlds->dpa_res, res->start, resource_size(res)); > + cxled->dpa_res = NULL; > +} > + > +static void cxl_dpa_release(void *cxled) > +{ > + down_write(&cxl_dpa_rwsem); > + __cxl_dpa_release(cxled, false); > + up_write(&cxl_dpa_rwsem); > +} > + > +static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled, > + resource_size_t base, resource_size_t len, > + resource_size_t skip) > +{ > + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); > + struct cxl_port *port = cxled_to_port(cxled); > + struct cxl_dev_state *cxlds = cxlmd->cxlds; > + struct device *dev = &port->dev; > + struct resource *res; > + > + lockdep_assert_held_write(&cxl_dpa_rwsem); > + > + if (!len) > + return 0; > + > + if (cxled->dpa_res) { > + dev_dbg(dev, "decoder%d.%d: existing allocation %pr assigned\n", > + port->id, cxled->cxld.id, cxled->dpa_res); > + return -EBUSY; > + } > + > + if (skip) { > + res = __request_region(&cxlds->dpa_res, base - skip, skip, > + dev_name(dev), 0); Interface that uses a backwards definition of skip as what to skip before the base parameter is a little odd can we rename base parameter to something like 'current_top' then have base = current_top + skip? current_top naming not great though... > + if (!res) { > + dev_dbg(dev, > + "decoder%d.%d: failed to reserve skip space\n", > + port->id, cxled->cxld.id); > + return -EBUSY; > + } > + } > + res = __request_region(&cxlds->dpa_res, base, len, dev_name(dev), 0); > + if (!res) { > + dev_dbg(dev, "decoder%d.%d: failed to reserve allocation\n", > + port->id, cxled->cxld.id); > + if (skip) > + __release_region(&cxlds->dpa_res, base - skip, skip); > + return -EBUSY; > + } > + cxled->dpa_res = res; > + cxled->skip = skip; > + > + return 0; > +} > + ...
Jonathan Cameron wrote: > On Thu, 23 Jun 2022 19:46:44 -0700 > Dan Williams <dan.j.williams@intel.com> wrote: > > > In preparation for provisioining CXL regions, add accounting for the DPA > > space consumed by existing regions / decoders. Recall, a CXL region is a > > memory range comrpised from one or more endpoint devices contributing a > > mapping of their DPA into HPA space through a decoder. > > > > Record the DPA ranges covered by committed decoders at initial probe of > > endpoint ports relative to a per-device resource tree of the DPA type > > (pmem or volaltile-ram). > > > > The cxl_dpa_rwsem semaphore is introduced to globally synchronize DPA > > state across all endpoints and their decoders at once. The vast majority > > of DPA operations are reads as region creation is expected to be as rare > > as disk partitioning and volume creation. The device_lock() for this > > synchronization is specifically avoided for concern of entangling with > > sysfs attribute removal. > > > > Co-developed-by: Ben Widawsky <bwidawsk@kernel.org> > > Signed-off-by: Ben Widawsky <bwidawsk@kernel.org> > > Signed-off-by: Dan Williams <dan.j.williams@intel.com> > > --- > > drivers/cxl/core/hdm.c | 148 ++++++++++++++++++++++++++++++++++++++++++++---- > > drivers/cxl/cxl.h | 2 + > > drivers/cxl/cxlmem.h | 13 ++++ > > 3 files changed, 152 insertions(+), 11 deletions(-) > > > > diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c > > index c940a4911fee..daae6e533146 100644 > > --- a/drivers/cxl/core/hdm.c > > +++ b/drivers/cxl/core/hdm.c > > @@ -7,6 +7,8 @@ > > #include "cxlmem.h" > > #include "core.h" > > > > +static DECLARE_RWSEM(cxl_dpa_rwsem); > > I've not checked many files, but pci.c has equivalent static defines after > the DOC: entry so for consistency move this below that? ok. > > > > + > > /** > > * DOC: cxl core hdm > > * > > @@ -128,10 +130,108 @@ struct cxl_hdm *devm_cxl_setup_hdm(struct cxl_port *port) > > } > > EXPORT_SYMBOL_NS_GPL(devm_cxl_setup_hdm, CXL); > > > > +/* > > + * Must be called in a context that synchronizes against this decoder's > > + * port ->remove() callback (like an endpoint decoder sysfs attribute) > > + */ > > +static void cxl_dpa_release(void *cxled); > > +static void __cxl_dpa_release(struct cxl_endpoint_decoder *cxled, bool remove_action) > > +{ > > + struct cxl_port *port = cxled_to_port(cxled); > > + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); > > + struct cxl_dev_state *cxlds = cxlmd->cxlds; > > + struct resource *res = cxled->dpa_res; > > + > > + lockdep_assert_held_write(&cxl_dpa_rwsem); > > + > > + if (remove_action) > > + devm_remove_action(&port->dev, cxl_dpa_release, cxled); > > This code organization is more surprising than I'd like. Why not move this to > a wrapper that is like devm_kfree() and similar which do the free now and > remove from the devm list? True. I see how this got here incrementally, but this end state can definitely now be fixed up to be more devm idiomatic. > > static void __cxl_dpa_release(struct cxl_endpoint_decoder *cxled) > { > struct cxl_port *port = cxled_to_port(cxled); > struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); > struct cxl_dev_state *cxlds = cxlmd->cxlds; > struct resource *res = cxled->dpa_res; > > if (cxled->skip) > __release_region(&cxlds->dpa_res, res->start - cxled->skip, > cxled->skip); > cxled->skip = 0; > __release_region(&cxlds->dpa_res, res->start, resource_size(res)); > cxled->dpa_res = NULL; > } > > /* possibly add some underscores to this name to indicate it's special > in when you can safely call it */ > static void devm_cxl_dpa_release(struct cxl_endpoint_decoder *cxled) > { > struct cxl_port *port = cxled_to_port(cxled); > lockdep_assert_held_write(&cxl_dpa_rwsem); > devm_remove_action(&port->dev, cxl_dpa_release, cxled); > __cxl_dpa_release(cxled); > } > > static void cxl_dpa_release(void *cxled) > { > down_write(&cxl_dpa_rwsem); > __cxl_dpa_release(cxled, false); > up_write(&cxl_dpa_rwsem); > } > > > + > > + if (cxled->skip) > > + __release_region(&cxlds->dpa_res, res->start - cxled->skip, > > + cxled->skip); > > + cxled->skip = 0; > > + __release_region(&cxlds->dpa_res, res->start, resource_size(res)); > > + cxled->dpa_res = NULL; > > +} > > + > > +static void cxl_dpa_release(void *cxled) > > +{ > > + down_write(&cxl_dpa_rwsem); > > + __cxl_dpa_release(cxled, false); > > + up_write(&cxl_dpa_rwsem); > > +} > > + > > +static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled, > > + resource_size_t base, resource_size_t len, > > + resource_size_t skip) > > +{ > > + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); > > + struct cxl_port *port = cxled_to_port(cxled); > > + struct cxl_dev_state *cxlds = cxlmd->cxlds; > > + struct device *dev = &port->dev; > > + struct resource *res; > > + > > + lockdep_assert_held_write(&cxl_dpa_rwsem); > > + > > + if (!len) > > + return 0; > > + > > + if (cxled->dpa_res) { > > + dev_dbg(dev, "decoder%d.%d: existing allocation %pr assigned\n", > > + port->id, cxled->cxld.id, cxled->dpa_res); > > + return -EBUSY; > > + } > > + > > + if (skip) { > > + res = __request_region(&cxlds->dpa_res, base - skip, skip, > > + dev_name(dev), 0); > > > Interface that uses a backwards definition of skip as what to skip before > the base parameter is a little odd can we rename base parameter to something > like 'current_top' then have base = current_top + skip? current_top naming > not great though... How about just name it "skipped" instead of "skip"? As the parameter is how many bytes were skipped to allow a new allocation to start at base.
... > > > +static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled, > > > + resource_size_t base, resource_size_t len, > > > + resource_size_t skip) > > > +{ > > > + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); > > > + struct cxl_port *port = cxled_to_port(cxled); > > > + struct cxl_dev_state *cxlds = cxlmd->cxlds; > > > + struct device *dev = &port->dev; > > > + struct resource *res; > > > + > > > + lockdep_assert_held_write(&cxl_dpa_rwsem); > > > + > > > + if (!len) > > > + return 0; > > > + > > > + if (cxled->dpa_res) { > > > + dev_dbg(dev, "decoder%d.%d: existing allocation %pr assigned\n", > > > + port->id, cxled->cxld.id, cxled->dpa_res); > > > + return -EBUSY; > > > + } > > > + > > > + if (skip) { > > > + res = __request_region(&cxlds->dpa_res, base - skip, skip, > > > + dev_name(dev), 0); > > > > > > Interface that uses a backwards definition of skip as what to skip before > > the base parameter is a little odd can we rename base parameter to something > > like 'current_top' then have base = current_top + skip? current_top naming > > not great though... > > How about just name it "skipped" instead of "skip"? As the parameter is > how many bytes were skipped to allow a new allocation to start at base. Works for me (guessing you long since went with this given how far behind I am!) Thanks, Jonathan
diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c index c940a4911fee..daae6e533146 100644 --- a/drivers/cxl/core/hdm.c +++ b/drivers/cxl/core/hdm.c @@ -7,6 +7,8 @@ #include "cxlmem.h" #include "core.h" +static DECLARE_RWSEM(cxl_dpa_rwsem); + /** * DOC: cxl core hdm * @@ -128,10 +130,108 @@ struct cxl_hdm *devm_cxl_setup_hdm(struct cxl_port *port) } EXPORT_SYMBOL_NS_GPL(devm_cxl_setup_hdm, CXL); +/* + * Must be called in a context that synchronizes against this decoder's + * port ->remove() callback (like an endpoint decoder sysfs attribute) + */ +static void cxl_dpa_release(void *cxled); +static void __cxl_dpa_release(struct cxl_endpoint_decoder *cxled, bool remove_action) +{ + struct cxl_port *port = cxled_to_port(cxled); + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct resource *res = cxled->dpa_res; + + lockdep_assert_held_write(&cxl_dpa_rwsem); + + if (remove_action) + devm_remove_action(&port->dev, cxl_dpa_release, cxled); + + if (cxled->skip) + __release_region(&cxlds->dpa_res, res->start - cxled->skip, + cxled->skip); + cxled->skip = 0; + __release_region(&cxlds->dpa_res, res->start, resource_size(res)); + cxled->dpa_res = NULL; +} + +static void cxl_dpa_release(void *cxled) +{ + down_write(&cxl_dpa_rwsem); + __cxl_dpa_release(cxled, false); + up_write(&cxl_dpa_rwsem); +} + +static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled, + resource_size_t base, resource_size_t len, + resource_size_t skip) +{ + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_port *port = cxled_to_port(cxled); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct device *dev = &port->dev; + struct resource *res; + + lockdep_assert_held_write(&cxl_dpa_rwsem); + + if (!len) + return 0; + + if (cxled->dpa_res) { + dev_dbg(dev, "decoder%d.%d: existing allocation %pr assigned\n", + port->id, cxled->cxld.id, cxled->dpa_res); + return -EBUSY; + } + + if (skip) { + res = __request_region(&cxlds->dpa_res, base - skip, skip, + dev_name(dev), 0); + if (!res) { + dev_dbg(dev, + "decoder%d.%d: failed to reserve skip space\n", + port->id, cxled->cxld.id); + return -EBUSY; + } + } + res = __request_region(&cxlds->dpa_res, base, len, dev_name(dev), 0); + if (!res) { + dev_dbg(dev, "decoder%d.%d: failed to reserve allocation\n", + port->id, cxled->cxld.id); + if (skip) + __release_region(&cxlds->dpa_res, base - skip, skip); + return -EBUSY; + } + cxled->dpa_res = res; + cxled->skip = skip; + + return 0; +} + +static int cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled, + resource_size_t base, resource_size_t len, + resource_size_t skip) +{ + struct cxl_port *port = cxled_to_port(cxled); + int rc; + + down_write(&cxl_dpa_rwsem); + rc = __cxl_dpa_reserve(cxled, base, len, skip); + up_write(&cxl_dpa_rwsem); + + if (rc) + return rc; + + return devm_add_action_or_reset(&port->dev, cxl_dpa_release, cxled); +} + static int init_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld, - int *target_map, void __iomem *hdm, int which) + int *target_map, void __iomem *hdm, int which, + u64 *dpa_base) { - u64 size, base; + struct cxl_endpoint_decoder *cxled = NULL; + u64 size, base, skip, dpa_size; + bool committed; + u32 remainder; int i, rc; u32 ctrl; union { @@ -139,11 +239,15 @@ static int init_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld, unsigned char target_id[8]; } target_list; + if (is_endpoint_decoder(&cxld->dev)) + cxled = to_cxl_endpoint_decoder(&cxld->dev); + ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(which)); base = ioread64_hi_lo(hdm + CXL_HDM_DECODER0_BASE_LOW_OFFSET(which)); size = ioread64_hi_lo(hdm + CXL_HDM_DECODER0_SIZE_LOW_OFFSET(which)); + committed = !!(ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED); - if (!(ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED)) + if (!committed) size = 0; if (base == U64_MAX || size == U64_MAX) { dev_warn(&port->dev, "decoder%d.%d: Invalid resource range\n", @@ -156,8 +260,8 @@ static int init_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld, .end = base + size - 1, }; - /* switch decoders are always enabled if committed */ - if (ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED) { + /* decoders are enabled if committed */ + if (committed) { cxld->flags |= CXL_DECODER_F_ENABLE; if (ctrl & CXL_HDM_DECODER0_CTRL_LOCK) cxld->flags |= CXL_DECODER_F_LOCK; @@ -180,14 +284,35 @@ static int init_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld, else cxld->target_type = CXL_DECODER_ACCELERATOR; - if (is_endpoint_decoder(&cxld->dev)) + if (!cxled) { + target_list.value = + ioread64_hi_lo(hdm + CXL_HDM_DECODER0_TL_LOW(which)); + for (i = 0; i < cxld->interleave_ways; i++) + target_map[i] = target_list.target_id[i]; + return 0; + } - target_list.value = - ioread64_hi_lo(hdm + CXL_HDM_DECODER0_TL_LOW(which)); - for (i = 0; i < cxld->interleave_ways; i++) - target_map[i] = target_list.target_id[i]; + if (!committed) + return 0; + dpa_size = div_u64_rem(size, cxld->interleave_ways, &remainder); + if (remainder) { + dev_err(&port->dev, + "decoder%d.%d: invalid committed configuration size: %#llx ways: %d\n", + port->id, cxld->id, size, cxld->interleave_ways); + return -ENXIO; + } + skip = ioread64_hi_lo(hdm + CXL_HDM_DECODER0_SKIP_LOW(which)); + rc = cxl_dpa_reserve(cxled, *dpa_base + skip, dpa_size, skip); + if (rc) { + dev_err(&port->dev, + "decoder%d.%d: Failed to reserve DPA range %#llx - %#llx\n (%d)", + port->id, cxld->id, *dpa_base, + *dpa_base + dpa_size + skip - 1, rc); + return rc; + } + *dpa_base += dpa_size + skip; return 0; } @@ -200,6 +325,7 @@ int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm) void __iomem *hdm = cxlhdm->regs.hdm_decoder; struct cxl_port *port = cxlhdm->port; int i, committed; + u64 dpa_base = 0; u32 ctrl; /* @@ -247,7 +373,7 @@ int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm) return PTR_ERR(cxld); } - rc = init_hdm_decoder(port, cxld, target_map, hdm, i); + rc = init_hdm_decoder(port, cxld, target_map, hdm, i, &dpa_base); if (rc) { put_device(&cxld->dev); return rc; diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 579f2d802396..6832d6d70548 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -56,6 +56,8 @@ #define CXL_HDM_DECODER0_CTRL_TYPE BIT(12) #define CXL_HDM_DECODER0_TL_LOW(i) (0x20 * (i) + 0x24) #define CXL_HDM_DECODER0_TL_HIGH(i) (0x20 * (i) + 0x28) +#define CXL_HDM_DECODER0_SKIP_LOW(i) CXL_HDM_DECODER0_TL_LOW(i) +#define CXL_HDM_DECODER0_SKIP_HIGH(i) CXL_HDM_DECODER0_TL_HIGH(i) static inline int cxl_hdm_decoder_count(u32 cap_hdr) { diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index a9609d40643f..b4e5ed9eabc9 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -50,6 +50,19 @@ static inline struct cxl_memdev *to_cxl_memdev(struct device *dev) return container_of(dev, struct cxl_memdev, dev); } +static inline struct cxl_port *cxled_to_port(struct cxl_endpoint_decoder *cxled) +{ + return to_cxl_port(cxled->cxld.dev.parent); +} + +static inline struct cxl_memdev * +cxled_to_memdev(struct cxl_endpoint_decoder *cxled) +{ + struct cxl_port *port = to_cxl_port(cxled->cxld.dev.parent); + + return to_cxl_memdev(port->uport); +} + bool is_cxl_memdev(struct device *dev); static inline bool is_cxl_endpoint(struct cxl_port *port) {