
[v2] cxl: Calculate region bandwidth of targets with shared upstream link

Message ID 20240411181641.514075-1-dave.jiang@intel.com
State New
Series [v2] cxl: Calculate region bandwidth of targets with shared upstream link

Commit Message

Dave Jiang April 11, 2024, 6:16 p.m. UTC
For a topology where multiple targets share the same switch uplink, the
bandwidth must be divided amongst all the sharing targets.
cxl_rr->num_targets keeps track of the number of targets sharing the same
upstream port. The current implementation accumulates targets during
cxl_region_attach() and does not have the final number of targets until
the last target is attached. If the upstream link is shared, accumulate
bandwidth only up to the switch upstream bandwidth.

Suggested-by: Jonathan Cameron <Jonathan.Cameron@Huawei.com>
Link: https://lore.kernel.org/linux-cxl/20240405143242.0000363a@Huawei.com/
Signed-off-by: Dave Jiang <dave.jiang@intel.com>

v2:
- cxl_region_targets() -> cxl_port_shared_region_targets(). (Dan)
- Use different method to calculate bandwidth. (Dan)
---
 drivers/cxl/core/cdat.c   | 48 +++++++++++++++++++++++++++++++++++++--
 drivers/cxl/core/core.h   |  3 +++
 drivers/cxl/core/pci.c    | 35 ++++++++++++++++++++++++++++
 drivers/cxl/core/region.c | 10 ++++++++
 4 files changed, 94 insertions(+), 2 deletions(-)
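
A standalone sketch of the accumulate-and-clamp behaviour described above,
with hypothetical numbers (a 16000 MB/s shared switch upstream link and four
8000 MB/s endpoints); it mirrors the shape of the patch's
calculate_region_bw() but is an illustration, not code from the patch:

#include <stdio.h>

static unsigned int clamp_region_bw(int targets, unsigned int usp_bw,
				    unsigned int ep_bw, unsigned int region_bw)
{
	/* First target seen: no sharing known yet, just accumulate. */
	if (targets == 1)
		return region_bw + ep_bw;

	/* Shared upstream link: keep accumulating, but clamp to the link. */
	return usp_bw < region_bw + ep_bw ? usp_bw : region_bw + ep_bw;
}

int main(void)
{
	unsigned int region_bw = 0;

	for (int targets = 1; targets <= 4; targets++) {
		region_bw = clamp_region_bw(targets, 16000, 8000, region_bw);
		printf("after target %d: %u MB/s\n", targets, region_bw);
	}
	/* Prints 8000, 16000, 16000, 16000: clamped at the shared link. */
	return 0;
}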

Comments

Ira Weiny April 23, 2024, 11:37 p.m. UTC | #1
Dave Jiang wrote:

[snip]

> +/*
> + * Calculate the bandwidth for the cxl region based on the number of targets
> + * that share an upstream switch. The function is called while targets are
> + * being attached for a region. If the number of targets is 1, then the
> + * target either does not have an upstream switch or it is the first target
> + * of the shared link. In this case, the bandwidth is the sum of the target
> + * bandwidth and the collected region bandwidth. If the number of targets
> + * from cxl_rr is greater than 1, then the bandwidth is the minimum of the
> + * switch upstream port bandwidth and the region plus target bandwidth.
> + */
> +static unsigned int calculate_region_bw(int targets, unsigned int usp_bw,
> +					unsigned int ep_bw,
> +					unsigned int region_bw)
> +{
> +	if (targets == 1)
> +		return region_bw + ep_bw;
> +
> +	return min_t(unsigned int, usp_bw, region_bw + ep_bw);
> +}
> +
>  void cxl_region_perf_data_calculate(struct cxl_region *cxlr,
>  				    struct cxl_endpoint_decoder *cxled)
>  {
> @@ -551,7 +581,9 @@ void cxl_region_perf_data_calculate(struct cxl_region *cxlr,
>  			.start = cxled->dpa_res->start,
>  			.end = cxled->dpa_res->end,
>  	};
> +	struct cxl_port *port = cxlmd->endpoint;
>  	struct cxl_dpa_perf *perf;
> +	int usp_bw, targets;
>  
>  	switch (cxlr->mode) {
>  	case CXL_DECODER_RAM:
> @@ -569,6 +601,12 @@ void cxl_region_perf_data_calculate(struct cxl_region *cxlr,
>  	if (!range_contains(&perf->dpa_range, &dpa))
>  		return;
>  
> +	usp_bw = cxl_get_switch_uport_bandwidth(port->uport_dev);
> +	if (usp_bw > 0)
> +		targets = cxl_port_shared_region_targets(port, cxlr);

I'm not quite following how this handles a x4 situation with 2 upstream
ports and 2 devices under each of those switches.

     +------+            +------+
     | HB 0 |            | HB 1 |
     +------+            +------+
        | (link 0)          | (link 1)
     +------+            +------+
     | SW 0 |            | SW 1 |
     +------+            +------+
      /    \              /     \
 +------+ +------+   +------+ +------+
 | EP 0 | | EP 1 |   | EP 2 | | EP 3 |
 +------+ +------+   +------+ +------+

In this case the region BW should be:

	min ( sum(link0, link1), sum(EP 0-3) )

How is the sum of EP 0-1 limited to the link0 BW, and EP 2-3 to link1?
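
A sketch of the per-link clamping being asked about here: group endpoints
by their upstream link, clamp each group's sum to its link bandwidth, then
sum the clamped groups (hypothetical arrays, not an existing kernel API):

#define NR_LINKS 2
#define NR_EPS   4

static unsigned int region_bw_grouped(const unsigned int ep_bw[NR_EPS],
				      const int link_of[NR_EPS],
				      const unsigned int link_bw[NR_LINKS])
{
	unsigned int group_sum[NR_LINKS] = { 0 };
	unsigned int total = 0;

	/* Sum endpoint bandwidth per upstream link. */
	for (int i = 0; i < NR_EPS; i++)
		group_sum[link_of[i]] += ep_bw[i];

	/* Clamp each group to its link, so EP 0-1 cannot exceed link0. */
	for (int l = 0; l < NR_LINKS; l++)
		total += group_sum[l] < link_bw[l] ? group_sum[l] : link_bw[l];

	return total;
}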

> +	else
> +		targets = 1;

Maybe add a comment to indicate that targets == 1 means the usp link
speed could not be read, so we default back to the sum of the endpoints.
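
Something like this, perhaps (suggested wording only, not from the patch):

	/*
	 * No usable switch upstream bandwidth (no switch in the path, or
	 * the link speed could not be read): fall back to summing endpoint
	 * bandwidth without a clamp, i.e. treat it as a single target.
	 */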

> +
>  	for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) {
>  		/* Get total bandwidth and the worst latency for the cxl region */
>  		cxlr->coord[i].read_latency = max_t(unsigned int,
> @@ -577,8 +615,14 @@ void cxl_region_perf_data_calculate(struct cxl_region *cxlr,
>  		cxlr->coord[i].write_latency = max_t(unsigned int,
>  						     cxlr->coord[i].write_latency,
>  						     perf->coord[i].write_latency);
> -		cxlr->coord[i].read_bandwidth += perf->coord[i].read_bandwidth;
> -		cxlr->coord[i].write_bandwidth += perf->coord[i].write_bandwidth;
> +		cxlr->coord[i].read_bandwidth =
> +			calculate_region_bw(targets, usp_bw,
> +					    perf->coord[i].read_bandwidth,
> +					    cxlr->coord[i].read_bandwidth);
> +		cxlr->coord[i].write_bandwidth =
> +			calculate_region_bw(targets, usp_bw,
> +					    perf->coord[i].write_bandwidth,
> +					    cxlr->coord[i].write_bandwidth);
>  	}
>  }
>  

[snip]

> +
> +int cxl_pci_get_switch_usp_bandwidth(struct pci_dev *pdev)
> +{
> +	struct device *dev = &pdev->dev;
> +	struct pci_dev *iter = pdev;
> +
> +	do {
> +		if (pci_pcie_type(iter) == PCI_EXP_TYPE_UPSTREAM)
> +			break;

Why is it not ok to do:

	while (pci_pcie_type(iter) != PCI_EXP_TYPE_UPSTREAM) {
		...
	}

Ira

[snip]
Jonathan Cameron May 1, 2024, 1:46 p.m. UTC | #2
On Thu, 11 Apr 2024 11:16:41 -0700
Dave Jiang <dave.jiang@intel.com> wrote:

> For a topology where multiple targets share the same switch uplink, the
> bandwidth must be divided amongst all the sharing targets.
> cxl_rr->num_targets keeps track of the number of targets sharing the same
> upstream port. The current implementation accumulates targets during
> cxl_region_attach() and does not have the final number of targets until
> the last target is attached. If the upstream link is shared, accumulate
> bandwidth only up to the switch upstream bandwidth.
> 
> Suggested-by: Jonathan Cameron <Jonathan.Cameron@Huawei.com>
> Link: https://lore.kernel.org/linux-cxl/20240405143242.0000363a@Huawei.com/
> Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Version info goes below the ---, though it's your problem to fix that
up when applying the commits anyway :)

Some passing code comments inline, but as with Ira I can't see how this
works for anything involving more than one switch.

> 
> v2:
> - cxl_region_targets() -> cxl_port_shared_region_targets(). (Dan)
> - Use different method to calculate bandwidth. (Dan)
> ---
>  drivers/cxl/core/cdat.c   | 48 +++++++++++++++++++++++++++++++++++++--
>  drivers/cxl/core/core.h   |  3 +++
>  drivers/cxl/core/pci.c    | 35 ++++++++++++++++++++++++++++
>  drivers/cxl/core/region.c | 10 ++++++++
>  4 files changed, 94 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c
> index 4b717d2f5a9d..09f3e62e19a5 100644
> --- a/drivers/cxl/core/cdat.c
> +++ b/drivers/cxl/core/cdat.c
> @@ -541,6 +541,36 @@ void cxl_coordinates_combine(struct access_coordinate *out,
>  
>  MODULE_IMPORT_NS(CXL);
>  
> +static int cxl_get_switch_uport_bandwidth(struct device *uport_dev)
> +{
> +	struct device *dev = uport_dev->parent;
> +
> +	if (!dev_is_pci(dev))
> +		return -ENODEV;
> +
> +	return cxl_pci_get_switch_usp_bandwidth(to_pci_dev(dev));
> +}
> +
> +/*
> + * Calculate the bandwidth for the cxl region based on the number of targets
> + * that share an upstream switch. The function is called while targets are
> + * being attached for a region. If the number of targets is 1, then the
> + * target either does not have an upstream switch or it is the first target
> + * of the shared link. In this case, the bandwidth is the sum of the target
> + * bandwidth and the collected region bandwidth. If the number of targets
> + * from cxl_rr is greater than 1, then the bandwidth is the minimum of the
> + * switch upstream port bandwidth and the region plus target bandwidth.
> + */
> +static unsigned int calculate_region_bw(int targets, unsigned int usp_bw,
> +					unsigned int ep_bw,
> +					unsigned int region_bw)
> +{
> +	if (targets == 1)
> +		return region_bw + ep_bw;
> +
> +	return min_t(unsigned int, usp_bw, region_bw + ep_bw);
Similar to what Ira said, I can't see how this works for more complex
situations (i.e. pretty much anything involving more than one switch in
parallel).

> +}
> +
>  void cxl_region_perf_data_calculate(struct cxl_region *cxlr,
>  				    struct cxl_endpoint_decoder *cxled)
>  {
> @@ -551,7 +581,9 @@ void cxl_region_perf_data_calculate(struct cxl_region *cxlr,
>  			.start = cxled->dpa_res->start,
>  			.end = cxled->dpa_res->end,
>  	};
> +	struct cxl_port *port = cxlmd->endpoint;
>  	struct cxl_dpa_perf *perf;
> +	int usp_bw, targets;
>  
>  	switch (cxlr->mode) {
>  	case CXL_DECODER_RAM:
> @@ -569,6 +601,12 @@ void cxl_region_perf_data_calculate(struct cxl_region *cxlr,
>  	if (!range_contains(&perf->dpa_range, &dpa))
>  		return;
>  
> +	usp_bw = cxl_get_switch_uport_bandwidth(port->uport_dev);
> +	if (usp_bw > 0)
> +		targets = cxl_port_shared_region_targets(port, cxlr);
> +	else

A comment on why would be helpful. I think it's because there wasn't
a switch upstream port in the path?

> +		targets = 1;
> +
>  	for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) {
>  		/* Get total bandwidth and the worst latency for the cxl region */
>  		cxlr->coord[i].read_latency = max_t(unsigned int,
> @@ -577,8 +615,14 @@ void cxl_region_perf_data_calculate(struct cxl_region *cxlr,
>  		cxlr->coord[i].write_latency = max_t(unsigned int,
>  						     cxlr->coord[i].write_latency,
>  						     perf->coord[i].write_latency);
> -		cxlr->coord[i].read_bandwidth += perf->coord[i].read_bandwidth;
> -		cxlr->coord[i].write_bandwidth += perf->coord[i].write_bandwidth;
> +		cxlr->coord[i].read_bandwidth =
> +			calculate_region_bw(targets, usp_bw,
> +					    perf->coord[i].read_bandwidth,
> +					    cxlr->coord[i].read_bandwidth);
> +		cxlr->coord[i].write_bandwidth =
> +			calculate_region_bw(targets, usp_bw,
> +					    perf->coord[i].write_bandwidth,
> +					    cxlr->coord[i].write_bandwidth);
>  	}
>  }
>  
> diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
> index bc5a95665aa0..99c1c24df671 100644
> --- a/drivers/cxl/core/core.h
> +++ b/drivers/cxl/core/core.h
> @@ -89,9 +89,12 @@ enum cxl_poison_trace_type {
>  };
>  
>  long cxl_pci_get_latency(struct pci_dev *pdev);
> +int cxl_pci_get_switch_usp_bandwidth(struct pci_dev *pdev);
>  
>  int cxl_update_hmat_access_coordinates(int nid, struct cxl_region *cxlr,
>  				       enum access_coordinate_class access);
>  bool cxl_need_node_perf_attrs_update(int nid);
>  
> +int cxl_port_shared_region_targets(struct cxl_port *port, struct cxl_region *cxlr);
> +
>  #endif /* __CXL_CORE_H__ */
> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
> index 0df09bd79408..9281ed5a073d 100644
> --- a/drivers/cxl/core/pci.c
> +++ b/drivers/cxl/core/pci.c
> @@ -1045,3 +1045,38 @@ long cxl_pci_get_latency(struct pci_dev *pdev)
>  
>  	return cxl_flit_size(pdev) * MEGA / bw;
>  }
> +
> +static int cxl_pci_get_bandwidth(struct pci_dev *pdev)
> +{
> +	u16 lnksta;
> +	u32 width;
> +	int speed;
> +
> +	speed = pcie_link_speed_mbps(pdev);
> +	if (speed < 0)
> +		return 0;
> +	speed /= BITS_PER_BYTE;
> +
> +	pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &lnksta);
> +	width = FIELD_GET(PCI_EXP_LNKSTA_NLW, lnksta);
> +
> +	return speed * width;
> +}
> +

I'd add some comments. Otherwise the expectation would be that the
pdev passed in is the usp.
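
For example, something along these lines (suggested wording only, not
from the patch):

	/*
	 * cxl_pci_get_switch_usp_bandwidth() - @pdev may be any function in
	 * the PCIe hierarchy, not necessarily the USP itself. Walk up from
	 * @pdev to the switch upstream port and return that link's bandwidth
	 * in MB/s, or -ENODEV if no PCI upstream port is found.
	 */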

> +int cxl_pci_get_switch_usp_bandwidth(struct pci_dev *pdev)
> +{
> +	struct device *dev = &pdev->dev;

You set this then change it before it's used.

> +	struct pci_dev *iter = pdev;
> +
	while (pci_pcie_type(iter) != PCI_EXP_TYPE_UPSTREAM) {
		dev = iter->dev.parent;
		if (!dev || !dev_is_pci(dev))
			return -ENODEV;
		iter = to_pci_dev(dev);
	}

	return cxl_pci_get_bandwidth(iter);

> +	do {
> +		if (pci_pcie_type(iter) == PCI_EXP_TYPE_UPSTREAM)
> +			break;
> +
> +		dev = iter->dev.parent;
> +		if (!dev || !dev_is_pci(dev))
> +			return -ENODEV;
> +		iter = to_pci_dev(dev);
> +	} while (1);
> +
> +	return cxl_pci_get_bandwidth(iter);
> +}
> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> index 5c186e0a39b9..5a1bca31d269 100644
> --- a/drivers/cxl/core/region.c
> +++ b/drivers/cxl/core/region.c
> @@ -222,6 +222,16 @@ static struct cxl_region_ref *cxl_rr_load(struct cxl_port *port,
>  	return xa_load(&port->regions, (unsigned long)cxlr);
>  }
>  
> +int cxl_port_shared_region_targets(struct cxl_port *port, struct cxl_region *cxlr)
> +{
> +	struct cxl_region_ref *cxl_rr = cxl_rr_load(port, cxlr);
> +
> +	if (!cxl_rr)
> +		return 0;
> +
> +	return cxl_rr->nr_targets;
> +}
> +
>  static int cxl_region_invalidate_memregion(struct cxl_region *cxlr)
>  {
>  	if (!cpu_cache_has_invalidate_memregion()) {
Jonathan Cameron May 1, 2024, 2:25 p.m. UTC | #3
On Tue, 23 Apr 2024 16:37:50 -0700
Ira Weiny <ira.weiny@intel.com> wrote:

> Dave Jiang wrote:
> 
> [snip]
> 
> > +/*
> > + * Calculate the bandwidth for the cxl region based on the number of targets
> > + * that share an upstream switch. The function is called while targets are
> > + * being attached for a region. If the number of targets is 1, then the
> > + * target either does not have an upstream switch or it is the first target
> > + * of the shared link. In this case, the bandwidth is the sum of the target
> > + * bandwidth and the collected region bandwidth. If the number of targets
> > + * from cxl_rr is greater than 1, then the bandwidth is the minimum of the
> > + * switch upstream port bandwidth and the region plus target bandwidth.
> > + */
> > +static unsigned int calculate_region_bw(int targets, unsigned int usp_bw,
> > +					unsigned int ep_bw,
> > +					unsigned int region_bw)
> > +{
> > +	if (targets == 1)
> > +		return region_bw + ep_bw;
> > +
> > +	return min_t(unsigned int, usp_bw, region_bw + ep_bw);
> > +}
> > +
> >  void cxl_region_perf_data_calculate(struct cxl_region *cxlr,
> >  				    struct cxl_endpoint_decoder *cxled)
> >  {
> > @@ -551,7 +581,9 @@ void cxl_region_perf_data_calculate(struct cxl_region *cxlr,
> >  			.start = cxled->dpa_res->start,
> >  			.end = cxled->dpa_res->end,
> >  	};
> > +	struct cxl_port *port = cxlmd->endpoint;
> >  	struct cxl_dpa_perf *perf;
> > +	int usp_bw, targets;
> >  
> >  	switch (cxlr->mode) {
> >  	case CXL_DECODER_RAM:
> > @@ -569,6 +601,12 @@ void cxl_region_perf_data_calculate(struct cxl_region *cxlr,
> >  	if (!range_contains(&perf->dpa_range, &dpa))
> >  		return;
> >  
> > +	usp_bw = cxl_get_switch_uport_bandwidth(port->uport_dev);
> > +	if (usp_bw > 0)
> > +		targets = cxl_port_shared_region_targets(port, cxlr);  
> 
> I'm not quite following how this handles a x4 situation with 2 upstream
> ports and 2 devices under each of those switches.
> 
>      +------+            +------+
>      | HB 0 |            | HB 1 |
>      +------+            +------+
>         | (link 0)          | (link 1)
>      +------+            +------+
>      | SW 0 |            | SW 1 |
>      +------+            +------+
>       /    \              /     \
>  +------+ +------+   +------+ +------+
>  | EP 0 | | EP 1 |   | EP 2 | | EP 3 |
>  +------+ +------+   +------+ +------+
> 
> In this case the region BW should be:
> 
> 	min ( sum(link0, link1), sum(EP 0-3) )
> 
> How is the sum of EP 0-1 limited to the link0 BW, and EP 2-3 to link1?

Agreed. It seems this fails if there are multiple RPs involved, whether
attached to a single host bridge or to multiple host bridges (and hence
multiple generic ports).

I'm also wondering now whether we are handling the difference between
shared Generic Ports and independent ones (and, for giggles, PXM nodes
with multiple GPs).

                 CFMWS 0
                    |
           _________|_________
          |                   |
    GP0/HB0/ACPI0017-0  GP1/HB1/ACPI0017-1
      |          |        |           |
     RP0        RP1      RP2         RP3
      |          |        |           |
    SW 0       SW 1     SW 2        SW 3
    |   |      |   |    |   |       |   |
   EP0 EP1    EP2 EP3  EP4  EP5    EP6 EP7

BW I think is min of (being loose with terms!):
* GP0 BW + GP1 BW  (I think GP stuff is expressing bw to the Root Complex)
* RP0 Link + RP1 Link + RP2 Link + RP3 Link
* SW0 SSLBIS BW Port0 + SW0 SSLBIS BW P1 + SW1 BW P0 + SW1 P1 + SW2 P0 + SW2 P1 + SW3 P0 + SW3 P1
* EP0 Link BW + EP1 Link BW + EP2 Link BW + EP3 Link BW + EP 4 Link BW + EP5 Link BW + EP6 Link BW + EP7 Link BW
* EP0 DSLBIS BW + EP1.... EP7 DSLBIS BW.

But it's worse than that, because in each branch the bottleneck might not
be at the same level. If we split it into a GP0 tree and a GP1 tree,
and in GP0 the limit is the SW0/1 BW while in GP1 it's EP4,5,6,7,
then we can't just sum things up like above.

I think we'd need to build up from the leaves with them all in place, clamping
as we go.  So something like
Min (CPU0 to GP0 BW, 
     Min(RP0 to SW 0 Link BW, 
         Min(SW0SSLBIS to P0, EP0 DSLBIS, EP0 Upstream Link) +
         Min(SW0SSLBIS to P1, EP1 DSLBIS, EP1 Upstream link)) +
     Min(RP1 to SW 1 Link BW, 
         Min(SW1SSLBIS to P0, EP2 DSLBIS, EP2 Upstream Link) +
         Min(SW1SSLBIS to P1, EP3 DSLBIS, EP3 Upstream link))) +
Min (CPU0 to GP1 BW,
     Min(RP2 to SW 2 Link BW, 
         Min(SW2SSLBIS to P0, EP4 DSLBIS, EP4 Upstream Link) +
         Min(SW2SSLBIS to P1, EP5 DSLBIS, EP5 Upstream link)) +
     Min(RP3 to SW 3 Link BW, 
         Min(SW3SSLBIS to P0, EP6 DSLBIS, EP6 Upstream Link) +
         Min(SW3SSLBIS to P1, EP7 DSLBIS, EP7 Upstream link)))


The outermost sum assumes GP0 and GP1 are in different PXMs.
If they are in the same one, I think it's effectively the same
as all the RPs below one HB?
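
The bottom-up clamping above can be sketched recursively over a
hypothetical topology tree (struct bw_node and its fields are
illustrative, not kernel types); each leaf carries the precomputed
min(EP DSLBIS, EP upstream link) and every interior node clamps the sum
of its children to its own limit:

struct bw_node {
	unsigned int limit;	/* link/SSLBIS/GP bandwidth at this node */
	int nr_children;
	struct bw_node **children;
};

static unsigned int clamped_bw(const struct bw_node *node)
{
	unsigned int sum = 0;

	if (!node->nr_children)
		return node->limit;	/* leaf: endpoint limit */

	for (int i = 0; i < node->nr_children; i++)
		sum += clamped_bw(node->children[i]);

	return sum < node->limit ? sum : node->limit;
}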

Maybe we should just document that we only support symmetric setups and
print warning messages if anything else is seen. The shared PXM for the
GPs should be added though, as that is certainly possible; it made me
think I need to poke some FW folk to make sure they aren't doing this
wrong ;)

So there can be sharing both upstream of switches and between the root
ports.  We can't do anything about sharing inside the host as HMAT
doesn't give us any hints.

I'm basing that shared PXM assumption on the case of two chunks of
memory (each with its own SRAT entry but the same Proximity Domain):
the HMAT numbers then apply to all the memory in the node, not to each
chunk of the memory.

So the upshot is we need test cases where we deliberately clamp the
bandwidth at each level in the above diagram to something small and make
sure we correctly identify the overall bandwidth.

We can do that in QEMU (with the GP set), though you'll need to hack the
CDAT tables, or export them and inject them back in again with different
numbers.

Good luck ;) *hides*

Jonathan




> 
> > +	else
> > +		targets = 1;  
> 
> Maybe add a comment to indicate that targets == 1 means the usp link
> speed could not be read, so we default back to the sum of the endpoints.
> 
> > +
> >  	for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) {
> >  		/* Get total bandwidth and the worst latency for the cxl region */
> >  		cxlr->coord[i].read_latency = max_t(unsigned int,
> > @@ -577,8 +615,14 @@ void cxl_region_perf_data_calculate(struct cxl_region *cxlr,
> >  		cxlr->coord[i].write_latency = max_t(unsigned int,
> >  						     cxlr->coord[i].write_latency,
> >  						     perf->coord[i].write_latency);
> > -		cxlr->coord[i].read_bandwidth += perf->coord[i].read_bandwidth;
> > -		cxlr->coord[i].write_bandwidth += perf->coord[i].write_bandwidth;
> > +		cxlr->coord[i].read_bandwidth =
> > +			calculate_region_bw(targets, usp_bw,
> > +					    perf->coord[i].read_bandwidth,
> > +					    cxlr->coord[i].read_bandwidth);
> > +		cxlr->coord[i].write_bandwidth =
> > +			calculate_region_bw(targets, usp_bw,
> > +					    perf->coord[i].write_bandwidth,
> > +					    cxlr->coord[i].write_bandwidth);
> >  	}
> >  }
> >    
> 
> [snip]
> 
> > +
> > +int cxl_pci_get_switch_usp_bandwidth(struct pci_dev *pdev)
> > +{
> > +	struct device *dev = &pdev->dev;
> > +	struct pci_dev *iter = pdev;
> > +
> > +	do {
> > +		if (pci_pcie_type(iter) == PCI_EXP_TYPE_UPSTREAM)
> > +			break;  
> 
> Why is it not ok to do:
> 
> 	while (pci_pcie_type(iter) != PCI_EXP_TYPE_UPSTREAM) {
> 		...
Snap ;)
> 	}
> 
> Ira
> 
> [snip]

Patch

diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c
index 4b717d2f5a9d..09f3e62e19a5 100644
--- a/drivers/cxl/core/cdat.c
+++ b/drivers/cxl/core/cdat.c
@@ -541,6 +541,36 @@  void cxl_coordinates_combine(struct access_coordinate *out,
 
 MODULE_IMPORT_NS(CXL);
 
+static int cxl_get_switch_uport_bandwidth(struct device *uport_dev)
+{
+	struct device *dev = uport_dev->parent;
+
+	if (!dev_is_pci(dev))
+		return -ENODEV;
+
+	return cxl_pci_get_switch_usp_bandwidth(to_pci_dev(dev));
+}
+
+/*
+ * Calculate the bandwidth for the cxl region based on the number of targets
+ * that share an upstream switch. The function is called while targets are
+ * being attached for a region. If the number of targets is 1, then the
+ * target either does not have an upstream switch or it is the first target
+ * of the shared link. In this case, the bandwidth is the sum of the target
+ * bandwidth and the collected region bandwidth. If the number of targets
+ * from cxl_rr is greater than 1, then the bandwidth is the minimum of the
+ * switch upstream port bandwidth and the region plus target bandwidth.
+ */
+static unsigned int calculate_region_bw(int targets, unsigned int usp_bw,
+					unsigned int ep_bw,
+					unsigned int region_bw)
+{
+	if (targets == 1)
+		return region_bw + ep_bw;
+
+	return min_t(unsigned int, usp_bw, region_bw + ep_bw);
+}
+
 void cxl_region_perf_data_calculate(struct cxl_region *cxlr,
 				    struct cxl_endpoint_decoder *cxled)
 {
@@ -551,7 +581,9 @@  void cxl_region_perf_data_calculate(struct cxl_region *cxlr,
 			.start = cxled->dpa_res->start,
 			.end = cxled->dpa_res->end,
 	};
+	struct cxl_port *port = cxlmd->endpoint;
 	struct cxl_dpa_perf *perf;
+	int usp_bw, targets;
 
 	switch (cxlr->mode) {
 	case CXL_DECODER_RAM:
@@ -569,6 +601,12 @@  void cxl_region_perf_data_calculate(struct cxl_region *cxlr,
 	if (!range_contains(&perf->dpa_range, &dpa))
 		return;
 
+	usp_bw = cxl_get_switch_uport_bandwidth(port->uport_dev);
+	if (usp_bw > 0)
+		targets = cxl_port_shared_region_targets(port, cxlr);
+	else
+		targets = 1;
+
 	for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) {
 		/* Get total bandwidth and the worst latency for the cxl region */
 		cxlr->coord[i].read_latency = max_t(unsigned int,
@@ -577,8 +615,14 @@  void cxl_region_perf_data_calculate(struct cxl_region *cxlr,
 		cxlr->coord[i].write_latency = max_t(unsigned int,
 						     cxlr->coord[i].write_latency,
 						     perf->coord[i].write_latency);
-		cxlr->coord[i].read_bandwidth += perf->coord[i].read_bandwidth;
-		cxlr->coord[i].write_bandwidth += perf->coord[i].write_bandwidth;
+		cxlr->coord[i].read_bandwidth =
+			calculate_region_bw(targets, usp_bw,
+					    perf->coord[i].read_bandwidth,
+					    cxlr->coord[i].read_bandwidth);
+		cxlr->coord[i].write_bandwidth =
+			calculate_region_bw(targets, usp_bw,
+					    perf->coord[i].write_bandwidth,
+					    cxlr->coord[i].write_bandwidth);
 	}
 }
 
diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
index bc5a95665aa0..99c1c24df671 100644
--- a/drivers/cxl/core/core.h
+++ b/drivers/cxl/core/core.h
@@ -89,9 +89,12 @@  enum cxl_poison_trace_type {
 };
 
 long cxl_pci_get_latency(struct pci_dev *pdev);
+int cxl_pci_get_switch_usp_bandwidth(struct pci_dev *pdev);
 
 int cxl_update_hmat_access_coordinates(int nid, struct cxl_region *cxlr,
 				       enum access_coordinate_class access);
 bool cxl_need_node_perf_attrs_update(int nid);
 
+int cxl_port_shared_region_targets(struct cxl_port *port, struct cxl_region *cxlr);
+
 #endif /* __CXL_CORE_H__ */
diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index 0df09bd79408..9281ed5a073d 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -1045,3 +1045,38 @@  long cxl_pci_get_latency(struct pci_dev *pdev)
 
 	return cxl_flit_size(pdev) * MEGA / bw;
 }
+
+static int cxl_pci_get_bandwidth(struct pci_dev *pdev)
+{
+	u16 lnksta;
+	u32 width;
+	int speed;
+
+	speed = pcie_link_speed_mbps(pdev);
+	if (speed < 0)
+		return 0;
+	speed /= BITS_PER_BYTE;
+
+	pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &lnksta);
+	width = FIELD_GET(PCI_EXP_LNKSTA_NLW, lnksta);
+
+	return speed * width;
+}
+
+int cxl_pci_get_switch_usp_bandwidth(struct pci_dev *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct pci_dev *iter = pdev;
+
+	do {
+		if (pci_pcie_type(iter) == PCI_EXP_TYPE_UPSTREAM)
+			break;
+
+		dev = iter->dev.parent;
+		if (!dev || !dev_is_pci(dev))
+			return -ENODEV;
+		iter = to_pci_dev(dev);
+	} while (1);
+
+	return cxl_pci_get_bandwidth(iter);
+}
diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index 5c186e0a39b9..5a1bca31d269 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -222,6 +222,16 @@  static struct cxl_region_ref *cxl_rr_load(struct cxl_port *port,
 	return xa_load(&port->regions, (unsigned long)cxlr);
 }
 
+int cxl_port_shared_region_targets(struct cxl_port *port, struct cxl_region *cxlr)
+{
+	struct cxl_region_ref *cxl_rr = cxl_rr_load(port, cxlr);
+
+	if (!cxl_rr)
+		return 0;
+
+	return cxl_rr->nr_targets;
+}
+
 static int cxl_region_invalidate_memregion(struct cxl_region *cxlr)
 {
 	if (!cpu_cache_has_invalidate_memregion()) {