diff mbox series

[1/2] cxl/acpi: Fix XOR 3-6-12 way host bridge look-up calculation

Message ID 0eebc96f0e36ebc74b0dc0f4112b70cced6907a7.1707891715.git.alison.schofield@intel.com
State New, archived
Headers show
Series XOR 3-6-12 HB Interleave Calc Repair | expand

Commit Message

Alison Schofield Feb. 14, 2024, 7:13 a.m. UTC
From: Alison Schofield <alison.schofield@intel.com>

The XOR host bridge look-up function is broken in its application
of the modulo calculation for interleaves that are multiples of 3.

The failure appears like this:
[] cxl_core:cxl_region_attach_position:1433: cxl region4: mem0:decoder8.1 invalid target position for decoder3.2

Replace the broken modulo calc with the same modulo calc as used
for Modulo Math. This is per CXL spec definition but was overlooked
in the original over-complicated implementation.

With the simple modulo calculation, the jump to a helper function
becomes needless and the work is now presented in a straight line.

Display the interleave_arithmetic in the dev_dbg() of successfully
setup root decoders to make debug lookup easier.

Fixes: f9db85bfec0d ("cxl/acpi: Support CXL XOR Interleave Math (CXIMS)")
Signed-off-by: Alison Schofield <alison.schofield@intel.com>
---
 drivers/cxl/acpi.c | 51 ++++++++++++++++++++--------------------------
 1 file changed, 22 insertions(+), 29 deletions(-)

Comments

Dan Williams Feb. 29, 2024, 6:45 p.m. UTC | #1
alison.schofield@ wrote:
> From: Alison Schofield <alison.schofield@intel.com>
> 
> The XOR host bridge look-up function is broken in its application
> of the modulo calculation for interleaves that are multiples of 3.
> 
> The failure appears like this:
> [] cxl_core:cxl_region_attach_position:1433: cxl region4: mem0:decoder8.1 invalid target position for decoder3.2
> 
> Replace the broken modulo calc with the same modulo calc as used
> for Modulo Math. This is per CXL spec definition but was overlooked
> in the original over-complicated implementation.
> 
> With the simple modulo calculation, the jump to a helper function
> becomes needless and the work is now presented in a straight line.
> 
> Display the interleave_arithmetic in the dev_dbg() of successfully
> setup root decoders to make debug lookup easier.
> 
> Fixes: f9db85bfec0d ("cxl/acpi: Support CXL XOR Interleave Math (CXIMS)")
> Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> ---
>  drivers/cxl/acpi.c | 51 ++++++++++++++++++++--------------------------
>  1 file changed, 22 insertions(+), 29 deletions(-)
> 
> diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c
> index dcf2b39e1048..86c735f733ea 100644
> --- a/drivers/cxl/acpi.c
> +++ b/drivers/cxl/acpi.c
> @@ -22,31 +22,6 @@ static const guid_t acpi_cxl_qtg_id_guid =
>  	GUID_INIT(0xF365F9A6, 0xA7DE, 0x4071,
>  		  0xA6, 0x6A, 0xB4, 0x0C, 0x0B, 0x4F, 0x8E, 0x52);
>  
> -/*
> - * Find a targets entry (n) in the host bridge interleave list.
> - * CXL Specification 3.0 Table 9-22
> - */
> -static int cxl_xor_calc_n(u64 hpa, struct cxl_cxims_data *cximsd, int iw,
> -			  int ig)
> -{
> -	int i = 0, n = 0;
> -	u8 eiw;
> -
> -	/* IW: 2,4,6,8,12,16 begin building 'n' using xormaps */
> -	if (iw != 3) {
> -		for (i = 0; i < cximsd->nr_maps; i++)
> -			n |= (hweight64(hpa & cximsd->xormaps[i]) & 1) << i;
> -	}
> -	/* IW: 3,6,12 add a modulo calculation to 'n' */
> -	if (!is_power_of_2(iw)) {
> -		if (ways_to_eiw(iw, &eiw))
> -			return -1;
> -		hpa &= GENMASK_ULL(51, eiw + ig);
> -		n |= do_div(hpa, 3) << i;
> -	}
> -	return n;
> -}
> -
>  static struct cxl_dport *cxl_hb_xor(struct cxl_root_decoder *cxlrd, int pos)
>  {
>  	struct cxl_cxims_data *cximsd = cxlrd->platform_data;
> @@ -62,11 +37,28 @@ static struct cxl_dport *cxl_hb_xor(struct cxl_root_decoder *cxlrd, int pos)
>  			  "misconfigured root decoder\n"))
>  		return NULL;
>  
> +	/*
> +	 * Find a targets entry in the host bridge interleave
> +	 * list as defined in CXL Specification 3.0 Table 9-22
> +	 *
> +	 * iw: 1 is no interleave, so entry is 0
> +	 * iw: 3 uses a modulo calc only
> +	 * iw: 2,4,6,8,12,16 use xormaps
> +	 * iw: 6,12 apply a modulo calc after xormaps
> +	 */
> +
> +	if (iw == 1)
> +		return cxlrd->cxlsd.target[0];
> +
> +	if (iw == 3)
> +		return cxlrd->cxlsd.target[pos % iw];
> +
>  	hpa = cxlrd->res->start + pos * ig;
> +	for (int i = 0; i < cximsd->nr_maps; i++)
> +		n |= (hweight64(hpa & cximsd->xormaps[i]) & 1) << i;

I am with you up to this point...

>  
> -	/* Entry (n) is 0 for no interleave (iw == 1) */
> -	if (iw != 1)
> -		n = cxl_xor_calc_n(hpa, cximsd, iw, ig);
> +	if (iw == 6 || iw == 12)
> +		n |= pos % iw;

Can you help me map this back to Table-9-22?

Shouldn't this be:

	n |= (pos % iw) << (iw / 3);

?
Dan Williams Feb. 29, 2024, 10:35 p.m. UTC | #2
Dan Williams wrote:
> alison.schofield@ wrote:
> > From: Alison Schofield <alison.schofield@intel.com>
> > 
> > The XOR host bridge look-up function is broken in its application
> > of the modulo calculation for interleaves that are multiples of 3.
> > 
> > The failure appears like this:
> > [] cxl_core:cxl_region_attach_position:1433: cxl region4: mem0:decoder8.1 invalid target position for decoder3.2
> > 
> > Replace the broken modulo calc with the same modulo calc as used
> > for Modulo Math. This is per CXL spec definition but was overlooked
> > in the original over-complicated implementation.
> > 
> > With the simple modulo calculation, the jump to a helper function
> > becomes needless and the work is now presented in a straight line.
> > 
> > Display the interleave_arithmetic in the dev_dbg() of successfully
> > setup root decoders to make debug lookup easier.
> > 
> > Fixes: f9db85bfec0d ("cxl/acpi: Support CXL XOR Interleave Math (CXIMS)")
> > Signed-off-by: Alison Schofield <alison.schofield@intel.com>
[..]
> > -	/* Entry (n) is 0 for no interleave (iw == 1) */
> > -	if (iw != 1)
> > -		n = cxl_xor_calc_n(hpa, cximsd, iw, ig);
> > +	if (iw == 6 || iw == 12)
> > +		n |= pos % iw;
> 
> Can you help me map this back to Table-9-22?
> 
> Shouldn't this be:
> 
> 	n |= (pos % iw) << (iw / 3);

Wait, no:

 	n |= (pos % iw) * (iw / 3);

...to match:

 6:   + 2* HPA[51:9+HBIG] MOD 3

 12:  + 4* HPA[51:10+HBIG] MOD 3
Alison Schofield March 7, 2024, 4:40 a.m. UTC | #3
On Thu, Feb 29, 2024 at 02:35:12PM -0800, Dan Williams wrote:
> Dan Williams wrote:
> > alison.schofield@ wrote:
> > > From: Alison Schofield <alison.schofield@intel.com>
> > > 
> > > The XOR host bridge look-up function is broken in its application
> > > of the modulo calculation for interleaves that are multiples of 3.
> > > 
> > > The failure appears like this:
> > > [] cxl_core:cxl_region_attach_position:1433: cxl region4: mem0:decoder8.1 invalid target position for decoder3.2
> > > 
> > > Replace the broken modulo calc with the same modulo calc as used
> > > for Modulo Math. This is per CXL spec definition but was overlooked
> > > in the original over-complicated implementation.
> > > 
> > > With the simple modulo calculation, the jump to a helper function
> > > becomes needless and the work is now presented in a straight line.
> > > 
> > > Display the interleave_arithmetic in the dev_dbg() of successfully
> > > setup root decoders to make debug lookup easier.
> > > 
> > > Fixes: f9db85bfec0d ("cxl/acpi: Support CXL XOR Interleave Math (CXIMS)")
> > > Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> [..]
> > > -	/* Entry (n) is 0 for no interleave (iw == 1) */
> > > -	if (iw != 1)
> > > -		n = cxl_xor_calc_n(hpa, cximsd, iw, ig);
> > > +	if (iw == 6 || iw == 12)
> > > +		n |= pos % iw;
> > 
> > Can you help me map this back to Table-9-22?
> > 
> > Shouldn't this be:
> > 
> > 	n |= (pos % iw) << (iw / 3);
> 
> Wait, no:
> 
>  	n |= (pos % iw) * (iw / 3);
> 
> ...to match:
> 
>  6:   + 2* HPA[51:9+HBIG] MOD 3
> 
>  12:  + 4* HPA[51:10+HBIG] MOD 3

Hi Dan,

tl;mrp  <-- must read please :)


Yes - that table is the key and I have a new found understanding, or
misunderstaning, of how to use it to share here.

But first, I did follow up on your suggestions along with more
variation to try and fix the existing code path:

Evaluating other calcs with an HBIW of 6 and pos[0..5]:

> Shouldn't this be:
> 
> 	n |= (pos % iw) << (iw / 3);
No. For each pos in [0..5] and HBIW = 6 the right-hand side of that
assignment results in 0, 4, 8, 12, 16, and 20, so goes beyond the
HBIW of 6.

> Wait, no:
>
> 	n |= (pos % iw) * (iw / 3);
No. The right side yields 0,2,4,6,8,10, beyond HBIW of 6.

I sampled many flavors of these and kept going back to Table-22
and a 6 way HB interleave setup, trying to make something work.
The fact that the target-list is in HB interleave order hit me
and here's where it took me:

From Table - 22 "A list of all the Interleave Targets. The number of
entries in this list shall match the Number of Interleave Ways (NIW).
The order of the targets reported in this List indicates the order
in the Interleave Set"

Using a 6 way Host Bridge Interleave: HB0  HB1  HB2  HB3  HB4  HB5

Using this CFMWS and CXIMS. This is a sample of what a 6 way interleave
may look like and how the xormap directs the traffic to HBs.

cfmws30 = {
                .cfmws = {
                        .header = {
                                .type = ACPI_CEDT_TYPE_CFMWS,
                                .length = sizeof(mock_cedt.cfmws30),
                        },
                        .interleave_arithmetic = ACPI_CEDT_CFMWS_ARITHMETIC_XOR,
                        .interleave_ways = 9,
                        .granularity = 0,
                        .restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 |
                                        ACPI_CEDT_CFMWS_RESTRICT_PMEM,
                        .qtg_id = 0,
                        .window_size = SZ_256M * 24UL,
                },
                .target = { HB0, HB3, HB4, HB1, HB2, HB5 },
        },
cxims0 = {
                .cxims = {
                        .header = {
                                .type = ACPI_CEDT_TYPE_CXIMS,
                                .length = sizeof(mock_cedt.cxims0),
                        },
                        .hbig = 0,
                        .nr_xormaps = 1,
                },
                /* xormap for 6 way interleave */
                .xormap_list = { 0x2020900 },
        },


Given an HPA, HBIG, HBIW - the Table 9-22 calcs direct the routing of
that HPA. In order to route an address to the correct decoder (N), do

N = XORALLBITS (HPA AND XORMAP[0]) + 2* HPA[51:9+HBIG] MOD 3

Example data: hpa:0xf010000100 map:0x2020900 HBIG: 6
N = XORALLBITS (0xf01000010 AND 0x2020900) + 2* 1 MOD 3
N = 3

That says to route HPA 0xf01000100 to the 3rd Host Bridge: HB3
(And, of course, part of the routing is that gets reduced to
an offset, the position bit gets removed and the offset is
applied to the DPA base.)

All very interesting but is not applicable for what the driver
is trying to do in calc_hb. The driver wants to find the index
into the root decoder (cxlsd) target list. It is not HB3.

Recall that the CFMWS presents the target list is interleave order.
The CXL driver stores that target_list in interleave order.

Applying the Table 9-22 calc doesn't apply. It shouldn't and my ahah
moment here is that the CXL driver does not need to use that calc.
I didn't see this until I had that 6 way HBIW where a CFMWS target
list was 'dis-ordered'. Previously, 2 & 4 way XOR, applying the
calc was needless, but functioned, because the lists were ordered.

Given that the drivers target-list is in interleave order, use (pos % HBIW)
to get the index into the decoder target list. We don't need the 
complex calc to jump to the right location in the interleave since
we stored the interleave order. And note that the (pos % HBIW) will
fail with 'invalid target position' if the region creation is attempted
with a badly ordered target list. The check is as valid as ever.

I've verified this with these configs:
configs="1 11 111 1111 111111 2 22 222 2222 222222 4 44"
where an entry like 111 means 1+1+1 means 3 way HB Interleave

I only reached this point while working on that 6 way XOR config.
The upstream code works for 1 11 1111 and the like probably because
the lists were simply ordered. 

I'm ready to hear where this goes wrong. It reminds me of the address
translation work where I was hung up on the xormaps until figuring out
that those were routing instructions and didn't come into play at the
driver level of translation.

Direction check please. If this makes sense, the fix is simple, but the
cleanup of useless code will cause some churn.

Alison
diff mbox series

Patch

diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c
index dcf2b39e1048..86c735f733ea 100644
--- a/drivers/cxl/acpi.c
+++ b/drivers/cxl/acpi.c
@@ -22,31 +22,6 @@  static const guid_t acpi_cxl_qtg_id_guid =
 	GUID_INIT(0xF365F9A6, 0xA7DE, 0x4071,
 		  0xA6, 0x6A, 0xB4, 0x0C, 0x0B, 0x4F, 0x8E, 0x52);
 
-/*
- * Find a targets entry (n) in the host bridge interleave list.
- * CXL Specification 3.0 Table 9-22
- */
-static int cxl_xor_calc_n(u64 hpa, struct cxl_cxims_data *cximsd, int iw,
-			  int ig)
-{
-	int i = 0, n = 0;
-	u8 eiw;
-
-	/* IW: 2,4,6,8,12,16 begin building 'n' using xormaps */
-	if (iw != 3) {
-		for (i = 0; i < cximsd->nr_maps; i++)
-			n |= (hweight64(hpa & cximsd->xormaps[i]) & 1) << i;
-	}
-	/* IW: 3,6,12 add a modulo calculation to 'n' */
-	if (!is_power_of_2(iw)) {
-		if (ways_to_eiw(iw, &eiw))
-			return -1;
-		hpa &= GENMASK_ULL(51, eiw + ig);
-		n |= do_div(hpa, 3) << i;
-	}
-	return n;
-}
-
 static struct cxl_dport *cxl_hb_xor(struct cxl_root_decoder *cxlrd, int pos)
 {
 	struct cxl_cxims_data *cximsd = cxlrd->platform_data;
@@ -62,11 +37,28 @@  static struct cxl_dport *cxl_hb_xor(struct cxl_root_decoder *cxlrd, int pos)
 			  "misconfigured root decoder\n"))
 		return NULL;
 
+	/*
+	 * Find a targets entry in the host bridge interleave
+	 * list as defined in CXL Specification 3.0 Table 9-22
+	 *
+	 * iw: 1 is no interleave, so entry is 0
+	 * iw: 3 uses a modulo calc only
+	 * iw: 2,4,6,8,12,16 use xormaps
+	 * iw: 6,12 apply a modulo calc after xormaps
+	 */
+
+	if (iw == 1)
+		return cxlrd->cxlsd.target[0];
+
+	if (iw == 3)
+		return cxlrd->cxlsd.target[pos % iw];
+
 	hpa = cxlrd->res->start + pos * ig;
+	for (int i = 0; i < cximsd->nr_maps; i++)
+		n |= (hweight64(hpa & cximsd->xormaps[i]) & 1) << i;
 
-	/* Entry (n) is 0 for no interleave (iw == 1) */
-	if (iw != 1)
-		n = cxl_xor_calc_n(hpa, cximsd, iw, ig);
+	if (iw == 6 || iw == 12)
+		n |= pos % iw;
 
 	if (n < 0)
 		return NULL;
@@ -424,8 +416,9 @@  static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg,
 		dev_err(dev, "Failed to add decode range: %pr", res);
 		return rc;
 	}
-	dev_dbg(dev, "add: %s node: %d range [%#llx - %#llx]\n",
+	dev_dbg(dev, "add: %s math: %s node: %d range [%#llx - %#llx]\n",
 		dev_name(&cxld->dev),
+		cfmws->interleave_arithmetic ? "xor" : "modulo",
 		phys_to_target_node(cxld->hpa_range.start),
 		cxld->hpa_range.start, cxld->hpa_range.end);