diff mbox series

[v8,04/46] hw/cxl/device: Introduce a CXL device (8.2.8)

Message ID 20220318150635.24600-5-Jonathan.Cameron@huawei.com
State Superseded
Headers show
Series CXl 2.0 emulation Support | expand

Commit Message

Jonathan Cameron March 18, 2022, 3:05 p.m. UTC
From: Ben Widawsky <ben.widawsky@intel.com>

A CXL device is a type of CXL component. Conceptually, a CXL device
would be a leaf node in a CXL topology. From an emulation perspective,
CXL devices are the most complex and so the actual implementation is
reserved for discrete commits.

This new device type is specifically catered towards the eventual
implementation of a Type3 CXL.mem device, 8.2.8.5 in the CXL 2.0
specification.

Signed-off-by: Ben Widawsky <ben.widawsky@intel.com>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
---
 include/hw/cxl/cxl.h        |   1 +
 include/hw/cxl/cxl_device.h | 165 ++++++++++++++++++++++++++++++++++++
 2 files changed, 166 insertions(+)

Comments

Adam Manzanares March 29, 2022, 6:13 p.m. UTC | #1
On Fri, Mar 18, 2022 at 03:05:53PM +0000, Jonathan Cameron wrote:
> From: Ben Widawsky <ben.widawsky@intel.com>
> 
> A CXL device is a type of CXL component. Conceptually, a CXL device
> would be a leaf node in a CXL topology. From an emulation perspective,
> CXL devices are the most complex and so the actual implementation is
> reserved for discrete commits.
> 
> This new device type is specifically catered towards the eventual
> implementation of a Type3 CXL.mem device, 8.2.8.5 in the CXL 2.0
> specification.
> 
> Signed-off-by: Ben Widawsky <ben.widawsky@intel.com>
> Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
> ---
>  include/hw/cxl/cxl.h        |   1 +
>  include/hw/cxl/cxl_device.h | 165 ++++++++++++++++++++++++++++++++++++
>  2 files changed, 166 insertions(+)
> 
> diff --git a/include/hw/cxl/cxl.h b/include/hw/cxl/cxl.h
> index 8c738c7a2b..b9d1ac3fad 100644
> --- a/include/hw/cxl/cxl.h
> +++ b/include/hw/cxl/cxl.h
> @@ -12,5 +12,6 @@
>  
>  #include "cxl_pci.h"
>  #include "cxl_component.h"
> +#include "cxl_device.h"
>  
>  #endif
> diff --git a/include/hw/cxl/cxl_device.h b/include/hw/cxl/cxl_device.h
> new file mode 100644
> index 0000000000..b2416e45bf
> --- /dev/null
> +++ b/include/hw/cxl/cxl_device.h
> @@ -0,0 +1,165 @@
> +/*
> + * QEMU CXL Devices
> + *
> + * Copyright (c) 2020 Intel
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2. See the
> + * COPYING file in the top-level directory.
> + */
> +
> +#ifndef CXL_DEVICE_H
> +#define CXL_DEVICE_H
> +
> +#include "hw/register.h"
> +
> +/*
> + * The following is how a CXL device's MMIO space is laid out. The only
> + * requirement from the spec is that the capabilities array and the capability
> + * headers start at offset 0 and are contiguously packed. The headers themselves
> + * provide offsets to the register fields. For this emulation, registers will
> + * start at offset 0x80 (m == 0x80). No secondary mailbox is implemented which
> + * means that n = m + sizeof(mailbox registers) + sizeof(device registers).

What is n here? Is it the start offset of the mailbox registers? This question
is based on the figure below.

> + *
> + * This is roughly described in 8.2.8 Figure 138 of the CXL 2.0 spec.
> + *
> + *                       +---------------------------------+
> + *                       |                                 |
> + *                       |    Memory Device Registers      |
> + *                       |                                 |
> + * n + PAYLOAD_SIZE_MAX  -----------------------------------
> + *                  ^    |                                 |
> + *                  |    |                                 |
> + *                  |    |                                 |
> + *                  |    |                                 |
> + *                  |    |                                 |
> + *                  |    |         Mailbox Payload         |
> + *                  |    |                                 |
> + *                  |    |                                 |
> + *                  |    |                                 |
> + *                  |    -----------------------------------
> + *                  |    |       Mailbox Registers         |
> + *                  |    |                                 |
> + *                  n    -----------------------------------
> + *                  ^    |                                 |
> + *                  |    |        Device Registers         |
> + *                  |    |                                 |
> + *                  m    ---------------------------------->
> + *                  ^    |  Memory Device Capability Header|
> + *                  |    -----------------------------------
> + *                  |    |     Mailbox Capability Header   |
> + *                  |    -------------- --------------------
> + *                  |    |     Device Capability Header    |
> + *                  |    -----------------------------------
> + *                  |    |                                 |
> + *                  |    |                                 |
> + *                  |    |      Device Cap Array[0..n]     |
> + *                  |    |                                 |
> + *                  |    |                                 |
> + *                       |                                 |
> + *                  0    +---------------------------------+

Would it make sense to add CXL cap header register to the diagram? n also 
seems to be the size of the cap array, but it is also an offset so that could
be clarified.

> + *
> + */
> +
> +#define CXL_DEVICE_CAP_HDR1_OFFSET 0x10 /* Figure 138 */
> +#define CXL_DEVICE_CAP_REG_SIZE 0x10 /* 8.2.8.2 */
> +#define CXL_DEVICE_CAPS_MAX 4 /* 8.2.8.2.1 + 8.2.8.5 */
> +
> +#define CXL_DEVICE_REGISTERS_OFFSET 0x80 /* Read comment above */

Is this to plan for future capabilities? If we have CAPS MAX, doesn't this
allow us to remove the slack space?

> +#define CXL_DEVICE_REGISTERS_LENGTH 0x8 /* 8.2.8.3.1 */

Should we add status to the name here, or would it get too long?

> +
> +#define CXL_MAILBOX_REGISTERS_OFFSET \
> +    (CXL_DEVICE_REGISTERS_OFFSET + CXL_DEVICE_REGISTERS_LENGTH)
> +#define CXL_MAILBOX_REGISTERS_SIZE 0x20 /* 8.2.8.4, Figure 139 */
> +#define CXL_MAILBOX_PAYLOAD_SHIFT 11

I see 20 in the spec.

> +#define CXL_MAILBOX_MAX_PAYLOAD_SIZE (1 << CXL_MAILBOX_PAYLOAD_SHIFT)
> +#define CXL_MAILBOX_REGISTERS_LENGTH \
> +    (CXL_MAILBOX_REGISTERS_SIZE + CXL_MAILBOX_MAX_PAYLOAD_SIZE)
> +
> +typedef struct cxl_device_state {
> +    MemoryRegion device_registers;
> +
> +    /* mmio for device capabilities array - 8.2.8.2 */
> +    MemoryRegion device;
> +    MemoryRegion caps;
> +
> +    /* mmio for the mailbox registers 8.2.8.4 */
> +    MemoryRegion mailbox;
> +
> +    /* memory region for persistent memory, HDM */
> +    uint64_t pmem_size;

Can we switch this to mem_size and drop the persistent comment? It is my 
understanding that HDM is independent of persistence.

> +} CXLDeviceState;
> +
> +/* Initialize the register block for a device */
> +void cxl_device_register_block_init(Object *obj, CXLDeviceState *dev);
> +
> +/* Set up default values for the register block */
> +void cxl_device_register_init_common(CXLDeviceState *dev);
> +
> +/*
> + * CXL 2.0 - 8.2.8.1 including errata F4
> + * Documented as a 128 bit register, but 64 bit accesses and the second
> + * 64 bits are currently reserved.
> + */
> +REG64(CXL_DEV_CAP_ARRAY, 0) /* Documented as 128 bit register but 64 byte accesses */
> +    FIELD(CXL_DEV_CAP_ARRAY, CAP_ID, 0, 16)
> +    FIELD(CXL_DEV_CAP_ARRAY, CAP_VERSION, 16, 8)
> +    FIELD(CXL_DEV_CAP_ARRAY, CAP_COUNT, 32, 16)
> +
> +/*
> + * Helper macro to initialize capability headers for CXL devices.
> + *
> + * In the 8.2.8.2, this is listed as a 128b register, but in 8.2.8, it says:
> + * > No registers defined in Section 8.2.8 are larger than 64-bits wide so that
> + * > is the maximum access size allowed for these registers. If this rule is not
> + * > followed, the behavior is undefined
> + *
> + * CXL 2.0 Errata F4 states futher that the layouts in the specification are
> + * shown as greater than 128 bits, but implementations are expected to
> + * use any size of access up to 64 bits.
> + *
> + * Here we've chosen to make it 4 dwords. The spec allows any pow2 multiple
> + * access to be used for a register up to 64 bits.
> + */
> +#define CXL_DEVICE_CAPABILITY_HEADER_REGISTER(n, offset)  \
> +    REG32(CXL_DEV_##n##_CAP_HDR0, offset)                 \
> +        FIELD(CXL_DEV_##n##_CAP_HDR0, CAP_ID, 0, 16)      \
> +        FIELD(CXL_DEV_##n##_CAP_HDR0, CAP_VERSION, 16, 8) \
> +    REG32(CXL_DEV_##n##_CAP_HDR1, offset + 4)             \
> +        FIELD(CXL_DEV_##n##_CAP_HDR1, CAP_OFFSET, 0, 32)  \
> +    REG32(CXL_DEV_##n##_CAP_HDR2, offset + 8)             \
> +        FIELD(CXL_DEV_##n##_CAP_HDR2, CAP_LENGTH, 0, 32)
> +
> +CXL_DEVICE_CAPABILITY_HEADER_REGISTER(DEVICE, CXL_DEVICE_CAP_HDR1_OFFSET)
> +CXL_DEVICE_CAPABILITY_HEADER_REGISTER(MAILBOX, CXL_DEVICE_CAP_HDR1_OFFSET + \
> +                                               CXL_DEVICE_CAP_REG_SIZE)
> +

Fig139 for the following registers.

8.2.8.4.3
> +REG32(CXL_DEV_MAILBOX_CAP, 0)
> +    FIELD(CXL_DEV_MAILBOX_CAP, PAYLOAD_SIZE, 0, 5)
> +    FIELD(CXL_DEV_MAILBOX_CAP, INT_CAP, 5, 1)
> +    FIELD(CXL_DEV_MAILBOX_CAP, BG_INT_CAP, 6, 1)
> +    FIELD(CXL_DEV_MAILBOX_CAP, MSI_N, 7, 4)
> +

8.2.8.4.4 
> +REG32(CXL_DEV_MAILBOX_CTRL, 4)
> +    FIELD(CXL_DEV_MAILBOX_CTRL, DOORBELL, 0, 1)
> +    FIELD(CXL_DEV_MAILBOX_CTRL, INT_EN, 1, 1)
> +    FIELD(CXL_DEV_MAILBOX_CTRL, BG_INT_EN, 2, 1)
> +

8.2.8.4.5 + 8.2.9
> +REG64(CXL_DEV_MAILBOX_CMD, 8)
> +    FIELD(CXL_DEV_MAILBOX_CMD, COMMAND, 0, 8)
> +    FIELD(CXL_DEV_MAILBOX_CMD, COMMAND_SET, 8, 8)
> +    FIELD(CXL_DEV_MAILBOX_CMD, LENGTH, 16, 20)
> +

8.2.8.4.6
> +REG64(CXL_DEV_MAILBOX_STS, 0x10)
> +    FIELD(CXL_DEV_MAILBOX_STS, BG_OP, 0, 1)
> +    FIELD(CXL_DEV_MAILBOX_STS, ERRNO, 32, 16)
> +    FIELD(CXL_DEV_MAILBOX_STS, VENDOR_ERRNO, 48, 16)
> +

8.2.8.4.7
> +REG64(CXL_DEV_BG_CMD_STS, 0x18)
> +    FIELD(CXL_DEV_BG_CMD_STS, BG, 0, 16)

Should we call this OP since it is implied that we are BG given the register?

> +    FIELD(CXL_DEV_BG_CMD_STS, DONE, 16, 7)

NUM_DONE? since this is a percentage.

> +    FIELD(CXL_DEV_BG_CMD_STS, ERRNO, 32, 16)

Isn't this a RET_CODE since it is only valid if previous field is 100%

> +    FIELD(CXL_DEV_BG_CMD_STS, VENDOR_ERRNO, 48, 16)

VENDOR_RET_CODE since the same rule for the previous field applies here.

> +
> +REG32(CXL_DEV_CMD_PAYLOAD, 0x20)
> +
> +#endif
> -- 
> 2.32.0
> 
> 

+cc Dave, Klaus, Tong
Other than the minor issues raised.

Looks good.

Reviewed-by: Adam Manzanares <a.manzanares@samsung.com>
Davidlohr Bueso March 29, 2022, 7:53 p.m. UTC | #2
On Tue, 29 Mar 2022, Adam Manzanares wrote:
>> +typedef struct cxl_device_state {
>> +    MemoryRegion device_registers;
>> +
>> +    /* mmio for device capabilities array - 8.2.8.2 */
>> +    MemoryRegion device;
>> +    MemoryRegion caps;
>> +
>> +    /* mmio for the mailbox registers 8.2.8.4 */
>> +    MemoryRegion mailbox;
>> +
>> +    /* memory region for persistent memory, HDM */
>> +    uint64_t pmem_size;
>
>Can we switch this to mem_size and drop the persistent comment? It is my
>understanding that HDM is independent of persistence.

Agreed, but ideally both volatile and persistent capacities would have been
supported in this patchset. I'm also probably missing specific reasons as to
why this isn't the case.

Looking at it briefly could it be just a matter of adding to cxl_type3_dev
a new hostmem along with its AddressSpace for the volatile? If so, I'm
thinking something along these lines:

@@ -123,8 +123,8 @@ typedef struct cxl_device_state {
	 uint64_t host_set;
      } timestamp;

-    /* memory region for persistent memory, HDM */
-    uint64_t pmem_size;
+    /* memory region for persistent and volatile memory, HDM */
+    uint64_t pmem_size, mem_size;
  } CXLDeviceState;

  /* Initialize the register block for a device */
@@ -235,9 +235,9 @@ typedef struct cxl_type3_dev {
      PCIDevice parent_obj;

      /* Properties */
-    AddressSpace hostmem_as;
+    AddressSpace hostmem_as, hostmemv_as;
      uint64_t size;
-    HostMemoryBackend *hostmem;
+    HostMemoryBackend *hostmem, *hostmemv;
      HostMemoryBackend *lsa;
      uint64_t sn;

Then for cxl_setup_memory(), with ct3d->hostmem and/or ct3d->hostmemv
non-nil, set the respective MemoryRegions:

+    if (ct3d->hostmem) {
+            memory_region_set_nonvolatile(mr, true);
+            memory_region_set_enabled(mr, true);
+            host_memory_backend_set_mapped(ct3d->hostmem, true);
+            address_space_init(&ct3d->hostmem_as, mr, name);
+            ct3d->cxl_dstate.pmem_size = ct3d->hostmem->size;
+    }
+    if (ct3d->hostmemv) {
+            memory_region_set_nonvolatile(mrv, false);
+            memory_region_set_enabled(mrv, true);
+            host_memory_backend_set_mapped(ct3d->hostmemv, true);
+            address_space_init(&ct3d->hostmem_as, mrv, name);
+            ct3d->cxl_dstate.pmem_size = ct3d->hostmem->size;
+    }

For corresponding MB commands, it's mostly IDENTIFY_MEMORY_DEVICE that needs
updating:

@@ -281,7 +281,7 @@ static ret_code cmd_identify_memory_device(struct cxl_cmd *cmd,

      CXLType3Dev *ct3d = container_of(cxl_dstate, CXLType3Dev, cxl_dstate);
      CXLType3Class *cvc = CXL_TYPE3_DEV_GET_CLASS(ct3d);
-    uint64_t size = cxl_dstate->pmem_size;
+    uint64_t size = cxl_dstate->pmem_size + cxl_dstate->mem_size;

      if (!QEMU_IS_ALIGNED(size, 256 << 20)) {
	 return CXL_MBOX_INTERNAL_ERROR;
@@ -290,11 +290,11 @@ static ret_code cmd_identify_memory_device(struct cxl_cmd *cmd,
      id = (void *)cmd->payload;
      memset(id, 0, sizeof(*id));

-    /* PMEM only */
      snprintf(id->fw_revision, 0x10, "BWFW VERSION %02d", 0);

      id->total_capacity = size / (256 << 20);
-    id->persistent_capacity = size / (256 << 20);
+    id->persistent_capacity = cxl_dstate->pmem_size / (256 << 20);
+    id->volatile_capacity = cxl_dstate->mem_size / (256 << 20);
      id->lsa_size = cvc->get_lsa_size(ct3d);

      *len = sizeof(*id);
@@ -312,16 +312,16 @@ static ret_code cmd_ccls_get_partition_info(struct cxl_cmd *cmd,
	 uint64_t next_pmem;
      } QEMU_PACKED *part_info = (void *)cmd->payload;
      QEMU_BUILD_BUG_ON(sizeof(*part_info) != 0x20);
-    uint64_t size = cxl_dstate->pmem_size;
+    uint64_t psize = cxl_dstate->pmem_size;
+    uint64_t vsize = cxl_dstate->mem_size;

-    if (!QEMU_IS_ALIGNED(size, 256 << 20)) {
+    if (!QEMU_IS_ALIGNED(psize + vsize, 256 << 20)) {
	 return CXL_MBOX_INTERNAL_ERROR;
      }

-    /* PMEM only */
-    part_info->active_vmem = 0;
-    part_info->next_vmem = 0;
-    part_info->active_pmem = size / (256 << 20);
+    part_info->active_vmem = vsize / (256 << 20);
+    part_info->next_vmem = part_info->active_vmem;
+    part_info->active_pmem = psize / (256 << 20);
      part_info->next_pmem = part_info->active_pmem;

Then for reads/writes, both cxl_type3_write() and _read() would, after computing the dpa_offset,
first try the volatile region then upon error attempt the same in the persistent memory - this
assuming the DPA space is consistent among both types of memory. Ie:

address_space_read(&ct3d->hostmemv_as, dpa_offset, attrs, data, size)
or
address_space_read(&ct3d->hostmem_as, dpa_offset, attrs, data, size)

... but then again all this is probably just wishful thinking.

Thanks,
Davidlohr
Jonathan Cameron March 30, 2022, 12:15 p.m. UTC | #3
On Tue, 29 Mar 2022 12:53:51 -0700
Davidlohr Bueso <dave@stgolabs.net> wrote:

> On Tue, 29 Mar 2022, Adam Manzanares wrote:
> >> +typedef struct cxl_device_state {
> >> +    MemoryRegion device_registers;
> >> +
> >> +    /* mmio for device capabilities array - 8.2.8.2 */
> >> +    MemoryRegion device;
> >> +    MemoryRegion caps;
> >> +
> >> +    /* mmio for the mailbox registers 8.2.8.4 */
> >> +    MemoryRegion mailbox;
> >> +
> >> +    /* memory region for persistent memory, HDM */
> >> +    uint64_t pmem_size;  
> >
> >Can we switch this to mem_size and drop the persistent comment? It is my
> >understanding that HDM is independent of persistence.  
> 
> Agreed, but ideally both volatile and persistent capacities would have been
> supported in this patchset. I'm also probably missing specific reasons as to
> why this isn't the case.

Whilst it doesn't add a huge amount of complexity it does add some
and the software paths in Linux we were developing this for are pmem focused.
Hence volatile is on the todo list rather than in this first patch set.
Not sensible to aim for feature complete in one go.

> 
> Looking at it briefly could it be just a matter of adding to cxl_type3_dev
> a new hostmem along with it's AddressSpace for the volatile? If so, I'm
> thinking something along these lines:
> 
> @@ -123,8 +123,8 @@ typedef struct cxl_device_state {
> 	 uint64_t host_set;
>       } timestamp;
> 
> -    /* memory region for persistent memory, HDM */
> -    uint64_t pmem_size;
> +    /* memory region for persistent and volatile memory, HDM */
> +    uint64_t pmem_size, mem_size;
>   } CXLDeviceState;
> 
>   /* Initialize the register block for a device */
> @@ -235,9 +235,9 @@ typedef struct cxl_type3_dev {
>       PCIDevice parent_obj;
> 
>       /* Properties */
> -    AddressSpace hostmem_as;
> +    AddressSpace hostmem_as, hostmemv_as;
>       uint64_t size;
> -    HostMemoryBackend *hostmem;
> +    HostMemoryBackend *hostmem, *hostmemv;
>       HostMemoryBackend *lsa;
>       uint64_t sn;
> 
> Then for cxl_setup_memory(), with ct3d->hostmem and/or ct3d->hostmemv
> non-nil, set the respective MemoryRegions:
> 
> +    if (ct3d->hostmem) {
> +            memory_region_set_nonvolatile(mr, true);
> +            memory_region_set_enabled(mr, true);
> +            host_memory_backend_set_mapped(ct3d->hostmem, true);
> +            address_space_init(&ct3d->hostmem_as, mr, name);
> +            ct3d->cxl_dstate.pmem_size = ct3d->hostmem->size;
> +    }
> +    if (ct3d->hostmemv) {
> +            memory_region_set_nonvolatile(mrv, false);
> +            memory_region_set_enabled(mrv, true);
> +            host_memory_backend_set_mapped(ct3d->hostmemv, true);
> +            address_space_init(&ct3d->hostmem_as, mrv, name);
> +            ct3d->cxl_dstate.pmem_size = ct3d->hostmem->size;
> +    }
> 
> For corresponding MB commands, it's mostly IDENTIFY_MEMORY_DEVICE that needs
> updating:
> 
> @@ -281,7 +281,7 @@ static ret_code cmd_identify_memory_device(struct cxl_cmd *cmd,
> 
>       CXLType3Dev *ct3d = container_of(cxl_dstate, CXLType3Dev, cxl_dstate);
>       CXLType3Class *cvc = CXL_TYPE3_DEV_GET_CLASS(ct3d);
> -    uint64_t size = cxl_dstate->pmem_size;
> +    uint64_t size = cxl_dstate->pmem_size + cxl_dstate->mem_size;
> 
>       if (!QEMU_IS_ALIGNED(size, 256 << 20)) {
> 	 return CXL_MBOX_INTERNAL_ERROR;
> @@ -290,11 +290,11 @@ static ret_code cmd_identify_memory_device(struct cxl_cmd *cmd,
>       id = (void *)cmd->payload;
>       memset(id, 0, sizeof(*id));
> 
> -    /* PMEM only */
>       snprintf(id->fw_revision, 0x10, "BWFW VERSION %02d", 0);
> 
>       id->total_capacity = size / (256 << 20);
> -    id->persistent_capacity = size / (256 << 20);
> +    id->persistent_capacity = cxl_dstate->pmem_size / (256 << 20);
> +    id->volatile_capacity = cxl_dstate->mem_size / (256 << 20);
>       id->lsa_size = cvc->get_lsa_size(ct3d);
> 
>       *len = sizeof(*id);
> @@ -312,16 +312,16 @@ static ret_code cmd_ccls_get_partition_info(struct cxl_cmd *cmd,
> 	 uint64_t next_pmem;
>       } QEMU_PACKED *part_info = (void *)cmd->payload;
>       QEMU_BUILD_BUG_ON(sizeof(*part_info) != 0x20);
> -    uint64_t size = cxl_dstate->pmem_size;
> +    uint64_t psize = cxl_dstate->pmem_size;
> +    uint64_t vsize = cxl_dstate->mem_size;
> 
> -    if (!QEMU_IS_ALIGNED(size, 256 << 20)) {
> +    if (!QEMU_IS_ALIGNED(psize + vsize, 256 << 20)) {
> 	 return CXL_MBOX_INTERNAL_ERROR;
>       }
> 
> -    /* PMEM only */
> -    part_info->active_vmem = 0;
> -    part_info->next_vmem = 0;
> -    part_info->active_pmem = size / (256 << 20);
> +    part_info->active_vmem = vsize / (256 << 20);
> +    part_info->next_vmem = part_info->active_vmem;
> +    part_info->active_pmem = psize / (256 << 20);
>       part_info->next_pmem = part_info->active_pmem;
> 
> Then for reads/writes, both cxl_type3_write() and _read() would, after computing the dpa_offset,
> first try the volatile region then upon error attempt the same in the persistent memory - this
> assuming the DPA space is consistent among both types of memory. Ie:
> 
> address_space_read(&ct3d->hostmemv_as, dpa_offset, attrs, data, size)
> or
> address_space_read(&ct3d->hostmem_as, dpa_offset, attrs, data, size)
> 
> ... but then again all this is probably just wishful thinking.

Without looking in detail, it will indeed be something along those lines.
Gets more fiddly if you include partitioning control that Alison was interested
in adding.

Also, we probably need to support multiple HDM decoders.  Also not a huge
complexity to add, but left for now as main focus is to get the base
patch set merged.

So I'm happy to queue stuff up on top of this series and carry it forward
but I don't want to add features to what we try to merge initially.
This set is already huge and hard to review even with what I think is a
minimum set of features to be useful.

Note I'm already carrying a few features on top of this on the gitlab
branch gitlab.com/jic23/qemu (DOE + CDAT and serial numbers) and
have a few other things out of tree for now (SPDM, emulating most
of the PCI Config space controls). 

Jonathan

> 
> Thanks,
> Davidlohr
Jonathan Cameron March 30, 2022, 5:48 p.m. UTC | #4
On Tue, 29 Mar 2022 18:13:59 +0000
Adam Manzanares <a.manzanares@samsung.com> wrote:

> On Fri, Mar 18, 2022 at 03:05:53PM +0000, Jonathan Cameron wrote:
> > From: Ben Widawsky <ben.widawsky@intel.com>
> > 
> > A CXL device is a type of CXL component. Conceptually, a CXL device
> > would be a leaf node in a CXL topology. From an emulation perspective,
> > CXL devices are the most complex and so the actual implementation is
> > reserved for discrete commits.
> > 
> > This new device type is specifically catered towards the eventual
> > implementation of a Type3 CXL.mem device, 8.2.8.5 in the CXL 2.0
> > specification.
> > 
> > Signed-off-by: Ben Widawsky <ben.widawsky@intel.com>
> > Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> > Reviewed-by: Alex Bennée <alex.bennee@linaro.org>

...

> > diff --git a/include/hw/cxl/cxl_device.h b/include/hw/cxl/cxl_device.h
> > new file mode 100644
> > index 0000000000..b2416e45bf
> > --- /dev/null
> > +++ b/include/hw/cxl/cxl_device.h
> > @@ -0,0 +1,165 @@
> > +/*
> > + * QEMU CXL Devices
> > + *
> > + * Copyright (c) 2020 Intel
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2. See the
> > + * COPYING file in the top-level directory.
> > + */
> > +
> > +#ifndef CXL_DEVICE_H
> > +#define CXL_DEVICE_H
> > +
> > +#include "hw/register.h"
> > +
> > +/*
> > + * The following is how a CXL device's MMIO space is laid out. The only
> > + * requirement from the spec is that the capabilities array and the capability
> > + * headers start at offset 0 and are contiguously packed. The headers themselves
> > + * provide offsets to the register fields. For this emulation, registers will
> > + * start at offset 0x80 (m == 0x80). No secondary mailbox is implemented which
> > + * means that n = m + sizeof(mailbox registers) + sizeof(device registers).  
> 
> What is n here, the start offset of the mailbox registers, this question is 
> based on the figure below?

I'll expand on this to say

means that the offset of the start of the mailbox payload (n) is given by
n = m + sizeof....

Which means the diagram below is wrong, as n should align with the top
of the mailbox registers.

> 
> > + *
> > + * This is roughly described in 8.2.8 Figure 138 of the CXL 2.0 spec
I'm going to drop this comment as that figure appears unrelated to me.

> > + *
> > + *                       +---------------------------------+
> > + *                       |                                 |
> > + *                       |    Memory Device Registers      |
> > + *                       |                                 |
> > + * n + PAYLOAD_SIZE_MAX  -----------------------------------
> > + *                  ^    |                                 |
> > + *                  |    |                                 |
> > + *                  |    |                                 |
> > + *                  |    |                                 |
> > + *                  |    |                                 |
> > + *                  |    |         Mailbox Payload         |
> > + *                  |    |                                 |
> > + *                  |    |                                 |
> > + *                  |    |                                 |
> > + *                  |    -----------------------------------
> > + *                  |    |       Mailbox Registers         |
> > + *                  |    |                                 |
> > + *                  n    -----------------------------------
> > + *                  ^    |                                 |
> > + *                  |    |        Device Registers         |
> > + *                  |    |                                 |
> > + *                  m    ---------------------------------->
> > + *                  ^    |  Memory Device Capability Header|
> > + *                  |    -----------------------------------
> > + *                  |    |     Mailbox Capability Header   |
> > + *                  |    -------------- --------------------
> > + *                  |    |     Device Capability Header    |
> > + *                  |    -----------------------------------
> > + *                  |    |                                 |
> > + *                  |    |                                 |
> > + *                  |    |      Device Cap Array[0..n]     |
> > + *                  |    |                                 |
> > + *                  |    |                                 |
> > + *                       |                                 |
> > + *                  0    +---------------------------------+  
> 
> Would it make sense to add CXL cap header register to the diagram?

Too many similar names in the CXL spec. I'm not sure which one you mean,
could you let me have a reference?  If you mean the one that is
at the start of the CXL.cache and CXL.mem registers that whole region
isn't covered by this diagram and might be in a different BAR.
Here we are only dealing with the Memory Device Registers.  I'll
add a statement to the initial comment block to make that clear
as it definitely isn't currently!

> n also 
> seems to be the size of the cap array, but it is also an offset so that could
> be clarified.

Ah. Letter reuse. Good point. Looking more closely, it isn't an array anyway
in the diagram (the array would have to include the Device Capability Header
and Mailbox Capability headers).  Renamed as simply Device Cap Array Register.

> 
> > + *
> > + */
> > +
> > +#define CXL_DEVICE_CAP_HDR1_OFFSET 0x10 /* Figure 138 */
> > +#define CXL_DEVICE_CAP_REG_SIZE 0x10 /* 8.2.8.2 */
> > +#define CXL_DEVICE_CAPS_MAX 4 /* 8.2.8.2.1 + 8.2.8.5 */
> > +
> > +#define CXL_DEVICE_REGISTERS_OFFSET 0x80 /* Read comment above */  
> 
> Is this to plan for future capabilities? If we have CAPS MAX doesn't this 
> allow us to remove the slack space. 
> 
> > +#define CXL_DEVICE_REGISTERS_LENGTH 0x8 /* 8.2.8.3.1 */  
> 
> Should we add status to the name here, or would it get too long?
> 
> > +
> > +#define CXL_MAILBOX_REGISTERS_OFFSET \
> > +    (CXL_DEVICE_REGISTERS_OFFSET + CXL_DEVICE_REGISTERS_LENGTH)
> > +#define CXL_MAILBOX_REGISTERS_SIZE 0x20 /* 8.2.8.4, Figure 139 */
> > +#define CXL_MAILBOX_PAYLOAD_SHIFT 11  
> 
> I see 20 in the spec.

It's an implementation choice between 8 and 20. For now, this code goes
with 11 for no particularly strong reason.

> 
> > +#define CXL_MAILBOX_MAX_PAYLOAD_SIZE (1 << CXL_MAILBOX_PAYLOAD_SHIFT)
> > +#define CXL_MAILBOX_REGISTERS_LENGTH \
> > +    (CXL_MAILBOX_REGISTERS_SIZE + CXL_MAILBOX_MAX_PAYLOAD_SIZE)
> > +
> > +typedef struct cxl_device_state {
> > +    MemoryRegion device_registers;
> > +
> > +    /* mmio for device capabilities array - 8.2.8.2 */
> > +    MemoryRegion device;
> > +    MemoryRegion caps;
> > +
> > +    /* mmio for the mailbox registers 8.2.8.4 */
> > +    MemoryRegion mailbox;
> > +
> > +    /* memory region for persistent memory, HDM */
> > +    uint64_t pmem_size;  
> 
> Can we switch this to mem_size and drop the persistent comment? It is my 
> understanding that HDM is independent of persistence.

Discussed in the other branch of this thread.  Short answer is we don't
support non persistent yet but it's on the todo list.  What exactly
that looks like is to be determined.  One aspect of that is there
isn't currently a software stack to test volatile memory.

> 
> > +} CXLDeviceState;
> > +
> > +/* Initialize the register block for a device */
> > +void cxl_device_register_block_init(Object *obj, CXLDeviceState *dev);
> > +
> > +/* Set up default values for the register block */
> > +void cxl_device_register_init_common(CXLDeviceState *dev);
> > +
> > +/*
> > + * CXL 2.0 - 8.2.8.1 including errata F4
> > + * Documented as a 128 bit register, but 64 bit accesses and the second
> > + * 64 bits are currently reserved.
> > + */
> > +REG64(CXL_DEV_CAP_ARRAY, 0) /* Documented as 128 bit register but 64 byte accesses */
> > +    FIELD(CXL_DEV_CAP_ARRAY, CAP_ID, 0, 16)
> > +    FIELD(CXL_DEV_CAP_ARRAY, CAP_VERSION, 16, 8)
> > +    FIELD(CXL_DEV_CAP_ARRAY, CAP_COUNT, 32, 16)
> > +
> > +/*
> > + * Helper macro to initialize capability headers for CXL devices.
> > + *
> > + * In the 8.2.8.2, this is listed as a 128b register, but in 8.2.8, it says:
> > + * > No registers defined in Section 8.2.8 are larger than 64-bits wide so that
> > + * > is the maximum access size allowed for these registers. If this rule is not
> > + * > followed, the behavior is undefined
> > + *
> > + * CXL 2.0 Errata F4 states futher that the layouts in the specification are
> > + * shown as greater than 128 bits, but implementations are expected to
> > + * use any size of access up to 64 bits.
> > + *
> > + * Here we've chosen to make it 4 dwords. The spec allows any pow2 multiple
> > + * access to be used for a register up to 64 bits.
> > + */
> > +#define CXL_DEVICE_CAPABILITY_HEADER_REGISTER(n, offset)  \
> > +    REG32(CXL_DEV_##n##_CAP_HDR0, offset)                 \
> > +        FIELD(CXL_DEV_##n##_CAP_HDR0, CAP_ID, 0, 16)      \
> > +        FIELD(CXL_DEV_##n##_CAP_HDR0, CAP_VERSION, 16, 8) \
> > +    REG32(CXL_DEV_##n##_CAP_HDR1, offset + 4)             \
> > +        FIELD(CXL_DEV_##n##_CAP_HDR1, CAP_OFFSET, 0, 32)  \
> > +    REG32(CXL_DEV_##n##_CAP_HDR2, offset + 8)             \
> > +        FIELD(CXL_DEV_##n##_CAP_HDR2, CAP_LENGTH, 0, 32)
> > +
> > +CXL_DEVICE_CAPABILITY_HEADER_REGISTER(DEVICE, CXL_DEVICE_CAP_HDR1_OFFSET)
> > +CXL_DEVICE_CAPABILITY_HEADER_REGISTER(MAILBOX, CXL_DEVICE_CAP_HDR1_OFFSET + \
> > +                                               CXL_DEVICE_CAP_REG_SIZE)
> > +  
> 
> Fig139 for the following registers.
Added ref

> 
> 8.2.8.4.3
Good idea. Added all these references.

> > +REG32(CXL_DEV_MAILBOX_CAP, 0)
> > +    FIELD(CXL_DEV_MAILBOX_CAP, PAYLOAD_SIZE, 0, 5)
> > +    FIELD(CXL_DEV_MAILBOX_CAP, INT_CAP, 5, 1)
> > +    FIELD(CXL_DEV_MAILBOX_CAP, BG_INT_CAP, 6, 1)
> > +    FIELD(CXL_DEV_MAILBOX_CAP, MSI_N, 7, 4)
> > +  
> 
> 8.2.8.4.4 
> > +REG32(CXL_DEV_MAILBOX_CTRL, 4)
> > +    FIELD(CXL_DEV_MAILBOX_CTRL, DOORBELL, 0, 1)
> > +    FIELD(CXL_DEV_MAILBOX_CTRL, INT_EN, 1, 1)
> > +    FIELD(CXL_DEV_MAILBOX_CTRL, BG_INT_EN, 2, 1)
> > +  
> 
> 8.2.8.4.5 + 8.2.9
> > +REG64(CXL_DEV_MAILBOX_CMD, 8)
> > +    FIELD(CXL_DEV_MAILBOX_CMD, COMMAND, 0, 8)
> > +    FIELD(CXL_DEV_MAILBOX_CMD, COMMAND_SET, 8, 8)
> > +    FIELD(CXL_DEV_MAILBOX_CMD, LENGTH, 16, 20)
> > +  
> 
> 8.2.8.4.6
> > +REG64(CXL_DEV_MAILBOX_STS, 0x10)
> > +    FIELD(CXL_DEV_MAILBOX_STS, BG_OP, 0, 1)
> > +    FIELD(CXL_DEV_MAILBOX_STS, ERRNO, 32, 16)
> > +    FIELD(CXL_DEV_MAILBOX_STS, VENDOR_ERRNO, 48, 16)
> > +  
> 
> 8.2.8.4.7
> > +REG64(CXL_DEV_BG_CMD_STS, 0x18)
> > +    FIELD(CXL_DEV_BG_CMD_STS, BG, 0, 16)  
> 
> Should we call this OP since it is implied that we are BG given the register?
Sure. It's a better name than BG.
> 
> > +    FIELD(CXL_DEV_BG_CMD_STS, DONE, 16, 7)  
> 
> NUM_DONE? since this is a percentage.
Let's be verbose as NUM_DONE still seems confusing to me.
PERCENTAGE_COMP

I hadn't really noticed these names as I don't think any of
them are used yet.

> 
> > +    FIELD(CXL_DEV_BG_CMD_STS, ERRNO, 32, 16)  
> 
> Isn't this a RET_CODE since it is only valid if previous field is 100%

Changed

> 
> > +    FIELD(CXL_DEV_BG_CMD_STS, VENDOR_ERRNO, 48, 16)  
> 
> VENDOR_RET_CODE since the same rule for the previous field applies here.
Changed
> 
> > +
> > +REG32(CXL_DEV_CMD_PAYLOAD, 0x20)
> > +
> > +#endif
> > -- 
> > 2.32.0
> > 
> >   
> 
> +cc Dave, Klaus, Tong
> Other than the minor issues raised.
> 
> Looks good.
> 
> Reviewed by: Adam Manzanares <a.manzanares@samsung.com>

Btw I haven't accepted all changes, but have been picking up
your RB.  Shout if that's not fine with you.

Thanks.

Jonathan
Adam Manzanares March 31, 2022, 9:42 p.m. UTC | #5
On Wed, Mar 30, 2022 at 01:15:58PM +0100, Jonathan Cameron wrote:
> On Tue, 29 Mar 2022 12:53:51 -0700
> Davidlohr Bueso <dave@stgolabs.net> wrote:
> 
> > On Tue, 29 Mar 2022, Adam Manzanares wrote:
> > >> +typedef struct cxl_device_state {
> > >> +    MemoryRegion device_registers;
> > >> +
> > >> +    /* mmio for device capabilities array - 8.2.8.2 */
> > >> +    MemoryRegion device;
> > >> +    MemoryRegion caps;
> > >> +
> > >> +    /* mmio for the mailbox registers 8.2.8.4 */
> > >> +    MemoryRegion mailbox;
> > >> +
> > >> +    /* memory region for persistent memory, HDM */
> > >> +    uint64_t pmem_size;  
> > >
> > >Can we switch this to mem_size and drop the persistent comment? It is my
> > >understanding that HDM is independent of persistence.  
> > 
> > Agreed, but ideally both volatile and persistent capacities would have been
> > supported in this patchset. I'm also probably missing specific reasons as to
> > why this isn't the case.
> 
> Whilst it doesn't add a huge amount of complexity it does add some
> and the software paths in Linux we were developing this for are pmem focused.
> Hence volatile is on the todo list rather than in this first patch set.
> Not sensible to aim for feature complete in one go.

Makes complete sense. We can help with the Linux development for the volatile 
side. I will add a couple of folks on cc. In addition, we would like to help
the CXL ecosystem in general so I anticipate we will have more reviews and 
patches for CXL in general.

> 
> > 
> > Looking at it briefly could it be just a matter of adding to cxl_type3_dev
> > a new hostmem along with its AddressSpace for the volatile? If so, I'm
> > thinking something along these lines:
> > 
> > @@ -123,8 +123,8 @@ typedef struct cxl_device_state {
> > 	 uint64_t host_set;
> >       } timestamp;
> > 
> > -    /* memory region for persistent memory, HDM */
> > -    uint64_t pmem_size;
> > +    /* memory region for persistent and volatile memory, HDM */
> > +    uint64_t pmem_size, mem_size;
> >   } CXLDeviceState;
> > 
> >   /* Initialize the register block for a device */
> > @@ -235,9 +235,9 @@ typedef struct cxl_type3_dev {
> >       PCIDevice parent_obj;
> > 
> >       /* Properties */
> > -    AddressSpace hostmem_as;
> > +    AddressSpace hostmem_as, hostmemv_as;
> >       uint64_t size;
> > -    HostMemoryBackend *hostmem;
> > +    HostMemoryBackend *hostmem, *hostmemv;
> >       HostMemoryBackend *lsa;
> >       uint64_t sn;
> > 
> > Then for cxl_setup_memory(), with ct3d->hostmem and/or ct3d->hostmemv
> > non-nil, set the respective MemoryRegions:
> > 
> > +    if (ct3d->hostmem) {
> > +            memory_region_set_nonvolatile(mr, true);
> > +            memory_region_set_enabled(mr, true);
> > +            host_memory_backend_set_mapped(ct3d->hostmem, true);
> > +            address_space_init(&ct3d->hostmem_as, mr, name);
> > +            ct3d->cxl_dstate.pmem_size = ct3d->hostmem->size;
> > +    }
> > +    if (ct3d->hostmemv) {
> > +            memory_region_set_nonvolatile(mrv, false);
> > +            memory_region_set_enabled(mrv, true);
> > +            host_memory_backend_set_mapped(ct3d->hostmemv, true);
> > +            address_space_init(&ct3d->hostmem_as, mrv, name);
> > +            ct3d->cxl_dstate.pmem_size = ct3d->hostmem->size;
> > +    }
> > 
> > For corresponding MB commands, it's mostly IDENTIFY_MEMORY_DEVICE that needs
> > updating:
> > 
> > @@ -281,7 +281,7 @@ static ret_code cmd_identify_memory_device(struct cxl_cmd *cmd,
> > 
> >       CXLType3Dev *ct3d = container_of(cxl_dstate, CXLType3Dev, cxl_dstate);
> >       CXLType3Class *cvc = CXL_TYPE3_DEV_GET_CLASS(ct3d);
> > -    uint64_t size = cxl_dstate->pmem_size;
> > +    uint64_t size = cxl_dstate->pmem_size + cxl_dstate->mem_size;
> > 
> >       if (!QEMU_IS_ALIGNED(size, 256 << 20)) {
> > 	 return CXL_MBOX_INTERNAL_ERROR;
> > @@ -290,11 +290,11 @@ static ret_code cmd_identify_memory_device(struct cxl_cmd *cmd,
> >       id = (void *)cmd->payload;
> >       memset(id, 0, sizeof(*id));
> > 
> > -    /* PMEM only */
> >       snprintf(id->fw_revision, 0x10, "BWFW VERSION %02d", 0);
> > 
> >       id->total_capacity = size / (256 << 20);
> > -    id->persistent_capacity = size / (256 << 20);
> > +    id->persistent_capacity = cxl_dstate->pmem_size / (256 << 20);
> > +    id->volatile_capacity = cxl_dstate->mem_size / (256 << 20);
> >       id->lsa_size = cvc->get_lsa_size(ct3d);
> > 
> >       *len = sizeof(*id);
> > @@ -312,16 +312,16 @@ static ret_code cmd_ccls_get_partition_info(struct cxl_cmd *cmd,
> > 	 uint64_t next_pmem;
> >       } QEMU_PACKED *part_info = (void *)cmd->payload;
> >       QEMU_BUILD_BUG_ON(sizeof(*part_info) != 0x20);
> > -    uint64_t size = cxl_dstate->pmem_size;
> > +    uint64_t psize = cxl_dstate->pmem_size;
> > +    uint64_t vsize = cxl_dstate->mem_size;
> > 
> > -    if (!QEMU_IS_ALIGNED(size, 256 << 20)) {
> > +    if (!QEMU_IS_ALIGNED(psize + vsize, 256 << 20)) {
> > 	 return CXL_MBOX_INTERNAL_ERROR;
> >       }
> > 
> > -    /* PMEM only */
> > -    part_info->active_vmem = 0;
> > -    part_info->next_vmem = 0;
> > -    part_info->active_pmem = size / (256 << 20);
> > +    part_info->active_vmem = vsize / (256 << 20);
> > +    part_info->next_vmem = part_info->active_vmem;
> > +    part_info->active_pmem = psize / (256 << 20);
> >       part_info->next_pmem = part_info->active_pmem;
> > 
> > Then for reads/writes, both cxl_type3_write() and _read() would, after computing the dpa_offset,
> > first try the volatile region then upon error attempt the same in the persistent memory - this
> > assuming the DPA space is consistent among both types of memory. Ie:
> > 
> > address_space_read(&ct3d->hostmemv_as, dpa_offset, attrs, data, size)
> > or
> > address_space_read(&ct3d->hostmem_as, dpa_offset, attrs, data, size)
> > 
> > ... but then again all this is probably just wishful thinking.
> 
> Without looking in detail, will indeed be something along those lines.
> Gets more fiddly if you include partitioning control that Alison was interested
> in adding.
> 
> Also, we probably need to support multiple HDM decoders.  Also not a huge
> complexity to add, but left for now as main focus is to get the base
> patch set merged.
> 
> So I'm happy to queue stuff up on top of this series and carry it forward
> but I don't want to add features to what we try to merge initially.
> This set is already huge and hard to review even with what I think is a
> minimum set of features to be useful.
> 
> Note I'm already carrying a few features on top of this on the gitlab
> branch gitlab.com/jic23/qemu (DOE + CDAT and serial numbers) and
> have a few other things out of tree for now (SPDM, emulating most
> of the PCI Config space controls). 
> 

Thanks for the updates. Do you have any suggestions on how to coordinate 
efforts? Ideally we can have a list of features that need to be developed and
some names of people that will lead the work. 

> Jonathan
> 
> > 
> > Thanks,
> > Davidlohr
>
Adam Manzanares March 31, 2022, 10:13 p.m. UTC | #6
On Wed, Mar 30, 2022 at 06:48:48PM +0100, Jonathan Cameron wrote:
> On Tue, 29 Mar 2022 18:13:59 +0000
> Adam Manzanares <a.manzanares@samsung.com> wrote:
> 
> > On Fri, Mar 18, 2022 at 03:05:53PM +0000, Jonathan Cameron wrote:
> > > From: Ben Widawsky <ben.widawsky@intel.com>
> > > 
> > > A CXL device is a type of CXL component. Conceptually, a CXL device
> > > would be a leaf node in a CXL topology. From an emulation perspective,
> > > CXL devices are the most complex and so the actual implementation is
> > > reserved for discrete commits.
> > > 
> > > This new device type is specifically catered towards the eventual
> > > implementation of a Type3 CXL.mem device, 8.2.8.5 in the CXL 2.0
> > > specification.
> > > 
> > > Signed-off-by: Ben Widawsky <ben.widawsky@intel.com>
> > > Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> > > Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
> 
> ...
> 
> > > diff --git a/include/hw/cxl/cxl_device.h b/include/hw/cxl/cxl_device.h
> > > new file mode 100644
> > > index 0000000000..b2416e45bf
> > > --- /dev/null
> > > +++ b/include/hw/cxl/cxl_device.h
> > > @@ -0,0 +1,165 @@
> > > +/*
> > > + * QEMU CXL Devices
> > > + *
> > > + * Copyright (c) 2020 Intel
> > > + *
> > > + * This work is licensed under the terms of the GNU GPL, version 2. See the
> > > + * COPYING file in the top-level directory.
> > > + */
> > > +
> > > +#ifndef CXL_DEVICE_H
> > > +#define CXL_DEVICE_H
> > > +
> > > +#include "hw/register.h"
> > > +
> > > +/*
> > > + * The following is how a CXL device's MMIO space is laid out. The only
> > > + * requirement from the spec is that the capabilities array and the capability
> > > + * headers start at offset 0 and are contiguously packed. The headers themselves
> > > + * provide offsets to the register fields. For this emulation, registers will
> > > + * start at offset 0x80 (m == 0x80). No secondary mailbox is implemented which
> > > + * means that n = m + sizeof(mailbox registers) + sizeof(device registers).  
> > 
> > What is n here, the start offset of the mailbox registers, this question is 
> > based on the figure below?
> 
> I'll expand on this to say
> 
> means that the offset of the start of the mailbox payload (n) is given by
> n = m + sizeof....
> 
> Which means the diagram below is wrong as should align with top
> of mailbox registers.
> 
> > 
> > > + *
> > > + * This is roughly described in 8.2.8 Figure 138 of the CXL 2.0 spec
> I'm going to drop this comment as that figure appears unrelated to me.
> 
> > > + *
> > > + *                       +---------------------------------+
> > > + *                       |                                 |
> > > + *                       |    Memory Device Registers      |
> > > + *                       |                                 |
> > > + * n + PAYLOAD_SIZE_MAX  -----------------------------------
> > > + *                  ^    |                                 |
> > > + *                  |    |                                 |
> > > + *                  |    |                                 |
> > > + *                  |    |                                 |
> > > + *                  |    |                                 |
> > > + *                  |    |         Mailbox Payload         |
> > > + *                  |    |                                 |
> > > + *                  |    |                                 |
> > > + *                  |    |                                 |
> > > + *                  |    -----------------------------------
> > > + *                  |    |       Mailbox Registers         |
> > > + *                  |    |                                 |
> > > + *                  n    -----------------------------------
> > > + *                  ^    |                                 |
> > > + *                  |    |        Device Registers         |
> > > + *                  |    |                                 |
> > > + *                  m    ---------------------------------->
> > > + *                  ^    |  Memory Device Capability Header|
> > > + *                  |    -----------------------------------
> > > + *                  |    |     Mailbox Capability Header   |
> > > + *                  |    -------------- --------------------
> > > + *                  |    |     Device Capability Header    |
> > > + *                  |    -----------------------------------
> > > + *                  |    |                                 |
> > > + *                  |    |                                 |
> > > + *                  |    |      Device Cap Array[0..n]     |
> > > + *                  |    |                                 |
> > > + *                  |    |                                 |
> > > + *                       |                                 |
> > > + *                  0    +---------------------------------+  
> > 
> > Would it make sense to add CXL cap header register to the diagram?
> 
> Too many similar names in the CXL spec. I'm not sure which one you mean,
> could you let me have a reference?  If you mean the one that is
> at the start of the CXL.cache and CXL.mem registers that whole region
> isn't covered by this diagram and might be in a different BAR.
> Here we are only dealing with the Memory Device Registers.  I'll
> add statement to the initial comment block to make that clear
> as it definitely isn't currently!


I was thinking 0 in your figure is the device capabilities array register, 
which tells us how many capabilities are in the array. This would be 
8.2.8.1. After that comes 8.2.8.2 with n capability header registers which 
point to the device registers.

> 
> > n also 
> > seems to be the size of the cap array, but it is also an offset so that could
> > be clarified.
> 
> Ah. Letter reuse. good point. Looking more closely it isn't an array anyway
> in the diagram (the array would have to include the Device Capability Header
> and Mailbox Capability headers.  Renamed as simply Device Cap Array Register
> 
> > 
> > > + *
> > > + */
> > > +
> > > +#define CXL_DEVICE_CAP_HDR1_OFFSET 0x10 /* Figure 138 */
> > > +#define CXL_DEVICE_CAP_REG_SIZE 0x10 /* 8.2.8.2 */
> > > +#define CXL_DEVICE_CAPS_MAX 4 /* 8.2.8.2.1 + 8.2.8.5 */
> > > +
> > > +#define CXL_DEVICE_REGISTERS_OFFSET 0x80 /* Read comment above */  
> > 
> > Is this to plan for future capabilities? If we have CAPS MAX doesn't this 
> > allow us to remove the slack space. 
> > 
> > > +#define CXL_DEVICE_REGISTERS_LENGTH 0x8 /* 8.2.8.3.1 */  
> > 
> > Should we add status to the name here, or would it get too long?
> > 
> > > +
> > > +#define CXL_MAILBOX_REGISTERS_OFFSET \
> > > +    (CXL_DEVICE_REGISTERS_OFFSET + CXL_DEVICE_REGISTERS_LENGTH)
> > > +#define CXL_MAILBOX_REGISTERS_SIZE 0x20 /* 8.2.8.4, Figure 139 */
> > > +#define CXL_MAILBOX_PAYLOAD_SHIFT 11  
> > 
> > I see 20 in the spec.
> 
> It's an implementation choice between 8 and 20. For now, this code goes
> with 11 for no particularly strong reason.

Got it.

> 
> > 
> > > +#define CXL_MAILBOX_MAX_PAYLOAD_SIZE (1 << CXL_MAILBOX_PAYLOAD_SHIFT)
> > > +#define CXL_MAILBOX_REGISTERS_LENGTH \
> > > +    (CXL_MAILBOX_REGISTERS_SIZE + CXL_MAILBOX_MAX_PAYLOAD_SIZE)
> > > +
> > > +typedef struct cxl_device_state {
> > > +    MemoryRegion device_registers;
> > > +
> > > +    /* mmio for device capabilities array - 8.2.8.2 */
> > > +    MemoryRegion device;
> > > +    MemoryRegion caps;
> > > +
> > > +    /* mmio for the mailbox registers 8.2.8.4 */
> > > +    MemoryRegion mailbox;
> > > +
> > > +    /* memory region for persistent memory, HDM */
> > > +    uint64_t pmem_size;  
> > 
> > Can we switch this to mem_size and drop the persistent comment? It is my 
> > understanding that HDM is independent of persistence.
> 
> Discussed in the other branch of this thread.  Short answer is we don't
> support non persistent yet but it's on the todo list.  What exactly
> that looks like is to be determined.  One aspect of that is there
> isn't currently a software stack to test volatile memory.

If you can elaborate more here on what is needed to test the volatile memory 
stack we may be able to help out.

> 
> > 
> > > +} CXLDeviceState;
> > > +
> > > +/* Initialize the register block for a device */
> > > +void cxl_device_register_block_init(Object *obj, CXLDeviceState *dev);
> > > +
> > > +/* Set up default values for the register block */
> > > +void cxl_device_register_init_common(CXLDeviceState *dev);
> > > +
> > > +/*
> > > + * CXL 2.0 - 8.2.8.1 including errata F4
> > > + * Documented as a 128 bit register, but 64 bit accesses and the second
> > > + * 64 bits are currently reserved.
> > > + */
> > > > +REG64(CXL_DEV_CAP_ARRAY, 0) /* Documented as 128 bit register but 64 bit accesses */
> > > +    FIELD(CXL_DEV_CAP_ARRAY, CAP_ID, 0, 16)
> > > +    FIELD(CXL_DEV_CAP_ARRAY, CAP_VERSION, 16, 8)
> > > +    FIELD(CXL_DEV_CAP_ARRAY, CAP_COUNT, 32, 16)
> > > +
> > > +/*
> > > + * Helper macro to initialize capability headers for CXL devices.
> > > + *
> > > + * In the 8.2.8.2, this is listed as a 128b register, but in 8.2.8, it says:
> > > + * > No registers defined in Section 8.2.8 are larger than 64-bits wide so that
> > > + * > is the maximum access size allowed for these registers. If this rule is not
> > > + * > followed, the behavior is undefined
> > > + *
> > > > + * CXL 2.0 Errata F4 states further that the layouts in the specification are
> > > + * shown as greater than 128 bits, but implementations are expected to
> > > + * use any size of access up to 64 bits.
> > > + *
> > > + * Here we've chosen to make it 4 dwords. The spec allows any pow2 multiple
> > > + * access to be used for a register up to 64 bits.
> > > + */
> > > +#define CXL_DEVICE_CAPABILITY_HEADER_REGISTER(n, offset)  \
> > > +    REG32(CXL_DEV_##n##_CAP_HDR0, offset)                 \
> > > +        FIELD(CXL_DEV_##n##_CAP_HDR0, CAP_ID, 0, 16)      \
> > > +        FIELD(CXL_DEV_##n##_CAP_HDR0, CAP_VERSION, 16, 8) \
> > > +    REG32(CXL_DEV_##n##_CAP_HDR1, offset + 4)             \
> > > +        FIELD(CXL_DEV_##n##_CAP_HDR1, CAP_OFFSET, 0, 32)  \
> > > +    REG32(CXL_DEV_##n##_CAP_HDR2, offset + 8)             \
> > > +        FIELD(CXL_DEV_##n##_CAP_HDR2, CAP_LENGTH, 0, 32)
> > > +
> > > +CXL_DEVICE_CAPABILITY_HEADER_REGISTER(DEVICE, CXL_DEVICE_CAP_HDR1_OFFSET)
> > > +CXL_DEVICE_CAPABILITY_HEADER_REGISTER(MAILBOX, CXL_DEVICE_CAP_HDR1_OFFSET + \
> > > +                                               CXL_DEVICE_CAP_REG_SIZE)
> > > +  
> > 
> > Fig139 for the following registers.
> Added ref
> 
> > 
> > 8.2.8.4.3
> Good idea. Added all these references.
> 
> > > +REG32(CXL_DEV_MAILBOX_CAP, 0)
> > > +    FIELD(CXL_DEV_MAILBOX_CAP, PAYLOAD_SIZE, 0, 5)
> > > +    FIELD(CXL_DEV_MAILBOX_CAP, INT_CAP, 5, 1)
> > > +    FIELD(CXL_DEV_MAILBOX_CAP, BG_INT_CAP, 6, 1)
> > > +    FIELD(CXL_DEV_MAILBOX_CAP, MSI_N, 7, 4)
> > > +  
> > 
> > 8.2.8.4.4 
> > > +REG32(CXL_DEV_MAILBOX_CTRL, 4)
> > > +    FIELD(CXL_DEV_MAILBOX_CTRL, DOORBELL, 0, 1)
> > > +    FIELD(CXL_DEV_MAILBOX_CTRL, INT_EN, 1, 1)
> > > +    FIELD(CXL_DEV_MAILBOX_CTRL, BG_INT_EN, 2, 1)
> > > +  
> > 
> > 8.2.8.4.5 + 8.2.9
> > > +REG64(CXL_DEV_MAILBOX_CMD, 8)
> > > +    FIELD(CXL_DEV_MAILBOX_CMD, COMMAND, 0, 8)
> > > +    FIELD(CXL_DEV_MAILBOX_CMD, COMMAND_SET, 8, 8)
> > > +    FIELD(CXL_DEV_MAILBOX_CMD, LENGTH, 16, 20)
> > > +  
> > 
> > 8.2.8.4.6
> > > +REG64(CXL_DEV_MAILBOX_STS, 0x10)
> > > +    FIELD(CXL_DEV_MAILBOX_STS, BG_OP, 0, 1)
> > > +    FIELD(CXL_DEV_MAILBOX_STS, ERRNO, 32, 16)
> > > +    FIELD(CXL_DEV_MAILBOX_STS, VENDOR_ERRNO, 48, 16)
> > > +  
> > 
> > 8.2.8.4.7
> > > +REG64(CXL_DEV_BG_CMD_STS, 0x18)
> > > +    FIELD(CXL_DEV_BG_CMD_STS, BG, 0, 16)  
> > 
> > Should we call this OP since it is implied that we are BG given the register?
> Sure. It's a better name than BG.
> > 
> > > +    FIELD(CXL_DEV_BG_CMD_STS, DONE, 16, 7)  
> > 
> > NUM_DONE? since this is a percentage.
> Let's be verbose as NUM_DONE still seems confusing to me.
> PERCENTAGE_COMP

Even better.

> 
> I hadn't really noticed these names as I don't think any of
> them are used yet.
> 
> > 
> > > +    FIELD(CXL_DEV_BG_CMD_STS, ERRNO, 32, 16)  
> > 
> > Isn't this a RET_CODE since it is only valid if previous field is 100%
> 
> Changed
> 
> > 
> > > +    FIELD(CXL_DEV_BG_CMD_STS, VENDOR_ERRNO, 48, 16)  
> > 
> > VENDOR_RET_CODE since the same rule for the previous field applies here.
> Changed
> > 
> > > +
> > > +REG32(CXL_DEV_CMD_PAYLOAD, 0x20)
> > > +
> > > +#endif
> > > -- 
> > > 2.32.0
> > > 
> > >   
> > 
> > +cc Dave, Klaus, Tong
> > Other than the minor issues raised.
> > 
> > Looks good.
> > 
> > Reviewed by: Adam Manzanares <a.manzanares@samsung.com>
> 
> Btw I haven't accepted all changes, but have been picking up
> your RB.  Shout if that's not fine with you.

Definitely fine with me and was my intention. Let us know how we can help move
the work forward. I am kick starting reviewing and will try to bring others in. 

> 
> Thanks.
> 
> Jonathan
>
Jonathan Cameron April 1, 2022, 1:30 p.m. UTC | #7
On Thu, 31 Mar 2022 22:13:20 +0000
Adam Manzanares <a.manzanares@samsung.com> wrote:

> On Wed, Mar 30, 2022 at 06:48:48PM +0100, Jonathan Cameron wrote:
> > On Tue, 29 Mar 2022 18:13:59 +0000
> > Adam Manzanares <a.manzanares@samsung.com> wrote:
> >   
> > > On Fri, Mar 18, 2022 at 03:05:53PM +0000, Jonathan Cameron wrote:  
> > > > From: Ben Widawsky <ben.widawsky@intel.com>
> > > > 
> > > > A CXL device is a type of CXL component. Conceptually, a CXL device
> > > > would be a leaf node in a CXL topology. From an emulation perspective,
> > > > CXL devices are the most complex and so the actual implementation is
> > > > reserved for discrete commits.
> > > > 
> > > > This new device type is specifically catered towards the eventual
> > > > implementation of a Type3 CXL.mem device, 8.2.8.5 in the CXL 2.0
> > > > specification.
> > > > 
> > > > Signed-off-by: Ben Widawsky <ben.widawsky@intel.com>
> > > > Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> > > > Reviewed-by: Alex Bennée <alex.bennee@linaro.org>  
> > 
> > ...
> >   
> > > > diff --git a/include/hw/cxl/cxl_device.h b/include/hw/cxl/cxl_device.h
> > > > new file mode 100644
> > > > index 0000000000..b2416e45bf
> > > > --- /dev/null
> > > > +++ b/include/hw/cxl/cxl_device.h
> > > > @@ -0,0 +1,165 @@
> > > > +/*
> > > > + * QEMU CXL Devices
> > > > + *
> > > > + * Copyright (c) 2020 Intel
> > > > + *
> > > > + * This work is licensed under the terms of the GNU GPL, version 2. See the
> > > > + * COPYING file in the top-level directory.
> > > > + */
> > > > +
> > > > +#ifndef CXL_DEVICE_H
> > > > +#define CXL_DEVICE_H
> > > > +
> > > > +#include "hw/register.h"
> > > > +
> > > > +/*
> > > > + * The following is how a CXL device's MMIO space is laid out. The only
> > > > + * requirement from the spec is that the capabilities array and the capability
> > > > + * headers start at offset 0 and are contiguously packed. The headers themselves
> > > > + * provide offsets to the register fields. For this emulation, registers will
> > > > + * start at offset 0x80 (m == 0x80). No secondary mailbox is implemented which
> > > > + * means that n = m + sizeof(mailbox registers) + sizeof(device registers).    
> > > 
> > > What is n here, the start offset of the mailbox registers, this question is 
> > > based on the figure below?  
> > 
> > I'll expand on this to say
> > 
> > means that the offset of the start of the mailbox payload (n) is given by
> > n = m + sizeof....
> > 
> > Which means the diagram below is wrong as should align with top
> > of mailbox registers.
> >   
> > >   
> > > > + *
> > > > + * This is roughly described in 8.2.8 Figure 138 of the CXL 2.0 spec  
> > I'm going drop this comment as that figure appears unrelated to me.
> >   
> > > > + *
> > > > + *                       +---------------------------------+
> > > > + *                       |                                 |
> > > > + *                       |    Memory Device Registers      |
> > > > + *                       |                                 |
> > > > + * n + PAYLOAD_SIZE_MAX  -----------------------------------
> > > > + *                  ^    |                                 |
> > > > + *                  |    |                                 |
> > > > + *                  |    |                                 |
> > > > + *                  |    |                                 |
> > > > + *                  |    |                                 |
> > > > + *                  |    |         Mailbox Payload         |
> > > > + *                  |    |                                 |
> > > > + *                  |    |                                 |
> > > > + *                  |    |                                 |
> > > > + *                  |    -----------------------------------
> > > > + *                  |    |       Mailbox Registers         |
> > > > + *                  |    |                                 |
> > > > + *                  n    -----------------------------------
> > > > + *                  ^    |                                 |
> > > > + *                  |    |        Device Registers         |
> > > > + *                  |    |                                 |
> > > > + *                  m    ---------------------------------->
> > > > + *                  ^    |  Memory Device Capability Header|
> > > > + *                  |    -----------------------------------
> > > > + *                  |    |     Mailbox Capability Header   |
> > > > + *                  |    -------------- --------------------
> > > > + *                  |    |     Device Capability Header    |
> > > > + *                  |    -----------------------------------
> > > > + *                  |    |                                 |
> > > > + *                  |    |                                 |
> > > > + *                  |    |      Device Cap Array[0..n]     |
> > > > + *                  |    |                                 |
> > > > + *                  |    |                                 |
> > > > + *                       |                                 |
> > > > + *                  0    +---------------------------------+    
> > > 
> > > Would it make sense to add CXL cap header register to the diagram?  
> > 
> > Too many similar names in the CXL spec. I'm not sure which one you mean,
> > could you let me have a reference?  If you mean the one that is
> > at the start of the CXL.cache and CXL.mem registers that whole region
> > isn't covered by this diagram and might be in a different BAR.
> > Here we are only dealing with the Memory Device Registers.  I'll
> > add statement to the initial comment block to make that clear
> > as it definitely isn't currently!  
> 
> 
> I was thinking 0 in your figure is the device capabilities array register, 
> which tells us how many capabilities are in the array. This would be 
> 8.2.8.1. After that comes 8.2.8.2 with n capability header registers which 
> point to the device registers.

Got it.  See below.

> 
> >   
> > > n also 
> > > seems to be the size of the cap array, but it is also an offset so that could
> > > be clarified.  
> > 
> > Ah. Letter reuse. good point. Looking more closely it isn't an array anyway
> > in the diagram (the array would have to include the Device Capability Header
> > and Mailbox Capability headers.  Renamed as simply Device Cap Array Register

As mentioned here, the array is misleading anyway because we have the
actual entries listed directly above it rather than 'inside' the array.
Hence the change described above.

> >   
> > >   
> > > > + *
> > > > + */
> > > > +
> > > > +#define CXL_DEVICE_CAP_HDR1_OFFSET 0x10 /* Figure 138 */
> > > > +#define CXL_DEVICE_CAP_REG_SIZE 0x10 /* 8.2.8.2 */
> > > > +#define CXL_DEVICE_CAPS_MAX 4 /* 8.2.8.2.1 + 8.2.8.5 */
> > > > +
> > > > +#define CXL_DEVICE_REGISTERS_OFFSET 0x80 /* Read comment above */    
> > > 
> > > Is this to plan for future capabilities? If we have CAPS MAX doesn't this 
> > > allow us to remove the slack space. 
I missed replying to this before.

So far CAPS MAX covers everything in the spec. (room for secondary mailbox
+ the 3 we have implemented). 
We don't support migration etc yet (and I'm not sure we ever will)
anyway so I'm not hugely bothered about backwards compatibility.
Hence we can just move things if needed later.

> > >   
> > > > +#define CXL_DEVICE_REGISTERS_LENGTH 0x8 /* 8.2.8.3.1 */    
> > > 
> > > Should we add status to the name here, or would it get too long?
> > >   
> > > > +
> > > > +#define CXL_MAILBOX_REGISTERS_OFFSET \
> > > > +    (CXL_DEVICE_REGISTERS_OFFSET + CXL_DEVICE_REGISTERS_LENGTH)
> > > > +#define CXL_MAILBOX_REGISTERS_SIZE 0x20 /* 8.2.8.4, Figure 139 */
> > > > +#define CXL_MAILBOX_PAYLOAD_SHIFT 11    
> > > 
> > > I see 20 in the spec.  
> > 
> > It's an implementation choice between 8 and 20. For now, this code goes
> > with 11 for no particularly strong reason.  
> 
> Got it.
> 
> >   
> > >   
> > > > +#define CXL_MAILBOX_MAX_PAYLOAD_SIZE (1 << CXL_MAILBOX_PAYLOAD_SHIFT)
> > > > +#define CXL_MAILBOX_REGISTERS_LENGTH \
> > > > +    (CXL_MAILBOX_REGISTERS_SIZE + CXL_MAILBOX_MAX_PAYLOAD_SIZE)
> > > > +
> > > > +typedef struct cxl_device_state {
> > > > +    MemoryRegion device_registers;
> > > > +
> > > > +    /* mmio for device capabilities array - 8.2.8.2 */
> > > > +    MemoryRegion device;
> > > > +    MemoryRegion caps;
> > > > +
> > > > +    /* mmio for the mailbox registers 8.2.8.4 */
> > > > +    MemoryRegion mailbox;
> > > > +
> > > > +    /* memory region for persistent memory, HDM */
> > > > +    uint64_t pmem_size;    
> > > 
> > > Can we switch this to mem_size and drop the persistent comment? It is my 
> > > understanding that HDM is independent of persistence.  
> > 
> > Discussed in the other branch of this thread.  Short answer is we don't
> > support non persistent yet but it's on the todo list.  What exactly
> > that looks like is to be determined.  One aspect of that is there
> > isn't currently a software stack to test volatile memory.  
> 
> If you can elaborate more here on what is needed to test the volatile memory 
> stack we may be able to help out.

There are a bunch of different ways this could be done - ultimately we probably
want to do all of them.

https://cdrdv2.intel.com/v1/dl/getContent/643805?wapkw=CXL%20memory%20device%20sw%20guide
has some suggestions (though no one is obliged to follow them!) See 2.4

First assumption is that for volatile devices, a common approach will be to do
all the setup in firmware before the OS boots and just present normal SRAT, HMAT
and memory tables as if it were any other memory.  If we want to go that way
for testing purposes then we'd need an open source firmware to implement
setup similar to that done in Linux - probably EDK2.

Of course, volatile memory might be hot added, in which case the OS may be involved.
In that case I think the main missing part would be actually doing the final memory
hotplug event to expose it to the OS + the necessary dynamic updating of the
OS numa description etc. There is ongoing work to get the information needed
but I think we are still some way off actually tying everything together.

Dan / Ben and team may be able to share more status information.

> 
> >   
> > >   
> > > > +} CXLDeviceState;
> > > > +
> > > > +/* Initialize the register block for a device */
> > > > +void cxl_device_register_block_init(Object *obj, CXLDeviceState *dev);

...

> > > +cc Dave, Klaus, Tong
> > > Other than the minor issues raised.
> > > 
> > > Looks good.
> > > 
> > > Reviewed by: Adam Manzanares <a.manzanares@samsung.com>  
> > 
> > Btw I haven't accepted all changes, but have been picking up
> > your RB.  Shout if that's not fine with you.  
> 
> Definitely fine with me and was my intention. Let us know how we can help move
> the work forward. I am kick starting reviewing and will try to bring others in.

Great.  For various reasons I'll not bother to mention here (see my employer ;)
I need to keep any discussions on mailing list or in a 'published' form.
So discussion on mailing list + at conferences works best for me but we can
organize some suitably hosted public calls if needed to align plans.
There is a plan for uconf at Plumbers this year which will hopefully let
us do any longer term planning.  Shorter term my aims around QEMU side of things
are:

1) Get the initial support upstream as I'm getting bored of rebasing :)
   I think we are in a fairly good state for doing that once qemu 7.0 is
   out.
2) Improved tests so it doesn't break when no one is paying attention.
3) Expand out the feature set to keep up with what is going on Linux kernel
   wise (personally no other OS of interest, but it would be great if anyone
   wanted to help deal with other operating systems that care).
  * RAS
  * CDAT for switches etc, host table updates for generic port definition
   - Whatever else I've missed recently.  When the region code finalizes
     I suspect we'll want to add a load more tests to stress various corners
     of that.
  * Alison may help with partitioning support.
4) Expand features where we have currently taken a short cut such as enabling
   multiple HDM decoders.
5) Use it as a path for testing spec features before publication (obviously can't
   talk about that on list but I've open in appropriate venue about that).

Happy to have help on any of the above, but 'features' that are reasonably separate
such as RAS support might be a good place for contributions that won't be
greatly affected by any other refactoring going on.

I've pushed all but SPDM support and stuff for which the spec isn't public yet up on
https://gitlab.com/jic23/qemu/-/commits/cxl-v9-draft-1
(as you can see CI found a segfault today so I'll push the fix out for that
 shortly - that also highlighted a build breakage mid series that I've fixed up.).

Jonathan

 
> 
> > 
> > Thanks.
> > 
> > Jonathan
> >
Adam Manzanares April 4, 2022, 3:15 p.m. UTC | #8
On Fri, Apr 01, 2022 at 02:30:34PM +0100, Jonathan Cameron wrote:
> On Thu, 31 Mar 2022 22:13:20 +0000
> Adam Manzanares <a.manzanares@samsung.com> wrote:
> 
> > On Wed, Mar 30, 2022 at 06:48:48PM +0100, Jonathan Cameron wrote:
> > > On Tue, 29 Mar 2022 18:13:59 +0000
> > > Adam Manzanares <a.manzanares@samsung.com> wrote:
> > >   
> > > > On Fri, Mar 18, 2022 at 03:05:53PM +0000, Jonathan Cameron wrote:  
> > > > > From: Ben Widawsky <ben.widawsky@intel.com>
> > > > > 
> > > > > A CXL device is a type of CXL component. Conceptually, a CXL device
> > > > > would be a leaf node in a CXL topology. From an emulation perspective,
> > > > > CXL devices are the most complex and so the actual implementation is
> > > > > reserved for discrete commits.
> > > > > 
> > > > > This new device type is specifically catered towards the eventual
> > > > > implementation of a Type3 CXL.mem device, 8.2.8.5 in the CXL 2.0
> > > > > specification.
> > > > > 
> > > > > Signed-off-by: Ben Widawsky <ben.widawsky@intel.com>
> > > > > Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> > > > > Reviewed-by: Alex Bennée <alex.bennee@linaro.org>  
> > > 
> > > ...
> > >   
> > > > > diff --git a/include/hw/cxl/cxl_device.h b/include/hw/cxl/cxl_device.h
> > > > > new file mode 100644
> > > > > index 0000000000..b2416e45bf
> > > > > --- /dev/null
> > > > > +++ b/include/hw/cxl/cxl_device.h
> > > > > @@ -0,0 +1,165 @@
> > > > > +/*
> > > > > + * QEMU CXL Devices
> > > > > + *
> > > > > + * Copyright (c) 2020 Intel
> > > > > + *
> > > > > + * This work is licensed under the terms of the GNU GPL, version 2. See the
> > > > > + * COPYING file in the top-level directory.
> > > > > + */
> > > > > +
> > > > > +#ifndef CXL_DEVICE_H
> > > > > +#define CXL_DEVICE_H
> > > > > +
> > > > > +#include "hw/register.h"
> > > > > +
> > > > > +/*
> > > > > + * The following is how a CXL device's MMIO space is laid out. The only
> > > > > + * requirement from the spec is that the capabilities array and the capability
> > > > > + * headers start at offset 0 and are contiguously packed. The headers themselves
> > > > > + * provide offsets to the register fields. For this emulation, registers will
> > > > > + * start at offset 0x80 (m == 0x80). No secondary mailbox is implemented which
> > > > > + * means that n = m + sizeof(mailbox registers) + sizeof(device registers).    
> > > > 
> > > > What is n here, the start offset of the mailbox registers, this question is 
> > > > based on the figure below?  
> > > 
> > > I'll expand on this to say
> > > 
> > > means that the offset of the start of the mailbox payload (n) is given by
> > > n = m + sizeof....
> > > 
> > > Which means the diagram below is wrong as should align with top
> > > of mailbox registers.
> > >   
> > > >   
> > > > > + *
> > > > > + * This is roughly described in 8.2.8 Figure 138 of the CXL 2.0 spec  
> > > I'm going drop this comment as that figure appears unrelated to me.
> > >   
> > > > > + *
> > > > > + *                       +---------------------------------+
> > > > > + *                       |                                 |
> > > > > + *                       |    Memory Device Registers      |
> > > > > + *                       |                                 |
> > > > > + * n + PAYLOAD_SIZE_MAX  -----------------------------------
> > > > > + *                  ^    |                                 |
> > > > > + *                  |    |                                 |
> > > > > + *                  |    |                                 |
> > > > > + *                  |    |                                 |
> > > > > + *                  |    |                                 |
> > > > > + *                  |    |         Mailbox Payload         |
> > > > > + *                  |    |                                 |
> > > > > + *                  |    |                                 |
> > > > > + *                  |    |                                 |
> > > > > + *                  |    -----------------------------------
> > > > > + *                  |    |       Mailbox Registers         |
> > > > > + *                  |    |                                 |
> > > > > + *                  n    -----------------------------------
> > > > > + *                  ^    |                                 |
> > > > > + *                  |    |        Device Registers         |
> > > > > + *                  |    |                                 |
> > > > > + *                  m    ---------------------------------->
> > > > > + *                  ^    |  Memory Device Capability Header|
> > > > > + *                  |    -----------------------------------
> > > > > + *                  |    |     Mailbox Capability Header   |
> > > > > + *                  |    -------------- --------------------
> > > > > + *                  |    |     Device Capability Header    |
> > > > > + *                  |    -----------------------------------
> > > > > + *                  |    |                                 |
> > > > > + *                  |    |                                 |
> > > > > + *                  |    |      Device Cap Array[0..n]     |
> > > > > + *                  |    |                                 |
> > > > > + *                  |    |                                 |
> > > > > + *                       |                                 |
> > > > > + *                  0    +---------------------------------+    
> > > > 
> > > > Would it make sense to add CXL cap header register to the diagram?  
> > > 
> > > Too many similar names in the CXL spec. I'm not sure which one you mean,
> > > could you let me have a reference?  If you mean the one that is
> > > at the start of the CXL.cache and CXL.mem registers that whole region
> > > isn't covered by this diagram and might be in a different BAR.
> > > Here we are only dealing with the Memory Device Registers.  I'll
> > > add statement to the initial comment block to make that clear
> > > as it definitely isn't currently!  
> > 
> > 
> > I was thinking 0 in your figure is the device capabilities array register, 
> > which tells us how many capabilites that are in the array. This would be 
> > 8.2.8.1. After that comes 8.2.8.2 with n capability header registers which 
> > point to the device registers.
> 
> Got it.  See below.
> 
> > 
> > >   
> > > > n also 
> > > > seems to be the size of the cap array, but it is also an offset so that could
> > > > be clarified.  
> > > 
> > > Ah. Letter reuse. good point. Looking more closely it isn't an array anyway
> > > in the diagram (the array would have to include the Device Capability Header
> > > and Mailbox Capability headers.  Renamed as simply Device Cap Array Register
> 
> As mentioned here, the array is misleading anyway because we have the
> actual entries listed directly above it rather than 'inside' the array.
> Hence the change described above.
> 
> > >   
> > > >   
> > > > > + *
> > > > > + */
> > > > > +
> > > > > +#define CXL_DEVICE_CAP_HDR1_OFFSET 0x10 /* Figure 138 */
> > > > > +#define CXL_DEVICE_CAP_REG_SIZE 0x10 /* 8.2.8.2 */
> > > > > +#define CXL_DEVICE_CAPS_MAX 4 /* 8.2.8.2.1 + 8.2.8.5 */
> > > > > +
> > > > > +#define CXL_DEVICE_REGISTERS_OFFSET 0x80 /* Read comment above */    
> > > > 
> > > > Is this to plan for future capabilities? If we have CAPS MAX doesn't this 
> > > > allow us to remove the slack space. 
> I missed replying to this before.
> 
> So far CAPS MAX covers everything in the spec. (room for secondary mailbox
> + the 3 we have implemented). 
> We don't support migration etc yet (and I'm not sure we ever will)
> anyway so I'm not hugely bothered about backwards compatibility.
> Hence we can just move things if needed later.
> 
> > > >   
> > > > > +#define CXL_DEVICE_REGISTERS_LENGTH 0x8 /* 8.2.8.3.1 */    
> > > > 
> > > > Should we add status to the name here, or would it get too long?
> > > >   
> > > > > +
> > > > > +#define CXL_MAILBOX_REGISTERS_OFFSET \
> > > > > +    (CXL_DEVICE_REGISTERS_OFFSET + CXL_DEVICE_REGISTERS_LENGTH)
> > > > > +#define CXL_MAILBOX_REGISTERS_SIZE 0x20 /* 8.2.8.4, Figure 139 */
> > > > > +#define CXL_MAILBOX_PAYLOAD_SHIFT 11    
> > > > 
> > > > I see 20 in the spec.  
> > > 
> > > It's an implementation choice between 8 and 20. For now, this code goes
> > > with 11 for no particularly strong reason.  
> > 
> > Got it.
> > 
> > >   
> > > >   
> > > > > +#define CXL_MAILBOX_MAX_PAYLOAD_SIZE (1 << CXL_MAILBOX_PAYLOAD_SHIFT)
> > > > > +#define CXL_MAILBOX_REGISTERS_LENGTH \
> > > > > +    (CXL_MAILBOX_REGISTERS_SIZE + CXL_MAILBOX_MAX_PAYLOAD_SIZE)
> > > > > +
> > > > > +typedef struct cxl_device_state {
> > > > > +    MemoryRegion device_registers;
> > > > > +
> > > > > +    /* mmio for device capabilities array - 8.2.8.2 */
> > > > > +    MemoryRegion device;
> > > > > +    MemoryRegion caps;
> > > > > +
> > > > > +    /* mmio for the mailbox registers 8.2.8.4 */
> > > > > +    MemoryRegion mailbox;
> > > > > +
> > > > > +    /* memory region for persistent memory, HDM */
> > > > > +    uint64_t pmem_size;    
> > > > 
> > > > Can we switch this to mem_size and drop the persistent comment? It is my 
> > > > understanding that HDM is independent of persistence.  
> > > 
> > > Discussed in the other branch of this thread.  Short answer is we don't
> > > support non persistent yet but it's on the todo list.  What exactly
> > > that looks like is to be determined.  One aspect of that is there
> > > isn't currently a software stack to test volatile memory.  
> > 
> > If you can elaborate more here on what is needed to test the volatile memory 
> > stack we may be able to help out.
> 
> There are a bunch of different ways this could be done - ultimate we probably
> want to do all of them.
> 
> https://urldefense.com/v3/__https://cdrdv2.intel.com/v1/dl/getContent/643805?wapkw=CXL*20memory*20device*20sw*20guide__;JSUlJQ!!EwVzqGoTKBqv-0DWAJBm!HzD_Dh_I9m9MydppOSSyhuzvwTawlg7LE77bEYiZ1i3AMgxV_YOI56VeZgkg-KuX7XMA$ 
> has some suggestions (though no one is obliged to follow them!) See 2.4
> 
> First assumption is that for volatile devices, a common approach will be to do
> all the setup in firmware before the OS boots and just present normal SRAT, HMAT
> and memory tables as if it were any other memory.  If we want to go that way
> for testing purposes then we'd need an open source firmware to implement
> setup similar to that done in Linux - probably EDK2.
> 
> Of course, volatile memory might be hot added, in which case the OS may be involved.
> In that case I think the main missing part would be actually doing the final memory
> hotplug event to expose it to the OS + the necessary dynamic updating of the
> OS numa description etc. There is work on going to get the information needed
> but I think we are still some way off actually tying everything together.
> 
> Dan / Ben and team may be able to share more status information.

Great, thanks for all of the information. We will start planning out our next
steps. I'll add Luis on cc since he has chatted with me about setting up a 
test framework for the CXL kernel code that will rely on QEMU.

> 
> > 
> > >   
> > > >   
> > > > > +} CXLDeviceState;
> > > > > +
> > > > > +/* Initialize the register block for a device */
> > > > > +void cxl_device_register_block_init(Object *obj, CXLDeviceState *dev);
> 
> ...
> 
> > > > +cc Dave, Klaus, Tong
> > > > Other than the minor issues raised.
> > > > 
> > > > Looks good.
> > > > 
> > > > Reviewed by: Adam Manzanares <a.manzanares@samsung.com>  
> > > 
> > > Btw I haven't accepted all changes, but have been picking up
> > > your RB.  Shout if that's not fine with you.  
> > 
> > Definitely fine with me and was my intention. Let us know how we can help move
> > the work forward. I am kick starting reviewing and will try to bring others in.
> 
> Great.  For various reasons I'll not bother mention here (see my employer ;)
> I need to keep any discussions on mailing list or in a 'published' form.
> So discussion on mailing list + at conferences works best for me but we can
> organize some suitably hosted public calls if needed to align plans.
> There is a plan for uconf at Plumbers this year which will hopefully let

We would also prefer to keep discussions in the public domain. We have plans to
attend Plumbers this year, so we look forward to discussing in person. 

> us do any longer term planning.  Shorter term my aims around QEMU side of things
> are:
> 
> 1) Get the initial support upstream as I'm getting bored of rebasing :)
>    I think we are in a fairly good state for doing that once qemu 7.0 is
>    out.
> 2) Improved tests so it doesn't break when no one is paying attention.

Luis may have some thoughts here. 

> 3) Expand out the feature set to keep up with what is going on Linux kernel
>    wise (personally no other OS of interest, but it would be great if anyone
>    wanted to help deal with other operating systems that care).
>   * RAS
>   * CDAT for switches etc, host table updates for generic port definition
>    - What ever else I've missed recently.  When the region code finalizes
>      I suspect we'll want to add a load more tests to stress various corners
>      of that.
>   * Alison may help with partitioning support.
> 4) Expand features where we have currently taken a short cut such as enabling
>    multiple HDM decoders.
> 5) Use it as a path for testing spec features before publication (obviously can't
>    talk about that on list but I've open in appropriate venue about that).
> 
> Happy to have help on any of the above, but 'features' that are reasonably separate
> such as RAS support might be a good place for contributions that won't be
> greatly affected by any other refactoring going on.
> 
> I've pushed all but SPDM support and stuff for which the spec isn't public yet up on
> https://urldefense.com/v3/__https://gitlab.com/jic23/qemu/-/commits/cxl-v9-draft-1__;!!EwVzqGoTKBqv-0DWAJBm!HzD_Dh_I9m9MydppOSSyhuzvwTawlg7LE77bEYiZ1i3AMgxV_YOI56VeZgkg-EMwmPTV$ 
> (as you can see CI found a segfault today so I'll push the fix out for that
>  shortly - that also highlighted a build breakage mid series that I've fixed up.).
> 

Once again thanks for all of the pointers. 

> Jonathan
> 
>  
> > 
> > > 
> > > Thanks.
> > > 
> > > Jonathan
> > >  
>
Jonathan Cameron April 5, 2022, 9:10 a.m. UTC | #9
...

> > > > > 
> > > > > Can we switch this to mem_size and drop the persistent comment? It is my 
> > > > > understanding that HDM is independent of persistence.    
> > > > 
> > > > Discussed in the other branch of this thread.  Short answer is we don't
> > > > support non persistent yet but it's on the todo list.  What exactly
> > > > that looks like is to be determined.  One aspect of that is there
> > > > isn't currently a software stack to test volatile memory.    
> > > 
> > > If you can elaborate more here on what is needed to test the volatile memory 
> > > stack we may be able to help out.  
> > 
> > There are a bunch of different ways this could be done - ultimate we probably
> > want to do all of them.
> > 
> > https://urldefense.com/v3/__https://cdrdv2.intel.com/v1/dl/getContent/643805?wapkw=CXL*20memory*20device*20sw*20guide__;JSUlJQ!!EwVzqGoTKBqv-0DWAJBm!HzD_Dh_I9m9MydppOSSyhuzvwTawlg7LE77bEYiZ1i3AMgxV_YOI56VeZgkg-KuX7XMA$ 
> > has some suggestions (though no one is obliged to follow them!) See 2.4
> > 
> > First assumption is that for volatile devices, a common approach will be to do
> > all the setup in firmware before the OS boots and just present normal SRAT, HMAT
> > and memory tables as if it were any other memory.  If we want to go that way
> > for testing purposes then we'd need an open source firmware to implement
> > setup similar to that done in Linux - probably EDK2.
> > 
> > Of course, volatile memory might be hot added, in which case the OS may be involved.
> > In that case I think the main missing part would be actually doing the final memory
> > hotplug event to expose it to the OS + the necessary dynamic updating of the
> > OS numa description etc. There is work on going to get the information needed
> > but I think we are still some way off actually tying everything together.
> > 
> > Dan / Ben and team may be able to share more status information.  
> 
> Great, thanks for all of the information. We will start planning out our next
> steps. I'll add Luis on cc since he has chatted with me about setting up a 
> test framework for the CXL kernel code that will rely on QEMU.
> 
> >   
> > >   
> > > >     
> > > > >     
> > > > > > +} CXLDeviceState;
> > > > > > +
> > > > > > +/* Initialize the register block for a device */
> > > > > > +void cxl_device_register_block_init(Object *obj, CXLDeviceState *dev);  
> > 
> > ...
> >   
> > > > > +cc Dave, Klaus, Tong
> > > > > Other than the minor issues raised.
> > > > > 
> > > > > Looks good.
> > > > > 
> > > > > Reviewed by: Adam Manzanares <a.manzanares@samsung.com>    
> > > > 
> > > > Btw I haven't accepted all changes, but have been picking up
> > > > your RB.  Shout if that's not fine with you.    
> > > 
> > > Definitely fine with me and was my intention. Let us know how we can help move
> > > the work forward. I am kick starting reviewing and will try to bring others in.  
> > 
> > Great.  For various reasons I'll not bother mention here (see my employer ;)
> > I need to keep any discussions on mailing list or in a 'published' form.
> > So discussion on mailing list + at conferences works best for me but we can
> > organize some suitably hosted public calls if needed to align plans.
> > There is a plan for uconf at Plumbers this year which will hopefully let  
> 
> We would also prefer to keep discussions in the public domain. We have plans to
> attend Plumbers this year, so we look forward to discussing in person. 

Excellent.  If it's useful to have a public discussion before plumbers then the nice
folk at Linaro have been kind enough to host similar discussions in the
past (and deal with posting recordings etc afterwards for those who missed
the live call) and I expect they'd help us out again (Hi Alex ;)

> 
> > us do any longer term planning.  Shorter term my aims around QEMU side of things
> > are:
> > 
> > 1) Get the initial support upstream as I'm getting bored of rebasing :)
> >    I think we are in a fairly good state for doing that once qemu 7.0 is
> >    out.
> > 2) Improved tests so it doesn't break when no one is paying attention.  
> 
> Luis may have some thoughts here. 

Excellent. A testing expert is always useful. It would be nice to think about
getting something beyond a basic 'does it boot' test into the qemu CI but
I've not really looked into how one might do that.

> 
> > 3) Expand out the feature set to keep up with what is going on Linux kernel
> >    wise (personally no other OS of interest, but it would be great if anyone
> >    wanted to help deal with other operating systems that care).
> >   * RAS
> >   * CDAT for switches etc, host table updates for generic port definition
> >    - What ever else I've missed recently.  When the region code finalizes
> >      I suspect we'll want to add a load more tests to stress various corners
> >      of that.
> >   * Alison may help with partitioning support.
> > 4) Expand features where we have currently taken a short cut such as enabling
> >    multiple HDM decoders.
> > 5) Use it as a path for testing spec features before publication (obviously can't
> >    talk about that on list but I've open in appropriate venue about that).
> > 
> > Happy to have help on any of the above, but 'features' that are reasonably separate
> > such as RAS support might be a good place for contributions that won't be
> > greatly affected by any other refactoring going on.
> > 
> > I've pushed all but SPDM support and stuff for which the spec isn't public yet up on
> > https://urldefense.com/v3/__https://gitlab.com/jic23/qemu/-/commits/cxl-v9-draft-1__;!!EwVzqGoTKBqv-0DWAJBm!HzD_Dh_I9m9MydppOSSyhuzvwTawlg7LE77bEYiZ1i3AMgxV_YOI56VeZgkg-EMwmPTV$ 
> > (as you can see CI found a segfault today so I'll push the fix out for that
> >  shortly - that also highlighted a build breakage mid series that I've fixed up.).
> >   
> 
> Once again thanks for all of the pointers. 

You are welcome. It's nice to see this work gain traction :)

Anyhow, v9 is on its way (slowly) through our firewall (got log anti spam
send rate limits) so fingers crossed we are nearly ready with this first bit
of support to build more fun stuff on top of.

Jonathan

> 
> > Jonathan
diff mbox series

Patch

diff --git a/include/hw/cxl/cxl.h b/include/hw/cxl/cxl.h
index 8c738c7a2b..b9d1ac3fad 100644
--- a/include/hw/cxl/cxl.h
+++ b/include/hw/cxl/cxl.h
@@ -12,5 +12,6 @@ 
 
 #include "cxl_pci.h"
 #include "cxl_component.h"
+#include "cxl_device.h"
 
 #endif
diff --git a/include/hw/cxl/cxl_device.h b/include/hw/cxl/cxl_device.h
new file mode 100644
index 0000000000..b2416e45bf
--- /dev/null
+++ b/include/hw/cxl/cxl_device.h
@@ -0,0 +1,165 @@ 
+/*
+ * QEMU CXL Devices
+ *
+ * Copyright (c) 2020 Intel
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See the
+ * COPYING file in the top-level directory.
+ */
+
+#ifndef CXL_DEVICE_H
+#define CXL_DEVICE_H
+
+#include "hw/register.h"
+
+/*
+ * The following is how a CXL device's MMIO space is laid out. The only
+ * requirement from the spec is that the capabilities array and the capability
+ * headers start at offset 0 and are contiguously packed. The headers themselves
+ * provide offsets to the register fields. For this emulation, registers will
+ * start at offset 0x80 (m == 0x80). No secondary mailbox is implemented which
+ * means that n = m + sizeof(mailbox registers) + sizeof(device registers).
+ *
+ * This is roughly described in 8.2.8 Figure 138 of the CXL 2.0 spec.
+ *
+ *                       +---------------------------------+
+ *                       |                                 |
+ *                       |    Memory Device Registers      |
+ *                       |                                 |
+ * n + PAYLOAD_SIZE_MAX  -----------------------------------
+ *                  ^    |                                 |
+ *                  |    |                                 |
+ *                  |    |                                 |
+ *                  |    |                                 |
+ *                  |    |                                 |
+ *                  |    |         Mailbox Payload         |
+ *                  |    |                                 |
+ *                  |    |                                 |
+ *                  |    |                                 |
+ *                  |    -----------------------------------
+ *                  |    |       Mailbox Registers         |
+ *                  |    |                                 |
+ *                  n    -----------------------------------
+ *                  ^    |                                 |
+ *                  |    |        Device Registers         |
+ *                  |    |                                 |
+ *                  m    ---------------------------------->
+ *                  ^    |  Memory Device Capability Header|
+ *                  |    -----------------------------------
+ *                  |    |     Mailbox Capability Header   |
+ *                  |    -------------- --------------------
+ *                  |    |     Device Capability Header    |
+ *                  |    -----------------------------------
+ *                  |    |                                 |
+ *                  |    |                                 |
+ *                  |    |      Device Cap Array[0..n]     |
+ *                  |    |                                 |
+ *                  |    |                                 |
+ *                       |                                 |
+ *                  0    +---------------------------------+
+ *
+ */
+
+#define CXL_DEVICE_CAP_HDR1_OFFSET 0x10 /* Figure 138 */
+#define CXL_DEVICE_CAP_REG_SIZE 0x10 /* 8.2.8.2 */
+#define CXL_DEVICE_CAPS_MAX 4 /* 8.2.8.2.1 + 8.2.8.5 */
+
+#define CXL_DEVICE_REGISTERS_OFFSET 0x80 /* Read comment above */
+#define CXL_DEVICE_REGISTERS_LENGTH 0x8 /* 8.2.8.3.1 */
+
+#define CXL_MAILBOX_REGISTERS_OFFSET \
+    (CXL_DEVICE_REGISTERS_OFFSET + CXL_DEVICE_REGISTERS_LENGTH)
+#define CXL_MAILBOX_REGISTERS_SIZE 0x20 /* 8.2.8.4, Figure 139 */
+#define CXL_MAILBOX_PAYLOAD_SHIFT 11
+#define CXL_MAILBOX_MAX_PAYLOAD_SIZE (1 << CXL_MAILBOX_PAYLOAD_SHIFT)
+#define CXL_MAILBOX_REGISTERS_LENGTH \
+    (CXL_MAILBOX_REGISTERS_SIZE + CXL_MAILBOX_MAX_PAYLOAD_SIZE)
+
+typedef struct cxl_device_state {
+    MemoryRegion device_registers;
+
+    /* mmio for device capabilities array - 8.2.8.2 */
+    MemoryRegion device;
+    MemoryRegion caps;
+
+    /* mmio for the mailbox registers 8.2.8.4 */
+    MemoryRegion mailbox;
+
+    /* memory region for persistent memory, HDM */
+    uint64_t pmem_size;
+} CXLDeviceState;
+
+/* Initialize the register block for a device */
+void cxl_device_register_block_init(Object *obj, CXLDeviceState *dev);
+
+/* Set up default values for the register block */
+void cxl_device_register_init_common(CXLDeviceState *dev);
+
+/*
+ * CXL 2.0 - 8.2.8.1 including errata F4
+ * Documented as a 128 bit register, but 64 bit accesses and the second
+ * 64 bits are currently reserved.
+ */
+REG64(CXL_DEV_CAP_ARRAY, 0) /* Documented as 128 bit register but 64 bit accesses */
+    FIELD(CXL_DEV_CAP_ARRAY, CAP_ID, 0, 16)
+    FIELD(CXL_DEV_CAP_ARRAY, CAP_VERSION, 16, 8)
+    FIELD(CXL_DEV_CAP_ARRAY, CAP_COUNT, 32, 16)
+
+/*
+ * Helper macro to initialize capability headers for CXL devices.
+ *
+ * In the 8.2.8.2, this is listed as a 128b register, but in 8.2.8, it says:
+ * > No registers defined in Section 8.2.8 are larger than 64-bits wide so that
+ * > is the maximum access size allowed for these registers. If this rule is not
+ * > followed, the behavior is undefined
+ *
+ * CXL 2.0 Errata F4 states further that the layouts in the specification are
+ * shown as greater than 128 bits, but implementations are expected to
+ * use any size of access up to 64 bits.
+ *
+ * Here we've chosen to make it 4 dwords. The spec allows any pow2 multiple
+ * access to be used for a register up to 64 bits.
+ */
+#define CXL_DEVICE_CAPABILITY_HEADER_REGISTER(n, offset)  \
+    REG32(CXL_DEV_##n##_CAP_HDR0, offset)                 \
+        FIELD(CXL_DEV_##n##_CAP_HDR0, CAP_ID, 0, 16)      \
+        FIELD(CXL_DEV_##n##_CAP_HDR0, CAP_VERSION, 16, 8) \
+    REG32(CXL_DEV_##n##_CAP_HDR1, offset + 4)             \
+        FIELD(CXL_DEV_##n##_CAP_HDR1, CAP_OFFSET, 0, 32)  \
+    REG32(CXL_DEV_##n##_CAP_HDR2, offset + 8)             \
+        FIELD(CXL_DEV_##n##_CAP_HDR2, CAP_LENGTH, 0, 32)
+
+CXL_DEVICE_CAPABILITY_HEADER_REGISTER(DEVICE, CXL_DEVICE_CAP_HDR1_OFFSET)
+CXL_DEVICE_CAPABILITY_HEADER_REGISTER(MAILBOX, CXL_DEVICE_CAP_HDR1_OFFSET + \
+                                               CXL_DEVICE_CAP_REG_SIZE)
+
+REG32(CXL_DEV_MAILBOX_CAP, 0)
+    FIELD(CXL_DEV_MAILBOX_CAP, PAYLOAD_SIZE, 0, 5)
+    FIELD(CXL_DEV_MAILBOX_CAP, INT_CAP, 5, 1)
+    FIELD(CXL_DEV_MAILBOX_CAP, BG_INT_CAP, 6, 1)
+    FIELD(CXL_DEV_MAILBOX_CAP, MSI_N, 7, 4)
+
+REG32(CXL_DEV_MAILBOX_CTRL, 4)
+    FIELD(CXL_DEV_MAILBOX_CTRL, DOORBELL, 0, 1)
+    FIELD(CXL_DEV_MAILBOX_CTRL, INT_EN, 1, 1)
+    FIELD(CXL_DEV_MAILBOX_CTRL, BG_INT_EN, 2, 1)
+
+REG64(CXL_DEV_MAILBOX_CMD, 8)
+    FIELD(CXL_DEV_MAILBOX_CMD, COMMAND, 0, 8)
+    FIELD(CXL_DEV_MAILBOX_CMD, COMMAND_SET, 8, 8)
+    FIELD(CXL_DEV_MAILBOX_CMD, LENGTH, 16, 20)
+
+REG64(CXL_DEV_MAILBOX_STS, 0x10)
+    FIELD(CXL_DEV_MAILBOX_STS, BG_OP, 0, 1)
+    FIELD(CXL_DEV_MAILBOX_STS, ERRNO, 32, 16)
+    FIELD(CXL_DEV_MAILBOX_STS, VENDOR_ERRNO, 48, 16)
+
+REG64(CXL_DEV_BG_CMD_STS, 0x18)
+    FIELD(CXL_DEV_BG_CMD_STS, BG, 0, 16)
+    FIELD(CXL_DEV_BG_CMD_STS, DONE, 16, 7)
+    FIELD(CXL_DEV_BG_CMD_STS, ERRNO, 32, 16)
+    FIELD(CXL_DEV_BG_CMD_STS, VENDOR_ERRNO, 48, 16)
+
+REG32(CXL_DEV_CMD_PAYLOAD, 0x20)
+
+#endif