diff mbox series

[RFC,v2,1/2] Basic PCIe DOE support

Message ID 1612902949-9992-1-git-send-email-cbrowy@avery-design.com (mailing list archive)
State New, archived
Headers show
Series PCIe DOE for PCIe and CXL 2.0 v2 release | expand

Commit Message

Chris Browy Feb. 9, 2021, 8:35 p.m. UTC
---
 MAINTAINERS                               |   7 +
 hw/pci/meson.build                        |   1 +
 hw/pci/pcie.c                             |   2 +-
 hw/pci/pcie_doe.c                         | 414 ++++++++++++++++++++++++++++++
 include/hw/pci/pci_ids.h                  |   2 +
 include/hw/pci/pcie.h                     |   1 +
 include/hw/pci/pcie_doe.h                 | 166 ++++++++++++
 include/hw/pci/pcie_regs.h                |   4 +
 include/standard-headers/linux/pci_regs.h |   3 +-
 9 files changed, 598 insertions(+), 2 deletions(-)
 create mode 100644 hw/pci/pcie_doe.c
 create mode 100644 include/hw/pci/pcie_doe.h

Comments

Ben Widawsky Feb. 9, 2021, 9:42 p.m. UTC | #1
Have you/Jonathan come to consensus about which implementation is going forward?
I'd rather not have to review two :D

On 21-02-09 15:35:49, Chris Browy wrote:
> ---
>  MAINTAINERS                               |   7 +
>  hw/pci/meson.build                        |   1 +
>  hw/pci/pcie.c                             |   2 +-
>  hw/pci/pcie_doe.c                         | 414 ++++++++++++++++++++++++++++++
>  include/hw/pci/pci_ids.h                  |   2 +
>  include/hw/pci/pcie.h                     |   1 +
>  include/hw/pci/pcie_doe.h                 | 166 ++++++++++++
>  include/hw/pci/pcie_regs.h                |   4 +
>  include/standard-headers/linux/pci_regs.h |   3 +-
>  9 files changed, 598 insertions(+), 2 deletions(-)
>  create mode 100644 hw/pci/pcie_doe.c
>  create mode 100644 include/hw/pci/pcie_doe.h
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 981dc92..4fb865e 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -1655,6 +1655,13 @@ F: docs/pci*
>  F: docs/specs/*pci*
>  F: default-configs/pci.mak
>  
> +PCIE DOE
> +M: Huai-Cheng Kuo <hchkuo@avery-design.com.tw>
> +M: Chris Browy <cbrowy@avery-design.com>
> +S: Supported
> +F: include/hw/pci/pcie_doe.h
> +F: hw/pci/pcie_doe.c
> +
>  ACPI/SMBIOS
>  M: Michael S. Tsirkin <mst@redhat.com>
>  M: Igor Mammedov <imammedo@redhat.com>
> diff --git a/hw/pci/meson.build b/hw/pci/meson.build
> index 5c4bbac..115e502 100644
> --- a/hw/pci/meson.build
> +++ b/hw/pci/meson.build
> @@ -12,6 +12,7 @@ pci_ss.add(files(
>  # allow plugging PCIe devices into PCI buses, include them even if
>  # CONFIG_PCI_EXPRESS=n.
>  pci_ss.add(files('pcie.c', 'pcie_aer.c'))
> +pci_ss.add(files('pcie_doe.c'))

It looks like this should be like the below line:
softmmu_ss.add(when: 'CONFIG_PCI_EXPRESS', if_true: pci_doe.c))

>  softmmu_ss.add(when: 'CONFIG_PCI_EXPRESS', if_true: files('pcie_port.c', 'pcie_host.c'))
>  softmmu_ss.add_all(when: 'CONFIG_PCI', if_true: pci_ss)
>  
> diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
> index 1ecf6f6..f7516c4 100644
> --- a/hw/pci/pcie.c
> +++ b/hw/pci/pcie.c
> @@ -735,7 +735,7 @@ void pcie_cap_slot_write_config(PCIDevice *dev,
>  
>      hotplug_event_notify(dev);
>  
> -    /* 
> +    /*

Please drop this.

>       * 6.7.3.2 Command Completed Events
>       *
>       * Software issues a command to a hot-plug capable Downstream Port by
> diff --git a/hw/pci/pcie_doe.c b/hw/pci/pcie_doe.c
> new file mode 100644
> index 0000000..df8e92e
> --- /dev/null
> +++ b/hw/pci/pcie_doe.c
> @@ -0,0 +1,414 @@
> +#include "qemu/osdep.h"
> +#include "qemu/log.h"
> +#include "qemu/error-report.h"
> +#include "qapi/error.h"
> +#include "qemu/range.h"
> +#include "hw/pci/pci.h"
> +#include "hw/pci/pcie.h"
> +#include "hw/pci/pcie_doe.h"
> +#include "hw/pci/msi.h"
> +#include "hw/pci/msix.h"
> +
> +/*
> + * DOE Default Protocols (Discovery, CMA)
> + */
> +/* Discovery Request Object */
> +struct doe_discovery {
> +    DOEHeader header;
> +    uint8_t index;
> +    uint8_t reserved[3];
> +} QEMU_PACKED;
> +
> +/* Discovery Response Object */
> +struct doe_discovery_rsp {
> +    DOEHeader header;
> +    uint16_t vendor_id;
> +    uint8_t doe_type;
> +    uint8_t next_index;
> +} QEMU_PACKED;
> +
> +/* Callback for Discovery */
> +static bool pcie_doe_discovery_rsp(DOECap *doe_cap)
> +{
> +    PCIEDOE *doe = doe_cap->doe;
> +    struct doe_discovery *req = pcie_doe_get_req(doe_cap);
> +    uint8_t index = req->index;
> +    DOEProtocol *prot = NULL;
> +
> +    /* Request length mismatch, discard */
> +    if (req->header.length < dwsizeof(struct doe_discovery)) {

Use DIV_ROUND_UP instead of rolling your own thing.

> +        return DOE_DISCARD;
> +    }
> +
> +    /* Point to the requested protocol */
> +    if (index < doe->protocol_num) {
> +        prot = &doe->protocols[index];
> +    }

What happens on else, should that still return DOE_SUCCESS?

> +
> +    struct doe_discovery_rsp rsp = {
> +        .header = {
> +            .vendor_id = PCI_VENDOR_ID_PCI_SIG,
> +            .doe_type = PCI_SIG_DOE_DISCOVERY,
> +            .reserved = 0x0,
> +            .length = dwsizeof(struct doe_discovery_rsp),
> +        },

mixed declarations are not allowed.
DIV_ROUND_UP

> +        .vendor_id = (prot) ? prot->vendor_id : 0xFFFF,
> +        .doe_type = (prot) ? prot->doe_type : 0xFF,
> +        .next_index = (index + 1) < doe->protocol_num ?
> +                      (index + 1) : 0,
> +    };

I prefer:
next_index = (index + 1) % doe->protocol_num

> +
> +    pcie_doe_set_rsp(doe_cap, &rsp);
> +
> +    return DOE_SUCCESS;
> +}
> +
> +/* Callback for CMA */
> +static bool pcie_doe_cma_rsp(DOECap *doe_cap)
> +{
> +    doe_cap->status.error = 1;
> +
> +    memset(doe_cap->read_mbox, 0,
> +           PCI_DOE_MAX_DW_SIZE * sizeof(uint32_t));
> +
> +    doe_cap->write_mbox_len = 0;
> +
> +    return DOE_DISCARD;
> +}
> +
> +/*
> + * DOE Utilities
> + */
> +static void pcie_doe_reset_mbox(DOECap *st)
> +{
> +    st->read_mbox_idx = 0;
> +
> +    st->read_mbox_len = 0;
> +    st->write_mbox_len = 0;
> +
> +    memset(st->read_mbox, 0, PCI_DOE_MAX_DW_SIZE * sizeof(uint32_t));
> +    memset(st->write_mbox, 0, PCI_DOE_MAX_DW_SIZE * sizeof(uint32_t));
> +}
> +
> +/*
> + * Initialize the list and protocol for a device.
> + * This function won't add the DOE capabitity to your PCIe device.
> + */
> +void pcie_doe_init(PCIDevice *dev, PCIEDOE *doe)
> +{
> +    doe->pdev = dev;
> +    doe->head = NULL;
> +    doe->protocol_num = 0;
> +
> +    /* Register two default protocol */
> +    //TODO : LINK LIST

Please do this for next rev of the patches.

> +    pcie_doe_register_protocol(doe, PCI_VENDOR_ID_PCI_SIG,
> +            PCI_SIG_DOE_DISCOVERY, pcie_doe_discovery_rsp);
> +    pcie_doe_register_protocol(doe, PCI_VENDOR_ID_PCI_SIG,
> +            PCI_SIG_DOE_CMA, pcie_doe_cma_rsp);
> +}
> +
> +int pcie_cap_doe_add(PCIEDOE *doe, uint16_t offset, bool intr, uint16_t vec) {
> +    DOECap *new_cap, **ptr;
> +    PCIDevice *dev = doe->pdev;
> +
> +    pcie_add_capability(dev, PCI_EXT_CAP_ID_DOE, PCI_DOE_VER, offset,
> +                        PCI_DOE_SIZEOF);
> +
> +    ptr = &doe->head;
> +    /* Insert the new DOE at the end of linked list */
> +    while (*ptr) {
> +        if (range_covers_byte((*ptr)->offset, PCI_DOE_SIZEOF, offset) ||
> +            (*ptr)->cap.vec == vec) {
> +            return -EINVAL;
> +        }
> +
> +        ptr = &(*ptr)->next;
> +    }
> +    new_cap = g_malloc0(sizeof(DOECap));
> +    *ptr = new_cap;
> +
> +    new_cap->doe = doe;
> +    new_cap->offset = offset;
> +    new_cap->next = NULL;
> +
> +    /* Configure MSI/MSI-X */
> +    if (intr && (msi_present(dev) || msix_present(dev))) {
> +        new_cap->cap.intr = intr;
> +        new_cap->cap.vec = vec;
> +    }
> +
> +    /* Set up W/R Mailbox buffer */
> +    new_cap->write_mbox = g_malloc0(PCI_DOE_MAX_DW_SIZE * sizeof(uint32_t));
> +    new_cap->read_mbox = g_malloc0(PCI_DOE_MAX_DW_SIZE * sizeof(uint32_t));
> +
> +    pcie_doe_reset_mbox(new_cap);
> +
> +    return 0;
> +}
> +
> +void pcie_doe_uninit(PCIEDOE *doe) {

fini is the more idiomatically unix name for de/un init.
> +    PCIDevice *dev = doe->pdev;
> +    DOECap *cap;
> +
> +    pci_del_capability(dev, PCI_EXT_CAP_ID_DOE, PCI_DOE_SIZEOF);
> +
> +    cap = doe->head;
> +    while (cap) {
> +        doe->head = doe->head->next;
> +
> +        g_free(cap->read_mbox);
> +        g_free(cap->write_mbox);
> +        cap->read_mbox = NULL;
> +        cap->write_mbox = NULL;
> +        g_free(cap);
> +        cap = doe->head;
> +    }
> +}
> +
> +void pcie_doe_register_protocol(PCIEDOE *doe, uint16_t vendor_id,
> +        uint8_t doe_type, bool (*set_rsp)(DOECap *))
> +{
> +    /* Protocol num should not exceed 256 */
> +    assert(doe->protocol_num < PCI_DOE_PROTOCOL_MAX);
> +
> +    doe->protocols[doe->protocol_num].vendor_id = vendor_id;
> +    doe->protocols[doe->protocol_num].doe_type = doe_type;
> +    doe->protocols[doe->protocol_num].set_rsp = set_rsp;
> +
> +    doe->protocol_num++;
> +}
> +
> +uint32_t pcie_doe_build_protocol(DOEProtocol *p)
> +{
> +    return DATA_OBJ_BUILD_HEADER1(p->vendor_id, p->doe_type);
> +}
> +
> +/* Return the pointer of DOE request in write mailbox buffer */
> +void *pcie_doe_get_req(DOECap *doe_cap)
> +{
> +    return doe_cap->write_mbox;
> +}
> +
> +/* Copy the response to read mailbox buffer */
> +void pcie_doe_set_rsp(DOECap *doe_cap, void *rsp)
> +{
> +    uint32_t len = pcie_doe_object_len(rsp);
> +
> +    memcpy(doe_cap->read_mbox + doe_cap->read_mbox_len,
> +           rsp, len * sizeof(uint32_t));
> +    doe_cap->read_mbox_len += len;
> +}
> +
> +/* Get Data Object length */
> +uint32_t pcie_doe_object_len(void *obj)
> +{
> +    return (obj)? ((DOEHeader *)obj)->length : 0;
> +}
> +
> +static void pcie_doe_write_mbox(DOECap *doe_cap, uint32_t val)
> +{
> +    memcpy(doe_cap->write_mbox + doe_cap->write_mbox_len, &val, sizeof(uint32_t));

doe_cap->write_mbox[doe_cap->write_mbox_len] = val?

> +
> +    if (doe_cap->write_mbox_len == 1) {
> +        DOE_DBG("  Capture DOE request DO length = %d\n", val & 0x0003ffff);
> +    }

I don't have the spec in front of me, but this is begging for a comment on why 1
is special.

> +
> +    doe_cap->write_mbox_len++;
> +}
> +
> +static void pcie_doe_irq_assert(DOECap *doe_cap)
> +{
> +    PCIDevice *dev = doe_cap->doe->pdev;
> +
> +    if (doe_cap->cap.intr && doe_cap->ctrl.intr) {
> +        /* Interrupt notify */
> +        if (msix_enabled(dev)) {
> +            msix_notify(dev, doe_cap->cap.vec);
> +        } else if (msi_enabled(dev)) {
> +            msi_notify(dev, doe_cap->cap.vec);
> +        }
> +        /* Not support legacy IRQ */
> +    }
> +}
> +
> +static void pcie_doe_set_ready(DOECap *doe_cap, bool rdy)
> +{
> +    doe_cap->status.ready = rdy;
> +
> +    if (rdy) {
> +        pcie_doe_irq_assert(doe_cap);
> +    }
> +}
> +
> +static void pcie_doe_set_error(DOECap *doe_cap, bool err)
> +{
> +    doe_cap->status.error = err;
> +
> +    if (err) {
> +        pcie_doe_irq_assert(doe_cap);
> +    }
> +}
> +
> +DOECap *pcie_doe_covers_addr(PCIEDOE *doe, uint32_t addr)
> +{
> +    DOECap *ptr = doe->head;
> +
> +    /* If overlaps DOE range. PCIe Capability Header won't be included. */
> +    while (ptr && 
> +           !range_covers_byte(ptr->offset + PCI_EXP_DOE_CAP, PCI_DOE_SIZEOF - 4, addr)) {
> +        ptr = ptr->next;
> +    }
> +    
> +    return ptr;
> +}
> +
> +uint32_t pcie_doe_read_config(DOECap *doe_cap,
> +                              uint32_t addr, int size)
> +{
> +    uint32_t ret = 0, shift, mask = 0xFFFFFFFF;
> +    uint16_t doe_offset = doe_cap->offset;
> +
> +    /* If overlaps DOE range. PCIe Capability Header won't be included. */
> +    if (range_covers_byte(doe_offset + PCI_EXP_DOE_CAP, PCI_DOE_SIZEOF - 4, addr)) {
> +        addr -= doe_offset;
> +
> +        if (range_covers_byte(PCI_EXP_DOE_CAP, sizeof(uint32_t), addr)) {
> +            ret = FIELD_DP32(ret, PCI_DOE_CAP_REG, INTR_SUPP,
> +                             doe_cap->cap.intr);
> +            ret = FIELD_DP32(ret, PCI_DOE_CAP_REG, DOE_INTR_MSG_NUM,
> +                             doe_cap->cap.vec);
> +        } else if (range_covers_byte(PCI_EXP_DOE_CTRL, sizeof(uint32_t), addr)) {
> +            /* Must return ABORT=0 and GO=0 */
> +            ret = FIELD_DP32(ret, PCI_DOE_CAP_CONTROL, DOE_INTR_EN,
> +                             doe_cap->ctrl.intr);
> +            DOE_DBG("  CONTROL REG=%x\n", ret);
> +        } else if (range_covers_byte(PCI_EXP_DOE_STATUS, sizeof(uint32_t), addr)) {
> +            ret = FIELD_DP32(ret, PCI_DOE_CAP_STATUS, DOE_BUSY,
> +                             doe_cap->status.busy);
> +            ret = FIELD_DP32(ret, PCI_DOE_CAP_STATUS, DOE_INTR_STATUS,
> +                             doe_cap->status.intr);
> +            ret = FIELD_DP32(ret, PCI_DOE_CAP_STATUS, DOE_ERROR,
> +                             doe_cap->status.error);
> +            ret = FIELD_DP32(ret, PCI_DOE_CAP_STATUS, DATA_OBJ_RDY,
> +                             doe_cap->status.ready);
> +        } else if (range_covers_byte(PCI_EXP_DOE_RD_DATA_MBOX, sizeof(uint32_t), addr)) {
> +            /* Check that READY is set */
> +            if (!doe_cap->status.ready) {
> +                /* Return 0 if not ready */
> +                DOE_DBG("  RD MBOX RETURN=%x\n", ret);
> +            } else {
> +                /* Deposit next DO DW into read mbox */
> +                DOE_DBG("  RD MBOX, DATA OBJECT READY,"
> +                        " CURRENT DO DW OFFSET=%x\n",
> +                        doe_cap->read_mbox_idx);
> +
> +                ret = doe_cap->read_mbox[doe_cap->read_mbox_idx];
> +
> +                DOE_DBG("  RD MBOX DW=%x\n", ret);
> +                DOE_DBG("  RD MBOX DW OFFSET=%d, RD MBOX LENGTH=%d\n",
> +                        doe_cap->read_mbox_idx, doe_cap->read_mbox_len);
> +            }
> +        } else if (range_covers_byte(PCI_EXP_DOE_WR_DATA_MBOX, sizeof(uint32_t), addr)) {
> +            DOE_DBG("  WR MBOX, local_val =%x\n", ret);
> +        }
> +    }
> +
> +    /* Alignment */
> +    shift = addr % sizeof(uint32_t);

Can these actually be unaligned? The whole range_covers_byte() stuff could go
away if not.

> +    if (shift) {
> +        ret >>= shift * 8;
> +    }
> +    mask >>= (sizeof(uint32_t) - size) * 8;
> +
> +    return ret & mask;
> +}
> +
> +void pcie_doe_write_config(DOECap *doe_cap,
> +                           uint32_t addr, uint32_t val, int size)
> +{
> +    PCIEDOE *doe = doe_cap->doe;
> +    uint16_t doe_offset = doe_cap->offset;
> +    int p;
> +    bool discard;
> +    uint32_t shift;
> +
> +    DOE_DBG("  addr=%x, val=%x, size=%x\n", addr, val, size);
> +
> +    /* If overlaps DOE range. PCIe Capability Header won't be included. */
> +    if (range_covers_byte(doe_offset + PCI_EXP_DOE_CAP, PCI_DOE_SIZEOF - 4, addr)) {
> +        /* Alignment */
> +        shift = addr % sizeof(uint32_t);
> +        addr -= (doe_offset - shift);
> +        val <<= shift * 8;
> +
> +        switch (addr) {
> +        case PCI_EXP_DOE_CTRL:
> +            DOE_DBG("  CONTROL=%x\n", val);
> +            if (FIELD_EX32(val, PCI_DOE_CAP_CONTROL, DOE_ABORT)) {
> +                /* If ABORT, clear status reg DOE Error and DOE Ready */
> +                DOE_DBG("  Setting ABORT\n");
> +                pcie_doe_set_ready(doe_cap, 0);
> +                pcie_doe_set_error(doe_cap, 0);
> +                pcie_doe_reset_mbox(doe_cap);
> +            } else if (FIELD_EX32(val, PCI_DOE_CAP_CONTROL, DOE_GO)) {
> +                DOE_DBG("  CONTROL GO=%x\n", val);
> +                /*
> +                 * Check protocol the incoming request in write_mbox and
> +                 * memcpy the corresponding response to read_mbox.
> +                 *
> +                 * "discard" should be set up if the response callback
> +                 * function could not deal with request (e.g. length
> +                 * mismatch) or the protocol of request was not found.
> +                 */
> +                discard = DOE_DISCARD;
> +                for (p = 0; p < doe->protocol_num; p++) {
> +                    if (doe_cap->write_mbox[0] ==
> +                        pcie_doe_build_protocol(&doe->protocols[p])) {
> +                        /* Found */
> +                        DOE_DBG("  DOE PROTOCOL=%x\n", doe_cap->write_mbox[0]);
> +                        /*
> +                         * Spec:
> +                         * If the number of DW transferred does not match the
> +                         * indicated Length for a data object, then the
> +                         * data object must be silently discarded.
> +                         */
> +                        if (doe_cap->write_mbox_len ==
> +                            pcie_doe_object_len(pcie_doe_get_req(doe_cap)))
> +                            discard = doe->protocols[p].set_rsp(doe_cap);
> +                        break;
> +                    }
> +                }
> +
> +                /* Set DOE Ready */
> +                if (!discard) {
> +                    pcie_doe_set_ready(doe_cap, 1);
> +                } else {
> +                    pcie_doe_reset_mbox(doe_cap);
> +                }
> +            } else if (FIELD_EX32(val, PCI_DOE_CAP_CONTROL, DOE_INTR_EN)) {
> +                doe_cap->ctrl.intr = 1;
> +            }
> +            break;
> +        case PCI_EXP_DOE_RD_DATA_MBOX:
> +            /* Read MBOX */
> +            DOE_DBG("  INCR RD MBOX DO DW OFFSET=%d\n", doe_cap->read_mbox_idx);
> +            doe_cap->read_mbox_idx += 1;
> +            /* Last DW */
> +            if (doe_cap->read_mbox_idx >= doe_cap->read_mbox_len) {
> +                pcie_doe_reset_mbox(doe_cap);
> +                pcie_doe_set_ready(doe_cap, 0);
> +            }
> +            break;
> +        case PCI_EXP_DOE_WR_DATA_MBOX:
> +            /* Write MBOX */
> +            DOE_DBG("  WR MBOX=%x, DW OFFSET = %d\n", val, doe_cap->write_mbox_len);
> +            pcie_doe_write_mbox(doe_cap, val);
> +            break;
> +        case PCI_EXP_DOE_STATUS:
> +        case PCI_EXP_DOE_CAP:
> +        default:
> +            break;
> +        }
> +    }
> +}
> diff --git a/include/hw/pci/pci_ids.h b/include/hw/pci/pci_ids.h
> index 76bf3ed..636b2e8 100644
> --- a/include/hw/pci/pci_ids.h
> +++ b/include/hw/pci/pci_ids.h
> @@ -157,6 +157,8 @@
>  
>  /* Vendors and devices.  Sort key: vendor first, device next. */
>  
> +#define PCI_VENDOR_ID_PCI_SIG            0x0001
> +
>  #define PCI_VENDOR_ID_LSI_LOGIC          0x1000
>  #define PCI_DEVICE_ID_LSI_53C810         0x0001
>  #define PCI_DEVICE_ID_LSI_53C895A        0x0012
> diff --git a/include/hw/pci/pcie.h b/include/hw/pci/pcie.h
> index 14c58eb..47d6f66 100644
> --- a/include/hw/pci/pcie.h
> +++ b/include/hw/pci/pcie.h
> @@ -25,6 +25,7 @@
>  #include "hw/pci/pcie_regs.h"
>  #include "hw/pci/pcie_aer.h"
>  #include "hw/hotplug.h"
> +#include "hw/pci/pcie_doe.h"
>  
>  typedef enum {
>      /* for attention and power indicator */
> diff --git a/include/hw/pci/pcie_doe.h b/include/hw/pci/pcie_doe.h
> new file mode 100644
> index 0000000..f497075
> --- /dev/null
> +++ b/include/hw/pci/pcie_doe.h
> @@ -0,0 +1,166 @@
> +#ifndef PCIE_DOE_H
> +#define PCIE_DOE_H
> +
> +#include "qemu/range.h"
> +#include "qemu/typedefs.h"
> +#include "hw/register.h"
> +
> +/* 
> + * PCI DOE register defines 7.9.xx 
> + */

Whitespace issues

> +/* DOE Capabilities Register */
> +#define PCI_EXP_DOE_CAP             0x04
> +REG32(PCI_DOE_CAP_REG, 0)
> +    FIELD(PCI_DOE_CAP_REG, INTR_SUPP, 0, 1)
> +    FIELD(PCI_DOE_CAP_REG, DOE_INTR_MSG_NUM, 1, 11)
> +
> +/* DOE Control Register */
> +#define PCI_EXP_DOE_CTRL            0x08
> +REG32(PCI_DOE_CAP_CONTROL, 0)
> +    FIELD(PCI_DOE_CAP_CONTROL, DOE_ABORT, 0, 1)
> +    FIELD(PCI_DOE_CAP_CONTROL, DOE_INTR_EN, 1, 1)
> +    FIELD(PCI_DOE_CAP_CONTROL, DOE_GO, 31, 1)
> +
> +/* DOE Status Register  */
> +#define PCI_EXP_DOE_STATUS          0x0c
> +REG32(PCI_DOE_CAP_STATUS, 0)
> +    FIELD(PCI_DOE_CAP_STATUS, DOE_BUSY, 0, 1)
> +    FIELD(PCI_DOE_CAP_STATUS, DOE_INTR_STATUS, 1, 1)
> +    FIELD(PCI_DOE_CAP_STATUS, DOE_ERROR, 2, 1)
> +    FIELD(PCI_DOE_CAP_STATUS, DATA_OBJ_RDY, 31, 1)
> +
> +/* DOE Write Data Mailbox Register  */
> +#define PCI_EXP_DOE_WR_DATA_MBOX    0x10
> +
> +/* DOE Read Data Mailbox Register  */
> +#define PCI_EXP_DOE_RD_DATA_MBOX    0x14
> +
> +/* 
> + * PCI DOE register defines 7.9.xx 
> + */

Is this duplicated on purpose?

> +/* Table 7-x2 */
> +#define PCI_SIG_DOE_DISCOVERY       0x00
> +#define PCI_SIG_DOE_CMA             0x01
> +
> +#define DATA_OBJ_BUILD_HEADER1(v, p)  ((p << 16) | v)
> +
> +/* Figure 6-x1 */
> +#define DATA_OBJECT_HEADER1_OFFSET  0
> +#define DATA_OBJECT_HEADER2_OFFSET  1
> +#define DATA_OBJECT_CONTENT_OFFSET  2
> +
> +#define PCI_DOE_MAX_DW_SIZE (1 << 18)
> +#define PCI_DOE_PROTOCOL_MAX 256
> +
> +/*
> + * Self-defined Marco
> + */
> +/* Request/Response State */
> +#define DOE_SUCCESS 0
> +#define DOE_DISCARD 1
> +
> +/* An invalid vector number for MSI/MSI-x */
> +#define DOE_NO_INTR (~0)
> +
> +/*
> + * DOE Protocol - Data Object
> + */
> +typedef struct DOEHeader DOEHeader;
> +typedef struct DOEProtocol DOEProtocol;
> +typedef struct PCIEDOE PCIEDOE;
> +typedef struct DOECap DOECap;
> +
> +struct DOEHeader {
> +    uint16_t vendor_id;
> +    uint8_t doe_type;
> +    uint8_t reserved;
> +    struct {
> +        uint32_t length:18;
> +        uint32_t reserved2:14;
> +    };
> +} QEMU_PACKED;
> +
> +/* Protocol infos and rsp function callback */
> +struct DOEProtocol {
> +    uint16_t vendor_id;
> +    uint8_t doe_type;
> +
> +    bool (*set_rsp)(DOECap *);
> +};
> +
> +struct PCIEDOE {
> +    PCIDevice *pdev;
> +    DOECap *head;
> +
> +    /* Protocols and its callback response */
> +    DOEProtocol protocols[PCI_DOE_PROTOCOL_MAX];
> +    uint32_t protocol_num;
> +};
> +
> +struct DOECap {
> +    PCIEDOE *doe;
> +
> +    /* Capability offset */
> +    uint16_t offset;
> +
> +    /* Next DOE capability */
> +    DOECap *next;
> +
> +    /* Capability */
> +    struct {
> +        bool intr;
> +        uint16_t vec;
> +    } cap;
> +
> +    /* Control */
> +    struct {
> +        bool abort;
> +        bool intr;
> +        bool go;
> +    } ctrl;
> +
> +    /* Status */
> +    struct {
> +        bool busy;
> +        bool intr;
> +        bool error;
> +        bool ready;
> +    } status;
> +
> +    /* Mailbox buffer for device */
> +    uint32_t *write_mbox;
> +    uint32_t *read_mbox;
> +
> +    /* Mailbox position indicator */
> +    uint32_t read_mbox_idx;
> +    uint32_t read_mbox_len;
> +    uint32_t write_mbox_len;
> +};
> +
> +void pcie_doe_init(PCIDevice *dev, PCIEDOE *doe);
> +int pcie_cap_doe_add(PCIEDOE *doe, uint16_t offset, bool intr, uint16_t vec);
> +void pcie_doe_uninit(PCIEDOE *doe);
> +void pcie_doe_register_protocol(PCIEDOE *doe, uint16_t vendor_id,
> +                                uint8_t doe_type,
> +                                bool (*set_rsp)(DOECap *));
> +uint32_t pcie_doe_build_protocol(DOEProtocol *p);
> +DOECap *pcie_doe_covers_addr(PCIEDOE *doe, uint32_t addr);
> +uint32_t pcie_doe_read_config(DOECap *doe_cap, uint32_t addr, int size);
> +void pcie_doe_write_config(DOECap *doe_cap, uint32_t addr,
> +                           uint32_t val, int size);
> +
> +/* Utility functions for set_rsp in DOEProtocol */
> +void *pcie_doe_get_req(DOECap *doe_cap);
> +void pcie_doe_set_rsp(DOECap *doe_cap, void *rsp);
> +uint32_t pcie_doe_object_len(void *obj);
> +
> +#define DOE_DEBUG
> +#ifdef DOE_DEBUG
> +#define DOE_DBG(...) printf(__VA_ARGS__)
> +#else
> +#define DOE_DBG(...) {}
> +#endif
> +
> +#define dwsizeof(s) ((sizeof(s) + 4 - 1) / 4)
> +
> +#endif /* PCIE_DOE_H */
> diff --git a/include/hw/pci/pcie_regs.h b/include/hw/pci/pcie_regs.h
> index 1db86b0..963dc2e 100644
> --- a/include/hw/pci/pcie_regs.h
> +++ b/include/hw/pci/pcie_regs.h
> @@ -179,4 +179,8 @@ typedef enum PCIExpLinkWidth {
>  #define PCI_ACS_VER                     0x1
>  #define PCI_ACS_SIZEOF                  8
>  > +/* DOE Capability Register Fields */
> +#define PCI_DOE_VER                     0x1
> +#define PCI_DOE_SIZEOF                  24
> +
>  #endif /* QEMU_PCIE_REGS_H */
> diff --git a/include/standard-headers/linux/pci_regs.h b/include/standard-headers/linux/pci_regs.h
> index e709ae8..4a7b7a4 100644
> --- a/include/standard-headers/linux/pci_regs.h
> +++ b/include/standard-headers/linux/pci_regs.h
> @@ -730,7 +730,8 @@
>  #define PCI_EXT_CAP_ID_DVSEC	0x23	/* Designated Vendor-Specific */
>  #define PCI_EXT_CAP_ID_DLF	0x25	/* Data Link Feature */
>  #define PCI_EXT_CAP_ID_PL_16GT	0x26	/* Physical Layer 16.0 GT/s */
> -#define PCI_EXT_CAP_ID_MAX	PCI_EXT_CAP_ID_PL_16GT
> +#define PCI_EXT_CAP_ID_DOE      0x2E    /* Data Object Exchange */
> +#define PCI_EXT_CAP_ID_MAX	PCI_EXT_CAP_ID_DOE
>  
>  #define PCI_EXT_CAP_DSN_SIZEOF	12
>  #define PCI_EXT_CAP_MCAST_ENDPOINT_SIZEOF 40

I haven't reviewed the spec stuff, but I assume Jonathan is familiar with that
and probably knows it almost by heart already.

> -- 
> 1.8.3.1
> 
>
Chris Browy Feb. 9, 2021, 10:10 p.m. UTC | #2
No consensus yet but I’d suggest that we’ll do the QEMU work and Jonathan focuses 
on the linux kernel and UEFI/edk2 and CXL SSWG efforts.  Seems like
a way to maximize resources and everyone’s contribution and expertise.  QEMU part
requires the least expertise which is why we’re best suited for it compared to other 
areas ;)

Review comments will be folded into next patch.

> On Feb 9, 2021, at 4:42 PM, Ben Widawsky <ben@bwidawsk.net> wrote:
> 
> Have you/Jonathan come to consensus about which implementation is going forward?
> I'd rather not have to review two :D
> 
> On 21-02-09 15:35:49, Chris Browy wrote:
>> ---
>> MAINTAINERS                               |   7 +
>> hw/pci/meson.build                        |   1 +
>> hw/pci/pcie.c                             |   2 +-
>> hw/pci/pcie_doe.c                         | 414 ++++++++++++++++++++++++++++++
>> include/hw/pci/pci_ids.h                  |   2 +
>> include/hw/pci/pcie.h                     |   1 +
>> include/hw/pci/pcie_doe.h                 | 166 ++++++++++++
>> include/hw/pci/pcie_regs.h                |   4 +
>> include/standard-headers/linux/pci_regs.h |   3 +-
>> 9 files changed, 598 insertions(+), 2 deletions(-)
>> create mode 100644 hw/pci/pcie_doe.c
>> create mode 100644 include/hw/pci/pcie_doe.h
>> 
>> diff --git a/MAINTAINERS b/MAINTAINERS
>> index 981dc92..4fb865e 100644
>> --- a/MAINTAINERS
>> +++ b/MAINTAINERS
>> @@ -1655,6 +1655,13 @@ F: docs/pci*
>> F: docs/specs/*pci*
>> F: default-configs/pci.mak
>> 
>> +PCIE DOE
>> +M: Huai-Cheng Kuo <hchkuo@avery-design.com.tw>
>> +M: Chris Browy <cbrowy@avery-design.com>
>> +S: Supported
>> +F: include/hw/pci/pcie_doe.h
>> +F: hw/pci/pcie_doe.c
>> +
>> ACPI/SMBIOS
>> M: Michael S. Tsirkin <mst@redhat.com>
>> M: Igor Mammedov <imammedo@redhat.com>
>> diff --git a/hw/pci/meson.build b/hw/pci/meson.build
>> index 5c4bbac..115e502 100644
>> --- a/hw/pci/meson.build
>> +++ b/hw/pci/meson.build
>> @@ -12,6 +12,7 @@ pci_ss.add(files(
>> # allow plugging PCIe devices into PCI buses, include them even if
>> # CONFIG_PCI_EXPRESS=n.
>> pci_ss.add(files('pcie.c', 'pcie_aer.c'))
>> +pci_ss.add(files('pcie_doe.c'))
> 
> It looks like this should be like the below line:
> softmmu_ss.add(when: 'CONFIG_PCI_EXPRESS', if_true: pci_doe.c))
> 
>> softmmu_ss.add(when: 'CONFIG_PCI_EXPRESS', if_true: files('pcie_port.c', 'pcie_host.c'))
>> softmmu_ss.add_all(when: 'CONFIG_PCI', if_true: pci_ss)
>> 
>> diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
>> index 1ecf6f6..f7516c4 100644
>> --- a/hw/pci/pcie.c
>> +++ b/hw/pci/pcie.c
>> @@ -735,7 +735,7 @@ void pcie_cap_slot_write_config(PCIDevice *dev,
>> 
>>     hotplug_event_notify(dev);
>> 
>> -    /* 
>> +    /*
> 
> Please drop this.
> 
>>      * 6.7.3.2 Command Completed Events
>>      *
>>      * Software issues a command to a hot-plug capable Downstream Port by
>> diff --git a/hw/pci/pcie_doe.c b/hw/pci/pcie_doe.c
>> new file mode 100644
>> index 0000000..df8e92e
>> --- /dev/null
>> +++ b/hw/pci/pcie_doe.c
>> @@ -0,0 +1,414 @@
>> +#include "qemu/osdep.h"
>> +#include "qemu/log.h"
>> +#include "qemu/error-report.h"
>> +#include "qapi/error.h"
>> +#include "qemu/range.h"
>> +#include "hw/pci/pci.h"
>> +#include "hw/pci/pcie.h"
>> +#include "hw/pci/pcie_doe.h"
>> +#include "hw/pci/msi.h"
>> +#include "hw/pci/msix.h"
>> +
>> +/*
>> + * DOE Default Protocols (Discovery, CMA)
>> + */
>> +/* Discovery Request Object */
>> +struct doe_discovery {
>> +    DOEHeader header;
>> +    uint8_t index;
>> +    uint8_t reserved[3];
>> +} QEMU_PACKED;
>> +
>> +/* Discovery Response Object */
>> +struct doe_discovery_rsp {
>> +    DOEHeader header;
>> +    uint16_t vendor_id;
>> +    uint8_t doe_type;
>> +    uint8_t next_index;
>> +} QEMU_PACKED;
>> +
>> +/* Callback for Discovery */
>> +static bool pcie_doe_discovery_rsp(DOECap *doe_cap)
>> +{
>> +    PCIEDOE *doe = doe_cap->doe;
>> +    struct doe_discovery *req = pcie_doe_get_req(doe_cap);
>> +    uint8_t index = req->index;
>> +    DOEProtocol *prot = NULL;
>> +
>> +    /* Request length mismatch, discard */
>> +    if (req->header.length < dwsizeof(struct doe_discovery)) {
> 
> Use DIV_ROUND_UP instead of rolling your own thing.
> 
>> +        return DOE_DISCARD;
>> +    }
>> +
>> +    /* Point to the requested protocol */
>> +    if (index < doe->protocol_num) {
>> +        prot = &doe->protocols[index];
>> +    }
> 
> What happens on else, should that still return DOE_SUCCESS?
> 
>> +
>> +    struct doe_discovery_rsp rsp = {
>> +        .header = {
>> +            .vendor_id = PCI_VENDOR_ID_PCI_SIG,
>> +            .doe_type = PCI_SIG_DOE_DISCOVERY,
>> +            .reserved = 0x0,
>> +            .length = dwsizeof(struct doe_discovery_rsp),
>> +        },
> 
> mixed declarations are not allowed.
> DIV_ROUND_UP
> 
>> +        .vendor_id = (prot) ? prot->vendor_id : 0xFFFF,
>> +        .doe_type = (prot) ? prot->doe_type : 0xFF,
>> +        .next_index = (index + 1) < doe->protocol_num ?
>> +                      (index + 1) : 0,
>> +    };
> 
> I prefer:
> next_index = (index + 1) % doe->protocol_num
> 
>> +
>> +    pcie_doe_set_rsp(doe_cap, &rsp);
>> +
>> +    return DOE_SUCCESS;
>> +}
>> +
>> +/* Callback for CMA */
>> +static bool pcie_doe_cma_rsp(DOECap *doe_cap)
>> +{
>> +    doe_cap->status.error = 1;
>> +
>> +    memset(doe_cap->read_mbox, 0,
>> +           PCI_DOE_MAX_DW_SIZE * sizeof(uint32_t));
>> +
>> +    doe_cap->write_mbox_len = 0;
>> +
>> +    return DOE_DISCARD;
>> +}
>> +
>> +/*
>> + * DOE Utilities
>> + */
>> +static void pcie_doe_reset_mbox(DOECap *st)
>> +{
>> +    st->read_mbox_idx = 0;
>> +
>> +    st->read_mbox_len = 0;
>> +    st->write_mbox_len = 0;
>> +
>> +    memset(st->read_mbox, 0, PCI_DOE_MAX_DW_SIZE * sizeof(uint32_t));
>> +    memset(st->write_mbox, 0, PCI_DOE_MAX_DW_SIZE * sizeof(uint32_t));
>> +}
>> +
>> +/*
>> + * Initialize the list and protocol for a device.
>> + * This function won't add the DOE capabitity to your PCIe device.
>> + */
>> +void pcie_doe_init(PCIDevice *dev, PCIEDOE *doe)
>> +{
>> +    doe->pdev = dev;
>> +    doe->head = NULL;
>> +    doe->protocol_num = 0;
>> +
>> +    /* Register two default protocol */
>> +    //TODO : LINK LIST
> 
> Please do this for next rev of the patches.
> 
>> +    pcie_doe_register_protocol(doe, PCI_VENDOR_ID_PCI_SIG,
>> +            PCI_SIG_DOE_DISCOVERY, pcie_doe_discovery_rsp);
>> +    pcie_doe_register_protocol(doe, PCI_VENDOR_ID_PCI_SIG,
>> +            PCI_SIG_DOE_CMA, pcie_doe_cma_rsp);
>> +}
>> +
>> +int pcie_cap_doe_add(PCIEDOE *doe, uint16_t offset, bool intr, uint16_t vec) {
>> +    DOECap *new_cap, **ptr;
>> +    PCIDevice *dev = doe->pdev;
>> +
>> +    pcie_add_capability(dev, PCI_EXT_CAP_ID_DOE, PCI_DOE_VER, offset,
>> +                        PCI_DOE_SIZEOF);
>> +
>> +    ptr = &doe->head;
>> +    /* Insert the new DOE at the end of linked list */
>> +    while (*ptr) {
>> +        if (range_covers_byte((*ptr)->offset, PCI_DOE_SIZEOF, offset) ||
>> +            (*ptr)->cap.vec == vec) {
>> +            return -EINVAL;
>> +        }
>> +
>> +        ptr = &(*ptr)->next;
>> +    }
>> +    new_cap = g_malloc0(sizeof(DOECap));
>> +    *ptr = new_cap;
>> +
>> +    new_cap->doe = doe;
>> +    new_cap->offset = offset;
>> +    new_cap->next = NULL;
>> +
>> +    /* Configure MSI/MSI-X */
>> +    if (intr && (msi_present(dev) || msix_present(dev))) {
>> +        new_cap->cap.intr = intr;
>> +        new_cap->cap.vec = vec;
>> +    }
>> +
>> +    /* Set up W/R Mailbox buffer */
>> +    new_cap->write_mbox = g_malloc0(PCI_DOE_MAX_DW_SIZE * sizeof(uint32_t));
>> +    new_cap->read_mbox = g_malloc0(PCI_DOE_MAX_DW_SIZE * sizeof(uint32_t));
>> +
>> +    pcie_doe_reset_mbox(new_cap);
>> +
>> +    return 0;
>> +}
>> +
>> +void pcie_doe_uninit(PCIEDOE *doe) {
> 
> fini is the more idiomatically unix name for de/un init.
>> +    PCIDevice *dev = doe->pdev;
>> +    DOECap *cap;
>> +
>> +    pci_del_capability(dev, PCI_EXT_CAP_ID_DOE, PCI_DOE_SIZEOF);
>> +
>> +    cap = doe->head;
>> +    while (cap) {
>> +        doe->head = doe->head->next;
>> +
>> +        g_free(cap->read_mbox);
>> +        g_free(cap->write_mbox);
>> +        cap->read_mbox = NULL;
>> +        cap->write_mbox = NULL;
>> +        g_free(cap);
>> +        cap = doe->head;
>> +    }
>> +}
>> +
>> +void pcie_doe_register_protocol(PCIEDOE *doe, uint16_t vendor_id,
>> +        uint8_t doe_type, bool (*set_rsp)(DOECap *))
>> +{
>> +    /* Protocol num should not exceed 256 */
>> +    assert(doe->protocol_num < PCI_DOE_PROTOCOL_MAX);
>> +
>> +    doe->protocols[doe->protocol_num].vendor_id = vendor_id;
>> +    doe->protocols[doe->protocol_num].doe_type = doe_type;
>> +    doe->protocols[doe->protocol_num].set_rsp = set_rsp;
>> +
>> +    doe->protocol_num++;
>> +}
>> +
>> +uint32_t pcie_doe_build_protocol(DOEProtocol *p)
>> +{
>> +    return DATA_OBJ_BUILD_HEADER1(p->vendor_id, p->doe_type);
>> +}
>> +
>> +/* Return the pointer of DOE request in write mailbox buffer */
>> +void *pcie_doe_get_req(DOECap *doe_cap)
>> +{
>> +    return doe_cap->write_mbox;
>> +}
>> +
>> +/* Copy the response to read mailbox buffer */
>> +void pcie_doe_set_rsp(DOECap *doe_cap, void *rsp)
>> +{
>> +    uint32_t len = pcie_doe_object_len(rsp);
>> +
>> +    memcpy(doe_cap->read_mbox + doe_cap->read_mbox_len,
>> +           rsp, len * sizeof(uint32_t));
>> +    doe_cap->read_mbox_len += len;
>> +}
>> +
>> +/* Get Data Object length */
>> +uint32_t pcie_doe_object_len(void *obj)
>> +{
>> +    return (obj)? ((DOEHeader *)obj)->length : 0;
>> +}
>> +
>> +static void pcie_doe_write_mbox(DOECap *doe_cap, uint32_t val)
>> +{
>> +    memcpy(doe_cap->write_mbox + doe_cap->write_mbox_len, &val, sizeof(uint32_t));
> 
> doe_cap->write_mbox[doe_cap->write_mbox_len] = val?
> 
>> +
>> +    if (doe_cap->write_mbox_len == 1) {
>> +        DOE_DBG("  Capture DOE request DO length = %d\n", val & 0x0003ffff);
>> +    }
> 
> I don't have the spec in front of me, but this is begging for a comment on why 1
> is special.
> 
>> +
>> +    doe_cap->write_mbox_len++;
>> +}
>> +
>> +static void pcie_doe_irq_assert(DOECap *doe_cap)
>> +{
>> +    PCIDevice *dev = doe_cap->doe->pdev;
>> +
>> +    if (doe_cap->cap.intr && doe_cap->ctrl.intr) {
>> +        /* Interrupt notify */
>> +        if (msix_enabled(dev)) {
>> +            msix_notify(dev, doe_cap->cap.vec);
>> +        } else if (msi_enabled(dev)) {
>> +            msi_notify(dev, doe_cap->cap.vec);
>> +        }
>> +        /* Not support legacy IRQ */
>> +    }
>> +}
>> +
>> +static void pcie_doe_set_ready(DOECap *doe_cap, bool rdy)
>> +{
>> +    doe_cap->status.ready = rdy;
>> +
>> +    if (rdy) {
>> +        pcie_doe_irq_assert(doe_cap);
>> +    }
>> +}
>> +
>> +static void pcie_doe_set_error(DOECap *doe_cap, bool err)
>> +{
>> +    doe_cap->status.error = err;
>> +
>> +    if (err) {
>> +        pcie_doe_irq_assert(doe_cap);
>> +    }
>> +}
>> +
>> +DOECap *pcie_doe_covers_addr(PCIEDOE *doe, uint32_t addr)
>> +{
>> +    DOECap *ptr = doe->head;
>> +
>> +    /* If overlaps DOE range. PCIe Capability Header won't be included. */
>> +    while (ptr && 
>> +           !range_covers_byte(ptr->offset + PCI_EXP_DOE_CAP, PCI_DOE_SIZEOF - 4, addr)) {
>> +        ptr = ptr->next;
>> +    }
>> +    
>> +    return ptr;
>> +}
>> +
>> +uint32_t pcie_doe_read_config(DOECap *doe_cap,
>> +                              uint32_t addr, int size)
>> +{
>> +    uint32_t ret = 0, shift, mask = 0xFFFFFFFF;
>> +    uint16_t doe_offset = doe_cap->offset;
>> +
>> +    /* If overlaps DOE range. PCIe Capability Header won't be included. */
>> +    if (range_covers_byte(doe_offset + PCI_EXP_DOE_CAP, PCI_DOE_SIZEOF - 4, addr)) {
>> +        addr -= doe_offset;
>> +
>> +        if (range_covers_byte(PCI_EXP_DOE_CAP, sizeof(uint32_t), addr)) {
>> +            ret = FIELD_DP32(ret, PCI_DOE_CAP_REG, INTR_SUPP,
>> +                             doe_cap->cap.intr);
>> +            ret = FIELD_DP32(ret, PCI_DOE_CAP_REG, DOE_INTR_MSG_NUM,
>> +                             doe_cap->cap.vec);
>> +        } else if (range_covers_byte(PCI_EXP_DOE_CTRL, sizeof(uint32_t), addr)) {
>> +            /* Must return ABORT=0 and GO=0 */
>> +            ret = FIELD_DP32(ret, PCI_DOE_CAP_CONTROL, DOE_INTR_EN,
>> +                             doe_cap->ctrl.intr);
>> +            DOE_DBG("  CONTROL REG=%x\n", ret);
>> +        } else if (range_covers_byte(PCI_EXP_DOE_STATUS, sizeof(uint32_t), addr)) {
>> +            ret = FIELD_DP32(ret, PCI_DOE_CAP_STATUS, DOE_BUSY,
>> +                             doe_cap->status.busy);
>> +            ret = FIELD_DP32(ret, PCI_DOE_CAP_STATUS, DOE_INTR_STATUS,
>> +                             doe_cap->status.intr);
>> +            ret = FIELD_DP32(ret, PCI_DOE_CAP_STATUS, DOE_ERROR,
>> +                             doe_cap->status.error);
>> +            ret = FIELD_DP32(ret, PCI_DOE_CAP_STATUS, DATA_OBJ_RDY,
>> +                             doe_cap->status.ready);
>> +        } else if (range_covers_byte(PCI_EXP_DOE_RD_DATA_MBOX, sizeof(uint32_t), addr)) {
>> +            /* Check that READY is set */
>> +            if (!doe_cap->status.ready) {
>> +                /* Return 0 if not ready */
>> +                DOE_DBG("  RD MBOX RETURN=%x\n", ret);
>> +            } else {
>> +                /* Deposit next DO DW into read mbox */
>> +                DOE_DBG("  RD MBOX, DATA OBJECT READY,"
>> +                        " CURRENT DO DW OFFSET=%x\n",
>> +                        doe_cap->read_mbox_idx);
>> +
>> +                ret = doe_cap->read_mbox[doe_cap->read_mbox_idx];
>> +
>> +                DOE_DBG("  RD MBOX DW=%x\n", ret);
>> +                DOE_DBG("  RD MBOX DW OFFSET=%d, RD MBOX LENGTH=%d\n",
>> +                        doe_cap->read_mbox_idx, doe_cap->read_mbox_len);
>> +            }
>> +        } else if (range_covers_byte(PCI_EXP_DOE_WR_DATA_MBOX, sizeof(uint32_t), addr)) {
>> +            DOE_DBG("  WR MBOX, local_val =%x\n", ret);
>> +        }
>> +    }
>> +
>> +    /* Alignment */
>> +    shift = addr % sizeof(uint32_t);
> 
> Can these actually be unaligned? The whole range_covers_byte() stuff could go
> away if not.
> 
>> +    if (shift) {
>> +        ret >>= shift * 8;
>> +    }
>> +    mask >>= (sizeof(uint32_t) - size) * 8;
>> +
>> +    return ret & mask;
>> +}
>> +
>> +void pcie_doe_write_config(DOECap *doe_cap,
>> +                           uint32_t addr, uint32_t val, int size)
>> +{
>> +    PCIEDOE *doe = doe_cap->doe;
>> +    uint16_t doe_offset = doe_cap->offset;
>> +    int p;
>> +    bool discard;
>> +    uint32_t shift;
>> +
>> +    DOE_DBG("  addr=%x, val=%x, size=%x\n", addr, val, size);
>> +
>> +    /* If overlaps DOE range. PCIe Capability Header won't be included. */
>> +    if (range_covers_byte(doe_offset + PCI_EXP_DOE_CAP, PCI_DOE_SIZEOF - 4, addr)) {
>> +        /* Alignment */
>> +        shift = addr % sizeof(uint32_t);
>> +        addr -= (doe_offset - shift);
>> +        val <<= shift * 8;
>> +
>> +        switch (addr) {
>> +        case PCI_EXP_DOE_CTRL:
>> +            DOE_DBG("  CONTROL=%x\n", val);
>> +            if (FIELD_EX32(val, PCI_DOE_CAP_CONTROL, DOE_ABORT)) {
>> +                /* If ABORT, clear status reg DOE Error and DOE Ready */
>> +                DOE_DBG("  Setting ABORT\n");
>> +                pcie_doe_set_ready(doe_cap, 0);
>> +                pcie_doe_set_error(doe_cap, 0);
>> +                pcie_doe_reset_mbox(doe_cap);
>> +            } else if (FIELD_EX32(val, PCI_DOE_CAP_CONTROL, DOE_GO)) {
>> +                DOE_DBG("  CONTROL GO=%x\n", val);
>> +                /*
>> +                 * Check protocol the incoming request in write_mbox and
>> +                 * memcpy the corresponding response to read_mbox.
>> +                 *
>> +                 * "discard" should be set up if the response callback
>> +                 * function could not deal with request (e.g. length
>> +                 * mismatch) or the protocol of request was not found.
>> +                 */
>> +                discard = DOE_DISCARD;
>> +                for (p = 0; p < doe->protocol_num; p++) {
>> +                    if (doe_cap->write_mbox[0] ==
>> +                        pcie_doe_build_protocol(&doe->protocols[p])) {
>> +                        /* Found */
>> +                        DOE_DBG("  DOE PROTOCOL=%x\n", doe_cap->write_mbox[0]);
>> +                        /*
>> +                         * Spec:
>> +                         * If the number of DW transferred does not match the
>> +                         * indicated Length for a data object, then the
>> +                         * data object must be silently discarded.
>> +                         */
>> +                        if (doe_cap->write_mbox_len ==
>> +                            pcie_doe_object_len(pcie_doe_get_req(doe_cap)))
>> +                            discard = doe->protocols[p].set_rsp(doe_cap);
>> +                        break;
>> +                    }
>> +                }
>> +
>> +                /* Set DOE Ready */
>> +                if (!discard) {
>> +                    pcie_doe_set_ready(doe_cap, 1);
>> +                } else {
>> +                    pcie_doe_reset_mbox(doe_cap);
>> +                }
>> +            } else if (FIELD_EX32(val, PCI_DOE_CAP_CONTROL, DOE_INTR_EN)) {
>> +                doe_cap->ctrl.intr = 1;
>> +            }
>> +            break;
>> +        case PCI_EXP_DOE_RD_DATA_MBOX:
>> +            /* Read MBOX */
>> +            DOE_DBG("  INCR RD MBOX DO DW OFFSET=%d\n", doe_cap->read_mbox_idx);
>> +            doe_cap->read_mbox_idx += 1;
>> +            /* Last DW */
>> +            if (doe_cap->read_mbox_idx >= doe_cap->read_mbox_len) {
>> +                pcie_doe_reset_mbox(doe_cap);
>> +                pcie_doe_set_ready(doe_cap, 0);
>> +            }
>> +            break;
>> +        case PCI_EXP_DOE_WR_DATA_MBOX:
>> +            /* Write MBOX */
>> +            DOE_DBG("  WR MBOX=%x, DW OFFSET = %d\n", val, doe_cap->write_mbox_len);
>> +            pcie_doe_write_mbox(doe_cap, val);
>> +            break;
>> +        case PCI_EXP_DOE_STATUS:
>> +        case PCI_EXP_DOE_CAP:
>> +        default:
>> +            break;
>> +        }
>> +    }
>> +}
>> diff --git a/include/hw/pci/pci_ids.h b/include/hw/pci/pci_ids.h
>> index 76bf3ed..636b2e8 100644
>> --- a/include/hw/pci/pci_ids.h
>> +++ b/include/hw/pci/pci_ids.h
>> @@ -157,6 +157,8 @@
>> 
>> /* Vendors and devices.  Sort key: vendor first, device next. */
>> 
>> +#define PCI_VENDOR_ID_PCI_SIG            0x0001
>> +
>> #define PCI_VENDOR_ID_LSI_LOGIC          0x1000
>> #define PCI_DEVICE_ID_LSI_53C810         0x0001
>> #define PCI_DEVICE_ID_LSI_53C895A        0x0012
>> diff --git a/include/hw/pci/pcie.h b/include/hw/pci/pcie.h
>> index 14c58eb..47d6f66 100644
>> --- a/include/hw/pci/pcie.h
>> +++ b/include/hw/pci/pcie.h
>> @@ -25,6 +25,7 @@
>> #include "hw/pci/pcie_regs.h"
>> #include "hw/pci/pcie_aer.h"
>> #include "hw/hotplug.h"
>> +#include "hw/pci/pcie_doe.h"
>> 
>> typedef enum {
>>     /* for attention and power indicator */
>> diff --git a/include/hw/pci/pcie_doe.h b/include/hw/pci/pcie_doe.h
>> new file mode 100644
>> index 0000000..f497075
>> --- /dev/null
>> +++ b/include/hw/pci/pcie_doe.h
>> @@ -0,0 +1,166 @@
>> +#ifndef PCIE_DOE_H
>> +#define PCIE_DOE_H
>> +
>> +#include "qemu/range.h"
>> +#include "qemu/typedefs.h"
>> +#include "hw/register.h"
>> +
>> +/* 
>> + * PCI DOE register defines 7.9.xx 
>> + */
> 
> Whitespace issues
> 
>> +/* DOE Capabilities Register */
>> +#define PCI_EXP_DOE_CAP             0x04
>> +REG32(PCI_DOE_CAP_REG, 0)
>> +    FIELD(PCI_DOE_CAP_REG, INTR_SUPP, 0, 1)
>> +    FIELD(PCI_DOE_CAP_REG, DOE_INTR_MSG_NUM, 1, 11)
>> +
>> +/* DOE Control Register */
>> +#define PCI_EXP_DOE_CTRL            0x08
>> +REG32(PCI_DOE_CAP_CONTROL, 0)
>> +    FIELD(PCI_DOE_CAP_CONTROL, DOE_ABORT, 0, 1)
>> +    FIELD(PCI_DOE_CAP_CONTROL, DOE_INTR_EN, 1, 1)
>> +    FIELD(PCI_DOE_CAP_CONTROL, DOE_GO, 31, 1)
>> +
>> +/* DOE Status Register  */
>> +#define PCI_EXP_DOE_STATUS          0x0c
>> +REG32(PCI_DOE_CAP_STATUS, 0)
>> +    FIELD(PCI_DOE_CAP_STATUS, DOE_BUSY, 0, 1)
>> +    FIELD(PCI_DOE_CAP_STATUS, DOE_INTR_STATUS, 1, 1)
>> +    FIELD(PCI_DOE_CAP_STATUS, DOE_ERROR, 2, 1)
>> +    FIELD(PCI_DOE_CAP_STATUS, DATA_OBJ_RDY, 31, 1)
>> +
>> +/* DOE Write Data Mailbox Register  */
>> +#define PCI_EXP_DOE_WR_DATA_MBOX    0x10
>> +
>> +/* DOE Read Data Mailbox Register  */
>> +#define PCI_EXP_DOE_RD_DATA_MBOX    0x14
>> +
>> +/* 
>> + * PCI DOE register defines 7.9.xx 
>> + */
> 
> Is this duplicated on purpose?
> 
>> +/* Table 7-x2 */
>> +#define PCI_SIG_DOE_DISCOVERY       0x00
>> +#define PCI_SIG_DOE_CMA             0x01
>> +
>> +#define DATA_OBJ_BUILD_HEADER1(v, p)  ((p << 16) | v)
>> +
>> +/* Figure 6-x1 */
>> +#define DATA_OBJECT_HEADER1_OFFSET  0
>> +#define DATA_OBJECT_HEADER2_OFFSET  1
>> +#define DATA_OBJECT_CONTENT_OFFSET  2
>> +
>> +#define PCI_DOE_MAX_DW_SIZE (1 << 18)
>> +#define PCI_DOE_PROTOCOL_MAX 256
>> +
>> +/*
>> + * Self-defined Marco
>> + */
>> +/* Request/Response State */
>> +#define DOE_SUCCESS 0
>> +#define DOE_DISCARD 1
>> +
>> +/* An invalid vector number for MSI/MSI-x */
>> +#define DOE_NO_INTR (~0)
>> +
>> +/*
>> + * DOE Protocol - Data Object
>> + */
>> +typedef struct DOEHeader DOEHeader;
>> +typedef struct DOEProtocol DOEProtocol;
>> +typedef struct PCIEDOE PCIEDOE;
>> +typedef struct DOECap DOECap;
>> +
>> +struct DOEHeader {
>> +    uint16_t vendor_id;
>> +    uint8_t doe_type;
>> +    uint8_t reserved;
>> +    struct {
>> +        uint32_t length:18;
>> +        uint32_t reserved2:14;
>> +    };
>> +} QEMU_PACKED;
>> +
>> +/* Protocol infos and rsp function callback */
>> +struct DOEProtocol {
>> +    uint16_t vendor_id;
>> +    uint8_t doe_type;
>> +
>> +    bool (*set_rsp)(DOECap *);
>> +};
>> +
>> +struct PCIEDOE {
>> +    PCIDevice *pdev;
>> +    DOECap *head;
>> +
>> +    /* Protocols and its callback response */
>> +    DOEProtocol protocols[PCI_DOE_PROTOCOL_MAX];
>> +    uint32_t protocol_num;
>> +};
>> +
>> +struct DOECap {
>> +    PCIEDOE *doe;
>> +
>> +    /* Capability offset */
>> +    uint16_t offset;
>> +
>> +    /* Next DOE capability */
>> +    DOECap *next;
>> +
>> +    /* Capability */
>> +    struct {
>> +        bool intr;
>> +        uint16_t vec;
>> +    } cap;
>> +
>> +    /* Control */
>> +    struct {
>> +        bool abort;
>> +        bool intr;
>> +        bool go;
>> +    } ctrl;
>> +
>> +    /* Status */
>> +    struct {
>> +        bool busy;
>> +        bool intr;
>> +        bool error;
>> +        bool ready;
>> +    } status;
>> +
>> +    /* Mailbox buffer for device */
>> +    uint32_t *write_mbox;
>> +    uint32_t *read_mbox;
>> +
>> +    /* Mailbox position indicator */
>> +    uint32_t read_mbox_idx;
>> +    uint32_t read_mbox_len;
>> +    uint32_t write_mbox_len;
>> +};
>> +
>> +void pcie_doe_init(PCIDevice *dev, PCIEDOE *doe);
>> +int pcie_cap_doe_add(PCIEDOE *doe, uint16_t offset, bool intr, uint16_t vec);
>> +void pcie_doe_uninit(PCIEDOE *doe);
>> +void pcie_doe_register_protocol(PCIEDOE *doe, uint16_t vendor_id,
>> +                                uint8_t doe_type,
>> +                                bool (*set_rsp)(DOECap *));
>> +uint32_t pcie_doe_build_protocol(DOEProtocol *p);
>> +DOECap *pcie_doe_covers_addr(PCIEDOE *doe, uint32_t addr);
>> +uint32_t pcie_doe_read_config(DOECap *doe_cap, uint32_t addr, int size);
>> +void pcie_doe_write_config(DOECap *doe_cap, uint32_t addr,
>> +                           uint32_t val, int size);
>> +
>> +/* Utility functions for set_rsp in DOEProtocol */
>> +void *pcie_doe_get_req(DOECap *doe_cap);
>> +void pcie_doe_set_rsp(DOECap *doe_cap, void *rsp);
>> +uint32_t pcie_doe_object_len(void *obj);
>> +
>> +#define DOE_DEBUG
>> +#ifdef DOE_DEBUG
>> +#define DOE_DBG(...) printf(__VA_ARGS__)
>> +#else
>> +#define DOE_DBG(...) {}
>> +#endif
>> +
>> +#define dwsizeof(s) ((sizeof(s) + 4 - 1) / 4)
>> +
>> +#endif /* PCIE_DOE_H */
>> diff --git a/include/hw/pci/pcie_regs.h b/include/hw/pci/pcie_regs.h
>> index 1db86b0..963dc2e 100644
>> --- a/include/hw/pci/pcie_regs.h
>> +++ b/include/hw/pci/pcie_regs.h
>> @@ -179,4 +179,8 @@ typedef enum PCIExpLinkWidth {
>> #define PCI_ACS_VER                     0x1
>> #define PCI_ACS_SIZEOF                  8
>>> +/* DOE Capability Register Fields */
>> +#define PCI_DOE_VER                     0x1
>> +#define PCI_DOE_SIZEOF                  24
>> +
>> #endif /* QEMU_PCIE_REGS_H */
>> diff --git a/include/standard-headers/linux/pci_regs.h b/include/standard-headers/linux/pci_regs.h
>> index e709ae8..4a7b7a4 100644
>> --- a/include/standard-headers/linux/pci_regs.h
>> +++ b/include/standard-headers/linux/pci_regs.h
>> @@ -730,7 +730,8 @@
>> #define PCI_EXT_CAP_ID_DVSEC	0x23	/* Designated Vendor-Specific */
>> #define PCI_EXT_CAP_ID_DLF	0x25	/* Data Link Feature */
>> #define PCI_EXT_CAP_ID_PL_16GT	0x26	/* Physical Layer 16.0 GT/s */
>> -#define PCI_EXT_CAP_ID_MAX	PCI_EXT_CAP_ID_PL_16GT
>> +#define PCI_EXT_CAP_ID_DOE      0x2E    /* Data Object Exchange */
>> +#define PCI_EXT_CAP_ID_MAX	PCI_EXT_CAP_ID_DOE
>> 
>> #define PCI_EXT_CAP_DSN_SIZEOF	12
>> #define PCI_EXT_CAP_MCAST_ENDPOINT_SIZEOF 40
> 
> I haven't reviewed the spec stuff, but I assume Jonathan is familiar with that
> and probably knows it almost by heart already.
> 
>> -- 
>> 1.8.3.1
>> 
>>
Jonathan Cameron Feb. 12, 2021, 4:24 p.m. UTC | #3
On Tue, 9 Feb 2021 15:35:49 -0500
Chris Browy <cbrowy@avery-design.com> wrote:

Run ./scripts/checkpatch.pl over the patches and fix all the warnings before
posting.  It will save time by clearing out most of the minor formatting issues
and similar that inevitably sneak in during development.

The biggest issue I'm seeing in here is that the abstraction of
multiple DOE capabiltiies accessing same protocols doesn't make sense.

Each DOE ecap region and hence mailbox can have it's own set of
(possibly  overlapping) protocols.

From the ECN:
"It is permitted for a protocol using data object exchanges to require
 that a Function implement a unique instance of DOE for that specific
 protocol, and/or to allow sharing of a DOE instance to only a specific
 set of protocols using data object exchange, and/or to allow a Function
 to implement multiple instances of DOE supporting the specific protocol."

Tightly couple the ECAP and DOE.  If we are in the multiple instances
of DOE supporting a specific protocol case, then register it separately
for each one.  The individual device emulation then needs to deal with
any possible clashes etc.

Various comments inline, but mostly small stuff.

Jonathan


> ---
>  MAINTAINERS                               |   7 +
>  hw/pci/meson.build                        |   1 +
>  hw/pci/pcie.c                             |   2 +-
>  hw/pci/pcie_doe.c                         | 414 ++++++++++++++++++++++++++++++
>  include/hw/pci/pci_ids.h                  |   2 +
>  include/hw/pci/pcie.h                     |   1 +
>  include/hw/pci/pcie_doe.h                 | 166 ++++++++++++
>  include/hw/pci/pcie_regs.h                |   4 +
>  include/standard-headers/linux/pci_regs.h |   3 +-
>  9 files changed, 598 insertions(+), 2 deletions(-)
>  create mode 100644 hw/pci/pcie_doe.c
>  create mode 100644 include/hw/pci/pcie_doe.h
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 981dc92..4fb865e 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -1655,6 +1655,13 @@ F: docs/pci*
>  F: docs/specs/*pci*
>  F: default-configs/pci.mak
>  
> +PCIE DOE
> +M: Huai-Cheng Kuo <hchkuo@avery-design.com.tw>
> +M: Chris Browy <cbrowy@avery-design.com>
> +S: Supported
> +F: include/hw/pci/pcie_doe.h
> +F: hw/pci/pcie_doe.c
> +
>  ACPI/SMBIOS
>  M: Michael S. Tsirkin <mst@redhat.com>
>  M: Igor Mammedov <imammedo@redhat.com>
> diff --git a/hw/pci/meson.build b/hw/pci/meson.build
> index 5c4bbac..115e502 100644
> --- a/hw/pci/meson.build
> +++ b/hw/pci/meson.build
> @@ -12,6 +12,7 @@ pci_ss.add(files(
>  # allow plugging PCIe devices into PCI buses, include them even if
>  # CONFIG_PCI_EXPRESS=n.
>  pci_ss.add(files('pcie.c', 'pcie_aer.c'))
> +pci_ss.add(files('pcie_doe.c'))
>  softmmu_ss.add(when: 'CONFIG_PCI_EXPRESS', if_true: files('pcie_port.c', 'pcie_host.c'))
>  softmmu_ss.add_all(when: 'CONFIG_PCI', if_true: pci_ss)
>  

...

> diff --git a/hw/pci/pcie_doe.c b/hw/pci/pcie_doe.c
> new file mode 100644
> index 0000000..df8e92e
> --- /dev/null
> +++ b/hw/pci/pcie_doe.c
> @@ -0,0 +1,414 @@

Add a copyright header / license etc before v3.

> +#include "qemu/osdep.h"
> +#include "qemu/log.h"
> +#include "qemu/error-report.h"
> +#include "qapi/error.h"
> +#include "qemu/range.h"
> +#include "hw/pci/pci.h"
> +#include "hw/pci/pcie.h"
> +#include "hw/pci/pcie_doe.h"
> +#include "hw/pci/msi.h"
> +#include "hw/pci/msix.h"
> +
> +/*
> + * DOE Default Protocols (Discovery, CMA)
> + */
> +/* Discovery Request Object */
> +struct doe_discovery {
> +    DOEHeader header;
> +    uint8_t index;
> +    uint8_t reserved[3];
> +} QEMU_PACKED;
> +
> +/* Discovery Response Object */
> +struct doe_discovery_rsp {
> +    DOEHeader header;
> +    uint16_t vendor_id;
> +    uint8_t doe_type;
> +    uint8_t next_index;
> +} QEMU_PACKED;
> +
> +/* Callback for Discovery */
> +static bool pcie_doe_discovery_rsp(DOECap *doe_cap)
> +{
> +    PCIEDOE *doe = doe_cap->doe;
> +    struct doe_discovery *req = pcie_doe_get_req(doe_cap);
> +    uint8_t index = req->index;
> +    DOEProtocol *prot = NULL;
> +
> +    /* Request length mismatch, discard */
> +    if (req->header.length < dwsizeof(struct doe_discovery)) {
> +        return DOE_DISCARD;

	return false;  Or better yet a meaningful error code to make debugging
easier.

> +    }
> +
> +    /* Point to the requested protocol */
> +    if (index < doe->protocol_num) {
> +        prot = &doe->protocols[index];
> +    }
> +
> +    struct doe_discovery_rsp rsp = {
> +        .header = {
> +            .vendor_id = PCI_VENDOR_ID_PCI_SIG,
> +            .doe_type = PCI_SIG_DOE_DISCOVERY,
> +            .reserved = 0x0,

Nice thing about c99 based structure assignment is that unspecified
elements are set to 0 automatically.  So you can drop this particular
element safely and end up with slightly cleaner code.

> +            .length = dwsizeof(struct doe_discovery_rsp),
> +        },
> +        .vendor_id = (prot) ? prot->vendor_id : 0xFFFF,
> +        .doe_type = (prot) ? prot->doe_type : 0xFF,
> +        .next_index = (index + 1) < doe->protocol_num ?
> +                      (index + 1) : 0,
> +    };
> +
> +    pcie_doe_set_rsp(doe_cap, &rsp);
> +
> +    return DOE_SUCCESS;
> +}
> +
> +/* Callback for CMA */
> +static bool pcie_doe_cma_rsp(DOECap *doe_cap)

I've not checked CMA, but I'd expect this to be a separate
patch as not part of core DOE support (or same ECN for that
matter).

> +{
> +    doe_cap->status.error = 1;
> +
> +    memset(doe_cap->read_mbox, 0,
> +           PCI_DOE_MAX_DW_SIZE * sizeof(uint32_t));
> +
> +    doe_cap->write_mbox_len = 0;
> +
> +    return DOE_DISCARD;
> +}
> +
> +/*
> + * DOE Utilities
> + */
> +static void pcie_doe_reset_mbox(DOECap *st)
> +{
> +    st->read_mbox_idx = 0;
> +
> +    st->read_mbox_len = 0;
> +    st->write_mbox_len = 0;
> +
> +    memset(st->read_mbox, 0, PCI_DOE_MAX_DW_SIZE * sizeof(uint32_t));
> +    memset(st->write_mbox, 0, PCI_DOE_MAX_DW_SIZE * sizeof(uint32_t));
> +}
> +
> +/*
> + * Initialize the list and protocol for a device.
> + * This function won't add the DOE capabitity to your PCIe device.
> + */
> +void pcie_doe_init(PCIDevice *dev, PCIEDOE *doe)
> +{
> +    doe->pdev = dev;
> +    doe->head = NULL;
> +    doe->protocol_num = 0;

No need to set things to zero as I assume you allocate it zero filled.
At least no point for things where zero is the obvious default.

> +
> +    /* Register two default protocol */
> +    //TODO : LINK LIST

Agreed :)

> +    pcie_doe_register_protocol(doe, PCI_VENDOR_ID_PCI_SIG,
> +            PCI_SIG_DOE_DISCOVERY, pcie_doe_discovery_rsp);
> +    pcie_doe_register_protocol(doe, PCI_VENDOR_ID_PCI_SIG,
> +            PCI_SIG_DOE_CMA, pcie_doe_cma_rsp);
> +}
> +
> +int pcie_cap_doe_add(PCIEDOE *doe, uint16_t offset, bool intr, uint16_t vec) {

Bracket on this line.

> +    DOECap *new_cap, **ptr;
> +    PCIDevice *dev = doe->pdev;
> +
> +    pcie_add_capability(dev, PCI_EXT_CAP_ID_DOE, PCI_DOE_VER, offset,
> +                        PCI_DOE_SIZEOF);
> +
> +    ptr = &doe->head;
> +    /* Insert the new DOE at the end of linked list */

As below, not sure this abstraction makes sense.

> +    while (*ptr) {
> +        if (range_covers_byte((*ptr)->offset, PCI_DOE_SIZEOF, offset) ||
> +            (*ptr)->cap.vec == vec) {
> +            return -EINVAL;
> +        }
> +
> +        ptr = &(*ptr)->next;
> +    }
> +    new_cap = g_malloc0(sizeof(DOECap));
> +    *ptr = new_cap;
> +
> +    new_cap->doe = doe;
> +    new_cap->offset = offset;
> +    new_cap->next = NULL;
> +
> +    /* Configure MSI/MSI-X */
> +    if (intr && (msi_present(dev) || msix_present(dev))) {
> +        new_cap->cap.intr = intr;
> +        new_cap->cap.vec = vec;
> +    }
> +
> +    /* Set up W/R Mailbox buffer */
> +    new_cap->write_mbox = g_malloc0(PCI_DOE_MAX_DW_SIZE * sizeof(uint32_t));
> +    new_cap->read_mbox = g_malloc0(PCI_DOE_MAX_DW_SIZE * sizeof(uint32_t));
> +
> +    pcie_doe_reset_mbox(new_cap);
> +
> +    return 0;
> +}
> +
> +void pcie_doe_uninit(PCIEDOE *doe) {
> +    PCIDevice *dev = doe->pdev;
> +    DOECap *cap;
> +
> +    pci_del_capability(dev, PCI_EXT_CAP_ID_DOE, PCI_DOE_SIZEOF);
> +
> +    cap = doe->head;
> +    while (cap) {
> +        doe->head = doe->head->next;
> +
> +        g_free(cap->read_mbox);
> +        g_free(cap->write_mbox);
> +        cap->read_mbox = NULL;
> +        cap->write_mbox = NULL;

I'd avoid setting things to NULL that you are throwing away.  It
implies that they will be reused or that it matters in somewhat which
isn't the case here.

> +        g_free(cap);
> +        cap = doe->head;
> +    }
> +}
> +
> +void pcie_doe_register_protocol(PCIEDOE *doe, uint16_t vendor_id,
> +        uint8_t doe_type, bool (*set_rsp)(DOECap *))
> +{
> +    /* Protocol num should not exceed 256 */
> +    assert(doe->protocol_num < PCI_DOE_PROTOCOL_MAX);
> +
> +    doe->protocols[doe->protocol_num].vendor_id = vendor_id;
> +    doe->protocols[doe->protocol_num].doe_type = doe_type;
> +    doe->protocols[doe->protocol_num].set_rsp = set_rsp;
> +
> +    doe->protocol_num++;
> +}
> +
> +uint32_t pcie_doe_build_protocol(DOEProtocol *p)
> +{
> +    return DATA_OBJ_BUILD_HEADER1(p->vendor_id, p->doe_type);
> +}
> +
> +/* Return the pointer of DOE request in write mailbox buffer */
> +void *pcie_doe_get_req(DOECap *doe_cap)
> +{
> +    return doe_cap->write_mbox;
> +}
> +
> +/* Copy the response to read mailbox buffer */
> +void pcie_doe_set_rsp(DOECap *doe_cap, void *rsp)
> +{
> +    uint32_t len = pcie_doe_object_len(rsp);
> +
> +    memcpy(doe_cap->read_mbox + doe_cap->read_mbox_len,
> +           rsp, len * sizeof(uint32_t));
> +    doe_cap->read_mbox_len += len;
> +}
> +
> +/* Get Data Object length */
> +uint32_t pcie_doe_object_len(void *obj)
> +{
> +    return (obj)? ((DOEHeader *)obj)->length : 0;
> +}
> +
> +static void pcie_doe_write_mbox(DOECap *doe_cap, uint32_t val)
> +{
> +    memcpy(doe_cap->write_mbox + doe_cap->write_mbox_len, &val, sizeof(uint32_t));
> +
> +    if (doe_cap->write_mbox_len == 1) {
> +        DOE_DBG("  Capture DOE request DO length = %d\n", val & 0x0003ffff);
> +    }
> +
> +    doe_cap->write_mbox_len++;

We probably need to check that the mailbox write was full dword as the spec
requires.  No idea what we do if it isn't as spec doesn't seem to say...
I've queried one of our PCI experts.


> +}
> +
> +static void pcie_doe_irq_assert(DOECap *doe_cap)
> +{
> +    PCIDevice *dev = doe_cap->doe->pdev;
> +
> +    if (doe_cap->cap.intr && doe_cap->ctrl.intr) {
> +        /* Interrupt notify */
> +        if (msix_enabled(dev)) {
> +            msix_notify(dev, doe_cap->cap.vec);
> +        } else if (msi_enabled(dev)) {
> +            msi_notify(dev, doe_cap->cap.vec);
> +        }
> +        /* Not support legacy IRQ */
> +    }
> +}
> +
> +static void pcie_doe_set_ready(DOECap *doe_cap, bool rdy)
> +{
> +    doe_cap->status.ready = rdy;
> +
> +    if (rdy) {
> +        pcie_doe_irq_assert(doe_cap);
> +    }
> +}
> +
> +static void pcie_doe_set_error(DOECap *doe_cap, bool err)
> +{
> +    doe_cap->status.error = err;
> +
> +    if (err) {
> +        pcie_doe_irq_assert(doe_cap);
> +    }
> +}
> +
> +DOECap *pcie_doe_covers_addr(PCIEDOE *doe, uint32_t addr)
> +{
> +    DOECap *ptr = doe->head;
> +
> +    /* If overlaps DOE range. PCIe Capability Header won't be included. */
> +    while (ptr && 
> +           !range_covers_byte(ptr->offset + PCI_EXP_DOE_CAP, PCI_DOE_SIZEOF - 4, addr)) {
> +        ptr = ptr->next;
> +    }
> +    
> +    return ptr;
> +}
> +
> +uint32_t pcie_doe_read_config(DOECap *doe_cap,
> +                              uint32_t addr, int size)
> +{
> +    uint32_t ret = 0, shift, mask = 0xFFFFFFFF;
> +    uint16_t doe_offset = doe_cap->offset;
> +
> +    /* If overlaps DOE range. PCIe Capability Header won't be included. */
> +    if (range_covers_byte(doe_offset + PCI_EXP_DOE_CAP, PCI_DOE_SIZEOF - 4, addr)) {

I'd flip this around to reduce indentation + no need to be careful with alignment etc
if we aren't returning anything.

       if (!range_cover_byte(doe_offset + PCI_EXP_DOE_CAP, PCI_DOE_SIZEOF - 4, addr)
		return 0;


> +        addr -= doe_offset;
> +
> +        if (range_covers_byte(PCI_EXP_DOE_CAP, sizeof(uint32_t), addr)) {
> +            ret = FIELD_DP32(ret, PCI_DOE_CAP_REG, INTR_SUPP,
> +                             doe_cap->cap.intr);
> +            ret = FIELD_DP32(ret, PCI_DOE_CAP_REG, DOE_INTR_MSG_NUM,
> +                             doe_cap->cap.vec);
> +        } else if (range_covers_byte(PCI_EXP_DOE_CTRL, sizeof(uint32_t), addr)) {
> +            /* Must return ABORT=0 and GO=0 */
> +            ret = FIELD_DP32(ret, PCI_DOE_CAP_CONTROL, DOE_INTR_EN,
> +                             doe_cap->ctrl.intr);
> +            DOE_DBG("  CONTROL REG=%x\n", ret);
> +        } else if (range_covers_byte(PCI_EXP_DOE_STATUS, sizeof(uint32_t), addr)) {
> +            ret = FIELD_DP32(ret, PCI_DOE_CAP_STATUS, DOE_BUSY,
> +                             doe_cap->status.busy);
> +            ret = FIELD_DP32(ret, PCI_DOE_CAP_STATUS, DOE_INTR_STATUS,
> +                             doe_cap->status.intr);
> +            ret = FIELD_DP32(ret, PCI_DOE_CAP_STATUS, DOE_ERROR,
> +                             doe_cap->status.error);
> +            ret = FIELD_DP32(ret, PCI_DOE_CAP_STATUS, DATA_OBJ_RDY,
> +                             doe_cap->status.ready);
> +        } else if (range_covers_byte(PCI_EXP_DOE_RD_DATA_MBOX, sizeof(uint32_t), addr)) {
> +            /* Check that READY is set */

I'd clean out any comment that is obvious from the code.
Comments get out of sync over time so there is a maintenance burden in having them such
that we only bother if they add information.

> +            if (!doe_cap->status.ready) {
> +                /* Return 0 if not ready */
> +                DOE_DBG("  RD MBOX RETURN=%x\n", ret);
> +            } else {
> +                /* Deposit next DO DW into read mbox */

This comment is missleading.  It might not be the 'next' one. If you read
the register twice for instance.  Much better not to have the comment
at all on basis code is fairly obvious anyway!

As mentioned below, a read of this when nothing there is an underflow and spec
suggests that is an error condition.

> +                DOE_DBG("  RD MBOX, DATA OBJECT READY,"
> +                        " CURRENT DO DW OFFSET=%x\n",
> +                        doe_cap->read_mbox_idx);
> +
> +                ret = doe_cap->read_mbox[doe_cap->read_mbox_idx];
> +
> +                DOE_DBG("  RD MBOX DW=%x\n", ret);
> +                DOE_DBG("  RD MBOX DW OFFSET=%d, RD MBOX LENGTH=%d\n",
> +                        doe_cap->read_mbox_idx, doe_cap->read_mbox_len);
> +            }
> +        } else if (range_covers_byte(PCI_EXP_DOE_WR_DATA_MBOX, sizeof(uint32_t), addr)) {
> +            DOE_DBG("  WR MBOX, local_val =%x\n", ret);
> +        }
> +    }
> +
> +    /* Alignment */
> +    shift = addr % sizeof(uint32_t);
> +    if (shift) {
> +        ret >>= shift * 8;
> +    }
> +    mask >>= (sizeof(uint32_t) - size) * 8;
> +
> +    return ret & mask;
> +}
> +
> +void pcie_doe_write_config(DOECap *doe_cap,
> +                           uint32_t addr, uint32_t val, int size)
> +{
> +    PCIEDOE *doe = doe_cap->doe;
> +    uint16_t doe_offset = doe_cap->offset;
> +    int p;
> +    bool discard;
> +    uint32_t shift;
> +
> +    DOE_DBG("  addr=%x, val=%x, size=%x\n", addr, val, size);
> +
> +    /* If overlaps DOE range. PCIe Capability Header won't be included. */
> +    if (range_covers_byte(doe_offset + PCI_EXP_DOE_CAP, PCI_DOE_SIZEOF - 4, addr)) {

As below, invert this condition and return early as it will simplify below and reduce
indentation.

     if (!range_covers_byte(doe_offset + PCI_EXP_DOE_CAP, PCI_DOE_SIZEOF - 4, addr)) {
          return;
    }

> +        /* Alignment */
> +        shift = addr % sizeof(uint32_t);
> +        addr -= (doe_offset - shift);
> +        val <<= shift * 8;
> +
> +        switch (addr) {
> +        case PCI_EXP_DOE_CTRL:
> +            DOE_DBG("  CONTROL=%x\n", val);
> +            if (FIELD_EX32(val, PCI_DOE_CAP_CONTROL, DOE_ABORT)) {
> +                /* If ABORT, clear status reg DOE Error and DOE Ready */
> +                DOE_DBG("  Setting ABORT\n");
> +                pcie_doe_set_ready(doe_cap, 0);
> +                pcie_doe_set_error(doe_cap, 0);
> +                pcie_doe_reset_mbox(doe_cap);
> +            } else if (FIELD_EX32(val, PCI_DOE_CAP_CONTROL, DOE_GO)) {
> +                DOE_DBG("  CONTROL GO=%x\n", val);

This case is complex enough I'd factor it out as it's own function.

> +                /*
> +                 * Check protocol the incoming request in write_mbox and
> +                 * memcpy the corresponding response to read_mbox.
> +                 *
> +                 * "discard" should be set up if the response callback
> +                 * function could not deal with request (e.g. length
> +                 * mismatch) or the protocol of request was not found.
> +                 */
> +                discard = DOE_DISCARD;
> +                for (p = 0; p < doe->protocol_num; p++) {
> +                    if (doe_cap->write_mbox[0] ==
> +                        pcie_doe_build_protocol(&doe->protocols[p])) {
> +                        /* Found */
> +                        DOE_DBG("  DOE PROTOCOL=%x\n", doe_cap->write_mbox[0]);
> +                        /*
> +                         * Spec:
> +                         * If the number of DW transferred does not match the
> +                         * indicated Length for a data object, then the
> +                         * data object must be silently discarded.
> +                         */
> +                        if (doe_cap->write_mbox_len ==
> +                            pcie_doe_object_len(pcie_doe_get_req(doe_cap)))
> +                            discard = doe->protocols[p].set_rsp(doe_cap);
> +                        break;
> +                    }
> +                }
> +
> +                /* Set DOE Ready */
> +                if (!discard) {
> +                    pcie_doe_set_ready(doe_cap, 1);
> +                } else {
> +                    pcie_doe_reset_mbox(doe_cap);
> +                }
> +            } else if (FIELD_EX32(val, PCI_DOE_CAP_CONTROL, DOE_INTR_EN)) {

Spec reference needed to say why you can't write this at same time as GO.
It may be odd thing to do, but I can't see anything saying you couldn't do this,
for example on your very first command.

> +                doe_cap->ctrl.intr = 1;
> +            }
> +            break;
> +        case PCI_EXP_DOE_RD_DATA_MBOX:
> +            /* Read MBOX */
> +            DOE_DBG("  INCR RD MBOX DO DW OFFSET=%d\n", doe_cap->read_mbox_idx);
> +            doe_cap->read_mbox_idx += 1;
> +            /* Last DW */
> +            if (doe_cap->read_mbox_idx >= doe_cap->read_mbox_len) {
> +                pcie_doe_reset_mbox(doe_cap);
> +                pcie_doe_set_ready(doe_cap, 0);
> +            }

A write of this when there is nothing there is an underflow. As is a read of this
after the last write.  I would guess both should be error conditions.

> +            break;
> +        case PCI_EXP_DOE_WR_DATA_MBOX:
> +            /* Write MBOX */
> +            DOE_DBG("  WR MBOX=%x, DW OFFSET = %d\n", val, doe_cap->write_mbox_len);
> +            pcie_doe_write_mbox(doe_cap, val);
> +            break;
> +        case PCI_EXP_DOE_STATUS:
> +        case PCI_EXP_DOE_CAP:
> +        default:
> +            break;
> +        }
> +    }
> +}
> diff --git a/include/hw/pci/pci_ids.h b/include/hw/pci/pci_ids.h
> index 76bf3ed..636b2e8 100644
> --- a/include/hw/pci/pci_ids.h
> +++ b/include/hw/pci/pci_ids.h
> @@ -157,6 +157,8 @@
>  
>  /* Vendors and devices.  Sort key: vendor first, device next. */
>  
> +#define PCI_VENDOR_ID_PCI_SIG            0x0001
> +
>  #define PCI_VENDOR_ID_LSI_LOGIC          0x1000
>  #define PCI_DEVICE_ID_LSI_53C810         0x0001
>  #define PCI_DEVICE_ID_LSI_53C895A        0x0012
> diff --git a/include/hw/pci/pcie.h b/include/hw/pci/pcie.h
> index 14c58eb..47d6f66 100644
> --- a/include/hw/pci/pcie.h
> +++ b/include/hw/pci/pcie.h
> @@ -25,6 +25,7 @@
>  #include "hw/pci/pcie_regs.h"
>  #include "hw/pci/pcie_aer.h"
>  #include "hw/hotplug.h"
> +#include "hw/pci/pcie_doe.h"
>  
>  typedef enum {
>      /* for attention and power indicator */
> diff --git a/include/hw/pci/pcie_doe.h b/include/hw/pci/pcie_doe.h
> new file mode 100644
> index 0000000..f497075
> --- /dev/null
> +++ b/include/hw/pci/pcie_doe.h
> @@ -0,0 +1,166 @@
> +#ifndef PCIE_DOE_H
> +#define PCIE_DOE_H
> +
> +#include "qemu/range.h"
> +#include "qemu/typedefs.h"
> +#include "hw/register.h"
> +
> +/* 
> + * PCI DOE register defines 7.9.xx 
> + */
> +/* DOE Capabilities Register */
> +#define PCI_EXP_DOE_CAP             0x04
> +REG32(PCI_DOE_CAP_REG, 0)
> +    FIELD(PCI_DOE_CAP_REG, INTR_SUPP, 0, 1)
> +    FIELD(PCI_DOE_CAP_REG, DOE_INTR_MSG_NUM, 1, 11)
> +
> +/* DOE Control Register */
> +#define PCI_EXP_DOE_CTRL            0x08
> +REG32(PCI_DOE_CAP_CONTROL, 0)
> +    FIELD(PCI_DOE_CAP_CONTROL, DOE_ABORT, 0, 1)
> +    FIELD(PCI_DOE_CAP_CONTROL, DOE_INTR_EN, 1, 1)
> +    FIELD(PCI_DOE_CAP_CONTROL, DOE_GO, 31, 1)
> +
> +/* DOE Status Register  */
> +#define PCI_EXP_DOE_STATUS          0x0c
> +REG32(PCI_DOE_CAP_STATUS, 0)
> +    FIELD(PCI_DOE_CAP_STATUS, DOE_BUSY, 0, 1)
> +    FIELD(PCI_DOE_CAP_STATUS, DOE_INTR_STATUS, 1, 1)
> +    FIELD(PCI_DOE_CAP_STATUS, DOE_ERROR, 2, 1)
> +    FIELD(PCI_DOE_CAP_STATUS, DATA_OBJ_RDY, 31, 1)
> +
> +/* DOE Write Data Mailbox Register  */
> +#define PCI_EXP_DOE_WR_DATA_MBOX    0x10
> +
> +/* DOE Read Data Mailbox Register  */
> +#define PCI_EXP_DOE_RD_DATA_MBOX    0x14
> +
> +/* 
> + * PCI DOE register defines 7.9.xx 
> + */
> +/* Table 7-x2 */
> +#define PCI_SIG_DOE_DISCOVERY       0x00
> +#define PCI_SIG_DOE_CMA             0x01
> +
> +#define DATA_OBJ_BUILD_HEADER1(v, p)  ((p << 16) | v)
> +
> +/* Figure 6-x1 */
> +#define DATA_OBJECT_HEADER1_OFFSET  0
> +#define DATA_OBJECT_HEADER2_OFFSET  1
> +#define DATA_OBJECT_CONTENT_OFFSET  2
> +
> +#define PCI_DOE_MAX_DW_SIZE (1 << 18)
> +#define PCI_DOE_PROTOCOL_MAX 256
> +
> +/*
> + * Self-defined Marco
> + */
> +/* Request/Response State */
> +#define DOE_SUCCESS 0
> +#define DOE_DISCARD 1
Drop these. As mentioned in previous review.  These are just
obvious bools.

> +
> +/* An invalid vector number for MSI/MSI-x */
> +#define DOE_NO_INTR (~0)
> +
> +/*
> + * DOE Protocol - Data Object
> + */
> +typedef struct DOEHeader DOEHeader;
> +typedef struct DOEProtocol DOEProtocol;
> +typedef struct PCIEDOE PCIEDOE;
> +typedef struct DOECap DOECap;
> +
> +struct DOEHeader {
> +    uint16_t vendor_id;
> +    uint8_t doe_type;
> +    uint8_t reserved;
> +    struct {
> +        uint32_t length:18;
> +        uint32_t reserved2:14;

As before, bitfields are not a good idea in this sort of code in general due to lack
of standard handling in the C spec.

> +    };
> +} QEMU_PACKED;
> +
> +/* Protocol infos and rsp function callback */
> +struct DOEProtocol {
> +    uint16_t vendor_id;
> +    uint8_t doe_type;
> +
> +    bool (*set_rsp)(DOECap *);
> +};
> +
> +struct PCIEDOE {
> +    PCIDevice *pdev;

Having the PCIDevice in here only allows you to drop a parameter in read and write
calls. I'd not bother given the nest of pointers you have as a result.
Just pass both the PCIDOE and the PCIDevice into those two functions.

> +    DOECap *head;

I can sort of get what you are doing with this list of DOEs but to mind it's the wrong
abstraction as it doesn't fit how I think these will be used.

It's not a case that there will be N DOE mailboxes each of which supports
all the same protocols.  I suspect quite the opposite.

Each DOE may support multiple protocols but the most obvious reason you'd
do multiple DOEs is because they support different protocols.

If we want to support the same protocol on mulitple DOE instances then we'd
register it for each of them.

Thus I'd drop the list handling and instead  have for example

struct cxl_type3_dev {

...

    PCIDOE doe_ima;
    PCIDOE doe_cdat;
};

In the read and write calls then just call pcie_doe_xxxx_config() twice.
You do have to handle the miss vs hit stuff though (similar problem that
lead to ugly code in my version).


> +
> +    /* Protocols and its callback response */
> +    DOEProtocol protocols[PCI_DOE_PROTOCOL_MAX];
> +    uint32_t protocol_num;
> +};
> +
> +struct DOECap {
> +    PCIEDOE *doe;

Following suggestion to get rid of the list, you should also then combine
PCIEDOE and the DOECap as they match 1 to 1.

> +
> +    /* Capability offset */
> +    uint16_t offset;
> +
> +    /* Next DOE capability */
> +    DOECap *next;
> +
> +    /* Capability */
> +    struct {
> +        bool intr;
> +        uint16_t vec;
> +    } cap;
> +
> +    /* Control */
> +    struct {
> +        bool abort;
> +        bool intr;
> +        bool go;
> +    } ctrl;
> +
> +    /* Status */
> +    struct {
> +        bool busy;
> +        bool intr;
> +        bool error;
> +        bool ready;
> +    } status;
> +
> +    /* Mailbox buffer for device */
> +    uint32_t *write_mbox;
> +    uint32_t *read_mbox;
> +
> +    /* Mailbox position indicator */
> +    uint32_t read_mbox_idx;
> +    uint32_t read_mbox_len;
> +    uint32_t write_mbox_len;
> +};
> +
> +void pcie_doe_init(PCIDevice *dev, PCIEDOE *doe);
> +int pcie_cap_doe_add(PCIEDOE *doe, uint16_t offset, bool intr, uint16_t vec);
> +void pcie_doe_uninit(PCIEDOE *doe);
> +void pcie_doe_register_protocol(PCIEDOE *doe, uint16_t vendor_id,
> +                                uint8_t doe_type,
> +                                bool (*set_rsp)(DOECap *));
> +uint32_t pcie_doe_build_protocol(DOEProtocol *p);
> +DOECap *pcie_doe_covers_addr(PCIEDOE *doe, uint32_t addr);
> +uint32_t pcie_doe_read_config(DOECap *doe_cap, uint32_t addr, int size);
> +void pcie_doe_write_config(DOECap *doe_cap, uint32_t addr,
> +                           uint32_t val, int size);
> +
> +/* Utility functions for set_rsp in DOEProtocol */
> +void *pcie_doe_get_req(DOECap *doe_cap);
> +void pcie_doe_set_rsp(DOECap *doe_cap, void *rsp);
> +uint32_t pcie_doe_object_len(void *obj);
> +
> +#define DOE_DEBUG
> +#ifdef DOE_DEBUG
> +#define DOE_DBG(...) printf(__VA_ARGS__)
> +#else
> +#define DOE_DBG(...) {}
> +#endif

Tidy this stuff up (i.e. get rid of it) for next version.  It's fine when
you are doing early debug, but we don't want it in the version for review.

> +
> +#define dwsizeof(s) ((sizeof(s) + 4 - 1) / 4)

Not used in this patch.  Move it down towards code that uses it or at very 
least only introduce it when used.

> +
> +#endif /* PCIE_DOE_H */
> diff --git a/include/hw/pci/pcie_regs.h b/include/hw/pci/pcie_regs.h
> index 1db86b0..963dc2e 100644
> --- a/include/hw/pci/pcie_regs.h
> +++ b/include/hw/pci/pcie_regs.h
> @@ -179,4 +179,8 @@ typedef enum PCIExpLinkWidth {
>  #define PCI_ACS_VER                     0x1
>  #define PCI_ACS_SIZEOF                  8
>  
> +/* DOE Capability Register Fields */
> +#define PCI_DOE_VER                     0x1
> +#define PCI_DOE_SIZEOF                  24
> +
>  #endif /* QEMU_PCIE_REGS_H */
> diff --git a/include/standard-headers/linux/pci_regs.h b/include/standard-headers/linux/pci_regs.h
> index e709ae8..4a7b7a4 100644
> --- a/include/standard-headers/linux/pci_regs.h
> +++ b/include/standard-headers/linux/pci_regs.h

Pull this change out as a separate patch.  This header gets fetched via a script
so we will only find this here if we update the source file in the kernel.

That should happen soon anyway as we add the support to read this.


> @@ -730,7 +730,8 @@
>  #define PCI_EXT_CAP_ID_DVSEC	0x23	/* Designated Vendor-Specific */
>  #define PCI_EXT_CAP_ID_DLF	0x25	/* Data Link Feature */
>  #define PCI_EXT_CAP_ID_PL_16GT	0x26	/* Physical Layer 16.0 GT/s */
> -#define PCI_EXT_CAP_ID_MAX	PCI_EXT_CAP_ID_PL_16GT
> +#define PCI_EXT_CAP_ID_DOE      0x2E    /* Data Object Exchange */
> +#define PCI_EXT_CAP_ID_MAX	PCI_EXT_CAP_ID_DOE
>  
>  #define PCI_EXT_CAP_DSN_SIZEOF	12
>  #define PCI_EXT_CAP_MCAST_ENDPOINT_SIZEOF 40
Chris Browy Feb. 12, 2021, 9:58 p.m. UTC | #4
> On Feb 12, 2021, at 11:24 AM, Jonathan Cameron <Jonathan.Cameron@Huawei.com> wrote:
> 
> On Tue, 9 Feb 2021 15:35:49 -0500
> Chris Browy <cbrowy@avery-design.com> wrote:
> 
> Run ./scripts/checkpatch.pl over the patches and fix all the warnings before
> posting.  It will save time by clearing out most of the minor formatting issues
> and similar that inevitably sneak in during development.
> 
Excellent suggestion.  We’re still newbies!

> The biggest issue I'm seeing in here is that the abstraction of
> multiple DOE capabiltiies accessing same protocols doesn't make sense.
> 
> Each DOE ecap region and hence mailbox can have it's own set of
> (possibly  overlapping) protocols.
> 
> From the ECN:
> "It is permitted for a protocol using data object exchanges to require
> that a Function implement a unique instance of DOE for that specific
> protocol, and/or to allow sharing of a DOE instance to only a specific
> set of protocols using data object exchange, and/or to allow a Function
> to implement multiple instances of DOE supporting the specific protocol."
> 
> Tightly couple the ECAP and DOE.  If we are in the multiple instances
> of DOE supporting a specific protocol case, then register it separately
> for each one.  The individual device emulation then needs to deal with
> any possible clashes etc.

Not sure how configurable we want to make the device.  It is a simple type 3
device after all. 

The DOE spec does leave it pretty arbitrary regarding N DOE instances (DOE 
Extended Cap entry points) for M protocols, including where N>1 and M=1.  
Currently we implement N=2 DOE caps (instances), one for CDAT, one for 
Compliance Mode.

Maybe a more complex MLD device might have one or more DOE instances 
for the CDAT protocol alone to define each HDM but currently we only have 
one pmem (SLD) so we can’t really do much more than what’s supported.

Open to further suggestion though.  Based on answer to above we’ll follow 
the suggestion lower in the code review regarding 


> 
> Various comments inline, but mostly small stuff.
> 
> Jonathan
> 
> 
>> ---
>> MAINTAINERS                               |   7 +
>> hw/pci/meson.build                        |   1 +
>> hw/pci/pcie.c                             |   2 +-
>> hw/pci/pcie_doe.c                         | 414 ++++++++++++++++++++++++++++++
>> include/hw/pci/pci_ids.h                  |   2 +
>> include/hw/pci/pcie.h                     |   1 +
>> include/hw/pci/pcie_doe.h                 | 166 ++++++++++++
>> include/hw/pci/pcie_regs.h                |   4 +
>> include/standard-headers/linux/pci_regs.h |   3 +-
>> 9 files changed, 598 insertions(+), 2 deletions(-)
>> create mode 100644 hw/pci/pcie_doe.c
>> create mode 100644 include/hw/pci/pcie_doe.h
>> 
>> diff --git a/MAINTAINERS b/MAINTAINERS
>> index 981dc92..4fb865e 100644
>> --- a/MAINTAINERS
>> +++ b/MAINTAINERS
>> @@ -1655,6 +1655,13 @@ F: docs/pci*
>> F: docs/specs/*pci*
>> F: default-configs/pci.mak
>> 
>> +PCIE DOE
>> +M: Huai-Cheng Kuo <hchkuo@avery-design.com.tw>
>> +M: Chris Browy <cbrowy@avery-design.com>
>> +S: Supported
>> +F: include/hw/pci/pcie_doe.h
>> +F: hw/pci/pcie_doe.c
>> +
>> ACPI/SMBIOS
>> M: Michael S. Tsirkin <mst@redhat.com>
>> M: Igor Mammedov <imammedo@redhat.com>
>> diff --git a/hw/pci/meson.build b/hw/pci/meson.build
>> index 5c4bbac..115e502 100644
>> --- a/hw/pci/meson.build
>> +++ b/hw/pci/meson.build
>> @@ -12,6 +12,7 @@ pci_ss.add(files(
>> # allow plugging PCIe devices into PCI buses, include them even if
>> # CONFIG_PCI_EXPRESS=n.
>> pci_ss.add(files('pcie.c', 'pcie_aer.c'))
>> +pci_ss.add(files('pcie_doe.c'))
>> softmmu_ss.add(when: 'CONFIG_PCI_EXPRESS', if_true: files('pcie_port.c', 'pcie_host.c'))
>> softmmu_ss.add_all(when: 'CONFIG_PCI', if_true: pci_ss)
>> 
> 
> ...
> 
>> diff --git a/hw/pci/pcie_doe.c b/hw/pci/pcie_doe.c
>> new file mode 100644
>> index 0000000..df8e92e
>> --- /dev/null
>> +++ b/hw/pci/pcie_doe.c
>> @@ -0,0 +1,414 @@
> 
> Add a copyright header / license etc before v3.
> 
>> +#include "qemu/osdep.h"
>> +#include "qemu/log.h"
>> +#include "qemu/error-report.h"
>> +#include "qapi/error.h"
>> +#include "qemu/range.h"
>> +#include "hw/pci/pci.h"
>> +#include "hw/pci/pcie.h"
>> +#include "hw/pci/pcie_doe.h"
>> +#include "hw/pci/msi.h"
>> +#include "hw/pci/msix.h"
>> +
>> +/*
>> + * DOE Default Protocols (Discovery, CMA)
>> + */
>> +/* Discovery Request Object */
>> +struct doe_discovery {
>> +    DOEHeader header;
>> +    uint8_t index;
>> +    uint8_t reserved[3];
>> +} QEMU_PACKED;
>> +
>> +/* Discovery Response Object */
>> +struct doe_discovery_rsp {
>> +    DOEHeader header;
>> +    uint16_t vendor_id;
>> +    uint8_t doe_type;
>> +    uint8_t next_index;
>> +} QEMU_PACKED;
>> +
>> +/* Callback for Discovery */
>> +static bool pcie_doe_discovery_rsp(DOECap *doe_cap)
>> +{
>> +    PCIEDOE *doe = doe_cap->doe;
>> +    struct doe_discovery *req = pcie_doe_get_req(doe_cap);
>> +    uint8_t index = req->index;
>> +    DOEProtocol *prot = NULL;
>> +
>> +    /* Request length mismatch, discard */
>> +    if (req->header.length < dwsizeof(struct doe_discovery)) {
>> +        return DOE_DISCARD;
> 
> 	return false;  Or better yet a meaningful error code to make debugging
> easier.

OK

> 
>> +    }
>> +
>> +    /* Point to the requested protocol */
>> +    if (index < doe->protocol_num) {
>> +        prot = &doe->protocols[index];
>> +    }
>> +
>> +    struct doe_discovery_rsp rsp = {
>> +        .header = {
>> +            .vendor_id = PCI_VENDOR_ID_PCI_SIG,
>> +            .doe_type = PCI_SIG_DOE_DISCOVERY,
>> +            .reserved = 0x0,
> 
> Nice thing about c99 based structure assignment is that unspecified
> elements are set to 0 automatically.  So you can drop this particular
> element safely and end up with slightly cleaner code.

OK

> 
>> +            .length = dwsizeof(struct doe_discovery_rsp),
>> +        },
>> +        .vendor_id = (prot) ? prot->vendor_id : 0xFFFF,
>> +        .doe_type = (prot) ? prot->doe_type : 0xFF,
>> +        .next_index = (index + 1) < doe->protocol_num ?
>> +                      (index + 1) : 0,
>> +    };
>> +
>> +    pcie_doe_set_rsp(doe_cap, &rsp);
>> +
>> +    return DOE_SUCCESS;
>> +}
>> +
>> +/* Callback for CMA */
>> +static bool pcie_doe_cma_rsp(DOECap *doe_cap)
> 
> I've not checked CMA, but I'd expect this to be a separate
> patch as not part of core DOE support (or same ECN for that
> matter).

We’ll back out CMA.  Currently it’s an incomplete solution.  We support it in 
discovery process but the response is that of being disabled.  So not the best
way to handle it.

> 
>> +{
>> +    doe_cap->status.error = 1;
>> +
>> +    memset(doe_cap->read_mbox, 0,
>> +           PCI_DOE_MAX_DW_SIZE * sizeof(uint32_t));
>> +
>> +    doe_cap->write_mbox_len = 0;
>> +
>> +    return DOE_DISCARD;
>> +}
>> +
>> +/*
>> + * DOE Utilities
>> + */
>> +static void pcie_doe_reset_mbox(DOECap *st)
>> +{
>> +    st->read_mbox_idx = 0;
>> +
>> +    st->read_mbox_len = 0;
>> +    st->write_mbox_len = 0;
>> +
>> +    memset(st->read_mbox, 0, PCI_DOE_MAX_DW_SIZE * sizeof(uint32_t));
>> +    memset(st->write_mbox, 0, PCI_DOE_MAX_DW_SIZE * sizeof(uint32_t));
>> +}
>> +
>> +/*
>> + * Initialize the list and protocol for a device.
>> + * This function won't add the DOE capabitity to your PCIe device.
>> + */
>> +void pcie_doe_init(PCIDevice *dev, PCIEDOE *doe)
>> +{
>> +    doe->pdev = dev;
>> +    doe->head = NULL;
>> +    doe->protocol_num = 0;
> 
> No need to set things to zero as I assume you allocate it zero filled.
> At least no point for things where zero is the obvious default.
> 
>> +
>> +    /* Register two default protocol */
>> +    //TODO : LINK LIST
> 
> Agreed :)
> 
>> +    pcie_doe_register_protocol(doe, PCI_VENDOR_ID_PCI_SIG,
>> +            PCI_SIG_DOE_DISCOVERY, pcie_doe_discovery_rsp);
>> +    pcie_doe_register_protocol(doe, PCI_VENDOR_ID_PCI_SIG,
>> +            PCI_SIG_DOE_CMA, pcie_doe_cma_rsp);
>> +}
>> +
>> +int pcie_cap_doe_add(PCIEDOE *doe, uint16_t offset, bool intr, uint16_t vec) {
> 
> Bracket on this line.
> 
>> +    DOECap *new_cap, **ptr;
>> +    PCIDevice *dev = doe->pdev;
>> +
>> +    pcie_add_capability(dev, PCI_EXT_CAP_ID_DOE, PCI_DOE_VER, offset,
>> +                        PCI_DOE_SIZEOF);
>> +
>> +    ptr = &doe->head;
>> +    /* Insert the new DOE at the end of linked list */
> 
> As below, not sure this abstraction makes sense.
> 
>> +    while (*ptr) {
>> +        if (range_covers_byte((*ptr)->offset, PCI_DOE_SIZEOF, offset) ||
>> +            (*ptr)->cap.vec == vec) {
>> +            return -EINVAL;
>> +        }
>> +
>> +        ptr = &(*ptr)->next;
>> +    }
>> +    new_cap = g_malloc0(sizeof(DOECap));
>> +    *ptr = new_cap;
>> +
>> +    new_cap->doe = doe;
>> +    new_cap->offset = offset;
>> +    new_cap->next = NULL;
>> +
>> +    /* Configure MSI/MSI-X */
>> +    if (intr && (msi_present(dev) || msix_present(dev))) {
>> +        new_cap->cap.intr = intr;
>> +        new_cap->cap.vec = vec;
>> +    }
>> +
>> +    /* Set up W/R Mailbox buffer */
>> +    new_cap->write_mbox = g_malloc0(PCI_DOE_MAX_DW_SIZE * sizeof(uint32_t));
>> +    new_cap->read_mbox = g_malloc0(PCI_DOE_MAX_DW_SIZE * sizeof(uint32_t));
>> +
>> +    pcie_doe_reset_mbox(new_cap);
>> +
>> +    return 0;
>> +}
>> +
>> +void pcie_doe_uninit(PCIEDOE *doe) {
>> +    PCIDevice *dev = doe->pdev;
>> +    DOECap *cap;
>> +
>> +    pci_del_capability(dev, PCI_EXT_CAP_ID_DOE, PCI_DOE_SIZEOF);
>> +
>> +    cap = doe->head;
>> +    while (cap) {
>> +        doe->head = doe->head->next;
>> +
>> +        g_free(cap->read_mbox);
>> +        g_free(cap->write_mbox);
>> +        cap->read_mbox = NULL;
>> +        cap->write_mbox = NULL;
> 
> I'd avoid setting things to NULL that you are throwing away.  It
> implies that they will be reused or that it matters in somewhat which
> isn't the case here.

OK

> 
>> +        g_free(cap);
>> +        cap = doe->head;
>> +    }
>> +}
>> +
>> +void pcie_doe_register_protocol(PCIEDOE *doe, uint16_t vendor_id,
>> +        uint8_t doe_type, bool (*set_rsp)(DOECap *))
>> +{
>> +    /* Protocol num should not exceed 256 */
>> +    assert(doe->protocol_num < PCI_DOE_PROTOCOL_MAX);
>> +
>> +    doe->protocols[doe->protocol_num].vendor_id = vendor_id;
>> +    doe->protocols[doe->protocol_num].doe_type = doe_type;
>> +    doe->protocols[doe->protocol_num].set_rsp = set_rsp;
>> +
>> +    doe->protocol_num++;
>> +}
>> +
>> +uint32_t pcie_doe_build_protocol(DOEProtocol *p)
>> +{
>> +    return DATA_OBJ_BUILD_HEADER1(p->vendor_id, p->doe_type);
>> +}
>> +
>> +/* Return the pointer of DOE request in write mailbox buffer */
>> +void *pcie_doe_get_req(DOECap *doe_cap)
>> +{
>> +    return doe_cap->write_mbox;
>> +}
>> +
>> +/* Copy the response to read mailbox buffer */
>> +void pcie_doe_set_rsp(DOECap *doe_cap, void *rsp)
>> +{
>> +    uint32_t len = pcie_doe_object_len(rsp);
>> +
>> +    memcpy(doe_cap->read_mbox + doe_cap->read_mbox_len,
>> +           rsp, len * sizeof(uint32_t));
>> +    doe_cap->read_mbox_len += len;
>> +}
>> +
>> +/* Get Data Object length */
>> +uint32_t pcie_doe_object_len(void *obj)
>> +{
>> +    return (obj)? ((DOEHeader *)obj)->length : 0;
>> +}
>> +
>> +static void pcie_doe_write_mbox(DOECap *doe_cap, uint32_t val)
>> +{
>> +    memcpy(doe_cap->write_mbox + doe_cap->write_mbox_len, &val, sizeof(uint32_t));
>> +
>> +    if (doe_cap->write_mbox_len == 1) {
>> +        DOE_DBG("  Capture DOE request DO length = %d\n", val & 0x0003ffff);
>> +    }
>> +
>> +    doe_cap->write_mbox_len++;
> 
> We probably need to check that the mailbox write was full dword as the spec
> requires.  No idea what we do if it isn't as spec doesn't seem to say...
> I've queried one of our PCI experts.

Let us know their response.  If not being able to rely on full DW config access will
require a bunch of changes to accommodate.

> 
> 
>> +}
>> +
>> +static void pcie_doe_irq_assert(DOECap *doe_cap)
>> +{
>> +    PCIDevice *dev = doe_cap->doe->pdev;
>> +
>> +    if (doe_cap->cap.intr && doe_cap->ctrl.intr) {
>> +        /* Interrupt notify */
>> +        if (msix_enabled(dev)) {
>> +            msix_notify(dev, doe_cap->cap.vec);
>> +        } else if (msi_enabled(dev)) {
>> +            msi_notify(dev, doe_cap->cap.vec);
>> +        }
>> +        /* Not support legacy IRQ */
>> +    }
>> +}
>> +
>> +static void pcie_doe_set_ready(DOECap *doe_cap, bool rdy)
>> +{
>> +    doe_cap->status.ready = rdy;
>> +
>> +    if (rdy) {
>> +        pcie_doe_irq_assert(doe_cap);
>> +    }
>> +}
>> +
>> +static void pcie_doe_set_error(DOECap *doe_cap, bool err)
>> +{
>> +    doe_cap->status.error = err;
>> +
>> +    if (err) {
>> +        pcie_doe_irq_assert(doe_cap);
>> +    }
>> +}
>> +
>> +DOECap *pcie_doe_covers_addr(PCIEDOE *doe, uint32_t addr)
>> +{
>> +    DOECap *ptr = doe->head;
>> +
>> +    /* If overlaps DOE range. PCIe Capability Header won't be included. */
>> +    while (ptr && 
>> +           !range_covers_byte(ptr->offset + PCI_EXP_DOE_CAP, PCI_DOE_SIZEOF - 4, addr)) {
>> +        ptr = ptr->next;
>> +    }
>> +    
>> +    return ptr;
>> +}
>> +
>> +uint32_t pcie_doe_read_config(DOECap *doe_cap,
>> +                              uint32_t addr, int size)
>> +{
>> +    uint32_t ret = 0, shift, mask = 0xFFFFFFFF;
>> +    uint16_t doe_offset = doe_cap->offset;
>> +
>> +    /* If overlaps DOE range. PCIe Capability Header won't be included. */
>> +    if (range_covers_byte(doe_offset + PCI_EXP_DOE_CAP, PCI_DOE_SIZEOF - 4, addr)) {
> 
> I'd flip this around to reduce indentation + no need to be careful with alignment etc
> if we aren't returning anything.
> 
>       if (!range_cover_byte(doe_offset + PCI_EXP_DOE_CAP, PCI_DOE_SIZEOF - 4, addr)
> 		return 0;
> 
> 
>> +        addr -= doe_offset;
>> +
>> +        if (range_covers_byte(PCI_EXP_DOE_CAP, sizeof(uint32_t), addr)) {
>> +            ret = FIELD_DP32(ret, PCI_DOE_CAP_REG, INTR_SUPP,
>> +                             doe_cap->cap.intr);
>> +            ret = FIELD_DP32(ret, PCI_DOE_CAP_REG, DOE_INTR_MSG_NUM,
>> +                             doe_cap->cap.vec);
>> +        } else if (range_covers_byte(PCI_EXP_DOE_CTRL, sizeof(uint32_t), addr)) {
>> +            /* Must return ABORT=0 and GO=0 */
>> +            ret = FIELD_DP32(ret, PCI_DOE_CAP_CONTROL, DOE_INTR_EN,
>> +                             doe_cap->ctrl.intr);
>> +            DOE_DBG("  CONTROL REG=%x\n", ret);
>> +        } else if (range_covers_byte(PCI_EXP_DOE_STATUS, sizeof(uint32_t), addr)) {
>> +            ret = FIELD_DP32(ret, PCI_DOE_CAP_STATUS, DOE_BUSY,
>> +                             doe_cap->status.busy);
>> +            ret = FIELD_DP32(ret, PCI_DOE_CAP_STATUS, DOE_INTR_STATUS,
>> +                             doe_cap->status.intr);
>> +            ret = FIELD_DP32(ret, PCI_DOE_CAP_STATUS, DOE_ERROR,
>> +                             doe_cap->status.error);
>> +            ret = FIELD_DP32(ret, PCI_DOE_CAP_STATUS, DATA_OBJ_RDY,
>> +                             doe_cap->status.ready);
>> +        } else if (range_covers_byte(PCI_EXP_DOE_RD_DATA_MBOX, sizeof(uint32_t), addr)) {
>> +            /* Check that READY is set */
> 
> I'd clean out any comment that is obvious from the code.
> Comments get out of sync over time so there is a maintenance burden in having them such
> that we only bother if they add information.
> 
>> +            if (!doe_cap->status.ready) {
>> +                /* Return 0 if not ready */
>> +                DOE_DBG("  RD MBOX RETURN=%x\n", ret);
>> +            } else {
>> +                /* Deposit next DO DW into read mbox */
> 
> This comment is missleading.  It might not be the 'next' one. If you read
> the register twice for instance.  Much better not to have the comment
> at all on basis code is fairly obvious anyway!
> 
> As mentioned below, a read of this when nothing there is an underflow and spec
> suggests that is an error condition.
> 
>> +                DOE_DBG("  RD MBOX, DATA OBJECT READY,"
>> +                        " CURRENT DO DW OFFSET=%x\n",
>> +                        doe_cap->read_mbox_idx);
>> +
>> +                ret = doe_cap->read_mbox[doe_cap->read_mbox_idx];
>> +
>> +                DOE_DBG("  RD MBOX DW=%x\n", ret);
>> +                DOE_DBG("  RD MBOX DW OFFSET=%d, RD MBOX LENGTH=%d\n",
>> +                        doe_cap->read_mbox_idx, doe_cap->read_mbox_len);
>> +            }
>> +        } else if (range_covers_byte(PCI_EXP_DOE_WR_DATA_MBOX, sizeof(uint32_t), addr)) {
>> +            DOE_DBG("  WR MBOX, local_val =%x\n", ret);
>> +        }
>> +    }
>> +
>> +    /* Alignment */
>> +    shift = addr % sizeof(uint32_t);
>> +    if (shift) {
>> +        ret >>= shift * 8;
>> +    }
>> +    mask >>= (sizeof(uint32_t) - size) * 8;
>> +
>> +    return ret & mask;
>> +}
>> +
>> +void pcie_doe_write_config(DOECap *doe_cap,
>> +                           uint32_t addr, uint32_t val, int size)
>> +{
>> +    PCIEDOE *doe = doe_cap->doe;
>> +    uint16_t doe_offset = doe_cap->offset;
>> +    int p;
>> +    bool discard;
>> +    uint32_t shift;
>> +
>> +    DOE_DBG("  addr=%x, val=%x, size=%x\n", addr, val, size);
>> +
>> +    /* If overlaps DOE range. PCIe Capability Header won't be included. */
>> +    if (range_covers_byte(doe_offset + PCI_EXP_DOE_CAP, PCI_DOE_SIZEOF - 4, addr)) {
> 
> As below, invert this condition and return early as it will simplify below and reduce
> indentation.
> 
>     if (!range_covers_byte(doe_offset + PCI_EXP_DOE_CAP, PCI_DOE_SIZEOF - 4, addr)) {
>          return;
>    }
> 
>> +        /* Alignment */
>> +        shift = addr % sizeof(uint32_t);
>> +        addr -= (doe_offset - shift);
>> +        val <<= shift * 8;
>> +
>> +        switch (addr) {
>> +        case PCI_EXP_DOE_CTRL:
>> +            DOE_DBG("  CONTROL=%x\n", val);
>> +            if (FIELD_EX32(val, PCI_DOE_CAP_CONTROL, DOE_ABORT)) {
>> +                /* If ABORT, clear status reg DOE Error and DOE Ready */
>> +                DOE_DBG("  Setting ABORT\n");
>> +                pcie_doe_set_ready(doe_cap, 0);
>> +                pcie_doe_set_error(doe_cap, 0);
>> +                pcie_doe_reset_mbox(doe_cap);
>> +            } else if (FIELD_EX32(val, PCI_DOE_CAP_CONTROL, DOE_GO)) {
>> +                DOE_DBG("  CONTROL GO=%x\n", val);
> 
> This case is complex enough I'd factor it out as it's own function.

OK

> 
>> +                /*
>> +                 * Check protocol the incoming request in write_mbox and
>> +                 * memcpy the corresponding response to read_mbox.
>> +                 *
>> +                 * "discard" should be set up if the response callback
>> +                 * function could not deal with request (e.g. length
>> +                 * mismatch) or the protocol of request was not found.
>> +                 */
>> +                discard = DOE_DISCARD;
>> +                for (p = 0; p < doe->protocol_num; p++) {
>> +                    if (doe_cap->write_mbox[0] ==
>> +                        pcie_doe_build_protocol(&doe->protocols[p])) {
>> +                        /* Found */
>> +                        DOE_DBG("  DOE PROTOCOL=%x\n", doe_cap->write_mbox[0]);
>> +                        /*
>> +                         * Spec:
>> +                         * If the number of DW transferred does not match the
>> +                         * indicated Length for a data object, then the
>> +                         * data object must be silently discarded.
>> +                         */
>> +                        if (doe_cap->write_mbox_len ==
>> +                            pcie_doe_object_len(pcie_doe_get_req(doe_cap)))
>> +                            discard = doe->protocols[p].set_rsp(doe_cap);
>> +                        break;
>> +                    }
>> +                }
>> +
>> +                /* Set DOE Ready */
>> +                if (!discard) {
>> +                    pcie_doe_set_ready(doe_cap, 1);
>> +                } else {
>> +                    pcie_doe_reset_mbox(doe_cap);
>> +                }
>> +            } else if (FIELD_EX32(val, PCI_DOE_CAP_CONTROL, DOE_INTR_EN)) {
> 
> Spec reference needed to say why you can't write this at same time as GO.
> It may be odd thing to do, but I can't see anything saying you couldn't do this,
> for example on your very first command.
> 
>> +                doe_cap->ctrl.intr = 1;
>> +            }
>> +            break;
>> +        case PCI_EXP_DOE_RD_DATA_MBOX:
>> +            /* Read MBOX */
>> +            DOE_DBG("  INCR RD MBOX DO DW OFFSET=%d\n", doe_cap->read_mbox_idx);
>> +            doe_cap->read_mbox_idx += 1;
>> +            /* Last DW */
>> +            if (doe_cap->read_mbox_idx >= doe_cap->read_mbox_len) {
>> +                pcie_doe_reset_mbox(doe_cap);
>> +                pcie_doe_set_ready(doe_cap, 0);
>> +            }
> 
> A write of this when there is nothing there is an underflow. As is a read of this
> after the last write.  I would guess both should be error conditions.

We’ll double check and fix accordingly

> 
>> +            break;
>> +        case PCI_EXP_DOE_WR_DATA_MBOX:
>> +            /* Write MBOX */
>> +            DOE_DBG("  WR MBOX=%x, DW OFFSET = %d\n", val, doe_cap->write_mbox_len);
>> +            pcie_doe_write_mbox(doe_cap, val);
>> +            break;
>> +        case PCI_EXP_DOE_STATUS:
>> +        case PCI_EXP_DOE_CAP:
>> +        default:
>> +            break;
>> +        }
>> +    }
>> +}
>> diff --git a/include/hw/pci/pci_ids.h b/include/hw/pci/pci_ids.h
>> index 76bf3ed..636b2e8 100644
>> --- a/include/hw/pci/pci_ids.h
>> +++ b/include/hw/pci/pci_ids.h
>> @@ -157,6 +157,8 @@
>> 
>> /* Vendors and devices.  Sort key: vendor first, device next. */
>> 
>> +#define PCI_VENDOR_ID_PCI_SIG            0x0001
>> +
>> #define PCI_VENDOR_ID_LSI_LOGIC          0x1000
>> #define PCI_DEVICE_ID_LSI_53C810         0x0001
>> #define PCI_DEVICE_ID_LSI_53C895A        0x0012
>> diff --git a/include/hw/pci/pcie.h b/include/hw/pci/pcie.h
>> index 14c58eb..47d6f66 100644
>> --- a/include/hw/pci/pcie.h
>> +++ b/include/hw/pci/pcie.h
>> @@ -25,6 +25,7 @@
>> #include "hw/pci/pcie_regs.h"
>> #include "hw/pci/pcie_aer.h"
>> #include "hw/hotplug.h"
>> +#include "hw/pci/pcie_doe.h"
>> 
>> typedef enum {
>>     /* for attention and power indicator */
>> diff --git a/include/hw/pci/pcie_doe.h b/include/hw/pci/pcie_doe.h
>> new file mode 100644
>> index 0000000..f497075
>> --- /dev/null
>> +++ b/include/hw/pci/pcie_doe.h
>> @@ -0,0 +1,166 @@
>> +#ifndef PCIE_DOE_H
>> +#define PCIE_DOE_H
>> +
>> +#include "qemu/range.h"
>> +#include "qemu/typedefs.h"
>> +#include "hw/register.h"
>> +
>> +/* 
>> + * PCI DOE register defines 7.9.xx 
>> + */
>> +/* DOE Capabilities Register */
>> +#define PCI_EXP_DOE_CAP             0x04
>> +REG32(PCI_DOE_CAP_REG, 0)
>> +    FIELD(PCI_DOE_CAP_REG, INTR_SUPP, 0, 1)
>> +    FIELD(PCI_DOE_CAP_REG, DOE_INTR_MSG_NUM, 1, 11)
>> +
>> +/* DOE Control Register */
>> +#define PCI_EXP_DOE_CTRL            0x08
>> +REG32(PCI_DOE_CAP_CONTROL, 0)
>> +    FIELD(PCI_DOE_CAP_CONTROL, DOE_ABORT, 0, 1)
>> +    FIELD(PCI_DOE_CAP_CONTROL, DOE_INTR_EN, 1, 1)
>> +    FIELD(PCI_DOE_CAP_CONTROL, DOE_GO, 31, 1)
>> +
>> +/* DOE Status Register  */
>> +#define PCI_EXP_DOE_STATUS          0x0c
>> +REG32(PCI_DOE_CAP_STATUS, 0)
>> +    FIELD(PCI_DOE_CAP_STATUS, DOE_BUSY, 0, 1)
>> +    FIELD(PCI_DOE_CAP_STATUS, DOE_INTR_STATUS, 1, 1)
>> +    FIELD(PCI_DOE_CAP_STATUS, DOE_ERROR, 2, 1)
>> +    FIELD(PCI_DOE_CAP_STATUS, DATA_OBJ_RDY, 31, 1)
>> +
>> +/* DOE Write Data Mailbox Register  */
>> +#define PCI_EXP_DOE_WR_DATA_MBOX    0x10
>> +
>> +/* DOE Read Data Mailbox Register  */
>> +#define PCI_EXP_DOE_RD_DATA_MBOX    0x14
>> +
>> +/* 
>> + * PCI DOE register defines 7.9.xx 
>> + */
>> +/* Table 7-x2 */
>> +#define PCI_SIG_DOE_DISCOVERY       0x00
>> +#define PCI_SIG_DOE_CMA             0x01
>> +
>> +#define DATA_OBJ_BUILD_HEADER1(v, p)  ((p << 16) | v)
>> +
>> +/* Figure 6-x1 */
>> +#define DATA_OBJECT_HEADER1_OFFSET  0
>> +#define DATA_OBJECT_HEADER2_OFFSET  1
>> +#define DATA_OBJECT_CONTENT_OFFSET  2
>> +
>> +#define PCI_DOE_MAX_DW_SIZE (1 << 18)
>> +#define PCI_DOE_PROTOCOL_MAX 256
>> +
>> +/*
>> + * Self-defined Marco
>> + */
>> +/* Request/Response State */
>> +#define DOE_SUCCESS 0
>> +#define DOE_DISCARD 1
> Drop these. As mentioned in previous review.  These are just
> obvious bools.
> 
>> +
>> +/* An invalid vector number for MSI/MSI-x */
>> +#define DOE_NO_INTR (~0)
>> +
>> +/*
>> + * DOE Protocol - Data Object
>> + */
>> +typedef struct DOEHeader DOEHeader;
>> +typedef struct DOEProtocol DOEProtocol;
>> +typedef struct PCIEDOE PCIEDOE;
>> +typedef struct DOECap DOECap;
>> +
>> +struct DOEHeader {
>> +    uint16_t vendor_id;
>> +    uint8_t doe_type;
>> +    uint8_t reserved;
>> +    struct {
>> +        uint32_t length:18;
>> +        uint32_t reserved2:14;
> 
> As before, bitfields are not a good idea in this sort of code in general due to lack
> of standard handling in the C spec.
> 
>> +    };
>> +} QEMU_PACKED;
>> +
>> +/* Protocol infos and rsp function callback */
>> +struct DOEProtocol {
>> +    uint16_t vendor_id;
>> +    uint8_t doe_type;
>> +
>> +    bool (*set_rsp)(DOECap *);
>> +};
>> +
>> +struct PCIEDOE {
>> +    PCIDevice *pdev;
> 
> Having the PCIDevice in here only allows you to drop a parameter in read and write
> calls. I'd not bother given the nest of pointers you have as a result.
> Just pass both the PCIDOE and the PCIDevice into those two functions.
> 
>> +    DOECap *head;
> 
> I can sort of get what you are doing with this list of DOEs but to mind it's the wrong
> abstraction as it doesn't fit how I think these will be used.
> 
> It's not a case that there will be N DOE mailboxes each of which supports
> all the same protocols.  I suspect quite the opposite.
> 
> Each DOE may support multiple protocols but the most obvious reason you'd
> do multiple DOEs is because they support different protocols.

Agree

> 
> If we want to support the same protocol on mulitple DOE instances then we'd
> register it for each of them.
> 
> Thus I'd drop the list handling and instead  have for example
> 
> struct cxl_type3_dev {
> 
> ...
> 
>    PCIDOE doe_ima;
>    PCIDOE doe_cdat;
> };
> 
> In the read and write calls then just call pcie_doe_xxxx_config() twice.
> You do have to handle the miss vs hit stuff though (similar problem that
> lead to ugly code in my version).

OK we’ll handle it better in next patch.

> 
> 
>> +
>> +    /* Protocols and its callback response */
>> +    DOEProtocol protocols[PCI_DOE_PROTOCOL_MAX];
>> +    uint32_t protocol_num;
>> +};
>> +
>> +struct DOECap {
>> +    PCIEDOE *doe;
> 
> Following suggestion to get rid of the list, you should also then combine
> PCIEDOE and the DOECap as they match 1 to 1.
> 
>> +
>> +    /* Capability offset */
>> +    uint16_t offset;
>> +
>> +    /* Next DOE capability */
>> +    DOECap *next;
>> +
>> +    /* Capability */
>> +    struct {
>> +        bool intr;
>> +        uint16_t vec;
>> +    } cap;
>> +
>> +    /* Control */
>> +    struct {
>> +        bool abort;
>> +        bool intr;
>> +        bool go;
>> +    } ctrl;
>> +
>> +    /* Status */
>> +    struct {
>> +        bool busy;
>> +        bool intr;
>> +        bool error;
>> +        bool ready;
>> +    } status;
>> +
>> +    /* Mailbox buffer for device */
>> +    uint32_t *write_mbox;
>> +    uint32_t *read_mbox;
>> +
>> +    /* Mailbox position indicator */
>> +    uint32_t read_mbox_idx;
>> +    uint32_t read_mbox_len;
>> +    uint32_t write_mbox_len;
>> +};
>> +
>> +void pcie_doe_init(PCIDevice *dev, PCIEDOE *doe);
>> +int pcie_cap_doe_add(PCIEDOE *doe, uint16_t offset, bool intr, uint16_t vec);
>> +void pcie_doe_uninit(PCIEDOE *doe);
>> +void pcie_doe_register_protocol(PCIEDOE *doe, uint16_t vendor_id,
>> +                                uint8_t doe_type,
>> +                                bool (*set_rsp)(DOECap *));
>> +uint32_t pcie_doe_build_protocol(DOEProtocol *p);
>> +DOECap *pcie_doe_covers_addr(PCIEDOE *doe, uint32_t addr);
>> +uint32_t pcie_doe_read_config(DOECap *doe_cap, uint32_t addr, int size);
>> +void pcie_doe_write_config(DOECap *doe_cap, uint32_t addr,
>> +                           uint32_t val, int size);
>> +
>> +/* Utility functions for set_rsp in DOEProtocol */
>> +void *pcie_doe_get_req(DOECap *doe_cap);
>> +void pcie_doe_set_rsp(DOECap *doe_cap, void *rsp);
>> +uint32_t pcie_doe_object_len(void *obj);
>> +
>> +#define DOE_DEBUG
>> +#ifdef DOE_DEBUG
>> +#define DOE_DBG(...) printf(__VA_ARGS__)
>> +#else
>> +#define DOE_DBG(...) {}
>> +#endif
> 
> Tidy this stuff up (i.e. get rid of it) for next version.  It's fine when
> you are doing early debug, but we don't want it in the version for review.

Got it!

> 
>> +
>> +#define dwsizeof(s) ((sizeof(s) + 4 - 1) / 4)
> 
> Not used in this patch.  Move it down towards code that uses it or at very 
> least only introduce it when used.
> 
>> +
>> +#endif /* PCIE_DOE_H */
>> diff --git a/include/hw/pci/pcie_regs.h b/include/hw/pci/pcie_regs.h
>> index 1db86b0..963dc2e 100644
>> --- a/include/hw/pci/pcie_regs.h
>> +++ b/include/hw/pci/pcie_regs.h
>> @@ -179,4 +179,8 @@ typedef enum PCIExpLinkWidth {
>> #define PCI_ACS_VER                     0x1
>> #define PCI_ACS_SIZEOF                  8
>> 
>> +/* DOE Capability Register Fields */
>> +#define PCI_DOE_VER                     0x1
>> +#define PCI_DOE_SIZEOF                  24
>> +
>> #endif /* QEMU_PCIE_REGS_H */
>> diff --git a/include/standard-headers/linux/pci_regs.h b/include/standard-headers/linux/pci_regs.h
>> index e709ae8..4a7b7a4 100644
>> --- a/include/standard-headers/linux/pci_regs.h
>> +++ b/include/standard-headers/linux/pci_regs.h
> 
> Pull this change out as a separate patch.  This header gets fetched via a script
> so we will only find this here if we update the source file in the kernel.

Got it!

> 
> That should happen soon anyway as we add the support to read this.
> 
> 
>> @@ -730,7 +730,8 @@
>> #define PCI_EXT_CAP_ID_DVSEC	0x23	/* Designated Vendor-Specific */
>> #define PCI_EXT_CAP_ID_DLF	0x25	/* Data Link Feature */
>> #define PCI_EXT_CAP_ID_PL_16GT	0x26	/* Physical Layer 16.0 GT/s */
>> -#define PCI_EXT_CAP_ID_MAX	PCI_EXT_CAP_ID_PL_16GT
>> +#define PCI_EXT_CAP_ID_DOE      0x2E    /* Data Object Exchange */
>> +#define PCI_EXT_CAP_ID_MAX	PCI_EXT_CAP_ID_DOE
>> 
>> #define PCI_EXT_CAP_DSN_SIZEOF	12
>> #define PCI_EXT_CAP_MCAST_ENDPOINT_SIZEOF 40
>
Jonathan Cameron Feb. 18, 2021, 7:11 p.m. UTC | #5
On Fri, 12 Feb 2021 16:58:21 -0500
Chris Browy <cbrowy@avery-design.com> wrote:

> > On Feb 12, 2021, at 11:24 AM, Jonathan Cameron <Jonathan.Cameron@Huawei.com> wrote:
> > 
> > On Tue, 9 Feb 2021 15:35:49 -0500
> > Chris Browy <cbrowy@avery-design.com> wrote:
> > 
> > Run ./scripts/checkpatch.pl over the patches and fix all the warnings before
> > posting.  It will save time by clearing out most of the minor formatting issues
> > and similar that inevitably sneak in during development.
> >   
> Excellent suggestion.  We’re still newbies!
> 
> > The biggest issue I'm seeing in here is that the abstraction of
> > multiple DOE capabiltiies accessing same protocols doesn't make sense.
> > 
> > Each DOE ecap region and hence mailbox can have it's own set of
> > (possibly  overlapping) protocols.
> > 
> > From the ECN:
> > "It is permitted for a protocol using data object exchanges to require
> > that a Function implement a unique instance of DOE for that specific
> > protocol, and/or to allow sharing of a DOE instance to only a specific
> > set of protocols using data object exchange, and/or to allow a Function
> > to implement multiple instances of DOE supporting the specific protocol."
> > 
> > Tightly couple the ECAP and DOE.  If we are in the multiple instances
> > of DOE supporting a specific protocol case, then register it separately
> > for each one.  The individual device emulation then needs to deal with
> > any possible clashes etc.  
> 
> Not sure how configurable we want to make the device.  It is a simple type 3
> device after all. 

Agreed, but what I (or someone else) really doesn't want to have to do
in the future is reimplement DOE because we made design decisions that make
this version hard to reuse.  Unless it is particularly nasty to do we should
try to design something that is generally useful rather than targeted to
closely at the specific case we are dealing with.

I'd argue the ECAP and the DOE mailbox are always tightly coupled 1-to-1.
Whether the device wants to implement multiple protocols on each DOE mailbox
or indeed run individual protocols on multiple DOE mailboxes is a design
decision, but the actual mechanics of DOE match up with the config
space structures anything else is impdef on the device.

> 
> The DOE spec does leave it pretty arbitrary regarding N DOE instances (DOE 
> Extended Cap entry points) for M protocols, including where N>1 and M=1.  
> Currently we implement N=2 DOE caps (instances), one for CDAT, one for 
> Compliance Mode.[
> 
> Maybe a more complex MLD device might have one or more DOE instances 
> for the CDAT protocol alone to define each HDM but currently we only have 
> one pmem (SLD) so we can’t really do much more than what’s supported.
> 
> Open to further suggestion though.  Based on answer to above we’ll follow 
> the suggestion lower in the code review regarding 
> 
...
Chris Browy Feb. 19, 2021, 12:46 a.m. UTC | #6
> On Feb 18, 2021, at 2:11 PM, Jonathan Cameron <jonathan.cameron@huawei.com> wrote:
> 
> On Fri, 12 Feb 2021 16:58:21 -0500
> Chris Browy <cbrowy@avery-design.com> wrote:
> 
>>> On Feb 12, 2021, at 11:24 AM, Jonathan Cameron <Jonathan.Cameron@Huawei.com> wrote:
>>> 
>>> On Tue, 9 Feb 2021 15:35:49 -0500
>>> Chris Browy <cbrowy@avery-design.com> wrote:
>>> 
>>> Run ./scripts/checkpatch.pl over the patches and fix all the warnings before
>>> posting.  It will save time by clearing out most of the minor formatting issues
>>> and similar that inevitably sneak in during development.
>>> 
>> Excellent suggestion.  We’re still newbies!
>> 
>>> The biggest issue I'm seeing in here is that the abstraction of
>>> multiple DOE capabiltiies accessing same protocols doesn't make sense.
>>> 
>>> Each DOE ecap region and hence mailbox can have it's own set of
>>> (possibly  overlapping) protocols.
>>> 
>>> From the ECN:
>>> "It is permitted for a protocol using data object exchanges to require
>>> that a Function implement a unique instance of DOE for that specific
>>> protocol, and/or to allow sharing of a DOE instance to only a specific
>>> set of protocols using data object exchange, and/or to allow a Function
>>> to implement multiple instances of DOE supporting the specific protocol."
>>> 
>>> Tightly couple the ECAP and DOE.  If we are in the multiple instances
>>> of DOE supporting a specific protocol case, then register it separately
>>> for each one.  The individual device emulation then needs to deal with
>>> any possible clashes etc.  
>> 
>> Not sure how configurable we want to make the device.  It is a simple type 3
>> device after all. 
> 
> Agreed, but what I (or someone else) really doesn't want to have to do
> in the future is reimplement DOE because we made design decisions that make
> this version hard to reuse.  Unless it is particularly nasty to do we should
> try to design something that is generally useful rather than targeted to
> closely at the specific case we are dealing with.
> 
> I'd argue the ECAP and the DOE mailbox are always tightly coupled 1-to-1.
> Whether the device wants to implement multiple protocols on each DOE mailbox
> or indeed run individual protocols on multiple DOE mailboxes is a design
> decision, but the actual mechanics of DOE match up with the config
> space structures anything else is impdef on the device.

Yes I agree that there is 1-to-1 between DOE extended cap (ECAP) and DOE
Mailbox.  If we want to provide complete flexibility we should let the user pass 
device property arrays to QEMU command for how many DOE ECAP’s to build 
out and how to assign protocol(s) to each of them.  Array index is the DOE 
instance #.

Also we can provide a property for cdat binary (blob) filename to initialize 
the CDAT structure[entries].  This just reads in whatever mix of CDAT structure
types are in the blob.

-device cxl-type3,bus=rp0,memdev=cxl-mem1,id=cxl-pmem0,size=256M \
    doe-ecap-instances=2 \
    doe-ecap[0]=5 // bitwise OR for protocols shared
    doe-ecap[1]=2 //bitwise OR for protocols shared
    doe-ecap-cdat[1]=mycdat.bin

where let’s say protocols bitvector
bit [0]=CMA
bit [1]=CDAT
bit [2}=Compliance

Let me know if you some better alternatives and we’ll implement it.


> 
>> 
>> The DOE spec does leave it pretty arbitrary regarding N DOE instances (DOE 
>> Extended Cap entry points) for M protocols, including where N>1 and M=1.  
>> Currently we implement N=2 DOE caps (instances), one for CDAT, one for 
>> Compliance Mode.[
>> 
>> Maybe a more complex MLD device might have one or more DOE instances 
>> for the CDAT protocol alone to define each HDM but currently we only have 
>> one pmem (SLD) so we can’t really do much more than what’s supported.
>> 
>> Open to further suggestion though.  Based on answer to above we’ll follow 
>> the suggestion lower in the code review regarding 
>> 
> ...
>
Jonathan Cameron Feb. 19, 2021, 12:33 p.m. UTC | #7
On Thu, 18 Feb 2021 19:46:54 -0500
Chris Browy <cbrowy@avery-design.com> wrote:

> > On Feb 18, 2021, at 2:11 PM, Jonathan Cameron <jonathan.cameron@huawei.com> wrote:
> > 
> > On Fri, 12 Feb 2021 16:58:21 -0500
> > Chris Browy <cbrowy@avery-design.com> wrote:
> >   
> >>> On Feb 12, 2021, at 11:24 AM, Jonathan Cameron <Jonathan.Cameron@Huawei.com> wrote:
> >>> 
> >>> On Tue, 9 Feb 2021 15:35:49 -0500
> >>> Chris Browy <cbrowy@avery-design.com> wrote:
> >>> 
> >>> Run ./scripts/checkpatch.pl over the patches and fix all the warnings before
> >>> posting.  It will save time by clearing out most of the minor formatting issues
> >>> and similar that inevitably sneak in during development.
> >>>   
> >> Excellent suggestion.  We’re still newbies!
> >>   
> >>> The biggest issue I'm seeing in here is that the abstraction of
> >>> multiple DOE capabiltiies accessing same protocols doesn't make sense.
> >>> 
> >>> Each DOE ecap region and hence mailbox can have it's own set of
> >>> (possibly  overlapping) protocols.
> >>> 
> >>> From the ECN:
> >>> "It is permitted for a protocol using data object exchanges to require
> >>> that a Function implement a unique instance of DOE for that specific
> >>> protocol, and/or to allow sharing of a DOE instance to only a specific
> >>> set of protocols using data object exchange, and/or to allow a Function
> >>> to implement multiple instances of DOE supporting the specific protocol."
> >>> 
> >>> Tightly couple the ECAP and DOE.  If we are in the multiple instances
> >>> of DOE supporting a specific protocol case, then register it separately
> >>> for each one.  The individual device emulation then needs to deal with
> >>> any possible clashes etc.    
> >> 
> >> Not sure how configurable we want to make the device.  It is a simple type 3
> >> device after all.   
> > 
> > Agreed, but what I (or someone else) really doesn't want to have to do
> > in the future is reimplement DOE because we made design decisions that make
> > this version hard to reuse.  Unless it is particularly nasty to do we should
> > try to design something that is generally useful rather than targeted to
> > closely at the specific case we are dealing with.
> > 
> > I'd argue the ECAP and the DOE mailbox are always tightly coupled 1-to-1.
> > Whether the device wants to implement multiple protocols on each DOE mailbox
> > or indeed run individual protocols on multiple DOE mailboxes is a design
> > decision, but the actual mechanics of DOE match up with the config
> > space structures anything else is impdef on the device.  
> 
> Yes I agree that there is 1-to-1 between DOE extended cap (ECAP) and DOE
> Mailbox.  If we want to provide complete flexibility we should let the user pass 
> device property arrays to QEMU command for how many DOE ECAP’s to build 
> out and how to assign protocol(s) to each of them.  Array index is the DOE 
> instance #.
> 
> Also we can provide a property for cdat binary (blob) filename to initialize 
> the CDAT structure[entries].  This just reads in whatever mix of CDAT structure
> types are in the blob.
> 
> -device cxl-type3,bus=rp0,memdev=cxl-mem1,id=cxl-pmem0,size=256M \
>     doe-ecap-instances=2 \
>     doe-ecap[0]=5 // bitwise OR for protocols shared
>     doe-ecap[1]=2 //bitwise OR for protocols shared
>     doe-ecap-cdat[1]=mycdat.bin
> 
> where let’s say protocols bitvector
> bit [0]=CMA
> bit [1]=CDAT
> bit [2}=Compliance
> 
> Let me know if you some better alternatives and we’ll implement it.
> 

Gut feeling for DOE is that the particular combination of protocol and
ECAP/DOE is device dependent. As such...

I'm not sure we actually want to expose it as command line controllable at all.
If we do, I'd suggest a small number of sane choices that exercise cases
we want to check.

From a testing point of view, 2 DOE, one of which supports multiple
protocols and we will have enough to test likely failure modes in the code.

The one protocol running on multiple mailboxes is already covered by the
discovery protocol which they all support.  That might not exercise
all the potential problems on the emulator side (as no need to do
locking etc) but it will proabbly exercise those in the OS and firmware.

Almost all users of DOE functionality in QEMU in the long run are likely
to be emulating a particular device so will hard code the DOE instances present
on that device in their emulation of whatever PCIe device they are
emulating.

This is no different to picking a particular layout for config space.
We could make it fully flexible, but it's rarely useful to do so.

If anyone wants to check something unusual, they can hack it into
QEMU.

As a side note, a protocol bit vector is going to unmaintainable as
there will be lots of protocols and last thing we want is that vector
to mean different things on different emulated PCI devices.

Jonathan



> 
> >   
> >> 
> >> The DOE spec does leave it pretty arbitrary regarding N DOE instances (DOE 
> >> Extended Cap entry points) for M protocols, including where N>1 and M=1.  
> >> Currently we implement N=2 DOE caps (instances), one for CDAT, one for 
> >> Compliance Mode.[
> >> 
> >> Maybe a more complex MLD device might have one or more DOE instances 
> >> for the CDAT protocol alone to define each HDM but currently we only have 
> >> one pmem (SLD) so we can’t really do much more than what’s supported.
> >> 
> >> Open to further suggestion though.  Based on answer to above we’ll follow 
> >> the suggestion lower in the code review regarding 
> >>   
> > ...
> >   
>
Jonathan Cameron March 4, 2021, 7:21 p.m. UTC | #8
On Tue, 9 Feb 2021 15:35:49 -0500
Chris Browy <cbrowy@avery-design.com> wrote:

Hi Chris,

One more thing hit whilst debugging linux side of this.

> +static void pcie_doe_irq_assert(DOECap *doe_cap)
> +{
> +    PCIDevice *dev = doe_cap->doe->pdev;
> +
> +    if (doe_cap->cap.intr && doe_cap->ctrl.intr) {


need something like

doe_cap->status.intr = 1;

I think or anyone checking the status register is going to think
this interrupt is spurious.

Otherwise all seems to work. I need to do a bit of tidying up on
kernel code but should be able to send out early next week.

> +        /* Interrupt notify */
> +        if (msix_enabled(dev)) {
> +            msix_notify(dev, doe_cap->cap.vec);
> +        } else if (msi_enabled(dev)) {
> +            msi_notify(dev, doe_cap->cap.vec);
> +        }
> +        /* Not support legacy IRQ */
> +    }
> +}
Chris Browy March 4, 2021, 7:50 p.m. UTC | #9
> On Mar 4, 2021, at 2:21 PM, Jonathan Cameron <jonathan.cameron@huawei.com> wrote:
> 
> On Tue, 9 Feb 2021 15:35:49 -0500
> Chris Browy <cbrowy@avery-design.com> wrote:
> 
> Hi Chris,
> 
> One more thing hit whilst debugging linux side of this.
> 
>> +static void pcie_doe_irq_assert(DOECap *doe_cap)
>> +{
>> +    PCIDevice *dev = doe_cap->doe->pdev;
>> +
>> +    if (doe_cap->cap.intr && doe_cap->ctrl.intr) {
> 
> 
> need something like
> 
> doe_cap->status.intr = 1;
> 
> I think or anyone checking the status register is going to think
> this interrupt is spurious.

You’re absolutely right, good catch!

> 
> Otherwise all seems to work. I need to do a bit of tidying up on
> kernel code but should be able to send out early next week.
> 

We’re putting out the v3 by end of this week.  We’re spent a bit longer
tidying up on our end but sounds like coming together real soon in 5.12 
release!

>> +        /* Interrupt notify */
>> +        if (msix_enabled(dev)) {
>> +            msix_notify(dev, doe_cap->cap.vec);
>> +        } else if (msi_enabled(dev)) {
>> +            msi_notify(dev, doe_cap->cap.vec);
>> +        }
>> +        /* Not support legacy IRQ */
>> +    }
>> +}
diff mbox series

Patch

diff --git a/MAINTAINERS b/MAINTAINERS
index 981dc92..4fb865e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1655,6 +1655,13 @@  F: docs/pci*
 F: docs/specs/*pci*
 F: default-configs/pci.mak
 
+PCIE DOE
+M: Huai-Cheng Kuo <hchkuo@avery-design.com.tw>
+M: Chris Browy <cbrowy@avery-design.com>
+S: Supported
+F: include/hw/pci/pcie_doe.h
+F: hw/pci/pcie_doe.c
+
 ACPI/SMBIOS
 M: Michael S. Tsirkin <mst@redhat.com>
 M: Igor Mammedov <imammedo@redhat.com>
diff --git a/hw/pci/meson.build b/hw/pci/meson.build
index 5c4bbac..115e502 100644
--- a/hw/pci/meson.build
+++ b/hw/pci/meson.build
@@ -12,6 +12,7 @@  pci_ss.add(files(
 # allow plugging PCIe devices into PCI buses, include them even if
 # CONFIG_PCI_EXPRESS=n.
 pci_ss.add(files('pcie.c', 'pcie_aer.c'))
+pci_ss.add(files('pcie_doe.c'))
 softmmu_ss.add(when: 'CONFIG_PCI_EXPRESS', if_true: files('pcie_port.c', 'pcie_host.c'))
 softmmu_ss.add_all(when: 'CONFIG_PCI', if_true: pci_ss)
 
diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
index 1ecf6f6..f7516c4 100644
--- a/hw/pci/pcie.c
+++ b/hw/pci/pcie.c
@@ -735,7 +735,7 @@  void pcie_cap_slot_write_config(PCIDevice *dev,
 
     hotplug_event_notify(dev);
 
-    /* 
+    /*
      * 6.7.3.2 Command Completed Events
      *
      * Software issues a command to a hot-plug capable Downstream Port by
diff --git a/hw/pci/pcie_doe.c b/hw/pci/pcie_doe.c
new file mode 100644
index 0000000..df8e92e
--- /dev/null
+++ b/hw/pci/pcie_doe.c
@@ -0,0 +1,414 @@ 
+#include "qemu/osdep.h"
+#include "qemu/log.h"
+#include "qemu/error-report.h"
+#include "qapi/error.h"
+#include "qemu/range.h"
+#include "hw/pci/pci.h"
+#include "hw/pci/pcie.h"
+#include "hw/pci/pcie_doe.h"
+#include "hw/pci/msi.h"
+#include "hw/pci/msix.h"
+
+/*
+ * DOE Default Protocols (Discovery, CMA)
+ */
+/* Discovery Request Object */
+struct doe_discovery {
+    DOEHeader header;
+    uint8_t index;
+    uint8_t reserved[3];
+} QEMU_PACKED;
+
+/* Discovery Response Object */
+struct doe_discovery_rsp {
+    DOEHeader header;
+    uint16_t vendor_id;
+    uint8_t doe_type;
+    uint8_t next_index;
+} QEMU_PACKED;
+
+/* Callback for Discovery */
+static bool pcie_doe_discovery_rsp(DOECap *doe_cap)
+{
+    PCIEDOE *doe = doe_cap->doe;
+    struct doe_discovery *req = pcie_doe_get_req(doe_cap);
+    uint8_t index = req->index;
+    DOEProtocol *prot = NULL;
+
+    /* Request length mismatch, discard */
+    if (req->header.length < dwsizeof(struct doe_discovery)) {
+        return DOE_DISCARD;
+    }
+
+    /* Point to the requested protocol */
+    if (index < doe->protocol_num) {
+        prot = &doe->protocols[index];
+    }
+
+    struct doe_discovery_rsp rsp = {
+        .header = {
+            .vendor_id = PCI_VENDOR_ID_PCI_SIG,
+            .doe_type = PCI_SIG_DOE_DISCOVERY,
+            .reserved = 0x0,
+            .length = dwsizeof(struct doe_discovery_rsp),
+        },
+        .vendor_id = (prot) ? prot->vendor_id : 0xFFFF,
+        .doe_type = (prot) ? prot->doe_type : 0xFF,
+        .next_index = (index + 1) < doe->protocol_num ?
+                      (index + 1) : 0,
+    };
+
+    pcie_doe_set_rsp(doe_cap, &rsp);
+
+    return DOE_SUCCESS;
+}
+
+/* Callback for CMA */
+static bool pcie_doe_cma_rsp(DOECap *doe_cap)
+{
+    doe_cap->status.error = 1;
+
+    memset(doe_cap->read_mbox, 0,
+           PCI_DOE_MAX_DW_SIZE * sizeof(uint32_t));
+
+    doe_cap->write_mbox_len = 0;
+
+    return DOE_DISCARD;
+}
+
+/*
+ * DOE Utilities
+ */
+static void pcie_doe_reset_mbox(DOECap *st)
+{
+    st->read_mbox_idx = 0;
+
+    st->read_mbox_len = 0;
+    st->write_mbox_len = 0;
+
+    memset(st->read_mbox, 0, PCI_DOE_MAX_DW_SIZE * sizeof(uint32_t));
+    memset(st->write_mbox, 0, PCI_DOE_MAX_DW_SIZE * sizeof(uint32_t));
+}
+
+/*
+ * Initialize the list and protocol for a device.
+ * This function won't add the DOE capabitity to your PCIe device.
+ */
+void pcie_doe_init(PCIDevice *dev, PCIEDOE *doe)
+{
+    doe->pdev = dev;
+    doe->head = NULL;
+    doe->protocol_num = 0;
+
+    /* Register two default protocol */
+    //TODO : LINK LIST
+    pcie_doe_register_protocol(doe, PCI_VENDOR_ID_PCI_SIG,
+            PCI_SIG_DOE_DISCOVERY, pcie_doe_discovery_rsp);
+    pcie_doe_register_protocol(doe, PCI_VENDOR_ID_PCI_SIG,
+            PCI_SIG_DOE_CMA, pcie_doe_cma_rsp);
+}
+
+int pcie_cap_doe_add(PCIEDOE *doe, uint16_t offset, bool intr, uint16_t vec) {
+    DOECap *new_cap, **ptr;
+    PCIDevice *dev = doe->pdev;
+
+    pcie_add_capability(dev, PCI_EXT_CAP_ID_DOE, PCI_DOE_VER, offset,
+                        PCI_DOE_SIZEOF);
+
+    ptr = &doe->head;
+    /* Insert the new DOE at the end of linked list */
+    while (*ptr) {
+        if (range_covers_byte((*ptr)->offset, PCI_DOE_SIZEOF, offset) ||
+            (*ptr)->cap.vec == vec) {
+            return -EINVAL;
+        }
+
+        ptr = &(*ptr)->next;
+    }
+    new_cap = g_malloc0(sizeof(DOECap));
+    *ptr = new_cap;
+
+    new_cap->doe = doe;
+    new_cap->offset = offset;
+    new_cap->next = NULL;
+
+    /* Configure MSI/MSI-X */
+    if (intr && (msi_present(dev) || msix_present(dev))) {
+        new_cap->cap.intr = intr;
+        new_cap->cap.vec = vec;
+    }
+
+    /* Set up W/R Mailbox buffer */
+    new_cap->write_mbox = g_malloc0(PCI_DOE_MAX_DW_SIZE * sizeof(uint32_t));
+    new_cap->read_mbox = g_malloc0(PCI_DOE_MAX_DW_SIZE * sizeof(uint32_t));
+
+    pcie_doe_reset_mbox(new_cap);
+
+    return 0;
+}
+
+void pcie_doe_uninit(PCIEDOE *doe) {
+    PCIDevice *dev = doe->pdev;
+    DOECap *cap;
+
+    pci_del_capability(dev, PCI_EXT_CAP_ID_DOE, PCI_DOE_SIZEOF);
+
+    cap = doe->head;
+    while (cap) {
+        doe->head = doe->head->next;
+
+        g_free(cap->read_mbox);
+        g_free(cap->write_mbox);
+        cap->read_mbox = NULL;
+        cap->write_mbox = NULL;
+        g_free(cap);
+        cap = doe->head;
+    }
+}
+
+void pcie_doe_register_protocol(PCIEDOE *doe, uint16_t vendor_id,
+        uint8_t doe_type, bool (*set_rsp)(DOECap *))
+{
+    /* Protocol num should not exceed 256 */
+    assert(doe->protocol_num < PCI_DOE_PROTOCOL_MAX);
+
+    doe->protocols[doe->protocol_num].vendor_id = vendor_id;
+    doe->protocols[doe->protocol_num].doe_type = doe_type;
+    doe->protocols[doe->protocol_num].set_rsp = set_rsp;
+
+    doe->protocol_num++;
+}
+
+uint32_t pcie_doe_build_protocol(DOEProtocol *p)
+{
+    return DATA_OBJ_BUILD_HEADER1(p->vendor_id, p->doe_type);
+}
+
+/* Return the pointer of DOE request in write mailbox buffer */
+void *pcie_doe_get_req(DOECap *doe_cap)
+{
+    return doe_cap->write_mbox;
+}
+
+/* Copy the response to read mailbox buffer */
+void pcie_doe_set_rsp(DOECap *doe_cap, void *rsp)
+{
+    uint32_t len = pcie_doe_object_len(rsp);
+
+    memcpy(doe_cap->read_mbox + doe_cap->read_mbox_len,
+           rsp, len * sizeof(uint32_t));
+    doe_cap->read_mbox_len += len;
+}
+
+/* Get Data Object length */
+uint32_t pcie_doe_object_len(void *obj)
+{
+    return (obj)? ((DOEHeader *)obj)->length : 0;
+}
+
+static void pcie_doe_write_mbox(DOECap *doe_cap, uint32_t val)
+{
+    memcpy(doe_cap->write_mbox + doe_cap->write_mbox_len, &val, sizeof(uint32_t));
+
+    if (doe_cap->write_mbox_len == 1) {
+        DOE_DBG("  Capture DOE request DO length = %d\n", val & 0x0003ffff);
+    }
+
+    doe_cap->write_mbox_len++;
+}
+
+static void pcie_doe_irq_assert(DOECap *doe_cap)
+{
+    PCIDevice *dev = doe_cap->doe->pdev;
+
+    if (doe_cap->cap.intr && doe_cap->ctrl.intr) {
+        /* Interrupt notify */
+        if (msix_enabled(dev)) {
+            msix_notify(dev, doe_cap->cap.vec);
+        } else if (msi_enabled(dev)) {
+            msi_notify(dev, doe_cap->cap.vec);
+        }
+        /* Not support legacy IRQ */
+    }
+}
+
+static void pcie_doe_set_ready(DOECap *doe_cap, bool rdy)
+{
+    doe_cap->status.ready = rdy;
+
+    if (rdy) {
+        pcie_doe_irq_assert(doe_cap);
+    }
+}
+
+static void pcie_doe_set_error(DOECap *doe_cap, bool err)
+{
+    doe_cap->status.error = err;
+
+    if (err) {
+        pcie_doe_irq_assert(doe_cap);
+    }
+}
+
+DOECap *pcie_doe_covers_addr(PCIEDOE *doe, uint32_t addr)
+{
+    DOECap *ptr = doe->head;
+
+    /* If overlaps DOE range. PCIe Capability Header won't be included. */
+    while (ptr && 
+           !range_covers_byte(ptr->offset + PCI_EXP_DOE_CAP, PCI_DOE_SIZEOF - 4, addr)) {
+        ptr = ptr->next;
+    }
+    
+    return ptr;
+}
+
+uint32_t pcie_doe_read_config(DOECap *doe_cap,
+                              uint32_t addr, int size)
+{
+    uint32_t ret = 0, shift, mask = 0xFFFFFFFF;
+    uint16_t doe_offset = doe_cap->offset;
+
+    /* If overlaps DOE range. PCIe Capability Header won't be included. */
+    if (range_covers_byte(doe_offset + PCI_EXP_DOE_CAP, PCI_DOE_SIZEOF - 4, addr)) {
+        addr -= doe_offset;
+
+        if (range_covers_byte(PCI_EXP_DOE_CAP, sizeof(uint32_t), addr)) {
+            ret = FIELD_DP32(ret, PCI_DOE_CAP_REG, INTR_SUPP,
+                             doe_cap->cap.intr);
+            ret = FIELD_DP32(ret, PCI_DOE_CAP_REG, DOE_INTR_MSG_NUM,
+                             doe_cap->cap.vec);
+        } else if (range_covers_byte(PCI_EXP_DOE_CTRL, sizeof(uint32_t), addr)) {
+            /* Must return ABORT=0 and GO=0 */
+            ret = FIELD_DP32(ret, PCI_DOE_CAP_CONTROL, DOE_INTR_EN,
+                             doe_cap->ctrl.intr);
+            DOE_DBG("  CONTROL REG=%x\n", ret);
+        } else if (range_covers_byte(PCI_EXP_DOE_STATUS, sizeof(uint32_t), addr)) {
+            ret = FIELD_DP32(ret, PCI_DOE_CAP_STATUS, DOE_BUSY,
+                             doe_cap->status.busy);
+            ret = FIELD_DP32(ret, PCI_DOE_CAP_STATUS, DOE_INTR_STATUS,
+                             doe_cap->status.intr);
+            ret = FIELD_DP32(ret, PCI_DOE_CAP_STATUS, DOE_ERROR,
+                             doe_cap->status.error);
+            ret = FIELD_DP32(ret, PCI_DOE_CAP_STATUS, DATA_OBJ_RDY,
+                             doe_cap->status.ready);
+        } else if (range_covers_byte(PCI_EXP_DOE_RD_DATA_MBOX, sizeof(uint32_t), addr)) {
+            /* Check that READY is set */
+            if (!doe_cap->status.ready) {
+                /* Return 0 if not ready */
+                DOE_DBG("  RD MBOX RETURN=%x\n", ret);
+            } else {
+                /* Deposit next DO DW into read mbox */
+                DOE_DBG("  RD MBOX, DATA OBJECT READY,"
+                        " CURRENT DO DW OFFSET=%x\n",
+                        doe_cap->read_mbox_idx);
+
+                ret = doe_cap->read_mbox[doe_cap->read_mbox_idx];
+
+                DOE_DBG("  RD MBOX DW=%x\n", ret);
+                DOE_DBG("  RD MBOX DW OFFSET=%d, RD MBOX LENGTH=%d\n",
+                        doe_cap->read_mbox_idx, doe_cap->read_mbox_len);
+            }
+        } else if (range_covers_byte(PCI_EXP_DOE_WR_DATA_MBOX, sizeof(uint32_t), addr)) {
+            DOE_DBG("  WR MBOX, local_val =%x\n", ret);
+        }
+    }
+
+    /* Alignment */
+    shift = addr % sizeof(uint32_t);
+    if (shift) {
+        ret >>= shift * 8;
+    }
+    mask >>= (sizeof(uint32_t) - size) * 8;
+
+    return ret & mask;
+}
+
+void pcie_doe_write_config(DOECap *doe_cap,
+                           uint32_t addr, uint32_t val, int size)
+{
+    PCIEDOE *doe = doe_cap->doe;
+    uint16_t doe_offset = doe_cap->offset;
+    int p;
+    bool discard;
+    uint32_t shift;
+
+    DOE_DBG("  addr=%x, val=%x, size=%x\n", addr, val, size);
+
+    /* If overlaps DOE range. PCIe Capability Header won't be included. */
+    if (range_covers_byte(doe_offset + PCI_EXP_DOE_CAP, PCI_DOE_SIZEOF - 4, addr)) {
+        /* Alignment */
+        shift = addr % sizeof(uint32_t);
+        addr -= (doe_offset - shift);
+        val <<= shift * 8;
+
+        switch (addr) {
+        case PCI_EXP_DOE_CTRL:
+            DOE_DBG("  CONTROL=%x\n", val);
+            if (FIELD_EX32(val, PCI_DOE_CAP_CONTROL, DOE_ABORT)) {
+                /* If ABORT, clear status reg DOE Error and DOE Ready */
+                DOE_DBG("  Setting ABORT\n");
+                pcie_doe_set_ready(doe_cap, 0);
+                pcie_doe_set_error(doe_cap, 0);
+                pcie_doe_reset_mbox(doe_cap);
+            } else if (FIELD_EX32(val, PCI_DOE_CAP_CONTROL, DOE_GO)) {
+                DOE_DBG("  CONTROL GO=%x\n", val);
+                /*
+                 * Check protocol the incoming request in write_mbox and
+                 * memcpy the corresponding response to read_mbox.
+                 *
+                 * "discard" should be set up if the response callback
+                 * function could not deal with request (e.g. length
+                 * mismatch) or the protocol of request was not found.
+                 */
+                discard = DOE_DISCARD;
+                for (p = 0; p < doe->protocol_num; p++) {
+                    if (doe_cap->write_mbox[0] ==
+                        pcie_doe_build_protocol(&doe->protocols[p])) {
+                        /* Found */
+                        DOE_DBG("  DOE PROTOCOL=%x\n", doe_cap->write_mbox[0]);
+                        /*
+                         * Spec:
+                         * If the number of DW transferred does not match the
+                         * indicated Length for a data object, then the
+                         * data object must be silently discarded.
+                         */
+                        if (doe_cap->write_mbox_len ==
+                            pcie_doe_object_len(pcie_doe_get_req(doe_cap)))
+                            discard = doe->protocols[p].set_rsp(doe_cap);
+                        break;
+                    }
+                }
+
+                /* Set DOE Ready */
+                if (!discard) {
+                    pcie_doe_set_ready(doe_cap, 1);
+                } else {
+                    pcie_doe_reset_mbox(doe_cap);
+                }
+            } else if (FIELD_EX32(val, PCI_DOE_CAP_CONTROL, DOE_INTR_EN)) {
+                doe_cap->ctrl.intr = 1;
+            }
+            break;
+        case PCI_EXP_DOE_RD_DATA_MBOX:
+            /* Read MBOX */
+            DOE_DBG("  INCR RD MBOX DO DW OFFSET=%d\n", doe_cap->read_mbox_idx);
+            doe_cap->read_mbox_idx += 1;
+            /* Last DW */
+            if (doe_cap->read_mbox_idx >= doe_cap->read_mbox_len) {
+                pcie_doe_reset_mbox(doe_cap);
+                pcie_doe_set_ready(doe_cap, 0);
+            }
+            break;
+        case PCI_EXP_DOE_WR_DATA_MBOX:
+            /* Write MBOX */
+            DOE_DBG("  WR MBOX=%x, DW OFFSET = %d\n", val, doe_cap->write_mbox_len);
+            pcie_doe_write_mbox(doe_cap, val);
+            break;
+        case PCI_EXP_DOE_STATUS:
+        case PCI_EXP_DOE_CAP:
+        default:
+            break;
+        }
+    }
+}
diff --git a/include/hw/pci/pci_ids.h b/include/hw/pci/pci_ids.h
index 76bf3ed..636b2e8 100644
--- a/include/hw/pci/pci_ids.h
+++ b/include/hw/pci/pci_ids.h
@@ -157,6 +157,8 @@ 
 
 /* Vendors and devices.  Sort key: vendor first, device next. */
 
+#define PCI_VENDOR_ID_PCI_SIG            0x0001
+
 #define PCI_VENDOR_ID_LSI_LOGIC          0x1000
 #define PCI_DEVICE_ID_LSI_53C810         0x0001
 #define PCI_DEVICE_ID_LSI_53C895A        0x0012
diff --git a/include/hw/pci/pcie.h b/include/hw/pci/pcie.h
index 14c58eb..47d6f66 100644
--- a/include/hw/pci/pcie.h
+++ b/include/hw/pci/pcie.h
@@ -25,6 +25,7 @@ 
 #include "hw/pci/pcie_regs.h"
 #include "hw/pci/pcie_aer.h"
 #include "hw/hotplug.h"
+#include "hw/pci/pcie_doe.h"
 
 typedef enum {
     /* for attention and power indicator */
diff --git a/include/hw/pci/pcie_doe.h b/include/hw/pci/pcie_doe.h
new file mode 100644
index 0000000..f497075
--- /dev/null
+++ b/include/hw/pci/pcie_doe.h
@@ -0,0 +1,166 @@ 
+#ifndef PCIE_DOE_H
+#define PCIE_DOE_H
+
+#include "qemu/range.h"
+#include "qemu/typedefs.h"
+#include "hw/register.h"
+
+/* 
+ * PCI DOE register defines 7.9.xx 
+ */
+/* DOE Capabilities Register */
+#define PCI_EXP_DOE_CAP             0x04
+REG32(PCI_DOE_CAP_REG, 0)
+    FIELD(PCI_DOE_CAP_REG, INTR_SUPP, 0, 1)
+    FIELD(PCI_DOE_CAP_REG, DOE_INTR_MSG_NUM, 1, 11)
+
+/* DOE Control Register */
+#define PCI_EXP_DOE_CTRL            0x08
+REG32(PCI_DOE_CAP_CONTROL, 0)
+    FIELD(PCI_DOE_CAP_CONTROL, DOE_ABORT, 0, 1)
+    FIELD(PCI_DOE_CAP_CONTROL, DOE_INTR_EN, 1, 1)
+    FIELD(PCI_DOE_CAP_CONTROL, DOE_GO, 31, 1)
+
+/* DOE Status Register  */
+#define PCI_EXP_DOE_STATUS          0x0c
+REG32(PCI_DOE_CAP_STATUS, 0)
+    FIELD(PCI_DOE_CAP_STATUS, DOE_BUSY, 0, 1)
+    FIELD(PCI_DOE_CAP_STATUS, DOE_INTR_STATUS, 1, 1)
+    FIELD(PCI_DOE_CAP_STATUS, DOE_ERROR, 2, 1)
+    FIELD(PCI_DOE_CAP_STATUS, DATA_OBJ_RDY, 31, 1)
+
+/* DOE Write Data Mailbox Register  */
+#define PCI_EXP_DOE_WR_DATA_MBOX    0x10
+
+/* DOE Read Data Mailbox Register  */
+#define PCI_EXP_DOE_RD_DATA_MBOX    0x14
+
+/* 
+ * PCI DOE register defines 7.9.xx 
+ */
+/* Table 7-x2 */
+#define PCI_SIG_DOE_DISCOVERY       0x00
+#define PCI_SIG_DOE_CMA             0x01
+
+#define DATA_OBJ_BUILD_HEADER1(v, p)  ((p << 16) | v)
+
+/* Figure 6-x1 */
+#define DATA_OBJECT_HEADER1_OFFSET  0
+#define DATA_OBJECT_HEADER2_OFFSET  1
+#define DATA_OBJECT_CONTENT_OFFSET  2
+
+#define PCI_DOE_MAX_DW_SIZE (1 << 18)
+#define PCI_DOE_PROTOCOL_MAX 256
+
+/*
+ * Self-defined Marco
+ */
+/* Request/Response State */
+#define DOE_SUCCESS 0
+#define DOE_DISCARD 1
+
+/* An invalid vector number for MSI/MSI-x */
+#define DOE_NO_INTR (~0)
+
+/*
+ * DOE Protocol - Data Object
+ */
+typedef struct DOEHeader DOEHeader;
+typedef struct DOEProtocol DOEProtocol;
+typedef struct PCIEDOE PCIEDOE;
+typedef struct DOECap DOECap;
+
+struct DOEHeader {
+    uint16_t vendor_id;
+    uint8_t doe_type;
+    uint8_t reserved;
+    struct {
+        uint32_t length:18;
+        uint32_t reserved2:14;
+    };
+} QEMU_PACKED;
+
+/* Protocol infos and rsp function callback */
+struct DOEProtocol {
+    uint16_t vendor_id;
+    uint8_t doe_type;
+
+    bool (*set_rsp)(DOECap *);
+};
+
+struct PCIEDOE {
+    PCIDevice *pdev;
+    DOECap *head;
+
+    /* Protocols and its callback response */
+    DOEProtocol protocols[PCI_DOE_PROTOCOL_MAX];
+    uint32_t protocol_num;
+};
+
+struct DOECap {
+    PCIEDOE *doe;
+
+    /* Capability offset */
+    uint16_t offset;
+
+    /* Next DOE capability */
+    DOECap *next;
+
+    /* Capability */
+    struct {
+        bool intr;
+        uint16_t vec;
+    } cap;
+
+    /* Control */
+    struct {
+        bool abort;
+        bool intr;
+        bool go;
+    } ctrl;
+
+    /* Status */
+    struct {
+        bool busy;
+        bool intr;
+        bool error;
+        bool ready;
+    } status;
+
+    /* Mailbox buffer for device */
+    uint32_t *write_mbox;
+    uint32_t *read_mbox;
+
+    /* Mailbox position indicator */
+    uint32_t read_mbox_idx;
+    uint32_t read_mbox_len;
+    uint32_t write_mbox_len;
+};
+
+void pcie_doe_init(PCIDevice *dev, PCIEDOE *doe);
+int pcie_cap_doe_add(PCIEDOE *doe, uint16_t offset, bool intr, uint16_t vec);
+void pcie_doe_uninit(PCIEDOE *doe);
+void pcie_doe_register_protocol(PCIEDOE *doe, uint16_t vendor_id,
+                                uint8_t doe_type,
+                                bool (*set_rsp)(DOECap *));
+uint32_t pcie_doe_build_protocol(DOEProtocol *p);
+DOECap *pcie_doe_covers_addr(PCIEDOE *doe, uint32_t addr);
+uint32_t pcie_doe_read_config(DOECap *doe_cap, uint32_t addr, int size);
+void pcie_doe_write_config(DOECap *doe_cap, uint32_t addr,
+                           uint32_t val, int size);
+
+/* Utility functions for set_rsp in DOEProtocol */
+void *pcie_doe_get_req(DOECap *doe_cap);
+void pcie_doe_set_rsp(DOECap *doe_cap, void *rsp);
+uint32_t pcie_doe_object_len(void *obj);
+
+#define DOE_DEBUG
+#ifdef DOE_DEBUG
+#define DOE_DBG(...) printf(__VA_ARGS__)
+#else
+#define DOE_DBG(...) {}
+#endif
+
+#define dwsizeof(s) ((sizeof(s) + 4 - 1) / 4)
+
+#endif /* PCIE_DOE_H */
diff --git a/include/hw/pci/pcie_regs.h b/include/hw/pci/pcie_regs.h
index 1db86b0..963dc2e 100644
--- a/include/hw/pci/pcie_regs.h
+++ b/include/hw/pci/pcie_regs.h
@@ -179,4 +179,8 @@  typedef enum PCIExpLinkWidth {
 #define PCI_ACS_VER                     0x1
 #define PCI_ACS_SIZEOF                  8
 
+/* DOE Capability Register Fields */
+#define PCI_DOE_VER                     0x1
+#define PCI_DOE_SIZEOF                  24
+
 #endif /* QEMU_PCIE_REGS_H */
diff --git a/include/standard-headers/linux/pci_regs.h b/include/standard-headers/linux/pci_regs.h
index e709ae8..4a7b7a4 100644
--- a/include/standard-headers/linux/pci_regs.h
+++ b/include/standard-headers/linux/pci_regs.h
@@ -730,7 +730,8 @@ 
 #define PCI_EXT_CAP_ID_DVSEC	0x23	/* Designated Vendor-Specific */
 #define PCI_EXT_CAP_ID_DLF	0x25	/* Data Link Feature */
 #define PCI_EXT_CAP_ID_PL_16GT	0x26	/* Physical Layer 16.0 GT/s */
-#define PCI_EXT_CAP_ID_MAX	PCI_EXT_CAP_ID_PL_16GT
+#define PCI_EXT_CAP_ID_DOE      0x2E    /* Data Object Exchange */
+#define PCI_EXT_CAP_ID_MAX	PCI_EXT_CAP_ID_DOE
 
 #define PCI_EXT_CAP_DSN_SIZEOF	12
 #define PCI_EXT_CAP_MCAST_ENDPOINT_SIZEOF 40