Message ID | 20240926074546.24507-10-luchangqi.123@bytedance.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Support persistent reservation operations | expand |
On Sep 26 15:45, Changqi Lu wrote: > Add reservation acquire, reservation register, > reservation release and reservation report commands > in the nvme device layer. > > By introducing these commands, this enables the nvme > device to perform reservation-related tasks, including > querying keys, querying reservation status, registering > reservation keys, initiating and releasing reservations, > as well as clearing and preempting reservations held by > other keys. > > These commands are crucial for management and control of > shared storage resources in a persistent manner. > Signed-off-by: Changqi Lu <luchangqi.123@bytedance.com> > Signed-off-by: zhenwei pi <pizhenwei@bytedance.com> > Acked-by: Klaus Jensen <k.jensen@samsung.com> > --- > hw/nvme/ctrl.c | 359 +++++++++++++++++++++++++++++++++++++++++++ > hw/nvme/ns.c | 6 + > hw/nvme/nvme.h | 10 ++ > include/block/nvme.h | 44 ++++++ > 4 files changed, 419 insertions(+) > > diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c > index ad212de723..ffb876a99f 100644 > --- a/hw/nvme/ctrl.c > +++ b/hw/nvme/ctrl.c > @@ -294,6 +294,10 @@ static const uint32_t nvme_cse_iocs_nvm[256] = { > [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP, > [NVME_CMD_IO_MGMT_RECV] = NVME_CMD_EFF_CSUPP, > [NVME_CMD_IO_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, > + [NVME_CMD_RESV_REGISTER] = NVME_CMD_EFF_CSUPP, > + [NVME_CMD_RESV_REPORT] = NVME_CMD_EFF_CSUPP, > + [NVME_CMD_RESV_ACQUIRE] = NVME_CMD_EFF_CSUPP, > + [NVME_CMD_RESV_RELEASE] = NVME_CMD_EFF_CSUPP, > }; > > static const uint32_t nvme_cse_iocs_zoned[256] = { > @@ -308,6 +312,10 @@ static const uint32_t nvme_cse_iocs_zoned[256] = { > [NVME_CMD_ZONE_APPEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, > [NVME_CMD_ZONE_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, > [NVME_CMD_ZONE_MGMT_RECV] = NVME_CMD_EFF_CSUPP, > + [NVME_CMD_RESV_REGISTER] = NVME_CMD_EFF_CSUPP, > + [NVME_CMD_RESV_REPORT] = NVME_CMD_EFF_CSUPP, > + [NVME_CMD_RESV_ACQUIRE] = NVME_CMD_EFF_CSUPP, > + [NVME_CMD_RESV_RELEASE] = NVME_CMD_EFF_CSUPP, > }; > > static void nvme_process_sq(void *opaque); > @@ -1747,6 +1755,13 @@ static void nvme_aio_err(NvmeRequest *req, int ret) > case NVME_CMD_READ: > status = NVME_UNRECOVERED_READ; > break; > + case NVME_CMD_RESV_REPORT: > + if (ret == -ENOTSUP) { > + status = NVME_INVALID_OPCODE; > + } else { > + status = NVME_UNRECOVERED_READ; > + } > + break; > case NVME_CMD_FLUSH: > case NVME_CMD_WRITE: > case NVME_CMD_WRITE_ZEROES: > @@ -1754,6 +1769,15 @@ static void nvme_aio_err(NvmeRequest *req, int ret) > case NVME_CMD_COPY: > status = NVME_WRITE_FAULT; > break; > + case NVME_CMD_RESV_REGISTER: > + case NVME_CMD_RESV_ACQUIRE: > + case NVME_CMD_RESV_RELEASE: > + if (ret == -ENOTSUP) { > + status = NVME_INVALID_OPCODE; > + } else { > + status = NVME_WRITE_FAULT; > + } > + break; Can the -ENOTSUP actually happen if the block layer has indicated support for reservations? Or is this a left over from the way you handled missing support earlier? > default: > status = NVME_INTERNAL_DEV_ERROR; > break; > @@ -2692,6 +2716,333 @@ static uint16_t nvme_verify(NvmeCtrl *n, NvmeRequest *req) > return NVME_NO_COMPLETE; > } > > +typedef struct NvmeKeyInfo { > + uint64_t cr_key; > + uint64_t nr_key; > +} NvmeKeyInfo; This is an spec data structure, should be in include/block/nvme.h (and maybe use a union to alias NRKEY and PRKEY). > + > +static uint16_t nvme_resv_register(NvmeCtrl *n, NvmeRequest *req) > +{ > + int ret; > + NvmeKeyInfo key_info; > + NvmeNamespace *ns = req->ns; > + uint32_t cdw10 = le32_to_cpu(req->cmd.cdw10); > + bool ignore_key = cdw10 >> 3 & 0x1; Prefer parantheses around the shift arguments. > + uint8_t action = cdw10 & 0x7; > + uint8_t ptpl = cdw10 >> 30 & 0x3; Parantheses. Name is technically CPTPL. > + bool aptpl; > + > + if (!nvme_support_pr(ns)) { > + return NVME_INVALID_OPCODE; > + } > + > + switch (ptpl) { > + case NVME_RESV_PTPL_NO_CHANGE: > + aptpl = (ns->id_ns.rescap & NVME_PR_CAP_PTPL) ? true : false; > + break; > + case NVME_RESV_PTPL_DISABLE: > + aptpl = false; > + break; > + case NVME_RESV_PTPL_ENABLE: > + aptpl = true; > + break; > + default: > + return NVME_INVALID_FIELD; > + } > + > + ret = nvme_h2c(n, (uint8_t *)&key_info, sizeof(NvmeKeyInfo), req); > + if (ret) { > + return ret; > + } > + > + key_info.cr_key = le64_to_cpu(key_info.cr_key); > + key_info.nr_key = le64_to_cpu(key_info.nr_key); > + > + switch (action) { > + case NVME_RESV_REGISTER_ACTION_REGISTER: > + req->aiocb = blk_aio_pr_register(ns->blkconf.blk, 0, > + key_info.nr_key, 0, aptpl, > + ignore_key, nvme_misc_cb, > + req); > + break; > + case NVME_RESV_REGISTER_ACTION_UNREGISTER: > + req->aiocb = blk_aio_pr_register(ns->blkconf.blk, key_info.cr_key, 0, > + 0, aptpl, ignore_key, > + nvme_misc_cb, req); > + break; > + case NVME_RESV_REGISTER_ACTION_REPLACE: > + req->aiocb = blk_aio_pr_register(ns->blkconf.blk, key_info.cr_key, > + key_info.nr_key, 0, aptpl, ignore_key, > + nvme_misc_cb, req); > + break; > + default: > + return NVME_INVALID_FIELD; > + } > + > + return NVME_NO_COMPLETE; > +} > + > +static uint16_t nvme_resv_release(NvmeCtrl *n, NvmeRequest *req) > +{ > + int ret; > + uint64_t cr_key; > + NvmeNamespace *ns = req->ns; > + uint32_t cdw10 = le32_to_cpu(req->cmd.cdw10); I don't understand the reason for this requirement (due to an ECN?), but if IEKEY (bit 3 in DWORD 10) is set, the controller SHALL return an error of Invalid Field in Command. > + uint8_t action = cdw10 & 0x7; > + NvmeResvType type = cdw10 >> 8 & 0xff; Parantheses. > + > + if (!nvme_support_pr(ns)) { > + return NVME_INVALID_OPCODE; > + } > + > + ret = nvme_h2c(n, (uint8_t *)&cr_key, sizeof(cr_key), req); > + if (ret) { > + return ret; > + } > + > + cr_key = le64_to_cpu(cr_key); > + > + switch (action) { > + case NVME_RESV_RELEASE_ACTION_RELEASE: > + req->aiocb = blk_aio_pr_release(ns->blkconf.blk, cr_key, > + nvme_pr_type_to_block(type), > + nvme_misc_cb, req); > + break; > + case NVME_RESV_RELEASE_ACTION_CLEAR: > + req->aiocb = blk_aio_pr_clear(ns->blkconf.blk, cr_key, > + nvme_misc_cb, req); > + break; > + default: > + return NVME_INVALID_FIELD; > + } > + > + return NVME_NO_COMPLETE; > +} > + > +static uint16_t nvme_resv_acquire(NvmeCtrl *n, NvmeRequest *req) > +{ > + int ret; > + NvmeKeyInfo key_info; > + NvmeNamespace *ns = req->ns; > + uint32_t cdw10 = le32_to_cpu(req->cmd.cdw10); Same weird IEKEY requirement. > + uint8_t action = cdw10 & 0x7; > + NvmeResvType type = cdw10 >> 8 & 0xff; Parantheses. > + > + if (!nvme_support_pr(ns)) { > + return NVME_INVALID_OPCODE; > + } > + > + ret = nvme_h2c(n, (uint8_t *)&key_info, sizeof(NvmeKeyInfo), req); > + if (ret) { > + return ret; > + } > + > + key_info.cr_key = le64_to_cpu(key_info.cr_key); > + key_info.nr_key = le64_to_cpu(key_info.nr_key); > + > + switch (action) { > + case NVME_RESV_ACQUIRE_ACTION_ACQUIRE: > + req->aiocb = blk_aio_pr_reserve(ns->blkconf.blk, key_info.cr_key, > + nvme_pr_type_to_block(type), > + nvme_misc_cb, req); > + break; > + case NVME_RESV_ACQUIRE_ACTION_PREEMPT: > + req->aiocb = blk_aio_pr_preempt(ns->blkconf.blk, > + key_info.cr_key, key_info.nr_key, > + nvme_pr_type_to_block(type), > + false, nvme_misc_cb, req); > + break; > + case NVME_RESV_ACQUIRE_ACTION_PREEMPT_AND_ABORT: > + req->aiocb = blk_aio_pr_preempt(ns->blkconf.blk, key_info.cr_key, > + key_info.nr_key, type, true, > + nvme_misc_cb, req); > + break; > + default: > + return NVME_INVALID_FIELD; > + } > + > + return NVME_NO_COMPLETE; > +} > + > +typedef struct NvmeResvKeys { > + uint32_t generation; > + uint32_t num_keys; > + uint64_t *keys; > + NvmeRequest *req; > +} NvmeResvKeys; > + > +typedef struct NvmeReadReservation { > + uint32_t generation; > + uint64_t key; > + BlockPrType type; > + NvmeRequest *req; > + NvmeResvKeys *keys_info; > +} NvmeReadReservation; > + > +static int nvme_read_reservation_cb(NvmeReadReservation *reservation) > +{ > + int rc; > + NvmeReservationStatus *nvme_status; > + NvmeRequest *req = reservation->req; > + NvmeCtrl *n = req->sq->ctrl; > + NvmeResvKeys *keys_info = reservation->keys_info; > + int len = sizeof(NvmeReservationStatusHeader) + > + sizeof(NvmeRegisteredCtrl) * keys_info->num_keys; > + > + nvme_status = g_malloc(len); This leaks contents of the heap in the uninitialized reserved fields. Use g_malloc0. > + nvme_status->header.gen = cpu_to_be32(reservation->generation); > + nvme_status->header.rtype = block_pr_type_to_nvme(reservation->type); > + nvme_status->header.regctl = cpu_to_be16(keys_info->num_keys); > + for (int i = 0; i < keys_info->num_keys; i++) { > + uint16_t ctnlid = nvme_ctrl(req)->cntlid; > + nvme_status->regctl_ds[i].cntlid = cpu_to_be16(ctnlid); > + nvme_status->regctl_ds[i].rkey = cpu_to_be64(keys_info->keys[i]); > + nvme_status->regctl_ds[i].rcsts = keys_info->keys[i] == > + reservation->key ? 1 : 0; > + /* hostid is not supported currently */ > + memset(&nvme_status->regctl_ds[i].hostid, 0, 8); With g_malloc0, no need to memset this. > + } The endian conversions here should all be cpu to little-endian, no? I don't see anything in the spec defining these to be big-endian. > + > + rc = nvme_c2h(n, (uint8_t *)nvme_status, len, req); > + g_free(nvme_status); > + return rc; > +} > + > +static int nvme_read_reservation_ext_cb(NvmeReadReservation *reservation) > +{ > + int rc; > + NvmeReservationStatusExt *nvme_status_ext; > + NvmeRequest *req = reservation->req; > + NvmeCtrl *n = req->sq->ctrl; > + NvmeResvKeys *keys_info = reservation->keys_info; > + int len = sizeof(NvmeReservationStatusHeader) + > + sizeof(uint8_t) * 40 + sizeof(NvmeReservationStatusExt)? > + sizeof(NvmeRegisteredCtrlExt) * keys_info->num_keys; > + > + nvme_status_ext = g_malloc(len); This leaks contents of the heap in the uninitialized reserved fields. Use g_malloc0. > + nvme_status_ext->header.gen = cpu_to_be32(reservation->generation); > + nvme_status_ext->header.rtype = block_pr_type_to_nvme(reservation->type); > + nvme_status_ext->header.regctl = cpu_to_be16(keys_info->num_keys); > + > + for (int i = 0; i < keys_info->num_keys; i++) { > + uint16_t ctnlid = nvme_ctrl(req)->cntlid; > + nvme_status_ext->regctl_eds[i].cntlid = cpu_to_be16(ctnlid); > + nvme_status_ext->regctl_eds[i].rkey = cpu_to_be64(keys_info->keys[i]); > + nvme_status_ext->regctl_eds[i].rcsts = keys_info->keys[i] == > + reservation->key ? 1 : 0; > + /* hostid is not supported currently */ > + memset(&nvme_status_ext->regctl_eds[i].hostid, 0, 16); With g_malloc0, no need to memset this. > + } Same here, should be little-endian I believe. > + > + rc = nvme_c2h(n, (uint8_t *)nvme_status_ext, len, req); > + g_free(nvme_status_ext); > + return rc; > +} > + > +static void nvme_resv_read_reservation_cb(void *opaque, int ret) > +{ > + NvmeReadReservation *reservation = opaque; > + NvmeRequest *req = reservation->req; > + bool eds = le32_to_cpu(req->cmd.cdw11) & 0x1; > + NvmeResvKeys *keys_info = reservation->keys_info; > + > + if (ret < 0) { > + goto out; > + } > + > + if (eds) { > + ret = nvme_read_reservation_ext_cb(reservation); > + } else { > + ret = nvme_read_reservation_cb(reservation); > + } > + > +out: > + g_free(keys_info->keys); > + g_free(keys_info); > + g_free(reservation); > + nvme_misc_cb(req, ret); > +} > + > +static void nvme_resv_read_keys_cb(void *opaque, int ret) > +{ > + NvmeResvKeys *keys_info = opaque; > + NvmeRequest *req = keys_info->req; > + NvmeNamespace *ns = req->ns; > + NvmeReadReservation *reservation; > + > + if (ret < 0) { > + goto out; > + } > + > + keys_info->num_keys = MIN(ret, keys_info->num_keys); > + reservation = g_new0(NvmeReadReservation, 1); > + memset(reservation, 0, sizeof(*reservation)); g_new0 zeroes the memory. > + reservation->req = req; > + reservation->keys_info = keys_info; > + > + req->aiocb = blk_aio_pr_read_reservation(ns->blkconf.blk, > + &reservation->generation, &reservation->key, > + &reservation->type, nvme_resv_read_reservation_cb, > + reservation); > + return; > + > +out: > + g_free(keys_info->keys); > + g_free(keys_info); > + nvme_misc_cb(req, ret); > +} > + > + > +static uint16_t nvme_resv_report(NvmeCtrl *n, NvmeRequest *req) > +{ > + int num_keys; > + uint32_t cdw10 = le32_to_cpu(req->cmd.cdw10); > + uint32_t cdw11 = le32_to_cpu(req->cmd.cdw11); > + size_t buflen = (cdw10 + 1) * sizeof(uint32_t); > + bool eds = cdw11 & 0x1; > + NvmeNamespace *ns = req->ns; > + NvmeResvKeys *keys_info; > + > + if (!nvme_support_pr(ns)) { > + return NVME_INVALID_OPCODE; > + } > + > + if (eds) { > + if (buflen < sizeof(NvmeReservationStatusHeader) + > + sizeof(uint8_t) * 40) { > + return NVME_INVALID_FIELD; > + } No point in the sizeof(uint8_t) and would be more clear to just use sizeof(NvmeReservationStatusExt), no? In these cases I try to go for the strategy of just building the data structure in its entirety and then when we get to nvme_c2h above, only transfer the part of the data structure that the host requested. Maybe, you could get rid of the NVME_MAX_RESERVATION_KEYS band-aid (that, as far as I can tell, is non-spec compliant?), by DMA'ing to the host in "chunks" in the callback (i.e., read 128 keys, c2h, repeat), but I guess that would require the blk_aio_pr API to have an "offset" so you didn't have to read all keys in one go? What are the implications of putting a hard limit of NVME_MAX_RESERVATION_KEYS here? There is no way to convey that limit to the host, is there? > + > + num_keys = (buflen - sizeof(NvmeReservationStatusHeader) - > + sizeof(uint8_t) * 40) / > + sizeof(struct NvmeRegisteredCtrlExt); > + } else { > + if (buflen < sizeof(NvmeReservationStatusHeader)) { > + return NVME_INVALID_FIELD; > + } > + > + num_keys = (buflen - sizeof(NvmeReservationStatusHeader)) / > + sizeof(struct NvmeRegisteredCtrl); > + } > + > + /* > + * The maximum number of keys that can be transmitted is 128. > + */ > + num_keys = MIN(num_keys, NVME_MAX_RESERVATION_KEYS); > + keys_info = g_new0(NvmeResvKeys, 1); > + keys_info->generation = 0; > + /* num_keys is the maximum number of keys that can be transmitted */ > + keys_info->num_keys = num_keys; > + keys_info->keys = g_malloc(sizeof(uint64_t) * num_keys); > + keys_info->req = req; > + > + req->aiocb = blk_aio_pr_read_keys(ns->blkconf.blk, &keys_info->generation, > + keys_info->num_keys, keys_info->keys, > + nvme_resv_read_keys_cb, keys_info); > + > + return NVME_NO_COMPLETE; > +} > + > typedef struct NvmeCopyAIOCB { > BlockAIOCB common; > BlockAIOCB *aiocb; > @@ -4469,6 +4820,14 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req) > return nvme_dsm(n, req); > case NVME_CMD_VERIFY: > return nvme_verify(n, req); > + case NVME_CMD_RESV_REGISTER: > + return nvme_resv_register(n, req); > + case NVME_CMD_RESV_REPORT: > + return nvme_resv_report(n, req); > + case NVME_CMD_RESV_ACQUIRE: > + return nvme_resv_acquire(n, req); > + case NVME_CMD_RESV_RELEASE: > + return nvme_resv_release(n, req); > case NVME_CMD_COPY: > return nvme_copy(n, req); > case NVME_CMD_ZONE_MGMT_SEND: > diff --git a/hw/nvme/ns.c b/hw/nvme/ns.c > index a5c903d727..833bcbae08 100644 > --- a/hw/nvme/ns.c > +++ b/hw/nvme/ns.c > @@ -60,6 +60,12 @@ void nvme_ns_init_format(NvmeNamespace *ns) > > blk_pr_cap = blk_bs(ns->blkconf.blk)->bl.pr_cap; > id_ns->rescap = block_pr_cap_to_nvme(blk_pr_cap); > + if (id_ns->rescap != NVME_PR_CAP_ALL && > + id_ns->rescap != NVME_PR_CAP_RW) { > + > + /* Rescap either supports all or none of them */ > + id_ns->rescap = 0; > + } > } > > static int nvme_ns_init(NvmeNamespace *ns, Error **errp) > diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h > index 6d0e456348..b87e1fa3b0 100644 > --- a/hw/nvme/nvme.h > +++ b/hw/nvme/nvme.h > @@ -29,6 +29,7 @@ > #define NVME_EUI64_DEFAULT ((uint64_t)0x5254000000000000) > #define NVME_FDP_MAX_EVENTS 63 > #define NVME_FDP_MAXPIDS 128 > +#define NVME_MAX_RESERVATION_KEYS 128 > > /* > * The controller only supports Submission and Completion Queue Entry Sizes of > @@ -470,6 +471,10 @@ static inline const char *nvme_io_opc_str(uint8_t opc) > case NVME_CMD_ZONE_MGMT_SEND: return "NVME_ZONED_CMD_MGMT_SEND"; > case NVME_CMD_ZONE_MGMT_RECV: return "NVME_ZONED_CMD_MGMT_RECV"; > case NVME_CMD_ZONE_APPEND: return "NVME_ZONED_CMD_ZONE_APPEND"; > + case NVME_CMD_RESV_REGISTER: return "NVME_CMD_RESV_REGISTER"; > + case NVME_CMD_RESV_REPORT: return "NVME_CMD_RESV_REPORT"; > + case NVME_CMD_RESV_ACQUIRE: return "NVME_CMD_RESV_ACQUIRE"; > + case NVME_CMD_RESV_RELEASE: return "NVME_CMD_RESV_RELEASE"; > default: return "NVME_NVM_CMD_UNKNOWN"; > } > } > @@ -558,6 +563,11 @@ static inline uint8_t block_pr_cap_to_nvme(uint8_t block_pr_cap) > return res; > } > > +static inline bool nvme_support_pr(NvmeNamespace *ns) > +{ > + return (ns->id_ns.rescap & NVME_PR_CAP_RW) == NVME_PR_CAP_RW; > +} > + > typedef struct NvmeSQueue { > struct NvmeCtrl *ctrl; > uint16_t sqid; > diff --git a/include/block/nvme.h b/include/block/nvme.h > index 9b9eaeb3a7..2eec339028 100644 > --- a/include/block/nvme.h > +++ b/include/block/nvme.h > @@ -692,6 +692,13 @@ typedef enum NVMEPrCap { > NVME_PR_CAP_WR_EX_AR = 1 << 5, > /* Exclusive Access All Registrants reservation type */ > NVME_PR_CAP_EX_AC_AR = 1 << 6, > + /* Write and Read reservation type */ > + NVME_PR_CAP_RW = (NVME_PR_CAP_WR_EX | > + NVME_PR_CAP_EX_AC | > + NVME_PR_CAP_WR_EX_RO | > + NVME_PR_CAP_EX_AC_RO | > + NVME_PR_CAP_WR_EX_AR | > + NVME_PR_CAP_EX_AC_AR), > > NVME_PR_CAP_ALL = (NVME_PR_CAP_PTPL | > NVME_PR_CAP_WR_EX | > @@ -702,6 +709,43 @@ typedef enum NVMEPrCap { > NVME_PR_CAP_EX_AC_AR), > } NvmePrCap; > > +typedef struct QEMU_PACKED NvmeRegisteredCtrl { > + uint16_t cntlid; > + uint8_t rcsts; > + uint8_t rsvd3[5]; > + uint8_t hostid[8]; > + uint64_t rkey; > +} NvmeRegisteredCtrl; > + > +typedef struct QEMU_PACKED NvmeRegisteredCtrlExt { > + uint16_t cntlid; > + uint8_t rcsts; > + uint8_t rsvd3[5]; > + uint64_t rkey; > + uint8_t hostid[16]; > + uint8_t rsvd32[32]; > +} NvmeRegisteredCtrlExt; > + > +typedef struct QEMU_PACKED NvmeReservationStatusHeader { > + uint32_t gen; > + uint8_t rtype; > + uint16_t regctl; Not a biggie, but this field is named REGSTRNT now. > + uint16_t resv5; > + uint8_t ptpls; > + uint8_t resv10[14]; > +} NvmeReservationStatusHeader; > + > +typedef struct QEMU_PACKED NvmeReservationStatus { > + struct NvmeReservationStatusHeader header; > + struct NvmeRegisteredCtrl regctl_ds[]; We generally do not use 'struct' if we do not have to, and here it is not required I believe. > +} NvmeReservationStatus; > + > +typedef struct QEMU_PACKED NvmeReservationStatusExt { > + struct NvmeReservationStatusHeader header; > + uint8_t rsvd24[40]; Align or drop the spaces here. > + struct NvmeRegisteredCtrlExt regctl_eds[]; Same here. > +} NvmeReservationStatusExt; > + Please add QEMU_BUILD_BUG_ON(sizeof(...) != ...) checks in _nvme_check_size for these structs. They all add up as far as I can tell, but it's a good safe-guard generally. > typedef struct QEMU_PACKED NvmeDeleteQ { > uint8_t opcode; > uint8_t flags; > -- > 2.20.1 > >
diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index ad212de723..ffb876a99f 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -294,6 +294,10 @@ static const uint32_t nvme_cse_iocs_nvm[256] = { [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP, [NVME_CMD_IO_MGMT_RECV] = NVME_CMD_EFF_CSUPP, [NVME_CMD_IO_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, + [NVME_CMD_RESV_REGISTER] = NVME_CMD_EFF_CSUPP, + [NVME_CMD_RESV_REPORT] = NVME_CMD_EFF_CSUPP, + [NVME_CMD_RESV_ACQUIRE] = NVME_CMD_EFF_CSUPP, + [NVME_CMD_RESV_RELEASE] = NVME_CMD_EFF_CSUPP, }; static const uint32_t nvme_cse_iocs_zoned[256] = { @@ -308,6 +312,10 @@ static const uint32_t nvme_cse_iocs_zoned[256] = { [NVME_CMD_ZONE_APPEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, [NVME_CMD_ZONE_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, [NVME_CMD_ZONE_MGMT_RECV] = NVME_CMD_EFF_CSUPP, + [NVME_CMD_RESV_REGISTER] = NVME_CMD_EFF_CSUPP, + [NVME_CMD_RESV_REPORT] = NVME_CMD_EFF_CSUPP, + [NVME_CMD_RESV_ACQUIRE] = NVME_CMD_EFF_CSUPP, + [NVME_CMD_RESV_RELEASE] = NVME_CMD_EFF_CSUPP, }; static void nvme_process_sq(void *opaque); @@ -1747,6 +1755,13 @@ static void nvme_aio_err(NvmeRequest *req, int ret) case NVME_CMD_READ: status = NVME_UNRECOVERED_READ; break; + case NVME_CMD_RESV_REPORT: + if (ret == -ENOTSUP) { + status = NVME_INVALID_OPCODE; + } else { + status = NVME_UNRECOVERED_READ; + } + break; case NVME_CMD_FLUSH: case NVME_CMD_WRITE: case NVME_CMD_WRITE_ZEROES: @@ -1754,6 +1769,15 @@ static void nvme_aio_err(NvmeRequest *req, int ret) case NVME_CMD_COPY: status = NVME_WRITE_FAULT; break; + case NVME_CMD_RESV_REGISTER: + case NVME_CMD_RESV_ACQUIRE: + case NVME_CMD_RESV_RELEASE: + if (ret == -ENOTSUP) { + status = NVME_INVALID_OPCODE; + } else { + status = NVME_WRITE_FAULT; + } + break; default: status = NVME_INTERNAL_DEV_ERROR; break; @@ -2692,6 +2716,333 @@ static uint16_t nvme_verify(NvmeCtrl *n, NvmeRequest *req) return NVME_NO_COMPLETE; } +typedef struct NvmeKeyInfo { + uint64_t cr_key; + uint64_t nr_key; +} NvmeKeyInfo; + +static uint16_t nvme_resv_register(NvmeCtrl *n, NvmeRequest *req) +{ + int ret; + NvmeKeyInfo key_info; + NvmeNamespace *ns = req->ns; + uint32_t cdw10 = le32_to_cpu(req->cmd.cdw10); + bool ignore_key = cdw10 >> 3 & 0x1; + uint8_t action = cdw10 & 0x7; + uint8_t ptpl = cdw10 >> 30 & 0x3; + bool aptpl; + + if (!nvme_support_pr(ns)) { + return NVME_INVALID_OPCODE; + } + + switch (ptpl) { + case NVME_RESV_PTPL_NO_CHANGE: + aptpl = (ns->id_ns.rescap & NVME_PR_CAP_PTPL) ? true : false; + break; + case NVME_RESV_PTPL_DISABLE: + aptpl = false; + break; + case NVME_RESV_PTPL_ENABLE: + aptpl = true; + break; + default: + return NVME_INVALID_FIELD; + } + + ret = nvme_h2c(n, (uint8_t *)&key_info, sizeof(NvmeKeyInfo), req); + if (ret) { + return ret; + } + + key_info.cr_key = le64_to_cpu(key_info.cr_key); + key_info.nr_key = le64_to_cpu(key_info.nr_key); + + switch (action) { + case NVME_RESV_REGISTER_ACTION_REGISTER: + req->aiocb = blk_aio_pr_register(ns->blkconf.blk, 0, + key_info.nr_key, 0, aptpl, + ignore_key, nvme_misc_cb, + req); + break; + case NVME_RESV_REGISTER_ACTION_UNREGISTER: + req->aiocb = blk_aio_pr_register(ns->blkconf.blk, key_info.cr_key, 0, + 0, aptpl, ignore_key, + nvme_misc_cb, req); + break; + case NVME_RESV_REGISTER_ACTION_REPLACE: + req->aiocb = blk_aio_pr_register(ns->blkconf.blk, key_info.cr_key, + key_info.nr_key, 0, aptpl, ignore_key, + nvme_misc_cb, req); + break; + default: + return NVME_INVALID_FIELD; + } + + return NVME_NO_COMPLETE; +} + +static uint16_t nvme_resv_release(NvmeCtrl *n, NvmeRequest *req) +{ + int ret; + uint64_t cr_key; + NvmeNamespace *ns = req->ns; + uint32_t cdw10 = le32_to_cpu(req->cmd.cdw10); + uint8_t action = cdw10 & 0x7; + NvmeResvType type = cdw10 >> 8 & 0xff; + + if (!nvme_support_pr(ns)) { + return NVME_INVALID_OPCODE; + } + + ret = nvme_h2c(n, (uint8_t *)&cr_key, sizeof(cr_key), req); + if (ret) { + return ret; + } + + cr_key = le64_to_cpu(cr_key); + + switch (action) { + case NVME_RESV_RELEASE_ACTION_RELEASE: + req->aiocb = blk_aio_pr_release(ns->blkconf.blk, cr_key, + nvme_pr_type_to_block(type), + nvme_misc_cb, req); + break; + case NVME_RESV_RELEASE_ACTION_CLEAR: + req->aiocb = blk_aio_pr_clear(ns->blkconf.blk, cr_key, + nvme_misc_cb, req); + break; + default: + return NVME_INVALID_FIELD; + } + + return NVME_NO_COMPLETE; +} + +static uint16_t nvme_resv_acquire(NvmeCtrl *n, NvmeRequest *req) +{ + int ret; + NvmeKeyInfo key_info; + NvmeNamespace *ns = req->ns; + uint32_t cdw10 = le32_to_cpu(req->cmd.cdw10); + uint8_t action = cdw10 & 0x7; + NvmeResvType type = cdw10 >> 8 & 0xff; + + if (!nvme_support_pr(ns)) { + return NVME_INVALID_OPCODE; + } + + ret = nvme_h2c(n, (uint8_t *)&key_info, sizeof(NvmeKeyInfo), req); + if (ret) { + return ret; + } + + key_info.cr_key = le64_to_cpu(key_info.cr_key); + key_info.nr_key = le64_to_cpu(key_info.nr_key); + + switch (action) { + case NVME_RESV_ACQUIRE_ACTION_ACQUIRE: + req->aiocb = blk_aio_pr_reserve(ns->blkconf.blk, key_info.cr_key, + nvme_pr_type_to_block(type), + nvme_misc_cb, req); + break; + case NVME_RESV_ACQUIRE_ACTION_PREEMPT: + req->aiocb = blk_aio_pr_preempt(ns->blkconf.blk, + key_info.cr_key, key_info.nr_key, + nvme_pr_type_to_block(type), + false, nvme_misc_cb, req); + break; + case NVME_RESV_ACQUIRE_ACTION_PREEMPT_AND_ABORT: + req->aiocb = blk_aio_pr_preempt(ns->blkconf.blk, key_info.cr_key, + key_info.nr_key, type, true, + nvme_misc_cb, req); + break; + default: + return NVME_INVALID_FIELD; + } + + return NVME_NO_COMPLETE; +} + +typedef struct NvmeResvKeys { + uint32_t generation; + uint32_t num_keys; + uint64_t *keys; + NvmeRequest *req; +} NvmeResvKeys; + +typedef struct NvmeReadReservation { + uint32_t generation; + uint64_t key; + BlockPrType type; + NvmeRequest *req; + NvmeResvKeys *keys_info; +} NvmeReadReservation; + +static int nvme_read_reservation_cb(NvmeReadReservation *reservation) +{ + int rc; + NvmeReservationStatus *nvme_status; + NvmeRequest *req = reservation->req; + NvmeCtrl *n = req->sq->ctrl; + NvmeResvKeys *keys_info = reservation->keys_info; + int len = sizeof(NvmeReservationStatusHeader) + + sizeof(NvmeRegisteredCtrl) * keys_info->num_keys; + + nvme_status = g_malloc(len); + nvme_status->header.gen = cpu_to_be32(reservation->generation); + nvme_status->header.rtype = block_pr_type_to_nvme(reservation->type); + nvme_status->header.regctl = cpu_to_be16(keys_info->num_keys); + for (int i = 0; i < keys_info->num_keys; i++) { + uint16_t ctnlid = nvme_ctrl(req)->cntlid; + nvme_status->regctl_ds[i].cntlid = cpu_to_be16(ctnlid); + nvme_status->regctl_ds[i].rkey = cpu_to_be64(keys_info->keys[i]); + nvme_status->regctl_ds[i].rcsts = keys_info->keys[i] == + reservation->key ? 1 : 0; + /* hostid is not supported currently */ + memset(&nvme_status->regctl_ds[i].hostid, 0, 8); + } + + rc = nvme_c2h(n, (uint8_t *)nvme_status, len, req); + g_free(nvme_status); + return rc; +} + +static int nvme_read_reservation_ext_cb(NvmeReadReservation *reservation) +{ + int rc; + NvmeReservationStatusExt *nvme_status_ext; + NvmeRequest *req = reservation->req; + NvmeCtrl *n = req->sq->ctrl; + NvmeResvKeys *keys_info = reservation->keys_info; + int len = sizeof(NvmeReservationStatusHeader) + + sizeof(uint8_t) * 40 + + sizeof(NvmeRegisteredCtrlExt) * keys_info->num_keys; + + nvme_status_ext = g_malloc(len); + nvme_status_ext->header.gen = cpu_to_be32(reservation->generation); + nvme_status_ext->header.rtype = block_pr_type_to_nvme(reservation->type); + nvme_status_ext->header.regctl = cpu_to_be16(keys_info->num_keys); + + for (int i = 0; i < keys_info->num_keys; i++) { + uint16_t ctnlid = nvme_ctrl(req)->cntlid; + nvme_status_ext->regctl_eds[i].cntlid = cpu_to_be16(ctnlid); + nvme_status_ext->regctl_eds[i].rkey = cpu_to_be64(keys_info->keys[i]); + nvme_status_ext->regctl_eds[i].rcsts = keys_info->keys[i] == + reservation->key ? 1 : 0; + /* hostid is not supported currently */ + memset(&nvme_status_ext->regctl_eds[i].hostid, 0, 16); + } + + rc = nvme_c2h(n, (uint8_t *)nvme_status_ext, len, req); + g_free(nvme_status_ext); + return rc; +} + +static void nvme_resv_read_reservation_cb(void *opaque, int ret) +{ + NvmeReadReservation *reservation = opaque; + NvmeRequest *req = reservation->req; + bool eds = le32_to_cpu(req->cmd.cdw11) & 0x1; + NvmeResvKeys *keys_info = reservation->keys_info; + + if (ret < 0) { + goto out; + } + + if (eds) { + ret = nvme_read_reservation_ext_cb(reservation); + } else { + ret = nvme_read_reservation_cb(reservation); + } + +out: + g_free(keys_info->keys); + g_free(keys_info); + g_free(reservation); + nvme_misc_cb(req, ret); +} + +static void nvme_resv_read_keys_cb(void *opaque, int ret) +{ + NvmeResvKeys *keys_info = opaque; + NvmeRequest *req = keys_info->req; + NvmeNamespace *ns = req->ns; + NvmeReadReservation *reservation; + + if (ret < 0) { + goto out; + } + + keys_info->num_keys = MIN(ret, keys_info->num_keys); + reservation = g_new0(NvmeReadReservation, 1); + memset(reservation, 0, sizeof(*reservation)); + reservation->req = req; + reservation->keys_info = keys_info; + + req->aiocb = blk_aio_pr_read_reservation(ns->blkconf.blk, + &reservation->generation, &reservation->key, + &reservation->type, nvme_resv_read_reservation_cb, + reservation); + return; + +out: + g_free(keys_info->keys); + g_free(keys_info); + nvme_misc_cb(req, ret); +} + + +static uint16_t nvme_resv_report(NvmeCtrl *n, NvmeRequest *req) +{ + int num_keys; + uint32_t cdw10 = le32_to_cpu(req->cmd.cdw10); + uint32_t cdw11 = le32_to_cpu(req->cmd.cdw11); + size_t buflen = (cdw10 + 1) * sizeof(uint32_t); + bool eds = cdw11 & 0x1; + NvmeNamespace *ns = req->ns; + NvmeResvKeys *keys_info; + + if (!nvme_support_pr(ns)) { + return NVME_INVALID_OPCODE; + } + + if (eds) { + if (buflen < sizeof(NvmeReservationStatusHeader) + + sizeof(uint8_t) * 40) { + return NVME_INVALID_FIELD; + } + + num_keys = (buflen - sizeof(NvmeReservationStatusHeader) - + sizeof(uint8_t) * 40) / + sizeof(struct NvmeRegisteredCtrlExt); + } else { + if (buflen < sizeof(NvmeReservationStatusHeader)) { + return NVME_INVALID_FIELD; + } + + num_keys = (buflen - sizeof(NvmeReservationStatusHeader)) / + sizeof(struct NvmeRegisteredCtrl); + } + + /* + * The maximum number of keys that can be transmitted is 128. + */ + num_keys = MIN(num_keys, NVME_MAX_RESERVATION_KEYS); + keys_info = g_new0(NvmeResvKeys, 1); + keys_info->generation = 0; + /* num_keys is the maximum number of keys that can be transmitted */ + keys_info->num_keys = num_keys; + keys_info->keys = g_malloc(sizeof(uint64_t) * num_keys); + keys_info->req = req; + + req->aiocb = blk_aio_pr_read_keys(ns->blkconf.blk, &keys_info->generation, + keys_info->num_keys, keys_info->keys, + nvme_resv_read_keys_cb, keys_info); + + return NVME_NO_COMPLETE; +} + typedef struct NvmeCopyAIOCB { BlockAIOCB common; BlockAIOCB *aiocb; @@ -4469,6 +4820,14 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req) return nvme_dsm(n, req); case NVME_CMD_VERIFY: return nvme_verify(n, req); + case NVME_CMD_RESV_REGISTER: + return nvme_resv_register(n, req); + case NVME_CMD_RESV_REPORT: + return nvme_resv_report(n, req); + case NVME_CMD_RESV_ACQUIRE: + return nvme_resv_acquire(n, req); + case NVME_CMD_RESV_RELEASE: + return nvme_resv_release(n, req); case NVME_CMD_COPY: return nvme_copy(n, req); case NVME_CMD_ZONE_MGMT_SEND: diff --git a/hw/nvme/ns.c b/hw/nvme/ns.c index a5c903d727..833bcbae08 100644 --- a/hw/nvme/ns.c +++ b/hw/nvme/ns.c @@ -60,6 +60,12 @@ void nvme_ns_init_format(NvmeNamespace *ns) blk_pr_cap = blk_bs(ns->blkconf.blk)->bl.pr_cap; id_ns->rescap = block_pr_cap_to_nvme(blk_pr_cap); + if (id_ns->rescap != NVME_PR_CAP_ALL && + id_ns->rescap != NVME_PR_CAP_RW) { + + /* Rescap either supports all or none of them */ + id_ns->rescap = 0; + } } static int nvme_ns_init(NvmeNamespace *ns, Error **errp) diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h index 6d0e456348..b87e1fa3b0 100644 --- a/hw/nvme/nvme.h +++ b/hw/nvme/nvme.h @@ -29,6 +29,7 @@ #define NVME_EUI64_DEFAULT ((uint64_t)0x5254000000000000) #define NVME_FDP_MAX_EVENTS 63 #define NVME_FDP_MAXPIDS 128 +#define NVME_MAX_RESERVATION_KEYS 128 /* * The controller only supports Submission and Completion Queue Entry Sizes of @@ -470,6 +471,10 @@ static inline const char *nvme_io_opc_str(uint8_t opc) case NVME_CMD_ZONE_MGMT_SEND: return "NVME_ZONED_CMD_MGMT_SEND"; case NVME_CMD_ZONE_MGMT_RECV: return "NVME_ZONED_CMD_MGMT_RECV"; case NVME_CMD_ZONE_APPEND: return "NVME_ZONED_CMD_ZONE_APPEND"; + case NVME_CMD_RESV_REGISTER: return "NVME_CMD_RESV_REGISTER"; + case NVME_CMD_RESV_REPORT: return "NVME_CMD_RESV_REPORT"; + case NVME_CMD_RESV_ACQUIRE: return "NVME_CMD_RESV_ACQUIRE"; + case NVME_CMD_RESV_RELEASE: return "NVME_CMD_RESV_RELEASE"; default: return "NVME_NVM_CMD_UNKNOWN"; } } @@ -558,6 +563,11 @@ static inline uint8_t block_pr_cap_to_nvme(uint8_t block_pr_cap) return res; } +static inline bool nvme_support_pr(NvmeNamespace *ns) +{ + return (ns->id_ns.rescap & NVME_PR_CAP_RW) == NVME_PR_CAP_RW; +} + typedef struct NvmeSQueue { struct NvmeCtrl *ctrl; uint16_t sqid; diff --git a/include/block/nvme.h b/include/block/nvme.h index 9b9eaeb3a7..2eec339028 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -692,6 +692,13 @@ typedef enum NVMEPrCap { NVME_PR_CAP_WR_EX_AR = 1 << 5, /* Exclusive Access All Registrants reservation type */ NVME_PR_CAP_EX_AC_AR = 1 << 6, + /* Write and Read reservation type */ + NVME_PR_CAP_RW = (NVME_PR_CAP_WR_EX | + NVME_PR_CAP_EX_AC | + NVME_PR_CAP_WR_EX_RO | + NVME_PR_CAP_EX_AC_RO | + NVME_PR_CAP_WR_EX_AR | + NVME_PR_CAP_EX_AC_AR), NVME_PR_CAP_ALL = (NVME_PR_CAP_PTPL | NVME_PR_CAP_WR_EX | @@ -702,6 +709,43 @@ typedef enum NVMEPrCap { NVME_PR_CAP_EX_AC_AR), } NvmePrCap; +typedef struct QEMU_PACKED NvmeRegisteredCtrl { + uint16_t cntlid; + uint8_t rcsts; + uint8_t rsvd3[5]; + uint8_t hostid[8]; + uint64_t rkey; +} NvmeRegisteredCtrl; + +typedef struct QEMU_PACKED NvmeRegisteredCtrlExt { + uint16_t cntlid; + uint8_t rcsts; + uint8_t rsvd3[5]; + uint64_t rkey; + uint8_t hostid[16]; + uint8_t rsvd32[32]; +} NvmeRegisteredCtrlExt; + +typedef struct QEMU_PACKED NvmeReservationStatusHeader { + uint32_t gen; + uint8_t rtype; + uint16_t regctl; + uint16_t resv5; + uint8_t ptpls; + uint8_t resv10[14]; +} NvmeReservationStatusHeader; + +typedef struct QEMU_PACKED NvmeReservationStatus { + struct NvmeReservationStatusHeader header; + struct NvmeRegisteredCtrl regctl_ds[]; +} NvmeReservationStatus; + +typedef struct QEMU_PACKED NvmeReservationStatusExt { + struct NvmeReservationStatusHeader header; + uint8_t rsvd24[40]; + struct NvmeRegisteredCtrlExt regctl_eds[]; +} NvmeReservationStatusExt; + typedef struct QEMU_PACKED NvmeDeleteQ { uint8_t opcode; uint8_t flags;