From patchwork Thu May 2 11:47:52 2019 X-Patchwork-Submitter: Maxim Levitsky X-Patchwork-Id: 10926603 From: Maxim Levitsky To: linux-nvme@lists.infradead.org Cc: Maxim Levitsky , linux-kernel@vger.kernel.org, kvm@vger.kernel.org, Jens Axboe , Alex Williamson , Keith Busch , Christoph Hellwig , Sagi Grimberg , Kirti Wankhede , "David S . Miller" , Mauro Carvalho Chehab , Greg Kroah-Hartman , Wolfram Sang , Nicolas Ferre , "Paul E . 
McKenney " , Paolo Bonzini , Liang Cunming , Liu Changpeng , Fam Zheng , Amnon Ilan , John Ferlan Subject: [PATCH v2 01/10] vfio/mdev: add notifier for map events Date: Thu, 2 May 2019 14:47:52 +0300 Message-Id: <20190502114801.23116-2-mlevitsk@redhat.com> In-Reply-To: <20190502114801.23116-1-mlevitsk@redhat.com> References: <20190502114801.23116-1-mlevitsk@redhat.com> X-Scanned-By: MIMEDefang 2.79 on 10.5.11.13 X-Greylist: Sender IP whitelisted, not delayed by milter-greylist-4.5.16 (mx1.redhat.com [10.5.110.38]); Thu, 02 May 2019 11:48:34 +0000 (UTC) Sender: kvm-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: kvm@vger.kernel.org X-Virus-Scanned: ClamAV using ClamSMTP Allow an VFIO mdev device to listen to map events This will allow a mdev driver to dma map memory as soon as it gets added to the domain --- Signed-off-by: Maxim Levitsky --- drivers/vfio/vfio_iommu_type1.c | 97 +++++++++++++++++++++++++++++---- include/linux/vfio.h | 4 ++ 2 files changed, 89 insertions(+), 12 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index d0f731c9920a..393b56d78166 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -63,17 +63,22 @@ module_param_named(dma_entry_limit, dma_entry_limit, uint, 0644); MODULE_PARM_DESC(dma_entry_limit, "Maximum number of user DMA mappings per container (65535)."); +/* a container, usually one per VM*/ struct vfio_iommu { struct list_head domain_list; struct vfio_domain *external_domain; /* domain for external user */ struct mutex lock; struct rb_root dma_list; - struct blocking_notifier_head notifier; + struct blocking_notifier_head map_notifiers; + struct blocking_notifier_head unmap_notifiers; unsigned int dma_avail; bool v2; bool nesting; }; +/* An IOMMU domain - also usually one per VM, unless devices assigned to VM + * are connected via different IOMMUs + */ struct vfio_domain { struct iommu_domain *domain; struct list_head next; @@ -563,8 +568,7 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data, mutex_lock(&iommu->lock); - /* Fail if notifier list is empty */ - if ((!iommu->external_domain) || (!iommu->notifier.head)) { + if (!iommu->external_domain) { ret = -EINVAL; goto pin_done; } @@ -967,7 +971,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, * invalidation. 
*/ mutex_unlock(&iommu->lock); - blocking_notifier_call_chain(&iommu->notifier, + blocking_notifier_call_chain(&iommu->unmap_notifiers, VFIO_IOMMU_NOTIFY_DMA_UNMAP, &nb_unmap); goto again; @@ -1144,6 +1148,22 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, else ret = vfio_pin_map_dma(iommu, dma, size); + mutex_unlock(&iommu->lock); + + /* + * Notify anyone (mdev vendor drivers) that new mapping has being + * created - vendor drivers can in response pin/dma map the memory + */ + ret = blocking_notifier_call_chain(&iommu->map_notifiers, + VFIO_IOMMU_NOTIFY_DMA_MAP, + map); + + ret = notifier_to_errno(ret); + if (ret) + vfio_remove_dma(iommu, dma); + + return ret; + out_unlock: mutex_unlock(&iommu->lock); return ret; @@ -1508,7 +1528,8 @@ static void vfio_sanity_check_pfn_list(struct vfio_iommu *iommu) break; } /* mdev vendor driver must unregister notifier */ - WARN_ON(iommu->notifier.head); + WARN_ON(iommu->map_notifiers.head); + WARN_ON(iommu->unmap_notifiers.head); } static void vfio_iommu_type1_detach_group(void *iommu_data, @@ -1598,7 +1619,8 @@ static void *vfio_iommu_type1_open(unsigned long arg) iommu->dma_list = RB_ROOT; iommu->dma_avail = dma_entry_limit; mutex_init(&iommu->lock); - BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier); + BLOCKING_INIT_NOTIFIER_HEAD(&iommu->unmap_notifiers); + BLOCKING_INIT_NOTIFIER_HEAD(&iommu->map_notifiers); return iommu; } @@ -1738,23 +1760,74 @@ static int vfio_iommu_type1_register_notifier(void *iommu_data, struct notifier_block *nb) { struct vfio_iommu *iommu = iommu_data; + struct rb_node *node; + int ret = 0; + + if (*events == VFIO_IOMMU_NOTIFY_DMA_MAP) { + + /* now register the notifier */ + ret = blocking_notifier_chain_register(&iommu->map_notifiers, + nb); - /* clear known events */ - *events &= ~VFIO_IOMMU_NOTIFY_DMA_UNMAP; + /* replay the mapping */ + node = rb_first(&iommu->dma_list); + while (node) { + struct vfio_dma *dma = rb_entry(node, struct vfio_dma, + node); - /* refuse to register if still events remaining */ - if (*events) + struct vfio_iommu_type1_dma_map map; + + map.argsz = sizeof(struct vfio_iommu_type1_dma_map); + map.flags = 0; + + if (dma->prot & IOMMU_READ) + map.flags |= VFIO_DMA_MAP_FLAG_READ; + if (dma->prot & IOMMU_WRITE) + map.flags |= VFIO_DMA_MAP_FLAG_WRITE; + + map.iova = dma->iova; + map.vaddr = dma->vaddr; + map.size = dma->size; + + node = rb_next(node); + + /* Call only the first notifier, the one that + * we just registered + */ + ret = __blocking_notifier_call_chain( + &iommu->map_notifiers, + VFIO_IOMMU_NOTIFY_DMA_MAP, + &map, 1, NULL); + + ret = notifier_to_errno(ret); + if (ret) { + blocking_notifier_chain_unregister( + &iommu->map_notifiers, nb); + return ret; + } + } + + } else if (*events == VFIO_IOMMU_NOTIFY_DMA_UNMAP) { + ret = blocking_notifier_chain_register( + &iommu->unmap_notifiers, nb); + } else { return -EINVAL; + } + return ret; - return blocking_notifier_chain_register(&iommu->notifier, nb); } static int vfio_iommu_type1_unregister_notifier(void *iommu_data, struct notifier_block *nb) { struct vfio_iommu *iommu = iommu_data; + int ret; - return blocking_notifier_chain_unregister(&iommu->notifier, nb); + ret = blocking_notifier_chain_unregister(&iommu->map_notifiers, nb); + if (ret) + ret = blocking_notifier_chain_unregister( + &iommu->unmap_notifiers, nb); + return ret; } static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = { diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 66741ab087c1..957f09263bfe 100644 --- a/include/linux/vfio.h +++ 
b/include/linux/vfio.h @@ -118,10 +118,14 @@ enum vfio_notify_type { /* events for VFIO_IOMMU_NOTIFY */ #define VFIO_IOMMU_NOTIFY_DMA_UNMAP BIT(0) +#define VFIO_IOMMU_NOTIFY_DMA_MAP BIT(1) /* events for VFIO_GROUP_NOTIFY */ #define VFIO_GROUP_NOTIFY_SET_KVM BIT(0) +/* Note: currently you can only register a notifier for a single event + * at the time + */ extern int vfio_register_notifier(struct device *dev, enum vfio_notify_type type, unsigned long *required_events, From patchwork Thu May 2 11:47:53 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Maxim Levitsky X-Patchwork-Id: 10926613 Return-Path: Received: from mail.wl.linuxfoundation.org (pdx-wl-mail.web.codeaurora.org [172.30.200.125]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id 7720C14C0 for ; Thu, 2 May 2019 11:49:16 +0000 (UTC) Received: from mail.wl.linuxfoundation.org (localhost [127.0.0.1]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id 6596429015 for ; Thu, 2 May 2019 11:49:16 +0000 (UTC) Received: by mail.wl.linuxfoundation.org (Postfix, from userid 486) id 592C12901B; Thu, 2 May 2019 11:49:16 +0000 (UTC) X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on pdx-wl-mail.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-7.9 required=2.0 tests=BAYES_00,MAILING_LIST_MULTI, RCVD_IN_DNSWL_HI autolearn=ham version=3.3.1 Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id 04DB929015 for ; Thu, 2 May 2019 11:49:16 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1726448AbfEBLsk (ORCPT ); Thu, 2 May 2019 07:48:40 -0400 Received: from mx1.redhat.com ([209.132.183.28]:47968 "EHLO mx1.redhat.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1726189AbfEBLsj (ORCPT ); Thu, 2 May 2019 07:48:39 -0400 Received: from smtp.corp.redhat.com (int-mx03.intmail.prod.int.phx2.redhat.com [10.5.11.13]) (using TLSv1.2 with cipher AECDH-AES256-SHA (256/256 bits)) (No client certificate requested) by mx1.redhat.com (Postfix) with ESMTPS id 39AFDF74AB; Thu, 2 May 2019 11:48:39 +0000 (UTC) Received: from maximlenovopc.usersys.redhat.com (unknown [10.35.206.58]) by smtp.corp.redhat.com (Postfix) with ESMTP id 7999317DDF; Thu, 2 May 2019 11:48:34 +0000 (UTC) From: Maxim Levitsky To: linux-nvme@lists.infradead.org Cc: Maxim Levitsky , linux-kernel@vger.kernel.org, kvm@vger.kernel.org, Jens Axboe , Alex Williamson , Keith Busch , Christoph Hellwig , Sagi Grimberg , Kirti Wankhede , "David S . Miller" , Mauro Carvalho Chehab , Greg Kroah-Hartman , Wolfram Sang , Nicolas Ferre , "Paul E . 
McKenney " , Paolo Bonzini , Liang Cunming , Liu Changpeng , Fam Zheng , Amnon Ilan , John Ferlan Subject: [PATCH v2 02/10] vfio/mdev: add .request callback Date: Thu, 2 May 2019 14:47:53 +0300 Message-Id: <20190502114801.23116-3-mlevitsk@redhat.com> In-Reply-To: <20190502114801.23116-1-mlevitsk@redhat.com> References: <20190502114801.23116-1-mlevitsk@redhat.com> X-Scanned-By: MIMEDefang 2.79 on 10.5.11.13 X-Greylist: Sender IP whitelisted, not delayed by milter-greylist-4.5.16 (mx1.redhat.com [10.5.110.38]); Thu, 02 May 2019 11:48:39 +0000 (UTC) Sender: kvm-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: kvm@vger.kernel.org X-Virus-Scanned: ClamAV using ClamSMTP This will allow the hotplug to be enabled for mediated devices Signed-off-by: Maxim Levitsky --- drivers/vfio/mdev/vfio_mdev.c | 11 +++++++++++ include/linux/mdev.h | 4 ++++ 2 files changed, 15 insertions(+) diff --git a/drivers/vfio/mdev/vfio_mdev.c b/drivers/vfio/mdev/vfio_mdev.c index d230620fe02d..17aa76de0764 100644 --- a/drivers/vfio/mdev/vfio_mdev.c +++ b/drivers/vfio/mdev/vfio_mdev.c @@ -101,6 +101,16 @@ static int vfio_mdev_mmap(void *device_data, struct vm_area_struct *vma) return parent->ops->mmap(mdev, vma); } +static void vfio_mdev_request(void *device_data, unsigned int count) +{ + struct mdev_device *mdev = device_data; + struct mdev_parent *parent = mdev->parent; + + if (unlikely(!parent->ops->request)) + return; + parent->ops->request(mdev, count); +} + static const struct vfio_device_ops vfio_mdev_dev_ops = { .name = "vfio-mdev", .open = vfio_mdev_open, @@ -109,6 +119,7 @@ static const struct vfio_device_ops vfio_mdev_dev_ops = { .read = vfio_mdev_read, .write = vfio_mdev_write, .mmap = vfio_mdev_mmap, + .request = vfio_mdev_request, }; static int vfio_mdev_probe(struct device *dev) diff --git a/include/linux/mdev.h b/include/linux/mdev.h index d7aee90e5da5..6f315cb9575c 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -13,6 +13,9 @@ #ifndef MDEV_H #define MDEV_H +#include +#include + struct mdev_device; /** @@ -81,6 +84,7 @@ struct mdev_parent_ops { long (*ioctl)(struct mdev_device *mdev, unsigned int cmd, unsigned long arg); int (*mmap)(struct mdev_device *mdev, struct vm_area_struct *vma); + void (*request)(struct mdev_device *mdev, unsigned int count); }; /* interface for exporting mdev supported type attributes */ From patchwork Thu May 2 11:47:54 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Maxim Levitsky X-Patchwork-Id: 10926609 Return-Path: Received: from mail.wl.linuxfoundation.org (pdx-wl-mail.web.codeaurora.org [172.30.200.125]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id 414AD14C0 for ; Thu, 2 May 2019 11:49:06 +0000 (UTC) Received: from mail.wl.linuxfoundation.org (localhost [127.0.0.1]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id 2F5D529015 for ; Thu, 2 May 2019 11:49:06 +0000 (UTC) Received: by mail.wl.linuxfoundation.org (Postfix, from userid 486) id 236BC29019; Thu, 2 May 2019 11:49:06 +0000 (UTC) X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on pdx-wl-mail.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-7.9 required=2.0 tests=BAYES_00,MAILING_LIST_MULTI, RCVD_IN_DNSWL_HI,UPPERCASE_50_75 autolearn=ham version=3.3.1 Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id A482729015 for ; Thu, 2 May 2019 11:49:05 +0000 (UTC) Received: (majordomo@vger.kernel.org) by 
vger.kernel.org via listexpand id S1726511AbfEBLss (ORCPT ); Thu, 2 May 2019 07:48:48 -0400 Received: from mx1.redhat.com ([209.132.183.28]:56146 "EHLO mx1.redhat.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1726473AbfEBLsr (ORCPT ); Thu, 2 May 2019 07:48:47 -0400 Received: from smtp.corp.redhat.com (int-mx03.intmail.prod.int.phx2.redhat.com [10.5.11.13]) (using TLSv1.2 with cipher AECDH-AES256-SHA (256/256 bits)) (No client certificate requested) by mx1.redhat.com (Postfix) with ESMTPS id 9112C3087925; Thu, 2 May 2019 11:48:46 +0000 (UTC) Received: from maximlenovopc.usersys.redhat.com (unknown [10.35.206.58]) by smtp.corp.redhat.com (Postfix) with ESMTP id 9248D8AD86; Thu, 2 May 2019 11:48:39 +0000 (UTC) From: Maxim Levitsky To: linux-nvme@lists.infradead.org Cc: Maxim Levitsky , linux-kernel@vger.kernel.org, kvm@vger.kernel.org, Jens Axboe , Alex Williamson , Keith Busch , Christoph Hellwig , Sagi Grimberg , Kirti Wankhede , "David S . Miller" , Mauro Carvalho Chehab , Greg Kroah-Hartman , Wolfram Sang , Nicolas Ferre , "Paul E . McKenney " , Paolo Bonzini , Liang Cunming , Liu Changpeng , Fam Zheng , Amnon Ilan , John Ferlan Subject: [PATCH v2 03/10] nvme/core: add some more values from the spec Date: Thu, 2 May 2019 14:47:54 +0300 Message-Id: <20190502114801.23116-4-mlevitsk@redhat.com> In-Reply-To: <20190502114801.23116-1-mlevitsk@redhat.com> References: <20190502114801.23116-1-mlevitsk@redhat.com> X-Scanned-By: MIMEDefang 2.79 on 10.5.11.13 X-Greylist: Sender IP whitelisted, not delayed by milter-greylist-4.5.16 (mx1.redhat.com [10.5.110.45]); Thu, 02 May 2019 11:48:46 +0000 (UTC) Sender: kvm-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: kvm@vger.kernel.org X-Virus-Scanned: ClamAV using ClamSMTP This adds few defines from the spec, that will be used in the nvme-mdev driver Signed-off-by: Maxim Levitsky --- include/linux/nvme.h | 88 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 68 insertions(+), 20 deletions(-) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index c40720cb59ac..653c22baedb1 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -144,32 +144,42 @@ enum { #define NVME_NVM_IOCQES 4 enum { - NVME_CC_ENABLE = 1 << 0, - NVME_CC_CSS_NVM = 0 << 4, NVME_CC_EN_SHIFT = 0, + NVME_CC_ENABLE = 1 << NVME_CC_EN_SHIFT, + NVME_CC_CSS_SHIFT = 4, + NVME_CC_CSS_NVM = 0 << NVME_CC_CSS_SHIFT, + NVME_CC_MPS_SHIFT = 7, + NVME_CC_MPS_MASK = 0xF << NVME_CC_MPS_SHIFT, + NVME_CC_AMS_SHIFT = 11, - NVME_CC_SHN_SHIFT = 14, - NVME_CC_IOSQES_SHIFT = 16, - NVME_CC_IOCQES_SHIFT = 20, NVME_CC_AMS_RR = 0 << NVME_CC_AMS_SHIFT, NVME_CC_AMS_WRRU = 1 << NVME_CC_AMS_SHIFT, NVME_CC_AMS_VS = 7 << NVME_CC_AMS_SHIFT, + NVME_CC_AMS_MASK = 0x7 << NVME_CC_AMS_SHIFT, + + NVME_CC_SHN_SHIFT = 14, NVME_CC_SHN_NONE = 0 << NVME_CC_SHN_SHIFT, NVME_CC_SHN_NORMAL = 1 << NVME_CC_SHN_SHIFT, NVME_CC_SHN_ABRUPT = 2 << NVME_CC_SHN_SHIFT, NVME_CC_SHN_MASK = 3 << NVME_CC_SHN_SHIFT, + + NVME_CC_IOSQES_SHIFT = 16, + NVME_CC_IOCQES_SHIFT = 20, NVME_CC_IOSQES = NVME_NVM_IOSQES << NVME_CC_IOSQES_SHIFT, NVME_CC_IOCQES = NVME_NVM_IOCQES << NVME_CC_IOCQES_SHIFT, + NVME_CSTS_RDY = 1 << 0, NVME_CSTS_CFS = 1 << 1, NVME_CSTS_NSSRO = 1 << 4, NVME_CSTS_PP = 1 << 5, - NVME_CSTS_SHST_NORMAL = 0 << 2, - NVME_CSTS_SHST_OCCUR = 1 << 2, - NVME_CSTS_SHST_CMPLT = 2 << 2, - NVME_CSTS_SHST_MASK = 3 << 2, + + NVME_CSTS_SHST_SHIFT = 2, + NVME_CSTS_SHST_NORMAL = 0 << NVME_CSTS_SHST_SHIFT, + NVME_CSTS_SHST_OCCUR = 1 << NVME_CSTS_SHST_SHIFT, + NVME_CSTS_SHST_CMPLT = 2 << 
NVME_CSTS_SHST_SHIFT, + NVME_CSTS_SHST_MASK = 3 << NVME_CSTS_SHST_SHIFT, }; struct nvme_id_power_state { @@ -396,6 +406,20 @@ enum { NVME_NIDT_UUID = 0x03, }; +struct nvme_err_log_entry { + __u8 err_count[8]; + __le16 sqid; + __le16 cid; + __le16 status; + __le16 location; + __u8 lba[8]; + __le32 ns; + __u8 vnd; + __u8 rsvd1[3]; + __u8 cmd_specific[8]; + __u8 rsvd2[24]; +}; + struct nvme_smart_log { __u8 critical_warning; __u8 temperature[2]; @@ -483,13 +507,30 @@ enum { NVME_AER_VS = 7, }; -enum { - NVME_AER_NOTICE_NS_CHANGED = 0x00, - NVME_AER_NOTICE_FW_ACT_STARTING = 0x01, - NVME_AER_NOTICE_ANA = 0x03, - NVME_AER_NOTICE_DISC_CHANGED = 0xf0, +enum nvme_async_event_type { + NVME_AER_TYPE_ERROR = 0, + NVME_AER_TYPE_SMART = 1, + NVME_AER_TYPE_NOTICE = 2, + NVME_AER_TYPE_MAX = 7, }; +enum nvme_async_event { + NVME_AER_ERROR_INVALID_DB_REG = 0, + NVME_AER_ERROR_INVALID_DB_VALUE = 1, + NVME_AER_ERROR_DIAG_FAILURE = 2, + NVME_AER_ERROR_PERSISTENT_INT_ERR = 3, + NVME_AER_ERROR_TRANSIENT_INT_ERR = 4, + NVME_AER_ERROR_FW_IMAGE_LOAD_ERR = 5, + + NVME_AER_SMART_SUBSYS_RELIABILITY = 0, + NVME_AER_SMART_TEMP_THRESH = 1, + NVME_AER_SMART_SPARE_BELOW_THRESH = 2, + + NVME_AER_NOTICE_NS_CHANGED = 0, + NVME_AER_NOTICE_FW_ACT_STARTING = 1, + NVME_AER_NOTICE_ANA = 3, + NVME_AER_NOTICE_DISC_CHANGED = 0xf0, +}; enum { NVME_AEN_BIT_NS_ATTR = 8, NVME_AEN_BIT_FW_ACT = 9, @@ -540,12 +581,6 @@ struct nvme_reservation_status { } regctl_ds[]; }; -enum nvme_async_event_type { - NVME_AER_TYPE_ERROR = 0, - NVME_AER_TYPE_SMART = 1, - NVME_AER_TYPE_NOTICE = 2, -}; - /* I/O commands */ enum nvme_opcode { @@ -697,10 +732,19 @@ enum { NVME_RW_DSM_LATENCY_LOW = 3 << 4, NVME_RW_DSM_SEQ_REQ = 1 << 6, NVME_RW_DSM_COMPRESSED = 1 << 7, + + NVME_WZ_DEAC = 1 << 9, NVME_RW_PRINFO_PRCHK_REF = 1 << 10, NVME_RW_PRINFO_PRCHK_APP = 1 << 11, NVME_RW_PRINFO_PRCHK_GUARD = 1 << 12, NVME_RW_PRINFO_PRACT = 1 << 13, + + NVME_RW_PRINFO = + NVME_RW_PRINFO_PRCHK_REF | + NVME_RW_PRINFO_PRCHK_APP | + NVME_RW_PRINFO_PRCHK_GUARD | + NVME_RW_PRINFO_PRACT, + NVME_RW_DTYPE_STREAMS = 1 << 4, }; @@ -801,6 +845,7 @@ enum { NVME_SQ_PRIO_HIGH = (1 << 1), NVME_SQ_PRIO_MEDIUM = (2 << 1), NVME_SQ_PRIO_LOW = (3 << 1), + NVME_SQ_PRIO_MASK = (3 << 1), NVME_FEAT_ARBITRATION = 0x01, NVME_FEAT_POWER_MGMT = 0x02, NVME_FEAT_LBA_RANGE = 0x03, @@ -1143,6 +1188,7 @@ struct streams_directive_params { struct nvme_command { union { + __le32 dwords[16]; struct nvme_common_command common; struct nvme_rw_command rw; struct nvme_identify identify; @@ -1214,6 +1260,8 @@ enum { NVME_SC_SGL_INVALID_METADATA = 0x10, NVME_SC_SGL_INVALID_TYPE = 0x11, + NVME_SC_PRP_OFFSET_INVALID = 0x13, + NVME_SC_SGL_INVALID_OFFSET = 0x16, NVME_SC_SGL_INVALID_SUBTYPE = 0x17, From patchwork Thu May 2 11:47:55 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Maxim Levitsky X-Patchwork-Id: 10926605 Return-Path: Received: from mail.wl.linuxfoundation.org (pdx-wl-mail.web.codeaurora.org [172.30.200.125]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id EF2CA1390 for ; Thu, 2 May 2019 11:48:57 +0000 (UTC) Received: from mail.wl.linuxfoundation.org (localhost [127.0.0.1]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id DF35229015 for ; Thu, 2 May 2019 11:48:57 +0000 (UTC) Received: by mail.wl.linuxfoundation.org (Postfix, from userid 486) id D238929018; Thu, 2 May 2019 11:48:57 +0000 (UTC) X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on pdx-wl-mail.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, 
score=-7.9 required=2.0 tests=BAYES_00,MAILING_LIST_MULTI, RCVD_IN_DNSWL_HI autolearn=ham version=3.3.1 Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id 6A88E29015 for ; Thu, 2 May 2019 11:48:57 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1726544AbfEBLsw (ORCPT ); Thu, 2 May 2019 07:48:52 -0400 Received: from mx1.redhat.com ([209.132.183.28]:43082 "EHLO mx1.redhat.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1726473AbfEBLsw (ORCPT ); Thu, 2 May 2019 07:48:52 -0400 Received: from smtp.corp.redhat.com (int-mx03.intmail.prod.int.phx2.redhat.com [10.5.11.13]) (using TLSv1.2 with cipher AECDH-AES256-SHA (256/256 bits)) (No client certificate requested) by mx1.redhat.com (Postfix) with ESMTPS id AE9B581224; Thu, 2 May 2019 11:48:51 +0000 (UTC) Received: from maximlenovopc.usersys.redhat.com (unknown [10.35.206.58]) by smtp.corp.redhat.com (Postfix) with ESMTP id EAC608AD86; Thu, 2 May 2019 11:48:46 +0000 (UTC) From: Maxim Levitsky To: linux-nvme@lists.infradead.org Cc: Maxim Levitsky , linux-kernel@vger.kernel.org, kvm@vger.kernel.org, Jens Axboe , Alex Williamson , Keith Busch , Christoph Hellwig , Sagi Grimberg , Kirti Wankhede , "David S . Miller" , Mauro Carvalho Chehab , Greg Kroah-Hartman , Wolfram Sang , Nicolas Ferre , "Paul E . McKenney " , Paolo Bonzini , Liang Cunming , Liu Changpeng , Fam Zheng , Amnon Ilan , John Ferlan Subject: [PATCH v2 04/10] nvme/core: add NVME_CTRL_SUSPENDED controller state Date: Thu, 2 May 2019 14:47:55 +0300 Message-Id: <20190502114801.23116-5-mlevitsk@redhat.com> In-Reply-To: <20190502114801.23116-1-mlevitsk@redhat.com> References: <20190502114801.23116-1-mlevitsk@redhat.com> X-Scanned-By: MIMEDefang 2.79 on 10.5.11.13 X-Greylist: Sender IP whitelisted, not delayed by milter-greylist-4.5.16 (mx1.redhat.com [10.5.110.25]); Thu, 02 May 2019 11:48:51 +0000 (UTC) Sender: kvm-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: kvm@vger.kernel.org X-Virus-Scanned: ClamAV using ClamSMTP This state will be used by a controller that is going to suspended state, and will later be used by mdev framework to detect this and flush its queues Signed-off-by: Maxim Levitsky --- drivers/nvme/host/core.c | 15 +++++++++++++++ drivers/nvme/host/nvme.h | 1 + 2 files changed, 16 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 2e11d014d514..22db0c51a0bf 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -317,6 +317,19 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, switch (old_state) { case NVME_CTRL_NEW: case NVME_CTRL_RESETTING: + case NVME_CTRL_CONNECTING: + case NVME_CTRL_SUSPENDED: + changed = true; + /* FALLTHRU */ + default: + break; + } + break; + case NVME_CTRL_SUSPENDED: + switch (old_state) { + case NVME_CTRL_NEW: + case NVME_CTRL_LIVE: + case NVME_CTRL_RESETTING: case NVME_CTRL_CONNECTING: changed = true; /* FALLTHRU */ @@ -329,6 +342,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, case NVME_CTRL_NEW: case NVME_CTRL_LIVE: case NVME_CTRL_ADMIN_ONLY: + case NVME_CTRL_SUSPENDED: changed = true; /* FALLTHRU */ default: @@ -351,6 +365,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, case NVME_CTRL_ADMIN_ONLY: case NVME_CTRL_RESETTING: case NVME_CTRL_CONNECTING: + case NVME_CTRL_SUSPENDED: changed = true; /* FALLTHRU */ default: diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 527d64545023..d040eb00e880 100644 --- 
a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -139,6 +139,7 @@ static inline u16 nvme_req_qid(struct request *req) enum nvme_ctrl_state { NVME_CTRL_NEW, NVME_CTRL_LIVE, + NVME_CTRL_SUSPENDED, NVME_CTRL_ADMIN_ONLY, /* Only admin queue live */ NVME_CTRL_RESETTING, NVME_CTRL_CONNECTING, From patchwork Thu May 2 11:47:56 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Maxim Levitsky X-Patchwork-Id: 10926607 Return-Path: Received: from mail.wl.linuxfoundation.org (pdx-wl-mail.web.codeaurora.org [172.30.200.125]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id 8404B1390 for ; Thu, 2 May 2019 11:49:03 +0000 (UTC) Received: from mail.wl.linuxfoundation.org (localhost [127.0.0.1]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id 715E829018 for ; Thu, 2 May 2019 11:49:03 +0000 (UTC) Received: by mail.wl.linuxfoundation.org (Postfix, from userid 486) id 651A02901A; Thu, 2 May 2019 11:49:03 +0000 (UTC) X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on pdx-wl-mail.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-7.9 required=2.0 tests=BAYES_00,MAILING_LIST_MULTI, RCVD_IN_DNSWL_HI autolearn=ham version=3.3.1 Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id 16F2029018 for ; Thu, 2 May 2019 11:49:03 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1726564AbfEBLs5 (ORCPT ); Thu, 2 May 2019 07:48:57 -0400 Received: from mx1.redhat.com ([209.132.183.28]:49208 "EHLO mx1.redhat.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1726473AbfEBLs5 (ORCPT ); Thu, 2 May 2019 07:48:57 -0400 Received: from smtp.corp.redhat.com (int-mx03.intmail.prod.int.phx2.redhat.com [10.5.11.13]) (using TLSv1.2 with cipher AECDH-AES256-SHA (256/256 bits)) (No client certificate requested) by mx1.redhat.com (Postfix) with ESMTPS id C60D73082E8E; Thu, 2 May 2019 11:48:56 +0000 (UTC) Received: from maximlenovopc.usersys.redhat.com (unknown [10.35.206.58]) by smtp.corp.redhat.com (Postfix) with ESMTP id 13D2417DDF; Thu, 2 May 2019 11:48:51 +0000 (UTC) From: Maxim Levitsky To: linux-nvme@lists.infradead.org Cc: Maxim Levitsky , linux-kernel@vger.kernel.org, kvm@vger.kernel.org, Jens Axboe , Alex Williamson , Keith Busch , Christoph Hellwig , Sagi Grimberg , Kirti Wankhede , "David S . Miller" , Mauro Carvalho Chehab , Greg Kroah-Hartman , Wolfram Sang , Nicolas Ferre , "Paul E . 
McKenney " , Paolo Bonzini , Liang Cunming , Liu Changpeng , Fam Zheng , Amnon Ilan , John Ferlan Subject: [PATCH v2 05/10] nvme/pci: use the NVME_CTRL_SUSPENDED state Date: Thu, 2 May 2019 14:47:56 +0300 Message-Id: <20190502114801.23116-6-mlevitsk@redhat.com> In-Reply-To: <20190502114801.23116-1-mlevitsk@redhat.com> References: <20190502114801.23116-1-mlevitsk@redhat.com> X-Scanned-By: MIMEDefang 2.79 on 10.5.11.13 X-Greylist: Sender IP whitelisted, not delayed by milter-greylist-4.5.16 (mx1.redhat.com [10.5.110.46]); Thu, 02 May 2019 11:48:57 +0000 (UTC) Sender: kvm-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: kvm@vger.kernel.org X-Virus-Scanned: ClamAV using ClamSMTP When enteriing low power state, the nvme driver will now inform the core with the NVME_CTRL_SUSPENDED state which will allow mdev driver to act on this information Signed-off-by: Maxim Levitsky --- drivers/nvme/host/pci.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 1c844c66ecaa..282f28c851c1 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2406,7 +2406,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) u32 csts = readl(dev->bar + NVME_REG_CSTS); if (dev->ctrl.state == NVME_CTRL_LIVE || - dev->ctrl.state == NVME_CTRL_RESETTING) + dev->ctrl.state == NVME_CTRL_RESETTING || + dev->ctrl.state == NVME_CTRL_SUSPENDED) nvme_start_freeze(&dev->ctrl); dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) || pdev->error_state != pci_channel_io_normal); @@ -2852,6 +2853,9 @@ static int nvme_suspend(struct device *dev) struct pci_dev *pdev = to_pci_dev(dev); struct nvme_dev *ndev = pci_get_drvdata(pdev); + if (!nvme_change_ctrl_state(&ndev->ctrl, NVME_CTRL_SUSPENDED)) + WARN_ON(1); + nvme_dev_disable(ndev, true); return 0; } From patchwork Thu May 2 11:47:57 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Maxim Levitsky X-Patchwork-Id: 10926611 Return-Path: Received: from mail.wl.linuxfoundation.org (pdx-wl-mail.web.codeaurora.org [172.30.200.125]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id 8E2AB1390 for ; Thu, 2 May 2019 11:49:15 +0000 (UTC) Received: from mail.wl.linuxfoundation.org (localhost [127.0.0.1]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id 7B8ED29015 for ; Thu, 2 May 2019 11:49:15 +0000 (UTC) Received: by mail.wl.linuxfoundation.org (Postfix, from userid 486) id 6F3B22901F; Thu, 2 May 2019 11:49:15 +0000 (UTC) X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on pdx-wl-mail.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-7.9 required=2.0 tests=BAYES_00,MAILING_LIST_MULTI, RCVD_IN_DNSWL_HI autolearn=ham version=3.3.1 Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id A114E29019 for ; Thu, 2 May 2019 11:49:14 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1726595AbfEBLtJ (ORCPT ); Thu, 2 May 2019 07:49:09 -0400 Received: from mx1.redhat.com ([209.132.183.28]:49074 "EHLO mx1.redhat.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1726580AbfEBLtH (ORCPT ); Thu, 2 May 2019 07:49:07 -0400 Received: from smtp.corp.redhat.com (int-mx03.intmail.prod.int.phx2.redhat.com [10.5.11.13]) (using TLSv1.2 with cipher AECDH-AES256-SHA (256/256 bits)) (No client certificate requested) by mx1.redhat.com (Postfix) with ESMTPS id D57D38763B; Thu, 2 May 
2019 11:49:06 +0000 (UTC) Received: from maximlenovopc.usersys.redhat.com (unknown [10.35.206.58]) by smtp.corp.redhat.com (Postfix) with ESMTP id 2C2BD17DDF; Thu, 2 May 2019 11:48:56 +0000 (UTC) From: Maxim Levitsky To: linux-nvme@lists.infradead.org Cc: Maxim Levitsky , linux-kernel@vger.kernel.org, kvm@vger.kernel.org, Jens Axboe , Alex Williamson , Keith Busch , Christoph Hellwig , Sagi Grimberg , Kirti Wankhede , "David S . Miller" , Mauro Carvalho Chehab , Greg Kroah-Hartman , Wolfram Sang , Nicolas Ferre , "Paul E . McKenney " , Paolo Bonzini , Liang Cunming , Liu Changpeng , Fam Zheng , Amnon Ilan , John Ferlan Subject: [PATCH v2 06/10] nvme/core: add mdev interfaces Date: Thu, 2 May 2019 14:47:57 +0300 Message-Id: <20190502114801.23116-7-mlevitsk@redhat.com> In-Reply-To: <20190502114801.23116-1-mlevitsk@redhat.com> References: <20190502114801.23116-1-mlevitsk@redhat.com> X-Scanned-By: MIMEDefang 2.79 on 10.5.11.13 X-Greylist: Sender IP whitelisted, not delayed by milter-greylist-4.5.16 (mx1.redhat.com [10.5.110.26]); Thu, 02 May 2019 11:49:07 +0000 (UTC) Sender: kvm-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: kvm@vger.kernel.org X-Virus-Scanned: ClamAV using ClamSMTP This adds infrastructure for a nvme-mdev to attach to the core driver, to be able to know which nvme controllers are present and which namespaces they have. It also adds an interface to nvme device drivers which expose the its queues in a controlled manner to the nvme mdev core driver. A driver must opt-in for this using a new flag NVME_F_MDEV_SUPPORTED If the mdev device driver also sets the NVME_F_MDEV_DMA_SUPPORTED, the mdev core will dma map all the guest memory into the nvme device, so that nvme device driver can use dma addresses as passed from the mdev core driver Signed-off-by: Maxim Levitsky --- drivers/nvme/host/core.c | 125 ++++++++++++++++++++++++++++++++++++++- drivers/nvme/host/nvme.h | 54 +++++++++++++++-- 2 files changed, 172 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 22db0c51a0bf..3c1b91089631 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -89,11 +89,111 @@ static dev_t nvme_chr_devt; static struct class *nvme_class; static struct class *nvme_subsys_class; +static void nvme_ns_remove(struct nvme_ns *ns); static int nvme_revalidate_disk(struct gendisk *disk); static void nvme_put_subsystem(struct nvme_subsystem *subsys); static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, unsigned nsid); +#ifdef CONFIG_NVME_MDEV + +static struct nvme_mdev_driver *mdev_driver_interface; +static DEFINE_MUTEX(mdev_ctrl_lock); +static LIST_HEAD(mdev_ctrl_list); + +static bool nvme_ctrl_has_mdev(struct nvme_ctrl *ctrl) +{ + return (ctrl->ops->flags & NVME_F_MDEV_SUPPORTED) != 0; +} + +static void nvme_mdev_add_ctrl(struct nvme_ctrl *ctrl) +{ + if (nvme_ctrl_has_mdev(ctrl)) { + mutex_lock(&mdev_ctrl_lock); + list_add_tail(&ctrl->link, &mdev_ctrl_list); + mutex_unlock(&mdev_ctrl_lock); + } +} + +static void nvme_mdev_remove_ctrl(struct nvme_ctrl *ctrl) +{ + if (nvme_ctrl_has_mdev(ctrl)) { + mutex_lock(&mdev_ctrl_lock); + list_del_init(&ctrl->link); + mutex_unlock(&mdev_ctrl_lock); + } +} + +int nvme_core_register_mdev_driver(struct nvme_mdev_driver *driver_ops) +{ + struct nvme_ctrl *ctrl; + + if (mdev_driver_interface) + return -EEXIST; + + mdev_driver_interface = driver_ops; + + mutex_lock(&mdev_ctrl_lock); + list_for_each_entry(ctrl, &mdev_ctrl_list, link) + 
mdev_driver_interface->nvme_ctrl_state_changed(ctrl); + + mutex_unlock(&mdev_ctrl_lock); + return 0; +} +EXPORT_SYMBOL_GPL(nvme_core_register_mdev_driver); + +void nvme_core_unregister_mdev_driver(struct nvme_mdev_driver *driver_ops) +{ + if (WARN_ON(driver_ops != mdev_driver_interface)) + return; + mdev_driver_interface = NULL; +} +EXPORT_SYMBOL_GPL(nvme_core_unregister_mdev_driver); + +static void nvme_mdev_ctrl_state_changed(struct nvme_ctrl *ctrl) +{ + if (!mdev_driver_interface || !nvme_ctrl_has_mdev(ctrl)) + return; + if (!try_module_get(mdev_driver_interface->owner)) + return; + + mdev_driver_interface->nvme_ctrl_state_changed(ctrl); + module_put(mdev_driver_interface->owner); +} + +static void nvme_mdev_ns_state_changed(struct nvme_ctrl *ctrl, + struct nvme_ns *ns, bool removed) +{ + if (!mdev_driver_interface || !nvme_ctrl_has_mdev(ctrl)) + return; + if (!try_module_get(mdev_driver_interface->owner)) + return; + + mdev_driver_interface->nvme_ns_state_changed(ctrl, + ns->head->ns_id, removed); + module_put(mdev_driver_interface->owner); +} + +#else +static void nvme_mdev_ctrl_state_changed(struct nvme_ctrl *ctrl) +{ +} + +static void nvme_mdev_ns_state_changed(struct nvme_ctrl *ctrl, + struct nvme_ns *ns, bool removed) +{ +} + +static void nvme_mdev_add_ctrl(struct nvme_ctrl *ctrl) +{ +} + +static void nvme_mdev_remove_ctrl(struct nvme_ctrl *ctrl) +{ +} + +#endif + static void nvme_set_queue_dying(struct nvme_ns *ns) { /* @@ -387,10 +487,13 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, if (changed) ctrl->state = new_state; - spin_unlock_irqrestore(&ctrl->lock, flags); + + nvme_mdev_ctrl_state_changed(ctrl); + if (changed && ctrl->state == NVME_CTRL_LIVE) nvme_kick_requeue_lists(ctrl); + return changed; } EXPORT_SYMBOL_GPL(nvme_change_ctrl_state); @@ -426,10 +529,11 @@ static void nvme_free_ns(struct kref *kref) kfree(ns); } -static void nvme_put_ns(struct nvme_ns *ns) +void nvme_put_ns(struct nvme_ns *ns) { kref_put(&ns->kref, nvme_free_ns); } +EXPORT_SYMBOL_GPL(nvme_put_ns); static inline void nvme_clear_nvme_request(struct request *req) { @@ -1289,6 +1393,11 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, return effects; } +static void nvme_update_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns) +{ + nvme_mdev_ns_state_changed(ctrl, ns, false); +} + static void nvme_update_formats(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; @@ -1297,6 +1406,8 @@ static void nvme_update_formats(struct nvme_ctrl *ctrl) list_for_each_entry(ns, &ctrl->namespaces, list) if (ns->disk && nvme_revalidate_disk(ns->disk)) nvme_set_queue_dying(ns); + else + nvme_update_ns(ctrl, ns); up_read(&ctrl->namespaces_rwsem); nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL); @@ -3186,7 +3297,7 @@ static int ns_cmp(void *priv, struct list_head *a, struct list_head *b) return nsa->head->ns_id - nsb->head->ns_id; } -static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) +struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned int nsid) { struct nvme_ns *ns, *ret = NULL; @@ -3204,6 +3315,7 @@ static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) up_read(&ctrl->namespaces_rwsem); return ret; } +EXPORT_SYMBOL_GPL(nvme_find_get_ns); static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns) { @@ -3336,6 +3448,8 @@ static void nvme_ns_remove(struct nvme_ns *ns) if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags)) return; + nvme_mdev_ns_state_changed(ns->ctrl, ns, true); + nvme_fault_inject_fini(ns); if 
(ns->disk && ns->disk->flags & GENHD_FL_UP) { del_gendisk(ns->disk); @@ -3366,6 +3480,8 @@ static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid) if (ns) { if (ns->disk && revalidate_disk(ns->disk)) nvme_ns_remove(ns); + else + nvme_update_ns(ctrl, ns); nvme_put_ns(ns); } else nvme_alloc_ns(ctrl, nsid); @@ -3717,6 +3833,7 @@ static void nvme_free_ctrl(struct device *dev) sysfs_remove_link(&subsys->dev.kobj, dev_name(ctrl->device)); } + nvme_mdev_remove_ctrl(ctrl); ctrl->ops->free_ctrl(ctrl); if (subsys) @@ -3789,6 +3906,8 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, dev_pm_qos_update_user_latency_tolerance(ctrl->device, min(default_ps_max_latency_us, (unsigned long)S32_MAX)); + nvme_mdev_add_ctrl(ctrl); + return 0; out_free_name: kfree_const(ctrl->device->kobj.name); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index d040eb00e880..6ec14ce0d015 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -148,6 +148,7 @@ enum nvme_ctrl_state { }; struct nvme_ctrl { + struct list_head link; bool comp_seen; enum nvme_ctrl_state state; bool identified; @@ -255,6 +256,23 @@ enum nvme_iopolicy { NVME_IOPOLICY_RR, }; +#ifdef CONFIG_NVME_MDEV +/* Interface to the host driver */ +struct nvme_mdev_driver { + struct module *owner; + + /* a controller state has changed*/ + void (*nvme_ctrl_state_changed)(struct nvme_ctrl *ctrl); + + /* NS is updated in some way (after format or so) */ + void (*nvme_ns_state_changed)(struct nvme_ctrl *ctrl, + u32 nsid, bool removed); +}; + +int nvme_core_register_mdev_driver(struct nvme_mdev_driver *driver_ops); +void nvme_core_unregister_mdev_driver(struct nvme_mdev_driver *driver_ops); +#endif + struct nvme_subsystem { int instance; struct device dev; @@ -280,7 +298,7 @@ struct nvme_subsystem { }; /* - * Container structure for uniqueue namespace identifiers. + * Container structure for unique namespace identifiers. 
*/ struct nvme_ns_ids { u8 eui64[8]; @@ -356,13 +374,22 @@ struct nvme_ns { }; +struct nvme_ext_data_iter; +struct nvme_ext_cmd_result { + u32 tag; + u16 status; +}; + struct nvme_ctrl_ops { const char *name; struct module *module; unsigned int flags; -#define NVME_F_FABRICS (1 << 0) -#define NVME_F_METADATA_SUPPORTED (1 << 1) -#define NVME_F_PCI_P2PDMA (1 << 2) +#define NVME_F_FABRICS BIT(0) +#define NVME_F_METADATA_SUPPORTED BIT(1) +#define NVME_F_PCI_P2PDMA BIT(2) +#define NVME_F_MDEV_SUPPORTED BIT(3) +#define NVME_F_MDEV_DMA_SUPPORTED BIT(4) + int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val); int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val); int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val); @@ -370,6 +397,23 @@ struct nvme_ctrl_ops { void (*submit_async_event)(struct nvme_ctrl *ctrl); void (*delete_ctrl)(struct nvme_ctrl *ctrl); int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size); + +#ifdef CONFIG_NVME_MDEV + int (*ext_queues_available)(struct nvme_ctrl *ctrl); + int (*ext_queue_alloc)(struct nvme_ctrl *ctrl, u16 *qid); + void (*ext_queue_free)(struct nvme_ctrl *ctrl, u16 qid); + + int (*ext_queue_submit)(struct nvme_ctrl *ctrl, + u16 qid, u32 tag, + struct nvme_command *command, + struct nvme_ext_data_iter *iter); + + bool (*ext_queue_full)(struct nvme_ctrl *ctrl, u16 qid); + + int (*ext_queue_poll)(struct nvme_ctrl *ctrl, u16 qid, + struct nvme_ext_cmd_result *results, + unsigned int max_len); +#endif }; #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS @@ -431,6 +475,8 @@ void nvme_stop_ctrl(struct nvme_ctrl *ctrl); void nvme_put_ctrl(struct nvme_ctrl *ctrl); int nvme_init_identify(struct nvme_ctrl *ctrl); +struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned int nsid); +void nvme_put_ns(struct nvme_ns *ns); void nvme_remove_namespaces(struct nvme_ctrl *ctrl); int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, From patchwork Thu May 2 11:47:59 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Maxim Levitsky X-Patchwork-Id: 10926617 Return-Path: Received: from mail.wl.linuxfoundation.org (pdx-wl-mail.web.codeaurora.org [172.30.200.125]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id 4B76E1390 for ; Thu, 2 May 2019 11:49:39 +0000 (UTC) Received: from mail.wl.linuxfoundation.org (localhost [127.0.0.1]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id 36E9229015 for ; Thu, 2 May 2019 11:49:39 +0000 (UTC) Received: by mail.wl.linuxfoundation.org (Postfix, from userid 486) id 2928F29018; Thu, 2 May 2019 11:49:39 +0000 (UTC) X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on pdx-wl-mail.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-7.9 required=2.0 tests=BAYES_00,MAILING_LIST_MULTI, RCVD_IN_DNSWL_HI autolearn=ham version=3.3.1 Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id C310929015 for ; Thu, 2 May 2019 11:49:37 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1726617AbfEBLtc (ORCPT ); Thu, 2 May 2019 07:49:32 -0400 Received: from mx1.redhat.com ([209.132.183.28]:61370 "EHLO mx1.redhat.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1726268AbfEBLtc (ORCPT ); Thu, 2 May 2019 07:49:32 -0400 Received: from smtp.corp.redhat.com (int-mx03.intmail.prod.int.phx2.redhat.com [10.5.11.13]) (using TLSv1.2 with cipher AECDH-AES256-SHA (256/256 bits)) (No client certificate requested) 
by mx1.redhat.com (Postfix) with ESMTPS id 14F8C81E00; Thu, 2 May 2019 11:49:31 +0000 (UTC) Received: from maximlenovopc.usersys.redhat.com (unknown [10.35.206.58]) by smtp.corp.redhat.com (Postfix) with ESMTP id 71F0F17DDF; Thu, 2 May 2019 11:49:16 +0000 (UTC) From: Maxim Levitsky To: linux-nvme@lists.infradead.org Cc: Maxim Levitsky , linux-kernel@vger.kernel.org, kvm@vger.kernel.org, Jens Axboe , Alex Williamson , Keith Busch , Christoph Hellwig , Sagi Grimberg , Kirti Wankhede , "David S . Miller" , Mauro Carvalho Chehab , Greg Kroah-Hartman , Wolfram Sang , Nicolas Ferre , "Paul E . McKenney " , Paolo Bonzini , Liang Cunming , Liu Changpeng , Fam Zheng , Amnon Ilan , John Ferlan Subject: [PATCH v2 08/10] nvme/pci: implement the mdev external queue allocation interface Date: Thu, 2 May 2019 14:47:59 +0300 Message-Id: <20190502114801.23116-9-mlevitsk@redhat.com> In-Reply-To: <20190502114801.23116-1-mlevitsk@redhat.com> References: <20190502114801.23116-1-mlevitsk@redhat.com> X-Scanned-By: MIMEDefang 2.79 on 10.5.11.13 X-Greylist: Sender IP whitelisted, not delayed by milter-greylist-4.5.16 (mx1.redhat.com [10.5.110.25]); Thu, 02 May 2019 11:49:31 +0000 (UTC) Sender: kvm-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: kvm@vger.kernel.org X-Virus-Scanned: ClamAV using ClamSMTP Note that currently the number of hw queues reserved for mdev, has to be pre determined on module load. (I used to allocate the queues dynamicaly on demand, but recent changes to allocate polled/read queues made this somewhat difficult, so I dropped this for now) Signed-off-by: Maxim Levitsky --- drivers/nvme/host/pci.c | 375 ++++++++++++++++++++++++++++++++++++++- drivers/nvme/mdev/host.c | 46 ++--- drivers/nvme/mdev/io.c | 46 +++-- drivers/nvme/mdev/mmio.c | 3 - 4 files changed, 421 insertions(+), 49 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 282f28c851c1..87507e710374 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -23,6 +23,7 @@ #include #include #include +#include "../mdev/mdev.h" #include "trace.h" #include "nvme.h" @@ -32,6 +33,7 @@ #define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc)) +#define USE_SMALL_PRP_POOL(nprps) ((nprps) < (256 / 8)) /* * These can be higher, but we need to ensure that any command doesn't * require an sg allocation that needs more than a page of data. @@ -83,12 +85,24 @@ static int poll_queues = 0; module_param_cb(poll_queues, &queue_count_ops, &poll_queues, 0644); MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO."); +static int mdev_queues; +#ifdef CONFIG_NVME_MDEV +module_param_cb(mdev_queues, &queue_count_ops, &mdev_queues, 0644); +MODULE_PARM_DESC(mdev_queues, "Number of queues to use for mediated VFIO"); +#endif + struct nvme_dev; struct nvme_queue; static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode); +#ifdef CONFIG_NVME_MDEV +static void nvme_ext_queue_reset(struct nvme_dev *dev, u16 qid); +#else +static void nvme_ext_queue_reset(struct nvme_dev *dev, u16 qid) {} +#endif + /* * Represents an NVM Express device. Each nvme_dev is a PCI function. 
*/ @@ -103,6 +117,7 @@ struct nvme_dev { unsigned online_queues; unsigned max_qid; unsigned io_queues[HCTX_MAX_TYPES]; + unsigned int mdev_queues; unsigned int num_vecs; int q_depth; u32 db_stride; @@ -110,6 +125,7 @@ struct nvme_dev { unsigned long bar_mapped_size; struct work_struct remove_work; struct mutex shutdown_lock; + struct mutex ext_dev_lock; bool subsystem; u64 cmb_size; bool cmb_use_sqes; @@ -172,6 +188,16 @@ static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl) return container_of(ctrl, struct nvme_dev, ctrl); } +/* Simplified IO descriptor for MDEV use */ +struct nvme_ext_iod { + struct list_head link; + u32 user_tag; + int nprps; + struct nvme_ext_data_iter *saved_iter; + dma_addr_t first_prplist_dma; + __le64 *prpslists[NVME_MAX_SEGS]; +}; + /* * An NVM Express queue. Each device has at least two (one for admin * commands and one for I/O commands). @@ -196,15 +222,26 @@ struct nvme_queue { u16 qid; u8 cq_phase; unsigned long flags; + #define NVMEQ_ENABLED 0 #define NVMEQ_SQ_CMB 1 #define NVMEQ_DELETE_ERROR 2 #define NVMEQ_POLLED 3 +#define NVMEQ_EXTERNAL 4 + u32 *dbbuf_sq_db; u32 *dbbuf_cq_db; u32 *dbbuf_sq_ei; u32 *dbbuf_cq_ei; struct completion delete_done; + + /* queue passthrough for external use */ + struct { + int inflight; + struct nvme_ext_iod *iods; + struct list_head free_iods; + struct list_head used_iods; + } ext; }; /* @@ -255,7 +292,7 @@ static inline void _nvme_check_size(void) static unsigned int max_io_queues(void) { - return num_possible_cpus() + write_queues + poll_queues; + return num_possible_cpus() + write_queues + poll_queues + mdev_queues; } static unsigned int max_queue_count(void) @@ -1066,6 +1103,7 @@ static irqreturn_t nvme_irq(int irq, void *data) * the irq handler, even if that was on another CPU. */ rmb(); + if (nvmeq->cq_head != nvmeq->last_cq_head) ret = IRQ_HANDLED; nvme_process_cq(nvmeq, &start, &end, -1); @@ -1553,7 +1591,11 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth)); nvme_dbbuf_init(dev, nvmeq, qid); dev->online_queues++; + wmb(); /* ensure the first interrupt sees the initialization */ + + if (test_bit(NVMEQ_EXTERNAL, &nvmeq->flags)) + nvme_ext_queue_reset(nvmeq->dev, qid); } static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled) @@ -1759,7 +1801,7 @@ static int nvme_create_io_queues(struct nvme_dev *dev) } max = min(dev->max_qid, dev->ctrl.queue_count - 1); - if (max != 1 && dev->io_queues[HCTX_TYPE_POLL]) { + if (max != 1) { rw_queues = dev->io_queues[HCTX_TYPE_DEFAULT] + dev->io_queues[HCTX_TYPE_READ]; } else { @@ -2095,14 +2137,23 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues) * Poll queues don't need interrupts, but we need at least one IO * queue left over for non-polled IO. 
*/ - this_p_queues = poll_queues; + this_p_queues = poll_queues + mdev_queues; if (this_p_queues >= nr_io_queues) { this_p_queues = nr_io_queues - 1; irq_queues = 1; } else { irq_queues = nr_io_queues - this_p_queues + 1; } + + if (mdev_queues > this_p_queues) { + mdev_queues = this_p_queues; + this_p_queues = 0; + } else { + this_p_queues -= mdev_queues; + } + dev->io_queues[HCTX_TYPE_POLL] = this_p_queues; + dev->mdev_queues = mdev_queues; /* Initialize for the single interrupt case */ dev->io_queues[HCTX_TYPE_DEFAULT] = 1; @@ -2170,7 +2221,8 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) dev->num_vecs = result; result = max(result - 1, 1); - dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL]; + dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL] + + dev->mdev_queues; /* * Should investigate if there's a performance win from allocating @@ -2193,10 +2245,11 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) nvme_suspend_io_queues(dev); goto retry; } - dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n", + dev_info(dev->ctrl.device, "%d/%d/%d/%d default/read/poll/mdev queues\n", dev->io_queues[HCTX_TYPE_DEFAULT], dev->io_queues[HCTX_TYPE_READ], - dev->io_queues[HCTX_TYPE_POLL]); + dev->io_queues[HCTX_TYPE_POLL], + dev->mdev_queues); return 0; } @@ -2623,6 +2676,301 @@ static void nvme_remove_dead_ctrl_work(struct work_struct *work) nvme_put_ctrl(&dev->ctrl); } +#ifdef CONFIG_NVME_MDEV +static void nvme_ext_free_iod(struct nvme_dev *dev, struct nvme_ext_iod *iod) +{ + int i = 0, max_prp, nprps = iod->nprps; + dma_addr_t dma = iod->first_prplist_dma; + + if (iod->saved_iter) { + iod->saved_iter->release(iod->saved_iter); + iod->saved_iter = NULL; + } + + if (--nprps < 2) { + goto out; + } else if (USE_SMALL_PRP_POOL(nprps)) { + dma_pool_free(dev->prp_small_pool, iod->prpslists[0], dma); + goto out; + } + + max_prp = (dev->ctrl.page_size >> 3) - 1; + while (nprps > 0) { + if (i > 0) { + dma = iod->prpslists[i - 1][max_prp]; + if (nprps == 1) + break; + } + dma_pool_free(dev->prp_page_pool, iod->prpslists[i++], dma); + nprps -= max_prp; + } +out: + iod->nprps = -1; + iod->first_prplist_dma = 0; + iod->user_tag = 0xDEADDEAD; +} + +static int nvme_ext_setup_iod(struct nvme_dev *dev, struct nvme_ext_iod *iod, + struct nvme_common_command *cmd, + struct nvme_ext_data_iter *iter) +{ + int ret, i, j; + __le64 *prp_list; + dma_addr_t prp_dma; + struct dma_pool *pool; + int max_prp = (dev->ctrl.page_size >> 3) - 1; + + iod->saved_iter = iter && iter->release ? iter : NULL; + iod->nprps = iter ? iter->count : 0; + cmd->dptr.prp1 = 0; + cmd->dptr.prp2 = 0; + cmd->metadata = 0; + + if (!iter) + return 0; + + /* put first pointer*/ + cmd->dptr.prp1 = cpu_to_le64(iter->host_iova); + if (iter->count == 1) + return 0; + + ret = iter->next(iter); + if (ret) + goto error; + + /* if only have one more pointer, put it to second data pointer*/ + if (iter->count == 1) { + cmd->dptr.prp2 = cpu_to_le64(iter->host_iova); + return 0; + } + + pool = USE_SMALL_PRP_POOL(iter->count) ? 
dev->prp_small_pool : + dev->prp_page_pool; + + /* Allocate prp lists as needed and fill them */ + for (i = 0 ; i < NVME_MAX_SEGS && iter->count ; i++) { + prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); + if (!prp_list) { + ret = -ENOMEM; + goto error; + } + + iod->prpslists[i++] = prp_list; + + if (i == 1) { + iod->first_prplist_dma = prp_dma; + cmd->dptr.prp2 = cpu_to_le64(prp_dma); + j = 0; + } else { + prp_list[0] = iod->prpslists[i - 1][max_prp]; + iod->prpslists[i - 1][max_prp] = prp_dma; + j = 1; + } + + while (j <= max_prp && iter->count) { + prp_list[j++] = iter->host_iova; + ret = iter->next(iter); + if (ret) + goto error; + } + } + + if (iter->count) { + ret = -ENOSPC; + goto error; + } + return 0; +error: + iod->nprps -= iter->count; + nvme_ext_free_iod(dev, iod); + return ret; +} + +static int nvme_ext_queues_available(struct nvme_ctrl *ctrl) +{ + struct nvme_dev *dev = to_nvme_dev(ctrl); + unsigned int ret = 0, qid; + unsigned int first_mdev_q = dev->online_queues - dev->mdev_queues; + + for (qid = first_mdev_q; qid < dev->online_queues; qid++) { + struct nvme_queue *nvmeq = &dev->queues[qid]; + + if (!test_bit(NVMEQ_EXTERNAL, &nvmeq->flags)) + ret++; + } + return ret; +} + +static void nvme_ext_queue_reset(struct nvme_dev *dev, u16 qid) +{ + struct nvme_queue *nvmeq = &dev->queues[qid]; + struct nvme_ext_iod *iod, *tmp; + + list_for_each_entry_safe(iod, tmp, &nvmeq->ext.used_iods, link) { + if (iod->saved_iter && iod->saved_iter->release) { + iod->saved_iter->release(iod->saved_iter); + iod->saved_iter = NULL; + list_move(&iod->link, &nvmeq->ext.free_iods); + } + } + + nvmeq->ext.inflight = 0; +} + +static int nvme_ext_queue_alloc(struct nvme_ctrl *ctrl, u16 *ret_qid) +{ + struct nvme_dev *dev = to_nvme_dev(ctrl); + struct nvme_queue *nvmeq; + int ret = 0, qid, i; + unsigned int first_mdev_q = dev->online_queues - dev->mdev_queues; + + mutex_lock(&dev->ext_dev_lock); + + /* find a polled queue to allocate */ + for (qid = dev->online_queues - 1 ; qid >= first_mdev_q ; qid--) { + nvmeq = &dev->queues[qid]; + if (!test_bit(NVMEQ_EXTERNAL, &nvmeq->flags)) + break; + } + + if (qid < first_mdev_q) { + ret = -ENOSPC; + goto out; + } + + INIT_LIST_HEAD(&nvmeq->ext.free_iods); + INIT_LIST_HEAD(&nvmeq->ext.used_iods); + + nvmeq->ext.iods = + vzalloc_node(sizeof(struct nvme_ext_iod) * nvmeq->q_depth, + dev_to_node(dev->dev)); + + if (!nvmeq->ext.iods) { + ret = -ENOMEM; + goto out; + } + + for (i = 0 ; i < nvmeq->q_depth ; i++) + list_add_tail(&nvmeq->ext.iods[i].link, &nvmeq->ext.free_iods); + + set_bit(NVMEQ_EXTERNAL, &nvmeq->flags); + *ret_qid = qid; +out: + mutex_unlock(&dev->ext_dev_lock); + return ret; +} + +static void nvme_ext_queue_free(struct nvme_ctrl *ctrl, u16 qid) +{ + struct nvme_dev *dev = to_nvme_dev(ctrl); + struct nvme_queue *nvmeq; + + mutex_lock(&dev->ext_dev_lock); + nvmeq = &dev->queues[qid]; + + if (WARN_ON(!test_bit(NVMEQ_EXTERNAL, &nvmeq->flags))) + return; + + nvme_ext_queue_reset(dev, qid); + + vfree(nvmeq->ext.iods); + nvmeq->ext.iods = NULL; + INIT_LIST_HEAD(&nvmeq->ext.free_iods); + INIT_LIST_HEAD(&nvmeq->ext.used_iods); + + clear_bit(NVMEQ_EXTERNAL, &nvmeq->flags); + mutex_unlock(&dev->ext_dev_lock); +} + +static int nvme_ext_queue_submit(struct nvme_ctrl *ctrl, u16 qid, u32 user_tag, + struct nvme_command *command, + struct nvme_ext_data_iter *iter) +{ + struct nvme_dev *dev = to_nvme_dev(ctrl); + struct nvme_queue *nvmeq = &dev->queues[qid]; + struct nvme_ext_iod *iod; + int ret; + + if (WARN_ON(!test_bit(NVMEQ_EXTERNAL, &nvmeq->flags))) + 
return -EINVAL; + + if (list_empty(&nvmeq->ext.free_iods)) + return -1; + + iod = list_first_entry(&nvmeq->ext.free_iods, + struct nvme_ext_iod, link); + + list_move(&iod->link, &nvmeq->ext.used_iods); + + command->common.command_id = cpu_to_le16(iod - nvmeq->ext.iods); + iod->user_tag = user_tag; + + ret = nvme_ext_setup_iod(dev, iod, &command->common, iter); + if (ret) { + list_move(&iod->link, &nvmeq->ext.free_iods); + return ret; + } + + nvmeq->ext.inflight++; + nvme_submit_cmd(nvmeq, command, true); + return 0; +} + +static int nvme_ext_queue_poll(struct nvme_ctrl *ctrl, u16 qid, + struct nvme_ext_cmd_result *results, + unsigned int max_len) +{ + struct nvme_dev *dev = to_nvme_dev(ctrl); + struct nvme_queue *nvmeq = &dev->queues[qid]; + u16 old_head; + int i, j; + + if (WARN_ON(!test_bit(NVMEQ_EXTERNAL, &nvmeq->flags))) + return -EINVAL; + + if (nvmeq->ext.inflight == 0) + return -1; + + old_head = nvmeq->cq_head; + + for (i = 0 ; nvme_cqe_pending(nvmeq) && i < max_len ; i++) { + u16 status = le16_to_cpu(nvmeq->cqes[nvmeq->cq_head].status); + u16 tag = le16_to_cpu(nvmeq->cqes[nvmeq->cq_head].command_id); + + results[i].status = status >> 1; + results[i].tag = (u32)tag; + nvme_update_cq_head(nvmeq); + } + + if (old_head != nvmeq->cq_head) + nvme_ring_cq_doorbell(nvmeq); + + for (j = 0 ; j < i ; j++) { + u16 tag = results[j].tag & 0xFFFF; + struct nvme_ext_iod *iod = &nvmeq->ext.iods[tag]; + + if (WARN_ON(tag >= nvmeq->q_depth || iod->nprps == -1)) + continue; + + results[j].tag = iod->user_tag; + nvme_ext_free_iod(dev, iod); + list_move(&iod->link, &nvmeq->ext.free_iods); + nvmeq->ext.inflight--; + } + + WARN_ON(nvmeq->ext.inflight < 0); + return i; +} + +static bool nvme_ext_queue_full(struct nvme_ctrl *ctrl, u16 qid) +{ + struct nvme_dev *dev = to_nvme_dev(ctrl); + struct nvme_queue *nvmeq = &dev->queues[qid]; + + return nvmeq->ext.inflight < nvmeq->q_depth - 1; +} +#endif + static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val) { *val = readl(to_nvme_dev(ctrl)->bar + off); @@ -2652,13 +3000,25 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { .name = "pcie", .module = THIS_MODULE, .flags = NVME_F_METADATA_SUPPORTED | - NVME_F_PCI_P2PDMA, + NVME_F_PCI_P2PDMA | + NVME_F_MDEV_SUPPORTED | + NVME_F_MDEV_DMA_SUPPORTED, + .reg_read32 = nvme_pci_reg_read32, .reg_write32 = nvme_pci_reg_write32, .reg_read64 = nvme_pci_reg_read64, .free_ctrl = nvme_pci_free_ctrl, .submit_async_event = nvme_pci_submit_async_event, .get_address = nvme_pci_get_address, + +#ifdef CONFIG_NVME_MDEV + .ext_queues_available = nvme_ext_queues_available, + .ext_queue_alloc = nvme_ext_queue_alloc, + .ext_queue_free = nvme_ext_queue_free, + .ext_queue_submit = nvme_ext_queue_submit, + .ext_queue_poll = nvme_ext_queue_poll, + .ext_queue_full = nvme_ext_queue_full, +#endif }; static int nvme_dev_map(struct nvme_dev *dev) @@ -2747,6 +3107,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work); INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work); mutex_init(&dev->shutdown_lock); + mutex_init(&dev->ext_dev_lock); result = nvme_setup_prp_pools(dev); if (result) diff --git a/drivers/nvme/mdev/host.c b/drivers/nvme/mdev/host.c index 5766bad7e909..6590946b86c2 100644 --- a/drivers/nvme/mdev/host.c +++ b/drivers/nvme/mdev/host.c @@ -48,19 +48,21 @@ static struct nvme_mdev_hctrl *nvme_mdev_hctrl_create(struct nvme_ctrl *ctrl) return NULL; } + hctrl = kzalloc_node(sizeof(*hctrl), GFP_KERNEL, + dev_to_node(ctrl->dev)); + if 
(!hctrl) + return NULL; + nr_host_queues = ctrl->ops->ext_queues_available(ctrl); max_lba_transfer = ctrl->max_hw_sectors >> (PAGE_SHIFT - 9); if (nr_host_queues == 0) { dev_info(ctrl->dev, "no support for mdev - no mdev reserved queues available"); + kfree(hctrl); return NULL; } - hctrl = kzalloc_node(sizeof(*hctrl), GFP_KERNEL, - dev_to_node(ctrl->dev)); - if (!hctrl) - return NULL; kref_init(&hctrl->ref); mutex_init(&hctrl->lock); @@ -180,6 +182,24 @@ void nvme_mdev_hctrl_hqs_unreserve(struct nvme_mdev_hctrl *hctrl, mutex_unlock(&hctrl->lock); } +/* Check if IO passthrough is supported for given IO optcode */ +bool nvme_mdev_hctrl_hq_check_op(struct nvme_mdev_hctrl *hctrl, u8 optcode) +{ + switch (optcode) { + case nvme_cmd_flush: + case nvme_cmd_read: + case nvme_cmd_write: + /* these are mandatory*/ + return true; + case nvme_cmd_write_zeroes: + return (hctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES); + case nvme_cmd_dsm: + return (hctrl->oncs & NVME_CTRL_ONCS_DSM); + default: + return false; + } +} + /* Allocate a host IO queue */ int nvme_mdev_hctrl_hq_alloc(struct nvme_mdev_hctrl *hctrl) { @@ -204,23 +224,7 @@ bool nvme_mdev_hctrl_hq_can_submit(struct nvme_mdev_hctrl *hctrl, u16 qid) return hctrl->nvme_ctrl->ops->ext_queue_full(hctrl->nvme_ctrl, qid); } -/* Check if IO passthrough is supported for given IO optcode */ -bool nvme_mdev_hctrl_hq_check_op(struct nvme_mdev_hctrl *hctrl, u8 optcode) -{ - switch (optcode) { - case nvme_cmd_flush: - case nvme_cmd_read: - case nvme_cmd_write: - /* these are mandatory*/ - return true; - case nvme_cmd_write_zeroes: - return (hctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES); - case nvme_cmd_dsm: - return (hctrl->oncs & NVME_CTRL_ONCS_DSM); - default: - return false; - } -} + /* Submit a IO passthrough command */ int nvme_mdev_hctrl_hq_submit(struct nvme_mdev_hctrl *hctrl, diff --git a/drivers/nvme/mdev/io.c b/drivers/nvme/mdev/io.c index a731196d0365..59837540fec2 100644 --- a/drivers/nvme/mdev/io.c +++ b/drivers/nvme/mdev/io.c @@ -11,14 +11,16 @@ #include #include "priv.h" + struct io_ctx { struct nvme_mdev_hctrl *hctrl; struct nvme_mdev_vctrl *vctrl; const struct nvme_command *in; - struct nvme_command out; struct nvme_mdev_vns *ns; struct nvme_ext_data_iter udatait; + + struct nvme_command out; struct nvme_ext_data_iter *kdatait; ktime_t last_io_t; @@ -28,6 +30,20 @@ struct io_ctx { unsigned int arb_burst; }; +/* Check if we need to read a command from the admin queue */ +static bool nvme_mdev_adm_needs_processing(struct io_ctx *ctx) +{ + if (!timeout(ctx->last_admin_poll_time, + ctx->vctrl->now, ctx->admin_poll_rate_ms)) + return false; + + if (nvme_mdev_vsq_has_data(ctx->vctrl, &ctx->vctrl->vsqs[0])) + return true; + + ctx->last_admin_poll_time = ctx->vctrl->now; + return false; +} + /* Handle read/write command.*/ static int nvme_mdev_io_translate_rw(struct io_ctx *ctx) { @@ -229,6 +245,7 @@ static int nvme_mdev_io_translate_cmd(struct io_ctx *ctx) } } +/* process a user submission queue */ static bool nvme_mdev_io_process_sq(struct io_ctx *ctx, u16 sqid) { struct nvme_vsq *vsq = &ctx->vctrl->vsqs[sqid]; @@ -275,7 +292,13 @@ static bool nvme_mdev_io_process_sq(struct io_ctx *ctx, u16 sqid) return true; } -/* process host replies to the passed through commands */ +/* process a user completion queue */ +static void nvme_mdev_io_process_cq(struct io_ctx *ctx, u16 cqid) +{ + nvme_mdev_vcq_process(ctx->vctrl, cqid, true); +} + +/* process hardware completion queue */ static int nvme_mdev_io_process_hwq(struct io_ctx *ctx, u16 hwq) { int n, i; @@ -301,22 
+324,9 @@ static int nvme_mdev_io_process_hwq(struct io_ctx *ctx, u16 hwq) return n; } -/* Check if we need to read a command from the admin queue */ -static bool nvme_mdev_adm_needs_processing(struct io_ctx *ctx) -{ - if (!timeout(ctx->last_admin_poll_time, - ctx->vctrl->now, ctx->admin_poll_rate_ms)) - return false; - - if (nvme_mdev_vsq_has_data(ctx->vctrl, &ctx->vctrl->vsqs[0])) - return true; - - ctx->last_admin_poll_time = ctx->vctrl->now; - return false; -} /* do polling till one of events stops it */ -static void nvme_mdev_io_maintask(struct io_ctx *ctx) +static void nvme_mdev_io_polling_loop(struct io_ctx *ctx) { struct nvme_mdev_vctrl *vctrl = ctx->vctrl; u16 i, cqid, sqid, hsqcnt; @@ -353,7 +363,7 @@ static void nvme_mdev_io_maintask(struct io_ctx *ctx) /* process the completions from the guest*/ cqid = 1; for_each_set_bit_from(cqid, vctrl->vcq_en, MAX_VIRTUAL_QUEUES) - nvme_mdev_vcq_process(vctrl, cqid, true); + nvme_mdev_io_process_cq(ctx, cqid); /* process the completions from the hardware*/ for (i = 0 ; i < hsqcnt ; i++) @@ -470,7 +480,7 @@ static int nvme_mdev_io_polling_thread(void *data) if (kthread_should_stop()) break; - nvme_mdev_io_maintask(&ctx); + nvme_mdev_io_polling_loop(&ctx); } _DBG(ctx.vctrl, "IO: iothread stopped\n"); diff --git a/drivers/nvme/mdev/mmio.c b/drivers/nvme/mdev/mmio.c index cf03c1f22f4c..a80962bf4a3d 100644 --- a/drivers/nvme/mdev/mmio.c +++ b/drivers/nvme/mdev/mmio.c @@ -54,9 +54,6 @@ static const struct vm_operations_struct nvme_mdev_mmio_dbs_vm_ops = { bool nvme_mdev_mmio_db_check(struct nvme_mdev_vctrl *vctrl, u16 qid, u16 size, u16 db) { - if (get_current() != vctrl->iothread) - lockdep_assert_held(&vctrl->lock); - if (db < size) return true; if (qid == 0) { From patchwork Thu May 2 11:48:00 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Maxim Levitsky X-Patchwork-Id: 10926619 Return-Path: Received: from mail.wl.linuxfoundation.org (pdx-wl-mail.web.codeaurora.org [172.30.200.125]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id 0C64B14C0 for ; Thu, 2 May 2019 11:49:46 +0000 (UTC) Received: from mail.wl.linuxfoundation.org (localhost [127.0.0.1]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id EE89629015 for ; Thu, 2 May 2019 11:49:45 +0000 (UTC) Received: by mail.wl.linuxfoundation.org (Postfix, from userid 486) id E1C7D29018; Thu, 2 May 2019 11:49:45 +0000 (UTC) X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on pdx-wl-mail.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-7.9 required=2.0 tests=BAYES_00,MAILING_LIST_MULTI, RCVD_IN_DNSWL_HI autolearn=ham version=3.3.1 Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id 52CDE29015 for ; Thu, 2 May 2019 11:49:45 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1726645AbfEBLtj (ORCPT ); Thu, 2 May 2019 07:49:39 -0400 Received: from mx1.redhat.com ([209.132.183.28]:58660 "EHLO mx1.redhat.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1726583AbfEBLtj (ORCPT ); Thu, 2 May 2019 07:49:39 -0400 Received: from smtp.corp.redhat.com (int-mx03.intmail.prod.int.phx2.redhat.com [10.5.11.13]) (using TLSv1.2 with cipher AECDH-AES256-SHA (256/256 bits)) (No client certificate requested) by mx1.redhat.com (Postfix) with ESMTPS id 252C0C05E767; Thu, 2 May 2019 11:49:38 +0000 (UTC) Received: from maximlenovopc.usersys.redhat.com (unknown [10.35.206.58]) by 
smtp.corp.redhat.com (Postfix) with ESMTP id 6E8E617DDF; Thu, 2 May 2019 11:49:31 +0000 (UTC) From: Maxim Levitsky To: linux-nvme@lists.infradead.org Cc: Maxim Levitsky , linux-kernel@vger.kernel.org, kvm@vger.kernel.org, Jens Axboe , Alex Williamson , Keith Busch , Christoph Hellwig , Sagi Grimberg , Kirti Wankhede , "David S . Miller" , Mauro Carvalho Chehab , Greg Kroah-Hartman , Wolfram Sang , Nicolas Ferre , "Paul E . McKenney " , Paolo Bonzini , Liang Cunming , Liu Changpeng , Fam Zheng , Amnon Ilan , John Ferlan Subject: [PATCH v2 09/10] nvme/mdev - Add inline performance measurments Date: Thu, 2 May 2019 14:48:00 +0300 Message-Id: <20190502114801.23116-10-mlevitsk@redhat.com> In-Reply-To: <20190502114801.23116-1-mlevitsk@redhat.com> References: <20190502114801.23116-1-mlevitsk@redhat.com> X-Scanned-By: MIMEDefang 2.79 on 10.5.11.13 X-Greylist: Sender IP whitelisted, not delayed by milter-greylist-4.5.16 (mx1.redhat.com [10.5.110.31]); Thu, 02 May 2019 11:49:38 +0000 (UTC) Sender: kvm-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: kvm@vger.kernel.org X-Virus-Scanned: ClamAV using ClamSMTP This code might not be needed to be merged in the final version Signed-off-by: Maxim Levitsky --- drivers/nvme/mdev/instance.c | 62 ++++++++++++++++++++++++++++++++++++ drivers/nvme/mdev/io.c | 21 ++++++++++++ drivers/nvme/mdev/irq.c | 6 ++++ drivers/nvme/mdev/priv.h | 13 ++++++++ drivers/nvme/mdev/vctrl.c | 3 ++ 5 files changed, 105 insertions(+) diff --git a/drivers/nvme/mdev/instance.c b/drivers/nvme/mdev/instance.c index 0e5ba5a9269f..d692b2bf2ef2 100644 --- a/drivers/nvme/mdev/instance.c +++ b/drivers/nvme/mdev/instance.c @@ -771,6 +771,62 @@ static struct attribute *nvme_mdev_dev_settings_atttributes[] = { NULL }; +/* show perf stats */ +static ssize_t stats_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_mdev_vctrl *vctrl = dev_to_vctrl(dev); + struct nvme_mdev_perf *perf; + + if (!vctrl) + return -ENODEV; + + perf = &vctrl->perf; + + return sprintf(buf, + "%u %llu %llu %llu %llu %llu %llu\n", + + tsc_khz, + + perf->cmds_started, + perf->cycles_send_to_hw, + + perf->cmds_complete, + perf->cycles_receive_from_hw, + + perf->interrupts_sent, + perf->cycles_irq_delivery); +} + +/* clear the perf stats */ +static ssize_t stats_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + bool val; + int ret; + struct nvme_mdev_vctrl *vctrl = dev_to_vctrl(dev); + + if (!vctrl) + return -ENODEV; + ret = kstrtobool(buf, &val); + if (ret) + return ret; + + if (!val) + return -EINVAL; + + memset(&vctrl->perf, 0, sizeof(vctrl->perf)); + return count; +} + +static DEVICE_ATTR_RW(stats); + +static struct attribute *nvme_mdev_dev_debug_attributes[] = { + &dev_attr_stats.attr, + NULL +}; + static const struct attribute_group nvme_mdev_ns_attr_group = { .name = "namespaces", .attrs = nvme_mdev_dev_ns_atttributes, @@ -781,9 +837,15 @@ static const struct attribute_group nvme_mdev_setting_attr_group = { .attrs = nvme_mdev_dev_settings_atttributes, }; +static const struct attribute_group nvme_mdev_debug_attr_group = { + .name = "debug", + .attrs = nvme_mdev_dev_debug_attributes, +}; + static const struct attribute_group *nvme_mdev_dev_attributte_groups[] = { &nvme_mdev_ns_attr_group, &nvme_mdev_setting_attr_group, + &nvme_mdev_debug_attr_group, NULL, }; diff --git a/drivers/nvme/mdev/io.c b/drivers/nvme/mdev/io.c index 59837540fec2..39550d0e3649 100644 --- a/drivers/nvme/mdev/io.c +++ b/drivers/nvme/mdev/io.c @@ -9,6 +9,7 
@@ #include #include #include +#include #include "priv.h" @@ -251,6 +252,9 @@ static bool nvme_mdev_io_process_sq(struct io_ctx *ctx, u16 sqid) struct nvme_vsq *vsq = &ctx->vctrl->vsqs[sqid]; u16 ucid; int ret; + unsigned long long c1, c2; + + c1 = rdtsc(); /* If host queue is full, we can't process a command * as a command will likely result in passthrough @@ -289,6 +293,12 @@ static bool nvme_mdev_io_process_sq(struct io_ctx *ctx, u16 sqid) nvme_mdev_vsq_cmd_done_io(ctx->vctrl, sqid, ucid, ret); } + + c2 = rdtsc(); + + ctx->vctrl->perf.cmds_started++; + ctx->vctrl->perf.cycles_send_to_hw += (c2 - c1); + return true; } @@ -304,6 +314,10 @@ static int nvme_mdev_io_process_hwq(struct io_ctx *ctx, u16 hwq) int n, i; struct nvme_ext_cmd_result res[16]; + unsigned long long c1, c2; + + c1 = rdtsc(); + /* process the completions from the hardware */ n = nvme_mdev_hctrl_hq_poll(ctx->hctrl, hwq, res, 16); if (n == -1) @@ -321,6 +335,13 @@ static int nvme_mdev_io_process_hwq(struct io_ctx *ctx, u16 hwq) nvme_mdev_vsq_cmd_done_io(ctx->vctrl, qid, cid, status); } + + if (n > 0) { + c2 = rdtsc(); + ctx->vctrl->perf.cmds_complete += n; + ctx->vctrl->perf.cycles_receive_from_hw += (c2 - c1); + } + return n; } diff --git a/drivers/nvme/mdev/irq.c b/drivers/nvme/mdev/irq.c index 5809cdb4d84c..b6010a69b584 100644 --- a/drivers/nvme/mdev/irq.c +++ b/drivers/nvme/mdev/irq.c @@ -9,6 +9,7 @@ #include #include #include "priv.h" +#include /* Setup the interrupt subsystem */ void nvme_mdev_irqs_setup(struct nvme_mdev_vctrl *vctrl) @@ -253,12 +254,17 @@ void nvme_mdev_irq_cond_trigger(struct nvme_mdev_vctrl *vctrl, unsigned int index) { struct nvme_mdev_user_irq *irq = &vctrl->irqs.vecs[index]; + unsigned long long c1, c2; if (irq->irq_pending_cnt == 0) return; if (!nvme_mdev_irq_coalesce(vctrl, irq)) { + vctrl->perf.interrupts_sent++; + c1 = rdtsc(); nvme_mdev_irq_trigger(vctrl, index); + c2 = rdtsc(); nvme_mdev_irq_clear(vctrl, index); + vctrl->perf.cycles_irq_delivery += (c2 - c1); } } diff --git a/drivers/nvme/mdev/priv.h b/drivers/nvme/mdev/priv.h index 9f65e46c1ab2..a11a1842957d 100644 --- a/drivers/nvme/mdev/priv.h +++ b/drivers/nvme/mdev/priv.h @@ -245,6 +245,17 @@ struct nvme_mdev_io_region { unsigned int mmap_area_size; }; +struct nvme_mdev_perf { + /* number of IO commands received */ + unsigned long long cmds_started; + unsigned long long cmds_complete; + unsigned long long interrupts_sent; + + unsigned long long cycles_send_to_hw; + unsigned long long cycles_receive_from_hw; + unsigned long long cycles_irq_delivery; +}; + /*Virtual NVME controller state */ struct nvme_mdev_vctrl { struct kref ref; @@ -301,6 +312,8 @@ struct nvme_mdev_vctrl { char serial[9]; bool vctrl_paused; /* true when the host device paused our IO */ + + struct nvme_mdev_perf perf; }; /* mdev instance type*/ diff --git a/drivers/nvme/mdev/vctrl.c b/drivers/nvme/mdev/vctrl.c index 87bc7c435c0c..d23b543dfd52 100644 --- a/drivers/nvme/mdev/vctrl.c +++ b/drivers/nvme/mdev/vctrl.c @@ -242,6 +242,9 @@ int nvme_mdev_vctrl_open(struct nvme_mdev_vctrl *vctrl) nvme_mdev_mmio_open(vctrl); vctrl->inuse = true; + + memset(&vctrl->perf, 0, sizeof(vctrl->perf)); + out: mutex_unlock(&vctrl->lock); return ret; From patchwork Thu May 2 11:48:01 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Maxim Levitsky X-Patchwork-Id: 10926621 Return-Path: Received: from mail.wl.linuxfoundation.org (pdx-wl-mail.web.codeaurora.org [172.30.200.125]) by pdx-korg-patchwork-2.web.codeaurora.org 
(Postfix) with ESMTP id 0EE0414C0 for ; Thu, 2 May 2019 11:49:57 +0000 (UTC) Received: from mail.wl.linuxfoundation.org (localhost [127.0.0.1]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id EEC7228B5F for ; Thu, 2 May 2019 11:49:56 +0000 (UTC) Received: by mail.wl.linuxfoundation.org (Postfix, from userid 486) id DCA3428DB3; Thu, 2 May 2019 11:49:56 +0000 (UTC) X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on pdx-wl-mail.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-7.9 required=2.0 tests=BAYES_00,MAILING_LIST_MULTI, RCVD_IN_DNSWL_HI autolearn=ham version=3.3.1 Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id BB57E28B5F for ; Thu, 2 May 2019 11:49:55 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1726664AbfEBLtt (ORCPT ); Thu, 2 May 2019 07:49:49 -0400 Received: from mx1.redhat.com ([209.132.183.28]:33908 "EHLO mx1.redhat.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1726658AbfEBLts (ORCPT ); Thu, 2 May 2019 07:49:48 -0400 Received: from smtp.corp.redhat.com (int-mx03.intmail.prod.int.phx2.redhat.com [10.5.11.13]) (using TLSv1.2 with cipher AECDH-AES256-SHA (256/256 bits)) (No client certificate requested) by mx1.redhat.com (Postfix) with ESMTPS id 5158B3086215; Thu, 2 May 2019 11:49:48 +0000 (UTC) Received: from maximlenovopc.usersys.redhat.com (unknown [10.35.206.58]) by smtp.corp.redhat.com (Postfix) with ESMTP id 7F04F17DDF; Thu, 2 May 2019 11:49:38 +0000 (UTC) From: Maxim Levitsky To: linux-nvme@lists.infradead.org Cc: Maxim Levitsky , linux-kernel@vger.kernel.org, kvm@vger.kernel.org, Jens Axboe , Alex Williamson , Keith Busch , Christoph Hellwig , Sagi Grimberg , Kirti Wankhede , "David S . Miller" , Mauro Carvalho Chehab , Greg Kroah-Hartman , Wolfram Sang , Nicolas Ferre , "Paul E . McKenney " , Paolo Bonzini , Liang Cunming , Liu Changpeng , Fam Zheng , Amnon Ilan , John Ferlan Subject: [PATCH v2 10/10] nvme/mdev - generic block IO code Date: Thu, 2 May 2019 14:48:01 +0300 Message-Id: <20190502114801.23116-11-mlevitsk@redhat.com> In-Reply-To: <20190502114801.23116-1-mlevitsk@redhat.com> References: <20190502114801.23116-1-mlevitsk@redhat.com> X-Scanned-By: MIMEDefang 2.79 on 10.5.11.13 X-Greylist: Sender IP whitelisted, not delayed by milter-greylist-4.5.16 (mx1.redhat.com [10.5.110.42]); Thu, 02 May 2019 11:49:48 +0000 (UTC) Sender: kvm-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: kvm@vger.kernel.org X-Virus-Scanned: ClamAV using ClamSMTP Use the block layer (bio_submit) to pass through the IO to the nvme driver instead of the direct IO submission hooks. Currently that code supports only read/write, and it still assumes that we talk to an nvme driver. Signed-off-by: Maxim Levitsky --- drivers/nvme/mdev/Kconfig | 8 ++ drivers/nvme/mdev/host.c | 239 +++++++++++++++++++++++++++++++++++++- drivers/nvme/mdev/io.c | 7 ++ drivers/nvme/mdev/priv.h | 61 ++++++++++ 4 files changed, 313 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/mdev/Kconfig b/drivers/nvme/mdev/Kconfig index 7ebc66cdeac0..1ace298a364d 100644 --- a/drivers/nvme/mdev/Kconfig +++ b/drivers/nvme/mdev/Kconfig @@ -14,3 +14,11 @@ config NVME_MDEV_VFIO guest, also as a NVME namespace, attached to a virtual NVME controller If unsure, say N. 
+ +config NVME_MDEV_VFIO_GENERIC_IO + bool "Use generic block layer IO" + depends on NVME_MDEV_VFIO + help + Send the IO through the block layer using polled IO queues, + instead of dedicated mdev queues + If unsure, say N. diff --git a/drivers/nvme/mdev/host.c b/drivers/nvme/mdev/host.c index 6590946b86c2..a2ba69dcf4f2 100644 --- a/drivers/nvme/mdev/host.c +++ b/drivers/nvme/mdev/host.c @@ -53,6 +53,7 @@ static struct nvme_mdev_hctrl *nvme_mdev_hctrl_create(struct nvme_ctrl *ctrl) if (!hctrl) return NULL; +#ifndef CONFIG_NVME_MDEV_VFIO_GENERIC_IO nr_host_queues = ctrl->ops->ext_queues_available(ctrl); max_lba_transfer = ctrl->max_hw_sectors >> (PAGE_SHIFT - 9); @@ -63,6 +64,15 @@ static struct nvme_mdev_hctrl *nvme_mdev_hctrl_create(struct nvme_ctrl *ctrl) return NULL; } + hctrl->oncs = ctrl->oncs & + (NVME_CTRL_ONCS_DSM | NVME_CTRL_ONCS_WRITE_ZEROES); +#else + /* for now don't deal with bio chaining */ + max_lba_transfer = BIO_MAX_PAGES; + nr_host_queues = MDEV_NVME_NUM_BIO_QUEUES; + /* for now no support for write zeros and discard*/ + hctrl->oncs = 0; +#endif kref_init(&hctrl->ref); mutex_init(&hctrl->lock); @@ -70,8 +80,6 @@ static struct nvme_mdev_hctrl *nvme_mdev_hctrl_create(struct nvme_ctrl *ctrl) hctrl->nvme_ctrl = ctrl; nvme_get_ctrl(ctrl); - hctrl->oncs = ctrl->oncs & - (NVME_CTRL_ONCS_DSM | NVME_CTRL_ONCS_WRITE_ZEROES); hctrl->id = ctrl->instance; hctrl->node = dev_to_node(ctrl->dev); @@ -200,6 +208,8 @@ bool nvme_mdev_hctrl_hq_check_op(struct nvme_mdev_hctrl *hctrl, u8 optcode) } } +#ifndef CONFIG_NVME_MDEV_VFIO_GENERIC_IO + /* Allocate a host IO queue */ int nvme_mdev_hctrl_hq_alloc(struct nvme_mdev_hctrl *hctrl) { @@ -228,6 +238,7 @@ bool nvme_mdev_hctrl_hq_can_submit(struct nvme_mdev_hctrl *hctrl, u16 qid) /* Submit a IO passthrough command */ int nvme_mdev_hctrl_hq_submit(struct nvme_mdev_hctrl *hctrl, + struct nvme_mdev_vns *vns, u16 qid, u32 tag, struct nvme_command *cmd, struct nvme_ext_data_iter *datait) @@ -248,6 +259,226 @@ int nvme_mdev_hctrl_hq_poll(struct nvme_mdev_hctrl *hctrl, return ctrl->ops->ext_queue_poll(ctrl, qid, results, max_len); } +#else + +/* Allocate a 'host' queue - here the queues are virtual*/ +int nvme_mdev_hctrl_hq_alloc(struct nvme_mdev_hctrl *hctrl) +{ + int qid, ret; + struct hw_mbio_queue *hwq; + + for (qid = 0 ; qid < MDEV_NVME_NUM_BIO_QUEUES ; qid++) + if (!hctrl->hw_queues[qid]) + break; + + if (qid == MDEV_NVME_NUM_BIO_QUEUES) + return -ENOSPC; + + hwq = kzalloc_node(sizeof(*hwq), GFP_KERNEL, hctrl->node); + if (!hwq) + return -ENOMEM; + + INIT_LIST_HEAD(&hwq->bios_in_flight); + + ret = bioset_init(&hwq->bioset, MDEV_NVME_BIO_QUEUE_SIZE, + offsetof(struct mbio, bio), BIOSET_NEED_BVECS); + + if (ret < 0) { + kfree(hwq); + return ret; + } + + hctrl->hw_queues[qid] = hwq; + return qid + 1; +} + +/* Free a 'host' queue - here the queues are virtual*/ +void nvme_mdev_hctrl_hq_free(struct nvme_mdev_hctrl *hctrl, u16 qid) +{ + struct hw_mbio_queue *hwq = hctrl->hw_queues[qid - 1]; + + if (WARN_ON(!hwq)) + return; + + WARN_ON(!list_empty(&hwq->bios_in_flight)); + WARN_ON(hwq->inflight); + + hctrl->hw_queues[qid - 1] = NULL; + bioset_exit(&hwq->bioset); + kfree(hwq); +} + +/* + * Check if the host queue has space for submission - also our limit + * not related to the block layer + */ +bool nvme_mdev_hctrl_hq_can_submit(struct nvme_mdev_hctrl *hctrl, u16 qid) +{ + struct hw_mbio_queue *hwq = hctrl->hw_queues[qid - 1]; + + if (WARN_ON(!hwq)) + return false; + return hwq->inflight < MDEV_NVME_BIO_QUEUE_SIZE; +} + +/* + * Callback we get from the 
block layer + * Note that despite polling, this can be run from IRQ context + */ +static void nvme_mdev_hctrl_bio_done(struct bio *bio) +{ + struct mbio *mbio = container_of(bio, struct mbio, bio); + + /* this will mark this bio as done, and allow the polling thread + * to return it to the user + */ + mbio->status = nvme_mdev_translate_error_block(bio->bi_status); +} + +/* Submit a IO passthrough command */ +int nvme_mdev_hctrl_hq_submit(struct nvme_mdev_hctrl *hctrl, + struct nvme_mdev_vns *vns, + u16 qid, u32 tag, + struct nvme_command *cmd, + struct nvme_ext_data_iter *datait) +{ + struct hw_mbio_queue *hwq = hctrl->hw_queues[qid - 1]; + struct bio *bio = NULL; + struct mbio *mbio; + struct page *page; + u8 opcode = cmd->common.opcode; + int retval, op, op_flags = 0; + int offset; + + if (WARN_ON(!hwq)) + return -EINVAL; + if (WARN_ON(hwq->inflight >= MDEV_NVME_BIO_QUEUE_SIZE)) + return -EBUSY; + + /* read/write buffer processing */ + if (opcode == nvme_cmd_read || opcode == nvme_cmd_write) { + unsigned long datalength = + (le16_to_cpu(cmd->rw.length) + 1) << vns->blksize_shift; + + if (opcode == nvme_cmd_read) { + op = REQ_OP_READ; + } else { + op = REQ_OP_WRITE; + op_flags = REQ_SYNC | REQ_IDLE; + if (cmd->rw.control & cpu_to_le16(NVME_RW_FUA)) + op_flags |= REQ_FUA; + } + + if (WARN_ON(datait->count > BIO_MAX_PAGES)) + return -EINVAL; + + bio = bio_alloc_bioset(GFP_KERNEL, datait->count, &hwq->bioset); + if (WARN_ON(!bio)) + return -ENOMEM; + + mbio = container_of(bio, struct mbio, bio); + + /* starting sector */ + bio->bi_iter.bi_sector = le64_to_cpu(cmd->rw.slba) << + (vns->blksize_shift - 9); + + /* Data. Last page might be partial size*/ + while (datait->count) { + int chunk = min(PAGE_SIZE, datalength); + + if (WARN_ON(datalength == 0)) + break; + + page = pfn_to_page(PHYS_PFN(datait->physical)); + offset = OFFSET_IN_PAGE(datait->physical); + + if (bio_add_page(&mbio->bio, page, + chunk, offset) != chunk) { + WARN_ON(1); + retval = -ENOMEM; + goto error; + } + + retval = datait->next(datait); + if (WARN_ON(retval)) + goto error; + datalength -= chunk; + } + + /* flush request */ + } else if (opcode == nvme_cmd_flush) { + op = REQ_OP_WRITE; + op_flags = REQ_PREFLUSH; + bio = bio_alloc_bioset(GFP_KERNEL, 0, &hwq->bioset); + if (WARN_ON(!bio)) + return -ENOMEM; + mbio = container_of(bio, struct mbio, bio); + } else { + retval = -EINVAL; + goto error; + } + + /* set polling */ + op_flags |= REQ_HIPRI | REQ_NOWAIT; + + /* setup the bio */ + bio_set_dev(bio, vns->host_part); + bio->bi_end_io = nvme_mdev_hctrl_bio_done; + bio_set_op_attrs(bio, op, op_flags); + + /* setup our portion of the bio*/ + mbio = container_of(bio, struct mbio, bio); + mbio->tag = tag; + mbio->status = NVME_STATUS_PENDING; + mbio->blk_queue = bdev_get_queue(vns->host_part); + + /* submit the bio*/ + mbio->cookie = submit_bio(bio); + + list_add_tail(&mbio->link, &hwq->bios_in_flight); + hwq->inflight++; + return 0; +error: + if (bio) + bio_put(bio); + return retval; +} + +/* Poll for completion of IO passthrough commands */ +int nvme_mdev_hctrl_hq_poll(struct nvme_mdev_hctrl *hctrl, + u32 qid, + struct nvme_ext_cmd_result *results, + unsigned int max_len) +{ + struct hw_mbio_queue *hwq = hctrl->hw_queues[qid - 1]; + struct mbio *mbio, *tmp; + + int i = 0; + + if (!hwq->inflight) + return -1; + + list_for_each_entry_safe(mbio, tmp, &hwq->bios_in_flight, link) { + if (mbio->status == NVME_STATUS_PENDING) + blk_poll(mbio->blk_queue, mbio->cookie, false); + + if (mbio->status == NVME_STATUS_PENDING) + continue; + + 
results[i].tag = mbio->tag; + results[i].status = mbio->status; + + hwq->inflight--; + list_del(&mbio->link); + bio_put(&mbio->bio); + + if (++i == max_len) + break; + } + return i; +} +#endif + /* Destroy all host controllers */ void nvme_mdev_hctrl_destroy_all(void) { @@ -486,6 +717,10 @@ static int __init nvme_mdev_init(void) } pr_info("nvme_mdev " NVME_MDEV_FIRMWARE_VERSION " loaded\n"); + +#ifdef CONFIG_NVME_MDEV_VFIO_GENERIC_IO + pr_info("nvme_mdev: using block layer polled IO\b"); +#endif return 0; } diff --git a/drivers/nvme/mdev/io.c b/drivers/nvme/mdev/io.c index 39550d0e3649..d3c46de33b01 100644 --- a/drivers/nvme/mdev/io.c +++ b/drivers/nvme/mdev/io.c @@ -70,7 +70,11 @@ static int nvme_mdev_io_translate_rw(struct io_ctx *ctx) if (!check_range(slba, length, ctx->ns->ns_size)) return DNR(NVME_SC_LBA_RANGE); +#ifndef CONFIG_NVME_MDEV_VFIO_GENERIC_IO ctx->out.rw.slba = cpu_to_le64(slba + ctx->ns->host_lba_offset); +#else + ctx->out.rw.slba = in->slba; +#endif ctx->out.rw.length = in->length; ret = nvme_mdev_udata_iter_set_dptr(&ctx->udatait, &in->dptr, @@ -195,7 +199,9 @@ static int nvme_mdev_io_translate_dsm(struct io_ctx *ctx) _DBG(ctx->vctrl, "IOQ: DSM_MANAGEMENT: RANGE 0x%llx-0x%x\n", slba, nlb); +#ifndef CONFIG_NVME_MDEV_VFIO_GENERIC_IO data_ptr[i].slba = cpu_to_le64(slba + ctx->ns->host_lba_offset); +#endif } ctx->out.dsm.attributes = in->attributes; @@ -280,6 +286,7 @@ static bool nvme_mdev_io_process_sq(struct io_ctx *ctx, u16 sqid) /*passthrough*/ ret = nvme_mdev_hctrl_hq_submit(ctx->hctrl, + ctx->ns, vsq->hsq, (((u32)vsq->qid) << 16) | ((u32)ucid), &ctx->out, diff --git a/drivers/nvme/mdev/priv.h b/drivers/nvme/mdev/priv.h index a11a1842957d..1dd5fce0bfa6 100644 --- a/drivers/nvme/mdev/priv.h +++ b/drivers/nvme/mdev/priv.h @@ -34,7 +34,12 @@ #define MAX_VIRTUAL_NAMESPACES 16 /* NSID = 1..16*/ #define MAX_VIRTUAL_IRQS 16 +#ifndef CONFIG_NVME_MDEV_VFIO_GENERIC_IO #define MAX_HOST_QUEUES 4 +#else +#define MAX_HOST_QUEUES 1 +#endif + #define MAX_AER_COMMANDS 16 #define MAX_LOG_PAGES 16 @@ -323,6 +328,39 @@ struct nvme_mdev_inst_type { struct attribute_group *attrgroup; }; +#ifdef CONFIG_NVME_MDEV_VFIO_GENERIC_IO + +#define MDEV_NVME_BIO_QUEUE_SIZE 128 +#define NVME_STATUS_PENDING 0xFFFF +#define MDEV_NVME_NUM_BIO_QUEUES 16 + +struct mbio { + /* link in a list of pending bios*/ + struct list_head link; + + struct request_queue *blk_queue; + + /*GDPR compliant*/ + unsigned int cookie; + + /* tag from the translation (user cid + user qid) */ + u32 tag; + + /* result NVME status */ + u16 status; + + /* must be last for bioset allocation*/ + struct bio bio; +}; + +struct hw_mbio_queue { + int inflight; + struct list_head bios_in_flight; + struct bio_set bioset; +}; + +#endif + /*Abstraction of the host controller that we are connected to */ struct nvme_mdev_hctrl { struct mutex lock; @@ -344,6 +382,10 @@ struct nvme_mdev_hctrl { /* book-keeping for number of host queues we can allocate*/ unsigned int nr_host_queues; + +#ifdef CONFIG_NVME_MDEV_VFIO_GENERIC_IO + struct hw_mbio_queue *hw_queues[MDEV_NVME_NUM_BIO_QUEUES]; +#endif }; /* vctrl.c*/ @@ -415,6 +457,7 @@ bool nvme_mdev_hctrl_hq_can_submit(struct nvme_mdev_hctrl *hctrl, u16 qid); bool nvme_mdev_hctrl_hq_check_op(struct nvme_mdev_hctrl *hctrl, u8 optcode); int nvme_mdev_hctrl_hq_submit(struct nvme_mdev_hctrl *hctrl, + struct nvme_mdev_vns *vns, u16 qid, u32 tag, struct nvme_command *cmd, struct nvme_ext_data_iter *datait); @@ -701,6 +744,24 @@ static inline int nvme_mdev_translate_error(int error) } } +static inline int 
nvme_mdev_translate_error_block(blk_status_t blk_sts) +{ + switch (blk_sts) { + case BLK_STS_OK: + return NVME_SC_SUCCESS; + case BLK_STS_NOSPC: + return DNR(NVME_SC_CAP_EXCEEDED); + case BLK_STS_TARGET: + return DNR(NVME_SC_LBA_RANGE); + case BLK_STS_NOTSUPP: + return DNR(NVME_SC_INVALID_OPCODE); + case BLK_STS_MEDIUM: + return DNR(NVME_SC_ACCESS_DENIED); + default: + return DNR(NVME_SC_INTERNAL); + } +} + static inline bool timeout(ktime_t event, ktime_t now, unsigned long timeout_ms) { return ktime_ms_delta(now, event) > (long)timeout_ms;
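
For reference, a minimal sketch of the PRP bookkeeping that nvme_ext_setup_iod() earlier in this series relies on: each PRP list page holds PAGE_SIZE/8 entries, and once more than one list page is needed the last slot of a page is reused as a pointer to the next page, which is what the prp_list[0]/max_prp shuffle in that loop implements. The helper below is illustrative only and not part of the patches; the rounding is deliberately conservative, matching what an allocation loop has to assume.

/* Illustrative only, not from the series: how many PRP list pages a
 * transfer of 'nprps' page-sized segments needs, assuming 4K host pages.
 */
static unsigned int prp_list_pages_needed(unsigned int nprps)
{
	unsigned int per_page = PAGE_SIZE / sizeof(__le64);	/* 512 entries */

	if (nprps <= 2)			/* PRP1 + PRP2 hold it directly */
		return 0;

	nprps--;			/* first segment lives in PRP1 */

	/* every chained list page loses one entry to the next-page pointer,
	 * so round up against (per_page - 1); slightly conservative at the
	 * exact boundary, which is fine for allocation purposes
	 */
	return DIV_ROUND_UP(nprps, per_page - 1);
}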
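
The debug "stats" attribute added in patch 09 prints, in order, tsc_khz followed by three (count, cycles) pairs covering submission, completion and interrupt delivery. Below is a hedged userspace sketch, not part of the series, of turning those counters into average per-command latencies; the exact sysfs location of the attribute is not spelled out in the patch, so the path is taken from argv[1] here.

/* Hedged example: average latency per command from the seven fields
 * printed by stats_show().  Usage: ./stats <path-to-stats-attribute>
 */
#include <stdio.h>

int main(int argc, char **argv)
{
	unsigned long long khz, nsub, csub, ncpl, ccpl, nirq, cirq;
	FILE *f;

	if (argc < 2 || !(f = fopen(argv[1], "r")))
		return 1;
	if (fscanf(f, "%llu %llu %llu %llu %llu %llu %llu",
		   &khz, &nsub, &csub, &ncpl, &ccpl, &nirq, &cirq) != 7)
		return 1;
	fclose(f);

	/* tsc_khz is the TSC frequency in kHz, so khz / 1000.0 cycles
	 * elapse per microsecond
	 */
	if (nsub)
		printf("submit:   %.2f us/cmd\n", csub / (khz / 1000.0) / nsub);
	if (ncpl)
		printf("complete: %.2f us/cmd\n", ccpl / (khz / 1000.0) / ncpl);
	if (nirq)
		printf("irq:      %.2f us/irq\n", cirq / (khz / 1000.0) / nirq);
	return 0;
}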