
[v2,07/10] nvme-pci: Use PCI p2pmem subsystem to manage the CMB

Message ID 20180228234006.21093-8-logang@deltatee.com (mailing list archive)
State Not Applicable

Commit Message

Logan Gunthorpe Feb. 28, 2018, 11:40 p.m. UTC
Register the CMB buffer as p2pmem and use the appropriate allocation
functions to create and destroy the IO SQ.

If the CMB supports WDS and RDS, publish it for use as p2p memory
by other devices.

Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
---
 drivers/nvme/host/pci.c | 75 +++++++++++++++++++++++++++----------------------
 1 file changed, 41 insertions(+), 34 deletions(-)

Comments

Oliver O'Halloran March 5, 2018, 1:33 a.m. UTC | #1
On Thu, Mar 1, 2018 at 10:40 AM, Logan Gunthorpe <logang@deltatee.com> wrote:
> Register the CMB buffer as p2pmem and use the appropriate allocation
> functions to create and destroy the IO SQ.
>
> If the CMB supports WDS and RDS, publish it for use as p2p memory
> by other devices.
>
> Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
> ---
>  drivers/nvme/host/pci.c | 75 +++++++++++++++++++++++++++----------------------
>  1 file changed, 41 insertions(+), 34 deletions(-)
>
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 73036d2fbbd5..56ca79be8476 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -29,6 +29,7 @@
>  #include <linux/types.h>
>  #include <linux/io-64-nonatomic-lo-hi.h>
>  #include <linux/sed-opal.h>
> +#include <linux/pci-p2pdma.h>
>
>  #include "nvme.h"
>
> @@ -91,9 +92,8 @@ struct nvme_dev {
>         struct work_struct remove_work;
>         struct mutex shutdown_lock;
>         bool subsystem;
> -       void __iomem *cmb;
> -       pci_bus_addr_t cmb_bus_addr;
>         u64 cmb_size;
> +       bool cmb_use_sqes;
>         u32 cmbsz;
>         u32 cmbloc;
>         struct nvme_ctrl ctrl;
> @@ -148,7 +148,7 @@ struct nvme_queue {
>         struct nvme_dev *dev;
>         spinlock_t q_lock;
>         struct nvme_command *sq_cmds;
> -       struct nvme_command __iomem *sq_cmds_io;
> +       bool sq_cmds_is_io;
>         volatile struct nvme_completion *cqes;
>         struct blk_mq_tags **tags;
>         dma_addr_t sq_dma_addr;
> @@ -429,10 +429,7 @@ static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
>  {
>         u16 tail = nvmeq->sq_tail;

> -       if (nvmeq->sq_cmds_io)
> -               memcpy_toio(&nvmeq->sq_cmds_io[tail], cmd, sizeof(*cmd));
> -       else
> -               memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
> +       memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));

Hmm, how safe is replacing memcpy_toio() with regular memcpy()? On PPC
the _toio() variant enforces alignment, does the copy with 4 byte
stores, and has a full barrier after the copy. In comparison our
regular memcpy() does none of those things and may use unaligned and
vector load/stores. For normal (cacheable) memory that is perfectly
fine, but they can cause alignment faults when targeted at MMIO
(cache-inhibited) memory.

I think in this particular case it might be ok since we know SQEs are
aligned to 64 byte boundaries and the copy is too small to use our
vectorised memcpy(). I'll assume we don't need explicit ordering
between writes of SQEs since the existing code doesn't seem to care
unless the doorbell is being rung, so you're probably fine there too.
That said, I still think this is a little bit sketchy and at the very
least you should add a comment explaining what's going on when the CMB
is being used. If someone more familiar with the NVMe driver could
chime in I would appreciate it.
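
To make that concrete, the sort of comment I have in mind would look
something like this (an untested sketch only, not a request for this
exact wording):

        u16 tail = nvmeq->sq_tail;

        /*
         * sq_cmds may point into the CMB (PCI BAR memory) when
         * sq_cmds_is_io is set. A plain memcpy() is assumed to be safe
         * here because each SQE is 64 bytes and 64-byte aligned, and
         * the doorbell write that follows provides the ordering the
         * device needs before it fetches the entry.
         */
        memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));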

>         if (++tail == nvmeq->q_depth)
>                 tail = 0;
> @@ -1286,9 +1283,18 @@ static void nvme_free_queue(struct nvme_queue *nvmeq)
>  {
>         dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
>                                 (void *)nvmeq->cqes, nvmeq->cq_dma_addr);
> -       if (nvmeq->sq_cmds)
> -               dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
> -                                       nvmeq->sq_cmds, nvmeq->sq_dma_addr);
> +
> +       if (nvmeq->sq_cmds) {
> +               if (nvmeq->sq_cmds_is_io)
> +                       pci_free_p2pmem(to_pci_dev(nvmeq->q_dmadev),
> +                                       nvmeq->sq_cmds,
> +                                       SQ_SIZE(nvmeq->q_depth));
> +               else
> +                       dma_free_coherent(nvmeq->q_dmadev,
> +                                         SQ_SIZE(nvmeq->q_depth),
> +                                         nvmeq->sq_cmds,
> +                                         nvmeq->sq_dma_addr);
> +       }
>  }
>
>  static void nvme_free_queues(struct nvme_dev *dev, int lowest)
> @@ -1368,12 +1374,21 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
>  static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
>                                 int qid, int depth)
>  {
> -       /* CMB SQEs will be mapped before creation */
> -       if (qid && dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS))
> -               return 0;
> +       struct pci_dev *pdev = to_pci_dev(dev->dev);
> +
> +       if (qid && dev->cmb_use_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) {
> +               nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(depth));
> +               nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev,
> +                                               nvmeq->sq_cmds);
> +               nvmeq->sq_cmds_is_io = true;
> +       }
> +
> +       if (!nvmeq->sq_cmds) {
> +               nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
> +                                       &nvmeq->sq_dma_addr, GFP_KERNEL);
> +               nvmeq->sq_cmds_is_io = false;
> +       }
>
> -       nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
> -                                           &nvmeq->sq_dma_addr, GFP_KERNEL);
>         if (!nvmeq->sq_cmds)
>                 return -ENOMEM;
>         return 0;
> @@ -1449,13 +1464,6 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
>         struct nvme_dev *dev = nvmeq->dev;
>         int result;
>
> -       if (dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) {
> -               unsigned offset = (qid - 1) * roundup(SQ_SIZE(nvmeq->q_depth),
> -                                                     dev->ctrl.page_size);
> -               nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset;
> -               nvmeq->sq_cmds_io = dev->cmb + offset;
> -       }
> -
>         nvmeq->cq_vector = qid - 1;
>         result = adapter_alloc_cq(dev, qid, nvmeq);
>         if (result < 0)
> @@ -1685,9 +1693,6 @@ static void nvme_map_cmb(struct nvme_dev *dev)
>                 return;
>         dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC);
>
> -       if (!use_cmb_sqes)
> -               return;
> -
>         size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev);
>         offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc);
>         bar = NVME_CMB_BIR(dev->cmbloc);
> @@ -1704,11 +1709,15 @@ static void nvme_map_cmb(struct nvme_dev *dev)
>         if (size > bar_size - offset)
>                 size = bar_size - offset;
>
> -       dev->cmb = ioremap_wc(pci_resource_start(pdev, bar) + offset, size);
> -       if (!dev->cmb)
> +       if (pci_p2pdma_add_resource(pdev, bar, size, offset))
>                 return;
> -       dev->cmb_bus_addr = pci_bus_address(pdev, bar) + offset;
> +
>         dev->cmb_size = size;
> +       dev->cmb_use_sqes = use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS);
> +
> +       if ((dev->cmbsz & (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) ==
> +                       (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS))
> +               pci_p2pmem_publish(pdev, true);
>
>         if (sysfs_add_file_to_group(&dev->ctrl.device->kobj,
>                                     &dev_attr_cmb.attr, NULL))
> @@ -1718,12 +1727,10 @@ static void nvme_map_cmb(struct nvme_dev *dev)
>
>  static inline void nvme_release_cmb(struct nvme_dev *dev)
>  {
> -       if (dev->cmb) {
> -               iounmap(dev->cmb);
> -               dev->cmb = NULL;
> +       if (dev->cmb_size) {
>                 sysfs_remove_file_from_group(&dev->ctrl.device->kobj,
>                                              &dev_attr_cmb.attr, NULL);
> -               dev->cmbsz = 0;
> +               dev->cmb_size = 0;
>         }
>  }
>
> @@ -1918,13 +1925,13 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
>         if (nr_io_queues == 0)
>                 return 0;
>
> -       if (dev->cmb && (dev->cmbsz & NVME_CMBSZ_SQS)) {
> +       if (dev->cmb_use_sqes) {
>                 result = nvme_cmb_qdepth(dev, nr_io_queues,
>                                 sizeof(struct nvme_command));
>                 if (result > 0)
>                         dev->q_depth = result;
>                 else
> -                       nvme_release_cmb(dev);
> +                       dev->cmb_use_sqes = false;
>         }
>
>         do {
> --
> 2.11.0
>
> _______________________________________________
> Linux-nvdimm mailing list
> Linux-nvdimm@lists.01.org
> https://lists.01.org/mailman/listinfo/linux-nvdimm
Keith Busch March 5, 2018, 4 p.m. UTC | #2
On Mon, Mar 05, 2018 at 12:33:29PM +1100, Oliver wrote:
> On Thu, Mar 1, 2018 at 10:40 AM, Logan Gunthorpe <logang@deltatee.com> wrote:
> > @@ -429,10 +429,7 @@ static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
> >  {
> >         u16 tail = nvmeq->sq_tail;
> 
> > -       if (nvmeq->sq_cmds_io)
> > -               memcpy_toio(&nvmeq->sq_cmds_io[tail], cmd, sizeof(*cmd));
> > -       else
> > -               memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
> > +       memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
> 
> Hmm, how safe is replacing memcpy_toio() with regular memcpy()? On PPC
> the _toio() variant enforces alignment, does the copy with 4 byte
> stores, and has a full barrier after the copy. In comparison our
> regular memcpy() does none of those things and may use unaligned and
> vector load/stores. For normal (cacheable) memory that is perfectly
> fine, but they can cause alignment faults when targeted at MMIO
> (cache-inhibited) memory.
> 
> I think in this particular case it might be ok since we know SEQs are
> aligned to 64 byte boundaries and the copy is too small to use our
> vectorised memcpy(). I'll assume we don't need explicit ordering
> between writes of SEQs since the existing code doesn't seem to care
> unless the doorbell is being rung, so you're probably fine there too.
> That said, I still think this is a little bit sketchy and at the very
> least you should add a comment explaining what's going on when the CMB
> is being used. If someone more familiar with the NVMe driver could
> chime in I would appreciate it.

I may not be understanding the concern, but I'll give it a shot.

You're right, the start of any SQE is always 64-byte aligned, so that
should satisfy alignment requirements.

The order when writing multiple/successive SQEs in a submission queue
does matter, and this is currently serialized through the q_lock.

The order in which the bytes of a single SQE are written doesn't really
matter as long as the entire SQE is written into the CMB prior to writing
that SQ's doorbell register.

The doorbell register is written immediately after copying a command
entry into the submission queue (ignore "shadow buffer" features),
so doorbell writes map 1:1 to submitted commands.

If a CMB SQE and DB order is not enforced with the memcpy, then we do
need a barrier after the SQE's memcpy and before the doorbell's writel.
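
In code, that fallback would be roughly the following (a sketch only,
ignoring the shadow doorbell path as noted above; whether the explicit
barrier is actually needed is exactly the question at hand):

        memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
        if (++tail == nvmeq->q_depth)
                tail = 0;
        wmb();                          /* SQE visible before the doorbell */
        writel(tail, nvmeq->q_db);      /* ring the SQ doorbell */
        nvmeq->sq_tail = tail;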
Logan Gunthorpe March 5, 2018, 5:10 p.m. UTC | #3
On 05/03/18 09:00 AM, Keith Busch wrote:
> On Mon, Mar 05, 2018 at 12:33:29PM +1100, Oliver wrote:
>> On Thu, Mar 1, 2018 at 10:40 AM, Logan Gunthorpe <logang@deltatee.com> wrote:
>>> @@ -429,10 +429,7 @@ static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
>>>   {
>>>          u16 tail = nvmeq->sq_tail;
>>
>>> -       if (nvmeq->sq_cmds_io)
>>> -               memcpy_toio(&nvmeq->sq_cmds_io[tail], cmd, sizeof(*cmd));
>>> -       else
>>> -               memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
>>> +       memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
>>
>> Hmm, how safe is replacing memcpy_toio() with regular memcpy()? On PPC
>> the _toio() variant enforces alignment, does the copy with 4 byte
>> stores, and has a full barrier after the copy. In comparison our
>> regular memcpy() does none of those things and may use unaligned and
>> vector load/stores. For normal (cacheable) memory that is perfectly
>> fine, but they can cause alignment faults when targeted at MMIO
>> (cache-inhibited) memory.
>>
>> I think in this particular case it might be ok since we know SQEs are
>> aligned to 64 byte boundaries and the copy is too small to use our
>> vectorised memcpy(). I'll assume we don't need explicit ordering
>> between writes of SQEs since the existing code doesn't seem to care
>> unless the doorbell is being rung, so you're probably fine there too.
>> That said, I still think this is a little bit sketchy and at the very
>> least you should add a comment explaining what's going on when the CMB
>> is being used. If someone more familiar with the NVMe driver could
>> chime in I would appreciate it.
> 
> I may not be understanding the concern, but I'll give it a shot.
> 
> You're right, the start of any SQE is always 64-byte aligned, so that
> should satisfy alignment requirements.
> 
> The order when writing multiple/successive SQEs in a submission queue
> does matter, and this is currently serialized through the q_lock.
> 
> The order in which the bytes of a single SQE is written doesn't really
> matter as long as the entire SQE is written into the CMB prior to writing
> that SQ's doorbell register.
> 
> The doorbell register is written immediately after copying a command
> entry into the submission queue (ignore "shadow buffer" features),
> so the doorbells written to commands submitted is 1:1.
> 
> If a CMB SQE and DB order is not enforced with the memcpy, then we do
> need a barrier after the SQE's memcpy and before the doorbell's writel.


Thanks for the information Keith.

Adding to this: regular memcpy generally also enforces alignment as 
unaligned access to regular memory is typically bad in some way on most 
arches. The generic memcpy_toio also does not have any barrier as it is 
just a call to memcpy. Arm64 also does not appear to have a barrier in 
its implementation and in the short survey I did I could not find any 
implementation with a barrier. I also did not find a ppc implementation 
in the tree but it would be weird for it to add a barrier when other 
arches do not appear to need it.

We've been operating on the assumption that memory mapped by 
devm_memremap_pages() can be treated as regular memory. This is 
emphasized by the fact that it does not return an __iomem pointer. If 
this assumption does not hold for an arch then we cannot support P2P DMA
without an overhaul of many kernel interfaces or creating other backend
interfaces into the drivers which take different data types (ie. we'd
have to bypass the entire block layer when trying to write data in
p2pmem to an nvme device). This is very undesirable.
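
For reference, the relevant part of this patch leans on exactly that
assumption: pci_alloc_p2pmem() hands back a plain void * (not an
__iomem pointer) which the submission path later writes with a regular
memcpy(). Roughly:

        struct pci_dev *pdev = to_pci_dev(dev->dev);

        nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(depth));
        nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev, nvmeq->sq_cmds);
        nvmeq->sq_cmds_is_io = true;

        /* ... and later, in __nvme_submit_cmd(): */
        memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));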

Logan
Sinan Kaya March 5, 2018, 6:02 p.m. UTC | #4
On 3/5/2018 12:10 PM, Logan Gunthorpe wrote:
> 
> 
> On 05/03/18 09:00 AM, Keith Busch wrote:
>> On Mon, Mar 05, 2018 at 12:33:29PM +1100, Oliver wrote:
>>> On Thu, Mar 1, 2018 at 10:40 AM, Logan Gunthorpe <logang@deltatee.com> wrote:
>>>> @@ -429,10 +429,7 @@ static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
>>>>   {
>>>>          u16 tail = nvmeq->sq_tail;
>>>
>>>> -       if (nvmeq->sq_cmds_io)
>>>> -               memcpy_toio(&nvmeq->sq_cmds_io[tail], cmd, sizeof(*cmd));
>>>> -       else
>>>> -               memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
>>>> +       memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
>>>
>>> Hmm, how safe is replacing memcpy_toio() with regular memcpy()? On PPC
>>> the _toio() variant enforces alignment, does the copy with 4 byte
>>> stores, and has a full barrier after the copy. In comparison our
>>> regular memcpy() does none of those things and may use unaligned and
>>> vector load/stores. For normal (cacheable) memory that is perfectly
>>> fine, but they can cause alignment faults when targeted at MMIO
>>> (cache-inhibited) memory.
>>>
>>> I think in this particular case it might be ok since we know SQEs are
>>> aligned to 64 byte boundaries and the copy is too small to use our
>>> vectorised memcpy(). I'll assume we don't need explicit ordering
>>> between writes of SQEs since the existing code doesn't seem to care
>>> unless the doorbell is being rung, so you're probably fine there too.
>>> That said, I still think this is a little bit sketchy and at the very
>>> least you should add a comment explaining what's going on when the CMB
>>> is being used. If someone more familiar with the NVMe driver could
>>> chime in I would appreciate it.
>>
>> I may not be understanding the concern, but I'll give it a shot.
>>
>> You're right, the start of any SQE is always 64-byte aligned, so that
>> should satisfy alignment requirements.
>>
>> The order when writing multiple/successive SQEs in a submission queue
>> does matter, and this is currently serialized through the q_lock.
>>
>> The order in which the bytes of a single SQE is written doesn't really
>> matter as long as the entire SQE is written into the CMB prior to writing
>> that SQ's doorbell register.
>>
>> The doorbell register is written immediately after copying a command
>> entry into the submission queue (ignore "shadow buffer" features),
>> so the doorbells written to commands submitted is 1:1.
>>
>> If a CMB SQE and DB order is not enforced with the memcpy, then we do
>> need a barrier after the SQE's memcpy and before the doorbell's writel.
> 
> 
> Thanks for the information Keith.
> 
> Adding to this: regular memcpy generally also enforces alignment as unaligned access to regular memory is typically bad in some way on most arches. The generic memcpy_toio also does not have any barrier as it is just a call to memcpy. Arm64 also does not appear to have a barrier in its implementation and in the short survey I did I could not find any implementation with a barrier. I also did not find a ppc implementation in the tree but it would be weird for it to add a barrier when other arches do not appear to need it.
> 
> We've been operating on the assumption that memory mapped by devm_memremap_pages() can be treated as regular memory. This is emphasized by the fact that it does not return an __iomem pointer. If this assumption does not hold for an arch then we cannot support P2P DMA without an overhaul of many kernel interfaces or creating other backend interfaces into the drivers which take different data types (ie. we'd have to bypass the entire block layer when trying to write data in p2pmem to an nvme device. This is very undesirable.
> 

writel has a barrier inside on ARM64.

https://elixir.bootlin.com/linux/latest/source/arch/arm64/include/asm/io.h#L143

Why do you need another barrier?

From Documentation/memory-barriers.txt:

ACCESSING DEVICES
-----------------

Many devices can be memory mapped, and so appear to the CPU as if they're just
a set of memory locations.  To control such a device, the driver usually has to
make the right memory accesses in exactly the right order.

However, having a clever CPU or a clever compiler creates a potential problem
in that the carefully sequenced accesses in the driver code won't reach the
device in the requisite order if the CPU or the compiler thinks it is more
efficient to reorder, combine or merge accesses - something that would cause
the device to malfunction.

Inside of the Linux kernel, I/O should be done through the appropriate accessor
routines - such as inb() or writel() - which know how to make such accesses
appropriately sequential. 


> Logan
> 
> 
>
Logan Gunthorpe March 5, 2018, 6:09 p.m. UTC | #5
On 05/03/18 11:02 AM, Sinan Kaya wrote:
> writel has a barrier inside on ARM64.
> 
> https://elixir.bootlin.com/linux/latest/source/arch/arm64/include/asm/io.h#L143

Yes, and no barrier inside memcpy_toio as it uses __raw_writes. This 
should be sufficient as we are only accessing addresses that look like 
memory and have no side effects (those enabling doorbell accesses may 
need to worry about this though). Typically, what could happen, in this 
case, is the CPU would issue writes to the BAR normally and the next 
time it programmed the DMA engine it would flush everything via the 
flush in writel.

> Why do you need another barrier?

We don't.

Thanks,

Logan
Sagi Grimberg March 5, 2018, 7:57 p.m. UTC | #6
>>> -       if (nvmeq->sq_cmds_io)
>>> -               memcpy_toio(&nvmeq->sq_cmds_io[tail], cmd, sizeof(*cmd));
>>> -       else
>>> -               memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
>>> +       memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
>>
>> Hmm, how safe is replacing memcpy_toio() with regular memcpy()? On PPC
>> the _toio() variant enforces alignment, does the copy with 4 byte
>> stores, and has a full barrier after the copy. In comparison our
>> regular memcpy() does none of those things and may use unaligned and
>> vector load/stores. For normal (cacheable) memory that is perfectly
>> fine, but they can cause alignment faults when targeted at MMIO
>> (cache-inhibited) memory.
>>
>> I think in this particular case it might be ok since we know SQEs are
>> aligned to 64 byte boundaries and the copy is too small to use our
>> vectorised memcpy(). I'll assume we don't need explicit ordering
>> between writes of SQEs since the existing code doesn't seem to care
>> unless the doorbell is being rung, so you're probably fine there too.
>> That said, I still think this is a little bit sketchy and at the very
>> least you should add a comment explaining what's going on when the CMB
>> is being used. If someone more familiar with the NVMe driver could
>> chime in I would appreciate it.
> 
> I may not be understanding the concern, but I'll give it a shot.
> 
> You're right, the start of any SQE is always 64-byte aligned, so that
> should satisfy alignment requirements.
> 
> The order when writing multiple/successive SQEs in a submission queue
> does matter, and this is currently serialized through the q_lock.
> 
> The order in which the bytes of a single SQE is written doesn't really
> matter as long as the entire SQE is written into the CMB prior to writing
> that SQ's doorbell register.
> 
> The doorbell register is written immediately after copying a command
> entry into the submission queue (ignore "shadow buffer" features),
> so the doorbells written to commands submitted is 1:1.
> 
> If a CMB SQE and DB order is not enforced with the memcpy, then we do
> need a barrier after the SQE's memcpy and before the doorbell's writel.

Keith, while we're on this, regardless of cmb, is SQE memcopy and DB 
update ordering always guaranteed?

If you look at mlx4 (rdma device driver) that works exactly the same as
nvme you will find:
--
                 qp->sq.head += nreq;

                 /*
                  * Make sure that descriptors are written before
                  * doorbell record.
                  */
                 wmb();

                 writel(qp->doorbell_qpn,
                        to_mdev(ibqp->device)->uar_map + 
MLX4_SEND_DOORBELL);

                 /*
                  * Make sure doorbells don't leak out of SQ spinlock
                  * and reach the HCA out of order.
                  */
                 mmiowb();
--

That seems to explicitly make sure to place a barrier before updating
the doorbell. So as I see it, either ordering is guaranteed and the
above code is redundant, or nvme needs to do the same.

Thoughts?
Jason Gunthorpe March 5, 2018, 8:10 p.m. UTC | #7
On Mon, Mar 05, 2018 at 09:57:27PM +0200, Sagi Grimberg wrote:

> Keith, while we're on this, regardless of cmb, is SQE memcopy and DB update
> ordering always guaranteed?
> 
> If you look at mlx4 (rdma device driver) that works exactly the same as
> nvme you will find:
> --
>                 qp->sq.head += nreq;
> 
>                 /*
>                  * Make sure that descriptors are written before
>                  * doorbell record.
>                  */
>                 wmb();
> 
>                 writel(qp->doorbell_qpn,
>                        to_mdev(ibqp->device)->uar_map + MLX4_SEND_DOORBELL);
> 
>                 /*
>                  * Make sure doorbells don't leak out of SQ spinlock
>                  * and reach the HCA out of order.
>                  */
>                 mmiowb();
> --
> 
> That seems to explicitly make sure to place a barrier before updating
> the doorbell. So as I see it, either ordering is guaranteed and the
> above code is redundant, or nvme needs to do the same.

A wmb() is always required before operations that can trigger DMA.

The reason ARM has a barrier in writel() is not to make it ordered
with respect to CPU stores to cachable memory, but to make it ordered
with respect to *other* writels.

Eg Linux defines this:

writel(A, mem);
writel(B, mem);

To always produce two TLPs on PCI-E when mem is UC BAR memory.

And ARM cannot guarantee that without the extra barrier.

So now we see stuff like this:

writel_relaxed(A, mem);
writel_relaxed(B, mem+4);

Which says the TLPs to A and B can be issued in any order..

So when reading the above mlx code, we see the first wmb() being used
to ensure that CPU stores to cachable memory are visible to the DMA
triggered by the doorbell ring.

The mmiowb() is used to ensure that DB writes are not combined and not
issued in any order other than implied by the lock that encloses the
whole thing. This is needed because uar_map is WC memory.

We don't have ordering with respect to two writel's here, so if ARM
performance was a concern the writel could be switched to
writel_relaxed().
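
Putting the cases side by side, as a fragment (the names here are made
up for illustration and don't correspond to any real driver):

        /* Cacheable store that a later doorbell will cause the device
         * to DMA-read; the wmb() orders the store before the MMIO write. */
        desc->ready = 1;
        wmb();
        writel(tail, db);

        /* Two UC MMIO writes that must reach the device in order;
         * writel() guarantees this at the cost of extra barriers. */
        writel(A, regs + REG_A);
        writel(B, regs + REG_B);

        /* Relaxed variants: the two TLPs may be issued in either order. */
        writel_relaxed(A, regs + REG_A);
        writel_relaxed(B, regs + REG_B);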

Presumably nvme has similar requirements, although I guess the DB
register is mapped UC not WC?

Jason
Logan Gunthorpe March 5, 2018, 8:13 p.m. UTC | #8
On 05/03/18 12:57 PM, Sagi Grimberg wrote:
> Keith, while we're on this, regardless of cmb, is SQE memcopy and DB 
> update ordering always guaranteed?
> 
> If you look at mlx4 (rdma device driver) that works exactly the same as
> nvme you will find:
> -- 
>                  qp->sq.head += nreq;
> 
>                  /*
>                   * Make sure that descriptors are written before
>                   * doorbell record.
>                   */
>                  wmb();
> 
>                  writel(qp->doorbell_qpn,
>                         to_mdev(ibqp->device)->uar_map + 
> MLX4_SEND_DOORBELL);
> 
>                  /*
>                   * Make sure doorbells don't leak out of SQ spinlock
>                   * and reach the HCA out of order.
>                   */
>                  mmiowb();
> -- 

To me, it looks like the wmb() is redundant as writel should guarantee 
the order. (Indeed, per Sinan's comment, writel on arm64 starts with a 
wmb() which means, on that platform, there are two wmb() calls in a row.)

The mmiowb() call, on the other hand, looks correct per my understanding 
of its purpose with respect to the spinlock.

Logan
Logan Gunthorpe March 5, 2018, 8:16 p.m. UTC | #9
On 05/03/18 01:10 PM, Jason Gunthorpe wrote:
> So when reading the above mlx code, we see the first wmb() being used
> to ensure that CPU stores to cachable memory are visible to the DMA
> triggered by the doorbell ring.

Oh, yes, that makes sense. Disregard my previous email as I was wrong.

Logan
Keith Busch March 5, 2018, 8:42 p.m. UTC | #10
On Mon, Mar 05, 2018 at 01:10:53PM -0700, Jason Gunthorpe wrote:
> So when reading the above mlx code, we see the first wmb() being used
> to ensure that CPU stores to cachable memory are visible to the DMA
> triggered by the doorbell ring.

IIUC, we don't need a similar barrier for NVMe to ensure memory is
visible to DMA since the SQE memory is allocated DMA coherent when the
SQ is not within a CMB.
 
> The mmiowb() is used to ensure that DB writes are not combined and not
> issued in any order other than implied by the lock that encloses the
> whole thing. This is needed because uar_map is WC memory.
> 
> We don't have ordering with respect to two writel's here, so if ARM
> performance was a concern the writel could be switched to
> writel_relaxed().
> 
> Presumably nvme has similar requirments, although I guess the DB
> register is mapped UC not WC?

Yep, the NVMe DB register is required by the spec to be mapped UC.
Jason Gunthorpe March 5, 2018, 8:50 p.m. UTC | #11
On Mon, Mar 05, 2018 at 01:42:12PM -0700, Keith Busch wrote:
> On Mon, Mar 05, 2018 at 01:10:53PM -0700, Jason Gunthorpe wrote:
> > So when reading the above mlx code, we see the first wmb() being used
> > to ensure that CPU stores to cachable memory are visible to the DMA
> > triggered by the doorbell ring.
> 
> IIUC, we don't need a similar barrier for NVMe to ensure memory is
> visible to DMA since the SQE memory is allocated DMA coherent when the
> SQ is not within a CMB.

You still need it.

DMA coherent just means you don't need to call the DMA API after
writing, it says nothing about CPU ordering.

eg on x86 DMA coherent is just normal system memory, and you do need
the SFENCE betweeen system memory stores and DMA triggering MMIO,
apparently.

Jason
Oliver O'Halloran March 6, 2018, 12:49 a.m. UTC | #12
On Tue, Mar 6, 2018 at 4:10 AM, Logan Gunthorpe <logang@deltatee.com> wrote:
>
>
> On 05/03/18 09:00 AM, Keith Busch wrote:
>>
>> On Mon, Mar 05, 2018 at 12:33:29PM +1100, Oliver wrote:
>>>
>>> On Thu, Mar 1, 2018 at 10:40 AM, Logan Gunthorpe <logang@deltatee.com>
>>> wrote:
>>>>
>>>> @@ -429,10 +429,7 @@ static void __nvme_submit_cmd(struct nvme_queue
>>>> *nvmeq,
>>>>   {
>>>>          u16 tail = nvmeq->sq_tail;
>>>
>>>
>>>> -       if (nvmeq->sq_cmds_io)
>>>> -               memcpy_toio(&nvmeq->sq_cmds_io[tail], cmd,
>>>> sizeof(*cmd));
>>>> -       else
>>>> -               memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
>>>> +       memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
>>>
>>>
>>> Hmm, how safe is replacing memcpy_toio() with regular memcpy()? On PPC
>>> the _toio() variant enforces alignment, does the copy with 4 byte
>>> stores, and has a full barrier after the copy. In comparison our
>>> regular memcpy() does none of those things and may use unaligned and
>>> vector load/stores. For normal (cacheable) memory that is perfectly
>>> fine, but they can cause alignment faults when targeted at MMIO
>>> (cache-inhibited) memory.
>>>
>>> I think in this particular case it might be ok since we know SQEs are
>>> aligned to 64 byte boundaries and the copy is too small to use our
>>> vectorised memcpy(). I'll assume we don't need explicit ordering
>>> between writes of SQEs since the existing code doesn't seem to care
>>> unless the doorbell is being rung, so you're probably fine there too.
>>> That said, I still think this is a little bit sketchy and at the very
>>> least you should add a comment explaining what's going on when the CMB
>>> is being used. If someone more familiar with the NVMe driver could
>>> chime in I would appreciate it.
>>
>>
>> I may not be understanding the concern, but I'll give it a shot.
>>
>> You're right, the start of any SQE is always 64-byte aligned, so that
>> should satisfy alignment requirements.
>>
>> The order when writing multiple/successive SQEs in a submission queue
>> does matter, and this is currently serialized through the q_lock.
>>
>> The order in which the bytes of a single SQE is written doesn't really
>> matter as long as the entire SQE is written into the CMB prior to writing
>> that SQ's doorbell register.
>>
>> The doorbell register is written immediately after copying a command
>> entry into the submission queue (ignore "shadow buffer" features),
>> so the doorbells written to commands submitted is 1:1.
>>
>> If a CMB SQE and DB order is not enforced with the memcpy, then we do
>> need a barrier after the SQE's memcpy and before the doorbell's writel.
>
>
>
> Thanks for the information Keith.
>
> Adding to this: regular memcpy generally also enforces alignment as
> unaligned access to regular memory is typically bad in some way on most
> arches. The generic memcpy_toio also does not have any barrier as it is just
> a call to memcpy. Arm64 also does not appear to have a barrier in its
> implementation and in the short survey I did I could not find any
> implementation with a barrier. I also did not find a ppc implementation in
> the tree but it would be weird for it to add a barrier when other arches do
> not appear to need it.

It's in arch/powerpc/kernel/io.c as _memcpy_toio() and it has two full barriers!

Awesome!

Our io.h indicates that our iomem accessors are designed to provide x86ish
strong ordering of accesses to MMIO space. The git log indicates
arch/powerpc/kernel/io.c has barely been touched in the last decade so
odds are most of that code was written in the elder days when people
were less aware of ordering issues. It might just be overly conservative
by today's standards, but maybe not (see below).

> We've been operating on the assumption that memory mapped by
> devm_memremap_pages() can be treated as regular memory. This is emphasized
> by the fact that it does not return an __iomem pointer.

I think the lack of an __iomem annotation is a bit of a red herring. The comment
header for devm_memremap_pages() outlines the conditions for when using
it is valid, the important one being:

> * 4/ res is expected to be a host memory range that could feasibly be
> *    treated as a "System RAM" range, i.e. not a device mmio range, but
> *    this is not enforced.

Granted, the CMB is a MMIO buffer rather than a register range, but from
my perspective it's still a device MMIO range rather than something we
could feasibly treat as system RAM. The two big reasons being:

a) System RAM is cacheable and coherent while the CMB is neither, and
b) On PPC MMIO accesses are in a separate ordering domain to cacheable
   memory accesses.

The latter isn't as terrifying as it sounds since a full barrier will
order everything with everything, but it still causes problems. For
performance reasons we want to minimise the number of cases where we
need to use a sync instruction (full barrier) in favour of a
lightweight sync (lwsync) which only orders cacheable accesses. To
implement that our MMIO accessors set a percpu flag that
arch_spin_unlock() checks to see if any MMIO accesses have occurred
which would require a full barrier. If we haven't done any MMIO
operations then it'll only do a lwsync. My concern here is that a
driver might do something like this:

void *buffer = devm_memremap_pages(<stuff corresponding to card memory buffer>);

spin_lock();
<do some stuff with system memory>
// buffer is in MMIO space, so the writes are uncached
memcpy(buffer, source_data, <size>);
<do more stuff with system memory>
spin_unlock(); // no MMIO access detected, so it does a lwsync

As far as I know the spinlock API guarantees that accesses occurring
inside the critical section will be ordered relative to what happens
outside the critical section. In this case we would be violating that
guarantee and I don't see a clean way to keep the fix for it contained
to arch/powerpc/. Fundamentally we need to know when something is
accessing MMIO space.

(I'm not going to suggest ditching the lwsync trick. mpe is not going
to take that patch without a really good reason)

>If this assumption
> does not hold for an arch then we cannot support P2P DMA without an overhaul
> of many kernel interfaces or creating other backend interfaces into the
> drivers which take different data types (ie. we'd have to bypass the entire
> block layer when trying to write data in p2pmem to an nvme device. This is
> very undesirable.

I know. I'm not trying to ruin your day, but the separation between io
and normal memory is there for a reason. Maybe we can relax that
separation, but we need to be careful about how we do it.

Oliver
Logan Gunthorpe March 6, 2018, 1:14 a.m. UTC | #13
On 05/03/18 05:49 PM, Oliver wrote:
> It's in arch/powerpc/kernel/io.c as _memcpy_toio() and it has two full barriers!
> 
> Awesome!
> 
> Our io.h indicates that our iomem accessors are designed to provide x86ish
> strong ordering of accesses to MMIO space. The git log indicates
> arch/powerpc/kernel/io.c has barely been touched in the last decade so
> odds are most of that code was written in the elder days when people
> were less aware of ordering issues. It might just be overly conservative
> by today's standards, but maybe not (see below).

Yes, that seems overly conservative.

> (I'm not going to suggest ditching the lwsync trick. mpe is not going
> to take that patch
> without a really good reason)

Well, that's pretty gross. Is this not exactly the situation mmiowb() is 
meant to solve? See [1].
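
That is, the documented pattern (a sketch of the example in [1]; the
lock and register names are placeholders):

        spin_lock(&q->lock);
        writel(tail, q->db);    /* MMIO inside the critical section */
        mmiowb();               /* keep it from leaking past the unlock on
                                 * architectures that would otherwise allow it */
        spin_unlock(&q->lock);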

Though, you're right in principle. Even if power was similar to other 
systems in this way, it's still a risk that if these pages get passed 
somewhere in the kernel that uses a spin lock like that without an 
mmiowb() call, then it's going to have a bug. For now, the risk is 
pretty low as we know exactly where all the p2pmem pages will be used 
but if it gets into other places, all bets are off. I did do some work 
trying to make a safe version of io-pages and also trying to change from 
pages to pfn_t in large areas but neither approach seemed likely to get 
any traction in the community, at least not in the near term.

Logan

[1] ACQUIRES VS I/O ACCESSES in 
https://www.kernel.org/doc/Documentation/memory-barriers.txt
Oliver O'Halloran March 6, 2018, 10:40 a.m. UTC | #14
On Tue, Mar 6, 2018 at 12:14 PM, Logan Gunthorpe <logang@deltatee.com> wrote:
>
> On 05/03/18 05:49 PM, Oliver wrote:
>>
>> It's in arch/powerpc/kernel/io.c as _memcpy_toio() and it has two full
>> barriers!
>>
>> Awesome!
>>
>> Our io.h indicates that our iomem accessors are designed to provide x86ish
>> strong ordering of accesses to MMIO space. The git log indicates
>> arch/powerpc/kernel/io.c has barely been touched in the last decade so
>> odds are most of that code was written in the elder days when people
>> were less aware of ordering issues. It might just be overly conservative
>> by today's standards, but maybe not (see below).
>
>
> Yes, that seems overly conservative.
>
>> (I'm not going to suggest ditching the lwsync trick. mpe is not going
>> to take that patch
>> without a really good reason)
>
>
> Well, that's pretty gross. Is this not exactly the situation mmiowb() is
> meant to solve? See [1].

Yep, mmiowb() is supposed to be used in this situation. According to BenH,
author of that io_sync hack, we implement the stronger semantics
so that we don't break existing drivers that assume spin_unlock() does
order i/o even though it's not supposed to. At a guess the x86 version of
spin_unlock() happens to do that so the rest of us need to either live
with it or fix all the bugs :)

> Though, you're right in principle. Even if power was similar to other
> systems in this way, it's still a risk that if these pages get passed
> somewhere in the kernel that uses a spin lock like that without an mmiowb()
> call, then it's going to have a bug. For now, the risk is pretty low as we
> know exactly where all the p2pmem pages will be used but if it gets into
> other places, all bets are off.

Yeah this was my main concern with the whole approach. For ioremap()ed
memory we have the __iomem annotation to help with tracking when we
need to be careful, but we'd lose that entirely here.

> I did do some work trying to make a safe
> version of io-pages and also trying to change from pages to pfn_t in large
> areas but neither approach seemed likely to get any traction in the
> community, at least not in the near term.

It's a tricky problem. HMM with DEVICE_PRIVATE might be more
palatable than the pfn_t conversion since there would still be struct pages
backing the device memory. That said, there are probably other issues with
device private memory not being part of the linear mapping, but HMM
provides some assistance there.

Oliver

Patch

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 73036d2fbbd5..56ca79be8476 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -29,6 +29,7 @@ 
 #include <linux/types.h>
 #include <linux/io-64-nonatomic-lo-hi.h>
 #include <linux/sed-opal.h>
+#include <linux/pci-p2pdma.h>
 
 #include "nvme.h"
 
@@ -91,9 +92,8 @@  struct nvme_dev {
 	struct work_struct remove_work;
 	struct mutex shutdown_lock;
 	bool subsystem;
-	void __iomem *cmb;
-	pci_bus_addr_t cmb_bus_addr;
 	u64 cmb_size;
+	bool cmb_use_sqes;
 	u32 cmbsz;
 	u32 cmbloc;
 	struct nvme_ctrl ctrl;
@@ -148,7 +148,7 @@  struct nvme_queue {
 	struct nvme_dev *dev;
 	spinlock_t q_lock;
 	struct nvme_command *sq_cmds;
-	struct nvme_command __iomem *sq_cmds_io;
+	bool sq_cmds_is_io;
 	volatile struct nvme_completion *cqes;
 	struct blk_mq_tags **tags;
 	dma_addr_t sq_dma_addr;
@@ -429,10 +429,7 @@  static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
 {
 	u16 tail = nvmeq->sq_tail;
 
-	if (nvmeq->sq_cmds_io)
-		memcpy_toio(&nvmeq->sq_cmds_io[tail], cmd, sizeof(*cmd));
-	else
-		memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
+	memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
 
 	if (++tail == nvmeq->q_depth)
 		tail = 0;
@@ -1286,9 +1283,18 @@  static void nvme_free_queue(struct nvme_queue *nvmeq)
 {
 	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
 				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
-	if (nvmeq->sq_cmds)
-		dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
-					nvmeq->sq_cmds, nvmeq->sq_dma_addr);
+
+	if (nvmeq->sq_cmds) {
+		if (nvmeq->sq_cmds_is_io)
+			pci_free_p2pmem(to_pci_dev(nvmeq->q_dmadev),
+					nvmeq->sq_cmds,
+					SQ_SIZE(nvmeq->q_depth));
+		else
+			dma_free_coherent(nvmeq->q_dmadev,
+					  SQ_SIZE(nvmeq->q_depth),
+					  nvmeq->sq_cmds,
+					  nvmeq->sq_dma_addr);
+	}
 }
 
 static void nvme_free_queues(struct nvme_dev *dev, int lowest)
@@ -1368,12 +1374,21 @@  static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
 static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
 				int qid, int depth)
 {
-	/* CMB SQEs will be mapped before creation */
-	if (qid && dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS))
-		return 0;
+	struct pci_dev *pdev = to_pci_dev(dev->dev);
+
+	if (qid && dev->cmb_use_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) {
+		nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(depth));
+		nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev,
+						nvmeq->sq_cmds);
+		nvmeq->sq_cmds_is_io = true;
+	}
+
+	if (!nvmeq->sq_cmds) {
+		nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
+					&nvmeq->sq_dma_addr, GFP_KERNEL);
+		nvmeq->sq_cmds_is_io = false;
+	}
 
-	nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
-					    &nvmeq->sq_dma_addr, GFP_KERNEL);
 	if (!nvmeq->sq_cmds)
 		return -ENOMEM;
 	return 0;
@@ -1449,13 +1464,6 @@  static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
 	struct nvme_dev *dev = nvmeq->dev;
 	int result;
 
-	if (dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) {
-		unsigned offset = (qid - 1) * roundup(SQ_SIZE(nvmeq->q_depth),
-						      dev->ctrl.page_size);
-		nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset;
-		nvmeq->sq_cmds_io = dev->cmb + offset;
-	}
-
 	nvmeq->cq_vector = qid - 1;
 	result = adapter_alloc_cq(dev, qid, nvmeq);
 	if (result < 0)
@@ -1685,9 +1693,6 @@  static void nvme_map_cmb(struct nvme_dev *dev)
 		return;
 	dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC);
 
-	if (!use_cmb_sqes)
-		return;
-
 	size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev);
 	offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc);
 	bar = NVME_CMB_BIR(dev->cmbloc);
@@ -1704,11 +1709,15 @@  static void nvme_map_cmb(struct nvme_dev *dev)
 	if (size > bar_size - offset)
 		size = bar_size - offset;
 
-	dev->cmb = ioremap_wc(pci_resource_start(pdev, bar) + offset, size);
-	if (!dev->cmb)
+	if (pci_p2pdma_add_resource(pdev, bar, size, offset))
 		return;
-	dev->cmb_bus_addr = pci_bus_address(pdev, bar) + offset;
+
 	dev->cmb_size = size;
+	dev->cmb_use_sqes = use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS);
+
+	if ((dev->cmbsz & (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) ==
+			(NVME_CMBSZ_WDS | NVME_CMBSZ_RDS))
+		pci_p2pmem_publish(pdev, true);
 
 	if (sysfs_add_file_to_group(&dev->ctrl.device->kobj,
 				    &dev_attr_cmb.attr, NULL))
@@ -1718,12 +1727,10 @@  static void nvme_map_cmb(struct nvme_dev *dev)
 
 static inline void nvme_release_cmb(struct nvme_dev *dev)
 {
-	if (dev->cmb) {
-		iounmap(dev->cmb);
-		dev->cmb = NULL;
+	if (dev->cmb_size) {
 		sysfs_remove_file_from_group(&dev->ctrl.device->kobj,
 					     &dev_attr_cmb.attr, NULL);
-		dev->cmbsz = 0;
+		dev->cmb_size = 0;
 	}
 }
 
@@ -1918,13 +1925,13 @@  static int nvme_setup_io_queues(struct nvme_dev *dev)
 	if (nr_io_queues == 0)
 		return 0;
 
-	if (dev->cmb && (dev->cmbsz & NVME_CMBSZ_SQS)) {
+	if (dev->cmb_use_sqes) {
 		result = nvme_cmb_qdepth(dev, nr_io_queues,
 				sizeof(struct nvme_command));
 		if (result > 0)
 			dev->q_depth = result;
 		else
-			nvme_release_cmb(dev);
+			dev->cmb_use_sqes = false;
 	}
 
 	do {