diff mbox series

[2/6] block/nvme: Map doorbells pages write-only

Message ID 20200921162949.553863-3-philmd@redhat.com (mailing list archive)
State New, archived
Headers show
Series block/nvme: Map doorbells pages write-only, remove magic from nvme_init | expand

Commit Message

Philippe Mathieu-Daudé Sept. 21, 2020, 4:29 p.m. UTC
Per the datasheet sections 3.1.13/3.1.14:
  "The host should not read the doorbell registers."

As we don't need read access, map the doorbells with write-only
permission. We keep a reference to this mapped address in the
BDRVNVMeState structure.

Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
---
 block/nvme.c | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

Comments

Fam Sept. 22, 2020, 8:18 a.m. UTC | #1
On Mon, 2020-09-21 at 18:29 +0200, Philippe Mathieu-Daudé wrote:
> Per the datasheet sections 3.1.13/3.1.14:
>   "The host should not read the doorbell registers."
> 
> As we don't need read access, map the doorbells with write-only
> permission. We keep a reference to this mapped address in the
> BDRVNVMeState structure.

Besides looking more correct in access mode, is there any side effect
of WO mapping?

Fam

> 
> Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
> ---
>  block/nvme.c | 29 +++++++++++++++++++----------
>  1 file changed, 19 insertions(+), 10 deletions(-)
> 
> diff --git a/block/nvme.c b/block/nvme.c
> index 5a4dc6a722a..3c834da8fec 100644
> --- a/block/nvme.c
> +++ b/block/nvme.c
> @@ -31,7 +31,7 @@
>  #define NVME_SQ_ENTRY_BYTES 64
>  #define NVME_CQ_ENTRY_BYTES 16
>  #define NVME_QUEUE_SIZE 128
> -#define NVME_BAR_SIZE 8192
> +#define NVME_DOORBELL_SIZE 4096
>  
>  /*
>   * We have to leave one slot empty as that is the full queue case
> where
> @@ -84,10 +84,6 @@ typedef struct {
>  /* Memory mapped registers */
>  typedef volatile struct {
>      NvmeBar ctrl;
> -    struct {
> -        uint32_t sq_tail;
> -        uint32_t cq_head;
> -    } doorbells[];
>  } NVMeRegs;
>  
>  #define INDEX_ADMIN     0
> @@ -103,6 +99,11 @@ struct BDRVNVMeState {
>      AioContext *aio_context;
>      QEMUVFIOState *vfio;
>      NVMeRegs *regs;
> +    /* Memory mapped registers */
> +    volatile struct {
> +        uint32_t sq_tail;
> +        uint32_t cq_head;
> +    } *doorbells;
>      /* The submission/completion queue pairs.
>       * [0]: admin queue.
>       * [1..]: io queues.
> @@ -247,14 +248,14 @@ static NVMeQueuePair
> *nvme_create_queue_pair(BDRVNVMeState *s,
>          error_propagate(errp, local_err);
>          goto fail;
>      }
> -    q->sq.doorbell = &s->regs->doorbells[idx * s-
> >doorbell_scale].sq_tail;
> +    q->sq.doorbell = &s->doorbells[idx * s->doorbell_scale].sq_tail;
>  
>      nvme_init_queue(s, &q->cq, size, NVME_CQ_ENTRY_BYTES,
> &local_err);
>      if (local_err) {
>          error_propagate(errp, local_err);
>          goto fail;
>      }
> -    q->cq.doorbell = &s->regs->doorbells[idx * s-
> >doorbell_scale].cq_head;
> +    q->cq.doorbell = &s->doorbells[idx * s->doorbell_scale].cq_head;
>  
>      return q;
>  fail:
> @@ -712,13 +713,12 @@ static int nvme_init(BlockDriverState *bs,
> const char *device, int namespace,
>          goto out;
>      }
>  
> -    s->regs = qemu_vfio_pci_map_bar(s->vfio, 0, 0, NVME_BAR_SIZE,
> +    s->regs = qemu_vfio_pci_map_bar(s->vfio, 0, 0, sizeof(NvmeBar),
>                                      PROT_READ | PROT_WRITE, errp);
>      if (!s->regs) {
>          ret = -EINVAL;
>          goto out;
>      }
> -
>      /* Perform initialize sequence as described in NVMe spec "7.6.1
>       * Initialization". */
>  
> @@ -748,6 +748,13 @@ static int nvme_init(BlockDriverState *bs, const
> char *device, int namespace,
>          }
>      }
>  
> +    s->doorbells = qemu_vfio_pci_map_bar(s->vfio, 0,
> sizeof(NvmeBar),
> +                                         NVME_DOORBELL_SIZE,
> PROT_WRITE, errp);
> +    if (!s->doorbells) {
> +        ret = -EINVAL;
> +        goto out;
> +    }
> +
>      /* Set up admin queue. */
>      s->queues = g_new(NVMeQueuePair *, 1);
>      s->queues[INDEX_ADMIN] = nvme_create_queue_pair(s, aio_context,
> 0,
> @@ -873,7 +880,9 @@ static void nvme_close(BlockDriverState *bs)
>                             &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
>                             false, NULL, NULL);
>      event_notifier_cleanup(&s->irq_notifier[MSIX_SHARED_IRQ_IDX]);
> -    qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->regs, 0,
> NVME_BAR_SIZE);
> +    qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->doorbells,
> +                            sizeof(NvmeBar), NVME_DOORBELL_SIZE);
> +    qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->regs, 0,
> sizeof(NvmeBar));
>      qemu_vfio_close(s->vfio);
>  
>      g_free(s->device);
Philippe Mathieu-Daudé Sept. 22, 2020, 8:41 a.m. UTC | #2
Hi Fam,

+Paolo?

On 9/22/20 10:18 AM, Fam Zheng wrote:
> On Mon, 2020-09-21 at 18:29 +0200, Philippe Mathieu-Daudé wrote:
>> Per the datasheet sections 3.1.13/3.1.14:
>>   "The host should not read the doorbell registers."
>>
>> As we don't need read access, map the doorbells with write-only
>> permission. We keep a reference to this mapped address in the
>> BDRVNVMeState structure.
> 
> Besides looking more correct in access mode, is there any side effect
> of WO mapping?

TBH I don't have enough knowledge to answer this question.
I tested successfully on X86. I'm writing more tests.

> 
> Fam
> 
>>
>> Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
>> ---
>>  block/nvme.c | 29 +++++++++++++++++++----------
>>  1 file changed, 19 insertions(+), 10 deletions(-)
>>
>> diff --git a/block/nvme.c b/block/nvme.c
>> index 5a4dc6a722a..3c834da8fec 100644
>> --- a/block/nvme.c
>> +++ b/block/nvme.c
>> @@ -31,7 +31,7 @@
>>  #define NVME_SQ_ENTRY_BYTES 64
>>  #define NVME_CQ_ENTRY_BYTES 16
>>  #define NVME_QUEUE_SIZE 128
>> -#define NVME_BAR_SIZE 8192
>> +#define NVME_DOORBELL_SIZE 4096
>>  
>>  /*
>>   * We have to leave one slot empty as that is the full queue case
>> where
>> @@ -84,10 +84,6 @@ typedef struct {
>>  /* Memory mapped registers */
>>  typedef volatile struct {
>>      NvmeBar ctrl;
>> -    struct {
>> -        uint32_t sq_tail;
>> -        uint32_t cq_head;
>> -    } doorbells[];
>>  } NVMeRegs;
>>  
>>  #define INDEX_ADMIN     0
>> @@ -103,6 +99,11 @@ struct BDRVNVMeState {
>>      AioContext *aio_context;
>>      QEMUVFIOState *vfio;
>>      NVMeRegs *regs;
>> +    /* Memory mapped registers */
>> +    volatile struct {
>> +        uint32_t sq_tail;
>> +        uint32_t cq_head;
>> +    } *doorbells;
>>      /* The submission/completion queue pairs.
>>       * [0]: admin queue.
>>       * [1..]: io queues.
>> @@ -247,14 +248,14 @@ static NVMeQueuePair
>> *nvme_create_queue_pair(BDRVNVMeState *s,
>>          error_propagate(errp, local_err);
>>          goto fail;
>>      }
>> -    q->sq.doorbell = &s->regs->doorbells[idx * s-
>>> doorbell_scale].sq_tail;
>> +    q->sq.doorbell = &s->doorbells[idx * s->doorbell_scale].sq_tail;
>>  
>>      nvme_init_queue(s, &q->cq, size, NVME_CQ_ENTRY_BYTES,
>> &local_err);
>>      if (local_err) {
>>          error_propagate(errp, local_err);
>>          goto fail;
>>      }
>> -    q->cq.doorbell = &s->regs->doorbells[idx * s-
>>> doorbell_scale].cq_head;
>> +    q->cq.doorbell = &s->doorbells[idx * s->doorbell_scale].cq_head;
>>  
>>      return q;
>>  fail:
>> @@ -712,13 +713,12 @@ static int nvme_init(BlockDriverState *bs,
>> const char *device, int namespace,
>>          goto out;
>>      }
>>  
>> -    s->regs = qemu_vfio_pci_map_bar(s->vfio, 0, 0, NVME_BAR_SIZE,
>> +    s->regs = qemu_vfio_pci_map_bar(s->vfio, 0, 0, sizeof(NvmeBar),
>>                                      PROT_READ | PROT_WRITE, errp);
>>      if (!s->regs) {
>>          ret = -EINVAL;
>>          goto out;
>>      }
>> -
>>      /* Perform initialize sequence as described in NVMe spec "7.6.1
>>       * Initialization". */
>>  
>> @@ -748,6 +748,13 @@ static int nvme_init(BlockDriverState *bs, const
>> char *device, int namespace,
>>          }
>>      }
>>  
>> +    s->doorbells = qemu_vfio_pci_map_bar(s->vfio, 0,
>> sizeof(NvmeBar),
>> +                                         NVME_DOORBELL_SIZE,
>> PROT_WRITE, errp);
>> +    if (!s->doorbells) {
>> +        ret = -EINVAL;
>> +        goto out;
>> +    }
>> +
>>      /* Set up admin queue. */
>>      s->queues = g_new(NVMeQueuePair *, 1);
>>      s->queues[INDEX_ADMIN] = nvme_create_queue_pair(s, aio_context,
>> 0,
>> @@ -873,7 +880,9 @@ static void nvme_close(BlockDriverState *bs)
>>                             &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
>>                             false, NULL, NULL);
>>      event_notifier_cleanup(&s->irq_notifier[MSIX_SHARED_IRQ_IDX]);
>> -    qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->regs, 0,
>> NVME_BAR_SIZE);
>> +    qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->doorbells,
>> +                            sizeof(NvmeBar), NVME_DOORBELL_SIZE);
>> +    qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->regs, 0,
>> sizeof(NvmeBar));
>>      qemu_vfio_close(s->vfio);
>>  
>>      g_free(s->device);
>
Paolo Bonzini Sept. 22, 2020, 9:04 a.m. UTC | #3
On 22/09/20 10:41, Philippe Mathieu-Daudé wrote:
>> Besides looking more correct in access mode, is there any side effect
>> of WO mapping?
> TBH I don't have enough knowledge to answer this question.
> I tested successfully on X86. I'm writing more tests.

No problem with doing this, but PROT_WRITE does not work at all on x86.
:)  PROT_EXEC works if you have a machine with PKRU, but PROT_WRITE
silently becomes PROT_READ|PROT_WRITE because the processor does not
support it.

Paolo
Fam Sept. 22, 2020, 9:33 a.m. UTC | #4
On Tue, 2020-09-22 at 10:41 +0200, Philippe Mathieu-Daudé wrote:
> Hi Fam,
> 
> +Paolo?
> 
> On 9/22/20 10:18 AM, Fam Zheng wrote:
> > On Mon, 2020-09-21 at 18:29 +0200, Philippe Mathieu-Daudé wrote:
> > > Per the datasheet sections 3.1.13/3.1.14:
> > >   "The host should not read the doorbell registers."
> > > 
> > > As we don't need read access, map the doorbells with write-only
> > > permission. We keep a reference to this mapped address in the
> > > BDRVNVMeState structure.
> > 
> > Besides looking more correct in access mode, is there any side
> > effect
> > of WO mapping?
> 
> TBH I don't have enough knowledge to answer this question.
> I tested successfully on X86. I'm writing more tests.

The reason I'm asking is more because x86 pages are either RO or RW. So
I'd like to see if there's a practical reason behind this patch (I have
no idea about the effects on MTRR and/or IOMMU).

Fam

> 
> > 
> > Fam
> > 
> > > 
> > > Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
> > > ---
> > >  block/nvme.c | 29 +++++++++++++++++++----------
> > >  1 file changed, 19 insertions(+), 10 deletions(-)
> > > 
> > > diff --git a/block/nvme.c b/block/nvme.c
> > > index 5a4dc6a722a..3c834da8fec 100644
> > > --- a/block/nvme.c
> > > +++ b/block/nvme.c
> > > @@ -31,7 +31,7 @@
> > >  #define NVME_SQ_ENTRY_BYTES 64
> > >  #define NVME_CQ_ENTRY_BYTES 16
> > >  #define NVME_QUEUE_SIZE 128
> > > -#define NVME_BAR_SIZE 8192
> > > +#define NVME_DOORBELL_SIZE 4096
> > >  
> > >  /*
> > >   * We have to leave one slot empty as that is the full queue
> > > case
> > > where
> > > @@ -84,10 +84,6 @@ typedef struct {
> > >  /* Memory mapped registers */
> > >  typedef volatile struct {
> > >      NvmeBar ctrl;
> > > -    struct {
> > > -        uint32_t sq_tail;
> > > -        uint32_t cq_head;
> > > -    } doorbells[];
> > >  } NVMeRegs;
> > >  
> > >  #define INDEX_ADMIN     0
> > > @@ -103,6 +99,11 @@ struct BDRVNVMeState {
> > >      AioContext *aio_context;
> > >      QEMUVFIOState *vfio;
> > >      NVMeRegs *regs;
> > > +    /* Memory mapped registers */
> > > +    volatile struct {
> > > +        uint32_t sq_tail;
> > > +        uint32_t cq_head;
> > > +    } *doorbells;
> > >      /* The submission/completion queue pairs.
> > >       * [0]: admin queue.
> > >       * [1..]: io queues.
> > > @@ -247,14 +248,14 @@ static NVMeQueuePair
> > > *nvme_create_queue_pair(BDRVNVMeState *s,
> > >          error_propagate(errp, local_err);
> > >          goto fail;
> > >      }
> > > -    q->sq.doorbell = &s->regs->doorbells[idx * s-
> > > > doorbell_scale].sq_tail;
> > > 
> > > +    q->sq.doorbell = &s->doorbells[idx * s-
> > > >doorbell_scale].sq_tail;
> > >  
> > >      nvme_init_queue(s, &q->cq, size, NVME_CQ_ENTRY_BYTES,
> > > &local_err);
> > >      if (local_err) {
> > >          error_propagate(errp, local_err);
> > >          goto fail;
> > >      }
> > > -    q->cq.doorbell = &s->regs->doorbells[idx * s-
> > > > doorbell_scale].cq_head;
> > > 
> > > +    q->cq.doorbell = &s->doorbells[idx * s-
> > > >doorbell_scale].cq_head;
> > >  
> > >      return q;
> > >  fail:
> > > @@ -712,13 +713,12 @@ static int nvme_init(BlockDriverState *bs,
> > > const char *device, int namespace,
> > >          goto out;
> > >      }
> > >  
> > > -    s->regs = qemu_vfio_pci_map_bar(s->vfio, 0, 0,
> > > NVME_BAR_SIZE,
> > > +    s->regs = qemu_vfio_pci_map_bar(s->vfio, 0, 0,
> > > sizeof(NvmeBar),
> > >                                      PROT_READ | PROT_WRITE,
> > > errp);
> > >      if (!s->regs) {
> > >          ret = -EINVAL;
> > >          goto out;
> > >      }
> > > -
> > >      /* Perform initialize sequence as described in NVMe spec
> > > "7.6.1
> > >       * Initialization". */
> > >  
> > > @@ -748,6 +748,13 @@ static int nvme_init(BlockDriverState *bs,
> > > const
> > > char *device, int namespace,
> > >          }
> > >      }
> > >  
> > > +    s->doorbells = qemu_vfio_pci_map_bar(s->vfio, 0,
> > > sizeof(NvmeBar),
> > > +                                         NVME_DOORBELL_SIZE,
> > > PROT_WRITE, errp);
> > > +    if (!s->doorbells) {
> > > +        ret = -EINVAL;
> > > +        goto out;
> > > +    }
> > > +
> > >      /* Set up admin queue. */
> > >      s->queues = g_new(NVMeQueuePair *, 1);
> > >      s->queues[INDEX_ADMIN] = nvme_create_queue_pair(s,
> > > aio_context,
> > > 0,
> > > @@ -873,7 +880,9 @@ static void nvme_close(BlockDriverState *bs)
> > >                             &s-
> > > >irq_notifier[MSIX_SHARED_IRQ_IDX],
> > >                             false, NULL, NULL);
> > >      event_notifier_cleanup(&s-
> > > >irq_notifier[MSIX_SHARED_IRQ_IDX]);
> > > -    qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->regs, 0,
> > > NVME_BAR_SIZE);
> > > +    qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->doorbells,
> > > +                            sizeof(NvmeBar),
> > > NVME_DOORBELL_SIZE);
> > > +    qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->regs, 0,
> > > sizeof(NvmeBar));
> > >      qemu_vfio_close(s->vfio);
> > >  
> > >      g_free(s->device);
> 
>
Philippe Mathieu-Daudé Sept. 22, 2020, 9:36 a.m. UTC | #5
On 9/22/20 11:04 AM, Paolo Bonzini wrote:
> On 22/09/20 10:41, Philippe Mathieu-Daudé wrote:
>>> Besides looking more correct in access mode, is there any side effect
>>> of WO mapping?
>> TBH I don't have enough knowledge to answer this question.
>> I tested successfully on X86. I'm writing more tests.
> 
> No problem with doing this, but PROT_WRITE does not work at all on x86.
> :)  PROT_EXEC works if you have a machine with PKRU, but PROT_WRITE
> silently becomes PROT_READ|PROT_WRITE because the processor does not
> support it.

Ah this is why it works the same way in my testing.

I'll run tests on ARM.

Thanks,

Phil.

> 
> Paolo
>
diff mbox series

Patch

diff --git a/block/nvme.c b/block/nvme.c
index 5a4dc6a722a..3c834da8fec 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -31,7 +31,7 @@ 
 #define NVME_SQ_ENTRY_BYTES 64
 #define NVME_CQ_ENTRY_BYTES 16
 #define NVME_QUEUE_SIZE 128
-#define NVME_BAR_SIZE 8192
+#define NVME_DOORBELL_SIZE 4096
 
 /*
  * We have to leave one slot empty as that is the full queue case where
@@ -84,10 +84,6 @@  typedef struct {
 /* Memory mapped registers */
 typedef volatile struct {
     NvmeBar ctrl;
-    struct {
-        uint32_t sq_tail;
-        uint32_t cq_head;
-    } doorbells[];
 } NVMeRegs;
 
 #define INDEX_ADMIN     0
@@ -103,6 +99,11 @@  struct BDRVNVMeState {
     AioContext *aio_context;
     QEMUVFIOState *vfio;
     NVMeRegs *regs;
+    /* Memory mapped registers */
+    volatile struct {
+        uint32_t sq_tail;
+        uint32_t cq_head;
+    } *doorbells;
     /* The submission/completion queue pairs.
      * [0]: admin queue.
      * [1..]: io queues.
@@ -247,14 +248,14 @@  static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s,
         error_propagate(errp, local_err);
         goto fail;
     }
-    q->sq.doorbell = &s->regs->doorbells[idx * s->doorbell_scale].sq_tail;
+    q->sq.doorbell = &s->doorbells[idx * s->doorbell_scale].sq_tail;
 
     nvme_init_queue(s, &q->cq, size, NVME_CQ_ENTRY_BYTES, &local_err);
     if (local_err) {
         error_propagate(errp, local_err);
         goto fail;
     }
-    q->cq.doorbell = &s->regs->doorbells[idx * s->doorbell_scale].cq_head;
+    q->cq.doorbell = &s->doorbells[idx * s->doorbell_scale].cq_head;
 
     return q;
 fail:
@@ -712,13 +713,12 @@  static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
         goto out;
     }
 
-    s->regs = qemu_vfio_pci_map_bar(s->vfio, 0, 0, NVME_BAR_SIZE,
+    s->regs = qemu_vfio_pci_map_bar(s->vfio, 0, 0, sizeof(NvmeBar),
                                     PROT_READ | PROT_WRITE, errp);
     if (!s->regs) {
         ret = -EINVAL;
         goto out;
     }
-
     /* Perform initialize sequence as described in NVMe spec "7.6.1
      * Initialization". */
 
@@ -748,6 +748,13 @@  static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
         }
     }
 
+    s->doorbells = qemu_vfio_pci_map_bar(s->vfio, 0, sizeof(NvmeBar),
+                                         NVME_DOORBELL_SIZE, PROT_WRITE, errp);
+    if (!s->doorbells) {
+        ret = -EINVAL;
+        goto out;
+    }
+
     /* Set up admin queue. */
     s->queues = g_new(NVMeQueuePair *, 1);
     s->queues[INDEX_ADMIN] = nvme_create_queue_pair(s, aio_context, 0,
@@ -873,7 +880,9 @@  static void nvme_close(BlockDriverState *bs)
                            &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
                            false, NULL, NULL);
     event_notifier_cleanup(&s->irq_notifier[MSIX_SHARED_IRQ_IDX]);
-    qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->regs, 0, NVME_BAR_SIZE);
+    qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->doorbells,
+                            sizeof(NvmeBar), NVME_DOORBELL_SIZE);
+    qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->regs, 0, sizeof(NvmeBar));
     qemu_vfio_close(s->vfio);
 
     g_free(s->device);