diff mbox series

[vhost,v6,02/10] virtio_ring: packed: remove double check of the unmap ops

Message ID 20240327111430.108787-3-xuanzhuo@linux.alibaba.com (mailing list archive)
State Not Applicable
Headers show
Series virtio: drivers maintain dma info for premapped vq | expand

Checks

Context Check Description
netdev/tree_selection success Guessing tree name failed - patch did not apply

Commit Message

Xuan Zhuo March 27, 2024, 11:14 a.m. UTC
In the functions vring_unmap_extra_packed and vring_unmap_desc_packed,
multiple checks are made whether unmap is performed and whether it is
INDIRECT.

These two functions are usually called in a loop, and we should put the
check outside the loop.

And we unmap the descs with VRING_DESC_F_INDIRECT on the same path as
the other descs, which makes things more complex. If we distinguish the
descs with VRING_DESC_F_INDIRECT before unmapping, things will be clearer.

For desc with VRING_DESC_F_INDIRECT flag:
1. only one desc of the desc table is used, we do not need the loop
    Theoretically, indirect descriptors could be chained.
    But now, that is not supported by "add", so we ignore this case.
2. the unmap API called is different from the one used for the other descs
3. vq->premapped does not need to be checked
4. vq->indirect does not need to be checked
5. the state->indir_desc must not be null

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
 drivers/virtio/virtio_ring.c | 78 ++++++++++++++++++------------------
 1 file changed, 40 insertions(+), 38 deletions(-)

Comments

Jason Wang March 28, 2024, 6:56 a.m. UTC | #1
On Wed, Mar 27, 2024 at 7:14 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>
> In the functions vring_unmap_extra_packed and vring_unmap_desc_packed,
> multiple checks are made whether unmap is performed and whether it is
> INDIRECT.
>
> These two functions are usually called in a loop, and we should put the
> check outside the loop.
>
> And we unmap the descs with VRING_DESC_F_INDIRECT on the same path with
> other descs, that make the thing more complex. If we distinguish the
> descs with VRING_DESC_F_INDIRECT before unmap, thing will be clearer.
>
> For desc with VRING_DESC_F_INDIRECT flag:
> 1. only one desc of the desc table is used, we do not need the loop
>     Theoretically, indirect descriptors could be chained.
>     But now, that is not supported by "add", so we ignore this case.
> 2. the called unmap api is difference from the other desc
> 3. the vq->premapped is not needed to check
> 4. the vq->indirect is not needed to check
> 5. the state->indir_desc must not be null

It doesn't explain the connection to the goal of this series. If it's
not a must I'd suggest moving it to a separate patch.

>
> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>

Rethinking this, it looks to me like it would complicate the code further.

For example, the vring_map_xxx() helpers check premapped and
use_dma_api by themselves. But in the case of vring_unmap() you want to
move those checks to the caller. This will result in tricky code that
is hard to understand.

We need to be consistent here.

If we try to optimize unmap we need to optimize map as well. But
generally it would complicate the logic of the caller if we want to
let the caller tell the cases apart. Ideally, the callers of those
functions should know nothing about use_dma_api, premapped and the like.

> ---
>  drivers/virtio/virtio_ring.c | 78 ++++++++++++++++++------------------
>  1 file changed, 40 insertions(+), 38 deletions(-)
>
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index 03360073bd4a..a2838fe1cc08 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -1214,6 +1214,7 @@ static u16 packed_last_used(u16 last_used_idx)
>         return last_used_idx & ~(-(1 << VRING_PACKED_EVENT_F_WRAP_CTR));
>  }
>
> +/* caller must check vring_need_unmap_buffer() */
>  static void vring_unmap_extra_packed(const struct vring_virtqueue *vq,
>                                      const struct vring_desc_extra *extra)
>  {
> @@ -1221,33 +1222,18 @@ static void vring_unmap_extra_packed(const struct vring_virtqueue *vq,
>
>         flags = extra->flags;
>
> -       if (flags & VRING_DESC_F_INDIRECT) {
> -               if (!vq->use_dma_api)
> -                       return;
> -
> -               dma_unmap_single(vring_dma_dev(vq),
> -                                extra->addr, extra->len,
> -                                (flags & VRING_DESC_F_WRITE) ?
> -                                DMA_FROM_DEVICE : DMA_TO_DEVICE);
> -       } else {
> -               if (!vring_need_unmap_buffer(vq))
> -                       return;
> -
> -               dma_unmap_page(vring_dma_dev(vq),
> -                              extra->addr, extra->len,
> -                              (flags & VRING_DESC_F_WRITE) ?
> -                              DMA_FROM_DEVICE : DMA_TO_DEVICE);
> -       }
> +       dma_unmap_page(vring_dma_dev(vq),
> +                      extra->addr, extra->len,
> +                      (flags & VRING_DESC_F_WRITE) ?
> +                      DMA_FROM_DEVICE : DMA_TO_DEVICE);
>  }
>
> +/* caller must check vring_need_unmap_buffer() */
>  static void vring_unmap_desc_packed(const struct vring_virtqueue *vq,
>                                     const struct vring_packed_desc *desc)
>  {
>         u16 flags;
>
> -       if (!vring_need_unmap_buffer(vq))
> -               return;
> -
>         flags = le16_to_cpu(desc->flags);
>
>         dma_unmap_page(vring_dma_dev(vq),
> @@ -1323,7 +1309,7 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
>                         total_sg * sizeof(struct vring_packed_desc),
>                         DMA_TO_DEVICE);
>         if (vring_mapping_error(vq, addr)) {
> -               if (vq->premapped)
> +               if (!vring_need_unmap_buffer(vq))
>                         goto free_desc;

I would do this to make it much easier to read and to avoid the warning:

if (vring_mapping_error(vq, addr))
        goto unmap_release;

unmap_release:
        if (vring_need_unmap_buffer(vq))
                for (i = 0, xxx)
free_desc:
        kfree(desc);

or it could be

unmap_release:
      if (!vring_need_unmap_buffer(vq))
            goto free_desc;

Still tricky but better.

>
>                 goto unmap_release;
> @@ -1338,10 +1324,11 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
>                 vq->packed.desc_extra[id].addr = addr;
>                 vq->packed.desc_extra[id].len = total_sg *
>                                 sizeof(struct vring_packed_desc);
> -               vq->packed.desc_extra[id].flags = VRING_DESC_F_INDIRECT |
> -                                                 vq->packed.avail_used_flags;
>         }
>
> +       vq->packed.desc_extra[id].flags = VRING_DESC_F_INDIRECT |
> +               vq->packed.avail_used_flags;

An example of the tricky code: I think you do this because you want to
distinguish indirect descriptors in detach_buf_packed():

flags = vq->packed.desc_extra[id].flags;


> +
>         /*
>          * A driver MUST NOT make the first descriptor in the list
>          * available before all subsequent descriptors comprising
> @@ -1382,6 +1369,8 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
>  unmap_release:
>         err_idx = i;
>
> +       WARN_ON(!vring_need_unmap_buffer(vq));
> +
>         for (i = 0; i < err_idx; i++)
>                 vring_unmap_desc_packed(vq, &desc[i]);
>
> @@ -1475,12 +1464,13 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
>                         desc[i].len = cpu_to_le32(sg->length);
>                         desc[i].id = cpu_to_le16(id);
>
> -                       if (unlikely(vq->use_dma_api)) {
> +                       if (vring_need_unmap_buffer(vq)) {
>                                 vq->packed.desc_extra[curr].addr = addr;
>                                 vq->packed.desc_extra[curr].len = sg->length;
> -                               vq->packed.desc_extra[curr].flags =
> -                                       le16_to_cpu(flags);
>                         }
> +
> +                       vq->packed.desc_extra[curr].flags = le16_to_cpu(flags);
> +
>                         prev = curr;
>                         curr = vq->packed.desc_extra[curr].next;
>
> @@ -1530,6 +1520,8 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
>
>         vq->packed.avail_used_flags = avail_used_flags;
>
> +       WARN_ON(!vring_need_unmap_buffer(vq));
> +
>         for (n = 0; n < total_sg; n++) {
>                 if (i == err_idx)
>                         break;
> @@ -1599,7 +1591,9 @@ static void detach_buf_packed(struct vring_virtqueue *vq,
>         struct vring_desc_state_packed *state = NULL;
>         struct vring_packed_desc *desc;
>         unsigned int i, curr;
> +       u16 flags;
>
> +       flags = vq->packed.desc_extra[id].flags;

Can we check vq->indirect && indir_desc here? Then we don't need
special care to store flags in desc_extra.

>         state = &vq->packed.desc_state[id];
>
>         /* Clear data ptr. */
> @@ -1609,22 +1603,32 @@ static void detach_buf_packed(struct vring_virtqueue *vq,
>         vq->free_head = id;
>         vq->vq.num_free += state->num;
>
> -       if (unlikely(vq->use_dma_api)) {
> -               curr = id;
> -               for (i = 0; i < state->num; i++) {
> -                       vring_unmap_extra_packed(vq,
> -                                                &vq->packed.desc_extra[curr]);
> -                       curr = vq->packed.desc_extra[curr].next;
> +       if (!(flags & VRING_DESC_F_INDIRECT)) {
> +               if (vring_need_unmap_buffer(vq)) {
> +                       curr = id;
> +                       for (i = 0; i < state->num; i++) {
> +                               vring_unmap_extra_packed(vq,
> +                                                        &vq->packed.desc_extra[curr]);
> +                               curr = vq->packed.desc_extra[curr].next;
> +                       }
>                 }
> -       }
>
> -       if (vq->indirect) {
> +               if (ctx)
> +                       *ctx = state->indir_desc;
> +       } else {
> +               const struct vring_desc_extra *extra;
>                 u32 len;
>
> +               if (vq->use_dma_api) {
> +                       extra = &vq->packed.desc_extra[id];
> +                       dma_unmap_single(vring_dma_dev(vq),
> +                                        extra->addr, extra->len,
> +                                        (flags & VRING_DESC_F_WRITE) ?
> +                                        DMA_FROM_DEVICE : DMA_TO_DEVICE);
> +               }
> +
>                 /* Free the indirect table, if any, now that it's unmapped. */
>                 desc = state->indir_desc;
> -               if (!desc)
> -                       return;
>
>                 if (vring_need_unmap_buffer(vq)) {
>                         len = vq->packed.desc_extra[id].len;
> @@ -1634,8 +1638,6 @@ static void detach_buf_packed(struct vring_virtqueue *vq,
>                 }
>                 kfree(desc);
>                 state->indir_desc = NULL;
> -       } else if (ctx) {
> -               *ctx = state->indir_desc;
>         }
>  }
>
> --
> 2.32.0.3.g01195cf9f
>

Thanks
Xuan Zhuo March 28, 2024, 7:27 a.m. UTC | #2
On Thu, 28 Mar 2024 14:56:47 +0800, Jason Wang <jasowang@redhat.com> wrote:
> On Wed, Mar 27, 2024 at 7:14 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> >
> > In the functions vring_unmap_extra_packed and vring_unmap_desc_packed,
> > multiple checks are made whether unmap is performed and whether it is
> > INDIRECT.
> >
> > These two functions are usually called in a loop, and we should put the
> > check outside the loop.
> >
> > And we unmap the descs with VRING_DESC_F_INDIRECT on the same path with
> > other descs, that make the thing more complex. If we distinguish the
> > descs with VRING_DESC_F_INDIRECT before unmap, thing will be clearer.
> >
> > For desc with VRING_DESC_F_INDIRECT flag:
> > 1. only one desc of the desc table is used, we do not need the loop
> >     Theoretically, indirect descriptors could be chained.
> >     But now, that is not supported by "add", so we ignore this case.
> > 2. the called unmap api is difference from the other desc
> > 3. the vq->premapped is not needed to check
> > 4. the vq->indirect is not needed to check
> > 5. the state->indir_desc must not be null
>
> It doesn't explain the connection to the goal of this series. If it's
> not a must I'd suggest moving it to a separate patch.


The "no store dma ..." depends on this.

I will add this message in next version.


>
> >
> > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
>
> Rethink this, it looks to me it would complicate the codes furtherly.
>
> For example, vring_map_xxx() helpers will check premappred and
> use_dma_api by itself. But in the case of vring_unmap() you want to
> move those checks to the caller. This will result in tricky codes that
> are hard to understand.
>
> We need to be consistent here.
>
> If we try to optimize unmap we need to optimize map as well. But
> generally it would complicate the logic of the caller if we want to
> let the caller to differ. Ideally, the caller of those function should
> know nothing about use_dma_api, premapped and other.


The key is that we can check "use_dma_api, premapped" to skip the loop.
If vring_unmap_xxx is called, "use_dma_api, premapped" has already been
checked in advance, so it is a waste to check these again.


>
> > ---
> >  drivers/virtio/virtio_ring.c | 78 ++++++++++++++++++------------------
> >  1 file changed, 40 insertions(+), 38 deletions(-)
> >
> > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > index 03360073bd4a..a2838fe1cc08 100644
> > --- a/drivers/virtio/virtio_ring.c
> > +++ b/drivers/virtio/virtio_ring.c
> > @@ -1214,6 +1214,7 @@ static u16 packed_last_used(u16 last_used_idx)
> >         return last_used_idx & ~(-(1 << VRING_PACKED_EVENT_F_WRAP_CTR));
> >  }
> >
> > +/* caller must check vring_need_unmap_buffer() */
> >  static void vring_unmap_extra_packed(const struct vring_virtqueue *vq,
> >                                      const struct vring_desc_extra *extra)
> >  {
> > @@ -1221,33 +1222,18 @@ static void vring_unmap_extra_packed(const struct vring_virtqueue *vq,
> >
> >         flags = extra->flags;
> >
> > -       if (flags & VRING_DESC_F_INDIRECT) {
> > -               if (!vq->use_dma_api)
> > -                       return;
> > -
> > -               dma_unmap_single(vring_dma_dev(vq),
> > -                                extra->addr, extra->len,
> > -                                (flags & VRING_DESC_F_WRITE) ?
> > -                                DMA_FROM_DEVICE : DMA_TO_DEVICE);
> > -       } else {
> > -               if (!vring_need_unmap_buffer(vq))
> > -                       return;
> > -
> > -               dma_unmap_page(vring_dma_dev(vq),
> > -                              extra->addr, extra->len,
> > -                              (flags & VRING_DESC_F_WRITE) ?
> > -                              DMA_FROM_DEVICE : DMA_TO_DEVICE);
> > -       }
> > +       dma_unmap_page(vring_dma_dev(vq),
> > +                      extra->addr, extra->len,
> > +                      (flags & VRING_DESC_F_WRITE) ?
> > +                      DMA_FROM_DEVICE : DMA_TO_DEVICE);
> >  }
> >
> > +/* caller must check vring_need_unmap_buffer() */
> >  static void vring_unmap_desc_packed(const struct vring_virtqueue *vq,
> >                                     const struct vring_packed_desc *desc)
> >  {
> >         u16 flags;
> >
> > -       if (!vring_need_unmap_buffer(vq))
> > -               return;
> > -
> >         flags = le16_to_cpu(desc->flags);
> >
> >         dma_unmap_page(vring_dma_dev(vq),
> > @@ -1323,7 +1309,7 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> >                         total_sg * sizeof(struct vring_packed_desc),
> >                         DMA_TO_DEVICE);
> >         if (vring_mapping_error(vq, addr)) {
> > -               if (vq->premapped)
> > +               if (!vring_need_unmap_buffer(vq))
> >                         goto free_desc;
>
> I would do this to make it much more easier to be read and avoid the warn:
>
> if (vring_mapping_error(vq, addr))
>         goto unmap_release;
>
> unmap_release:
>         if (vring_need_unmap_buffer(vq))
>                 for (i = 0, xxx)
> free_desc:
>         kfree(desc);
>
> or it could be
>
> unmap_release:
>       if (!vring_need_unmap_buffer(vq))
>             goto free_desc;
>
> Still tricky but better.

I am ok.


>
> >
> >                 goto unmap_release;
> > @@ -1338,10 +1324,11 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> >                 vq->packed.desc_extra[id].addr = addr;
> >                 vq->packed.desc_extra[id].len = total_sg *
> >                                 sizeof(struct vring_packed_desc);
> > -               vq->packed.desc_extra[id].flags = VRING_DESC_F_INDIRECT |
> > -                                                 vq->packed.avail_used_flags;
> >         }
> >
> > +       vq->packed.desc_extra[id].flags = VRING_DESC_F_INDIRECT |
> > +               vq->packed.avail_used_flags;
>
> An example of the tricky code, I think you do this because you want to
> differ indirect in detach_buf_packed():
>
> flags = vq->packed.desc_extra[id].flags;
>
>
> > +
> >         /*
> >          * A driver MUST NOT make the first descriptor in the list
> >          * available before all subsequent descriptors comprising
> > @@ -1382,6 +1369,8 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> >  unmap_release:
> >         err_idx = i;
> >
> > +       WARN_ON(!vring_need_unmap_buffer(vq));
> > +
> >         for (i = 0; i < err_idx; i++)
> >                 vring_unmap_desc_packed(vq, &desc[i]);
> >
> > @@ -1475,12 +1464,13 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> >                         desc[i].len = cpu_to_le32(sg->length);
> >                         desc[i].id = cpu_to_le16(id);
> >
> > -                       if (unlikely(vq->use_dma_api)) {
> > +                       if (vring_need_unmap_buffer(vq)) {
> >                                 vq->packed.desc_extra[curr].addr = addr;
> >                                 vq->packed.desc_extra[curr].len = sg->length;
> > -                               vq->packed.desc_extra[curr].flags =
> > -                                       le16_to_cpu(flags);
> >                         }
> > +
> > +                       vq->packed.desc_extra[curr].flags = le16_to_cpu(flags);
> > +
> >                         prev = curr;
> >                         curr = vq->packed.desc_extra[curr].next;
> >
> > @@ -1530,6 +1520,8 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> >
> >         vq->packed.avail_used_flags = avail_used_flags;
> >
> > +       WARN_ON(!vring_need_unmap_buffer(vq));
> > +
> >         for (n = 0; n < total_sg; n++) {
> >                 if (i == err_idx)
> >                         break;
> > @@ -1599,7 +1591,9 @@ static void detach_buf_packed(struct vring_virtqueue *vq,
> >         struct vring_desc_state_packed *state = NULL;
> >         struct vring_packed_desc *desc;
> >         unsigned int i, curr;
> > +       u16 flags;
> >
> > +       flags = vq->packed.desc_extra[id].flags;
>
> Can we check vq->indirect && indir_desc here? Then we don't need
> special care to store flags in desc_extra.


No.

Even when vq->indirect is true, the desc may not have the indirect flag.

Thanks.


>
> >         state = &vq->packed.desc_state[id];
> >
> >         /* Clear data ptr. */
> > @@ -1609,22 +1603,32 @@ static void detach_buf_packed(struct vring_virtqueue *vq,
> >         vq->free_head = id;
> >         vq->vq.num_free += state->num;
> >
> > -       if (unlikely(vq->use_dma_api)) {
> > -               curr = id;
> > -               for (i = 0; i < state->num; i++) {
> > -                       vring_unmap_extra_packed(vq,
> > -                                                &vq->packed.desc_extra[curr]);
> > -                       curr = vq->packed.desc_extra[curr].next;
> > +       if (!(flags & VRING_DESC_F_INDIRECT)) {
> > +               if (vring_need_unmap_buffer(vq)) {
> > +                       curr = id;
> > +                       for (i = 0; i < state->num; i++) {
> > +                               vring_unmap_extra_packed(vq,
> > +                                                        &vq->packed.desc_extra[curr]);
> > +                               curr = vq->packed.desc_extra[curr].next;
> > +                       }
> >                 }
> > -       }
> >
> > -       if (vq->indirect) {
> > +               if (ctx)
> > +                       *ctx = state->indir_desc;
> > +       } else {
> > +               const struct vring_desc_extra *extra;
> >                 u32 len;
> >
> > +               if (vq->use_dma_api) {
> > +                       extra = &vq->packed.desc_extra[id];
> > +                       dma_unmap_single(vring_dma_dev(vq),
> > +                                        extra->addr, extra->len,
> > +                                        (flags & VRING_DESC_F_WRITE) ?
> > +                                        DMA_FROM_DEVICE : DMA_TO_DEVICE);
> > +               }
> > +
> >                 /* Free the indirect table, if any, now that it's unmapped. */
> >                 desc = state->indir_desc;
> > -               if (!desc)
> > -                       return;
> >
> >                 if (vring_need_unmap_buffer(vq)) {
> >                         len = vq->packed.desc_extra[id].len;
> > @@ -1634,8 +1638,6 @@ static void detach_buf_packed(struct vring_virtqueue *vq,
> >                 }
> >                 kfree(desc);
> >                 state->indir_desc = NULL;
> > -       } else if (ctx) {
> > -               *ctx = state->indir_desc;
> >         }
> >  }
> >
> > --
> > 2.32.0.3.g01195cf9f
> >
>
> Thanks
>
Jason Wang March 28, 2024, 8:07 a.m. UTC | #3
On Thu, Mar 28, 2024 at 3:32 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>
> On Thu, 28 Mar 2024 14:56:47 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > On Wed, Mar 27, 2024 at 7:14 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > >
> > > In the functions vring_unmap_extra_packed and vring_unmap_desc_packed,
> > > multiple checks are made whether unmap is performed and whether it is
> > > INDIRECT.
> > >
> > > These two functions are usually called in a loop, and we should put the
> > > check outside the loop.
> > >
> > > And we unmap the descs with VRING_DESC_F_INDIRECT on the same path with
> > > other descs, that make the thing more complex. If we distinguish the
> > > descs with VRING_DESC_F_INDIRECT before unmap, thing will be clearer.
> > >
> > > For desc with VRING_DESC_F_INDIRECT flag:
> > > 1. only one desc of the desc table is used, we do not need the loop
> > >     Theoretically, indirect descriptors could be chained.
> > >     But now, that is not supported by "add", so we ignore this case.
> > > 2. the called unmap api is difference from the other desc
> > > 3. the vq->premapped is not needed to check
> > > 4. the vq->indirect is not needed to check
> > > 5. the state->indir_desc must not be null
> >
> > It doesn't explain the connection to the goal of this series. If it's
> > not a must I'd suggest moving it to a separate patch.
>
>
> The "no store dma ..." depends this.
>
> I will add this message in next version.
>
>
> >
> > >
> > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> >
> > Rethink this, it looks to me it would complicate the codes furtherly.
> >
> > For example, vring_map_xxx() helpers will check premappred and
> > use_dma_api by itself. But in the case of vring_unmap() you want to
> > move those checks to the caller. This will result in tricky codes that
> > are hard to understand.
> >
> > We need to be consistent here.
> >
> > If we try to optimize unmap we need to optimize map as well. But
> > generally it would complicate the logic of the caller if we want to
> > let the caller to differ. Ideally, the caller of those function should
> > know nothing about use_dma_api, premapped and other.
>
>
> The key is that we can check "use_dma_api, premapped" to skip the loop.
> If the vring_unmap_xxx is called, the "use_dma_api, premapped" is checked in
> advance, so that is a waste to check thest again.

Right, but we have the same logic for map.

>
>
> >
> > > ---
> > >  drivers/virtio/virtio_ring.c | 78 ++++++++++++++++++------------------
> > >  1 file changed, 40 insertions(+), 38 deletions(-)
> > >
> > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > index 03360073bd4a..a2838fe1cc08 100644
> > > --- a/drivers/virtio/virtio_ring.c
> > > +++ b/drivers/virtio/virtio_ring.c
> > > @@ -1214,6 +1214,7 @@ static u16 packed_last_used(u16 last_used_idx)
> > >         return last_used_idx & ~(-(1 << VRING_PACKED_EVENT_F_WRAP_CTR));
> > >  }
> > >
> > > +/* caller must check vring_need_unmap_buffer() */
> > >  static void vring_unmap_extra_packed(const struct vring_virtqueue *vq,
> > >                                      const struct vring_desc_extra *extra)
> > >  {
> > > @@ -1221,33 +1222,18 @@ static void vring_unmap_extra_packed(const struct vring_virtqueue *vq,
> > >
> > >         flags = extra->flags;
> > >
> > > -       if (flags & VRING_DESC_F_INDIRECT) {
> > > -               if (!vq->use_dma_api)
> > > -                       return;
> > > -
> > > -               dma_unmap_single(vring_dma_dev(vq),
> > > -                                extra->addr, extra->len,
> > > -                                (flags & VRING_DESC_F_WRITE) ?
> > > -                                DMA_FROM_DEVICE : DMA_TO_DEVICE);
> > > -       } else {
> > > -               if (!vring_need_unmap_buffer(vq))
> > > -                       return;
> > > -
> > > -               dma_unmap_page(vring_dma_dev(vq),
> > > -                              extra->addr, extra->len,
> > > -                              (flags & VRING_DESC_F_WRITE) ?
> > > -                              DMA_FROM_DEVICE : DMA_TO_DEVICE);
> > > -       }
> > > +       dma_unmap_page(vring_dma_dev(vq),
> > > +                      extra->addr, extra->len,
> > > +                      (flags & VRING_DESC_F_WRITE) ?
> > > +                      DMA_FROM_DEVICE : DMA_TO_DEVICE);
> > >  }
> > >
> > > +/* caller must check vring_need_unmap_buffer() */
> > >  static void vring_unmap_desc_packed(const struct vring_virtqueue *vq,
> > >                                     const struct vring_packed_desc *desc)
> > >  {
> > >         u16 flags;
> > >
> > > -       if (!vring_need_unmap_buffer(vq))
> > > -               return;
> > > -
> > >         flags = le16_to_cpu(desc->flags);
> > >
> > >         dma_unmap_page(vring_dma_dev(vq),
> > > @@ -1323,7 +1309,7 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > >                         total_sg * sizeof(struct vring_packed_desc),
> > >                         DMA_TO_DEVICE);
> > >         if (vring_mapping_error(vq, addr)) {
> > > -               if (vq->premapped)
> > > +               if (!vring_need_unmap_buffer(vq))
> > >                         goto free_desc;
> >
> > I would do this to make it much more easier to be read and avoid the warn:
> >
> > if (vring_mapping_error(vq, addr))
> >         goto unmap_release;
> >
> > unmap_release:
> >         if (vring_need_unmap_buffer(vq))
> >                 for (i = 0, xxx)
> > free_desc:
> >         kfree(desc);
> >
> > or it could be
> >
> > unmap_release:
> >       if (!vring_need_unmap_buffer(vq))
> >             goto free_desc;
> >
> > Still tricky but better.
>
> I am ok.
>
>
> >
> > >
> > >                 goto unmap_release;
> > > @@ -1338,10 +1324,11 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > >                 vq->packed.desc_extra[id].addr = addr;
> > >                 vq->packed.desc_extra[id].len = total_sg *
> > >                                 sizeof(struct vring_packed_desc);
> > > -               vq->packed.desc_extra[id].flags = VRING_DESC_F_INDIRECT |
> > > -                                                 vq->packed.avail_used_flags;
> > >         }
> > >
> > > +       vq->packed.desc_extra[id].flags = VRING_DESC_F_INDIRECT |
> > > +               vq->packed.avail_used_flags;
> >
> > An example of the tricky code, I think you do this because you want to
> > differ indirect in detach_buf_packed():
> >
> > flags = vq->packed.desc_extra[id].flags;
> >
> >
> > > +
> > >         /*
> > >          * A driver MUST NOT make the first descriptor in the list
> > >          * available before all subsequent descriptors comprising
> > > @@ -1382,6 +1369,8 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > >  unmap_release:
> > >         err_idx = i;
> > >
> > > +       WARN_ON(!vring_need_unmap_buffer(vq));
> > > +
> > >         for (i = 0; i < err_idx; i++)
> > >                 vring_unmap_desc_packed(vq, &desc[i]);
> > >
> > > @@ -1475,12 +1464,13 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > >                         desc[i].len = cpu_to_le32(sg->length);
> > >                         desc[i].id = cpu_to_le16(id);
> > >
> > > -                       if (unlikely(vq->use_dma_api)) {
> > > +                       if (vring_need_unmap_buffer(vq)) {
> > >                                 vq->packed.desc_extra[curr].addr = addr;
> > >                                 vq->packed.desc_extra[curr].len = sg->length;
> > > -                               vq->packed.desc_extra[curr].flags =
> > > -                                       le16_to_cpu(flags);
> > >                         }
> > > +
> > > +                       vq->packed.desc_extra[curr].flags = le16_to_cpu(flags);
> > > +
> > >                         prev = curr;
> > >                         curr = vq->packed.desc_extra[curr].next;
> > >
> > > @@ -1530,6 +1520,8 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > >
> > >         vq->packed.avail_used_flags = avail_used_flags;
> > >
> > > +       WARN_ON(!vring_need_unmap_buffer(vq));
> > > +
> > >         for (n = 0; n < total_sg; n++) {
> > >                 if (i == err_idx)
> > >                         break;
> > > @@ -1599,7 +1591,9 @@ static void detach_buf_packed(struct vring_virtqueue *vq,
> > >         struct vring_desc_state_packed *state = NULL;
> > >         struct vring_packed_desc *desc;
> > >         unsigned int i, curr;
> > > +       u16 flags;
> > >
> > > +       flags = vq->packed.desc_extra[id].flags;
> >
> > Can we check vq->indirect && indir_desc here? Then we don't need
> > special care to store flags in desc_extra.
>
>
> No.
>
> When vq->indirect is true, but the desc may has not indirect flag.

But we check indir_desc as well?

        vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
                !cfg_vq_get(cfg, vq, ctx);

Thanks
Xuan Zhuo March 28, 2024, 8:15 a.m. UTC | #4
On Thu, 28 Mar 2024 16:07:14 +0800, Jason Wang <jasowang@redhat.com> wrote:
> On Thu, Mar 28, 2024 at 3:32 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> >
> > On Thu, 28 Mar 2024 14:56:47 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > On Wed, Mar 27, 2024 at 7:14 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > >
> > > > In the functions vring_unmap_extra_packed and vring_unmap_desc_packed,
> > > > multiple checks are made whether unmap is performed and whether it is
> > > > INDIRECT.
> > > >
> > > > These two functions are usually called in a loop, and we should put the
> > > > check outside the loop.
> > > >
> > > > And we unmap the descs with VRING_DESC_F_INDIRECT on the same path with
> > > > other descs, that make the thing more complex. If we distinguish the
> > > > descs with VRING_DESC_F_INDIRECT before unmap, thing will be clearer.
> > > >
> > > > For desc with VRING_DESC_F_INDIRECT flag:
> > > > 1. only one desc of the desc table is used, we do not need the loop
> > > >     Theoretically, indirect descriptors could be chained.
> > > >     But now, that is not supported by "add", so we ignore this case.
> > > > 2. the called unmap api is difference from the other desc
> > > > 3. the vq->premapped is not needed to check
> > > > 4. the vq->indirect is not needed to check
> > > > 5. the state->indir_desc must not be null
> > >
> > > It doesn't explain the connection to the goal of this series. If it's
> > > not a must I'd suggest moving it to a separate patch.
> >
> >
> > The "no store dma ..." depends this.
> >
> > I will add this message in next version.
> >
> >
> > >
> > > >
> > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > >
> > > > Rethinking this, it looks to me like it would complicate the code further.
> > > >
> > > > For example, the vring_map_xxx() helpers check premapped and
> > > > use_dma_api by themselves. But in the case of vring_unmap() you want to
> > > > move those checks to the caller. This will result in tricky code that
> > > > is hard to understand.
> > >
> > > We need to be consistent here.
> > >
> > > If we try to optimize unmap we need to optimize map as well. But
> > > generally it would complicate the logic of the caller if we want to
> > > let the caller to differ. Ideally, the caller of those function should
> > > know nothing about use_dma_api, premapped and other.
> >
> >
> > > The key is that we can check "use_dma_api, premapped" to skip the loop.
> > > If vring_unmap_xxx is called, "use_dma_api, premapped" has already been
> > > checked in advance, so it is a waste to check them again.
>
> Right, but we have the same logic for map.

But we cannot skip the loop for map.


>
> >
> >
> > >
> > > > ---
> > > >  drivers/virtio/virtio_ring.c | 78 ++++++++++++++++++------------------
> > > >  1 file changed, 40 insertions(+), 38 deletions(-)
> > > >
> > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > index 03360073bd4a..a2838fe1cc08 100644
> > > > --- a/drivers/virtio/virtio_ring.c
> > > > +++ b/drivers/virtio/virtio_ring.c
> > > > @@ -1214,6 +1214,7 @@ static u16 packed_last_used(u16 last_used_idx)
> > > >         return last_used_idx & ~(-(1 << VRING_PACKED_EVENT_F_WRAP_CTR));
> > > >  }
> > > >
> > > > +/* caller must check vring_need_unmap_buffer() */
> > > >  static void vring_unmap_extra_packed(const struct vring_virtqueue *vq,
> > > >                                      const struct vring_desc_extra *extra)
> > > >  {
> > > > @@ -1221,33 +1222,18 @@ static void vring_unmap_extra_packed(const struct vring_virtqueue *vq,
> > > >
> > > >         flags = extra->flags;
> > > >
> > > > -       if (flags & VRING_DESC_F_INDIRECT) {
> > > > -               if (!vq->use_dma_api)
> > > > -                       return;
> > > > -
> > > > -               dma_unmap_single(vring_dma_dev(vq),
> > > > -                                extra->addr, extra->len,
> > > > -                                (flags & VRING_DESC_F_WRITE) ?
> > > > -                                DMA_FROM_DEVICE : DMA_TO_DEVICE);
> > > > -       } else {
> > > > -               if (!vring_need_unmap_buffer(vq))
> > > > -                       return;
> > > > -
> > > > -               dma_unmap_page(vring_dma_dev(vq),
> > > > -                              extra->addr, extra->len,
> > > > -                              (flags & VRING_DESC_F_WRITE) ?
> > > > -                              DMA_FROM_DEVICE : DMA_TO_DEVICE);
> > > > -       }
> > > > +       dma_unmap_page(vring_dma_dev(vq),
> > > > +                      extra->addr, extra->len,
> > > > +                      (flags & VRING_DESC_F_WRITE) ?
> > > > +                      DMA_FROM_DEVICE : DMA_TO_DEVICE);
> > > >  }
> > > >
> > > > +/* caller must check vring_need_unmap_buffer() */
> > > >  static void vring_unmap_desc_packed(const struct vring_virtqueue *vq,
> > > >                                     const struct vring_packed_desc *desc)
> > > >  {
> > > >         u16 flags;
> > > >
> > > > -       if (!vring_need_unmap_buffer(vq))
> > > > -               return;
> > > > -
> > > >         flags = le16_to_cpu(desc->flags);
> > > >
> > > >         dma_unmap_page(vring_dma_dev(vq),
> > > > @@ -1323,7 +1309,7 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > > >                         total_sg * sizeof(struct vring_packed_desc),
> > > >                         DMA_TO_DEVICE);
> > > >         if (vring_mapping_error(vq, addr)) {
> > > > -               if (vq->premapped)
> > > > +               if (!vring_need_unmap_buffer(vq))
> > > >                         goto free_desc;
> > >
> > > I would do this to make it much more easier to be read and avoid the warn:
> > >
> > > if (vring_mapping_error(vq, addr))
> > >         goto unmap_release;
> > >
> > > unmap_release:
> > >         if (vring_need_unmap_buffer(vq))
> > >                 for (i = 0, xxx)
> > > free_desc:
> > >         kfree(desc);
> > >
> > > or it could be
> > >
> > > unmap_release:
> > >       if (!vring_need_unmap_buffer(vq))
> > >             goto free_desc;
> > >
> > > Still tricky but better.
> >
> > I am ok.
> >
> >
> > >
> > > >
> > > >                 goto unmap_release;
> > > > @@ -1338,10 +1324,11 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > > >                 vq->packed.desc_extra[id].addr = addr;
> > > >                 vq->packed.desc_extra[id].len = total_sg *
> > > >                                 sizeof(struct vring_packed_desc);
> > > > -               vq->packed.desc_extra[id].flags = VRING_DESC_F_INDIRECT |
> > > > -                                                 vq->packed.avail_used_flags;
> > > >         }
> > > >
> > > > +       vq->packed.desc_extra[id].flags = VRING_DESC_F_INDIRECT |
> > > > +               vq->packed.avail_used_flags;
> > >
> > > An example of the tricky code, I think you do this because you want to
> > > differ indirect in detach_buf_packed():
> > >
> > > flags = vq->packed.desc_extra[id].flags;
> > >
> > >
> > > > +
> > > >         /*
> > > >          * A driver MUST NOT make the first descriptor in the list
> > > >          * available before all subsequent descriptors comprising
> > > > @@ -1382,6 +1369,8 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > > >  unmap_release:
> > > >         err_idx = i;
> > > >
> > > > +       WARN_ON(!vring_need_unmap_buffer(vq));
> > > > +
> > > >         for (i = 0; i < err_idx; i++)
> > > >                 vring_unmap_desc_packed(vq, &desc[i]);
> > > >
> > > > @@ -1475,12 +1464,13 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > > >                         desc[i].len = cpu_to_le32(sg->length);
> > > >                         desc[i].id = cpu_to_le16(id);
> > > >
> > > > -                       if (unlikely(vq->use_dma_api)) {
> > > > +                       if (vring_need_unmap_buffer(vq)) {
> > > >                                 vq->packed.desc_extra[curr].addr = addr;
> > > >                                 vq->packed.desc_extra[curr].len = sg->length;
> > > > -                               vq->packed.desc_extra[curr].flags =
> > > > -                                       le16_to_cpu(flags);
> > > >                         }
> > > > +
> > > > +                       vq->packed.desc_extra[curr].flags = le16_to_cpu(flags);
> > > > +
> > > >                         prev = curr;
> > > >                         curr = vq->packed.desc_extra[curr].next;
> > > >
> > > > @@ -1530,6 +1520,8 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > > >
> > > >         vq->packed.avail_used_flags = avail_used_flags;
> > > >
> > > > +       WARN_ON(!vring_need_unmap_buffer(vq));
> > > > +
> > > >         for (n = 0; n < total_sg; n++) {
> > > >                 if (i == err_idx)
> > > >                         break;
> > > > @@ -1599,7 +1591,9 @@ static void detach_buf_packed(struct vring_virtqueue *vq,
> > > >         struct vring_desc_state_packed *state = NULL;
> > > >         struct vring_packed_desc *desc;
> > > >         unsigned int i, curr;
> > > > +       u16 flags;
> > > >
> > > > +       flags = vq->packed.desc_extra[id].flags;
> > >
> > > Can we check vq->indirect && indir_desc here? Then we don't need
> > > special care to store flags in desc_extra.
> >
> >
> > No.
> >
> > Even when vq->indirect is true, the desc may not have the indirect flag.
>
> But we check indir_desc as well?
>
>         vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
>                 !cfg_vq_get(cfg, vq, ctx);

I think you are right.

I will fix this in the next version.

Thanks.


>
> Thanks
>
Jason Wang March 29, 2024, 3:24 a.m. UTC | #5
On Thu, Mar 28, 2024 at 4:16 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>
> On Thu, 28 Mar 2024 16:07:14 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > On Thu, Mar 28, 2024 at 3:32 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > >
> > > On Thu, 28 Mar 2024 14:56:47 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > On Wed, Mar 27, 2024 at 7:14 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > >
> > > > > In the functions vring_unmap_extra_packed and vring_unmap_desc_packed,
> > > > > multiple checks are made whether unmap is performed and whether it is
> > > > > INDIRECT.
> > > > >
> > > > > These two functions are usually called in a loop, and we should put the
> > > > > check outside the loop.
> > > > >
> > > > > And we unmap the descs with VRING_DESC_F_INDIRECT on the same path with
> > > > > other descs, that make the thing more complex. If we distinguish the
> > > > > descs with VRING_DESC_F_INDIRECT before unmap, thing will be clearer.
> > > > >
> > > > > For desc with VRING_DESC_F_INDIRECT flag:
> > > > > 1. only one desc of the desc table is used, we do not need the loop
> > > > >     Theoretically, indirect descriptors could be chained.
> > > > >     But now, that is not supported by "add", so we ignore this case.
> > > > > 2. the called unmap api is difference from the other desc
> > > > > 3. the vq->premapped is not needed to check
> > > > > 4. the vq->indirect is not needed to check
> > > > > 5. the state->indir_desc must not be null
> > > >
> > > > It doesn't explain the connection to the goal of this series. If it's
> > > > not a must I'd suggest moving it to a separate patch.
> > >
> > >
> > > The "no store dma ..." depends this.
> > >
> > > I will add this message in next version.
> > >
> > >
> > > >
> > > > >
> > > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > >
> > > > Rethink this, it looks to me it would complicate the codes furtherly.
> > > >
> > > > For example, vring_map_xxx() helpers will check premappred and
> > > > use_dma_api by itself. But in the case of vring_unmap() you want to
> > > > move those checks to the caller. This will result in tricky codes that
> > > > are hard to understand.
> > > >
> > > > We need to be consistent here.
> > > >
> > > > If we try to optimize unmap we need to optimize map as well. But
> > > > generally it would complicate the logic of the caller if we want to
> > > > let the caller to differ. Ideally, the caller of those function should
> > > > know nothing about use_dma_api, premapped and other.
> > >
> > >
> > > > The key is that we can check "use_dma_api, premapped" to skip the loop.
> > > > If vring_unmap_xxx is called, "use_dma_api, premapped" has already been
> > > > checked in advance, so it is a waste to check them again.
> >
> > Right, but we have the same logic for map.
>
> But we can not skip the loop for map.

Ok, right. So I'm fine to leave it as is. We can optimize the checking
on top anyhow.

Thanks
diff mbox series

Patch

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 03360073bd4a..a2838fe1cc08 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -1214,6 +1214,7 @@  static u16 packed_last_used(u16 last_used_idx)
 	return last_used_idx & ~(-(1 << VRING_PACKED_EVENT_F_WRAP_CTR));
 }
 
+/* caller must check vring_need_unmap_buffer() */
 static void vring_unmap_extra_packed(const struct vring_virtqueue *vq,
 				     const struct vring_desc_extra *extra)
 {
@@ -1221,33 +1222,18 @@  static void vring_unmap_extra_packed(const struct vring_virtqueue *vq,
 
 	flags = extra->flags;
 
-	if (flags & VRING_DESC_F_INDIRECT) {
-		if (!vq->use_dma_api)
-			return;
-
-		dma_unmap_single(vring_dma_dev(vq),
-				 extra->addr, extra->len,
-				 (flags & VRING_DESC_F_WRITE) ?
-				 DMA_FROM_DEVICE : DMA_TO_DEVICE);
-	} else {
-		if (!vring_need_unmap_buffer(vq))
-			return;
-
-		dma_unmap_page(vring_dma_dev(vq),
-			       extra->addr, extra->len,
-			       (flags & VRING_DESC_F_WRITE) ?
-			       DMA_FROM_DEVICE : DMA_TO_DEVICE);
-	}
+	dma_unmap_page(vring_dma_dev(vq),
+		       extra->addr, extra->len,
+		       (flags & VRING_DESC_F_WRITE) ?
+		       DMA_FROM_DEVICE : DMA_TO_DEVICE);
 }
 
+/* caller must check vring_need_unmap_buffer() */
 static void vring_unmap_desc_packed(const struct vring_virtqueue *vq,
 				    const struct vring_packed_desc *desc)
 {
 	u16 flags;
 
-	if (!vring_need_unmap_buffer(vq))
-		return;
-
 	flags = le16_to_cpu(desc->flags);
 
 	dma_unmap_page(vring_dma_dev(vq),
@@ -1323,7 +1309,7 @@  static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
 			total_sg * sizeof(struct vring_packed_desc),
 			DMA_TO_DEVICE);
 	if (vring_mapping_error(vq, addr)) {
-		if (vq->premapped)
+		if (!vring_need_unmap_buffer(vq))
 			goto free_desc;
 
 		goto unmap_release;
@@ -1338,10 +1324,11 @@  static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
 		vq->packed.desc_extra[id].addr = addr;
 		vq->packed.desc_extra[id].len = total_sg *
 				sizeof(struct vring_packed_desc);
-		vq->packed.desc_extra[id].flags = VRING_DESC_F_INDIRECT |
-						  vq->packed.avail_used_flags;
 	}
 
+	vq->packed.desc_extra[id].flags = VRING_DESC_F_INDIRECT |
+		vq->packed.avail_used_flags;
+
 	/*
 	 * A driver MUST NOT make the first descriptor in the list
 	 * available before all subsequent descriptors comprising
@@ -1382,6 +1369,8 @@  static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
 unmap_release:
 	err_idx = i;
 
+	WARN_ON(!vring_need_unmap_buffer(vq));
+
 	for (i = 0; i < err_idx; i++)
 		vring_unmap_desc_packed(vq, &desc[i]);
 
@@ -1475,12 +1464,13 @@  static inline int virtqueue_add_packed(struct virtqueue *_vq,
 			desc[i].len = cpu_to_le32(sg->length);
 			desc[i].id = cpu_to_le16(id);
 
-			if (unlikely(vq->use_dma_api)) {
+			if (vring_need_unmap_buffer(vq)) {
 				vq->packed.desc_extra[curr].addr = addr;
 				vq->packed.desc_extra[curr].len = sg->length;
-				vq->packed.desc_extra[curr].flags =
-					le16_to_cpu(flags);
 			}
+
+			vq->packed.desc_extra[curr].flags = le16_to_cpu(flags);
+
 			prev = curr;
 			curr = vq->packed.desc_extra[curr].next;
 
@@ -1530,6 +1520,8 @@  static inline int virtqueue_add_packed(struct virtqueue *_vq,
 
 	vq->packed.avail_used_flags = avail_used_flags;
 
+	WARN_ON(!vring_need_unmap_buffer(vq));
+
 	for (n = 0; n < total_sg; n++) {
 		if (i == err_idx)
 			break;
@@ -1599,7 +1591,9 @@  static void detach_buf_packed(struct vring_virtqueue *vq,
 	struct vring_desc_state_packed *state = NULL;
 	struct vring_packed_desc *desc;
 	unsigned int i, curr;
+	u16 flags;
 
+	flags = vq->packed.desc_extra[id].flags;
 	state = &vq->packed.desc_state[id];
 
 	/* Clear data ptr. */
@@ -1609,22 +1603,32 @@  static void detach_buf_packed(struct vring_virtqueue *vq,
 	vq->free_head = id;
 	vq->vq.num_free += state->num;
 
-	if (unlikely(vq->use_dma_api)) {
-		curr = id;
-		for (i = 0; i < state->num; i++) {
-			vring_unmap_extra_packed(vq,
-						 &vq->packed.desc_extra[curr]);
-			curr = vq->packed.desc_extra[curr].next;
+	if (!(flags & VRING_DESC_F_INDIRECT)) {
+		if (vring_need_unmap_buffer(vq)) {
+			curr = id;
+			for (i = 0; i < state->num; i++) {
+				vring_unmap_extra_packed(vq,
+							 &vq->packed.desc_extra[curr]);
+				curr = vq->packed.desc_extra[curr].next;
+			}
 		}
-	}
 
-	if (vq->indirect) {
+		if (ctx)
+			*ctx = state->indir_desc;
+	} else {
+		const struct vring_desc_extra *extra;
 		u32 len;
 
+		if (vq->use_dma_api) {
+			extra = &vq->packed.desc_extra[id];
+			dma_unmap_single(vring_dma_dev(vq),
+					 extra->addr, extra->len,
+					 (flags & VRING_DESC_F_WRITE) ?
+					 DMA_FROM_DEVICE : DMA_TO_DEVICE);
+		}
+
 		/* Free the indirect table, if any, now that it's unmapped. */
 		desc = state->indir_desc;
-		if (!desc)
-			return;
 
 		if (vring_need_unmap_buffer(vq)) {
 			len = vq->packed.desc_extra[id].len;
@@ -1634,8 +1638,6 @@  static void detach_buf_packed(struct vring_virtqueue *vq,
 		}
 		kfree(desc);
 		state->indir_desc = NULL;
-	} else if (ctx) {
-		*ctx = state->indir_desc;
 	}
 }