diff mbox series

[v7,1/6] memory: prevent dma-reentracy issues

Message ID 20230313082417.827484-2-alxndr@bu.edu (mailing list archive)
State New, archived
Headers show
Series memory: prevent dma-reentracy issues | expand

Commit Message

Alexander Bulekov March 13, 2023, 8:24 a.m. UTC
Add a flag to the DeviceState that is set while a device is engaged in PIO/MMIO/DMA.
This flag is set/checked prior to calling a device's MemoryRegion
handlers, and set when device code initiates DMA.  The purpose of this
flag is to prevent two types of DMA-based reentrancy issues:

1.) mmio -> dma -> mmio case
2.) bh -> dma write -> mmio case

These issues have led to problems such as stack-exhaustion and
use-after-frees.

Summary of the problem from Peter Maydell:
https://lore.kernel.org/qemu-devel/CAFEAcA_23vc7hE3iaM-JVA6W38LK4hJoWae5KcknhPRD5fPBZA@mail.gmail.com

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/62
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/540
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/541
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/556
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/557
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/827
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1282

Reviewed-by: Darren Kenny <darren.kenny@oracle.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Alexander Bulekov <alxndr@bu.edu>
Acked-by: Peter Xu <peterx@redhat.com>
---
 include/hw/qdev-core.h |  7 +++++++
 softmmu/memory.c       | 17 +++++++++++++++++
 softmmu/trace-events   |  1 +
 3 files changed, 25 insertions(+)

Comments

Philippe Mathieu-Daudé March 13, 2023, 8:45 a.m. UTC | #1
Hi Alex,

Sorry for the late review, *sigh*.

On 13/3/23 09:24, Alexander Bulekov wrote:
> Add a flag to the DeviceState, when a device is engaged in PIO/MMIO/DMA.
> This flag is set/checked prior to calling a device's MemoryRegion
> handlers, and set when device code initiates DMA.  The purpose of this
> flag is to prevent two types of DMA-based reentrancy issues:
> 
> 1.) mmio -> dma -> mmio case
> 2.) bh -> dma write -> mmio case
> 
> These issues have led to problems such as stack-exhaustion and
> use-after-frees.
> 
> Summary of the problem from Peter Maydell:
> https://lore.kernel.org/qemu-devel/CAFEAcA_23vc7hE3iaM-JVA6W38LK4hJoWae5KcknhPRD5fPBZA@mail.gmail.com
> 
> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/62
> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/540
> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/541
> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/556
> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/557
> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/827
> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1282
> 
> Reviewed-by: Darren Kenny <darren.kenny@oracle.com>
> Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
> Signed-off-by: Alexander Bulekov <alxndr@bu.edu>
> Acked-by: Peter Xu <peterx@redhat.com>
> ---
>   include/hw/qdev-core.h |  7 +++++++
>   softmmu/memory.c       | 17 +++++++++++++++++
>   softmmu/trace-events   |  1 +
>   3 files changed, 25 insertions(+)
> 
> diff --git a/include/hw/qdev-core.h b/include/hw/qdev-core.h
> index bd50ad5ee1..7623703943 100644
> --- a/include/hw/qdev-core.h
> +++ b/include/hw/qdev-core.h
> @@ -162,6 +162,10 @@ struct NamedClockList {
>       QLIST_ENTRY(NamedClockList) node;
>   };
>   
> +typedef struct {
> +    bool engaged_in_io;

Do you plan to add more fields?

> +} MemReentrancyGuard;
> +
>   /**
>    * DeviceState:
>    * @realized: Indicates whether the device has been fully constructed.
> @@ -194,6 +198,9 @@ struct DeviceState {
>       int alias_required_for_version;
>       ResettableState reset;
>       GSList *unplug_blockers;
> +
> +    /* Is the device currently in mmio/pio/dma? Used to prevent re-entrancy */
> +    MemReentrancyGuard mem_reentrancy_guard;

At this point I'm not sure anymore whether this is a device or an MR property.

>   };
>   
>   struct DeviceListener {
> diff --git a/softmmu/memory.c b/softmmu/memory.c
> index 4699ba55ec..57bf18a257 100644
> --- a/softmmu/memory.c
> +++ b/softmmu/memory.c
> @@ -533,6 +533,7 @@ static MemTxResult access_with_adjusted_size(hwaddr addr,
>       uint64_t access_mask;
>       unsigned access_size;
>       unsigned i;
> +    DeviceState *dev = NULL;
>       MemTxResult r = MEMTX_OK;
>   
>       if (!access_size_min) {
> @@ -542,6 +543,19 @@ static MemTxResult access_with_adjusted_size(hwaddr addr,
>           access_size_max = 4;
>       }
>   
> +    /* Do not allow more than one simultanous access to a device's IO Regions */

Typo "simultaneous".

1/ access_with_adjusted_size() is complex enough and we are having a hard
    time getting it right. I'd prefer we don't intermix size adjustment
    and re-entrancy check in the same function. This check could belong
    to the callers.

2/ I'm not keen on calling QOM object_dynamic_cast() in this hot path;
    and mixing QDev API within MR one. At least, can we cache this value
    once in memory_region_do_init() since we have access to @owner?

> +    if (mr->owner &&
> +        !mr->ram_device && !mr->ram && !mr->rom_device && !mr->readonly) {
> +        dev = (DeviceState *) object_dynamic_cast(mr->owner, TYPE_DEVICE);
> +        if (dev) {
> +            if (dev->mem_reentrancy_guard.engaged_in_io) {
> +                trace_memory_region_reentrant_io(get_cpu_index(), mr, addr, size);
> +                return MEMTX_ERROR;

MEMTX_ERROR is device-specific, I'm not sure it is right to return it
from this generic path. Maybe you meant MEMTX_ACCESS_ERROR?

> +            }
> +            dev->mem_reentrancy_guard.engaged_in_io = true;
> +        }
> +    }
> +
>       /* FIXME: support unaligned access? */
>       access_size = MAX(MIN(size, access_size_max), access_size_min);
>       access_mask = MAKE_64BIT_MASK(0, access_size * 8);
> @@ -556,6 +570,9 @@ static MemTxResult access_with_adjusted_size(hwaddr addr,
>                           access_mask, attrs);
>           }
>       }
> +    if (dev) {
> +        dev->mem_reentrancy_guard.engaged_in_io = false;
> +    }
>       return r;
>   }
Alexander Bulekov March 13, 2023, 9:15 a.m. UTC | #2
On 230313 0945, Philippe Mathieu-Daudé wrote:
> Hi Alex,
> 
> Sorry for the late review, *sigh*.
> 
> On 13/3/23 09:24, Alexander Bulekov wrote:
> > Add a flag to the DeviceState, when a device is engaged in PIO/MMIO/DMA.
> > This flag is set/checked prior to calling a device's MemoryRegion
> > handlers, and set when device code initiates DMA.  The purpose of this
> > flag is to prevent two types of DMA-based reentrancy issues:
> > 
> > 1.) mmio -> dma -> mmio case
> > 2.) bh -> dma write -> mmio case
> > 
> > These issues have led to problems such as stack-exhaustion and
> > use-after-frees.
> > 
> > Summary of the problem from Peter Maydell:
> > https://lore.kernel.org/qemu-devel/CAFEAcA_23vc7hE3iaM-JVA6W38LK4hJoWae5KcknhPRD5fPBZA@mail.gmail.com
> > 
> > Resolves: https://gitlab.com/qemu-project/qemu/-/issues/62
> > Resolves: https://gitlab.com/qemu-project/qemu/-/issues/540
> > Resolves: https://gitlab.com/qemu-project/qemu/-/issues/541
> > Resolves: https://gitlab.com/qemu-project/qemu/-/issues/556
> > Resolves: https://gitlab.com/qemu-project/qemu/-/issues/557
> > Resolves: https://gitlab.com/qemu-project/qemu/-/issues/827
> > Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1282
> > 
> > Reviewed-by: Darren Kenny <darren.kenny@oracle.com>
> > Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
> > Signed-off-by: Alexander Bulekov <alxndr@bu.edu>
> > Acked-by: Peter Xu <peterx@redhat.com>
> > ---
> >   include/hw/qdev-core.h |  7 +++++++
> >   softmmu/memory.c       | 17 +++++++++++++++++
> >   softmmu/trace-events   |  1 +
> >   3 files changed, 25 insertions(+)
> > 
> > diff --git a/include/hw/qdev-core.h b/include/hw/qdev-core.h
> > index bd50ad5ee1..7623703943 100644
> > --- a/include/hw/qdev-core.h
> > +++ b/include/hw/qdev-core.h
> > @@ -162,6 +162,10 @@ struct NamedClockList {
> >       QLIST_ENTRY(NamedClockList) node;
> >   };
> > +typedef struct {
> > +    bool engaged_in_io;
> 
> Do you plan to add more fields?

Not right now, but maybe some need will come up.

> > +} MemReentrancyGuard;
> > +
> >   /**
> >    * DeviceState:
> >    * @realized: Indicates whether the device has been fully constructed.
> > @@ -194,6 +198,9 @@ struct DeviceState {
> >       int alias_required_for_version;
> >       ResettableState reset;
> >       GSList *unplug_blockers;
> > +
> > +    /* Is the device currently in mmio/pio/dma? Used to prevent re-entrancy */
> > +    MemReentrancyGuard mem_reentrancy_guard;
> 
> At this point I'm not sure anymore this is a device or MR property.

It's designed to be an MR property. If it were MR specific, it wouldn't
handle the BH -> DMA case, or this one, where there are two MRs (doorbell
and oper) involved.
https://gitlab.com/qemu-project/qemu/-/issues/540

> 
> >   };
> >   struct DeviceListener {
> > diff --git a/softmmu/memory.c b/softmmu/memory.c
> > index 4699ba55ec..57bf18a257 100644
> > --- a/softmmu/memory.c
> > +++ b/softmmu/memory.c
> > @@ -533,6 +533,7 @@ static MemTxResult access_with_adjusted_size(hwaddr addr,
> >       uint64_t access_mask;
> >       unsigned access_size;
> >       unsigned i;
> > +    DeviceState *dev = NULL;
> >       MemTxResult r = MEMTX_OK;
> >       if (!access_size_min) {
> > @@ -542,6 +543,19 @@ static MemTxResult access_with_adjusted_size(hwaddr addr,
> >           access_size_max = 4;
> >       }
> > +    /* Do not allow more than one simultanous access to a device's IO Regions */
> 
> Typo "simultaneous".
> 
> 1/ access_with_adjusted_size() is complex enough and we are having hard
>    time getting it right. I'd prefer we don't intermix size adjustment
>    and re-entrancy check in the same function. This check could belong
>    to the callers.
> 

Would moving the code within this function to keep it separate from the
size adjustment be good enough? Otherwise we would end up with duplicate
code in the read/write callers.

The size-adjustment seems to be orthogonal (the MR won't change)?

> 2/ I'm not keen on calling QOM object_dynamic_cast() in this hot path;
>    and mixing QDev API within MR one. At least, can we cache this value
>    once in memory_region_do_init() since we have access to @owner?
>

Sounds like a good idea. Is it ever possible for the owner/owner's
address to change? 

Thanks
-Alex

> > +    if (mr->owner &&
> > +        !mr->ram_device && !mr->ram && !mr->rom_device && !mr->readonly) {
> > +        dev = (DeviceState *) object_dynamic_cast(mr->owner, TYPE_DEVICE);
> > +        if (dev) {
> > +            if (dev->mem_reentrancy_guard.engaged_in_io) {
> > +                trace_memory_region_reentrant_io(get_cpu_index(), mr, addr, size);
> > +                return MEMTX_ERROR;
> 
> MEMTX_ERROR is device-specific, I'm not sure it is right to return it
> from this generic path. Maybe you meant MEMTX_ACCESS_ERROR?
> 
> > +            }
> > +            dev->mem_reentrancy_guard.engaged_in_io = true;
> > +        }
> > +    }
> > +
> >       /* FIXME: support unaligned access? */
> >       access_size = MAX(MIN(size, access_size_max), access_size_min);
> >       access_mask = MAKE_64BIT_MASK(0, access_size * 8);
> > @@ -556,6 +570,9 @@ static MemTxResult access_with_adjusted_size(hwaddr addr,
> >                           access_mask, attrs);
> >           }
> >       }
> > +    if (dev) {
> > +        dev->mem_reentrancy_guard.engaged_in_io = false;
> > +    }
> >       return r;
> >   }
>
Alexander Bulekov March 13, 2023, 9:16 a.m. UTC | #3
On 230313 0515, Alexander Bulekov wrote:
> > 
> > At this point I'm not sure anymore this is a device or MR property.
> 
> It's designed to be an MR property. If it were MR specific, it wouldn't

Should be "It's designed to be a Device property."
Philippe Mathieu-Daudé March 13, 2023, 10:06 a.m. UTC | #4
On 13/3/23 09:24, Alexander Bulekov wrote:
> Add a flag to the DeviceState, when a device is engaged in PIO/MMIO/DMA.
> This flag is set/checked prior to calling a device's MemoryRegion
> handlers, and set when device code initiates DMA.  The purpose of this
> flag is to prevent two types of DMA-based reentrancy issues:
> 
> 1.) mmio -> dma -> mmio case
> 2.) bh -> dma write -> mmio case
> 
> These issues have led to problems such as stack-exhaustion and
> use-after-frees.
> 
> Summary of the problem from Peter Maydell:
> https://lore.kernel.org/qemu-devel/CAFEAcA_23vc7hE3iaM-JVA6W38LK4hJoWae5KcknhPRD5fPBZA@mail.gmail.com
> 
> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/62
> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/540
> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/541
> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/556
> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/557
> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/827
> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1282

BTW we need to commit these reproducers as tests/qtest/fuzz-*.

> Reviewed-by: Darren Kenny <darren.kenny@oracle.com>
> Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
> Signed-off-by: Alexander Bulekov <alxndr@bu.edu>
> Acked-by: Peter Xu <peterx@redhat.com>
> ---
>   include/hw/qdev-core.h |  7 +++++++
>   softmmu/memory.c       | 17 +++++++++++++++++
>   softmmu/trace-events   |  1 +
>   3 files changed, 25 insertions(+)
diff mbox series

Patch

diff --git a/include/hw/qdev-core.h b/include/hw/qdev-core.h
index bd50ad5ee1..7623703943 100644
--- a/include/hw/qdev-core.h
+++ b/include/hw/qdev-core.h
@@ -162,6 +162,10 @@  struct NamedClockList {
     QLIST_ENTRY(NamedClockList) node;
 };
 
+typedef struct {
+    bool engaged_in_io;
+} MemReentrancyGuard;
+
 /**
  * DeviceState:
  * @realized: Indicates whether the device has been fully constructed.
@@ -194,6 +198,9 @@  struct DeviceState {
     int alias_required_for_version;
     ResettableState reset;
     GSList *unplug_blockers;
+
+    /* Is the device currently in mmio/pio/dma? Used to prevent re-entrancy */
+    MemReentrancyGuard mem_reentrancy_guard;
 };
 
 struct DeviceListener {
diff --git a/softmmu/memory.c b/softmmu/memory.c
index 4699ba55ec..57bf18a257 100644
--- a/softmmu/memory.c
+++ b/softmmu/memory.c
@@ -533,6 +533,7 @@  static MemTxResult access_with_adjusted_size(hwaddr addr,
     uint64_t access_mask;
     unsigned access_size;
     unsigned i;
+    DeviceState *dev = NULL;
     MemTxResult r = MEMTX_OK;
 
     if (!access_size_min) {
@@ -542,6 +543,19 @@  static MemTxResult access_with_adjusted_size(hwaddr addr,
         access_size_max = 4;
     }
 
+    /* Do not allow more than one simultaneous access to a device's IO Regions */
+    if (mr->owner &&
+        !mr->ram_device && !mr->ram && !mr->rom_device && !mr->readonly) {
+        dev = (DeviceState *) object_dynamic_cast(mr->owner, TYPE_DEVICE);
+        if (dev) {
+            if (dev->mem_reentrancy_guard.engaged_in_io) {
+                trace_memory_region_reentrant_io(get_cpu_index(), mr, addr, size);
+                return MEMTX_ERROR;
+            }
+            dev->mem_reentrancy_guard.engaged_in_io = true;
+        }
+    }
+
     /* FIXME: support unaligned access? */
     access_size = MAX(MIN(size, access_size_max), access_size_min);
     access_mask = MAKE_64BIT_MASK(0, access_size * 8);
@@ -556,6 +570,9 @@  static MemTxResult access_with_adjusted_size(hwaddr addr,
                         access_mask, attrs);
         }
     }
+    if (dev) {
+        dev->mem_reentrancy_guard.engaged_in_io = false;
+    }
     return r;
 }
 
diff --git a/softmmu/trace-events b/softmmu/trace-events
index 22606dc27b..62d04ea9a7 100644
--- a/softmmu/trace-events
+++ b/softmmu/trace-events
@@ -13,6 +13,7 @@  memory_region_ops_read(int cpu_index, void *mr, uint64_t addr, uint64_t value, u
 memory_region_ops_write(int cpu_index, void *mr, uint64_t addr, uint64_t value, unsigned size, const char *name) "cpu %d mr %p addr 0x%"PRIx64" value 0x%"PRIx64" size %u name '%s'"
 memory_region_subpage_read(int cpu_index, void *mr, uint64_t offset, uint64_t value, unsigned size) "cpu %d mr %p offset 0x%"PRIx64" value 0x%"PRIx64" size %u"
 memory_region_subpage_write(int cpu_index, void *mr, uint64_t offset, uint64_t value, unsigned size) "cpu %d mr %p offset 0x%"PRIx64" value 0x%"PRIx64" size %u"
+memory_region_reentrant_io(int cpu_index, void *mr, uint64_t offset, unsigned size) "cpu %d mr %p offset 0x%"PRIx64" size %u"
 memory_region_ram_device_read(int cpu_index, void *mr, uint64_t addr, uint64_t value, unsigned size) "cpu %d mr %p addr 0x%"PRIx64" value 0x%"PRIx64" size %u"
 memory_region_ram_device_write(int cpu_index, void *mr, uint64_t addr, uint64_t value, unsigned size) "cpu %d mr %p addr 0x%"PRIx64" value 0x%"PRIx64" size %u"
 memory_region_sync_dirty(const char *mr, const char *listener, int global) "mr '%s' listener '%s' synced (global=%d)"