[PULL,02/32] memory: RCU ram_list.dirty_memory[] for safe RAM hotplug

Message ID 1455020031-8268-2-git-send-email-pbonzini@redhat.com (mailing list archive)
State New, archived

Commit Message

Paolo Bonzini Feb. 9, 2016, 12:13 p.m. UTC
From: Stefan Hajnoczi <stefanha@redhat.com>

Although accesses to ram_list.dirty_memory[] use atomics so multiple
threads can safely dirty the bitmap, the data structure is not fully
thread-safe yet.

This patch handles the RAM hotplug case where ram_list.dirty_memory[] is
grown.  ram_list.dirty_memory[] is changed from a regular bitmap to an
RCU array of pointers to fixed-size bitmap blocks.  Threads can continue
accessing bitmap blocks while the array is being extended.  See the
comments in the code for an in-depth explanation of struct
DirtyMemoryBlocks.

I have tested that live migration with virtio-blk dataplane works.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-Id: <1453728801-5398-2-git-send-email-stefanha@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 exec.c                  |  75 +++++++++++++++----
 include/exec/ram_addr.h | 189 ++++++++++++++++++++++++++++++++++++++++++------
 migration/ram.c         |   4 -
 3 files changed, 225 insertions(+), 43 deletions(-)
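
A minimal standalone sketch of the block-walking pattern that the converted
helpers use (only DIRTY_MEMORY_BLOCK_SIZE is taken from the patch; everything
else here is illustrative and not QEMU code):

#include <stdio.h>

#define DIRTY_MEMORY_BLOCK_SIZE ((unsigned long)256 * 1024 * 8)

/* Walk pages [page, end) one fixed-size block at a time, as the converted
 * helpers do. */
static void walk_range(unsigned long page, unsigned long end)
{
    while (page < end) {
        unsigned long idx    = page / DIRTY_MEMORY_BLOCK_SIZE;
        unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
        unsigned long left   = DIRTY_MEMORY_BLOCK_SIZE - offset;
        unsigned long num    = (end - page < left) ? end - page : left;

        /* In QEMU this is where blocks->blocks[idx] is read or written,
         * between rcu_read_lock() and rcu_read_unlock(). */
        printf("block %lu: bits [%lu, %lu)\n", idx, offset, offset + num);
        page += num;
    }
}

int main(void)
{
    /* A range straddling the boundary between blocks 0 and 1. */
    walk_range(DIRTY_MEMORY_BLOCK_SIZE - 4, DIRTY_MEMORY_BLOCK_SIZE + 4);
    return 0;
}

With 4 KiB target pages each block covers 2^21 pages, i.e. 8 GiB of guest RAM,
so in practice the loop rarely crosses a block boundary.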

Comments

Leon Alrae Feb. 10, 2016, 4:56 p.m. UTC | #1
Hi,

I just noticed a significant performance hit with this change. Booting
a small system (I tried system-mode MIPS only) used to take around 20
seconds, but now reaches 3 minutes with this change.

Leon

On 09/02/16 12:13, Paolo Bonzini wrote:
> From: Stefan Hajnoczi <stefanha@redhat.com>
> 
> Although accesses to ram_list.dirty_memory[] use atomics so multiple
> threads can safely dirty the bitmap, the data structure is not fully
> thread-safe yet.
> 
> This patch handles the RAM hotplug case where ram_list.dirty_memory[] is
> grown.  ram_list.dirty_memory[] is changed from a regular bitmap to an
> RCU array of pointers to fixed-size bitmap blocks.  Threads can continue
> accessing bitmap blocks while the array is being extended.  See the
> comments in the code for an in-depth explanation of struct
> DirtyMemoryBlocks.
> 
> I have tested that live migration with virtio-blk dataplane works.
> 
> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> Message-Id: <1453728801-5398-2-git-send-email-stefanha@redhat.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  exec.c                  |  75 +++++++++++++++----
>  include/exec/ram_addr.h | 189 ++++++++++++++++++++++++++++++++++++++++++------
>  migration/ram.c         |   4 -
>  3 files changed, 225 insertions(+), 43 deletions(-)
> 
Paolo Bonzini Feb. 10, 2016, 5:01 p.m. UTC | #2
On 10/02/2016 17:56, Leon Alrae wrote:
> Hi,
> 
> I just noticed a significant performance hit with this change. Booting
> a small system (I tried system-mode MIPS only) used to take around 20
> seconds, but now reaches 3 minutes with this change.

You're lucky that it booted at all. :)  Unfortunately I noticed a slow
boot yesterday (6 minutes for Fedora 21 on TCG), but I blamed it on
running on top of TCG _and_ NFS.  In retrospect that was not too smart.

Today I tried FreeDOS and it hung completely, but my self-NAK and
Peter's push unfortunately crossed.

I've posted a fix and Stefan has already reviewed it.  If you can test
it, that would be great.  The above Fedora 21 image takes about 50
seconds to boot with the patch, so that's in line with the 9x slowdown
you reported.

http://article.gmane.org/gmane.comp.emulators.qemu/393105/raw

Paolo

Patch

diff --git a/exec.c b/exec.c
index ab37360..7d67c11 100644
--- a/exec.c
+++ b/exec.c
@@ -980,8 +980,9 @@  bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start,
                                               ram_addr_t length,
                                               unsigned client)
 {
+    DirtyMemoryBlocks *blocks;
     unsigned long end, page;
-    bool dirty;
+    bool dirty = false;
 
     if (length == 0) {
         return false;
@@ -989,8 +990,22 @@  bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start,
 
     end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
     page = start >> TARGET_PAGE_BITS;
-    dirty = bitmap_test_and_clear_atomic(ram_list.dirty_memory[client],
-                                         page, end - page);
+
+    rcu_read_lock();
+
+    blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
+
+    while (page < end) {
+        unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
+        unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
+        unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);
+
+        dirty |= bitmap_test_and_clear_atomic(blocks->blocks[idx],
+                                              offset, num);
+        page += num;
+    }
+
+    rcu_read_unlock();
 
     if (dirty && tcg_enabled()) {
         tlb_reset_dirty_range_all(start, length);
@@ -1504,6 +1519,47 @@  int qemu_ram_resize(ram_addr_t base, ram_addr_t newsize, Error **errp)
     return 0;
 }
 
+/* Called with ram_list.mutex held */
+static void dirty_memory_extend(ram_addr_t old_ram_size,
+                                ram_addr_t new_ram_size)
+{
+    ram_addr_t old_num_blocks = DIV_ROUND_UP(old_ram_size,
+                                             DIRTY_MEMORY_BLOCK_SIZE);
+    ram_addr_t new_num_blocks = DIV_ROUND_UP(new_ram_size,
+                                             DIRTY_MEMORY_BLOCK_SIZE);
+    int i;
+
+    /* Only need to extend if block count increased */
+    if (new_num_blocks <= old_num_blocks) {
+        return;
+    }
+
+    for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
+        DirtyMemoryBlocks *old_blocks;
+        DirtyMemoryBlocks *new_blocks;
+        int j;
+
+        old_blocks = atomic_rcu_read(&ram_list.dirty_memory[i]);
+        new_blocks = g_malloc(sizeof(*new_blocks) +
+                              sizeof(new_blocks->blocks[0]) * new_num_blocks);
+
+        if (old_num_blocks) {
+            memcpy(new_blocks->blocks, old_blocks->blocks,
+                   old_num_blocks * sizeof(old_blocks->blocks[0]));
+        }
+
+        for (j = old_num_blocks; j < new_num_blocks; j++) {
+            new_blocks->blocks[j] = bitmap_new(DIRTY_MEMORY_BLOCK_SIZE);
+        }
+
+        atomic_rcu_set(&ram_list.dirty_memory[i], new_blocks);
+
+        if (old_blocks) {
+            g_free_rcu(old_blocks, rcu);
+        }
+    }
+}
+
 static ram_addr_t ram_block_add(RAMBlock *new_block, Error **errp)
 {
     RAMBlock *block;
@@ -1543,6 +1599,7 @@  static ram_addr_t ram_block_add(RAMBlock *new_block, Error **errp)
               (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS);
     if (new_ram_size > old_ram_size) {
         migration_bitmap_extend(old_ram_size, new_ram_size);
+        dirty_memory_extend(old_ram_size, new_ram_size);
     }
     /* Keep the list sorted from biggest to smallest block.  Unlike QTAILQ,
      * QLIST (which has an RCU-friendly variant) does not have insertion at
@@ -1568,18 +1625,6 @@  static ram_addr_t ram_block_add(RAMBlock *new_block, Error **errp)
     ram_list.version++;
     qemu_mutex_unlock_ramlist();
 
-    new_ram_size = last_ram_offset() >> TARGET_PAGE_BITS;
-
-    if (new_ram_size > old_ram_size) {
-        int i;
-
-        /* ram_list.dirty_memory[] is protected by the iothread lock.  */
-        for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
-            ram_list.dirty_memory[i] =
-                bitmap_zero_extend(ram_list.dirty_memory[i],
-                                   old_ram_size, new_ram_size);
-       }
-    }
     cpu_physical_memory_set_dirty_range(new_block->offset,
                                         new_block->used_length,
                                         DIRTY_CLIENTS_ALL);
diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h
index f2e872d..b1413a1 100644
--- a/include/exec/ram_addr.h
+++ b/include/exec/ram_addr.h
@@ -49,13 +49,43 @@  static inline void *ramblock_ptr(RAMBlock *block, ram_addr_t offset)
     return (char *)block->host + offset;
 }
 
+/* The dirty memory bitmap is split into fixed-size blocks to allow growth
+ * under RCU.  The bitmap for a block can be accessed as follows:
+ *
+ *   rcu_read_lock();
+ *
+ *   DirtyMemoryBlocks *blocks =
+ *       atomic_rcu_read(&ram_list.dirty_memory[DIRTY_MEMORY_MIGRATION]);
+ *
+ *   ram_addr_t idx = (addr >> TARGET_PAGE_BITS) / DIRTY_MEMORY_BLOCK_SIZE;
+ *   unsigned long *block = blocks.blocks[idx];
+ *   ...access block bitmap...
+ *
+ *   rcu_read_unlock();
+ *
+ * Remember to check for the end of the block when accessing a range of
+ * addresses.  Move on to the next block if you reach the end.
+ *
+ * Organization into blocks allows dirty memory to grow (but not shrink) under
+ * RCU.  When adding new RAMBlocks requires the dirty memory to grow, a new
+ * DirtyMemoryBlocks array is allocated with pointers to existing blocks kept
+ * the same.  Other threads can safely access existing blocks while dirty
+ * memory is being grown.  When no threads are using the old DirtyMemoryBlocks
+ * anymore it is freed by RCU (but the underlying blocks stay because they are
+ * pointed to from the new DirtyMemoryBlocks).
+ */
+#define DIRTY_MEMORY_BLOCK_SIZE ((ram_addr_t)256 * 1024 * 8)
+typedef struct {
+    struct rcu_head rcu;
+    unsigned long *blocks[];
+} DirtyMemoryBlocks;
+
 typedef struct RAMList {
     QemuMutex mutex;
-    /* Protected by the iothread lock.  */
-    unsigned long *dirty_memory[DIRTY_MEMORY_NUM];
     RAMBlock *mru_block;
     /* RCU-enabled, writes protected by the ramlist lock. */
     QLIST_HEAD(, RAMBlock) blocks;
+    DirtyMemoryBlocks *dirty_memory[DIRTY_MEMORY_NUM];
     uint32_t version;
 } RAMList;
 extern RAMList ram_list;
@@ -89,30 +119,70 @@  static inline bool cpu_physical_memory_get_dirty(ram_addr_t start,
                                                  ram_addr_t length,
                                                  unsigned client)
 {
-    unsigned long end, page, next;
+    DirtyMemoryBlocks *blocks;
+    unsigned long end, page;
+    bool dirty = false;
 
     assert(client < DIRTY_MEMORY_NUM);
 
     end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
     page = start >> TARGET_PAGE_BITS;
-    next = find_next_bit(ram_list.dirty_memory[client], end, page);
 
-    return next < end;
+    rcu_read_lock();
+
+    blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
+
+    while (page < end) {
+        unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
+        unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
+        unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);
+
+        if (find_next_bit(blocks->blocks[idx], offset, num) < num) {
+            dirty = true;
+            break;
+        }
+
+        page += num;
+    }
+
+    rcu_read_unlock();
+
+    return dirty;
 }
 
 static inline bool cpu_physical_memory_all_dirty(ram_addr_t start,
                                                  ram_addr_t length,
                                                  unsigned client)
 {
-    unsigned long end, page, next;
+    DirtyMemoryBlocks *blocks;
+    unsigned long end, page;
+    bool dirty = true;
 
     assert(client < DIRTY_MEMORY_NUM);
 
     end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
     page = start >> TARGET_PAGE_BITS;
-    next = find_next_zero_bit(ram_list.dirty_memory[client], end, page);
 
-    return next >= end;
+    rcu_read_lock();
+
+    blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
+
+    while (page < end) {
+        unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
+        unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
+        unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);
+
+        if (find_next_zero_bit(blocks->blocks[idx], offset, num) < num) {
+            dirty = false;
+            break;
+        }
+
+        page += num;
+    }
+
+    rcu_read_unlock();
+
+    return dirty;
 }
 
 static inline bool cpu_physical_memory_get_dirty_flag(ram_addr_t addr,
@@ -154,16 +224,31 @@  static inline uint8_t cpu_physical_memory_range_includes_clean(ram_addr_t start,
 static inline void cpu_physical_memory_set_dirty_flag(ram_addr_t addr,
                                                       unsigned client)
 {
+    unsigned long page, idx, offset;
+    DirtyMemoryBlocks *blocks;
+
     assert(client < DIRTY_MEMORY_NUM);
-    set_bit_atomic(addr >> TARGET_PAGE_BITS, ram_list.dirty_memory[client]);
+
+    page = addr >> TARGET_PAGE_BITS;
+    idx = page / DIRTY_MEMORY_BLOCK_SIZE;
+    offset = page % DIRTY_MEMORY_BLOCK_SIZE;
+
+    rcu_read_lock();
+
+    blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
+
+    set_bit_atomic(offset, blocks->blocks[idx]);
+
+    rcu_read_unlock();
 }
 
 static inline void cpu_physical_memory_set_dirty_range(ram_addr_t start,
                                                        ram_addr_t length,
                                                        uint8_t mask)
 {
+    DirtyMemoryBlocks *blocks[DIRTY_MEMORY_NUM];
     unsigned long end, page;
-    unsigned long **d = ram_list.dirty_memory;
+    int i;
 
     if (!mask && !xen_enabled()) {
         return;
@@ -171,15 +256,36 @@  static inline void cpu_physical_memory_set_dirty_range(ram_addr_t start,
 
     end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
     page = start >> TARGET_PAGE_BITS;
-    if (likely(mask & (1 << DIRTY_MEMORY_MIGRATION))) {
-        bitmap_set_atomic(d[DIRTY_MEMORY_MIGRATION], page, end - page);
-    }
-    if (unlikely(mask & (1 << DIRTY_MEMORY_VGA))) {
-        bitmap_set_atomic(d[DIRTY_MEMORY_VGA], page, end - page);
+
+    rcu_read_lock();
+
+    for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
+        blocks[i] = atomic_rcu_read(&ram_list.dirty_memory[i]);
     }
-    if (unlikely(mask & (1 << DIRTY_MEMORY_CODE))) {
-        bitmap_set_atomic(d[DIRTY_MEMORY_CODE], page, end - page);
+
+    while (page < end) {
+        unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
+        unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
+        unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);
+
+        if (likely(mask & (1 << DIRTY_MEMORY_MIGRATION))) {
+            bitmap_set_atomic(blocks[DIRTY_MEMORY_MIGRATION]->blocks[idx],
+                              offset, num);
+        }
+        if (unlikely(mask & (1 << DIRTY_MEMORY_VGA))) {
+            bitmap_set_atomic(blocks[DIRTY_MEMORY_VGA]->blocks[idx],
+                              offset, num);
+        }
+        if (unlikely(mask & (1 << DIRTY_MEMORY_CODE))) {
+            bitmap_set_atomic(blocks[DIRTY_MEMORY_CODE]->blocks[idx],
+                              offset, num);
+        }
+
+        page += num;
     }
+
+    rcu_read_unlock();
+
     xen_modified_memory(start, length);
 }
 
@@ -199,21 +305,41 @@  static inline void cpu_physical_memory_set_dirty_lebitmap(unsigned long *bitmap,
     /* start address is aligned at the start of a word? */
     if ((((page * BITS_PER_LONG) << TARGET_PAGE_BITS) == start) &&
         (hpratio == 1)) {
+        unsigned long **blocks[DIRTY_MEMORY_NUM];
+        unsigned long idx;
+        unsigned long offset;
         long k;
         long nr = BITS_TO_LONGS(pages);
 
+        idx = (start >> TARGET_PAGE_BITS) / DIRTY_MEMORY_BLOCK_SIZE;
+        offset = BIT_WORD((start >> TARGET_PAGE_BITS) %
+                          DIRTY_MEMORY_BLOCK_SIZE);
+
+        rcu_read_lock();
+
+        for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
+            blocks[i] = atomic_rcu_read(&ram_list.dirty_memory[i])->blocks;
+        }
+
         for (k = 0; k < nr; k++) {
             if (bitmap[k]) {
                 unsigned long temp = leul_to_cpu(bitmap[k]);
-                unsigned long **d = ram_list.dirty_memory;
 
-                atomic_or(&d[DIRTY_MEMORY_MIGRATION][page + k], temp);
-                atomic_or(&d[DIRTY_MEMORY_VGA][page + k], temp);
+                atomic_or(&blocks[DIRTY_MEMORY_MIGRATION][idx][offset], temp);
+                atomic_or(&blocks[DIRTY_MEMORY_VGA][idx][offset], temp);
                 if (tcg_enabled()) {
-                    atomic_or(&d[DIRTY_MEMORY_CODE][page + k], temp);
+                    atomic_or(&blocks[DIRTY_MEMORY_CODE][idx][offset], temp);
                 }
             }
+
+            if (++offset >= BITS_TO_LONGS(DIRTY_MEMORY_BLOCK_SIZE)) {
+                offset = 0;
+                idx++;
+            }
         }
+
+        rcu_read_unlock();
+
         xen_modified_memory(start, pages << TARGET_PAGE_BITS);
     } else {
         uint8_t clients = tcg_enabled() ? DIRTY_CLIENTS_ALL : DIRTY_CLIENTS_NOCODE;
@@ -265,18 +391,33 @@  uint64_t cpu_physical_memory_sync_dirty_bitmap(unsigned long *dest,
     if (((page * BITS_PER_LONG) << TARGET_PAGE_BITS) == start) {
         int k;
         int nr = BITS_TO_LONGS(length >> TARGET_PAGE_BITS);
-        unsigned long *src = ram_list.dirty_memory[DIRTY_MEMORY_MIGRATION];
+        unsigned long * const *src;
+        unsigned long idx = (page * BITS_PER_LONG) / DIRTY_MEMORY_BLOCK_SIZE;
+        unsigned long offset = BIT_WORD((page * BITS_PER_LONG) %
+                                        DIRTY_MEMORY_BLOCK_SIZE);
+
+        rcu_read_lock();
+
+        src = atomic_rcu_read(
+                &ram_list.dirty_memory[DIRTY_MEMORY_MIGRATION])->blocks;
 
         for (k = page; k < page + nr; k++) {
-            if (src[k]) {
-                unsigned long bits = atomic_xchg(&src[k], 0);
+            if (src[idx][offset]) {
+                unsigned long bits = atomic_xchg(&src[idx][offset], 0);
                 unsigned long new_dirty;
                 new_dirty = ~dest[k];
                 dest[k] |= bits;
                 new_dirty &= bits;
                 num_dirty += ctpopl(new_dirty);
             }
+
+            if (++offset >= BITS_TO_LONGS(DIRTY_MEMORY_BLOCK_SIZE)) {
+                offset = 0;
+                idx++;
+            }
         }
+
+        rcu_read_unlock();
     } else {
         for (addr = 0; addr < length; addr += TARGET_PAGE_SIZE) {
             if (cpu_physical_memory_test_and_clear_dirty(
diff --git a/migration/ram.c b/migration/ram.c
index 3cdfea4..96c749f 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -609,7 +609,6 @@  static void migration_bitmap_sync_init(void)
     iterations_prev = 0;
 }
 
-/* Called with iothread lock held, to protect ram_list.dirty_memory[] */
 static void migration_bitmap_sync(void)
 {
     RAMBlock *block;
@@ -1921,8 +1920,6 @@  static int ram_save_setup(QEMUFile *f, void *opaque)
         acct_clear();
     }
 
-    /* iothread lock needed for ram_list.dirty_memory[] */
-    qemu_mutex_lock_iothread();
     qemu_mutex_lock_ramlist();
     rcu_read_lock();
     bytes_transferred = 0;
@@ -1947,7 +1944,6 @@  static int ram_save_setup(QEMUFile *f, void *opaque)
     memory_global_dirty_log_start();
     migration_bitmap_sync();
     qemu_mutex_unlock_ramlist();
-    qemu_mutex_unlock_iothread();
 
     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
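
A standalone sketch of the grow-only pattern that dirty_memory_extend()
implements, with plain malloc/calloc/free standing in for QEMU's bitmap_new(),
atomic_rcu_set() and g_free_rcu(); it shows only the shape of the operation,
not real RCU publication:

#include <stdlib.h>
#include <string.h>

typedef struct {
    size_t num_blocks;
    unsigned long *blocks[];        /* one fixed-size bitmap per entry */
} Blocks;

#define BITS_PER_BLOCK  (256UL * 1024 * 8)
#define LONGS_PER_BLOCK (BITS_PER_BLOCK / (8 * sizeof(unsigned long)))

static Blocks *extend(Blocks *old, size_t new_num_blocks)
{
    size_t old_num = old ? old->num_blocks : 0;
    Blocks *new_blocks;

    if (new_num_blocks <= old_num) {
        return old;                 /* nothing to grow */
    }

    new_blocks = malloc(sizeof(*new_blocks) +
                        new_num_blocks * sizeof(new_blocks->blocks[0]));
    new_blocks->num_blocks = new_num_blocks;

    /* Existing block pointers are carried over unchanged... */
    if (old_num) {
        memcpy(new_blocks->blocks, old->blocks,
               old_num * sizeof(old->blocks[0]));
    }
    /* ...and only the new tail is allocated. */
    for (size_t j = old_num; j < new_num_blocks; j++) {
        new_blocks->blocks[j] = calloc(LONGS_PER_BLOCK, sizeof(unsigned long));
    }

    /* In QEMU the swap is atomic_rcu_set() and the old array is reclaimed
     * with g_free_rcu() once all readers are done; a plain free() here. */
    free(old);
    return new_blocks;
}

int main(void)
{
    Blocks *b = extend(NULL, 1);
    b = extend(b, 3);               /* grow: blocks[0] is reused as-is */
    for (size_t j = 0; j < b->num_blocks; j++) {
        free(b->blocks[j]);
    }
    free(b);
    return 0;
}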