@@ -34,6 +34,7 @@
#include "qemu/hbitmap.h"
#include "block/snapshot.h"
#include "qemu/throttle.h"
+#include "qemu/rcu.h"
#define BLOCK_FLAG_LAZY_REFCOUNTS 8
@@ -839,6 +840,24 @@ struct BdrvChild {
QLIST_ENTRY(BdrvChild) next_parent;
};
+/*
+ * Allows bdrv_co_block_status() to cache one data region for a
+ * protocol node.
+ *
+ * @valid: Whether the cache is valid (should be accessed with atomic
+ * functions so this can be reset by RCU readers)
+ * @data_start: Offset at which the cached data region is known (or
+ * strongly assumed) to start
+ * @data_end: Offset where the data region ends (which is not
+ * necessarily the start of a zeroed region)
+ */
+typedef struct BdrvBlockStatusCache {
+ struct rcu_head rcu;
+
+ bool valid;
+ int64_t data_start;
+ int64_t data_end;
+} BdrvBlockStatusCache;
+
struct BlockDriverState {
/* Protected by big QEMU lock or read-only after opening. No special
* locking needed during I/O...
@@ -1004,6 +1023,11 @@ struct BlockDriverState {
/* BdrvChild links to this node may never be frozen */
bool never_freeze;
+
+ /* Lock for block-status cache RCU writers */
+ CoMutex bsc_modify_lock;
+ /* Always non-NULL, but must only be dereferenced under an RCU read guard */
+ BdrvBlockStatusCache *block_status_cache;
};
struct BlockBackendRootState {
@@ -1429,4 +1453,30 @@ static inline BlockDriverState *bdrv_primary_bs(BlockDriverState *bs)
*/
void bdrv_drain_all_end_quiesce(BlockDriverState *bs);
+/**
+ * Check whether the given offset is in the cached block-status data
+ * region.
+ *
+ * If it is, and @pnum is not NULL, *pnum is set to
+ * `bsc.data_end - offset`, i.e. how many bytes, starting from
+ * @offset, are data (according to the cache).
+ * Otherwise, *pnum is not touched.
+ */
+bool bdrv_bsc_is_data(BlockDriverState *bs, int64_t offset, int64_t *pnum);
+
+/**
+ * If [offset, offset + bytes) overlaps with the currently cached
+ * block-status region, invalidate the cache.
+ *
+ * (To be used by I/O paths that turn data regions into zeroes or
+ * holes, such as write-zeroes and discard.)
+ */
+void bdrv_bsc_invalidate_range(BlockDriverState *bs,
+ int64_t offset, int64_t bytes);
+
+/**
+ * Mark the range [offset, offset + bytes) as a data region.
+ */
+void bdrv_bsc_fill(BlockDriverState *bs, int64_t offset, int64_t bytes);
+
#endif /* BLOCK_INT_H */
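For orientation, here is a minimal sketch (not part of the patch) of how an I/O path is expected to combine these three primitives: invalidate before zeroing, consult the cache before querying the driver, and fill the cache afterwards. Every name prefixed with example_ is hypothetical, and the actual driver query is elided behind a stand-in declaration:

#include "block/block_int.h"

/* Hypothetical stand-in for a protocol driver's .bdrv_co_block_status() */
static int coroutine_fn example_driver_block_status(BlockDriverState *bs,
                                                    int64_t offset,
                                                    int64_t bytes,
                                                    int64_t *pnum);

static int coroutine_fn example_co_write_zeroes(BlockDriverState *bs,
                                                int64_t offset, int64_t bytes)
{
    /*
     * Paths that may turn data into zeroes or holes drop the cached
     * data region before issuing the actual I/O.
     */
    bdrv_bsc_invalidate_range(bs, offset, bytes);

    /* ... perform the actual zeroing here ... */
    return 0;
}

static int coroutine_fn example_co_block_status(BlockDriverState *bs,
                                                int64_t offset, int64_t bytes,
                                                int64_t *pnum)
{
    int ret;

    if (bdrv_bsc_is_data(bs, offset, pnum)) {
        /* Cache hit: *pnum was set to bsc.data_end - offset */
        return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
    }

    /* Cache miss: query the driver, then remember any data region found */
    ret = example_driver_block_status(bs, offset, bytes, pnum);
    if (ret == (BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID)) {
        bdrv_bsc_fill(bs, offset, *pnum);
    }
    return ret;
}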
@@ -49,6 +49,8 @@
#include "qemu/timer.h"
#include "qemu/cutils.h"
#include "qemu/id.h"
+#include "qemu/range.h"
+#include "qemu/rcu.h"
#include "block/coroutines.h"
#ifdef CONFIG_BSD
@@ -401,6 +403,9 @@ BlockDriverState *bdrv_new(void)
qemu_co_queue_init(&bs->flush_queue);
+ qemu_co_mutex_init(&bs->bsc_modify_lock);
+ bs->block_status_cache = g_new0(BdrvBlockStatusCache, 1);
+
for (i = 0; i < bdrv_drain_all_count; i++) {
bdrv_drained_begin(bs);
}
@@ -4694,6 +4699,8 @@ static void bdrv_close(BlockDriverState *bs)
bs->explicit_options = NULL;
qobject_unref(bs->full_open_options);
bs->full_open_options = NULL;
+ g_free(bs->block_status_cache);
+ bs->block_status_cache = NULL;
bdrv_release_named_dirty_bitmaps(bs);
assert(QLIST_EMPTY(&bs->dirty_bitmaps));
@@ -7684,3 +7691,76 @@ BlockDriverState *bdrv_backing_chain_next(BlockDriverState *bs)
{
return bdrv_skip_filters(bdrv_cow_bs(bdrv_skip_filters(bs)));
}
+
+/**
+ * Check whether [offset, offset + bytes) overlaps with the cached
+ * block-status data region.
+ *
+ * If so, and @pnum is not NULL, set *pnum to `bsc.data_end - offset`,
+ * which is what bdrv_bsc_is_data()'s interface needs.
+ * Otherwise, *pnum is not touched.
+ *
+ * Must be called under an RCU read lock (hence the _locked suffix);
+ * all callers below take RCU_READ_LOCK_GUARD() first.
+ */
+static bool bdrv_bsc_range_overlaps_locked(BlockDriverState *bs,
+ int64_t offset, int64_t bytes,
+ int64_t *pnum)
+{
+ BdrvBlockStatusCache *bsc = qatomic_rcu_read(&bs->block_status_cache);
+ bool overlaps;
+
+ overlaps =
+ qatomic_read(&bsc->valid) &&
+ ranges_overlap(offset, bytes, bsc->data_start,
+ bsc->data_end - bsc->data_start);
+
+ if (overlaps && pnum) {
+ *pnum = bsc->data_end - offset;
+ }
+
+ return overlaps;
+}
+
+/**
+ * See block_int.h for this function's documentation.
+ */
+bool bdrv_bsc_is_data(BlockDriverState *bs, int64_t offset, int64_t *pnum)
+{
+ RCU_READ_LOCK_GUARD();
+
+ return bdrv_bsc_range_overlaps_locked(bs, offset, 1, pnum);
+}
+
+/**
+ * See block_int.h for this function's documentation.
+ */
+void bdrv_bsc_invalidate_range(BlockDriverState *bs,
+ int64_t offset, int64_t bytes)
+{
+ RCU_READ_LOCK_GUARD();
+
+ if (bdrv_bsc_range_overlaps_locked(bs, offset, bytes, NULL)) {
+ qatomic_set(&bs->block_status_cache->valid, false);
+ }
+}
+
+/**
+ * See block_int.h for this function's documentation.
+ */
+void bdrv_bsc_fill(BlockDriverState *bs, int64_t offset, int64_t bytes)
+{
+ BdrvBlockStatusCache *new_bsc = g_new(BdrvBlockStatusCache, 1);
+ BdrvBlockStatusCache *old_bsc;
+
+ *new_bsc = (BdrvBlockStatusCache) {
+ .valid = true,
+ .data_start = offset,
+ .data_end = offset + bytes,
+ };
+
+ QEMU_LOCK_GUARD(&bs->bsc_modify_lock);
+
+ old_bsc = qatomic_rcu_read(&bs->block_status_cache);
+ qatomic_rcu_set(&bs->block_status_cache, new_bsc);
+ if (old_bsc) {
+ g_free_rcu(old_bsc, rcu);
+ }
+}
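bdrv_bsc_fill() above is an instance of the usual RCU copy-and-publish pattern: readers never block, writers serialize only against each other, and the old object is freed once all pre-existing readers have finished. The following generic sketch shows the same shape with the same primitives; the Cache type and the function names are illustrative, not from the patch:

#include "qemu/osdep.h"
#include "qemu/rcu.h"
#include "qemu/thread.h"

typedef struct Cache {
    struct rcu_head rcu;    /* allows deferred freeing via g_free_rcu() */
    int value;
} Cache;

/* Read side: cheap, never blocks; the pointer is valid only within
 * the RCU critical section */
static int cache_read(Cache **slot)
{
    RCU_READ_LOCK_GUARD();
    return qatomic_rcu_read(slot)->value;
}

/* Write side: allocate a new object, swap the pointer, defer the free */
static void cache_publish(Cache **slot, QemuMutex *writer_lock, int value)
{
    Cache *new_c = g_new0(Cache, 1);
    Cache *old_c;

    new_c->value = value;

    qemu_mutex_lock(writer_lock);       /* serializes writers only */
    old_c = qatomic_rcu_read(slot);
    qatomic_rcu_set(slot, new_c);       /* pointer store with barrier */
    qemu_mutex_unlock(writer_lock);

    /* Freed only once all pre-existing RCU readers have finished */
    g_free_rcu(old_c, rcu);
}

bdrv_bsc_fill() takes a CoMutex through QEMU_LOCK_GUARD() instead of a QemuMutex because it runs in coroutine context, but the structure is identical.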
@@ -1883,6 +1883,9 @@ static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
return -ENOTSUP;
}
+ /* Invalidate the cached block-status data range if this write overlaps */
+ bdrv_bsc_invalidate_range(bs, offset, bytes);
+
assert(alignment % bs->bl.request_alignment == 0);
head = offset % alignment;
tail = (offset + bytes) % alignment;
@@ -2447,9 +2450,65 @@ static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs,
aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset;
if (bs->drv->bdrv_co_block_status) {
- ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset,
- aligned_bytes, pnum, &local_map,
- &local_file);
+ /*
+ * Use the block-status cache only for protocol nodes: Format
+ * drivers are generally quick to query the status, but protocol
+ * drivers often need to get information from outside of qemu, so
+ * we do not have control over the actual implementation. There
+ * have been cases where querying the status took an unreasonably
+ * long time, and we can do nothing in qemu to fix it.
+ * This is especially problematic for images with large data areas,
+ * because finding the few holes in them and giving them special
+ * treatment does not gain much performance. Therefore, we try to
+ * cache the last-identified data region.
+ *
+ * Second, limiting ourselves to protocol nodes allows us to assume
+ * the block status for data regions to be DATA | OFFSET_VALID, and
+ * that the host offset is the same as the guest offset.
+ *
+ * Note that it is possible that external writers zero parts of
+ * the cached regions without the cache being invalidated, and so
+ * we may report zeroes as data. This is harmless, because a zeroed
+ * region is still a valid data region; we merely lose an
+ * optimization opportunity. (The reverse, reporting data as zeroes,
+ * would be a correctness problem, but the cache only ever claims
+ * that regions contain data.)
+ */
+ if (QLIST_EMPTY(&bs->children) &&
+ bdrv_bsc_is_data(bs, aligned_offset, pnum))
+ {
+ ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
+ local_file = bs;
+ local_map = aligned_offset;
+ } else {
+ ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset,
+ aligned_bytes, pnum, &local_map,
+ &local_file);
+
+ /*
+ * Note that checking QLIST_EMPTY(&bs->children) is also done when
+ * the cache is queried above. Technically, we do not need to check
+ * it here; the worst that can happen is that we fill the cache for
+ * non-protocol nodes, and then it is never used. However, filling
+ * the cache requires an RCU update, so double check here to avoid
+ * such an update if possible.
+ */
+ if (ret == (BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID) &&
+ QLIST_EMPTY(&bs->children))
+ {
+ /*
+ * When a protocol driver reports BLOCK_OFFSET_VALID, the
+ * returned local_map value must be the same as the offset we
+ * have passed (aligned_offset), and local_file must be the node
+ * itself.
+ * Assert this, because we follow this rule when reading from
+ * the cache (see the `local_file = bs` and
+ * `local_map = aligned_offset` assignments above), and the
+ * result the cache delivers must be the same as the driver
+ * would deliver.
+ */
+ assert(local_file == bs);
+ assert(local_map == aligned_offset);
+ bdrv_bsc_fill(bs, aligned_offset, *pnum);
+ }
+ }
} else {
/* Default code for filters */
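To see the intended effect of this hunk, consider sequential chunked status queries over a large, fully allocated raw image. The sketch below is illustrative, not part of the patch; it assumes the protocol driver reports the full data extent it finds (rather than capping *pnum at the requested length), plus the usual "block/block.h" and "qemu/units.h" headers. Under those assumptions, only the first query reaches the driver; later chunks inside [data_start, data_end) are answered from the cache:

static void example_walk(BlockDriverState *bs, int64_t image_size)
{
    int64_t ofs = 0, pnum, map;
    BlockDriverState *file;

    while (ofs < image_size) {
        int ret = bdrv_block_status(bs, ofs, MIN(1 * MiB, image_size - ofs),
                                    &pnum, &map, &file);
        if (ret < 0 || pnum == 0) {
            break;
        }
        /* On a cache hit: ret = DATA | OFFSET_VALID, file == bs, map == ofs */
        ofs += pnum;
    }
}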
@@ -3002,6 +3061,9 @@ int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset,
return 0;
}
+ /* Invalidate the cached block-status data range if this discard overlaps */
+ bdrv_bsc_invalidate_range(bs, offset, bytes);
+
/* Discard is advisory, but some devices track and coalesce
* unaligned requests, so we must pass everything down rather than
* round here. Still, most devices will just silently ignore
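Finally, a hypothetical sanity check of the primitives' semantics as documented above (illustration only, not part of the patch; assumes a freshly created protocol node with no concurrent I/O, and the MiB macro from "qemu/units.h"):

static void check_bsc_semantics(BlockDriverState *bs)
{
    int64_t pnum = 0;

    /* Cache [1 MiB, 4 MiB) as a data region */
    bdrv_bsc_fill(bs, 1 * MiB, 3 * MiB);

    /* An offset inside the region is data; *pnum = data_end - offset */
    assert(bdrv_bsc_is_data(bs, 2 * MiB, &pnum));
    assert(pnum == 2 * MiB);

    /* A non-overlapping invalidation leaves the cache intact */
    bdrv_bsc_invalidate_range(bs, 8 * MiB, 1 * MiB);
    assert(bdrv_bsc_is_data(bs, 2 * MiB, &pnum));

    /* An overlapping invalidation drops the whole cached region */
    bdrv_bsc_invalidate_range(bs, 3 * MiB, 2 * MiB);
    assert(!bdrv_bsc_is_data(bs, 2 * MiB, &pnum));
}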