
[v3,6/6] block/qcow2: use seqcache for compressed writes

Message ID 20210305173507.393137-7-vsementsov@virtuozzo.com (mailing list archive)
State New, archived
Series qcow2: compressed write cache

Commit Message

Vladimir Sementsov-Ogievskiy March 5, 2021, 5:35 p.m. UTC
Compressed writes are unaligned to 512, which is very slow in
O_DIRECT mode. Let's use the cache.

Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
---
 block/coroutines.h     |   3 +
 block/qcow2.h          |   4 ++
 block/qcow2-refcount.c |  10 +++
 block/qcow2.c          | 158 ++++++++++++++++++++++++++++++++++++++---
 4 files changed, 164 insertions(+), 11 deletions(-)
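
For background on the problem being solved: the Linux kernel rejects O_DIRECT
requests whose offset, length or buffer address is not suitably aligned
(typically to the 512-byte logical block size), so QEMU's generic block layer
has to emulate an unaligned write with a read-modify-write cycle padded to
bs->bl.request_alignment. A minimal standalone sketch of the kernel-side
behaviour (illustration only, not part of the patch):

#define _GNU_SOURCE /* for O_DIRECT */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    void *buf;
    int fd = open("test.img", O_WRONLY | O_CREAT | O_DIRECT, 0600);

    if (fd < 0 || posix_memalign(&buf, 512, 4096) != 0) {
        return 1;
    }
    memset(buf, 0, 4096);

    /* Offset, length and buffer all aligned to 512: accepted. */
    if (pwrite(fd, buf, 4096, 0) < 0) {
        perror("aligned pwrite");
    }

    /* A compressed cluster has an arbitrary byte length; an unaligned
     * length such as 1234 fails with EINVAL under O_DIRECT. */
    if (pwrite(fd, buf, 1234, 4096) < 0) {
        perror("unaligned pwrite"); /* expected: Invalid argument */
    }

    close(fd);
    free(buf);
    return 0;
}

Since nearly every compressed cluster has an odd byte length, almost every
compressed write in O_DIRECT mode takes the read-modify-write path; batching
them in a sequential cache lets qcow2 issue aligned writes instead.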

Comments

Max Reitz March 12, 2021, 6:15 p.m. UTC | #1
On 05.03.21 18:35, Vladimir Sementsov-Ogievskiy wrote:
> Compressed writes are unaligned to 512, which is very slow in
> O_DIRECT mode. Let's use the cache.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
> ---
>   block/coroutines.h     |   3 +
>   block/qcow2.h          |   4 ++
>   block/qcow2-refcount.c |  10 +++
>   block/qcow2.c          | 158 ++++++++++++++++++++++++++++++++++++++---
>   4 files changed, 164 insertions(+), 11 deletions(-)
> 
> diff --git a/block/coroutines.h b/block/coroutines.h
> index 4cfb4946e6..cb8e05830e 100644
> --- a/block/coroutines.h
> +++ b/block/coroutines.h
> @@ -66,4 +66,7 @@ int coroutine_fn bdrv_co_readv_vmstate(BlockDriverState *bs,
>   int coroutine_fn bdrv_co_writev_vmstate(BlockDriverState *bs,
>                                           QEMUIOVector *qiov, int64_t pos);
>   
> +int coroutine_fn qcow2_co_flush_compressed_cache(BlockDriverState *bs);
> +int generated_co_wrapper qcow2_flush_compressed_cache(BlockDriverState *bs);
> +
>   #endif /* BLOCK_COROUTINES_INT_H */
> diff --git a/block/qcow2.h b/block/qcow2.h
> index e18d8dfbae..8b8fed0950 100644
> --- a/block/qcow2.h
> +++ b/block/qcow2.h
> @@ -28,6 +28,7 @@
>   #include "crypto/block.h"
>   #include "qemu/coroutine.h"
>   #include "qemu/units.h"
> +#include "qemu/seqcache.h"
>   #include "block/block_int.h"
>   
>   //#define DEBUG_ALLOC
> @@ -422,6 +423,9 @@ typedef struct BDRVQcow2State {
>       Qcow2CompressionType compression_type;
>   
>       GHashTable *inflight_writes_counters;
> +
> +    SeqCache *compressed_cache;
> +    int compressed_flush_ret;
>   } BDRVQcow2State;
>   
>   typedef struct Qcow2COWRegion {
> diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
> index 464d133368..218917308e 100644
> --- a/block/qcow2-refcount.c
> +++ b/block/qcow2-refcount.c
> @@ -29,6 +29,7 @@
>   #include "qemu/bswap.h"
>   #include "qemu/cutils.h"
>   #include "trace.h"
> +#include "block/coroutines.h"
>   
>   static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size,
>                                       uint64_t max);
> @@ -1040,6 +1041,10 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
>                   qcow2_cache_discard(s->l2_table_cache, table);
>               }
>   
> +            if (s->compressed_cache) {
> +                seqcache_discard_cluster(s->compressed_cache, cluster_offset);
> +            }
> +
>               if (s->discard_passthrough[type]) {
>                   update_refcount_discard(bs, cluster_offset, s->cluster_size);
>               }
> @@ -1349,6 +1354,11 @@ int coroutine_fn qcow2_write_caches(BlockDriverState *bs)
>       BDRVQcow2State *s = bs->opaque;
>       int ret;
>   
> +    ret = qcow2_flush_compressed_cache(bs);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +

I wonder a bit why this function doesn’t work on a best-effort basis, 
but that isn’t your problem.
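
(For illustration, a best-effort variant would look something like the
following sketch: collect the first error, but still give every cache a
chance. Just the idea, not a proposal.)

int coroutine_fn qcow2_write_caches(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    int result = 0;
    int ret;

    /* Remember the first error, but keep writing the remaining caches. */
    ret = qcow2_flush_compressed_cache(bs);
    if (ret < 0 && result == 0) {
        result = ret;
    }

    ret = qcow2_cache_write(bs, s->l2_table_cache);
    if (ret < 0 && result == 0) {
        result = ret;
    }

    ret = qcow2_cache_write(bs, s->refcount_block_cache);
    if (ret < 0 && result == 0) {
        result = ret;
    }

    return result;
}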

>       ret = qcow2_cache_write(bs, s->l2_table_cache);
>       if (ret < 0) {
>           return ret;
> diff --git a/block/qcow2.c b/block/qcow2.c
> index 6ee94421e0..5f3713cd6f 100644
> --- a/block/qcow2.c
> +++ b/block/qcow2.c
> @@ -42,6 +42,7 @@
>   #include "qapi/qapi-visit-block-core.h"
>   #include "crypto.h"
>   #include "block/aio_task.h"
> +#include "block/coroutines.h"
>   
>   /*
>     Differences with QCOW:
> @@ -1834,6 +1835,10 @@ static int coroutine_fn qcow2_do_open(BlockDriverState *bs, QDict *options,
>       s->inflight_writes_counters =
>           g_hash_table_new_full(g_int64_hash, g_int64_equal, g_free, g_free);
>   
> +    if (!has_data_file(bs) && (bs->file->bs->open_flags & BDRV_O_NOCACHE)) {

Looks a bit like a layering violation, but I have no better proposal and 
you gave your reasoning, so, OK.

I wonder whether this should be re-checked during reopen, though.

> +        s->compressed_cache = seqcache_new(s->cluster_size);
> +    }
> +
>       return ret;
>   
>    fail:
> @@ -2653,6 +2658,91 @@ fail_nometa:
>       return ret;
>   }
>   
> +/*
> + * Flush one cluster of compressed cache if exists. If @unfinished is true and
> + * there is no finished cluster to flush, flush the unfinished one. Otherwise
> + * flush only finished clusters.
> + *
> + * Return 0 if nothing to flush, 1 if one cluster successfully flushed and <0 on
> + * error.
> + */
> +static int coroutine_fn
> +qcow2_co_compressed_flush_one(BlockDriverState *bs, bool unfinished)
> +{
> +    BDRVQcow2State *s = bs->opaque;
> +    int ret;
> +    int64_t align = 1;
> +    int64_t offset, bytes;
> +    uint8_t *buf;
> +
> +    if (!s->compressed_cache) {
> +        return 0;
> +    }
> +
> +    if (!seqcache_get_next_flush(s->compressed_cache, &offset, &bytes, &buf,
> +                                 &unfinished))
> +    {
> +        return 0;
> +    }
> +
> +    qcow2_inflight_writes_inc(bs, offset, bytes);
> +
> +    /*
> +     * Don't try to align-up unfinished cached cluster, parallel write to it is
> +     * possible! For finished host clusters data in the tail of the cluster will
> +     * be never used. So, take some good alignment for speed.
> +     *
> +     * Note also, that seqcache guarantees that allocated size of @buf is enough
> +     * to fill the cluster up to the end, so we are safe to align up with
> +     * align <= cluster_size.
> +     */
> +    if (!unfinished) {
> +        align = MIN(s->cluster_size,
> +                    MAX(s->data_file->bs->bl.request_alignment,
> +                        bdrv_opt_mem_align(bs)));
> +    }
> +

I’d move the pre-write overlap check here, because its purpose is to 
prevent writes to metadata structures as they are happening, not as they 
are queued into a cache.  (I’d say.)
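
Concretely, something like the following sketch in
qcow2_co_compressed_flush_one(), right before the write (assuming the usual
qcow2_pre_write_overlap_check() signature):

    /*
     * Sketch: check for metadata overlap when the data actually hits the
     * image; on a hit, fall through to the existing error handling below
     * (s->compressed_flush_ret, discard, inflight counter).
     */
    ret = qcow2_pre_write_overlap_check(bs, 0, offset,
            QEMU_ALIGN_UP(offset + bytes, align) - offset, true);
    if (ret >= 0) {
        BLKDBG_EVENT(s->data_file, BLKDBG_WRITE_COMPRESSED);
        ret = bdrv_co_pwrite(s->data_file, offset,
                             QEMU_ALIGN_UP(offset + bytes, align) - offset,
                             buf, 0);
    }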

> +    BLKDBG_EVENT(s->data_file, BLKDBG_WRITE_COMPRESSED);
> +    ret = bdrv_co_pwrite(s->data_file, offset,
> +                         QEMU_ALIGN_UP(offset + bytes, align) - offset,

I remember you said you wanted to describe more of the properties of the 
buffer returned by seqcache_get_next_flush().  That would be nice here, 
because intuitively one would assume that the buffer is @bytes bytes 
long, which aligning here will exceed.  (It’s fine, but it would be 
nicer if there was a comment that assured that it’s fine.)
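
(A worked example of the arithmetic, with invented numbers, may help:)

/*
 * Illustration only, assuming cluster_size = 64 KiB:
 *
 *   offset = 0x50000 = 327680   (start of host cluster 5)
 *   bytes  = 23000              (compressed data in the cache)
 *   align  = 4096               (request/memory alignment)
 *
 *   QEMU_ALIGN_UP(offset + bytes, align) - offset
 *     = QEMU_ALIGN_UP(350680, 4096) - 327680
 *     = 352256 - 327680
 *     = 24576
 *
 * So the write is 24576 bytes long although only 23000 bytes are valid
 * cached data: up to align - 1 bytes past @bytes are read from @buf.
 * That is only safe because seqcache allocates @buf out to the end of
 * the host cluster.
 */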

> +                         buf, 0);
> +    if (ret < 0 && s->compressed_flush_ret == 0) {
> +        /*
> +         * The data that was "written" earlier is lost. We don't want to
> +         * care with storing it somehow to retry flushing later.

Yeah, there is little point in trying something like writing it again 
and again in the hope that maybe at some point it’ll work.

It is a shame that the error isn’t returned by the original compressed 
write, but what can you do.

>                                                                   Also, how much
> +         * data will we have to store, if not drop it after failed flush?
> +         * After this point qcow2_co_flush_compressed_cache() (and
> +         * qcow2_flush_compressed_cache()) never return success.
> +         */
> +        s->compressed_flush_ret = ret;
> +    }
> +
> +    seqcache_discard_cluster(s->compressed_cache, offset);
> +
> +    qcow2_inflight_writes_dec(bs, offset, bytes);
> +
> +    return ret < 0 ? ret : 1;
> +}
> +
> +int coroutine_fn qcow2_co_flush_compressed_cache(BlockDriverState *bs)
> +{
> +    BDRVQcow2State *s = bs->opaque;
> +
> +    if (s->compressed_cache) {
> +        uint64_t nb_clusters = seqcache_nb_clusters(s->compressed_cache);
> +
> +        /*
> +         * If parallel writes are possible we don't want to loop forever. So
> +         * flush only clusters available at start of flush operation.
> +         */
> +        while (nb_clusters--) {
> +            qcow2_co_compressed_flush_one(bs, true);
> +        }
> +    }
> +
> +    return s->compressed_flush_ret;
> +}
> +
>   static int qcow2_inactivate(BlockDriverState *bs)
>   {
>       BDRVQcow2State *s = bs->opaque;
> @@ -2667,6 +2757,13 @@ static int qcow2_inactivate(BlockDriverState *bs)
>                             bdrv_get_device_or_node_name(bs));
>       }
>   
> +    ret = qcow2_flush_compressed_cache(bs);
> +    if (ret) {
> +        result = ret;
> +        error_report("Failed to flush the compressed write cache: %s",
> +                     strerror(-ret));
> +    }
> +
>       ret = qcow2_cache_flush(bs, s->l2_table_cache);
>       if (ret) {
>           result = ret;
> @@ -2699,6 +2796,12 @@ static void qcow2_close(BlockDriverState *bs)
>           qcow2_inactivate(bs);
>       }
>   
> +    /*
> +     * Cache should be flushed in qcow2_inactivate() and should be empty in
> +     * inactive mode. So we are safe to free it.
> +     */
> +    seqcache_free(s->compressed_cache);
> +
>       cache_clean_timer_del(bs);
>       qcow2_cache_destroy(s->l2_table_cache);
>       qcow2_cache_destroy(s->refcount_block_cache);
> @@ -4558,18 +4661,42 @@ qcow2_co_pwritev_compressed_task(BlockDriverState *bs,
>           goto fail;
>       }
>   
> -    qcow2_inflight_writes_inc(bs, cluster_offset, out_len);
> +    if (s->compressed_cache) {

Why is this conditional?

> +        /*
> +         * It's important to do seqcache_write() in the same critical section
> +         * (by s->lock) as qcow2_alloc_compressed_cluster_offset(), so that the
> +         * cache is filled sequentially.
> +         */

Yes.

> +        seqcache_write(s->compressed_cache, cluster_offset, out_len, out_buf);
>   
> -    qemu_co_mutex_unlock(&s->lock);
> +        qemu_co_mutex_unlock(&s->lock);
>   
> -    BLKDBG_EVENT(s->data_file, BLKDBG_WRITE_COMPRESSED);
> -    ret = bdrv_co_pwrite(s->data_file, cluster_offset, out_len, out_buf, 0);
> +        ret = qcow2_co_compressed_flush_one(bs, false);

The qcow2 doc says a compressed cluster can span multiple host clusters. 
  I don’t know whether that can happen with this driver, but if it does, 
wouldn’t that mean we’d need to flush two clusters here?  Oh, no, never 
mind.  Only the first one would be finished and thus flushed, not the 
second one.

I could have now removed the above paragraph, but it made me think, so I 
kept it:

Hm.  Actually, if we unconditionally flush here, doesn’t that mean that 
we’ll never have a finished cluster in the cache for longer than the 
span between the seqcache_write() and this 
qcow2_co_compressed_flush_one()?  I.e., the 
qcow2_co_flush_compressed_cache() is supposed to never flush any 
finished cluster, but only the currently active unfinished cluster (if 
there is one), right?

> +        if (ret < 0) {
> +            /*
> +             * if ret < 0 it probably not this request which failed, but some
> +             * previous one that was cached. Moreover, when we write to the
> +             * cache we probably may always report success, because
> +             * seqcache_write() never fails. Still, it's better to inform
> +             * compressed backup now then wait until final flush.
> +             */

Yup.

> +            goto fail;
> +        }
> +    } else {
> +        qcow2_inflight_writes_inc(bs, cluster_offset, out_len);
>   
> -    qcow2_inflight_writes_dec(bs, cluster_offset, out_len);
> +        qemu_co_mutex_unlock(&s->lock);
>   
> -    if (ret < 0) {
> -        goto fail;
> +        BLKDBG_EVENT(s->data_file, BLKDBG_WRITE_COMPRESSED);
> +        ret = bdrv_co_pwrite(s->data_file, cluster_offset, out_len, out_buf, 0);
> +
> +        qcow2_inflight_writes_dec(bs, cluster_offset, out_len);
> +
> +        if (ret < 0) {
> +            goto fail;
> +        }
>       }
> +
>   success:
>       ret = 0;
>   fail:
> @@ -4681,10 +4808,19 @@ qcow2_co_preadv_compressed(BlockDriverState *bs,
>   
>       out_buf = qemu_blockalign(bs, s->cluster_size);
>   
> -    BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
> -    ret = bdrv_co_pread(bs->file, coffset, csize, buf, 0);
> -    if (ret < 0) {
> -        goto fail;
> +    /*
> +     * seqcache_read may return less bytes than csize, as csize may exceed
> +     * actual compressed data size. So we are OK if seqcache_read returns
> +     * something > 0.

I was about to ask what happens when a compressed cluster spans two host 
clusters (I could have imagined that in theory the second one could have 
been discarded, but not the first one, so reading from the cache would 
really be short -- we would have needed to check that we only fell short 
in the range of 512 bytes, not more).

But then I realized that in this version of the series, all finished 
clusters are immediately discarded and only the current unfinished one 
is kept.  Does it even make sense to try seqcache_read() here, then?

> +     */
> +    if (!s->compressed_cache ||

(As for writing, I don’t think this can ever occur.)

Max

> +        seqcache_read(s->compressed_cache, coffset, csize, buf) <= 0)
> +    {
> +        BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
> +        ret = bdrv_co_pread(bs->file, coffset, csize, buf, 0);
> +        if (ret < 0) {
> +            goto fail;
> +        }
>       }
>   
>       if (qcow2_co_decompress(bs, out_buf, s->cluster_size, buf, csize) < 0) {
>
Vladimir Sementsov-Ogievskiy March 12, 2021, 6:43 p.m. UTC | #2
12.03.2021 21:15, Max Reitz wrote:
> On 05.03.21 18:35, Vladimir Sementsov-Ogievskiy wrote:
>> Compressed writes are unaligned to 512, which is very slow in
>> O_DIRECT mode. Let's use the cache.
>>
>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>> ---
>>   block/coroutines.h     |   3 +
>>   block/qcow2.h          |   4 ++
>>   block/qcow2-refcount.c |  10 +++
>>   block/qcow2.c          | 158 ++++++++++++++++++++++++++++++++++++++---
>>   4 files changed, 164 insertions(+), 11 deletions(-)
>>
>> diff --git a/block/coroutines.h b/block/coroutines.h
>> index 4cfb4946e6..cb8e05830e 100644
>> --- a/block/coroutines.h
>> +++ b/block/coroutines.h
>> @@ -66,4 +66,7 @@ int coroutine_fn bdrv_co_readv_vmstate(BlockDriverState *bs,
>>   int coroutine_fn bdrv_co_writev_vmstate(BlockDriverState *bs,
>>                                           QEMUIOVector *qiov, int64_t pos);
>> +int coroutine_fn qcow2_co_flush_compressed_cache(BlockDriverState *bs);
>> +int generated_co_wrapper qcow2_flush_compressed_cache(BlockDriverState *bs);
>> +
>>   #endif /* BLOCK_COROUTINES_INT_H */
>> diff --git a/block/qcow2.h b/block/qcow2.h
>> index e18d8dfbae..8b8fed0950 100644
>> --- a/block/qcow2.h
>> +++ b/block/qcow2.h
>> @@ -28,6 +28,7 @@
>>   #include "crypto/block.h"
>>   #include "qemu/coroutine.h"
>>   #include "qemu/units.h"
>> +#include "qemu/seqcache.h"
>>   #include "block/block_int.h"
>>   //#define DEBUG_ALLOC
>> @@ -422,6 +423,9 @@ typedef struct BDRVQcow2State {
>>       Qcow2CompressionType compression_type;
>>       GHashTable *inflight_writes_counters;
>> +
>> +    SeqCache *compressed_cache;
>> +    int compressed_flush_ret;
>>   } BDRVQcow2State;
>>   typedef struct Qcow2COWRegion {
>> diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
>> index 464d133368..218917308e 100644
>> --- a/block/qcow2-refcount.c
>> +++ b/block/qcow2-refcount.c
>> @@ -29,6 +29,7 @@
>>   #include "qemu/bswap.h"
>>   #include "qemu/cutils.h"
>>   #include "trace.h"
>> +#include "block/coroutines.h"
>>   static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size,
>>                                       uint64_t max);
>> @@ -1040,6 +1041,10 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
>>                   qcow2_cache_discard(s->l2_table_cache, table);
>>               }
>> +            if (s->compressed_cache) {
>> +                seqcache_discard_cluster(s->compressed_cache, cluster_offset);
>> +            }
>> +
>>               if (s->discard_passthrough[type]) {
>>                   update_refcount_discard(bs, cluster_offset, s->cluster_size);
>>               }
>> @@ -1349,6 +1354,11 @@ int coroutine_fn qcow2_write_caches(BlockDriverState *bs)
>>       BDRVQcow2State *s = bs->opaque;
>>       int ret;
>> +    ret = qcow2_flush_compressed_cache(bs);
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +
> 
> I wonder a bit why this function doesn’t work on a best-effort basis, but that isn’t your problem.
> 
>>       ret = qcow2_cache_write(bs, s->l2_table_cache);
>>       if (ret < 0) {
>>           return ret;
>> diff --git a/block/qcow2.c b/block/qcow2.c
>> index 6ee94421e0..5f3713cd6f 100644
>> --- a/block/qcow2.c
>> +++ b/block/qcow2.c
>> @@ -42,6 +42,7 @@
>>   #include "qapi/qapi-visit-block-core.h"
>>   #include "crypto.h"
>>   #include "block/aio_task.h"
>> +#include "block/coroutines.h"
>>   /*
>>     Differences with QCOW:
>> @@ -1834,6 +1835,10 @@ static int coroutine_fn qcow2_do_open(BlockDriverState *bs, QDict *options,
>>       s->inflight_writes_counters =
>>           g_hash_table_new_full(g_int64_hash, g_int64_equal, g_free, g_free);
>> +    if (!has_data_file(bs) && (bs->file->bs->open_flags & BDRV_O_NOCACHE)) {
> 
> Looks a bit like a layering violation, but I have no better proposal and you gave your reasoning, so, OK.
> 
> I wonder whether this should be re-checked during reopen, though.

Hmm, yes. Somehow I thought qcow2_do_open() would be called again, but that path is about inactivation/invalidation, not about reopen.

> 
>> +        s->compressed_cache = seqcache_new(s->cluster_size);
>> +    }
>> +
>>       return ret;
>>    fail:
>> @@ -2653,6 +2658,91 @@ fail_nometa:
>>       return ret;
>>   }
>> +/*
>> + * Flush one cluster of compressed cache if exists. If @unfinished is true and
>> + * there is no finished cluster to flush, flush the unfinished one. Otherwise
>> + * flush only finished clusters.
>> + *
>> + * Return 0 if nothing to flush, 1 if one cluster successfully flushed and <0 on
>> + * error.
>> + */
>> +static int coroutine_fn
>> +qcow2_co_compressed_flush_one(BlockDriverState *bs, bool unfinished)
>> +{
>> +    BDRVQcow2State *s = bs->opaque;
>> +    int ret;
>> +    int64_t align = 1;
>> +    int64_t offset, bytes;
>> +    uint8_t *buf;
>> +
>> +    if (!s->compressed_cache) {
>> +        return 0;
>> +    }
>> +
>> +    if (!seqcache_get_next_flush(s->compressed_cache, &offset, &bytes, &buf,
>> +                                 &unfinished))
>> +    {
>> +        return 0;
>> +    }
>> +
>> +    qcow2_inflight_writes_inc(bs, offset, bytes);
>> +
>> +    /*
>> +     * Don't try to align-up unfinished cached cluster, parallel write to it is
>> +     * possible! For finished host clusters data in the tail of the cluster will
>> +     * be never used. So, take some good alignment for speed.
>> +     *
>> +     * Note also, that seqcache guarantees that allocated size of @buf is enough
>> +     * to fill the cluster up to the end, so we are safe to align up with
>> +     * align <= cluster_size.
>> +     */
>> +    if (!unfinished) {
>> +        align = MIN(s->cluster_size,
>> +                    MAX(s->data_file->bs->bl.request_alignment,
>> +                        bdrv_opt_mem_align(bs)));
>> +    }
>> +
> 
> I’d move the pre-write overlap check here, because its purpose is to prevent writes to metadata structures as they are happening, not as they are queued into a cache.  (I’d say.)

Agree

> 
>> +    BLKDBG_EVENT(s->data_file, BLKDBG_WRITE_COMPRESSED);
>> +    ret = bdrv_co_pwrite(s->data_file, offset,
>> +                         QEMU_ALIGN_UP(offset + bytes, align) - offset,
> 
> I remember you said you wanted to describe more of the properties of the buffer returned by seqcache_get_next_flush().  That would be nice here, because intuitively one would assume that the buffer is @bytes bytes long, which aligning here will exceed.  (It’s fine, but it would be nicer if there was a comment that assured that it’s fine.)
> 
>> +                         buf, 0);
>> +    if (ret < 0 && s->compressed_flush_ret == 0) {
>> +        /*
>> +         * The data that was "written" earlier is lost. We don't want to
>> +         * care with storing it somehow to retry flushing later.
> 
> Yeah, there is little point in trying something like writing it again and again in the hope that maybe at some point it’ll work.
> 
> It is a shame that the error isn’t returned by the original compressed write, but what can you do.
> 
>>                                                                   Also, how much
>> +         * data will we have to store, if not drop it after failed flush?
>> +         * After this point qcow2_co_flush_compressed_cache() (and
>> +         * qcow2_flush_compressed_cache()) never return success.
>> +         */
>> +        s->compressed_flush_ret = ret;
>> +    }
>> +
>> +    seqcache_discard_cluster(s->compressed_cache, offset);
>> +
>> +    qcow2_inflight_writes_dec(bs, offset, bytes);
>> +
>> +    return ret < 0 ? ret : 1;
>> +}
>> +
>> +int coroutine_fn qcow2_co_flush_compressed_cache(BlockDriverState *bs)
>> +{
>> +    BDRVQcow2State *s = bs->opaque;
>> +
>> +    if (s->compressed_cache) {
>> +        uint64_t nb_clusters = seqcache_nb_clusters(s->compressed_cache);
>> +
>> +        /*
>> +         * If parallel writes are possible we don't want to loop forever. So
>> +         * flush only clusters available at start of flush operation.
>> +         */
>> +        while (nb_clusters--) {
>> +            qcow2_co_compressed_flush_one(bs, true);
>> +        }
>> +    }
>> +
>> +    return s->compressed_flush_ret;
>> +}
>> +
>>   static int qcow2_inactivate(BlockDriverState *bs)
>>   {
>>       BDRVQcow2State *s = bs->opaque;
>> @@ -2667,6 +2757,13 @@ static int qcow2_inactivate(BlockDriverState *bs)
>>                             bdrv_get_device_or_node_name(bs));
>>       }
>> +    ret = qcow2_flush_compressed_cache(bs);
>> +    if (ret) {
>> +        result = ret;
>> +        error_report("Failed to flush the compressed write cache: %s",
>> +                     strerror(-ret));
>> +    }
>> +
>>       ret = qcow2_cache_flush(bs, s->l2_table_cache);
>>       if (ret) {
>>           result = ret;
>> @@ -2699,6 +2796,12 @@ static void qcow2_close(BlockDriverState *bs)
>>           qcow2_inactivate(bs);
>>       }
>> +    /*
>> +     * Cache should be flushed in qcow2_inactivate() and should be empty in
>> +     * inactive mode. So we are safe to free it.
>> +     */
>> +    seqcache_free(s->compressed_cache);
>> +
>>       cache_clean_timer_del(bs);
>>       qcow2_cache_destroy(s->l2_table_cache);
>>       qcow2_cache_destroy(s->refcount_block_cache);
>> @@ -4558,18 +4661,42 @@ qcow2_co_pwritev_compressed_task(BlockDriverState *bs,
>>           goto fail;
>>       }
>> -    qcow2_inflight_writes_inc(bs, cluster_offset, out_len);
>> +    if (s->compressed_cache) {
> 
> Why is this conditional?

We don't have a compressed_cache for non-O_DIRECT.

> 
>> +        /*
>> +         * It's important to do seqcache_write() in the same critical section
>> +         * (by s->lock) as qcow2_alloc_compressed_cluster_offset(), so that the
>> +         * cache is filled sequentially.
>> +         */
> 
> Yes.
> 
>> +        seqcache_write(s->compressed_cache, cluster_offset, out_len, out_buf);
>> -    qemu_co_mutex_unlock(&s->lock);
>> +        qemu_co_mutex_unlock(&s->lock);
>> -    BLKDBG_EVENT(s->data_file, BLKDBG_WRITE_COMPRESSED);
>> -    ret = bdrv_co_pwrite(s->data_file, cluster_offset, out_len, out_buf, 0);
>> +        ret = qcow2_co_compressed_flush_one(bs, false);
> 
> The qcow2 doc says a compressed cluster can span multiple host clusters.  I don’t know whether that can happen with this driver, but if it does, wouldn’t that mean we’d need to flush two clusters here?  Oh, no, never mind.  Only the first one would be finished and thus flushed, not the second one.
> 
> I could have now removed the above paragraph, but it made me think, so I kept it:
> 
> Hm.  Actually, if we unconditionally flush here, doesn’t that mean that we’ll never have a finished cluster in the cache for longer than the span between the seqcache_write() and this qcow2_co_compressed_flush_one()?  I.e., the qcow2_co_flush_compressed_cache() is supposed to never flush any finished cluster, but only the currently active unfinished cluster (if there is one), right?

Hmm. Maybe if we have parallel write and flush requests, it's a kind of race condition: maybe the flush will flush both the finished and the unfinished cluster, maybe the write will flush the finished cluster and the flush will flush only the unfinished one.. Moreover, we may have several parallel requests, so they make several finished clusters, and a sudden flush will flush them all.

> 
>> +        if (ret < 0) {
>> +            /*
>> +             * if ret < 0 it probably not this request which failed, but some
>> +             * previous one that was cached. Moreover, when we write to the
>> +             * cache we probably may always report success, because
>> +             * seqcache_write() never fails. Still, it's better to inform
>> +             * compressed backup now then wait until final flush.
>> +             */
> 
> Yup.
> 
>> +            goto fail;
>> +        }
>> +    } else {
>> +        qcow2_inflight_writes_inc(bs, cluster_offset, out_len);
>> -    qcow2_inflight_writes_dec(bs, cluster_offset, out_len);
>> +        qemu_co_mutex_unlock(&s->lock);
>> -    if (ret < 0) {
>> -        goto fail;
>> +        BLKDBG_EVENT(s->data_file, BLKDBG_WRITE_COMPRESSED);
>> +        ret = bdrv_co_pwrite(s->data_file, cluster_offset, out_len, out_buf, 0);
>> +
>> +        qcow2_inflight_writes_dec(bs, cluster_offset, out_len);
>> +
>> +        if (ret < 0) {
>> +            goto fail;
>> +        }
>>       }
>> +
>>   success:
>>       ret = 0;
>>   fail:
>> @@ -4681,10 +4808,19 @@ qcow2_co_preadv_compressed(BlockDriverState *bs,
>>       out_buf = qemu_blockalign(bs, s->cluster_size);
>> -    BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
>> -    ret = bdrv_co_pread(bs->file, coffset, csize, buf, 0);
>> -    if (ret < 0) {
>> -        goto fail;
>> +    /*
>> +     * seqcache_read may return less bytes than csize, as csize may exceed
>> +     * actual compressed data size. So we are OK if seqcache_read returns
>> +     * something > 0.
> 
> I was about to ask what happens when a compressed cluster spans two host clusters (I could have imagined that in theory the second one could have been discarded, but not the first one, so reading from the cache would really be short -- we would have needed to check that we only fell short in the range of 512 bytes, not more).
> 
> But then I realized that in this version of the series, all finished clusters are immediately discarded and only the current unfinished one is kept.  Does it even make sense to try seqcache_read() here, then?

Hmm. Not immediately, but after a flush. And a flush is not under the mutex. So in theory, at some moment we may have several finished clusters "in-flight". And your question makes sense. The cache supports reading from consecutive clusters. But to be safe, we should also support reading one part of the data from disk and the other from the cache here.
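
Something like the following sketch, say (assuming seqcache_read() returns how many bytes it could serve from the start of the range):

    int64_t cached = 0;

    /*
     * Sketch: serve the head of the range from the cache, then read
     * whatever is missing from the file.  A short read from the cache is
     * harmless when csize merely overshoots the real compressed size.
     */
    if (s->compressed_cache) {
        cached = seqcache_read(s->compressed_cache, coffset, csize, buf);
        if (cached < 0) {
            cached = 0;
        }
    }
    if (cached < csize) {
        BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
        ret = bdrv_co_pread(bs->file, coffset + cached, csize - cached,
                            buf + cached, 0);
        if (ret < 0) {
            goto fail;
        }
    }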

> 
>> +     */
>> +    if (!s->compressed_cache ||
> 
> (As for writing, I don’t think this can ever occur.)
> 
> Max
> 
>> +        seqcache_read(s->compressed_cache, coffset, csize, buf) <= 0)
>> +    {
>> +        BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
>> +        ret = bdrv_co_pread(bs->file, coffset, csize, buf, 0);
>> +        if (ret < 0) {
>> +            goto fail;
>> +        }
>>       }
>>       if (qcow2_co_decompress(bs, out_buf, s->cluster_size, buf, csize) < 0) {
>>
>
Vladimir Sementsov-Ogievskiy March 12, 2021, 6:45 p.m. UTC | #3
12.03.2021 21:15, Max Reitz wrote:
>> @@ -1834,6 +1835,10 @@ static int coroutine_fn qcow2_do_open(BlockDriverState *bs, QDict *options,
>>       s->inflight_writes_counters =
>>           g_hash_table_new_full(g_int64_hash, g_int64_equal, g_free, g_free);
>> +    if (!has_data_file(bs) && (bs->file->bs->open_flags & BDRV_O_NOCACHE)) {
> 
> Looks a bit like a layering violation, but I have no better proposal and you gave your reasoning, so, OK.

Probably better is to check the request_alignment of bs->file->bs: if it is > 1, then enable the cache.
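
I.e. something like (sketch):

    /*
     * Sketch: let the file child's alignment requirement decide whether
     * the cache is worthwhile, instead of peeking at BDRV_O_NOCACHE.
     */
    if (!has_data_file(bs) && bs->file->bs->bl.request_alignment > 1) {
        s->compressed_cache = seqcache_new(s->cluster_size);
    }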
Max Reitz March 15, 2021, 9:58 a.m. UTC | #4
On 12.03.21 19:43, Vladimir Sementsov-Ogievskiy wrote:
> 12.03.2021 21:15, Max Reitz wrote:
>> On 05.03.21 18:35, Vladimir Sementsov-Ogievskiy wrote:
>>> Compressed writes are unaligned to 512, which is very slow in
>>> O_DIRECT mode. Let's use the cache.
>>>
>>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>>> ---
>>>   block/coroutines.h     |   3 +
>>>   block/qcow2.h          |   4 ++
>>>   block/qcow2-refcount.c |  10 +++
>>>   block/qcow2.c          | 158 ++++++++++++++++++++++++++++++++++++++---
>>>   4 files changed, 164 insertions(+), 11 deletions(-)

[...]

>>> @@ -2699,6 +2796,12 @@ static void qcow2_close(BlockDriverState *bs)
>>>           qcow2_inactivate(bs);
>>>       }
>>> +    /*
>>> +     * Cache should be flushed in qcow2_inactivate() and should be 
>>> empty in
>>> +     * inactive mode. So we are safe to free it.
>>> +     */
>>> +    seqcache_free(s->compressed_cache);
>>> +
>>>       cache_clean_timer_del(bs);
>>>       qcow2_cache_destroy(s->l2_table_cache);
>>>       qcow2_cache_destroy(s->refcount_block_cache);
>>> @@ -4558,18 +4661,42 @@ qcow2_co_pwritev_compressed_task(BlockDriverState *bs,
>>>           goto fail;
>>>       }
>>> -    qcow2_inflight_writes_inc(bs, cluster_offset, out_len);
>>> +    if (s->compressed_cache) {
>>
>> Why is this conditional?
> 
> We don't have a compressed_cache for non-O_DIRECT.

Oh right.

>>> +        /*
>>> +         * It's important to do seqcache_write() in the same critical section
>>> +         * (by s->lock) as qcow2_alloc_compressed_cluster_offset(), so that the
>>> +         * cache is filled sequentially.
>>> +         */
>>
>> Yes.
>>
>>> +        seqcache_write(s->compressed_cache, cluster_offset, out_len, out_buf);
>>> -    qemu_co_mutex_unlock(&s->lock);
>>> +        qemu_co_mutex_unlock(&s->lock);
>>> -    BLKDBG_EVENT(s->data_file, BLKDBG_WRITE_COMPRESSED);
>>> -    ret = bdrv_co_pwrite(s->data_file, cluster_offset, out_len, out_buf, 0);
>>> +        ret = qcow2_co_compressed_flush_one(bs, false);
>>
>> The qcow2 doc says a compressed cluster can span multiple host 
>> clusters.  I don’t know whether that can happen with this driver, but 
>> if it does, wouldn’t that mean we’d need to flush two clusters here?  
>> Oh, no, never mind.  Only the first one would be finished and thus 
>> flushed, not the second one.
>>
>> I could have now removed the above paragraph, but it made me think, so 
>> I kept it:
>>
>> Hm.  Actually, if we unconditionally flush here, doesn’t that mean 
>> that we’ll never have a finished cluster in the cache for longer than 
>> the span between the seqcache_write() and this 
>> qcow2_co_compressed_flush_one()?  I.e., the 
>> qcow2_co_flush_compressed_cache() is supposed to never flush any 
>> finished cluster, but only the currently active unfinished cluster (if 
>> there is one), right?
> 
> Hmm. Maybe if we have parallel write and flush requests, it's a kind of
> race condition: maybe the flush will flush both the finished and the
> unfinished cluster, maybe the write will flush the finished cluster and
> the flush will flush only the unfinished one.. Moreover, we may have
> several parallel requests, so they make several finished clusters, and a
> sudden flush will flush them all.

OK.  I was mostly asking because I was wondering how much you expect the 
cache to be filled, i.e., how much you expect the read cache to help.

[...]

>>> @@ -4681,10 +4808,19 @@ qcow2_co_preadv_compressed(BlockDriverState *bs,
>>>       out_buf = qemu_blockalign(bs, s->cluster_size);
>>> -    BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
>>> -    ret = bdrv_co_pread(bs->file, coffset, csize, buf, 0);
>>> -    if (ret < 0) {
>>> -        goto fail;
>>> +    /*
>>> +     * seqcache_read may return less bytes than csize, as csize may exceed
>>> +     * actual compressed data size. So we are OK if seqcache_read returns
>>> +     * something > 0.
>>
>> I was about to ask what happens when a compressed cluster spans two 
>> host clusters (I could have imagined that in theory the second one 
>> could have been discarded, but not the first one, so reading from the 
>> cache would really be short -- we would have needed to check that we 
>> only fell short in the range of 512 bytes, not more).
>>
>> But then I realized that in this version of the series, all finished 
>> clusters are immediately discarded and only the current unfinished one 
>> is kept.  Does it even make sense to try seqcache_read() here, then?
> 
> Hmm. Not immediately, but after a flush. And a flush is not under the
> mutex. So in theory, at some moment we may have several finished clusters
> "in-flight". And your question makes sense. The cache supports reading
> from consecutive clusters. But to be safe, we should also support reading
> one part of the data from disk and the other from the cache here.

The question is whether it really makes sense to even have a 
seqcache_read() path when in reality it’s probably never accessed.  I 
mean, besides the fact that it seems based purely on chance whether a 
read might fetch something from the cache even while we’re writing, in 
practice I don’t know any case where we’d write to and read from a 
compressed qcow2 image at the same time.  (I don’t know what you’re 
doing with the 'compress' filter, though.)

Max
Vladimir Sementsov-Ogievskiy March 15, 2021, 2:40 p.m. UTC | #5
15.03.2021 12:58, Max Reitz wrote:
> On 12.03.21 19:43, Vladimir Sementsov-Ogievskiy wrote:
>> 12.03.2021 21:15, Max Reitz wrote:
>>> On 05.03.21 18:35, Vladimir Sementsov-Ogievskiy wrote:
>>>> Compressed writes are unaligned to 512, which is very slow in
>>>> O_DIRECT mode. Let's use the cache.
>>>>
>>>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>>>> ---
>>>>   block/coroutines.h     |   3 +
>>>>   block/qcow2.h          |   4 ++
>>>>   block/qcow2-refcount.c |  10 +++
>>>>   block/qcow2.c          | 158 ++++++++++++++++++++++++++++++++++++++---
>>>>   4 files changed, 164 insertions(+), 11 deletions(-)
> 
> [...]
> 
>>>> @@ -2699,6 +2796,12 @@ static void qcow2_close(BlockDriverState *bs)
>>>>           qcow2_inactivate(bs);
>>>>       }
>>>> +    /*
>>>> +     * Cache should be flushed in qcow2_inactivate() and should be empty in
>>>> +     * inactive mode. So we are safe to free it.
>>>> +     */
>>>> +    seqcache_free(s->compressed_cache);
>>>> +
>>>>       cache_clean_timer_del(bs);
>>>>       qcow2_cache_destroy(s->l2_table_cache);
>>>>       qcow2_cache_destroy(s->refcount_block_cache);
>>>> @@ -4558,18 +4661,42 @@ qcow2_co_pwritev_compressed_task(BlockDriverState *bs,
>>>>           goto fail;
>>>>       }
>>>> -    qcow2_inflight_writes_inc(bs, cluster_offset, out_len);
>>>> +    if (s->compressed_cache) {
>>>
>>> Why is this conditional?
>>
>> We don't have a compressed_cache for non-O_DIRECT.
> 
> Oh right.
> 
>>>> +        /*
>>>> +         * It's important to do seqcache_write() in the same critical section
>>>> +         * (by s->lock) as qcow2_alloc_compressed_cluster_offset(), so that the
>>>> +         * cache is filled sequentially.
>>>> +         */
>>>
>>> Yes.
>>>
>>>> +        seqcache_write(s->compressed_cache, cluster_offset, out_len, out_buf);
>>>> -    qemu_co_mutex_unlock(&s->lock);
>>>> +        qemu_co_mutex_unlock(&s->lock);
>>>> -    BLKDBG_EVENT(s->data_file, BLKDBG_WRITE_COMPRESSED);
>>>> -    ret = bdrv_co_pwrite(s->data_file, cluster_offset, out_len, out_buf, 0);
>>>> +        ret = qcow2_co_compressed_flush_one(bs, false);
>>>
>>> The qcow2 doc says a compressed cluster can span multiple host clusters.  I don’t know whether that can happen with this driver, but if it does, wouldn’t that mean we’d need to flush two clusters here? Oh, no, never mind.  Only the first one would be finished and thus flushed, not the second one.
>>>
>>> I could have now removed the above paragraph, but it made me think, so I kept it:
>>>
>>> Hm.  Actually, if we unconditionally flush here, doesn’t that mean that we’ll never have a finished cluster in the cache for longer than the span between the seqcache_write() and this qcow2_co_compressed_flush_one()?  I.e., the qcow2_co_flush_compressed_cache() is supposed to never flush any finished cluster, but only the currently active unfinished cluster (if there is one), right?
>>
>> Hmm. Maybe if we have parallel write and flush requests, it's a kind of race condition: maybe the flush will flush both the finished and the unfinished cluster, maybe the write will flush the finished cluster and the flush will flush only the unfinished one.. Moreover, we may have several parallel requests, so they make several finished clusters, and a sudden flush will flush them all.
> 
> OK.  I was mostly asking because I was wondering how much you expect the cache to be filled, i.e., how much you expect the read cache to help.
> 

It should depend on how many parallel write requests are allowed.. (and that's the actual limit on the cache size). We can add a "bottom limit" for the cache, i.e. don't flush if there are fewer than X clusters in the cache. Still, I don't think there are actual scenarios where both compressed reads and writes are done. And anyway, to accelerate compressed reads we'd better do some simple read-ahead.
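
A bottom limit could look like this sketch in the compressed write path (COMPRESSED_CACHE_MIN_CLUSTERS is an invented tunable):

/* Invented tunable: keep a few finished clusters around for reads. */
#define COMPRESSED_CACHE_MIN_CLUSTERS 4

    /*
     * Sketch: flush finished clusters only while more than the bottom
     * limit of them is cached, so that a few stay around for reads.
     */
    while (seqcache_nb_clusters(s->compressed_cache) >
           COMPRESSED_CACHE_MIN_CLUSTERS)
    {
        ret = qcow2_co_compressed_flush_one(bs, false);
        if (ret <= 0) {
            break; /* nothing left to flush, or error */
        }
    }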


> 
>>>> @@ -4681,10 +4808,19 @@ qcow2_co_preadv_compressed(BlockDriverState *bs,
>>>>       out_buf = qemu_blockalign(bs, s->cluster_size);
>>>> -    BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
>>>> -    ret = bdrv_co_pread(bs->file, coffset, csize, buf, 0);
>>>> -    if (ret < 0) {
>>>> -        goto fail;
>>>> +    /*
>>>> +     * seqcache_read may return less bytes than csize, as csize may exceed
>>>> +     * actual compressed data size. So we are OK if seqcache_read returns
>>>> +     * something > 0.
>>>
>>> I was about to ask what happens when a compressed cluster spans two host clusters (I could have imagined that in theory the second one could have been discarded, but not the first one, so reading from the cache would really be short -- we would have needed to check that we only fell short in the range of 512 bytes, not more).
>>>
>>> But then I realized that in this version of the series, all finished clusters are immediately discarded and only the current unfinished one is kept.  Does it even make sense to try seqcache_read() here, then?
>>
>> Hmm. Not immediately, but after a flush. And a flush is not under the mutex. So in theory, at some moment we may have several finished clusters "in-flight". And your question makes sense. The cache supports reading from consecutive clusters. But to be safe, we should also support reading one part of the data from disk and the other from the cache here.
> 
> The question is whether it really makes sense to even have a seqcache_read() path when in reality it’s probably never accessed.  I mean, besides the fact that it seems based purely on chance whether a read might fetch something from the cache even while we’re writing, in practice I don’t know any case where we’d write to and read from a compressed qcow2 image at the same time.  (I don’t know what you’re doing with the 'compress' filter, though.)
> 

Note that for the user this is not a parallel write and read to the same cluster:

1. user writes cluster A, request succeeded, data is in the cache

2. user writes some other clusters, cache filled, flush started

3. in parallel to [2], the user reads cluster A. From the user's POV, cluster A is already written and should be read successfully

And seqcache_read() gives a simple non-blocking way to support read operation.

I have two additional thoughts in mind:

- After introducing the compress filter, we actually support parallel compressed reads and writes.. Even if nobody uses it, it works, and it should continue working correctly

- The only thing that breaks using a compressed device (with the help of the compress filter) is that we can't rewrite a compressed cluster (with a compressed write, I mean). But it's not hard to implement.. So, at some moment the limitation will go away.. Hmm, still I can't imagine a use case :)

I can imagine that both reads and writes may be used on a temporary image in a backup scheme with a reverse delta, or for external backup.

But rewriting compressed clusters is sensible only when we run a real guest on a compressed image.. Can it be helpful? Maybe for scenarios with a low disk usage ratio..
Max Reitz March 16, 2021, 12:25 p.m. UTC | #6
On 15.03.21 15:40, Vladimir Sementsov-Ogievskiy wrote:
> 15.03.2021 12:58, Max Reitz wrote:

[...]

>> The question is whether it really makes sense to even have a 
>> seqcache_read() path when in reality it’s probably never accessed.  I 
>> mean, besides the fact that it seems based purely on chance whether a 
>> read might fetch something from the cache even while we’re writing, in 
>> practice I don’t know any case where we’d write to and read from a 
>> compressed qcow2 image at the same time.  (I don’t know what you’re 
>> doing with the 'compress' filter, though.)
>>
> 
> Note that for the user this is not a parallel write and read to the
> same cluster:
> 
> 1. user writes cluster A, request succeeded, data is in the cache
> 
> 2. user writes some other clusters, cache filled, flush started
> 
> 3. in parallel to [2], the user reads cluster A. From the user's POV,
> cluster A is already written and should be read successfully

Yes, but when would that happen?

> And seqcache_read() gives a simple non-blocking way to support read 
> operation.

OK, that makes sense.  We’d need to flush the cache before we can read 
anything from the disk, so we should have a read-from-cache branch here.

> But rewriting compressed clusters is sensible only when we run a real
> guest on a compressed image.. Can it be helpful? Maybe for scenarios
> with a low disk usage ratio..

I’m not sure, but the point is that rewrites are currently not 
supported.  The whole compression implementation is mainly tailored 
towards just writing a complete image (e.g. by qemu-img convert or the 
backup job), so that’s where my question is coming from: It’s difficult 
for me to see a currently working use case where you’d read from and 
write to a compressed image at the same time.

Max
Vladimir Sementsov-Ogievskiy March 16, 2021, 5:48 p.m. UTC | #7
16.03.2021 15:25, Max Reitz wrote:
> On 15.03.21 15:40, Vladimir Sementsov-Ogievskiy wrote:
>> 15.03.2021 12:58, Max Reitz wrote:
> 
> [...]
> 
>>> The question is whether it really makes sense to even have a seqcache_read() path when in reality it’s probably never accessed.  I mean, besides the fact that it seems based purely on chance whether a read might fetch something from the cache even while we’re writing, in practice I don’t know any case where we’d write to and read from a compressed qcow2 image at the same time.  (I don’t know what you’re doing with the 'compress' filter, though.)
>>>
>>
>> Note that for the user this is not a parallel write and read to the same cluster:
>>
>> 1. user writes cluster A, request succeeded, data is in the cache
>>
>> 2. user writes some other clusters, cache filled, flush started
>>
>> 3. in parallel to [2], the user reads cluster A. From the user's POV, cluster A is already written and should be read successfully
> 
> Yes, but when would that happen?
> 
>> And seqcache_read() gives a simple non-blocking way to support read operation.
> 
> OK, that makes sense.  We’d need to flush the cache before we can read anything from the disk, so we should have a read-from-cache branch here.
> 
>> But rewriting compressed clusters is sensible only when we run a real guest on a compressed image.. Can it be helpful? Maybe for scenarios with a low disk usage ratio..
> 
> I’m not sure, but the point is that rewrites are currently not supported.  The whole compression implementation is mainly tailored towards just writing a complete image (e.g. by qemu-img convert or the backup job), so that’s where my question is coming from: It’s difficult for me to see a currently working use case where you’d read from and write to a compressed image at the same time.
> 

External backup works like the following:

  - start backup sync=none from active disk to temporary disk
  - export temporary disk through nbd
  - external tool reads from nbd export

For this scheme it may make sense to use compression, and we get a use case where compressed reads and writes are used at the same time. Moreover, this is possible right now, and there is no reason not to support it.
Max Reitz March 17, 2021, 8:09 a.m. UTC | #8
On 16.03.21 18:48, Vladimir Sementsov-Ogievskiy wrote:
> 16.03.2021 15:25, Max Reitz wrote:
>> On 15.03.21 15:40, Vladimir Sementsov-Ogievskiy wrote:
>>> 15.03.2021 12:58, Max Reitz wrote:
>>
>> [...]
>>
>>>> The question is whether it really makes sense to even have a 
>>>> seqcache_read() path when in reality it’s probably never accessed.  
>>>> I mean, besides the fact that it seems based purely on chance 
>>>> whether a read might fetch something from the cache even while we’re 
>>>> writing, in practice I don’t know any case where we’d write to and 
>>>> read from a compressed qcow2 image at the same time.  (I don’t know 
>>>> what you’re doing with the 'compress' filter, though.)
>>>>
>>>
>>> Note that for the user this is not a parallel write and read to the
>>> same cluster:
>>>
>>> 1. user writes cluster A, request succeeded, data is in the cache
>>>
>>> 2. user writes some other clusters, cache filled, flush started
>>>
>>> 3. in parallel to [2], the user reads cluster A. From the user's
>>> POV, cluster A is already written and should be read successfully
>>
>> Yes, but when would that happen?
>>
>>> And seqcache_read() gives a simple non-blocking way to support read 
>>> operation.
>>
>> OK, that makes sense.  We’d need to flush the cache before we can read 
>> anything from the disk, so we should have a read-from-cache branch here.
>>
>>> But rewriting compressed clusters is sensible only when we run a real
>>> guest on a compressed image.. Can it be helpful? Maybe for scenarios
>>> with a low disk usage ratio..
>>
>> I’m not sure, but the point is that rewrites are currently not 
>> supported.  The whole compression implementation is mainly tailored 
>> towards just writing a complete image (e.g. by qemu-img convert or the 
>> backup job), so that’s where my question is coming from: It’s 
>> difficult for me to see a currently working use case where you’d read 
>> from and write to a compressed image at the same time.
>>
> 
> External backup works like the following:
> 
>   - start backup sync=none from active disk to temporary disk
>   - export temporary disk through nbd
>   - external tool reads from nbd export
> 
> For this scheme it may make sense to use compression,

Not sure whether it really does, because it’s my impression the 
temporary disk is deleted after the backup, but no matter: You’re right, 
that’s indeed a valid case.

Max
Vladimir Sementsov-Ogievskiy March 29, 2021, 8:18 p.m. UTC | #9
12.03.2021 21:15, Max Reitz wrote:
> On 05.03.21 18:35, Vladimir Sementsov-Ogievskiy wrote:
>> Compressed writes are unaligned to 512, which works very slow in
>> O_DIRECT mode. Let's use the cache.
>>
>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>> ---
>>   block/coroutines.h     |   3 +
>>   block/qcow2.h          |   4 ++
>>   block/qcow2-refcount.c |  10 +++
>>   block/qcow2.c          | 158 ++++++++++++++++++++++++++++++++++++++---
>>   4 files changed, 164 insertions(+), 11 deletions(-)
>>

[..]

>> +static int coroutine_fn
>> +qcow2_co_compressed_flush_one(BlockDriverState *bs, bool unfinished)
>> +{
>> +    BDRVQcow2State *s = bs->opaque;
>> +    int ret;
>> +    int64_t align = 1;
>> +    int64_t offset, bytes;
>> +    uint8_t *buf;
>> +
>> +    if (!s->compressed_cache) {
>> +        return 0;
>> +    }
>> +
>> +    if (!seqcache_get_next_flush(s->compressed_cache, &offset, &bytes, &buf,
>> +                                 &unfinished))
>> +    {
>> +        return 0;
>> +    }
>> +
>> +    qcow2_inflight_writes_inc(bs, offset, bytes);
>> +
>> +    /*
>> +     * Don't try to align-up unfinished cached cluster, parallel write to it is
>> +     * possible! For finished host clusters data in the tail of the cluster will
>> +     * be never used. So, take some good alignment for speed.
>> +     *
>> +     * Note also, that seqcache guarantees that allocated size of @buf is enough
>> +     * to fill the cluster up to the end, so we are safe to align up with
>> +     * align <= cluster_size.
>> +     */
>> +    if (!unfinished) {
>> +        align = MIN(s->cluster_size,
>> +                    MAX(s->data_file->bs->bl.request_alignment,
>> +                        bdrv_opt_mem_align(bs)));
>> +    }
>> +
> 
> I’d move the pre-write overlap check here, because its purpose is to prevent writes to metadata structures as they are happening, not as they are queued into a cache.  (I’d say.)

Hmm, the pre-write overlap check is usually done under the mutex.. Should I add an additional critical section just to do the overlap check? I'm not sure.

> 
>> +    BLKDBG_EVENT(s->data_file, BLKDBG_WRITE_COMPRESSED);
>> +    ret = bdrv_co_pwrite(s->data_file, offset,
>> +                         QEMU_ALIGN_UP(offset + bytes, align) - offset,
> 
> I remember you said you wanted to describe more of the properties of the buffer returned by seqcache_get_next_flush().  That would be nice here, because intuitively one would assume that the buffer is @bytes bytes long, which aligning here will exceed.  (It’s fine, but it would be nicer if there was a comment that assured that it’s fine.)
> 

It's here; see several lines above: "Note also, that..."

>> +                         buf, 0);
>> +    if (ret < 0 && s->compressed_flush_ret == 0) {
>> +        /*
>> +         * The data that was "written" earlier is lost. We don't want to
>> +         * care with storing it somehow to retry flushing later.

Patch

diff --git a/block/coroutines.h b/block/coroutines.h
index 4cfb4946e6..cb8e05830e 100644
--- a/block/coroutines.h
+++ b/block/coroutines.h
@@ -66,4 +66,7 @@  int coroutine_fn bdrv_co_readv_vmstate(BlockDriverState *bs,
 int coroutine_fn bdrv_co_writev_vmstate(BlockDriverState *bs,
                                         QEMUIOVector *qiov, int64_t pos);
 
+int coroutine_fn qcow2_co_flush_compressed_cache(BlockDriverState *bs);
+int generated_co_wrapper qcow2_flush_compressed_cache(BlockDriverState *bs);
+
 #endif /* BLOCK_COROUTINES_INT_H */
diff --git a/block/qcow2.h b/block/qcow2.h
index e18d8dfbae..8b8fed0950 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -28,6 +28,7 @@ 
 #include "crypto/block.h"
 #include "qemu/coroutine.h"
 #include "qemu/units.h"
+#include "qemu/seqcache.h"
 #include "block/block_int.h"
 
 //#define DEBUG_ALLOC
@@ -422,6 +423,9 @@  typedef struct BDRVQcow2State {
     Qcow2CompressionType compression_type;
 
     GHashTable *inflight_writes_counters;
+
+    SeqCache *compressed_cache;
+    int compressed_flush_ret;
 } BDRVQcow2State;
 
 typedef struct Qcow2COWRegion {
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 464d133368..218917308e 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -29,6 +29,7 @@ 
 #include "qemu/bswap.h"
 #include "qemu/cutils.h"
 #include "trace.h"
+#include "block/coroutines.h"
 
 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size,
                                     uint64_t max);
@@ -1040,6 +1041,10 @@  static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
                 qcow2_cache_discard(s->l2_table_cache, table);
             }
 
+            if (s->compressed_cache) {
+                seqcache_discard_cluster(s->compressed_cache, cluster_offset);
+            }
+
             if (s->discard_passthrough[type]) {
                 update_refcount_discard(bs, cluster_offset, s->cluster_size);
             }
@@ -1349,6 +1354,11 @@  int coroutine_fn qcow2_write_caches(BlockDriverState *bs)
     BDRVQcow2State *s = bs->opaque;
     int ret;
 
+    ret = qcow2_flush_compressed_cache(bs);
+    if (ret < 0) {
+        return ret;
+    }
+
     ret = qcow2_cache_write(bs, s->l2_table_cache);
     if (ret < 0) {
         return ret;
diff --git a/block/qcow2.c b/block/qcow2.c
index 6ee94421e0..5f3713cd6f 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -42,6 +42,7 @@ 
 #include "qapi/qapi-visit-block-core.h"
 #include "crypto.h"
 #include "block/aio_task.h"
+#include "block/coroutines.h"
 
 /*
   Differences with QCOW:
@@ -1834,6 +1835,10 @@  static int coroutine_fn qcow2_do_open(BlockDriverState *bs, QDict *options,
     s->inflight_writes_counters =
         g_hash_table_new_full(g_int64_hash, g_int64_equal, g_free, g_free);
 
+    if (!has_data_file(bs) && (bs->file->bs->open_flags & BDRV_O_NOCACHE)) {
+        s->compressed_cache = seqcache_new(s->cluster_size);
+    }
+
     return ret;
 
  fail:
@@ -2653,6 +2658,91 @@  fail_nometa:
     return ret;
 }
 
+/*
+ * Flush one cluster of compressed cache if exists. If @unfinished is true and
+ * there is no finished cluster to flush, flush the unfinished one. Otherwise
+ * flush only finished clusters.
+ *
+ * Return 0 if nothing to flush, 1 if one cluster successfully flushed and <0 on
+ * error.
+ */
+static int coroutine_fn
+qcow2_co_compressed_flush_one(BlockDriverState *bs, bool unfinished)
+{
+    BDRVQcow2State *s = bs->opaque;
+    int ret;
+    int64_t align = 1;
+    int64_t offset, bytes;
+    uint8_t *buf;
+
+    if (!s->compressed_cache) {
+        return 0;
+    }
+
+    if (!seqcache_get_next_flush(s->compressed_cache, &offset, &bytes, &buf,
+                                 &unfinished))
+    {
+        return 0;
+    }
+
+    qcow2_inflight_writes_inc(bs, offset, bytes);
+
+    /*
+     * Don't try to align-up unfinished cached cluster, parallel write to it is
+     * possible! For finished host clusters data in the tail of the cluster will
+     * be never used. So, take some good alignment for speed.
+     *
+     * Note also, that seqcache guarantees that allocated size of @buf is enough
+     * to fill the cluster up to the end, so we are safe to align up with
+     * align <= cluster_size.
+     */
+    if (!unfinished) {
+        align = MIN(s->cluster_size,
+                    MAX(s->data_file->bs->bl.request_alignment,
+                        bdrv_opt_mem_align(bs)));
+    }
+
+    BLKDBG_EVENT(s->data_file, BLKDBG_WRITE_COMPRESSED);
+    ret = bdrv_co_pwrite(s->data_file, offset,
+                         QEMU_ALIGN_UP(offset + bytes, align) - offset,
+                         buf, 0);
+    if (ret < 0 && s->compressed_flush_ret == 0) {
+        /*
+         * The data that was "written" earlier is lost. We don't want to
+         * care with storing it somehow to retry flushing later. Also, how much
+         * data will we have to store, if not drop it after failed flush?
+         * After this point qcow2_co_flush_compressed_cache() (and
+         * qcow2_flush_compressed_cache()) never return success.
+         */
+        s->compressed_flush_ret = ret;
+    }
+
+    seqcache_discard_cluster(s->compressed_cache, offset);
+
+    qcow2_inflight_writes_dec(bs, offset, bytes);
+
+    return ret < 0 ? ret : 1;
+}
+
+int coroutine_fn qcow2_co_flush_compressed_cache(BlockDriverState *bs)
+{
+    BDRVQcow2State *s = bs->opaque;
+
+    if (s->compressed_cache) {
+        uint64_t nb_clusters = seqcache_nb_clusters(s->compressed_cache);
+
+        /*
+         * If parallel writes are possible we don't want to loop forever. So
+         * flush only clusters available at start of flush operation.
+         */
+        while (nb_clusters--) {
+            qcow2_co_compressed_flush_one(bs, true);
+        }
+    }
+
+    return s->compressed_flush_ret;
+}
+
 static int qcow2_inactivate(BlockDriverState *bs)
 {
     BDRVQcow2State *s = bs->opaque;
@@ -2667,6 +2757,13 @@  static int qcow2_inactivate(BlockDriverState *bs)
                           bdrv_get_device_or_node_name(bs));
     }
 
+    ret = qcow2_flush_compressed_cache(bs);
+    if (ret) {
+        result = ret;
+        error_report("Failed to flush the compressed write cache: %s",
+                     strerror(-ret));
+    }
+
     ret = qcow2_cache_flush(bs, s->l2_table_cache);
     if (ret) {
         result = ret;
@@ -2699,6 +2796,12 @@  static void qcow2_close(BlockDriverState *bs)
         qcow2_inactivate(bs);
     }
 
+    /*
+     * Cache should be flushed in qcow2_inactivate() and should be empty in
+     * inactive mode. So we are safe to free it.
+     */
+    seqcache_free(s->compressed_cache);
+
     cache_clean_timer_del(bs);
     qcow2_cache_destroy(s->l2_table_cache);
     qcow2_cache_destroy(s->refcount_block_cache);
@@ -4558,18 +4661,42 @@  qcow2_co_pwritev_compressed_task(BlockDriverState *bs,
         goto fail;
     }
 
-    qcow2_inflight_writes_inc(bs, cluster_offset, out_len);
+    if (s->compressed_cache) {
+        /*
+         * It's important to do seqcache_write() in the same critical section
+         * (by s->lock) as qcow2_alloc_compressed_cluster_offset(), so that the
+         * cache is filled sequentially.
+         */
+        seqcache_write(s->compressed_cache, cluster_offset, out_len, out_buf);
 
-    qemu_co_mutex_unlock(&s->lock);
+        qemu_co_mutex_unlock(&s->lock);
 
-    BLKDBG_EVENT(s->data_file, BLKDBG_WRITE_COMPRESSED);
-    ret = bdrv_co_pwrite(s->data_file, cluster_offset, out_len, out_buf, 0);
+        ret = qcow2_co_compressed_flush_one(bs, false);
+        if (ret < 0) {
+            /*
+             * if ret < 0 it probably not this request which failed, but some
+             * previous one that was cached. Moreover, when we write to the
+             * cache we probably may always report success, because
+             * seqcache_write() never fails. Still, it's better to inform
+             * compressed backup now then wait until final flush.
+             */
+            goto fail;
+        }
+    } else {
+        qcow2_inflight_writes_inc(bs, cluster_offset, out_len);
 
-    qcow2_inflight_writes_dec(bs, cluster_offset, out_len);
+        qemu_co_mutex_unlock(&s->lock);
 
-    if (ret < 0) {
-        goto fail;
+        BLKDBG_EVENT(s->data_file, BLKDBG_WRITE_COMPRESSED);
+        ret = bdrv_co_pwrite(s->data_file, cluster_offset, out_len, out_buf, 0);
+
+        qcow2_inflight_writes_dec(bs, cluster_offset, out_len);
+
+        if (ret < 0) {
+            goto fail;
+        }
     }
+
 success:
     ret = 0;
 fail:
@@ -4681,10 +4808,19 @@  qcow2_co_preadv_compressed(BlockDriverState *bs,
 
     out_buf = qemu_blockalign(bs, s->cluster_size);
 
-    BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
-    ret = bdrv_co_pread(bs->file, coffset, csize, buf, 0);
-    if (ret < 0) {
-        goto fail;
+    /*
+     * seqcache_read may return less bytes than csize, as csize may exceed
+     * actual compressed data size. So we are OK if seqcache_read returns
+     * something > 0.
+     */
+    if (!s->compressed_cache ||
+        seqcache_read(s->compressed_cache, coffset, csize, buf) <= 0)
+    {
+        BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
+        ret = bdrv_co_pread(bs->file, coffset, csize, buf, 0);
+        if (ret < 0) {
+            goto fail;
+        }
     }
 
     if (qcow2_co_decompress(bs, out_buf, s->cluster_size, buf, csize) < 0) {