diff mbox

[v2,3/4] qcow2: add shrink image support

Message ID 20170613121639.17853-4-pbutsykin@virtuozzo.com (mailing list archive)
State New, archived
Headers show

Commit Message

Pavel Butsykin June 13, 2017, 12:16 p.m. UTC
This patch adds shrinking of the image file for qcow2. As a result, this allows
us to reduce the virtual image size and free up space on the disk without
copying the image. The image can be fragmented, and shrinking is done by
punching holes in the image file.

Signed-off-by: Pavel Butsykin <pbutsykin@virtuozzo.com>
---
 block/qcow2-cluster.c  | 42 ++++++++++++++++++++++++++++++++
 block/qcow2-refcount.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++
 block/qcow2.c          | 40 +++++++++++++++++++++++--------
 block/qcow2.h          |  2 ++
 qapi/block-core.json   |  3 ++-
 5 files changed, 141 insertions(+), 11 deletions(-)

Comments

Max Reitz June 21, 2017, 10:55 p.m. UTC | #1
On 2017-06-13 14:16, Pavel Butsykin wrote:
> This patch add shrinking of the image file for qcow2. As a result, this allows
> us to reduce the virtual image size and free up space on the disk without
> copying the image. Image can be fragmented and shrink is done by punching holes
> in the image file.
> 
> Signed-off-by: Pavel Butsykin <pbutsykin@virtuozzo.com>
> ---
>  block/qcow2-cluster.c  | 42 ++++++++++++++++++++++++++++++++
>  block/qcow2-refcount.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  block/qcow2.c          | 40 +++++++++++++++++++++++--------
>  block/qcow2.h          |  2 ++
>  qapi/block-core.json   |  3 ++-
>  5 files changed, 141 insertions(+), 11 deletions(-)
> 
> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
> index d779ea19cf..a84b7e607e 100644
> --- a/block/qcow2-cluster.c
> +++ b/block/qcow2-cluster.c
> @@ -32,6 +32,48 @@
>  #include "qemu/bswap.h"
>  #include "trace.h"
>  
> +int qcow2_shrink_l1_table(BlockDriverState *bs, uint64_t max_size)

It's not really a max_size but always an exact size. You don't want it
to be any smaller than this.

> +{
> +    BDRVQcow2State *s = bs->opaque;
> +    int new_l1_size, i, ret;
> +
> +    if (max_size >= s->l1_size) {
> +        return 0;
> +    }
> +
> +    new_l1_size = max_size;
> +
> +#ifdef DEBUG_ALLOC2
> +    fprintf(stderr, "shrink l1_table from %d to %" PRId64 "\n",
> +            s->l1_size, new_l1_size);

new_l1_size is of type int, not int64_t.

> +#endif
> +
> +    BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_WRITE_TABLE);
> +    ret = bdrv_pwrite_zeroes(bs->file, s->l1_table_offset +
> +                                       sizeof(uint64_t) * new_l1_size,
> +                             (s->l1_size - new_l1_size) * sizeof(uint64_t), 0);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    ret = bdrv_flush(bs->file->bs);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_FREE_L2_CLUSTERS);
> +    for (i = s->l1_size - 1; i > new_l1_size - 1; i--) {
> +        if ((s->l1_table[i] & L1E_OFFSET_MASK) == 0) {
> +            continue;
> +        }
> +        qcow2_free_clusters(bs, s->l1_table[i] & L1E_OFFSET_MASK,
> +                            s->l2_size * sizeof(uint64_t),

I'm more of a fan of s->cluster_size instead of s->l2_size *
sizeof(uint64_t) but it's not like it matters...

> +                            QCOW2_DISCARD_ALWAYS);
> +        s->l1_table[i] = 0;

I'd probably clear the overhanging s->l1_table entries before
bdrv_flush() (because you shouldn't really use them after
bdrv_pwrite_zeroes() has returned, even if bdrv_flush() has failed), but
it's not absolutely necessary. As long as they still have a refcount of
at least one, writing to them will just be useless but not destroy any data.

> +    }
> +    return 0;
> +}
> +
>  int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
>                          bool exact_size)
>  {
> diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
> index 576ab551d6..e98306acd8 100644
> --- a/block/qcow2-refcount.c
> +++ b/block/qcow2-refcount.c
> @@ -29,6 +29,7 @@
>  #include "block/qcow2.h"
>  #include "qemu/range.h"
>  #include "qemu/bswap.h"
> +#include "qemu/cutils.h"
>  
>  static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size);
>  static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
> @@ -2936,3 +2937,67 @@ done:
>      qemu_vfree(new_refblock);
>      return ret;
>  }
> +
> +int qcow2_shrink_reftable(BlockDriverState *bs)
> +{
> +    BDRVQcow2State *s = bs->opaque;
> +    uint64_t *reftable_tmp =
> +        g_try_malloc(sizeof(uint64_t) * s->refcount_table_size);
> +    int i, ret;
> +
> +    if (s->refcount_table_size && reftable_tmp == NULL) {
> +        return -ENOMEM;
> +    }
> +
> +    for (i = 0; i < s->refcount_table_size; i++) {
> +        int64_t refblock_offs = s->refcount_table[i] & REFT_OFFSET_MASK;
> +        void *refblock;
> +        bool unused_block;
> +
> +        if (refblock_offs == 0) {
> +            reftable_tmp[i] = 0;
> +            continue;
> +        }
> +        ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
> +                              &refblock);
> +        if (ret < 0) {
> +            goto out;
> +        }
> +
> +        /* the refblock has own reference */
> +        if (i == refblock_offs >> (s->refcount_block_bits + s->cluster_bits)) {
> +            uint64_t blk_index = (refblock_offs >> s->cluster_bits) &
> +                                 (s->refcount_block_size - 1);
> +            uint64_t refcount = s->get_refcount(refblock, blk_index);
> +
> +            s->set_refcount(refblock, blk_index, 0);
> +
> +            unused_block = buffer_is_zero(refblock, s->refcount_block_size);

s/refcount_block_size/cluster_size/

> +
> +            s->set_refcount(refblock, blk_index, refcount);
> +        } else {
> +            unused_block = buffer_is_zero(refblock, s->refcount_block_size);

Same here.

> +        }
> +        qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
> +
> +        reftable_tmp[i] = unused_block ? 0 : cpu_to_be64(s->refcount_table[i]);
> +    }
> +
> +    ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset, reftable_tmp,
> +                           sizeof(uint64_t) * s->refcount_table_size);
> +    if (ret < 0) {
> +        goto out;
> +    }
> +
> +    for (i = 0; i < s->refcount_table_size; i++) {
> +        if (s->refcount_table[i] && !reftable_tmp[i]) {
> +            qcow2_free_clusters(bs, s->refcount_table[i] & REFT_OFFSET_MASK,
> +                                s->cluster_size, QCOW2_DISCARD_ALWAYS);

This doesn't feel like a very good idea. The bdrv_pwrite_sync() before
has brought the on-disk refcount structures into a different state than
what we have cached.

OTOH, the bdrv_pwrite_sync() has accessed only the reftable and this
should only access refblocks. So I cannot think of any way this might
actually do something bad. But I guess it'll be better for me to revisit
this when it's not in the middle of the night (so on Friday).

> +            s->refcount_table[i] = 0;
> +        }
> +    }
> +
> +out:
> +    g_free(reftable_tmp);
> +    return ret;
> +}
> diff --git a/block/qcow2.c b/block/qcow2.c
> index b3ba5daa93..0ad46d2776 100644
> --- a/block/qcow2.c
> +++ b/block/qcow2.c
> @@ -2545,6 +2545,7 @@ static int qcow2_truncate(BlockDriverState *bs, int64_t offset, Error **errp)
>  {
>      BDRVQcow2State *s = bs->opaque;
>      int64_t new_l1_size;
> +    uint64_t total_size;
>      int ret;
>  
>      if (offset & 511) {
> @@ -2558,17 +2559,36 @@ static int qcow2_truncate(BlockDriverState *bs, int64_t offset, Error **errp)
>          return -ENOTSUP;
>      }
>  
> -    /* shrinking is currently not supported */
> -    if (offset < bs->total_sectors * 512) {
> -        error_setg(errp, "qcow2 doesn't support shrinking images yet");
> -        return -ENOTSUP;
> -    }
> -
>      new_l1_size = size_to_l1(s, offset);
> -    ret = qcow2_grow_l1_table(bs, new_l1_size, true);
> -    if (ret < 0) {
> -        error_setg_errno(errp, -ret, "Failed to grow the L1 table");
> -        return ret;
> +    total_size = bs->total_sectors << BDRV_SECTOR_BITS;
> +
> +    if (offset < total_size) {
> +        ret = qcow2_cluster_discard(bs, ROUND_UP(offset, s->cluster_size),
> +                                    total_size - ROUND_UP(offset,
> +                                                          s->cluster_size),
> +                                    QCOW2_DISCARD_ALWAYS, true);
> +        if (ret < 0) {
> +            error_setg_errno(errp, -ret, "Failed to discard reduced clasters");

s/clasters/clusters/

And maybe "truncated", "stripped", or "cropped" instead of "reduced"?

> +            return ret;
> +        }
> +
> +        ret = qcow2_shrink_l1_table(bs, new_l1_size);
> +        if (ret < 0) {
> +            error_setg_errno(errp, -ret, "Failed to reduce the L1 table");

s/reduce/shrink/ (or "truncate"; or "reduce the L1 table size")

Also, to be fair, you're actually reducing the number of L2 tables, not
the size of the L1 table. (But that's a nit pick)

> +            return ret;
> +        }
> +
> +        ret = qcow2_shrink_reftable(bs);
> +        if (ret < 0) {
> +            error_setg_errno(errp, -ret, "Failed to shrink the refcount table");

And this is not really shrinking the reftable but instead discarding
some refblocks (potentially). (This is a nit pick, too)

Max

> +            return ret;
> +        }
> +    } else {
> +        ret = qcow2_grow_l1_table(bs, new_l1_size, true);
> +        if (ret < 0) {
> +            error_setg_errno(errp, -ret, "Failed to grow the L1 table");
> +            return ret;
> +        }
>      }
>  
>      /* write updated header.size */
> diff --git a/block/qcow2.h b/block/qcow2.h
> index 07faa6dc78..600463bf8e 100644
> --- a/block/qcow2.h
> +++ b/block/qcow2.h
> @@ -531,10 +531,12 @@ int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
>  int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
>                                  BlockDriverAmendStatusCB *status_cb,
>                                  void *cb_opaque, Error **errp);
> +int qcow2_shrink_reftable(BlockDriverState *bs);
>  
>  /* qcow2-cluster.c functions */
>  int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
>                          bool exact_size);
> +int qcow2_shrink_l1_table(BlockDriverState *bs, uint64_t max_size);
>  int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index);
>  int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset);
>  int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num,
> diff --git a/qapi/block-core.json b/qapi/block-core.json
> index f85c2235c7..bcbffa3339 100644
> --- a/qapi/block-core.json
> +++ b/qapi/block-core.json
> @@ -2372,7 +2372,8 @@
>              'cluster_alloc_bytes', 'cluster_free', 'flush_to_os',
>              'flush_to_disk', 'pwritev_rmw_head', 'pwritev_rmw_after_head',
>              'pwritev_rmw_tail', 'pwritev_rmw_after_tail', 'pwritev',
> -            'pwritev_zero', 'pwritev_done', 'empty_image_prepare' ] }
> +            'pwritev_zero', 'pwritev_done', 'empty_image_prepare',
> +            'l1_shrink_write_table', 'l1_shrink_free_l2_clusters' ] }
>  
>  ##
>  # @BlkdebugInjectErrorOptions:
>
Pavel Butsykin June 22, 2017, 1:57 p.m. UTC | #2
On 22.06.2017 01:55, Max Reitz wrote:
> On 2017-06-13 14:16, Pavel Butsykin wrote:
>> This patch add shrinking of the image file for qcow2. As a result, this allows
>> us to reduce the virtual image size and free up space on the disk without
>> copying the image. Image can be fragmented and shrink is done by punching holes
>> in the image file.
>>
>> Signed-off-by: Pavel Butsykin <pbutsykin@virtuozzo.com>
>> ---
>>   block/qcow2-cluster.c  | 42 ++++++++++++++++++++++++++++++++
>>   block/qcow2-refcount.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>   block/qcow2.c          | 40 +++++++++++++++++++++++--------
>>   block/qcow2.h          |  2 ++
>>   qapi/block-core.json   |  3 ++-
>>   5 files changed, 141 insertions(+), 11 deletions(-)
>>
>> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
>> index d779ea19cf..a84b7e607e 100644
>> --- a/block/qcow2-cluster.c
>> +++ b/block/qcow2-cluster.c
>> @@ -32,6 +32,48 @@
>>   #include "qemu/bswap.h"
>>   #include "trace.h"
>>   
>> +int qcow2_shrink_l1_table(BlockDriverState *bs, uint64_t max_size)
> 
> It's not really a max_size but always an exact size. You don't want it
> to be any smaller than this.
> 
>> +{
>> +    BDRVQcow2State *s = bs->opaque;
>> +    int new_l1_size, i, ret;
>> +
>> +    if (max_size >= s->l1_size) {
>> +        return 0;
>> +    }
>> +
>> +    new_l1_size = max_size;
>> +
>> +#ifdef DEBUG_ALLOC2
>> +    fprintf(stderr, "shrink l1_table from %d to %" PRId64 "\n",
>> +            s->l1_size, new_l1_size);
> 
> new_l1_size is of type int, not int64_t.
> 
>> +#endif
>> +
>> +    BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_WRITE_TABLE);
>> +    ret = bdrv_pwrite_zeroes(bs->file, s->l1_table_offset +
>> +                                       sizeof(uint64_t) * new_l1_size,
>> +                             (s->l1_size - new_l1_size) * sizeof(uint64_t), 0);
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +
>> +    ret = bdrv_flush(bs->file->bs);
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +
>> +    BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_FREE_L2_CLUSTERS);
>> +    for (i = s->l1_size - 1; i > new_l1_size - 1; i--) {
>> +        if ((s->l1_table[i] & L1E_OFFSET_MASK) == 0) {
>> +            continue;
>> +        }
>> +        qcow2_free_clusters(bs, s->l1_table[i] & L1E_OFFSET_MASK,
>> +                            s->l2_size * sizeof(uint64_t),
> 
> I'm more of a fan of s->cluster_size instead of s->l2_size *
> sizeof(uint64_t) but it's not like it matters...
> 
>> +                            QCOW2_DISCARD_ALWAYS);
>> +        s->l1_table[i] = 0;
> 
> I'd probably clear the overhanging s->l1_table entries before
> bdrv_flush() (before you shouldn't really use them after
> bdrv_pwrite_zeroes() has returned, even if bdrv_flush() has failed), but
> it's not absolutely necessary. As long as they still have a refcount of
> at least one, writing to them will just be useless but not destroy any data.
>

You're right, but if it's not necessary, I would prefer to leave it as is,
simply because the overhanging s->l1_table entries are used to release the
clusters :)

>> +    }
>> +    return 0;
>> +}
>> +
>>   int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
>>                           bool exact_size)
>>   {
>> diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
>> index 576ab551d6..e98306acd8 100644
>> --- a/block/qcow2-refcount.c
>> +++ b/block/qcow2-refcount.c
>> @@ -29,6 +29,7 @@
>>   #include "block/qcow2.h"
>>   #include "qemu/range.h"
>>   #include "qemu/bswap.h"
>> +#include "qemu/cutils.h"
>>   
>>   static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size);
>>   static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
>> @@ -2936,3 +2937,67 @@ done:
>>       qemu_vfree(new_refblock);
>>       return ret;
>>   }
>> +
>> +int qcow2_shrink_reftable(BlockDriverState *bs)
>> +{
>> +    BDRVQcow2State *s = bs->opaque;
>> +    uint64_t *reftable_tmp =
>> +        g_try_malloc(sizeof(uint64_t) * s->refcount_table_size);
>> +    int i, ret;
>> +
>> +    if (s->refcount_table_size && reftable_tmp == NULL) {
>> +        return -ENOMEM;
>> +    }
>> +
>> +    for (i = 0; i < s->refcount_table_size; i++) {
>> +        int64_t refblock_offs = s->refcount_table[i] & REFT_OFFSET_MASK;
>> +        void *refblock;
>> +        bool unused_block;
>> +
>> +        if (refblock_offs == 0) {
>> +            reftable_tmp[i] = 0;
>> +            continue;
>> +        }
>> +        ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
>> +                              &refblock);
>> +        if (ret < 0) {
>> +            goto out;
>> +        }
>> +
>> +        /* the refblock has own reference */
>> +        if (i == refblock_offs >> (s->refcount_block_bits + s->cluster_bits)) {
>> +            uint64_t blk_index = (refblock_offs >> s->cluster_bits) &
>> +                                 (s->refcount_block_size - 1);
>> +            uint64_t refcount = s->get_refcount(refblock, blk_index);
>> +
>> +            s->set_refcount(refblock, blk_index, 0);
>> +
>> +            unused_block = buffer_is_zero(refblock, s->refcount_block_size);
> 
> s/refcount_block_size/cluster_size/
> 
>> +
>> +            s->set_refcount(refblock, blk_index, refcount);
>> +        } else {
>> +            unused_block = buffer_is_zero(refblock, s->refcount_block_size);
> 
> Same here.
> 
>> +        }
>> +        qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
>> +
>> +        reftable_tmp[i] = unused_block ? 0 : cpu_to_be64(s->refcount_table[i]);
>> +    }
>> +
>> +    ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset, reftable_tmp,
>> +                           sizeof(uint64_t) * s->refcount_table_size);
>> +    if (ret < 0) {
>> +        goto out;
>> +    }
>> +
>> +    for (i = 0; i < s->refcount_table_size; i++) {
>> +        if (s->refcount_table[i] && !reftable_tmp[i]) {
>> +            qcow2_free_clusters(bs, s->refcount_table[i] & REFT_OFFSET_MASK,
>> +                                s->cluster_size, QCOW2_DISCARD_ALWAYS);
> 
> This doesn't feel like a very good idea. The bdrv_pwrite_sync() before
> has brought the on-disk refcount structures into a different state than
> what we have cached.

That is exactly why the cache is discarded by qcow2_cache_discard() inside
qcow2_free_clusters()->update_refcount().

> OTOH, the bdrv_pwrite_sync() has accessed only the reftable and this
> should only access refblocks. So I cannot think of any way this might
> actually do something bad. But I guess it'll be better for to revisit
> this when it's not in the middle of the night (so on Friday).
> 
>> +            s->refcount_table[i] = 0;
>> +        }
>> +    }
>> +
>> +out:
>> +    g_free(reftable_tmp);
>> +    return ret;
>> +}
>> diff --git a/block/qcow2.c b/block/qcow2.c
>> index b3ba5daa93..0ad46d2776 100644
>> --- a/block/qcow2.c
>> +++ b/block/qcow2.c
>> @@ -2545,6 +2545,7 @@ static int qcow2_truncate(BlockDriverState *bs, int64_t offset, Error **errp)
>>   {
>>       BDRVQcow2State *s = bs->opaque;
>>       int64_t new_l1_size;
>> +    uint64_t total_size;
>>       int ret;
>>   
>>       if (offset & 511) {
>> @@ -2558,17 +2559,36 @@ static int qcow2_truncate(BlockDriverState *bs, int64_t offset, Error **errp)
>>           return -ENOTSUP;
>>       }
>>   
>> -    /* shrinking is currently not supported */
>> -    if (offset < bs->total_sectors * 512) {
>> -        error_setg(errp, "qcow2 doesn't support shrinking images yet");
>> -        return -ENOTSUP;
>> -    }
>> -
>>       new_l1_size = size_to_l1(s, offset);
>> -    ret = qcow2_grow_l1_table(bs, new_l1_size, true);
>> -    if (ret < 0) {
>> -        error_setg_errno(errp, -ret, "Failed to grow the L1 table");
>> -        return ret;
>> +    total_size = bs->total_sectors << BDRV_SECTOR_BITS;
>> +
>> +    if (offset < total_size) {
>> +        ret = qcow2_cluster_discard(bs, ROUND_UP(offset, s->cluster_size),
>> +                                    total_size - ROUND_UP(offset,
>> +                                                          s->cluster_size),
>> +                                    QCOW2_DISCARD_ALWAYS, true);
>> +        if (ret < 0) {
>> +            error_setg_errno(errp, -ret, "Failed to discard reduced clasters");
> 
> s/clasters/clusters/
> 
> And maybe "truncated", "stripped", or "cropped" instead of "reduced"?
> 
>> +            return ret;
>> +        }
>> +
>> +        ret = qcow2_shrink_l1_table(bs, new_l1_size);
>> +        if (ret < 0) {
>> +            error_setg_errno(errp, -ret, "Failed to reduce the L1 table");
> 
> s/reduce/shrink/ (or "truncate"; or "reduce the L1 table size")
> 
> Also, to be fair, you're actually reducing the number of L2 tables, not
> the size of the L1 table. (But that's a nit pick)

In the previous version of the patch, the L1 table size really was being
reduced :) I think it's better to fix the error message now.

>> +            return ret;
>> +        }
>> +
>> +        ret = qcow2_shrink_reftable(bs);
>> +        if (ret < 0) {
>> +            error_setg_errno(errp, -ret, "Failed to shrink the refcount table");
> 
> And this is not really shrinking the reftable but instead discarding
> some refblocks (potentially). (This is a nit pick, too)
> 
> Max
> 
>> +            return ret;
>> +        }
>> +    } else {
>> +        ret = qcow2_grow_l1_table(bs, new_l1_size, true);
>> +        if (ret < 0) {
>> +            error_setg_errno(errp, -ret, "Failed to grow the L1 table");
>> +            return ret;
>> +        }
>>       }
>>   
>>       /* write updated header.size */
>> diff --git a/block/qcow2.h b/block/qcow2.h
>> index 07faa6dc78..600463bf8e 100644
>> --- a/block/qcow2.h
>> +++ b/block/qcow2.h
>> @@ -531,10 +531,12 @@ int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
>>   int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
>>                                   BlockDriverAmendStatusCB *status_cb,
>>                                   void *cb_opaque, Error **errp);
>> +int qcow2_shrink_reftable(BlockDriverState *bs);
>>   
>>   /* qcow2-cluster.c functions */
>>   int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
>>                           bool exact_size);
>> +int qcow2_shrink_l1_table(BlockDriverState *bs, uint64_t max_size);
>>   int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index);
>>   int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset);
>>   int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num,
>> diff --git a/qapi/block-core.json b/qapi/block-core.json
>> index f85c2235c7..bcbffa3339 100644
>> --- a/qapi/block-core.json
>> +++ b/qapi/block-core.json
>> @@ -2372,7 +2372,8 @@
>>               'cluster_alloc_bytes', 'cluster_free', 'flush_to_os',
>>               'flush_to_disk', 'pwritev_rmw_head', 'pwritev_rmw_after_head',
>>               'pwritev_rmw_tail', 'pwritev_rmw_after_tail', 'pwritev',
>> -            'pwritev_zero', 'pwritev_done', 'empty_image_prepare' ] }
>> +            'pwritev_zero', 'pwritev_done', 'empty_image_prepare',
>> +            'l1_shrink_write_table', 'l1_shrink_free_l2_clusters' ] }
>>   
>>   ##
>>   # @BlkdebugInjectErrorOptions:
>>
> 
>
Max Reitz June 23, 2017, 3:46 p.m. UTC | #3
On 2017-06-22 15:57, Pavel Butsykin wrote:
> 
> On 22.06.2017 01:55, Max Reitz wrote:
>> On 2017-06-13 14:16, Pavel Butsykin wrote:
>>> This patch add shrinking of the image file for qcow2. As a result,
>>> this allows
>>> us to reduce the virtual image size and free up space on the disk
>>> without
>>> copying the image. Image can be fragmented and shrink is done by
>>> punching holes
>>> in the image file.
>>>
>>> Signed-off-by: Pavel Butsykin <pbutsykin@virtuozzo.com>
>>> ---
>>>   block/qcow2-cluster.c  | 42 ++++++++++++++++++++++++++++++++
>>>   block/qcow2-refcount.c | 65
>>> ++++++++++++++++++++++++++++++++++++++++++++++++++
>>>   block/qcow2.c          | 40 +++++++++++++++++++++++--------
>>>   block/qcow2.h          |  2 ++
>>>   qapi/block-core.json   |  3 ++-
>>>   5 files changed, 141 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
>>> index d779ea19cf..a84b7e607e 100644
>>> --- a/block/qcow2-cluster.c
>>> +++ b/block/qcow2-cluster.c
>>> @@ -32,6 +32,48 @@
>>>   #include "qemu/bswap.h"
>>>   #include "trace.h"
>>>   +int qcow2_shrink_l1_table(BlockDriverState *bs, uint64_t max_size)
>>
>> It's not really a max_size but always an exact size. You don't want it
>> to be any smaller than this.
>>
>>> +{
>>> +    BDRVQcow2State *s = bs->opaque;
>>> +    int new_l1_size, i, ret;
>>> +
>>> +    if (max_size >= s->l1_size) {
>>> +        return 0;
>>> +    }
>>> +
>>> +    new_l1_size = max_size;
>>> +
>>> +#ifdef DEBUG_ALLOC2
>>> +    fprintf(stderr, "shrink l1_table from %d to %" PRId64 "\n",
>>> +            s->l1_size, new_l1_size);
>>
>> new_l1_size is of type int, not int64_t.
>>
>>> +#endif
>>> +
>>> +    BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_WRITE_TABLE);
>>> +    ret = bdrv_pwrite_zeroes(bs->file, s->l1_table_offset +
>>> +                                       sizeof(uint64_t) * new_l1_size,
>>> +                             (s->l1_size - new_l1_size) *
>>> sizeof(uint64_t), 0);
>>> +    if (ret < 0) {
>>> +        return ret;
>>> +    }
>>> +
>>> +    ret = bdrv_flush(bs->file->bs);
>>> +    if (ret < 0) {
>>> +        return ret;
>>> +    }
>>> +
>>> +    BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_FREE_L2_CLUSTERS);
>>> +    for (i = s->l1_size - 1; i > new_l1_size - 1; i--) {
>>> +        if ((s->l1_table[i] & L1E_OFFSET_MASK) == 0) {
>>> +            continue;
>>> +        }
>>> +        qcow2_free_clusters(bs, s->l1_table[i] & L1E_OFFSET_MASK,
>>> +                            s->l2_size * sizeof(uint64_t),
>>
>> I'm more of a fan of s->cluster_size instead of s->l2_size *
>> sizeof(uint64_t) but it's not like it matters...
>>
>>> +                            QCOW2_DISCARD_ALWAYS);
>>> +        s->l1_table[i] = 0;
>>
>> I'd probably clear the overhanging s->l1_table entries before
>> bdrv_flush() (before you shouldn't really use them after
>> bdrv_pwrite_zeroes() has returned, even if bdrv_flush() has failed), but
>> it's not absolutely necessary. As long as they still have a refcount of
>> at least one, writing to them will just be useless but not destroy any
>> data.
>>
> 
> You're right, but If it's not necessary, I would prefer to leave as is..
> Just because overhanging s->l1_table entries used to release clusters :)

Hm, yes. The question is, how bad are useless writes?

So the worst case scenario is this: You invoke qmp_block_resize() to
shrink the image; the bdrv_flush() call fails somewhere in the middle
but the data is still kind of pending and basically in the image.

Now when you continue to use the image and write data beyond the
intended new end, that data basically ends up nowhere. You can still
read the data just fine and change it, but when you restart qemu, it
will all be gone. So that's weird.

Admittedly, though, bdrv_flush() isn't the only issue here;
bdrv_pwrite_zeroes() is, too. If that fails somewhere in the middle, we
basically have the same situation.

Now if we were to update s->l1_table before the bdrv_pwrite_zeroes()
call, we might end up with the opposite issue: The data appears to be
gone, but after reopening the image, it's back again. The main
difference is that in this case we'll have to allocate L2 tables anew
and this will require writes to the L1 table, so maybe we can actually
succeed in overwriting the old data then... But that's a big maybe.

So all in all we'll very likely get inconsistencies either way, so yes,
it doesn't actually matter. :-)

>>> +    }
>>> +    return 0;
>>> +}
>>> +
>>>   int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
>>>                           bool exact_size)
>>>   {
>>> diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
>>> index 576ab551d6..e98306acd8 100644
>>> --- a/block/qcow2-refcount.c
>>> +++ b/block/qcow2-refcount.c
>>> @@ -29,6 +29,7 @@
>>>   #include "block/qcow2.h"
>>>   #include "qemu/range.h"
>>>   #include "qemu/bswap.h"
>>> +#include "qemu/cutils.h"
>>>     static int64_t alloc_clusters_noref(BlockDriverState *bs,
>>> uint64_t size);
>>>   static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState
>>> *bs,
>>> @@ -2936,3 +2937,67 @@ done:
>>>       qemu_vfree(new_refblock);
>>>       return ret;
>>>   }
>>> +
>>> +int qcow2_shrink_reftable(BlockDriverState *bs)
>>> +{
>>> +    BDRVQcow2State *s = bs->opaque;
>>> +    uint64_t *reftable_tmp =
>>> +        g_try_malloc(sizeof(uint64_t) * s->refcount_table_size);
>>> +    int i, ret;
>>> +
>>> +    if (s->refcount_table_size && reftable_tmp == NULL) {
>>> +        return -ENOMEM;
>>> +    }
>>> +
>>> +    for (i = 0; i < s->refcount_table_size; i++) {
>>> +        int64_t refblock_offs = s->refcount_table[i] &
>>> REFT_OFFSET_MASK;
>>> +        void *refblock;
>>> +        bool unused_block;
>>> +
>>> +        if (refblock_offs == 0) {
>>> +            reftable_tmp[i] = 0;
>>> +            continue;
>>> +        }
>>> +        ret = qcow2_cache_get(bs, s->refcount_block_cache,
>>> refblock_offs,
>>> +                              &refblock);
>>> +        if (ret < 0) {
>>> +            goto out;
>>> +        }
>>> +
>>> +        /* the refblock has own reference */
>>> +        if (i == refblock_offs >> (s->refcount_block_bits +
>>> s->cluster_bits)) {
>>> +            uint64_t blk_index = (refblock_offs >> s->cluster_bits) &
>>> +                                 (s->refcount_block_size - 1);
>>> +            uint64_t refcount = s->get_refcount(refblock, blk_index);
>>> +
>>> +            s->set_refcount(refblock, blk_index, 0);
>>> +
>>> +            unused_block = buffer_is_zero(refblock,
>>> s->refcount_block_size);
>>
>> s/refcount_block_size/cluster_size/
>>
>>> +
>>> +            s->set_refcount(refblock, blk_index, refcount);
>>> +        } else {
>>> +            unused_block = buffer_is_zero(refblock,
>>> s->refcount_block_size);
>>
>> Same here.
>>
>>> +        }
>>> +        qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
>>> +
>>> +        reftable_tmp[i] = unused_block ? 0 :
>>> cpu_to_be64(s->refcount_table[i]);
>>> +    }
>>> +
>>> +    ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset,
>>> reftable_tmp,
>>> +                           sizeof(uint64_t) * s->refcount_table_size);
>>> +    if (ret < 0) {
>>> +        goto out;
>>> +    }
>>> +
>>> +    for (i = 0; i < s->refcount_table_size; i++) {
>>> +        if (s->refcount_table[i] && !reftable_tmp[i]) {
>>> +            qcow2_free_clusters(bs, s->refcount_table[i] &
>>> REFT_OFFSET_MASK,
>>> +                                s->cluster_size, QCOW2_DISCARD_ALWAYS);
>>
>> This doesn't feel like a very good idea. The bdrv_pwrite_sync() before
>> has brought the on-disk refcount structures into a different state than
>> what we have cached.
> 
> It is for this inside qcow2_free_clusters()->update_refcount() the cache
> is discarded by qcow2_cache_discard().

This doesn't change the fact that the in-memory reftable is different
from the on-disk reftable and that qcow2_free_clusters() may trip up on
that; the main issue is the alloc_refcount_block() call before.

So we need a guarantee that update_refcount() won't touch the reftable
if the refcount is decreased. It will call alloc_refcount_block() and
that should definitely find the respective refblock to already exist
because of course it has a refcount already.

But here's an issue: It tries to read from s->refcount_table[], and you
are slowly overwriting it in the same loop here. So it may not actually
find the refcount (if a refblock is described by an earlier one).
(After more than an hour of debugging, I realized this is not true: You
will only zero reftable entries if the refblock describes nothing or
only themselves. So overwriting one reftable entry cannot have effects
on other refblocks. Or at least it should not.)

Another potential issue is that you're assuming s->refcount_table_size
to be constant. I cannot find a way for it not to be, but investigating
this is painful and I can't claim I know for sure that it is constant.
If it isn't, you may get overflows when accessing reftable_tmp[].

(Yes, it may be constant; but the reader of this code has to read
through qcow2_free_clusters(), allocate_refcount_block() and
update_refcount() to know (or at least to guess) that's the case.)

I don't really want to look deeper into this, but here's an image that I
produced while trying to somehow break all of this. It makes qemu-img
check pass but fails after qemu-img resize --shrink shrink.qcow2 32M:
https://xanclic.moe/shrink.qcow2

(The image has been created with cluster_size=512 and refcount_bits=64;
then I filled the penultimate two entries of the reftable with pointers
to 0x1f0000 and 0x1f0200, respectively (so the first of these refblocks
would describe both), giving me this:
https://xanclic.moe/shrink-template.qcow2
I then put some data onto it with qemu-io -c 'write 0 1457K', which gave
me shrink.qcow2.)

Max

>> OTOH, the bdrv_pwrite_sync() has accessed only the reftable and this
>> should only access refblocks. So I cannot think of any way this might
>> actually do something bad. But I guess it'll be better for to revisit
>> this when it's not in the middle of the night (so on Friday).
>>
>>> +            s->refcount_table[i] = 0;
>>> +        }
>>> +    }
>>> +
>>> +out:
>>> +    g_free(reftable_tmp);
>>> +    return ret;
>>> +}
>>> diff --git a/block/qcow2.c b/block/qcow2.c
>>> index b3ba5daa93..0ad46d2776 100644
>>> --- a/block/qcow2.c
>>> +++ b/block/qcow2.c
>>> @@ -2545,6 +2545,7 @@ static int qcow2_truncate(BlockDriverState *bs,
>>> int64_t offset, Error **errp)
>>>   {
>>>       BDRVQcow2State *s = bs->opaque;
>>>       int64_t new_l1_size;
>>> +    uint64_t total_size;
>>>       int ret;
>>>         if (offset & 511) {
>>> @@ -2558,17 +2559,36 @@ static int qcow2_truncate(BlockDriverState
>>> *bs, int64_t offset, Error **errp)
>>>           return -ENOTSUP;
>>>       }
>>>   -    /* shrinking is currently not supported */
>>> -    if (offset < bs->total_sectors * 512) {
>>> -        error_setg(errp, "qcow2 doesn't support shrinking images yet");
>>> -        return -ENOTSUP;
>>> -    }
>>> -
>>>       new_l1_size = size_to_l1(s, offset);
>>> -    ret = qcow2_grow_l1_table(bs, new_l1_size, true);
>>> -    if (ret < 0) {
>>> -        error_setg_errno(errp, -ret, "Failed to grow the L1 table");
>>> -        return ret;
>>> +    total_size = bs->total_sectors << BDRV_SECTOR_BITS;
>>> +
>>> +    if (offset < total_size) {
>>> +        ret = qcow2_cluster_discard(bs, ROUND_UP(offset,
>>> s->cluster_size),
>>> +                                    total_size - ROUND_UP(offset,
>>> +                                                         
>>> s->cluster_size),
>>> +                                    QCOW2_DISCARD_ALWAYS, true);
>>> +        if (ret < 0) {
>>> +            error_setg_errno(errp, -ret, "Failed to discard reduced
>>> clasters");
>>
>> s/clasters/clusters/
>>
>> And maybe "truncated", "stripped", or "cropped" instead of "reduced"?
>>
>>> +            return ret;
>>> +        }
>>> +
>>> +        ret = qcow2_shrink_l1_table(bs, new_l1_size);
>>> +        if (ret < 0) {
>>> +            error_setg_errno(errp, -ret, "Failed to reduce the L1
>>> table");
>>
>> s/reduce/shrink/ (or "truncate"; or "reduce the L1 table size")
>>
>> Also, to be fair, you're actually reducing the number of L2 tables, not
>> the size of the L1 table. (But that's a nit pick)
> 
> In the previous patch version, there really was reducing the L1 table
> size :) I think now it's better to fix the error message.
> 
>>> +            return ret;
>>> +        }
>>> +
>>> +        ret = qcow2_shrink_reftable(bs);
>>> +        if (ret < 0) {
>>> +            error_setg_errno(errp, -ret, "Failed to shrink the
>>> refcount table");
>>
>> And this is not really shrinking the reftable but instead discarding
>> some refblocks (potentially). (This is a nit pick, too)
>>
>> Max
>>
>>> +            return ret;
>>> +        }
>>> +    } else {
>>> +        ret = qcow2_grow_l1_table(bs, new_l1_size, true);
>>> +        if (ret < 0) {
>>> +            error_setg_errno(errp, -ret, "Failed to grow the L1
>>> table");
>>> +            return ret;
>>> +        }
>>>       }
>>>         /* write updated header.size */
>>> diff --git a/block/qcow2.h b/block/qcow2.h
>>> index 07faa6dc78..600463bf8e 100644
>>> --- a/block/qcow2.h
>>> +++ b/block/qcow2.h
>>> @@ -531,10 +531,12 @@ int
>>> qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t
>>> offset,
>>>   int qcow2_change_refcount_order(BlockDriverState *bs, int
>>> refcount_order,
>>>                                   BlockDriverAmendStatusCB *status_cb,
>>>                                   void *cb_opaque, Error **errp);
>>> +int qcow2_shrink_reftable(BlockDriverState *bs);
>>>     /* qcow2-cluster.c functions */
>>>   int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
>>>                           bool exact_size);
>>> +int qcow2_shrink_l1_table(BlockDriverState *bs, uint64_t max_size);
>>>   int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index);
>>>   int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t
>>> cluster_offset);
>>>   int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num,
>>> diff --git a/qapi/block-core.json b/qapi/block-core.json
>>> index f85c2235c7..bcbffa3339 100644
>>> --- a/qapi/block-core.json
>>> +++ b/qapi/block-core.json
>>> @@ -2372,7 +2372,8 @@
>>>               'cluster_alloc_bytes', 'cluster_free', 'flush_to_os',
>>>               'flush_to_disk', 'pwritev_rmw_head',
>>> 'pwritev_rmw_after_head',
>>>               'pwritev_rmw_tail', 'pwritev_rmw_after_tail', 'pwritev',
>>> -            'pwritev_zero', 'pwritev_done', 'empty_image_prepare' ] }
>>> +            'pwritev_zero', 'pwritev_done', 'empty_image_prepare',
>>> +            'l1_shrink_write_table', 'l1_shrink_free_l2_clusters' ] }
>>>     ##
>>>   # @BlkdebugInjectErrorOptions:
>>>
>>
>>
Pavel Butsykin June 26, 2017, 3:23 p.m. UTC | #4
On 23.06.2017 18:46, Max Reitz wrote:
> On 2017-06-22 15:57, Pavel Butsykin wrote:
>>
>> On 22.06.2017 01:55, Max Reitz wrote:
>>> On 2017-06-13 14:16, Pavel Butsykin wrote:
[]
>>>> +        }
>>>> +        qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
>>>> +
>>>> +        reftable_tmp[i] = unused_block ? 0 :
>>>> cpu_to_be64(s->refcount_table[i]);
>>>> +    }
>>>> +
>>>> +    ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset,
>>>> reftable_tmp,
>>>> +                           sizeof(uint64_t) * s->refcount_table_size);
>>>> +    if (ret < 0) {
>>>> +        goto out;
>>>> +    }
>>>> +
>>>> +    for (i = 0; i < s->refcount_table_size; i++) {
>>>> +        if (s->refcount_table[i] && !reftable_tmp[i]) {
>>>> +            qcow2_free_clusters(bs, s->refcount_table[i] &
>>>> REFT_OFFSET_MASK,
>>>> +                                s->cluster_size, QCOW2_DISCARD_ALWAYS);
>>>
>>> This doesn't feel like a very good idea. The bdrv_pwrite_sync() before
>>> has brought the on-disk refcount structures into a different state than
>>> what we have cached.
>>
>> It is for this inside qcow2_free_clusters()->update_refcount() the cache
>> is discarded by qcow2_cache_discard().
> 
> This doesn't change the fact that the in-memory reftable is different
> from the on-disk reftable and that qcow2_free_clusters() may trip up on
> that; the main issue is the allocate_refcount_block() call before.

before what?

If we are talking about allocate_refcount_block() calls after
bdrv_pwrite_sync(), then: allocate_refcount_block() will always call
load_refcount_block(), which is actually not so dangerous even if
refcount_block_cache is empty, because the refblock offset will
always be taken from s->refcount_table.

> So we need a guarantee that update_refcount() won't touch the reftable
> if the refcount is decreased. It will call alloc_refcount_block() and
> that should definitely find the respective refblock to already exist
> because of course it has a refcount already.

We don't touch the refblocks which contain references to other
refblocks, this ensures that update_refcount() will not try to raise
the discarded refblock.

> But here's an issue: It tries to read from s->refcount_table[], and you
> are slowly overwriting it in the same loop here. So it may not actually
> find the refcount (if a refblock is described by an earlier one).
> (After more than an hour of debugging, I realized this is not true: You
> will only zero reftable entries if the refblock describes nothing or
> only themselves. So overwriting one reftable entry cannot have effects
> on other refblocks. Or at least it should not.)

As you've noticed, a simple approach is used here:
We discard only refblocks that contain nothing or only their own
reference. If we have a refblock that is actually empty, but contains a
reference to another empty refblock, we don't touch this refblock. Maybe
it's not the best solution, but at least it's simple and secure.

There is another approach that can be applied here:

1. decrease the refcounts for all refblocks
2. find all empty refblocks
3. increase the refcounts for all refblocks
4. rewrite the refcount_table on disk (with the empty reftable entries)
5. release all the empty refblocks in reverse order (start at the end of
the s->refcount_table)

This will certainly allow us to get rid of all empty refblocks, but the
code will be less welcoming :) Also the case when the refblock contains
a reference to another refblock is quite rare.

> Another potential issue is that you're assuming s->refcount_table_size
> to be constant. I cannot find a way for it not to be, but investigating
> this is painful and I can't claim I know for sure that it is constant.
> If it isn't, you may get overflows when accessing reftable_tmp[].
> 
> (Yes, it may be constant; but the reader of this code has to read
> through qcow2_free_clusters(), allocate_refcount_block() and
> update_refcount() to know (or at least to guess) that's the case.)

Is there any guarantee that in the future this will not change? Because
in this case it can be a potential danger.

I can add a comment... Or add a new variable holding the size of
reftable_tmp, and compute min(s->refcount_table_size, reftable_tmp_size)
each time before accessing s->refcount_table[]/reftable_tmp[].

> I don't really want to look deeper into this, but here's an image that I
> produced while trying to somehow break all of this. It makes qemu-img
> check pass but fails after qemu-img resize --shrink shrink.qcow2 32M:
> https://xanclic.moe/shrink.qcow2
> 
> (The image has been created with cluster_size=512 and refcount_bits=64;
> then I filled the penultimate two entries of the reftable with pointers
> to 0x1f0000 and 0x1f0200, respectively (so the first of these refblocks
> would describe both), giving me this:
> https://xanclic.moe/shrink-template.qcow2
> I then put some data onto it with qemu-io -c 'write 0 1457K', which gave
> me shrink.qcow2.)
>

Thank you for the samples! The mistake was quite naive :) , I just
messed up the sizes here:
             unused_block = buffer_is_zero(refblock, s->refcount_block_size);

             s->set_refcount(refblock, blk_index, refcount);
         } else {
             unused_block = buffer_is_zero(refblock, 
s->refcount_block_size);

Should be:
buffer_is_zero(refblock, s->cluster_size);

> Max
> 
>>> OTOH, the bdrv_pwrite_sync() has accessed only the reftable and this
>>> should only access refblocks. So I cannot think of any way this might
>>> actually do something bad. But I guess it'll be better for to revisit
>>> this when it's not in the middle of the night (so on Friday).
>>>
>>>> +            s->refcount_table[i] = 0;
>>>> +        }
>>>> +    }
>>>> +
>>>> +out:
>>>> +    g_free(reftable_tmp);
>>>> +    return ret;
>>>> +}
>>>> diff --git a/block/qcow2.c b/block/qcow2.c
>>>> index b3ba5daa93..0ad46d2776 100644
>>>> --- a/block/qcow2.c
>>>> +++ b/block/qcow2.c
>>>> @@ -2545,6 +2545,7 @@ static int qcow2_truncate(BlockDriverState *bs,
>>>> int64_t offset, Error **errp)
>>>>    {
>>>>        BDRVQcow2State *s = bs->opaque;
>>>>        int64_t new_l1_size;
>>>> +    uint64_t total_size;
>>>>        int ret;
>>>>          if (offset & 511) {
>>>> @@ -2558,17 +2559,36 @@ static int qcow2_truncate(BlockDriverState
>>>> *bs, int64_t offset, Error **errp)
>>>>            return -ENOTSUP;
>>>>        }
>>>>    -    /* shrinking is currently not supported */
>>>> -    if (offset < bs->total_sectors * 512) {
>>>> -        error_setg(errp, "qcow2 doesn't support shrinking images yet");
>>>> -        return -ENOTSUP;
>>>> -    }
>>>> -
>>>>        new_l1_size = size_to_l1(s, offset);
>>>> -    ret = qcow2_grow_l1_table(bs, new_l1_size, true);
>>>> -    if (ret < 0) {
>>>> -        error_setg_errno(errp, -ret, "Failed to grow the L1 table");
>>>> -        return ret;
>>>> +    total_size = bs->total_sectors << BDRV_SECTOR_BITS;
>>>> +
>>>> +    if (offset < total_size) {
>>>> +        ret = qcow2_cluster_discard(bs, ROUND_UP(offset,
>>>> s->cluster_size),
>>>> +                                    total_size - ROUND_UP(offset,
>>>> +
>>>> s->cluster_size),
>>>> +                                    QCOW2_DISCARD_ALWAYS, true);
>>>> +        if (ret < 0) {
>>>> +            error_setg_errno(errp, -ret, "Failed to discard reduced
>>>> clasters");
>>>
>>> s/clasters/clusters/
>>>
>>> And maybe "truncated", "stripped", or "cropped" instead of "reduced"?
>>>
>>>> +            return ret;
>>>> +        }
>>>> +
>>>> +        ret = qcow2_shrink_l1_table(bs, new_l1_size);
>>>> +        if (ret < 0) {
>>>> +            error_setg_errno(errp, -ret, "Failed to reduce the L1
>>>> table");
>>>
>>> s/reduce/shrink/ (or "truncate"; or "reduce the L1 table size")
>>>
>>> Also, to be fair, you're actually reducing the number of L2 tables, not
>>> the size of the L1 table. (But that's a nit pick)
>>
>> In the previous patch version, there really was reducing the L1 table
>> size :) I think now it's better to fix the error message.
>>
>>>> +            return ret;
>>>> +        }
>>>> +
>>>> +        ret = qcow2_shrink_reftable(bs);
>>>> +        if (ret < 0) {
>>>> +            error_setg_errno(errp, -ret, "Failed to shrink the
>>>> refcount table");
>>>
>>> And this is not really shrinking the reftable but instead discarding
>>> some refblocks (potentially). (This is a nit pick, too)
>>>
>>> Max
>>>
>>>> +            return ret;
>>>> +        }
>>>> +    } else {
>>>> +        ret = qcow2_grow_l1_table(bs, new_l1_size, true);
>>>> +        if (ret < 0) {
>>>> +            error_setg_errno(errp, -ret, "Failed to grow the L1
>>>> table");
>>>> +            return ret;
>>>> +        }
>>>>        }
>>>>          /* write updated header.size */
>>>> diff --git a/block/qcow2.h b/block/qcow2.h
>>>> index 07faa6dc78..600463bf8e 100644
>>>> --- a/block/qcow2.h
>>>> +++ b/block/qcow2.h
>>>> @@ -531,10 +531,12 @@ int
>>>> qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t
>>>> offset,
>>>>    int qcow2_change_refcount_order(BlockDriverState *bs, int
>>>> refcount_order,
>>>>                                    BlockDriverAmendStatusCB *status_cb,
>>>>                                    void *cb_opaque, Error **errp);
>>>> +int qcow2_shrink_reftable(BlockDriverState *bs);
>>>>      /* qcow2-cluster.c functions */
>>>>    int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
>>>>                            bool exact_size);
>>>> +int qcow2_shrink_l1_table(BlockDriverState *bs, uint64_t max_size);
>>>>    int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index);
>>>>    int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t
>>>> cluster_offset);
>>>>    int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num,
>>>> diff --git a/qapi/block-core.json b/qapi/block-core.json
>>>> index f85c2235c7..bcbffa3339 100644
>>>> --- a/qapi/block-core.json
>>>> +++ b/qapi/block-core.json
>>>> @@ -2372,7 +2372,8 @@
>>>>                'cluster_alloc_bytes', 'cluster_free', 'flush_to_os',
>>>>                'flush_to_disk', 'pwritev_rmw_head',
>>>> 'pwritev_rmw_after_head',
>>>>                'pwritev_rmw_tail', 'pwritev_rmw_after_tail', 'pwritev',
>>>> -            'pwritev_zero', 'pwritev_done', 'empty_image_prepare' ] }
>>>> +            'pwritev_zero', 'pwritev_done', 'empty_image_prepare',
>>>> +            'l1_shrink_write_table', 'l1_shrink_free_l2_clusters' ] }
>>>>      ##
>>>>    # @BlkdebugInjectErrorOptions:
>>>>
>>>
>>>
> 
>
Max Reitz June 26, 2017, 5:47 p.m. UTC | #5
On 2017-06-26 17:23, Pavel Butsykin wrote:
> On 23.06.2017 18:46, Max Reitz wrote:
>> On 2017-06-22 15:57, Pavel Butsykin wrote:
>>>
>>> On 22.06.2017 01:55, Max Reitz wrote:
>>>> On 2017-06-13 14:16, Pavel Butsykin wrote:
> []
>>>>> +        }
>>>>> +        qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
>>>>> +
>>>>> +        reftable_tmp[i] = unused_block ? 0 :
>>>>> cpu_to_be64(s->refcount_table[i]);
>>>>> +    }
>>>>> +
>>>>> +    ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset,
>>>>> reftable_tmp,
>>>>> +                           sizeof(uint64_t) *
>>>>> s->refcount_table_size);
>>>>> +    if (ret < 0) {
>>>>> +        goto out;
>>>>> +    }
>>>>> +
>>>>> +    for (i = 0; i < s->refcount_table_size; i++) {
>>>>> +        if (s->refcount_table[i] && !reftable_tmp[i]) {
>>>>> +            qcow2_free_clusters(bs, s->refcount_table[i] &
>>>>> REFT_OFFSET_MASK,
>>>>> +                                s->cluster_size,
>>>>> QCOW2_DISCARD_ALWAYS);
>>>>
>>>> This doesn't feel like a very good idea. The bdrv_pwrite_sync() before
>>>> has brought the on-disk refcount structures into a different state than
>>>> what we have cached.
>>>
>>> It is for this inside qcow2_free_clusters()->update_refcount() the cache
>>> is discarded by qcow2_cache_discard().
>>
>> This doesn't change the fact that the in-memory reftable is different
>> from the on-disk reftable and that qcow2_free_clusters() may trip up on
>> that; the main issue is the allocate_refcount_block() call before.

*alloc_refcount_block(), sorry.

> before what?

Before qcow2_cache_discard() is called.

> If we are talking about allocate_refcount_block() calls after
> bdrv_pwrite_sync(), then... Inside allocate_refcount_block() will always
> be called load_refcount_block(), what actually is not so dangerous even
> if refcount_block_cache is empty. Because the refblock offset will
> always be taken from s->refcount_table.

Well, yes, and this offset has not been cleared yet, so it still points
to the old refblock (but the on-disk reftable does not, and this worries
me).

>> So we need a guarantee that update_refcount() won't touch the reftable
>> if the refcount is decreased. It will call alloc_refcount_block() and
>> that should definitely find the respective refblock to already exist
>> because of course it has a refcount already.
> 
> We don't touch the refblocks which contain references to other
> refblocks, this ensures that update_refcount() will not try to raise
> the discarded refblock.

It may ensure this in practice, yes, but I found proving this to be
rather difficult.

>> But here's an issue: It tries to read from s->refcount_table[], and you
>> are slowly overwriting it in the same loop here. So it may not actually
>> find the refcount (if a refblock is described by an earlier one).
>> (After more than an hour of debugging, I realized this is not true: You
>> will only zero reftable entries if the refblock describes nothing or
>> only themselves. So overwriting one reftable entry cannot have effects
>> on other refblocks. Or at least it should not.)
> 
> As you've noticed, here uses a simple approach:

Well, if it is a simple approach, I'm just very dense.

> We discard only refblocks that contain nothing or own reference. If we
> have a refblock that is actually empty, but contains a reference to
> another empty refblock, we don't touch this refblock. Maybe it's not the
> best solution, but at least it's simple and secure.

The theory is simple, yes, which is why I found it fine until the point
you decide to call usual refcount management functions with the on-disk
data differing from what is cached as s->refcount_table.

> There is another approach that can be applied here:
> 
> 1. decrease the refcounts for all refblocks

I think this will leave a broken image at this point, so I'd rather
avoid it.

> 2. find all empty refblocks
> 3. increase the refcounts for all refblocks
> 4. rewrite the refcount_table on disk (with the empty reftable entries)
> 5. release all the emptt reblocks in reverse order (start at the end of
> the s->refcount_table)
> 
> This will certainly allow us to get rid of all empty reblocks, but the
> code will be less welcoming :) Also the case when the refblock contains
> a reference to another refblock is quite rare.

Another way would be to invoke the function for dropping empty refblocks
repeatedly until no refblocks can be dropped anymore. This wouldn't
cover cyclic references, but I think that's fine.

In any case, though, I agree that we don't need to put too much work
into this. Refblocks by default just use around 0.03 % of what's needed
for data, so...

>> Another potential issue is that you're assuming s->refcount_table_size
>> to be constant. I cannot find a way for it not to be, but investigating
>> this is painful and I can't claim I know for sure that it is constant.
>> If it isn't, you may get overflows when accessing reftable_tmp[].
>>
>> (Yes, it may be constant; but the reader of this code has to read
>> through qcow2_free_clusters(), allocate_refcount_block() and
>> update_refcount() to know (or at least to guess) that's the case.)
> 
> Is there any guarantee that in the future this will not change? Because
> in this case it can be a potential danger.

Since this behavior is not documented anywhere, there is no guarantee.

> I can add a comment... Or add a new variable with the size of
> reftable_tmp, and every time count min(s->refcount_table_size,
> reftable_tmp_size)
> before accessing to s->refcount_table[]/reftable_tmp[]

Or (1) you add an assertion that refcount_table_size doesn't change
along with a comment why that is the case, which also explains in detail
why the call to qcow2_free_clusters() should be safe: The on-disk
reftable differs from the one in memory. qcow2_free_clusters() and
update_refcount() themselves do not access the reftable, so they are
safe. However, update_refcount() calls alloc_refcount_block(), and that
function does access the reftable: Now, as long as
s->refcount_table_size does not shrink (which I can't see why it would),
refcount_table_index should always be smaller. Now we're accessing
s->refcount_table: This will always return an existing refblock because
this will either be the refblock itself (for self-referencing refblocks)
or another one that is not going to be freed by qcow2_shrink_reftable()
because this function will not free refblocks which cover other clusters
than themselves.
We will then proceed to update the refblock which is either right (if it
is not the refblock to be freed) or won't do anything (if it is the one
to be freed).
In any case, we will never write to the reftable and reading from the
basically outdated cached version will never do anything bad.

Or (2) you copy reftable_tmp into s->refcount_table[] *before* any call
to qcow2_free_clusters(). To make this work, you would need to also
discard all refblocks from the cache in this function here (and not in
update_refcount()) and then only call qcow2_free_clusters() on refblocks
which were not self-referencing. An alternative hack would be to simply
mark the image dirty and just not do any qcow2_free_clusters() call...

Or (3) of course it would be possible to not clean up refcount
structures at all...

Max
Pavel Butsykin June 27, 2017, 3:06 p.m. UTC | #6
On 26.06.2017 20:47, Max Reitz wrote:
> On 2017-06-26 17:23, Pavel Butsykin wrote:
[]
>>
>> Is there any guarantee that in the future this will not change? Because
>> in this case it can be a potential danger.
> 
> Since this behavior is not documented anywhere, there is no guarantee.
> 
>> I can add a comment... Or add a new variable with the size of
>> reftable_tmp, and every time count min(s->refcount_table_size,
>> reftable_tmp_size)
>> before accessing to s->refcount_table[]/reftable_tmp[]
> 
> Or (1) you add an assertion that refcount_table_size doesn't change
> along with a comment why that is the case, which also explains in detail
> why the call to qcow2_free_clusters() should be safe: The on-disk
> reftable differs from the one in memory. qcow2_free_clusters()and
> update_refcount() themselves do not access the reftable, so they are
> safe. However, update_refcount() calls alloc_refcount_block(), and that
> function does access the reftable: Now, as long as
> s->refcount_table_size does not shrink (which I can't see why it would),
> refcount_table_index should always be smaller. Now we're accessing
> s->refcount_table: This will always return an existing refblock because
> this will either be the refblock itself (for self-referencing refblocks)
> or another one that is not going to be freed by qcow2_shrink_reftable()
> because this function will not free refblocks which cover other clusters
> than themselves.
> We will then proceed to update the refblock which is either right (if it
> is not the refblock to be freed) or won't do anything (if it is the one
> to be freed).
> In any case, we will never write to the reftable and reading from the
> basically outdated cached version will never do anything bad.

OK, SGTM.

> Or (2) you copy reftable_tmp into s->refcount_table[] *before* any call
> to qcow2_free_clusters(). To make this work, you would need to also
> discard all refblocks from the cache in this function here (and not in
> update_refcount()) and then only call qcow2_free_clusters() on refblocks
> which were not self-referencing. An alternative hack would be to simply
> mark the image dirty and just not do any qcow2_free_clusters() call...

The main purpose of the qcow2_shrink_reftable() function is to discard
all unnecessary refblocks from the file. If we only rewrite the
refcount_table and discard non-self-referencing refblocks (which are
actually very rare), then the meaning of the function is lost.

> Or (3) of course it would be possible to not clean up refcount
> structures at all...

Nice solution :)

> Max
>
Max Reitz June 28, 2017, 1:59 p.m. UTC | #7
On 2017-06-27 17:06, Pavel Butsykin wrote:
> On 26.06.2017 20:47, Max Reitz wrote:
>> On 2017-06-26 17:23, Pavel Butsykin wrote:
> []
>>>
>>> Is there any guarantee that in the future this will not change? Because
>>> in this case it can be a potential danger.
>>
>> Since this behavior is not documented anywhere, there is no guarantee.
>>
>>> I can add a comment... Or add a new variable with the size of
>>> reftable_tmp, and every time count min(s->refcount_table_size,
>>> reftable_tmp_size)
>>> before accessing to s->refcount_table[]/reftable_tmp[]
>>
>> Or (1) you add an assertion that refcount_table_size doesn't change
>> along with a comment why that is the case, which also explains in detail
>> why the call to qcow2_free_clusters() should be safe: The on-disk
>> reftable differs from the one in memory. qcow2_free_clusters()and
>> update_refcount() themselves do not access the reftable, so they are
>> safe. However, update_refcount() calls alloc_refcount_block(), and that
>> function does access the reftable: Now, as long as
>> s->refcount_table_size does not shrink (which I can't see why it would),
>> refcount_table_index should always be smaller. Now we're accessing
>> s->refcount_table: This will always return an existing refblock because
>> this will either be the refblock itself (for self-referencing refblocks)
>> or another one that is not going to be freed by qcow2_shrink_reftable()
>> because this function will not free refblocks which cover other clusters
>> than themselves.
>> We will then proceed to update the refblock which is either right (if it
>> is not the refblock to be freed) or won't do anything (if it is the one
>> to be freed).
>> In any case, we will never write to the reftable and reading from the
>> basically outdated cached version will never do anything bad.
> 
> OK, SGTM.
> 
>> Or (2) you copy reftable_tmp into s->refcount_table[] *before* any call
>> to qcow2_free_clusters(). To make this work, you would need to also
>> discard all refblocks from the cache in this function here (and not in
>> update_refcount()) and then only call qcow2_free_clusters() on refblocks
>> which were not self-referencing. An alternative hack would be to simply
>> mark the image dirty and just not do any qcow2_free_clusters() call...
> 
> The main purpose of qcow2_reftable_shrink() function is discard all
> unnecessary refblocks from the file. If we do only rewrite
> refcount_table and discard non-self-referencing refblocks (which are
> actually very rare), then the meaning of the function is lost.

It would do exactly the same. The idea is that you do not need to call
qcow2_free_clusters() on self-referencing refblocks at all, since they
are freed automatically when their reftable entry is overwritten with 0.

>> Or (3) of course it would be possible to not clean up refcount
>> structures at all...
> 
> Nice solution :)

It is, because as I said refcount structures only have a small overhead.

Max
Pavel Butsykin June 28, 2017, 3:31 p.m. UTC | #8
On 28.06.2017 16:59, Max Reitz wrote:
> On 2017-06-27 17:06, Pavel Butsykin wrote:
>> On 26.06.2017 20:47, Max Reitz wrote:
>>> On 2017-06-26 17:23, Pavel Butsykin wrote:
>> []
>>>>
>>>> Is there any guarantee that in the future this will not change? Because
>>>> in this case it can be a potential danger.
>>>
>>> Since this behavior is not documented anywhere, there is no guarantee.
>>>
>>>> I can add a comment... Or add a new variable with the size of
>>>> reftable_tmp, and every time count min(s->refcount_table_size,
>>>> reftable_tmp_size)
>>>> before accessing to s->refcount_table[]/reftable_tmp[]
>>>
>>> Or (1) you add an assertion that refcount_table_size doesn't change
>>> along with a comment why that is the case, which also explains in detail
>>> why the call to qcow2_free_clusters() should be safe: The on-disk
>>> reftable differs from the one in memory. qcow2_free_clusters()and
>>> update_refcount() themselves do not access the reftable, so they are
>>> safe. However, update_refcount() calls alloc_refcount_block(), and that
>>> function does access the reftable: Now, as long as
>>> s->refcount_table_size does not shrink (which I can't see why it would),
>>> refcount_table_index should always be smaller. Now we're accessing
>>> s->refcount_table: This will always return an existing refblock because
>>> this will either be the refblock itself (for self-referencing refblocks)
>>> or another one that is not going to be freed by qcow2_shrink_reftable()
>>> because this function will not free refblocks which cover other clusters
>>> than themselves.
>>> We will then proceed to update the refblock which is either right (if it
>>> is not the refblock to be freed) or won't do anything (if it is the one
>>> to be freed).
>>> In any case, we will never write to the reftable and reading from the
>>> basically outdated cached version will never do anything bad.
>>
>> OK, SGTM.
>>
>>> Or (2) you copy reftable_tmp into s->refcount_table[] *before* any call
>>> to qcow2_free_clusters(). To make this work, you would need to also
>>> discard all refblocks from the cache in this function here (and not in
>>> update_refcount()) and then only call qcow2_free_clusters() on refblocks
>>> which were not self-referencing. An alternative hack would be to simply
>>> mark the image dirty and just not do any qcow2_free_clusters() call...
>>
>> The main purpose of the qcow2_reftable_shrink() function is to discard all
>> unnecessary refblocks from the file. If we do only rewrite
>> refcount_table and discard non-self-referencing refblocks (which are
>> actually very rare), then the meaning of the function is lost.
> 
> It would do exactly the same. The idea is that you do not need to call
> qcow2_free_clusters() on self-referencing refblocks at all, since they
> are freed automatically when their reftable entry is overwritten with 0.

Not sure.. For self-referencing refblocks, we also need to do:
1. check if refcount > 1
2. update s->free_cluster_index
3. call update_refcount_discard() (so that, in the end, fallocate
PUNCH_HOLE is called on the refblock offset)

It will be practically a copy-paste from qcow2_free_clusters(), so it is
better to avoid it. I think that if it makes sense to do
qcow2_reftable_shrink(), it is only because we can slightly reduce image
size.

>>> Or (3) of course it would be possible to not clean up refcount
>>> structures at all...
>>
>> Nice solution :)
> 
> It is, because as I said refcount structures only have a small overhead.

Yes, I agree.

> Max
>
Max Reitz June 28, 2017, 11:36 p.m. UTC | #9
On 2017-06-28 17:31, Pavel Butsykin wrote:
> On 28.06.2017 16:59, Max Reitz wrote:
>> On 2017-06-27 17:06, Pavel Butsykin wrote:
>>> On 26.06.2017 20:47, Max Reitz wrote:
>>>> On 2017-06-26 17:23, Pavel Butsykin wrote:
>>> []
>>>>>
>>>>> Is there any guarantee that in the future this will not change?
>>>>> Because
>>>>> in this case it can be a potential danger.
>>>>
>>>> Since this behavior is not documented anywhere, there is no guarantee.
>>>>
>>>>> I can add a comment... Or add a new variable with the size of
>>>>> reftable_tmp, and every time count min(s->refcount_table_size,
>>>>> reftable_tmp_size)
>>>>> before accessing to s->refcount_table[]/reftable_tmp[]
>>>>
>>>> Or (1) you add an assertion that refcount_table_size doesn't change
>>>> along with a comment why that is the case, which also explains in
>>>> detail
>>>> why the call to qcow2_free_clusters() should be safe: The on-disk
>>>> reftable differs from the one in memory. qcow2_free_clusters()and
>>>> update_refcount() themselves do not access the reftable, so they are
>>>> safe. However, update_refcount() calls alloc_refcount_block(), and that
>>>> function does access the reftable: Now, as long as
>>>> s->refcount_table_size does not shrink (which I can't see why it
>>>> would),
>>>> refcount_table_index should always be smaller. Now we're accessing
>>>> s->refcount_table: This will always return an existing refblock because
>>>> this will either be the refblock itself (for self-referencing
>>>> refblocks)
>>>> or another one that is not going to be freed by qcow2_shrink_reftable()
>>>> because this function will not free refblocks which cover other
>>>> clusters
>>>> than themselves.
>>>> We will then proceed to update the refblock which is either right
>>>> (if it
>>>> is not the refblock to be freed) or won't do anything (if it is the one
>>>> to be freed).
>>>> In any case, we will never write to the reftable and reading from the
>>>> basically outdated cached version will never do anything bad.
>>>
>>> OK, SGTM.
>>>
>>>> Or (2) you copy reftable_tmp into s->refcount_table[] *before* any call
>>>> to qcow2_free_clusters(). To make this work, you would need to also
>>>> discard all refblocks from the cache in this function here (and not in
>>>> update_refcount()) and then only call qcow2_free_clusters() on
>>>> refblocks
>>>> which were not self-referencing. An alternative hack would be to simply
>>>> mark the image dirty and just not do any qcow2_free_clusters() call...
>>>
>>> The main purpose of the qcow2_reftable_shrink() function is to discard all
>>> unnecessary refblocks from the file. If we do only rewrite
>>> refcount_table and discard non-self-referencing refblocks (which are
>>> actually very rare), then the meaning of the function is lost.
>>
>> It would do exactly the same. The idea is that you do not need to call
>> qcow2_free_clusters() on self-referencing refblocks at all, since they
>> are freed automatically when their reftable entry is overwritten with 0.
> 
> Not sure.. For self-referencing refblocks, we also need to do:
> 1. check if refcount > 1

Yes, if that wasn't an error flagged by qemu-img check. :-)

(http://git.qemu.org/?p=qemu.git;a=blob;f=block/qcow2-refcount.c;h=7c06061aae90eb4f091f51df995a9e099178c0ed;hb=HEAD#l1787)

> 2. update s->free_cluster_index
> 3. call update_refcount_discard() (so that, in the end, fallocate
> PUNCH_HOLE is called on the refblock offset)

These, yes, you'd have to do here.

> It will be practically a copy-paste from qcow2_free_clusters(), so it is
> better to avoid it. I think that if it makes sense to do
> qcow2_reftable_shrink(), it is only because we can slightly reduce image
> size.

But it would be a small copy-paste (although I may very well be wrong)
and it would help me sleep better because I could actually understand it.

Max

>>>> Or (3) of course it would be possible to not clean up refcount
>>>> structures at all...
>>>
>>> Nice solution :)
>>
>> It is, because as I said refcount structures only have a small overhead.
> 
> Yes, I agree.
diff mbox

Patch

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index d779ea19cf..a84b7e607e 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -32,6 +32,48 @@ 
 #include "qemu/bswap.h"
 #include "trace.h"
 
+/*
+ * Drops the L1 table entries from index @max_size up to s->l1_size.
+ *
+ * The affected on-disk L1 entries are zeroed and flushed first; only after
+ * that are the clusters they pointed to freed and the in-memory entries
+ * cleared. s->l1_size itself is not modified by this function.
+ *
+ * Returns 0 on success (also when @max_size is not smaller than the current
+ * L1 size, in which case nothing is done), or the negative value returned by
+ * bdrv_pwrite_zeroes() / bdrv_flush() on failure.
+ */
+int qcow2_shrink_l1_table(BlockDriverState *bs, uint64_t max_size)
+{
+    BDRVQcow2State *s = bs->opaque;
+    int new_l1_size, i, ret;
+
+    if (max_size >= s->l1_size) {
+        return 0;
+    }
+
+    /* max_size < s->l1_size here, so it fits into an int */
+    new_l1_size = max_size;
+
+#ifdef DEBUG_ALLOC2
+    /* both arguments are plain ints, so both conversions must be %d */
+    fprintf(stderr, "shrink l1_table from %d to %d\n",
+            s->l1_size, new_l1_size);
+#endif
+
+    /* Clear the tail of the on-disk L1 table before freeing anything */
+    BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_WRITE_TABLE);
+    ret = bdrv_pwrite_zeroes(bs->file, s->l1_table_offset +
+                                       sizeof(uint64_t) * new_l1_size,
+                             (s->l1_size - new_l1_size) * sizeof(uint64_t), 0);
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = bdrv_flush(bs->file->bs);
+    if (ret < 0) {
+        return ret;
+    }
+
+    /* Walk the dropped entries from the end and release what they point to */
+    BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_FREE_L2_CLUSTERS);
+    for (i = s->l1_size - 1; i > new_l1_size - 1; i--) {
+        uint64_t l2_offset = s->l1_table[i] & L1E_OFFSET_MASK;
+
+        if (l2_offset == 0) {
+            continue;
+        }
+        qcow2_free_clusters(bs, l2_offset,
+                            s->l2_size * sizeof(uint64_t),
+                            QCOW2_DISCARD_ALWAYS);
+        s->l1_table[i] = 0;
+    }
+    return 0;
+}
+
 int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
                         bool exact_size)
 {
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 576ab551d6..e98306acd8 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -29,6 +29,7 @@ 
 #include "block/qcow2.h"
 #include "qemu/range.h"
 #include "qemu/bswap.h"
+#include "qemu/cutils.h"
 
 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size);
 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
@@ -2936,3 +2937,67 @@  done:
     qemu_vfree(new_refblock);
     return ret;
 }
+
+/*
+ * Removes refcount blocks that no longer describe any allocated cluster.
+ *
+ * Every reftable entry is inspected: a refblock counts as unused when all of
+ * its bytes are zero, where a refblock that covers its own cluster has its
+ * own refcount entry temporarily ignored for that check. A new reftable with
+ * the unused entries zeroed is written to disk with bdrv_pwrite_sync(); after
+ * that the clusters of the dropped refblocks are freed and the corresponding
+ * in-memory reftable entries are cleared.
+ *
+ * Returns -ENOMEM if the temporary reftable cannot be allocated, the negative
+ * value from qcow2_cache_get() or bdrv_pwrite_sync() on failure, and
+ * otherwise the result of bdrv_pwrite_sync().
+ */
+int qcow2_shrink_reftable(BlockDriverState *bs)
+{
+    BDRVQcow2State *s = bs->opaque;
+    uint64_t *reftable_tmp =
+        g_try_malloc(sizeof(uint64_t) * s->refcount_table_size);
+    int i, ret;
+
+    if (s->refcount_table_size && reftable_tmp == NULL) {
+        return -ENOMEM;
+    }
+
+    for (i = 0; i < s->refcount_table_size; i++) {
+        int64_t refblock_offs = s->refcount_table[i] & REFT_OFFSET_MASK;
+        void *refblock;
+        bool unused_block;
+
+        if (refblock_offs == 0) {
+            reftable_tmp[i] = 0;
+            continue;
+        }
+        ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
+                              &refblock);
+        if (ret < 0) {
+            goto out;
+        }
+
+        /*
+         * The zero check must cover the whole refblock. A refblock is freed
+         * as s->cluster_size bytes below, whereas s->refcount_block_size is
+         * an entry count (it masks the entry index just below), so using it
+         * as a byte length would leave part of the block unchecked.
+         */
+
+        /* the refblock has own reference */
+        if (i == refblock_offs >> (s->refcount_block_bits + s->cluster_bits)) {
+            uint64_t blk_index = (refblock_offs >> s->cluster_bits) &
+                                 (s->refcount_block_size - 1);
+            uint64_t refcount = s->get_refcount(refblock, blk_index);
+
+            /* hide the self-reference for the check, then restore it */
+            s->set_refcount(refblock, blk_index, 0);
+
+            unused_block = buffer_is_zero(refblock, s->cluster_size);
+
+            s->set_refcount(refblock, blk_index, refcount);
+        } else {
+            unused_block = buffer_is_zero(refblock, s->cluster_size);
+        }
+        qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
+
+        /* reftable_tmp is written to disk as-is, hence big-endian entries */
+        reftable_tmp[i] = unused_block ? 0 : cpu_to_be64(s->refcount_table[i]);
+    }
+
+    ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset, reftable_tmp,
+                           sizeof(uint64_t) * s->refcount_table_size);
+    if (ret < 0) {
+        goto out;
+    }
+
+    /* Free the refblocks whose entries were dropped from the new reftable */
+    for (i = 0; i < s->refcount_table_size; i++) {
+        if (s->refcount_table[i] && !reftable_tmp[i]) {
+            qcow2_free_clusters(bs, s->refcount_table[i] & REFT_OFFSET_MASK,
+                                s->cluster_size, QCOW2_DISCARD_ALWAYS);
+            s->refcount_table[i] = 0;
+        }
+    }
+
+out:
+    g_free(reftable_tmp);
+    return ret;
+}
diff --git a/block/qcow2.c b/block/qcow2.c
index b3ba5daa93..0ad46d2776 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -2545,6 +2545,7 @@  static int qcow2_truncate(BlockDriverState *bs, int64_t offset, Error **errp)
 {
     BDRVQcow2State *s = bs->opaque;
     int64_t new_l1_size;
+    uint64_t total_size;
     int ret;
 
     if (offset & 511) {
@@ -2558,17 +2559,36 @@  static int qcow2_truncate(BlockDriverState *bs, int64_t offset, Error **errp)
         return -ENOTSUP;
     }
 
-    /* shrinking is currently not supported */
-    if (offset < bs->total_sectors * 512) {
-        error_setg(errp, "qcow2 doesn't support shrinking images yet");
-        return -ENOTSUP;
-    }
-
     new_l1_size = size_to_l1(s, offset);
-    ret = qcow2_grow_l1_table(bs, new_l1_size, true);
-    if (ret < 0) {
-        error_setg_errno(errp, -ret, "Failed to grow the L1 table");
-        return ret;
+    total_size = bs->total_sectors << BDRV_SECTOR_BITS;
+
+    if (offset < total_size) {
+        ret = qcow2_cluster_discard(bs, ROUND_UP(offset, s->cluster_size),
+                                    total_size - ROUND_UP(offset,
+                                                          s->cluster_size),
+                                    QCOW2_DISCARD_ALWAYS, true);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret, "Failed to discard reduced clasters");
+            return ret;
+        }
+
+        ret = qcow2_shrink_l1_table(bs, new_l1_size);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret, "Failed to reduce the L1 table");
+            return ret;
+        }
+
+        ret = qcow2_shrink_reftable(bs);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret, "Failed to shrink the refcount table");
+            return ret;
+        }
+    } else {
+        ret = qcow2_grow_l1_table(bs, new_l1_size, true);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret, "Failed to grow the L1 table");
+            return ret;
+        }
     }
 
     /* write updated header.size */
diff --git a/block/qcow2.h b/block/qcow2.h
index 07faa6dc78..600463bf8e 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -531,10 +531,12 @@  int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
 int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
                                 BlockDriverAmendStatusCB *status_cb,
                                 void *cb_opaque, Error **errp);
+int qcow2_shrink_reftable(BlockDriverState *bs);
 
 /* qcow2-cluster.c functions */
 int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
                         bool exact_size);
+int qcow2_shrink_l1_table(BlockDriverState *bs, uint64_t max_size);
 int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index);
 int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset);
 int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num,
diff --git a/qapi/block-core.json b/qapi/block-core.json
index f85c2235c7..bcbffa3339 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -2372,7 +2372,8 @@ 
             'cluster_alloc_bytes', 'cluster_free', 'flush_to_os',
             'flush_to_disk', 'pwritev_rmw_head', 'pwritev_rmw_after_head',
             'pwritev_rmw_tail', 'pwritev_rmw_after_tail', 'pwritev',
-            'pwritev_zero', 'pwritev_done', 'empty_image_prepare' ] }
+            'pwritev_zero', 'pwritev_done', 'empty_image_prepare',
+            'l1_shrink_write_table', 'l1_shrink_free_l2_clusters' ] }
 
 ##
 # @BlkdebugInjectErrorOptions: