diff mbox

[16/17] block: Make bdrv_is_allocated() byte-based

Message ID 20170411222945.11741-17-eblake@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Eric Blake April 11, 2017, 10:29 p.m. UTC
We are gradually moving away from sector-based interfaces, towards
byte-based.  In the common case, allocation is unlikely to ever use
values that are not naturally sector-aligned, but it is possible
that byte-based values will let us be more precise about allocation
at the end of an unaligned file that can do byte-based access.

Changing the signature of the function to use int64_t *pnum ensures
that the compiler enforces that all callers are updated.  For now,
the io.c layer still assert()s that all callers are sector-aligned,
but that can be relaxed when a later patch implements byte-based
block status.  Therefore, for the most part this patch is just the
addition of scaling at the callers followed by inverse scaling at
bdrv_is_allocated().  But some code, particularly bdrv_commit(),
gets a lot simpler because it no longer has to mess with sectors;
also, it is now possible to pass NULL if the caller does not care
how much of the image is allocated beyond the initial offset.

For ease of review, bdrv_is_allocated_above() will be tackled
separately.

Signed-off-by: Eric Blake <eblake@redhat.com>
---
 include/block/block.h |  4 ++--
 block/backup.c        | 17 +++++----------
 block/commit.c        | 21 ++++++++-----------
 block/io.c            | 49 +++++++++++++++++++++++++++++--------------
 block/stream.c        |  5 +++--
 block/vvfat.c         | 34 +++++++++++++++++-------------
 migration/block.c     |  9 ++++----
 qemu-img.c            |  5 ++++-
 qemu-io-cmds.c        | 57 +++++++++++++++++++++++++--------------------------
 9 files changed, 110 insertions(+), 91 deletions(-)

Comments

John Snow April 18, 2017, 10:15 p.m. UTC | #1
On 04/11/2017 06:29 PM, Eric Blake wrote:
> We are gradually moving away from sector-based interfaces, towards
> byte-based.  In the common case, allocation is unlikely to ever use
> values that are not naturally sector-aligned, but it is possible
> that byte-based values will let us be more precise about allocation
> at the end of an unaligned file that can do byte-based access.
> 
> Changing the signature of the function to use int64_t *pnum ensures
> that the compiler enforces that all callers are updated.  For now,
> the io.c layer still assert()s that all callers are sector-aligned,
> but that can be relaxed when a later patch implements byte-based
> block status.  Therefore, for the most part this patch is just the
> addition of scaling at the callers followed by inverse scaling at
> bdrv_is_allocated().  But some code, particularly bdrv_commit(),
> gets a lot simpler because it no longer has to mess with sectors;
> also, it is now possible to pass NULL if the caller does not care
> how much of the image is allocated beyond the initial offset.
> 
> For ease of review, bdrv_is_allocated_above() will be tackled
> separately.
> 
> Signed-off-by: Eric Blake <eblake@redhat.com>
> ---
>  include/block/block.h |  4 ++--
>  block/backup.c        | 17 +++++----------
>  block/commit.c        | 21 ++++++++-----------
>  block/io.c            | 49 +++++++++++++++++++++++++++++--------------
>  block/stream.c        |  5 +++--
>  block/vvfat.c         | 34 +++++++++++++++++-------------
>  migration/block.c     |  9 ++++----
>  qemu-img.c            |  5 ++++-
>  qemu-io-cmds.c        | 57 +++++++++++++++++++++++++--------------------------
>  9 files changed, 110 insertions(+), 91 deletions(-)
> 
> diff --git a/include/block/block.h b/include/block/block.h
> index b5289f7..8641149 100644
> --- a/include/block/block.h
> +++ b/include/block/block.h
> @@ -422,8 +422,8 @@ int64_t bdrv_get_block_status_above(BlockDriverState *bs,
>                                      int64_t sector_num,
>                                      int nb_sectors, int *pnum,
>                                      BlockDriverState **file);
> -int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
> -                      int *pnum);
> +int bdrv_is_allocated(BlockDriverState *bs, int64_t offset, int64_t bytes,
> +                      int64_t *pnum);
>  int bdrv_is_allocated_above(BlockDriverState *top, BlockDriverState *base,
>                              int64_t sector_num, int nb_sectors, int *pnum);
> 
> diff --git a/block/backup.c b/block/backup.c
> index 2a51e8b..63ca208 100644
> --- a/block/backup.c
> +++ b/block/backup.c
> @@ -47,12 +47,6 @@ typedef struct BackupBlockJob {
>      QLIST_HEAD(, CowRequest) inflight_reqs;
>  } BackupBlockJob;
> 
> -/* Size of a cluster in sectors, instead of bytes. */
> -static inline int64_t cluster_size_sectors(BackupBlockJob *job)
> -{
> -  return job->cluster_size / BDRV_SECTOR_SIZE;
> -}
> -
>  /* See if in-flight requests overlap and wait for them to complete */
>  static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job,
>                                                         int64_t start,
> @@ -433,7 +427,6 @@ static void coroutine_fn backup_run(void *opaque)
>      BackupCompleteData *data;
>      BlockDriverState *bs = blk_bs(job->common.blk);
>      int64_t offset;
> -    int64_t sectors_per_cluster = cluster_size_sectors(job);
>      int ret = 0;
> 
>      QLIST_INIT(&job->inflight_reqs);
> @@ -465,12 +458,13 @@ static void coroutine_fn backup_run(void *opaque)
>              }
> 
>              if (job->sync_mode == MIRROR_SYNC_MODE_TOP) {
> -                int i, n;
> +                int i;
> +                int64_t n;
> 
>                  /* Check to see if these blocks are already in the
>                   * backing file. */
> 
> -                for (i = 0; i < sectors_per_cluster;) {
> +                for (i = 0; i < job->cluster_size;) {
>                      /* bdrv_is_allocated() only returns true/false based
>                       * on the first set of sectors it comes across that
>                       * are are all in the same state.
> @@ -478,9 +472,8 @@ static void coroutine_fn backup_run(void *opaque)
>                       * backup cluster length.  We end up copying more than
>                       * needed but at some point that is always the case. */
>                      alloced =
> -                        bdrv_is_allocated(bs,
> -                                          (offset >> BDRV_SECTOR_BITS) + i,
> -                                          sectors_per_cluster - i, &n);
> +                        bdrv_is_allocated(bs, offset + i,
> +                                          job->cluster_size - i, &n);
>                      i += n;
> 
>                      if (alloced || n == 0) {
> diff --git a/block/commit.c b/block/commit.c
> index b7d3e60..4d6bb2a 100644
> --- a/block/commit.c
> +++ b/block/commit.c
> @@ -429,7 +429,7 @@ fail:
>  }
> 
> 
> -#define COMMIT_BUF_SECTORS 2048
> +#define COMMIT_BUF_SIZE (2048 * BDRV_SECTOR_SIZE)
> 
>  /* commit COW file into the raw image */
>  int bdrv_commit(BlockDriverState *bs)
> @@ -438,8 +438,9 @@ int bdrv_commit(BlockDriverState *bs)
>      BlockDriverState *backing_file_bs = NULL;
>      BlockDriverState *commit_top_bs = NULL;
>      BlockDriver *drv = bs->drv;
> -    int64_t sector, total_sectors, length, backing_length;
> -    int n, ro, open_flags;
> +    int64_t offset, length, backing_length;
> +    int ro, open_flags;
> +    int64_t n;
>      int ret = 0;
>      uint8_t *buf = NULL;
>      Error *local_err = NULL;
> @@ -517,30 +518,26 @@ int bdrv_commit(BlockDriverState *bs)
>          }
>      }
> 
> -    total_sectors = length >> BDRV_SECTOR_BITS;
> -
>      /* blk_try_blockalign() for src will choose an alignment that works for
>       * backing as well, so no need to compare the alignment manually. */
> -    buf = blk_try_blockalign(src, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
> +    buf = blk_try_blockalign(src, COMMIT_BUF_SIZE);
>      if (buf == NULL) {
>          ret = -ENOMEM;
>          goto ro_cleanup;
>      }
> 
> -    for (sector = 0; sector < total_sectors; sector += n) {
> -        ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
> +    for (offset = 0; offset < length; offset += n) {
> +        ret = bdrv_is_allocated(bs, offset, COMMIT_BUF_SIZE, &n);
>          if (ret < 0) {
>              goto ro_cleanup;
>          }
>          if (ret) {
> -            ret = blk_pread(src, sector * BDRV_SECTOR_SIZE, buf,
> -                            n * BDRV_SECTOR_SIZE);
> +            ret = blk_pread(src, offset, buf, n);
>              if (ret < 0) {
>                  goto ro_cleanup;
>              }
> 
> -            ret = blk_pwrite(backing, sector * BDRV_SECTOR_SIZE, buf,
> -                             n * BDRV_SECTOR_SIZE, 0);
> +            ret = blk_pwrite(backing, offset, buf, n, 0);
>              if (ret < 0) {
>                  goto ro_cleanup;
>              }
> diff --git a/block/io.c b/block/io.c
> index 8706bfa..438a493 100644
> --- a/block/io.c
> +++ b/block/io.c
> @@ -1054,14 +1054,15 @@ static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
>          int64_t start_sector = offset >> BDRV_SECTOR_BITS;
>          int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
>          unsigned int nb_sectors = end_sector - start_sector;
> -        int pnum;
> +        int64_t pnum;
> 
> -        ret = bdrv_is_allocated(bs, start_sector, nb_sectors, &pnum);
> +        ret = bdrv_is_allocated(bs, start_sector << BDRV_SECTOR_BITS,
> +                                nb_sectors << BDRV_SECTOR_BITS, &pnum);
>          if (ret < 0) {
>              goto out;
>          }
> 
> -        if (!ret || pnum != nb_sectors) {
> +        if (!ret || pnum != nb_sectors << BDRV_SECTOR_BITS) {
>              ret = bdrv_co_do_copy_on_readv(child, offset, bytes, qiov);
>              goto out;
>          }
> @@ -1903,15 +1904,26 @@ int64_t bdrv_get_block_status(BlockDriverState *bs,
>                                         sector_num, nb_sectors, pnum, file);
>  }
> 
> -int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
> -                                   int nb_sectors, int *pnum)
> +int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset,
> +                                   int64_t bytes, int64_t *pnum)
>  {
>      BlockDriverState *file;
> -    int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum,
> -                                        &file);
> +    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
> +    int nb_sectors = bytes >> BDRV_SECTOR_BITS;
> +    int64_t ret;
> +    int psectors;
> +
> +    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
> +    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE) &&
> +           bytes < INT_MAX * BDRV_SECTOR_SIZE);
> +    ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &psectors,
> +                                &file);
>      if (ret < 0) {
>          return ret;
>      }
> +    if (pnum) {
> +        *pnum = psectors * BDRV_SECTOR_SIZE;
> +    }
>      return !!(ret & BDRV_BLOCK_ALLOCATED);
>  }
> 
> @@ -1920,7 +1932,8 @@ int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
>   *
>   * Return true if the given sector is allocated in any image between
>   * BASE and TOP (inclusive).  BASE can be NULL to check if the given
> - * sector is allocated in any image of the chain.  Return false otherwise.
> + * sector is allocated in any image of the chain.  Return false otherwise,
> + * or negative errno on failure.
>   *
>   * 'pnum' is set to the number of sectors (including and immediately following
>   *  the specified sector) that are known to be in the same
> @@ -1937,13 +1950,19 @@ int bdrv_is_allocated_above(BlockDriverState *top,
> 
>      intermediate = top;
>      while (intermediate && intermediate != base) {
> -        int pnum_inter;
> -        ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
> +        int64_t pnum_inter;
> +        int psectors_inter;
> +
> +        ret = bdrv_is_allocated(intermediate, sector_num * BDRV_SECTOR_SIZE,
> +                                nb_sectors * BDRV_SECTOR_SIZE,
>                                  &pnum_inter);
>          if (ret < 0) {
>              return ret;
> -        } else if (ret) {
> -            *pnum = pnum_inter;
> +        }
> +        assert(pnum_inter < INT_MAX * BDRV_SECTOR_SIZE);
> +        psectors_inter = pnum_inter >> BDRV_SECTOR_BITS;
> +        if (ret) {
> +            *pnum = psectors_inter;
>              return 1;
>          }
> 
> @@ -1953,10 +1972,10 @@ int bdrv_is_allocated_above(BlockDriverState *top,
>           *
>           * [sector_num+x, nr_sectors] allocated.
>           */
> -        if (n > pnum_inter &&
> +        if (n > psectors_inter &&
>              (intermediate == top ||
> -             sector_num + pnum_inter < intermediate->total_sectors)) {
> -            n = pnum_inter;
> +             sector_num + psectors_inter < intermediate->total_sectors)) {
> +            n = psectors_inter;
>          }
> 
>          intermediate = backing_bs(intermediate);
> diff --git a/block/stream.c b/block/stream.c
> index 3ed4fef..85502eb 100644
> --- a/block/stream.c
> +++ b/block/stream.c
> @@ -138,6 +138,7 @@ static void coroutine_fn stream_run(void *opaque)
> 
>      for ( ; offset < s->common.len; offset += n * BDRV_SECTOR_SIZE) {
>          bool copy;
> +        int64_t count = 0;
> 
>          /* Note that even when no rate limit is applied we need to yield
>           * with no pending I/O here so that bdrv_drain_all() returns.
> @@ -149,8 +150,8 @@ static void coroutine_fn stream_run(void *opaque)
> 
>          copy = false;
> 
> -        ret = bdrv_is_allocated(bs, offset / BDRV_SECTOR_SIZE,
> -                                STREAM_BUFFER_SIZE / BDRV_SECTOR_SIZE, &n);
> +        ret = bdrv_is_allocated(bs, offset, STREAM_BUFFER_SIZE, &count);
> +        n = DIV_ROUND_UP(count, BDRV_SECTOR_SIZE);
>          if (ret == 1) {
>              /* Allocated in the top, no need to copy.  */
>          } else if (ret >= 0) {
> diff --git a/block/vvfat.c b/block/vvfat.c
> index af5153d..bef2056 100644
> --- a/block/vvfat.c
> +++ b/block/vvfat.c
> @@ -1393,24 +1393,27 @@ static int vvfat_read(BlockDriverState *bs, int64_t sector_num,
>  	if (sector_num >= bs->total_sectors)
>  	   return -1;
>  	if (s->qcow) {
> -	    int n;
> +            int64_t n;
>              int ret;
> -            ret = bdrv_is_allocated(s->qcow->bs, sector_num,
> -                                    nb_sectors - i, &n);
> +            ret = bdrv_is_allocated(s->qcow->bs, sector_num * BDRV_SECTOR_SIZE,
> +                                    (nb_sectors - i) * BDRV_SECTOR_SIZE, &n);
>              if (ret < 0) {
>                  return ret;
>              }
>              if (ret) {
> -                DLOG(fprintf(stderr, "sectors %d+%d allocated\n",
> -                             (int)sector_num, n));
> -                if (bdrv_read(s->qcow, sector_num, buf + i * 0x200, n)) {
> +                DLOG(fprintf(stderr, "sectors %" PRId64 "+%" PRId64
> +                             " allocated\n", sector_num,
> +                             n >> BDRV_SECTOR_BITS));
> +                if (bdrv_read(s->qcow, sector_num, buf + i * 0x200,
> +                              n >> BDRV_SECTOR_BITS)) {
>                      return -1;
>                  }
> -                i += n - 1;
> -                sector_num += n - 1;
> +                i += (n >> BDRV_SECTOR_BITS) - 1;
> +                sector_num += (n >> BDRV_SECTOR_BITS) - 1;
>                  continue;
>              }
> -DLOG(fprintf(stderr, "sector %d not allocated\n", (int)sector_num));
> +            DLOG(fprintf(stderr, "sector %" PRId64 " not allocated\n",
> +                         sector_num));
>  	}
>  	if(sector_num<s->faked_sectors) {
>  	    if(sector_num<s->first_sectors_number)
> @@ -1678,7 +1681,7 @@ static inline bool cluster_was_modified(BDRVVVFATState *s,
>                                          uint32_t cluster_num)
>  {
>      int was_modified = 0;
> -    int i, dummy;
> +    int i;
> 
>      if (s->qcow == NULL) {
>          return 0;
> @@ -1686,8 +1689,9 @@ static inline bool cluster_was_modified(BDRVVVFATState *s,
> 
>      for (i = 0; !was_modified && i < s->sectors_per_cluster; i++) {
>          was_modified = bdrv_is_allocated(s->qcow->bs,
> -                                         cluster2sector(s, cluster_num) + i,
> -                                         1, &dummy);
> +                                         (cluster2sector(s, cluster_num) +

cluster2sector2bytes2bits2atoms()

> +                                          i) * BDRV_SECTOR_SIZE,
> +                                         BDRV_SECTOR_SIZE, NULL);
>      }
> 
>      /*
> @@ -1834,7 +1838,7 @@ static uint32_t get_cluster_count_for_direntry(BDRVVVFATState* s,
>  	    }
> 
>  	    if (copy_it) {
> -		int i, dummy;
> +                int i;
>  		/*
>  		 * This is horribly inefficient, but that is okay, since
>  		 * it is rarely executed, if at all.
> @@ -1845,7 +1849,9 @@ static uint32_t get_cluster_count_for_direntry(BDRVVVFATState* s,
>                  for (i = 0; i < s->sectors_per_cluster; i++) {
>                      int res;
> 
> -                    res = bdrv_is_allocated(s->qcow->bs, offset + i, 1, &dummy);
> +                    res = bdrv_is_allocated(s->qcow->bs,
> +                                            (offset + i) * BDRV_SECTOR_SIZE,
> +                                            BDRV_SECTOR_SIZE, NULL);
>                      if (res < 0) {
>                          return -1;
>                      }
> diff --git a/migration/block.c b/migration/block.c
> index 7734ff7..18d50ff 100644
> --- a/migration/block.c
> +++ b/migration/block.c
> @@ -36,7 +36,7 @@
>  #define BLK_MIG_FLAG_PROGRESS           0x04
>  #define BLK_MIG_FLAG_ZERO_BLOCK         0x08
> 
> -#define MAX_IS_ALLOCATED_SEARCH 65536
> +#define MAX_IS_ALLOCATED_SEARCH (65536 * BDRV_SECTOR_SIZE)
> 
>  #define MAX_INFLIGHT_IO 512
> 
> @@ -272,6 +272,7 @@ static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
>      BlockBackend *bb = bmds->blk;
>      BlkMigBlock *blk;
>      int nr_sectors;
> +    int64_t count;
> 
>      if (bmds->shared_base) {
>          qemu_mutex_lock_iothread();
> @@ -279,9 +280,9 @@ static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
>          /* Skip unallocated sectors; intentionally treats failure as
>           * an allocated sector */
>          while (cur_sector < total_sectors &&
> -               !bdrv_is_allocated(blk_bs(bb), cur_sector,
> -                                  MAX_IS_ALLOCATED_SEARCH, &nr_sectors)) {
> -            cur_sector += nr_sectors;
> +               !bdrv_is_allocated(blk_bs(bb), cur_sector * BDRV_SECTOR_SIZE,
> +                                  MAX_IS_ALLOCATED_SEARCH, &count)) {
> +            cur_sector += DIV_ROUND_UP(count, BDRV_SECTOR_SIZE);

Hm, what's the story here, why are we rounding this up? If, in theory,
we have a partially allocated cluster won't we advance past that?

>          }
>          aio_context_release(blk_get_aio_context(bb));
>          qemu_mutex_unlock_iothread();
> diff --git a/qemu-img.c b/qemu-img.c
> index 37c2894..2f21d69 100644
> --- a/qemu-img.c
> +++ b/qemu-img.c
> @@ -3207,6 +3207,7 @@ static int img_rebase(int argc, char **argv)
>          int64_t new_backing_num_sectors = 0;
>          uint64_t sector;
>          int n;
> +        int64_t count;
>          float local_progress = 0;
> 
>          buf_old = blk_blockalign(blk, IO_BUF_SIZE);
> @@ -3254,12 +3255,14 @@ static int img_rebase(int argc, char **argv)
>              }
> 
>              /* If the cluster is allocated, we don't need to take action */
> -            ret = bdrv_is_allocated(bs, sector, n, &n);
> +            ret = bdrv_is_allocated(bs, sector << BDRV_SECTOR_BITS,
> +                                    n << BDRV_SECTOR_BITS, &count);
>              if (ret < 0) {
>                  error_report("error while reading image metadata: %s",
>                               strerror(-ret));
>                  goto out;
>              }
> +            n = DIV_ROUND_UP(count, BDRV_SECTOR_SIZE);
>              if (ret) {
>                  continue;
>              }
> diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
> index 0ac7457..62278ea 100644
> --- a/qemu-io-cmds.c
> +++ b/qemu-io-cmds.c
> @@ -1760,12 +1760,12 @@ out:
>  static int alloc_f(BlockBackend *blk, int argc, char **argv)
>  {
>      BlockDriverState *bs = blk_bs(blk);
> -    int64_t offset, sector_num, nb_sectors, remaining, bytes;
> +    int64_t offset, start, remaining, bytes;
>      char s1[64];
> -    int num, ret;
> -    int64_t sum_alloc;
> +    int ret;
> +    int64_t num, sum_alloc;
> 
> -    offset = cvtnum(argv[1]);
> +    start = offset = cvtnum(argv[1]);
>      if (offset < 0) {
>          print_cvtnum_err(offset, argv[1]);
>          return 0;
> @@ -1793,32 +1793,30 @@ static int alloc_f(BlockBackend *blk, int argc, char **argv)
>                 bytes);
>          return 0;
>      }
> -    nb_sectors = bytes >> BDRV_SECTOR_BITS;
> 
> -    remaining = nb_sectors;
> +    remaining = bytes;
>      sum_alloc = 0;
> -    sector_num = offset >> 9;
>      while (remaining) {
> -        ret = bdrv_is_allocated(bs, sector_num, remaining, &num);
> +        ret = bdrv_is_allocated(bs, offset, remaining, &num);
>          if (ret < 0) {
>              printf("is_allocated failed: %s\n", strerror(-ret));
>              return 0;
>          }
> -        sector_num += num;
> +        offset += num;
>          remaining -= num;
>          if (ret) {
>              sum_alloc += num;
>          }
>          if (num == 0) {
> -            nb_sectors -= remaining;
> +            bytes -= remaining;
>              remaining = 0;
>          }
>      }
> 
> -    cvtstr(offset, s1, sizeof(s1));
> +    cvtstr(start, s1, sizeof(s1));
> 
>      printf("%"PRId64"/%"PRId64" bytes allocated at offset %s\n",
> -           sum_alloc << BDRV_SECTOR_BITS, nb_sectors << BDRV_SECTOR_BITS, s1);
> +           sum_alloc, bytes, s1);
>      return 0;
>  }
> 
> @@ -1833,14 +1831,15 @@ static const cmdinfo_t alloc_cmd = {
>  };
> 
> 
> -static int map_is_allocated(BlockDriverState *bs, int64_t sector_num,
> -                            int64_t nb_sectors, int64_t *pnum)
> +static int map_is_allocated(BlockDriverState *bs, int64_t offset,
> +                            int64_t bytes, int64_t *pnum)
>  {
> -    int num, num_checked;
> +    int64_t num;
> +    int num_checked;
>      int ret, firstret;
> 
> -    num_checked = MIN(nb_sectors, INT_MAX);
> -    ret = bdrv_is_allocated(bs, sector_num, num_checked, &num);
> +    num_checked = MIN(bytes, BDRV_REQUEST_MAX_BYTES);
> +    ret = bdrv_is_allocated(bs, offset, num_checked, &num);
>      if (ret < 0) {
>          return ret;
>      }
> @@ -1848,12 +1847,12 @@ static int map_is_allocated(BlockDriverState *bs, int64_t sector_num,
>      firstret = ret;
>      *pnum = num;
> 
> -    while (nb_sectors > 0 && ret == firstret) {
> -        sector_num += num;
> -        nb_sectors -= num;
> +    while (bytes > 0 && ret == firstret) {
> +        offset += num;
> +        bytes -= num;
> 
> -        num_checked = MIN(nb_sectors, INT_MAX);
> -        ret = bdrv_is_allocated(bs, sector_num, num_checked, &num);
> +        num_checked = MIN(bytes, BDRV_REQUEST_MAX_BYTES);
> +        ret = bdrv_is_allocated(bs, offset, num_checked, &num);
>          if (ret == firstret && num) {
>              *pnum += num;
>          } else {
> @@ -1867,7 +1866,7 @@ static int map_is_allocated(BlockDriverState *bs, int64_t sector_num,
>  static int map_f(BlockBackend *blk, int argc, char **argv)
>  {
>      int64_t offset;
> -    int64_t nb_sectors, total_sectors;
> +    int64_t bytes, total_sectors;
>      char s1[64];
>      int64_t num;
>      int ret;
> @@ -1881,10 +1880,10 @@ static int map_f(BlockBackend *blk, int argc, char **argv)
>          return 0;
>      }
> 
> -    nb_sectors = total_sectors;
> +    bytes = total_sectors * BDRV_SECTOR_SIZE;
> 
>      do {
> -        ret = map_is_allocated(blk_bs(blk), offset, nb_sectors, &num);
> +        ret = map_is_allocated(blk_bs(blk), offset, bytes, &num);
>          if (ret < 0) {
>              error_report("Failed to get allocation status: %s", strerror(-ret));
>              return 0;
> @@ -1894,13 +1893,13 @@ static int map_f(BlockBackend *blk, int argc, char **argv)
>          }
> 
>          retstr = ret ? "    allocated" : "not allocated";
> -        cvtstr(offset << 9ULL, s1, sizeof(s1));
> +        cvtstr(offset, s1, sizeof(s1));
>          printf("[% 24" PRId64 "] % 16" PRId64 " bytes %s at offset %s (%d)\n",
> -               offset << 9ULL, num << 9ULL, retstr, s1, ret);
> +               offset, num, retstr, s1, ret);
> 
>          offset += num;
> -        nb_sectors -= num;
> -    } while (offset < total_sectors);
> +        bytes -= num;
> +    } while (offset < total_sectors * BDRV_SECTOR_SIZE);
> 
>      return 0;
>  }
>
Eric Blake April 19, 2017, 5:54 p.m. UTC | #2
On 04/18/2017 05:15 PM, John Snow wrote:

>> @@ -279,9 +280,9 @@ static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
>>          /* Skip unallocated sectors; intentionally treats failure as
>>           * an allocated sector */
>>          while (cur_sector < total_sectors &&
>> -               !bdrv_is_allocated(blk_bs(bb), cur_sector,
>> -                                  MAX_IS_ALLOCATED_SEARCH, &nr_sectors)) {
>> -            cur_sector += nr_sectors;
>> +               !bdrv_is_allocated(blk_bs(bb), cur_sector * BDRV_SECTOR_SIZE,
>> +                                  MAX_IS_ALLOCATED_SEARCH, &count)) {
>> +            cur_sector += DIV_ROUND_UP(count, BDRV_SECTOR_SIZE);
> 
> Hm, what's the story here, why are we rounding this up? If, in theory,
> we have a partially allocated cluster won't we advance past that?

This is rounding to sectors, not clusters (that is, the code is supposed
to advance cur_sector identically pre- and post-patch).  As to whether
the overall algorithm makes sense, or could use some tweaking by
converting migration/block.c to do everything by bytes instead of by
sectors, I haven't yet given that any serious time.
John Snow April 19, 2017, 7:37 p.m. UTC | #3
On 04/19/2017 01:54 PM, Eric Blake wrote:
> On 04/18/2017 05:15 PM, John Snow wrote:
> 
>>> @@ -279,9 +280,9 @@ static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
>>>          /* Skip unallocated sectors; intentionally treats failure as
>>>           * an allocated sector */
>>>          while (cur_sector < total_sectors &&
>>> -               !bdrv_is_allocated(blk_bs(bb), cur_sector,
>>> -                                  MAX_IS_ALLOCATED_SEARCH, &nr_sectors)) {
>>> -            cur_sector += nr_sectors;
>>> +               !bdrv_is_allocated(blk_bs(bb), cur_sector * BDRV_SECTOR_SIZE,
>>> +                                  MAX_IS_ALLOCATED_SEARCH, &count)) {
>>> +            cur_sector += DIV_ROUND_UP(count, BDRV_SECTOR_SIZE);
>>
>> Hm, what's the story here, why are we rounding this up? If, in theory,
>> we have a partially allocated cluster won't we advance past that?
> 
> This is rounding to sectors, not clusters (that is, the code is supposed
> to advance cur_sector identically pre- and post-patch).  As to whether
> the overall algorithm makes sense, or could use some tweaking by
> converting migration/block.c to do everything by bytes instead of by
> sectors, I haven't yet given that any serious time.
> 

Err, right... temporary brain schism. DIV_ROUND_UP not ALIGN_UP, my mistake.

>
John Snow April 19, 2017, 8:32 p.m. UTC | #4
On 04/11/2017 06:29 PM, Eric Blake wrote:
> We are gradually moving away from sector-based interfaces, towards
> byte-based.  In the common case, allocation is unlikely to ever use
> values that are not naturally sector-aligned, but it is possible
> that byte-based values will let us be more precise about allocation
> at the end of an unaligned file that can do byte-based access.
> 
> Changing the signature of the function to use int64_t *pnum ensures
> that the compiler enforces that all callers are updated.  For now,
> the io.c layer still assert()s that all callers are sector-aligned,
> but that can be relaxed when a later patch implements byte-based
> block status.  Therefore, for the most part this patch is just the
> addition of scaling at the callers followed by inverse scaling at
> bdrv_is_allocated().  But some code, particularly bdrv_commit(),
> gets a lot simpler because it no longer has to mess with sectors;
> also, it is now possible to pass NULL if the caller does not care
> how much of the image is allocated beyond the initial offset.
> 
> For ease of review, bdrv_is_allocated_above() will be tackled
> separately.
> 
> Signed-off-by: Eric Blake <eblake@redhat.com>
> ---
>  include/block/block.h |  4 ++--
>  block/backup.c        | 17 +++++----------
>  block/commit.c        | 21 ++++++++-----------
>  block/io.c            | 49 +++++++++++++++++++++++++++++--------------
>  block/stream.c        |  5 +++--
>  block/vvfat.c         | 34 +++++++++++++++++-------------
>  migration/block.c     |  9 ++++----
>  qemu-img.c            |  5 ++++-
>  qemu-io-cmds.c        | 57 +++++++++++++++++++++++++--------------------------
>  9 files changed, 110 insertions(+), 91 deletions(-)
> 
> diff --git a/include/block/block.h b/include/block/block.h
> index b5289f7..8641149 100644
> --- a/include/block/block.h
> +++ b/include/block/block.h
> @@ -422,8 +422,8 @@ int64_t bdrv_get_block_status_above(BlockDriverState *bs,
>                                      int64_t sector_num,
>                                      int nb_sectors, int *pnum,
>                                      BlockDriverState **file);
> -int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
> -                      int *pnum);
> +int bdrv_is_allocated(BlockDriverState *bs, int64_t offset, int64_t bytes,
> +                      int64_t *pnum);
>  int bdrv_is_allocated_above(BlockDriverState *top, BlockDriverState *base,
>                              int64_t sector_num, int nb_sectors, int *pnum);
> 
> diff --git a/block/backup.c b/block/backup.c
> index 2a51e8b..63ca208 100644
> --- a/block/backup.c
> +++ b/block/backup.c
> @@ -47,12 +47,6 @@ typedef struct BackupBlockJob {
>      QLIST_HEAD(, CowRequest) inflight_reqs;
>  } BackupBlockJob;
> 
> -/* Size of a cluster in sectors, instead of bytes. */
> -static inline int64_t cluster_size_sectors(BackupBlockJob *job)
> -{
> -  return job->cluster_size / BDRV_SECTOR_SIZE;
> -}
> -
>  /* See if in-flight requests overlap and wait for them to complete */
>  static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job,
>                                                         int64_t start,
> @@ -433,7 +427,6 @@ static void coroutine_fn backup_run(void *opaque)
>      BackupCompleteData *data;
>      BlockDriverState *bs = blk_bs(job->common.blk);
>      int64_t offset;
> -    int64_t sectors_per_cluster = cluster_size_sectors(job);
>      int ret = 0;
> 
>      QLIST_INIT(&job->inflight_reqs);
> @@ -465,12 +458,13 @@ static void coroutine_fn backup_run(void *opaque)
>              }
> 
>              if (job->sync_mode == MIRROR_SYNC_MODE_TOP) {
> -                int i, n;
> +                int i;
> +                int64_t n;
> 
>                  /* Check to see if these blocks are already in the
>                   * backing file. */
> 
> -                for (i = 0; i < sectors_per_cluster;) {
> +                for (i = 0; i < job->cluster_size;) {
>                      /* bdrv_is_allocated() only returns true/false based
>                       * on the first set of sectors it comes across that
>                       * are are all in the same state.
> @@ -478,9 +472,8 @@ static void coroutine_fn backup_run(void *opaque)
>                       * backup cluster length.  We end up copying more than
>                       * needed but at some point that is always the case. */
>                      alloced =
> -                        bdrv_is_allocated(bs,
> -                                          (offset >> BDRV_SECTOR_BITS) + i,
> -                                          sectors_per_cluster - i, &n);
> +                        bdrv_is_allocated(bs, offset + i,
> +                                          job->cluster_size - i, &n);
>                      i += n;
> 
>                      if (alloced || n == 0) {
> diff --git a/block/commit.c b/block/commit.c
> index b7d3e60..4d6bb2a 100644
> --- a/block/commit.c
> +++ b/block/commit.c
> @@ -429,7 +429,7 @@ fail:
>  }
> 
> 
> -#define COMMIT_BUF_SECTORS 2048
> +#define COMMIT_BUF_SIZE (2048 * BDRV_SECTOR_SIZE)
> 
>  /* commit COW file into the raw image */
>  int bdrv_commit(BlockDriverState *bs)
> @@ -438,8 +438,9 @@ int bdrv_commit(BlockDriverState *bs)
>      BlockDriverState *backing_file_bs = NULL;
>      BlockDriverState *commit_top_bs = NULL;
>      BlockDriver *drv = bs->drv;
> -    int64_t sector, total_sectors, length, backing_length;
> -    int n, ro, open_flags;
> +    int64_t offset, length, backing_length;
> +    int ro, open_flags;
> +    int64_t n;
>      int ret = 0;
>      uint8_t *buf = NULL;
>      Error *local_err = NULL;
> @@ -517,30 +518,26 @@ int bdrv_commit(BlockDriverState *bs)
>          }
>      }
> 
> -    total_sectors = length >> BDRV_SECTOR_BITS;
> -
>      /* blk_try_blockalign() for src will choose an alignment that works for
>       * backing as well, so no need to compare the alignment manually. */
> -    buf = blk_try_blockalign(src, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
> +    buf = blk_try_blockalign(src, COMMIT_BUF_SIZE);
>      if (buf == NULL) {
>          ret = -ENOMEM;
>          goto ro_cleanup;
>      }
> 
> -    for (sector = 0; sector < total_sectors; sector += n) {
> -        ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
> +    for (offset = 0; offset < length; offset += n) {
> +        ret = bdrv_is_allocated(bs, offset, COMMIT_BUF_SIZE, &n);
>          if (ret < 0) {
>              goto ro_cleanup;
>          }
>          if (ret) {
> -            ret = blk_pread(src, sector * BDRV_SECTOR_SIZE, buf,
> -                            n * BDRV_SECTOR_SIZE);
> +            ret = blk_pread(src, offset, buf, n);
>              if (ret < 0) {
>                  goto ro_cleanup;
>              }
> 
> -            ret = blk_pwrite(backing, sector * BDRV_SECTOR_SIZE, buf,
> -                             n * BDRV_SECTOR_SIZE, 0);
> +            ret = blk_pwrite(backing, offset, buf, n, 0);
>              if (ret < 0) {
>                  goto ro_cleanup;
>              }
> diff --git a/block/io.c b/block/io.c
> index 8706bfa..438a493 100644
> --- a/block/io.c
> +++ b/block/io.c
> @@ -1054,14 +1054,15 @@ static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
>          int64_t start_sector = offset >> BDRV_SECTOR_BITS;
>          int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
>          unsigned int nb_sectors = end_sector - start_sector;
> -        int pnum;
> +        int64_t pnum;
> 
> -        ret = bdrv_is_allocated(bs, start_sector, nb_sectors, &pnum);
> +        ret = bdrv_is_allocated(bs, start_sector << BDRV_SECTOR_BITS,
> +                                nb_sectors << BDRV_SECTOR_BITS, &pnum);
>          if (ret < 0) {
>              goto out;
>          }
> 
> -        if (!ret || pnum != nb_sectors) {
> +        if (!ret || pnum != nb_sectors << BDRV_SECTOR_BITS) {
>              ret = bdrv_co_do_copy_on_readv(child, offset, bytes, qiov);
>              goto out;
>          }
> @@ -1903,15 +1904,26 @@ int64_t bdrv_get_block_status(BlockDriverState *bs,
>                                         sector_num, nb_sectors, pnum, file);
>  }
> 
> -int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
> -                                   int nb_sectors, int *pnum)
> +int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset,
> +                                   int64_t bytes, int64_t *pnum)
>  {
>      BlockDriverState *file;
> -    int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum,
> -                                        &file);
> +    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
> +    int nb_sectors = bytes >> BDRV_SECTOR_BITS;
> +    int64_t ret;
> +    int psectors;
> +
> +    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
> +    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE) &&
> +           bytes < INT_MAX * BDRV_SECTOR_SIZE);
> +    ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &psectors,
> +                                &file);
>      if (ret < 0) {
>          return ret;
>      }
> +    if (pnum) {
> +        *pnum = psectors * BDRV_SECTOR_SIZE;
> +    }
>      return !!(ret & BDRV_BLOCK_ALLOCATED);
>  }
> 
> @@ -1920,7 +1932,8 @@ int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
>   *
>   * Return true if the given sector is allocated in any image between
>   * BASE and TOP (inclusive).  BASE can be NULL to check if the given
> - * sector is allocated in any image of the chain.  Return false otherwise.
> + * sector is allocated in any image of the chain.  Return false otherwise,
> + * or negative errno on failure.
>   *
>   * 'pnum' is set to the number of sectors (including and immediately following
>   *  the specified sector) that are known to be in the same
> @@ -1937,13 +1950,19 @@ int bdrv_is_allocated_above(BlockDriverState *top,
> 
>      intermediate = top;
>      while (intermediate && intermediate != base) {
> -        int pnum_inter;
> -        ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
> +        int64_t pnum_inter;
> +        int psectors_inter;
> +
> +        ret = bdrv_is_allocated(intermediate, sector_num * BDRV_SECTOR_SIZE,
> +                                nb_sectors * BDRV_SECTOR_SIZE,
>                                  &pnum_inter);
>          if (ret < 0) {
>              return ret;
> -        } else if (ret) {
> -            *pnum = pnum_inter;
> +        }
> +        assert(pnum_inter < INT_MAX * BDRV_SECTOR_SIZE);
> +        psectors_inter = pnum_inter >> BDRV_SECTOR_BITS;
> +        if (ret) {
> +            *pnum = psectors_inter;
>              return 1;
>          }
> 
> @@ -1953,10 +1972,10 @@ int bdrv_is_allocated_above(BlockDriverState *top,
>           *
>           * [sector_num+x, nr_sectors] allocated.
>           */
> -        if (n > pnum_inter &&
> +        if (n > psectors_inter &&
>              (intermediate == top ||
> -             sector_num + pnum_inter < intermediate->total_sectors)) {
> -            n = pnum_inter;
> +             sector_num + psectors_inter < intermediate->total_sectors)) {
> +            n = psectors_inter;
>          }
> 
>          intermediate = backing_bs(intermediate);
> diff --git a/block/stream.c b/block/stream.c
> index 3ed4fef..85502eb 100644
> --- a/block/stream.c
> +++ b/block/stream.c
> @@ -138,6 +138,7 @@ static void coroutine_fn stream_run(void *opaque)
> 
>      for ( ; offset < s->common.len; offset += n * BDRV_SECTOR_SIZE) {
>          bool copy;
> +        int64_t count = 0;
> 
>          /* Note that even when no rate limit is applied we need to yield
>           * with no pending I/O here so that bdrv_drain_all() returns.
> @@ -149,8 +150,8 @@ static void coroutine_fn stream_run(void *opaque)
> 
>          copy = false;
> 
> -        ret = bdrv_is_allocated(bs, offset / BDRV_SECTOR_SIZE,
> -                                STREAM_BUFFER_SIZE / BDRV_SECTOR_SIZE, &n);
> +        ret = bdrv_is_allocated(bs, offset, STREAM_BUFFER_SIZE, &count);
> +        n = DIV_ROUND_UP(count, BDRV_SECTOR_SIZE);
>          if (ret == 1) {
>              /* Allocated in the top, no need to copy.  */
>          } else if (ret >= 0) {
> diff --git a/block/vvfat.c b/block/vvfat.c
> index af5153d..bef2056 100644
> --- a/block/vvfat.c
> +++ b/block/vvfat.c
> @@ -1393,24 +1393,27 @@ static int vvfat_read(BlockDriverState *bs, int64_t sector_num,
>  	if (sector_num >= bs->total_sectors)
>  	   return -1;
>  	if (s->qcow) {
> -	    int n;
> +            int64_t n;
>              int ret;
> -            ret = bdrv_is_allocated(s->qcow->bs, sector_num,
> -                                    nb_sectors - i, &n);
> +            ret = bdrv_is_allocated(s->qcow->bs, sector_num * BDRV_SECTOR_SIZE,
> +                                    (nb_sectors - i) * BDRV_SECTOR_SIZE, &n);
>              if (ret < 0) {
>                  return ret;
>              }
>              if (ret) {
> -                DLOG(fprintf(stderr, "sectors %d+%d allocated\n",
> -                             (int)sector_num, n));
> -                if (bdrv_read(s->qcow, sector_num, buf + i * 0x200, n)) {
> +                DLOG(fprintf(stderr, "sectors %" PRId64 "+%" PRId64
> +                             " allocated\n", sector_num,
> +                             n >> BDRV_SECTOR_BITS));
> +                if (bdrv_read(s->qcow, sector_num, buf + i * 0x200,
> +                              n >> BDRV_SECTOR_BITS)) {
>                      return -1;
>                  }
> -                i += n - 1;
> -                sector_num += n - 1;
> +                i += (n >> BDRV_SECTOR_BITS) - 1;
> +                sector_num += (n >> BDRV_SECTOR_BITS) - 1;
>                  continue;
>              }
> -DLOG(fprintf(stderr, "sector %d not allocated\n", (int)sector_num));
> +            DLOG(fprintf(stderr, "sector %" PRId64 " not allocated\n",
> +                         sector_num));
>  	}
>  	if(sector_num<s->faked_sectors) {
>  	    if(sector_num<s->first_sectors_number)
> @@ -1678,7 +1681,7 @@ static inline bool cluster_was_modified(BDRVVVFATState *s,
>                                          uint32_t cluster_num)
>  {
>      int was_modified = 0;
> -    int i, dummy;
> +    int i;
> 
>      if (s->qcow == NULL) {
>          return 0;
> @@ -1686,8 +1689,9 @@ static inline bool cluster_was_modified(BDRVVVFATState *s,
> 
>      for (i = 0; !was_modified && i < s->sectors_per_cluster; i++) {
>          was_modified = bdrv_is_allocated(s->qcow->bs,
> -                                         cluster2sector(s, cluster_num) + i,
> -                                         1, &dummy);
> +                                         (cluster2sector(s, cluster_num) +
> +                                          i) * BDRV_SECTOR_SIZE,
> +                                         BDRV_SECTOR_SIZE, NULL);
>      }
> 
>      /*
> @@ -1834,7 +1838,7 @@ static uint32_t get_cluster_count_for_direntry(BDRVVVFATState* s,
>  	    }
> 
>  	    if (copy_it) {
> -		int i, dummy;
> +                int i;
>  		/*
>  		 * This is horribly inefficient, but that is okay, since
>  		 * it is rarely executed, if at all.
> @@ -1845,7 +1849,9 @@ static uint32_t get_cluster_count_for_direntry(BDRVVVFATState* s,
>                  for (i = 0; i < s->sectors_per_cluster; i++) {
>                      int res;
> 
> -                    res = bdrv_is_allocated(s->qcow->bs, offset + i, 1, &dummy);
> +                    res = bdrv_is_allocated(s->qcow->bs,
> +                                            (offset + i) * BDRV_SECTOR_SIZE,
> +                                            BDRV_SECTOR_SIZE, NULL);
>                      if (res < 0) {
>                          return -1;
>                      }
> diff --git a/migration/block.c b/migration/block.c
> index 7734ff7..18d50ff 100644
> --- a/migration/block.c
> +++ b/migration/block.c
> @@ -36,7 +36,7 @@
>  #define BLK_MIG_FLAG_PROGRESS           0x04
>  #define BLK_MIG_FLAG_ZERO_BLOCK         0x08
> 
> -#define MAX_IS_ALLOCATED_SEARCH 65536
> +#define MAX_IS_ALLOCATED_SEARCH (65536 * BDRV_SECTOR_SIZE)
> 
>  #define MAX_INFLIGHT_IO 512
> 
> @@ -272,6 +272,7 @@ static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
>      BlockBackend *bb = bmds->blk;
>      BlkMigBlock *blk;
>      int nr_sectors;
> +    int64_t count;
> 
>      if (bmds->shared_base) {
>          qemu_mutex_lock_iothread();
> @@ -279,9 +280,9 @@ static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
>          /* Skip unallocated sectors; intentionally treats failure as
>           * an allocated sector */
>          while (cur_sector < total_sectors &&
> -               !bdrv_is_allocated(blk_bs(bb), cur_sector,
> -                                  MAX_IS_ALLOCATED_SEARCH, &nr_sectors)) {
> -            cur_sector += nr_sectors;
> +               !bdrv_is_allocated(blk_bs(bb), cur_sector * BDRV_SECTOR_SIZE,
> +                                  MAX_IS_ALLOCATED_SEARCH, &count)) {
> +            cur_sector += DIV_ROUND_UP(count, BDRV_SECTOR_SIZE);
>          }

Let's try asking again with the right vocabulary:

OK, so I guess the idea is that we definitely shouldn't ever have a
partially allocated sector; so I suppose this ROUND_UP is just a precaution?

What would it mean if this actually DID round up? Would we miss
transmitting a fraction of a sector because we assumed it was unallocated?

In other places, you scale down with X / BDRV_SECTOR_SIZE or an
equivalent bitshift, why does this receive a round *up* treatment?

Are we considering a future in which this function might actually give
us some unaligned answers? Do we need to re-audit all of the callers at
that point to make sure they cope appropriately?

--js

>          aio_context_release(blk_get_aio_context(bb));
>          qemu_mutex_unlock_iothread();
> diff --git a/qemu-img.c b/qemu-img.c
> index 37c2894..2f21d69 100644
> --- a/qemu-img.c
> +++ b/qemu-img.c
> @@ -3207,6 +3207,7 @@ static int img_rebase(int argc, char **argv)
>          int64_t new_backing_num_sectors = 0;
>          uint64_t sector;
>          int n;
> +        int64_t count;
>          float local_progress = 0;
> 
>          buf_old = blk_blockalign(blk, IO_BUF_SIZE);
> @@ -3254,12 +3255,14 @@ static int img_rebase(int argc, char **argv)
>              }
> 
>              /* If the cluster is allocated, we don't need to take action */
> -            ret = bdrv_is_allocated(bs, sector, n, &n);
> +            ret = bdrv_is_allocated(bs, sector << BDRV_SECTOR_BITS,
> +                                    n << BDRV_SECTOR_BITS, &count);
>              if (ret < 0) {
>                  error_report("error while reading image metadata: %s",
>                               strerror(-ret));
>                  goto out;
>              }
> +            n = DIV_ROUND_UP(count, BDRV_SECTOR_SIZE);
>              if (ret) {
>                  continue;
>              }
> diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
> index 0ac7457..62278ea 100644
> --- a/qemu-io-cmds.c
> +++ b/qemu-io-cmds.c
> @@ -1760,12 +1760,12 @@ out:
>  static int alloc_f(BlockBackend *blk, int argc, char **argv)
>  {
>      BlockDriverState *bs = blk_bs(blk);
> -    int64_t offset, sector_num, nb_sectors, remaining, bytes;
> +    int64_t offset, start, remaining, bytes;
>      char s1[64];
> -    int num, ret;
> -    int64_t sum_alloc;
> +    int ret;
> +    int64_t num, sum_alloc;
> 
> -    offset = cvtnum(argv[1]);
> +    start = offset = cvtnum(argv[1]);
>      if (offset < 0) {
>          print_cvtnum_err(offset, argv[1]);
>          return 0;
> @@ -1793,32 +1793,30 @@ static int alloc_f(BlockBackend *blk, int argc, char **argv)
>                 bytes);
>          return 0;
>      }
> -    nb_sectors = bytes >> BDRV_SECTOR_BITS;
> 
> -    remaining = nb_sectors;
> +    remaining = bytes;
>      sum_alloc = 0;
> -    sector_num = offset >> 9;
>      while (remaining) {
> -        ret = bdrv_is_allocated(bs, sector_num, remaining, &num);
> +        ret = bdrv_is_allocated(bs, offset, remaining, &num);
>          if (ret < 0) {
>              printf("is_allocated failed: %s\n", strerror(-ret));
>              return 0;
>          }
> -        sector_num += num;
> +        offset += num;
>          remaining -= num;
>          if (ret) {
>              sum_alloc += num;
>          }
>          if (num == 0) {
> -            nb_sectors -= remaining;
> +            bytes -= remaining;
>              remaining = 0;
>          }
>      }
> 
> -    cvtstr(offset, s1, sizeof(s1));
> +    cvtstr(start, s1, sizeof(s1));
> 
>      printf("%"PRId64"/%"PRId64" bytes allocated at offset %s\n",
> -           sum_alloc << BDRV_SECTOR_BITS, nb_sectors << BDRV_SECTOR_BITS, s1);
> +           sum_alloc, bytes, s1);
>      return 0;
>  }
> 
> @@ -1833,14 +1831,15 @@ static const cmdinfo_t alloc_cmd = {
>  };
> 
> 
> -static int map_is_allocated(BlockDriverState *bs, int64_t sector_num,
> -                            int64_t nb_sectors, int64_t *pnum)
> +static int map_is_allocated(BlockDriverState *bs, int64_t offset,
> +                            int64_t bytes, int64_t *pnum)
>  {
> -    int num, num_checked;
> +    int64_t num;
> +    int num_checked;
>      int ret, firstret;
> 
> -    num_checked = MIN(nb_sectors, INT_MAX);
> -    ret = bdrv_is_allocated(bs, sector_num, num_checked, &num);
> +    num_checked = MIN(bytes, BDRV_REQUEST_MAX_BYTES);
> +    ret = bdrv_is_allocated(bs, offset, num_checked, &num);
>      if (ret < 0) {
>          return ret;
>      }
> @@ -1848,12 +1847,12 @@ static int map_is_allocated(BlockDriverState *bs, int64_t sector_num,
>      firstret = ret;
>      *pnum = num;
> 
> -    while (nb_sectors > 0 && ret == firstret) {
> -        sector_num += num;
> -        nb_sectors -= num;
> +    while (bytes > 0 && ret == firstret) {
> +        offset += num;
> +        bytes -= num;
> 
> -        num_checked = MIN(nb_sectors, INT_MAX);
> -        ret = bdrv_is_allocated(bs, sector_num, num_checked, &num);
> +        num_checked = MIN(bytes, BDRV_REQUEST_MAX_BYTES);
> +        ret = bdrv_is_allocated(bs, offset, num_checked, &num);
>          if (ret == firstret && num) {
>              *pnum += num;
>          } else {
> @@ -1867,7 +1866,7 @@ static int map_is_allocated(BlockDriverState *bs, int64_t sector_num,
>  static int map_f(BlockBackend *blk, int argc, char **argv)
>  {
>      int64_t offset;
> -    int64_t nb_sectors, total_sectors;
> +    int64_t bytes, total_sectors;
>      char s1[64];
>      int64_t num;
>      int ret;
> @@ -1881,10 +1880,10 @@ static int map_f(BlockBackend *blk, int argc, char **argv)
>          return 0;
>      }
> 
> -    nb_sectors = total_sectors;
> +    bytes = total_sectors * BDRV_SECTOR_SIZE;
> 
>      do {
> -        ret = map_is_allocated(blk_bs(blk), offset, nb_sectors, &num);
> +        ret = map_is_allocated(blk_bs(blk), offset, bytes, &num);
>          if (ret < 0) {
>              error_report("Failed to get allocation status: %s", strerror(-ret));
>              return 0;
> @@ -1894,13 +1893,13 @@ static int map_f(BlockBackend *blk, int argc, char **argv)
>          }
> 
>          retstr = ret ? "    allocated" : "not allocated";
> -        cvtstr(offset << 9ULL, s1, sizeof(s1));
> +        cvtstr(offset, s1, sizeof(s1));
>          printf("[% 24" PRId64 "] % 16" PRId64 " bytes %s at offset %s (%d)\n",
> -               offset << 9ULL, num << 9ULL, retstr, s1, ret);
> +               offset, num, retstr, s1, ret);
> 
>          offset += num;
> -        nb_sectors -= num;
> -    } while (offset < total_sectors);
> +        bytes -= num;
> +    } while (offset < total_sectors * BDRV_SECTOR_SIZE);
> 
>      return 0;
>  }
>
Eric Blake April 19, 2017, 9:12 p.m. UTC | #5
On 04/19/2017 03:32 PM, John Snow wrote:

>> @@ -279,9 +280,9 @@ static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
>>          /* Skip unallocated sectors; intentionally treats failure as
>>           * an allocated sector */
>>          while (cur_sector < total_sectors &&
>> -               !bdrv_is_allocated(blk_bs(bb), cur_sector,
>> -                                  MAX_IS_ALLOCATED_SEARCH, &nr_sectors)) {
>> -            cur_sector += nr_sectors;
>> +               !bdrv_is_allocated(blk_bs(bb), cur_sector * BDRV_SECTOR_SIZE,
>> +                                  MAX_IS_ALLOCATED_SEARCH, &count)) {
>> +            cur_sector += DIV_ROUND_UP(count, BDRV_SECTOR_SIZE);
>>          }
> 
> Let's try asking again with the right vocabulary:
> 
> OK, so I guess the idea is that we definitely shouldn't ever have a
> partially allocated sector; so I suppose this ROUND_UP is just a precaution?

For now, yes - but the part 3 series (bdrv_get_block_status) makes
returning a sub-sector value possible.  For most drivers, it will only
happen at end-of-file (as status doesn't change mid-sector unless your
file is not a multiple of a sector size) - in fact, I found that I had
to patch the block layer to round up any sub-sector return that
coincides with end-of-file back to a round sector amount in order to
avoid breaking other code that assumed everything is allocated in sectors.

> 
> What would it mean if this actually DID round up? Would we miss
> transmitting a fraction of a sector because we assumed it was unallocated?

In general, the only mid-sector transition I encountered in testing was
at EOF, where the first half is allocated and the second half (beyond
EOF) was an implicit hole.  But here, you are worried about the opposite
case - can we ever have a hole that ends mid-sector, followed by actual
data in the second half.  Probably not, but I can add an assertion to
the generic block layer that mid-sector returns for an unallocated
answer can only happen at end-of-file.

> 
> In other places, you scale down with X / BDRV_SECTOR_SIZE or an
> equivalent bitshift, why does this receive a round *up* treatment?

Really? I thought I was being careful in this patch about ALWAYS
preserving the same sector value - the only time I did a bitshift was if
I already asserted the answer was aligned to sector boundaries to begin
with, and when I wasn't sure, it should be a round up.

> 
> Are we considering a future in which this function might actually give
> us some unaligned answers? Do we need to re-audit all of the callers at
> that point to make sure they cope appropriately?

Yes, I've been trying to do that all along, but a second set of eyes
never hurts.  Or, as I said, we can play it safe by guaranteeing that
even when byte-based, the block layer enforces sector-aligned answers.
John Snow April 19, 2017, 9:40 p.m. UTC | #6
On 04/19/2017 05:12 PM, Eric Blake wrote:
> On 04/19/2017 03:32 PM, John Snow wrote:
> 
>>> @@ -279,9 +280,9 @@ static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
>>>          /* Skip unallocated sectors; intentionally treats failure as
>>>           * an allocated sector */
>>>          while (cur_sector < total_sectors &&
>>> -               !bdrv_is_allocated(blk_bs(bb), cur_sector,
>>> -                                  MAX_IS_ALLOCATED_SEARCH, &nr_sectors)) {
>>> -            cur_sector += nr_sectors;
>>> +               !bdrv_is_allocated(blk_bs(bb), cur_sector * BDRV_SECTOR_SIZE,
>>> +                                  MAX_IS_ALLOCATED_SEARCH, &count)) {
>>> +            cur_sector += DIV_ROUND_UP(count, BDRV_SECTOR_SIZE);
>>>          }
>>
>> Let's try asking again with the right vocabulary:
>>
>> OK, so I guess the idea is that we definitely shouldn't ever have a
>> partially allocated sector; so I suppose this ROUND_UP is just a precaution?
> 
> For now, yes - but the part 3 series (bdrv_get_block_status) makes
> returning a sub-sector value possible.  For most drivers, it will only
> happen at end-of-file (as status doesn't change mid-sector unless your
> file is not a multiple of a sector size) - in fact, I found that I had
> to patch the block layer to round up and sub-sector return that
> coincides with end-of-file back to a round sector amount in order to
> avoid breaking other code that assumed everything is allocated in sectors.

I'm sorry, I'm having difficulty parsing your last sentence.

> 
>>
>> What would it mean if this actually DID round up? Would we miss
>> transmitting a fraction of a sector because we assumed it was unallocated?
> 
> In general, the only mid-sector transition I encountered in testing was
> at EOF, where the first half is allocated and the second half (beyond
> EOF) was an implicit hole.  But here, you are worried about the opposite
> case - can we ever have a hole that ends mid-sector, followed by actual
> data in the second half.  Probably not, but I can add an assertion to
> the generic block layer that mid-sector returns for an unallocated
> answer can only happen at end-of-file.
> 

Yeah, just curious about the implications from a naive-user point of
view, as it makes the algorithm in the caller look ugly now.

I agree with you that there is absolutely definitely no real problem
here right now, but I do secretly wonder if we'll invent a way to screw
ourselves over.

Presumably it will get converted at SOME point like everything else, and
it won't be a concern anymore.

Presumably Presumably that will even happen before we invent a way to
screw ourselves over.

>>
>> In other places, you scale down with X / BDRV_SECTOR_SIZE or an
>> equivalent bitshift, why does this receive a round *up* treatment?
> 
> Really? I thought I was being careful in this patch about ALWAYS
> preserving the same sector value - the only time I did a bitshift was if
> I already asserted the answer was aligned to sector boundaries to begin
> with, and when I wasn't sure, it should be a round up.
> 

You're probably right as I hadn't been paying perfectly close attention
to exactly how you'd been scaling the values, this is simply the only
one that stood out to me as slightly strange due to the DIV_ROUND_UP
macro so I had assumed it stood out, but maybe I'm misidentifying why.

>>
>> Are we considering a future in which this function might actually give
>> us some unaligned answers? Do we need to re-audit all of the callers at
>> that point to make sure they cope appropriately?
> 
> Yes, I've been trying to do that all along, but a second set of eyes
> never hurts.  Or, as I said, we can play it safe by guaranteeing that
> even when byte-based, the block layer enforces sector-aligned answers.
> 

Just making sure I'm clear that we expect this function to do so;
clearly we're (You're*) removing sector assumptions from as many places
as we (You) can.

Thanks for playing whack-a-mole with stupid questions, I think I'm
on-board now.

--js
Eric Blake May 10, 2017, 10:33 p.m. UTC | #7
On 04/19/2017 04:40 PM, John Snow wrote:
> 
> 
> On 04/19/2017 05:12 PM, Eric Blake wrote:
>> On 04/19/2017 03:32 PM, John Snow wrote:
>>
>>>> @@ -279,9 +280,9 @@ static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
>>>>          /* Skip unallocated sectors; intentionally treats failure as
>>>>           * an allocated sector */
>>>>          while (cur_sector < total_sectors &&
>>>> -               !bdrv_is_allocated(blk_bs(bb), cur_sector,
>>>> -                                  MAX_IS_ALLOCATED_SEARCH, &nr_sectors)) {
>>>> -            cur_sector += nr_sectors;
>>>> +               !bdrv_is_allocated(blk_bs(bb), cur_sector * BDRV_SECTOR_SIZE,
>>>> +                                  MAX_IS_ALLOCATED_SEARCH, &count)) {
>>>> +            cur_sector += DIV_ROUND_UP(count, BDRV_SECTOR_SIZE);
>>>>          }
>>>
>>> Let's try asking again with the right vocabulary:
>>>
>>> OK, so I guess the idea is that we definitely shouldn't ever have a
>>> partially allocated sector; so I suppose this ROUND_UP is just a precaution?
>>
>> For now, yes - but the part 3 series (bdrv_get_block_status) makes
>> returning a sub-sector value possible.  For most drivers, it will only
>> happen at end-of-file (as status doesn't change mid-sector unless your
>> file is not a multiple of a sector size) - in fact, I found that I had
>> to patch the block layer to round up any sub-sector return that
>> coincides with end-of-file back to a round sector amount in order to
>> avoid breaking other code that assumed everything is allocated in sectors.
> 
> I'm sorry, I'm having difficulty parsing your last sentence.

We should NEVER have an allocation smaller than
bds->bl.request_alignment (for example, if you can't write in chunks
smaller than 512, then how can your allocation be smaller than that?).
But when we eventually have byte-granularity support, and for a device
where request_alignment is 1, it is feasible that we could track
mid-sector alignment changes.  In practice, we DO have an
easy-to-observe mid-sector alignment change: on a regular file not
aligned to sector boundaries, where request_alignment IS 1, we get a
mid-sector transition from data to hole at EOF.  So it turned out to be
easier in my later series on bdrv_get_block_status() to intentionally
round a partial sector at end-of-file up as if the entire sector was
allocated, rather than reporting the mid-sector alignment (matching the
fact that bdrv_getlength() rounds up to sector boundaries).

If, someday, we fix bdrv_getlength() to report a non-sector-aligned
value, we'll have to revisit the rounding (again, it feels like
bds->bl.request_alignment is the obvious place to ensure that we never
report a mid-alignment result, but where the alignment size is now
driver-dependent instead of hard-coded to 512).  In fact, even with
non-sector-aligned bdrv_getlength(), we may STILL be able to enforce
rules such as mid-sector transitions from hole->data are not possible
(regardless of request_alignment), and mid-sector transitions from
data->hole are possible only at the end of the file.

> I agree with you that there is absolutely definitely no real problem
> here right now, but I do secretly wonder if we'll invent a way to screw
> ourselves over.
> 
> Presumably it will get converted at SOME point like everything else, and
> it won't be a concern anymore.
> 
> Presumably Presumably that will even happen before we invent a way to
> screw ourselves over.

Here's hoping we are careful enough, and have enough asserts in the
right place so that if we guessed wrong we get a nice assert (rather
than risking inf-looping because we rounded a sub-sector down to 0, or
causing data corruption that doesn't show up until much later where we
rounded up and missed copying something); at the same time, that the
asserts are accurate enough that we can't trip them from legitimate users.

One thing I plan to add to my v2 posting (for the part 3
bdrv_get_block_status series) is to remove the 'qemu-io alloc' sector
alignment restrictions, to have a tool that makes it easier to prove
that I am properly handling rounding at the block layer out to
request_alignment limits without triggering any asserts.  And my recent
blkdebug enhancements are probably going to be helpful in validating
things, even if I have to add more iotests.
diff mbox

Patch

diff --git a/include/block/block.h b/include/block/block.h
index b5289f7..8641149 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -422,8 +422,8 @@  int64_t bdrv_get_block_status_above(BlockDriverState *bs,
                                     int64_t sector_num,
                                     int nb_sectors, int *pnum,
                                     BlockDriverState **file);
-int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
-                      int *pnum);
+int bdrv_is_allocated(BlockDriverState *bs, int64_t offset, int64_t bytes,
+                      int64_t *pnum);
 int bdrv_is_allocated_above(BlockDriverState *top, BlockDriverState *base,
                             int64_t sector_num, int nb_sectors, int *pnum);

diff --git a/block/backup.c b/block/backup.c
index 2a51e8b..63ca208 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -47,12 +47,6 @@  typedef struct BackupBlockJob {
     QLIST_HEAD(, CowRequest) inflight_reqs;
 } BackupBlockJob;

-/* Size of a cluster in sectors, instead of bytes. */
-static inline int64_t cluster_size_sectors(BackupBlockJob *job)
-{
-  return job->cluster_size / BDRV_SECTOR_SIZE;
-}
-
 /* See if in-flight requests overlap and wait for them to complete */
 static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job,
                                                        int64_t start,
@@ -433,7 +427,6 @@  static void coroutine_fn backup_run(void *opaque)
     BackupCompleteData *data;
     BlockDriverState *bs = blk_bs(job->common.blk);
     int64_t offset;
-    int64_t sectors_per_cluster = cluster_size_sectors(job);
     int ret = 0;

     QLIST_INIT(&job->inflight_reqs);
@@ -465,12 +458,13 @@  static void coroutine_fn backup_run(void *opaque)
             }

             if (job->sync_mode == MIRROR_SYNC_MODE_TOP) {
-                int i, n;
+                int i;
+                int64_t n;

                 /* Check to see if these blocks are already in the
                  * backing file. */

-                for (i = 0; i < sectors_per_cluster;) {
+                for (i = 0; i < job->cluster_size;) {
                     /* bdrv_is_allocated() only returns true/false based
                      * on the first set of sectors it comes across that
                      * are are all in the same state.
@@ -478,9 +472,8 @@  static void coroutine_fn backup_run(void *opaque)
                      * backup cluster length.  We end up copying more than
                      * needed but at some point that is always the case. */
                     alloced =
-                        bdrv_is_allocated(bs,
-                                          (offset >> BDRV_SECTOR_BITS) + i,
-                                          sectors_per_cluster - i, &n);
+                        bdrv_is_allocated(bs, offset + i,
+                                          job->cluster_size - i, &n);
                     i += n;

                     if (alloced || n == 0) {
diff --git a/block/commit.c b/block/commit.c
index b7d3e60..4d6bb2a 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -429,7 +429,7 @@  fail:
 }


-#define COMMIT_BUF_SECTORS 2048
+#define COMMIT_BUF_SIZE (2048 * BDRV_SECTOR_SIZE)

 /* commit COW file into the raw image */
 int bdrv_commit(BlockDriverState *bs)
@@ -438,8 +438,9 @@  int bdrv_commit(BlockDriverState *bs)
     BlockDriverState *backing_file_bs = NULL;
     BlockDriverState *commit_top_bs = NULL;
     BlockDriver *drv = bs->drv;
-    int64_t sector, total_sectors, length, backing_length;
-    int n, ro, open_flags;
+    int64_t offset, length, backing_length;
+    int ro, open_flags;
+    int64_t n;
     int ret = 0;
     uint8_t *buf = NULL;
     Error *local_err = NULL;
@@ -517,30 +518,26 @@  int bdrv_commit(BlockDriverState *bs)
         }
     }

-    total_sectors = length >> BDRV_SECTOR_BITS;
-
     /* blk_try_blockalign() for src will choose an alignment that works for
      * backing as well, so no need to compare the alignment manually. */
-    buf = blk_try_blockalign(src, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
+    buf = blk_try_blockalign(src, COMMIT_BUF_SIZE);
     if (buf == NULL) {
         ret = -ENOMEM;
         goto ro_cleanup;
     }

-    for (sector = 0; sector < total_sectors; sector += n) {
-        ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
+    for (offset = 0; offset < length; offset += n) {
+        ret = bdrv_is_allocated(bs, offset, COMMIT_BUF_SIZE, &n);
         if (ret < 0) {
             goto ro_cleanup;
         }
         if (ret) {
-            ret = blk_pread(src, sector * BDRV_SECTOR_SIZE, buf,
-                            n * BDRV_SECTOR_SIZE);
+            ret = blk_pread(src, offset, buf, n);
             if (ret < 0) {
                 goto ro_cleanup;
             }

-            ret = blk_pwrite(backing, sector * BDRV_SECTOR_SIZE, buf,
-                             n * BDRV_SECTOR_SIZE, 0);
+            ret = blk_pwrite(backing, offset, buf, n, 0);
             if (ret < 0) {
                 goto ro_cleanup;
             }
diff --git a/block/io.c b/block/io.c
index 8706bfa..438a493 100644
--- a/block/io.c
+++ b/block/io.c
@@ -1054,14 +1054,15 @@  static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
         int64_t start_sector = offset >> BDRV_SECTOR_BITS;
         int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
         unsigned int nb_sectors = end_sector - start_sector;
-        int pnum;
+        int64_t pnum;

-        ret = bdrv_is_allocated(bs, start_sector, nb_sectors, &pnum);
+        ret = bdrv_is_allocated(bs, start_sector << BDRV_SECTOR_BITS,
+                                nb_sectors << BDRV_SECTOR_BITS, &pnum);
         if (ret < 0) {
             goto out;
         }

-        if (!ret || pnum != nb_sectors) {
+        if (!ret || pnum != nb_sectors << BDRV_SECTOR_BITS) {
             ret = bdrv_co_do_copy_on_readv(child, offset, bytes, qiov);
             goto out;
         }
@@ -1903,15 +1904,26 @@  int64_t bdrv_get_block_status(BlockDriverState *bs,
                                        sector_num, nb_sectors, pnum, file);
 }

-int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
-                                   int nb_sectors, int *pnum)
+int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset,
+                                   int64_t bytes, int64_t *pnum)
 {
     BlockDriverState *file;
-    int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum,
-                                        &file);
+    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
+    int nb_sectors = bytes >> BDRV_SECTOR_BITS;
+    int64_t ret;
+    int psectors;
+
+    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
+    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE) &&
+           bytes < INT_MAX * BDRV_SECTOR_SIZE);
+    ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &psectors,
+                                &file);
     if (ret < 0) {
         return ret;
     }
+    if (pnum) {
+        *pnum = psectors * BDRV_SECTOR_SIZE;
+    }
     return !!(ret & BDRV_BLOCK_ALLOCATED);
 }

@@ -1920,7 +1932,8 @@  int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
  *
  * Return true if the given sector is allocated in any image between
  * BASE and TOP (inclusive).  BASE can be NULL to check if the given
- * sector is allocated in any image of the chain.  Return false otherwise.
+ * sector is allocated in any image of the chain.  Return false otherwise,
+ * or negative errno on failure.
  *
  * 'pnum' is set to the number of sectors (including and immediately following
  *  the specified sector) that are known to be in the same
@@ -1937,13 +1950,19 @@  int bdrv_is_allocated_above(BlockDriverState *top,

     intermediate = top;
     while (intermediate && intermediate != base) {
-        int pnum_inter;
-        ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
+        int64_t pnum_inter;
+        int psectors_inter;
+
+        ret = bdrv_is_allocated(intermediate, sector_num * BDRV_SECTOR_SIZE,
+                                nb_sectors * BDRV_SECTOR_SIZE,
                                 &pnum_inter);
         if (ret < 0) {
             return ret;
-        } else if (ret) {
-            *pnum = pnum_inter;
+        }
+        assert(pnum_inter < INT_MAX * BDRV_SECTOR_SIZE);
+        psectors_inter = pnum_inter >> BDRV_SECTOR_BITS;
+        if (ret) {
+            *pnum = psectors_inter;
             return 1;
         }

@@ -1953,10 +1972,10 @@  int bdrv_is_allocated_above(BlockDriverState *top,
          *
          * [sector_num+x, nr_sectors] allocated.
          */
-        if (n > pnum_inter &&
+        if (n > psectors_inter &&
             (intermediate == top ||
-             sector_num + pnum_inter < intermediate->total_sectors)) {
-            n = pnum_inter;
+             sector_num + psectors_inter < intermediate->total_sectors)) {
+            n = psectors_inter;
         }

         intermediate = backing_bs(intermediate);
diff --git a/block/stream.c b/block/stream.c
index 3ed4fef..85502eb 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -138,6 +138,7 @@  static void coroutine_fn stream_run(void *opaque)

     for ( ; offset < s->common.len; offset += n * BDRV_SECTOR_SIZE) {
         bool copy;
+        int64_t count = 0;

         /* Note that even when no rate limit is applied we need to yield
          * with no pending I/O here so that bdrv_drain_all() returns.
@@ -149,8 +150,8 @@  static void coroutine_fn stream_run(void *opaque)

         copy = false;

-        ret = bdrv_is_allocated(bs, offset / BDRV_SECTOR_SIZE,
-                                STREAM_BUFFER_SIZE / BDRV_SECTOR_SIZE, &n);
+        ret = bdrv_is_allocated(bs, offset, STREAM_BUFFER_SIZE, &count);
+        n = DIV_ROUND_UP(count, BDRV_SECTOR_SIZE);
         if (ret == 1) {
             /* Allocated in the top, no need to copy.  */
         } else if (ret >= 0) {
diff --git a/block/vvfat.c b/block/vvfat.c
index af5153d..bef2056 100644
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -1393,24 +1393,27 @@  static int vvfat_read(BlockDriverState *bs, int64_t sector_num,
 	if (sector_num >= bs->total_sectors)
 	   return -1;
 	if (s->qcow) {
-	    int n;
+            int64_t n;
             int ret;
-            ret = bdrv_is_allocated(s->qcow->bs, sector_num,
-                                    nb_sectors - i, &n);
+            ret = bdrv_is_allocated(s->qcow->bs, sector_num * BDRV_SECTOR_SIZE,
+                                    (nb_sectors - i) * BDRV_SECTOR_SIZE, &n);
             if (ret < 0) {
                 return ret;
             }
             if (ret) {
-                DLOG(fprintf(stderr, "sectors %d+%d allocated\n",
-                             (int)sector_num, n));
-                if (bdrv_read(s->qcow, sector_num, buf + i * 0x200, n)) {
+                DLOG(fprintf(stderr, "sectors %" PRId64 "+%" PRId64
+                             " allocated\n", sector_num,
+                             n >> BDRV_SECTOR_BITS));
+                if (bdrv_read(s->qcow, sector_num, buf + i * 0x200,
+                              n >> BDRV_SECTOR_BITS)) {
                     return -1;
                 }
-                i += n - 1;
-                sector_num += n - 1;
+                i += (n >> BDRV_SECTOR_BITS) - 1;
+                sector_num += (n >> BDRV_SECTOR_BITS) - 1;
                 continue;
             }
-DLOG(fprintf(stderr, "sector %d not allocated\n", (int)sector_num));
+            DLOG(fprintf(stderr, "sector %" PRId64 " not allocated\n",
+                         sector_num));
 	}
 	if(sector_num<s->faked_sectors) {
 	    if(sector_num<s->first_sectors_number)
@@ -1678,7 +1681,7 @@  static inline bool cluster_was_modified(BDRVVVFATState *s,
                                         uint32_t cluster_num)
 {
     int was_modified = 0;
-    int i, dummy;
+    int i;

     if (s->qcow == NULL) {
         return 0;
@@ -1686,8 +1689,9 @@  static inline bool cluster_was_modified(BDRVVVFATState *s,

     for (i = 0; !was_modified && i < s->sectors_per_cluster; i++) {
         was_modified = bdrv_is_allocated(s->qcow->bs,
-                                         cluster2sector(s, cluster_num) + i,
-                                         1, &dummy);
+                                         (cluster2sector(s, cluster_num) +
+                                          i) * BDRV_SECTOR_SIZE,
+                                         BDRV_SECTOR_SIZE, NULL);
     }

     /*
@@ -1834,7 +1838,7 @@  static uint32_t get_cluster_count_for_direntry(BDRVVVFATState* s,
 	    }

 	    if (copy_it) {
-		int i, dummy;
+                int i;
 		/*
 		 * This is horribly inefficient, but that is okay, since
 		 * it is rarely executed, if at all.
@@ -1845,7 +1849,9 @@  static uint32_t get_cluster_count_for_direntry(BDRVVVFATState* s,
                 for (i = 0; i < s->sectors_per_cluster; i++) {
                     int res;

-                    res = bdrv_is_allocated(s->qcow->bs, offset + i, 1, &dummy);
+                    res = bdrv_is_allocated(s->qcow->bs,
+                                            (offset + i) * BDRV_SECTOR_SIZE,
+                                            BDRV_SECTOR_SIZE, NULL);
                     if (res < 0) {
                         return -1;
                     }
diff --git a/migration/block.c b/migration/block.c
index 7734ff7..18d50ff 100644
--- a/migration/block.c
+++ b/migration/block.c
@@ -36,7 +36,7 @@ 
 #define BLK_MIG_FLAG_PROGRESS           0x04
 #define BLK_MIG_FLAG_ZERO_BLOCK         0x08

-#define MAX_IS_ALLOCATED_SEARCH 65536
+#define MAX_IS_ALLOCATED_SEARCH (65536 * BDRV_SECTOR_SIZE)

 #define MAX_INFLIGHT_IO 512

@@ -272,6 +272,7 @@  static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
     BlockBackend *bb = bmds->blk;
     BlkMigBlock *blk;
     int nr_sectors;
+    int64_t count;

     if (bmds->shared_base) {
         qemu_mutex_lock_iothread();
@@ -279,9 +280,9 @@  static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
         /* Skip unallocated sectors; intentionally treats failure as
          * an allocated sector */
         while (cur_sector < total_sectors &&
-               !bdrv_is_allocated(blk_bs(bb), cur_sector,
-                                  MAX_IS_ALLOCATED_SEARCH, &nr_sectors)) {
-            cur_sector += nr_sectors;
+               !bdrv_is_allocated(blk_bs(bb), cur_sector * BDRV_SECTOR_SIZE,
+                                  MAX_IS_ALLOCATED_SEARCH, &count)) {
+            cur_sector += DIV_ROUND_UP(count, BDRV_SECTOR_SIZE);
         }
         aio_context_release(blk_get_aio_context(bb));
         qemu_mutex_unlock_iothread();
diff --git a/qemu-img.c b/qemu-img.c
index 37c2894..2f21d69 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -3207,6 +3207,7 @@  static int img_rebase(int argc, char **argv)
         int64_t new_backing_num_sectors = 0;
         uint64_t sector;
         int n;
+        int64_t count;
         float local_progress = 0;

         buf_old = blk_blockalign(blk, IO_BUF_SIZE);
@@ -3254,12 +3255,14 @@  static int img_rebase(int argc, char **argv)
             }

             /* If the cluster is allocated, we don't need to take action */
-            ret = bdrv_is_allocated(bs, sector, n, &n);
+            ret = bdrv_is_allocated(bs, sector << BDRV_SECTOR_BITS,
+                                    n << BDRV_SECTOR_BITS, &count);
             if (ret < 0) {
                 error_report("error while reading image metadata: %s",
                              strerror(-ret));
                 goto out;
             }
+            n = DIV_ROUND_UP(count, BDRV_SECTOR_SIZE);
             if (ret) {
                 continue;
             }
diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index 0ac7457..62278ea 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -1760,12 +1760,12 @@  out:
 static int alloc_f(BlockBackend *blk, int argc, char **argv)
 {
     BlockDriverState *bs = blk_bs(blk);
-    int64_t offset, sector_num, nb_sectors, remaining, bytes;
+    int64_t offset, start, remaining, bytes;
     char s1[64];
-    int num, ret;
-    int64_t sum_alloc;
+    int ret;
+    int64_t num, sum_alloc;

-    offset = cvtnum(argv[1]);
+    start = offset = cvtnum(argv[1]);
     if (offset < 0) {
         print_cvtnum_err(offset, argv[1]);
         return 0;
@@ -1793,32 +1793,30 @@  static int alloc_f(BlockBackend *blk, int argc, char **argv)
                bytes);
         return 0;
     }
-    nb_sectors = bytes >> BDRV_SECTOR_BITS;

-    remaining = nb_sectors;
+    remaining = bytes;
     sum_alloc = 0;
-    sector_num = offset >> 9;
     while (remaining) {
-        ret = bdrv_is_allocated(bs, sector_num, remaining, &num);
+        ret = bdrv_is_allocated(bs, offset, remaining, &num);
         if (ret < 0) {
             printf("is_allocated failed: %s\n", strerror(-ret));
             return 0;
         }
-        sector_num += num;
+        offset += num;
         remaining -= num;
         if (ret) {
             sum_alloc += num;
         }
         if (num == 0) {
-            nb_sectors -= remaining;
+            bytes -= remaining;
             remaining = 0;
         }
     }

-    cvtstr(offset, s1, sizeof(s1));
+    cvtstr(start, s1, sizeof(s1));

     printf("%"PRId64"/%"PRId64" bytes allocated at offset %s\n",
-           sum_alloc << BDRV_SECTOR_BITS, nb_sectors << BDRV_SECTOR_BITS, s1);
+           sum_alloc, bytes, s1);
     return 0;
 }

@@ -1833,14 +1831,15 @@  static const cmdinfo_t alloc_cmd = {
 };


-static int map_is_allocated(BlockDriverState *bs, int64_t sector_num,
-                            int64_t nb_sectors, int64_t *pnum)
+static int map_is_allocated(BlockDriverState *bs, int64_t offset,
+                            int64_t bytes, int64_t *pnum)
 {
-    int num, num_checked;
+    int64_t num;
+    int num_checked;
     int ret, firstret;

-    num_checked = MIN(nb_sectors, INT_MAX);
-    ret = bdrv_is_allocated(bs, sector_num, num_checked, &num);
+    num_checked = MIN(bytes, BDRV_REQUEST_MAX_BYTES);
+    ret = bdrv_is_allocated(bs, offset, num_checked, &num);
     if (ret < 0) {
         return ret;
     }
@@ -1848,12 +1847,12 @@  static int map_is_allocated(BlockDriverState *bs, int64_t sector_num,
     firstret = ret;
     *pnum = num;

-    while (nb_sectors > 0 && ret == firstret) {
-        sector_num += num;
-        nb_sectors -= num;
+    while (bytes > 0 && ret == firstret) {
+        offset += num;
+        bytes -= num;

-        num_checked = MIN(nb_sectors, INT_MAX);
-        ret = bdrv_is_allocated(bs, sector_num, num_checked, &num);
+        num_checked = MIN(bytes, BDRV_REQUEST_MAX_BYTES);
+        ret = bdrv_is_allocated(bs, offset, num_checked, &num);
         if (ret == firstret && num) {
             *pnum += num;
         } else {
@@ -1867,7 +1866,7 @@  static int map_is_allocated(BlockDriverState *bs, int64_t sector_num,
 static int map_f(BlockBackend *blk, int argc, char **argv)
 {
     int64_t offset;
-    int64_t nb_sectors, total_sectors;
+    int64_t bytes, total_sectors;
     char s1[64];
     int64_t num;
     int ret;
@@ -1881,10 +1880,10 @@  static int map_f(BlockBackend *blk, int argc, char **argv)
         return 0;
     }

-    nb_sectors = total_sectors;
+    bytes = total_sectors * BDRV_SECTOR_SIZE;

     do {
-        ret = map_is_allocated(blk_bs(blk), offset, nb_sectors, &num);
+        ret = map_is_allocated(blk_bs(blk), offset, bytes, &num);
         if (ret < 0) {
             error_report("Failed to get allocation status: %s", strerror(-ret));
             return 0;
@@ -1894,13 +1893,13 @@  static int map_f(BlockBackend *blk, int argc, char **argv)
         }

         retstr = ret ? "    allocated" : "not allocated";
-        cvtstr(offset << 9ULL, s1, sizeof(s1));
+        cvtstr(offset, s1, sizeof(s1));
         printf("[% 24" PRId64 "] % 16" PRId64 " bytes %s at offset %s (%d)\n",
-               offset << 9ULL, num << 9ULL, retstr, s1, ret);
+               offset, num, retstr, s1, ret);

         offset += num;
-        nb_sectors -= num;
-    } while (offset < total_sectors);
+        bytes -= num;
+    } while (offset < total_sectors * BDRV_SECTOR_SIZE);

     return 0;
 }