diff mbox

Btrfs: recheck bio against block device when we map the bio

Message ID 1350680478-11222-1-git-send-email-jbacik@fusionio.com (mailing list archive)
State New, archived
Headers show

Commit Message

Josef Bacik Oct. 19, 2012, 9:01 p.m. UTC
Alex reported a problem where we were writing between chunks on an rbd
device.  The thing is we do bio_add_page using logical offsets, but the
physical offset may be different.  So when we map the bio, now check to see
if the bio is still ok with the physical offset, and if it is not, split the
bio up and redo the bio_add_page with the physical sector.  This fixes the
problem for Alex and doesn't affect performance in the normal case.  Thanks,

Reported-and-tested-by: Alex Elder <elder@inktank.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/volumes.c |  159 ++++++++++++++++++++++++++++++++++++++++++---------
 1 files changed, 131 insertions(+), 28 deletions(-)

Comments

Liu Bo Oct. 20, 2012, 2:31 a.m. UTC | #1
On 10/20/2012 05:01 AM, Josef Bacik wrote:
> Alex reported a problem where we were writing between chunks on a rbd
> device.  The thing is we do bio_add_page using logical offsets, but the
> physical offset may be different.  So when we map the bio now check to see
> if the bio is still ok with the physical offset, and if it is not split the
> bio up and redo the bio_add_page with the physical sector.  This fixes the
> problem for Alex and doesn't affect performance in the normal case.  Thanks,
> 
> Reported-and-tested-by: Alex Elder <elder@inktank.com>
> Signed-off-by: Josef Bacik <jbacik@fusionio.com>
> ---
>  fs/btrfs/volumes.c |  159 ++++++++++++++++++++++++++++++++++++++++++---------
>  1 files changed, 131 insertions(+), 28 deletions(-)
> 
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index a8adf26..eaaf0bf 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -4217,6 +4217,113 @@ static noinline void schedule_bio(struct btrfs_root *root,
>  				   &device->work);
>  }
>  
> +static int bio_size_ok(struct block_device *bdev, struct bio *bio,
> +		       sector_t sector)
> +{
> +	struct bio_vec *prev;
> +	struct request_queue *q = bdev_get_queue(bdev);
> +	unsigned short max_sectors = queue_max_sectors(q);
> +	struct bvec_merge_data bvm = {
> +		.bi_bdev = bdev,
> +		.bi_sector = sector,
> +		.bi_rw = bio->bi_rw,
> +	};
> +
> +	if (bio->bi_vcnt == 0) {
> +		WARN_ON(1);
> +		return 1;
> +	}
> +
> +	prev = &bio->bi_io_vec[bio->bi_vcnt - 1];

I prefer 'last' for this.


Others look good to me :)

Reviewed-by: Liu Bo <bo.li.liu@oracle.com>

thanks,
liubo

> +	if ((bio->bi_size >> 9) > max_sectors)
> +		return 0;
> +
> +	if (!q->merge_bvec_fn)
> +		return 1;
> +
> +	bvm.bi_size = bio->bi_size - prev->bv_len;
> +	if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len)
> +		return 0;
> +	return 1;
> +}
> +
> +static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
> +			      struct bio *bio, u64 physical, int dev_nr,
> +			      int rw, int async)
> +{
> +	struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
> +
> +	bio->bi_private = bbio;
> +	bio->bi_private = merge_stripe_index_into_bio_private(
> +			bio->bi_private, (unsigned int)dev_nr);
> +	bio->bi_end_io = btrfs_end_bio;
> +	bio->bi_sector = physical >> 9;
> +#ifdef DEBUG
> +	{
> +		struct rcu_string *name;
> +
> +		rcu_read_lock();
> +		name = rcu_dereference(dev->name);
> +		pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu "
> +			 "(%s id %llu), size=%u\n", rw,
> +			 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
> +			 name->str, dev->devid, bio->bi_size);
> +		rcu_read_unlock();
> +	}
> +#endif
> +	bio->bi_bdev = dev->bdev;
> +	if (async)
> +		schedule_bio(root, dev, rw, bio);
> +	else
> +		btrfsic_submit_bio(rw, bio);
> +}
> +
> +static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
> +			      struct bio *first_bio, struct btrfs_device *dev,
> +			      int dev_nr, int rw, int async)
> +{
> +	struct bio_vec *bvec = first_bio->bi_io_vec;
> +	struct bio *bio;
> +	int nr_vecs = bio_get_nr_vecs(dev->bdev);
> +	u64 physical = bbio->stripes[dev_nr].physical;
> +
> +again:
> +	bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS);
> +	if (!bio)
> +		return -ENOMEM;
> +
> +	while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) {
> +		if (bio_add_page(bio, bvec->bv_page, bvec->bv_len,
> +				 bvec->bv_offset) < bvec->bv_len) {
> +			u64 len = bio->bi_size;
> +
> +			atomic_inc(&bbio->stripes_pending);
> +			submit_stripe_bio(root, bbio, bio, physical, dev_nr,
> +					  rw, async);
> +			physical += len;
> +			goto again;
> +		}
> +		bvec++;
> +	}
> +
> +	submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async);
> +	return 0;
> +}
> +
> +static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
> +{
> +	atomic_inc(&bbio->error);
> +	if (atomic_dec_and_test(&bbio->stripes_pending)) {
> +		bio->bi_private = bbio->private;
> +		bio->bi_end_io = bbio->end_io;
> +		bio->bi_bdev = (struct block_device *)
> +			(unsigned long)bbio->mirror_num;
> +		bio->bi_sector = logical >> 9;
> +		kfree(bbio);
> +		bio_endio(bio, -EIO);
> +	}
> +}
> +
>  int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
>  		  int mirror_num, int async_submit)
>  {
> @@ -4255,40 +4362,36 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
>  	atomic_set(&bbio->stripes_pending, bbio->num_stripes);
>  
>  	while (dev_nr < total_devs) {
> +		dev = bbio->stripes[dev_nr].dev;
> +		if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
> +			bbio_error(bbio, first_bio, logical);
> +			dev_nr++;
> +			continue;
> +		}
> +
> +		/*
> +		 * Check and see if we're ok with this bio based on it's size
> +		 * and offset with the given device.
> +		 */
> +		if (!bio_size_ok(dev->bdev, first_bio,
> +				 bbio->stripes[dev_nr].physical >> 9)) {
> +			ret = breakup_stripe_bio(root, bbio, first_bio, dev,
> +						 dev_nr, rw, async_submit);
> +			BUG_ON(ret);
> +			dev_nr++;
> +			continue;
> +		}
> +
>  		if (dev_nr < total_devs - 1) {
>  			bio = bio_clone(first_bio, GFP_NOFS);
>  			BUG_ON(!bio); /* -ENOMEM */
>  		} else {
>  			bio = first_bio;
>  		}
> -		bio->bi_private = bbio;
> -		bio->bi_private = merge_stripe_index_into_bio_private(
> -				bio->bi_private, (unsigned int)dev_nr);
> -		bio->bi_end_io = btrfs_end_bio;
> -		bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
> -		dev = bbio->stripes[dev_nr].dev;
> -		if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
> -#ifdef DEBUG
> -			struct rcu_string *name;
> -
> -			rcu_read_lock();
> -			name = rcu_dereference(dev->name);
> -			pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu "
> -				 "(%s id %llu), size=%u\n", rw,
> -				 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
> -				 name->str, dev->devid, bio->bi_size);
> -			rcu_read_unlock();
> -#endif
> -			bio->bi_bdev = dev->bdev;
> -			if (async_submit)
> -				schedule_bio(root, dev, rw, bio);
> -			else
> -				btrfsic_submit_bio(rw, bio);
> -		} else {
> -			bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
> -			bio->bi_sector = logical >> 9;
> -			bio_endio(bio, -EIO);
> -		}
> +
> +		submit_stripe_bio(root, bbio, bio,
> +				  bbio->stripes[dev_nr].physical, dev_nr, rw,
> +				  async_submit);
>  		dev_nr++;
>  	}
>  	return 0;
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alex Elder Oct. 22, 2012, 4:14 p.m. UTC | #2
On 10/19/2012 09:31 PM, Liu Bo wrote:
>> +
>> > +	prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
> I prefer 'last' for this.

I said exactly the same thing.

> Others look good to me :)
> 
> Reviewed-by: Liu Bo <bo.li.liu@oracle.com>

					-Alex
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alex Elder Oct. 22, 2012, 4:14 p.m. UTC | #3
On 10/19/2012 04:01 PM, Josef Bacik wrote:
> Alex reported a problem where we were writing between chunks on a rbd
> device.  The thing is we do bio_add_page using logical offsets, but the
> physical offset may be different.  So when we map the bio now check to see
> if the bio is still ok with the physical offset, and if it is not split the
> bio up and redo the bio_add_page with the physical sector.  This fixes the
> problem for Alex and doesn't affect performance in the normal case.  Thanks,
> 
> Reported-and-tested-by: Alex Elder <elder@inktank.com>
> Signed-off-by: Josef Bacik <jbacik@fusionio.com>
> ---
>  fs/btrfs/volumes.c |  159 ++++++++++++++++++++++++++++++++++++++++++---------
>  1 files changed, 131 insertions(+), 28 deletions(-)
> 
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index a8adf26..eaaf0bf 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -4217,6 +4217,113 @@ static noinline void schedule_bio(struct btrfs_root *root,
>  				   &device->work);
>  }
>  
> +static int bio_size_ok(struct block_device *bdev, struct bio *bio,
> +		       sector_t sector)
> +{
> +	struct bio_vec *prev;
> +	struct request_queue *q = bdev_get_queue(bdev);
> +	unsigned short max_sectors = queue_max_sectors(q);
> +	struct bvec_merge_data bvm = {
> +		.bi_bdev = bdev,
> +		.bi_sector = sector,
> +		.bi_rw = bio->bi_rw,
> +	};
> +
> +	if (bio->bi_vcnt == 0) {
> +		WARN_ON(1);
> +		return 1;
> +	}
> +
> +	prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
> +	if ((bio->bi_size >> 9) > max_sectors)
> +		return 0;
> +
> +	if (!q->merge_bvec_fn)
> +		return 1;
> +
> +	bvm.bi_size = bio->bi_size - prev->bv_len;
> +	if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len)
> +		return 0;

I wanted to mention one concern that occurred to me over the
weekend.

By checking only the last bio_vec in the bio here it's
conceivable you'll miss another boundary that sits earlier
in the bio.

We technically allow the boundaries of objects that back rbd
disk images to be as small as 4 KB (but by default it's 4 MB).
4 KB is really unlikely, but 256 KB (which is smaller than a
128-entry bio) is less of a stretch.

I also came up with a different way of splitting that makes
it moot anyway, obviating the need for doing this check at
all, so after a little more testing I'll get that posted.

					-Alex


> +	return 1;
> +}
> +
> +static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
> +			      struct bio *bio, u64 physical, int dev_nr,
> +			      int rw, int async)
> +{
> +	struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
> +
> +	bio->bi_private = bbio;
> +	bio->bi_private = merge_stripe_index_into_bio_private(
> +			bio->bi_private, (unsigned int)dev_nr);
> +	bio->bi_end_io = btrfs_end_bio;
> +	bio->bi_sector = physical >> 9;
> +#ifdef DEBUG
> +	{
> +		struct rcu_string *name;
> +
> +		rcu_read_lock();
> +		name = rcu_dereference(dev->name);
> +		pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu "
> +			 "(%s id %llu), size=%u\n", rw,
> +			 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
> +			 name->str, dev->devid, bio->bi_size);
> +		rcu_read_unlock();
> +	}
> +#endif
> +	bio->bi_bdev = dev->bdev;
> +	if (async)
> +		schedule_bio(root, dev, rw, bio);
> +	else
> +		btrfsic_submit_bio(rw, bio);
> +}
> +
> +static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
> +			      struct bio *first_bio, struct btrfs_device *dev,
> +			      int dev_nr, int rw, int async)
> +{
> +	struct bio_vec *bvec = first_bio->bi_io_vec;
> +	struct bio *bio;
> +	int nr_vecs = bio_get_nr_vecs(dev->bdev);
> +	u64 physical = bbio->stripes[dev_nr].physical;
> +
> +again:
> +	bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS);
> +	if (!bio)
> +		return -ENOMEM;
> +
> +	while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) {
> +		if (bio_add_page(bio, bvec->bv_page, bvec->bv_len,
> +				 bvec->bv_offset) < bvec->bv_len) {
> +			u64 len = bio->bi_size;
> +
> +			atomic_inc(&bbio->stripes_pending);
> +			submit_stripe_bio(root, bbio, bio, physical, dev_nr,
> +					  rw, async);
> +			physical += len;
> +			goto again;
> +		}
> +		bvec++;
> +	}
> +
> +	submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async);
> +	return 0;
> +}
> +
> +static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
> +{
> +	atomic_inc(&bbio->error);
> +	if (atomic_dec_and_test(&bbio->stripes_pending)) {
> +		bio->bi_private = bbio->private;
> +		bio->bi_end_io = bbio->end_io;
> +		bio->bi_bdev = (struct block_device *)
> +			(unsigned long)bbio->mirror_num;
> +		bio->bi_sector = logical >> 9;
> +		kfree(bbio);
> +		bio_endio(bio, -EIO);
> +	}
> +}
> +
>  int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
>  		  int mirror_num, int async_submit)
>  {
> @@ -4255,40 +4362,36 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
>  	atomic_set(&bbio->stripes_pending, bbio->num_stripes);
>  
>  	while (dev_nr < total_devs) {
> +		dev = bbio->stripes[dev_nr].dev;
> +		if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
> +			bbio_error(bbio, first_bio, logical);
> +			dev_nr++;
> +			continue;
> +		}
> +
> +		/*
> +		 * Check and see if we're ok with this bio based on it's size
> +		 * and offset with the given device.
> +		 */
> +		if (!bio_size_ok(dev->bdev, first_bio,
> +				 bbio->stripes[dev_nr].physical >> 9)) {
> +			ret = breakup_stripe_bio(root, bbio, first_bio, dev,
> +						 dev_nr, rw, async_submit);
> +			BUG_ON(ret);
> +			dev_nr++;
> +			continue;
> +		}
> +
>  		if (dev_nr < total_devs - 1) {
>  			bio = bio_clone(first_bio, GFP_NOFS);
>  			BUG_ON(!bio); /* -ENOMEM */
>  		} else {
>  			bio = first_bio;
>  		}
> -		bio->bi_private = bbio;
> -		bio->bi_private = merge_stripe_index_into_bio_private(
> -				bio->bi_private, (unsigned int)dev_nr);
> -		bio->bi_end_io = btrfs_end_bio;
> -		bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
> -		dev = bbio->stripes[dev_nr].dev;
> -		if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
> -#ifdef DEBUG
> -			struct rcu_string *name;
> -
> -			rcu_read_lock();
> -			name = rcu_dereference(dev->name);
> -			pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu "
> -				 "(%s id %llu), size=%u\n", rw,
> -				 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
> -				 name->str, dev->devid, bio->bi_size);
> -			rcu_read_unlock();
> -#endif
> -			bio->bi_bdev = dev->bdev;
> -			if (async_submit)
> -				schedule_bio(root, dev, rw, bio);
> -			else
> -				btrfsic_submit_bio(rw, bio);
> -		} else {
> -			bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
> -			bio->bi_sector = logical >> 9;
> -			bio_endio(bio, -EIO);
> -		}
> +
> +		submit_stripe_bio(root, bbio, bio,
> +				  bbio->stripes[dev_nr].physical, dev_nr, rw,
> +				  async_submit);
>  		dev_nr++;
>  	}
>  	return 0;
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Josef Bacik Oct. 22, 2012, 7:53 p.m. UTC | #4
On Fri, Oct 19, 2012 at 08:31:17PM -0600, Liu Bo wrote:
> On 10/20/2012 05:01 AM, Josef Bacik wrote:
> > Alex reported a problem where we were writing between chunks on a rbd
> > device.  The thing is we do bio_add_page using logical offsets, but the
> > physical offset may be different.  So when we map the bio now check to see
> > if the bio is still ok with the physical offset, and if it is not split the
> > bio up and redo the bio_add_page with the physical sector.  This fixes the
> > problem for Alex and doesn't affect performance in the normal case.  Thanks,
> > 
> > Reported-and-tested-by: Alex Elder <elder@inktank.com>
> > Signed-off-by: Josef Bacik <jbacik@fusionio.com>
> > ---
> >  fs/btrfs/volumes.c |  159 ++++++++++++++++++++++++++++++++++++++++++---------
> >  1 files changed, 131 insertions(+), 28 deletions(-)
> > 
> > diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> > index a8adf26..eaaf0bf 100644
> > --- a/fs/btrfs/volumes.c
> > +++ b/fs/btrfs/volumes.c
> > @@ -4217,6 +4217,113 @@ static noinline void schedule_bio(struct btrfs_root *root,
> >  				   &device->work);
> >  }
> >  
> > +static int bio_size_ok(struct block_device *bdev, struct bio *bio,
> > +		       sector_t sector)
> > +{
> > +	struct bio_vec *prev;
> > +	struct request_queue *q = bdev_get_queue(bdev);
> > +	unsigned short max_sectors = queue_max_sectors(q);
> > +	struct bvec_merge_data bvm = {
> > +		.bi_bdev = bdev,
> > +		.bi_sector = sector,
> > +		.bi_rw = bio->bi_rw,
> > +	};
> > +
> > +	if (bio->bi_vcnt == 0) {
> > +		WARN_ON(1);
> > +		return 1;
> > +	}
> > +
> > +	prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
> 
> I prefer 'last' for this.
> 
> 
> Others look good to me :)
> 
> Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
> 

Heh I had made that change but apparently didn't commit it before I sent it out.
Thanks,

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Josef Bacik Oct. 22, 2012, 7:55 p.m. UTC | #5
On Mon, Oct 22, 2012 at 10:14:58AM -0600, Alex Elder wrote:
> On 10/19/2012 04:01 PM, Josef Bacik wrote:
> > Alex reported a problem where we were writing between chunks on a rbd
> > device.  The thing is we do bio_add_page using logical offsets, but the
> > physical offset may be different.  So when we map the bio now check to see
> > if the bio is still ok with the physical offset, and if it is not split the
> > bio up and redo the bio_add_page with the physical sector.  This fixes the
> > problem for Alex and doesn't affect performance in the normal case.  Thanks,
> > 
> > Reported-and-tested-by: Alex Elder <elder@inktank.com>
> > Signed-off-by: Josef Bacik <jbacik@fusionio.com>
> > ---
> >  fs/btrfs/volumes.c |  159 ++++++++++++++++++++++++++++++++++++++++++---------
> >  1 files changed, 131 insertions(+), 28 deletions(-)
> > 
> > diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> > index a8adf26..eaaf0bf 100644
> > --- a/fs/btrfs/volumes.c
> > +++ b/fs/btrfs/volumes.c
> > @@ -4217,6 +4217,113 @@ static noinline void schedule_bio(struct btrfs_root *root,
> >  				   &device->work);
> >  }
> >  
> > +static int bio_size_ok(struct block_device *bdev, struct bio *bio,
> > +		       sector_t sector)
> > +{
> > +	struct bio_vec *prev;
> > +	struct request_queue *q = bdev_get_queue(bdev);
> > +	unsigned short max_sectors = queue_max_sectors(q);
> > +	struct bvec_merge_data bvm = {
> > +		.bi_bdev = bdev,
> > +		.bi_sector = sector,
> > +		.bi_rw = bio->bi_rw,
> > +	};
> > +
> > +	if (bio->bi_vcnt == 0) {
> > +		WARN_ON(1);
> > +		return 1;
> > +	}
> > +
> > +	prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
> > +	if ((bio->bi_size >> 9) > max_sectors)
> > +		return 0;
> > +
> > +	if (!q->merge_bvec_fn)
> > +		return 1;
> > +
> > +	bvm.bi_size = bio->bi_size - prev->bv_len;
> > +	if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len)
> > +		return 0;
> 
> I wanted to mention one concern that occurred to me over the
> weekend.
> 
> By checking only the last bio_vec in the bio here it's
> conceivable you'll miss another boundary that sits earlier
> in the bio.
> 
> We technically allow the boundaries of objects that back rbd
> disk images to be as small as 4 KB (but by default it's 4 MB).
> 4 KB is really unlikely, but 256 KB (which is smaller than
> 128-entry bio) is less of a stretch.
> 
> I also came up with a different way of splitting that makes
> it moot anyway, obviating the need for doing this check at
> all so after a little more testing I'll get that posted.
> 

Right, but if we go over the boundary earlier in the bio shouldn't we trip the
test anyway?  For example, if you have 4k images and the last one happens to be
4k aligned but is in an 8 megabyte bio it should fail, and if it doesn't, the
merge function is wrong :).  Once it fails we add all pages starting from the
first one forward, so we will end up with correct bios if we have to split it.
Thanks,

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alex Elder Oct. 24, 2012, 2:35 p.m. UTC | #6
On 10/22/2012 02:55 PM, Josef Bacik wrote:
> On Mon, Oct 22, 2012 at 10:14:58AM -0600, Alex Elder wrote:
>> On 10/19/2012 04:01 PM, Josef Bacik wrote:
>>> Alex reported a problem where we were writing between chunks on a rbd
>>> device.  The thing is we do bio_add_page using logical offsets, but the
>>> physical offset may be different.  So when we map the bio now check to see
>>> if the bio is still ok with the physical offset, and if it is not split the
>>> bio up and redo the bio_add_page with the physical sector.  This fixes the
>>> problem for Alex and doesn't affect performance in the normal case.  Thanks,
>>>
>>> Reported-and-tested-by: Alex Elder <elder@inktank.com>
>>> Signed-off-by: Josef Bacik <jbacik@fusionio.com>
>>> ---
>>>  fs/btrfs/volumes.c |  159 ++++++++++++++++++++++++++++++++++++++++++---------
>>>  1 files changed, 131 insertions(+), 28 deletions(-)
>>>
>>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>>> index a8adf26..eaaf0bf 100644
>>> --- a/fs/btrfs/volumes.c
>>> +++ b/fs/btrfs/volumes.c
>>> @@ -4217,6 +4217,113 @@ static noinline void schedule_bio(struct btrfs_root *root,
>>>  				   &device->work);
>>>  }
>>>  
>>> +static int bio_size_ok(struct block_device *bdev, struct bio *bio,
>>> +		       sector_t sector)
>>> +{
>>> +	struct bio_vec *prev;
>>> +	struct request_queue *q = bdev_get_queue(bdev);
>>> +	unsigned short max_sectors = queue_max_sectors(q);
>>> +	struct bvec_merge_data bvm = {
>>> +		.bi_bdev = bdev,
>>> +		.bi_sector = sector,
>>> +		.bi_rw = bio->bi_rw,
>>> +	};
>>> +
>>> +	if (bio->bi_vcnt == 0) {
>>> +		WARN_ON(1);
>>> +		return 1;
>>> +	}
>>> +
>>> +	prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
>>> +	if ((bio->bi_size >> 9) > max_sectors)
>>> +		return 0;
>>> +
>>> +	if (!q->merge_bvec_fn)
>>> +		return 1;
>>> +
>>> +	bvm.bi_size = bio->bi_size - prev->bv_len;
>>> +	if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len)
>>> +		return 0;
>>
>> I wanted to mention one concern that occurred to me over the
>> weekend.
>>
>> By checking only the last bio_vec in the bio here it's
>> conceivable you'll miss another boundary that sits earlier
>> in the bio.
>>
>> We technically allow the boundaries of objects that back rbd
>> disk images to be as small as 4 KB (but by default it's 4 MB).
>> 4 KB is really unlikely, but 256 KB (which is smaller than
>> 128-entry bio) is less of a stretch.
>>
>> I also came up with a different way of splitting that makes
>> it moot anyway, obviating the need for doing this check at
>> all so after a little more testing I'll get that posted.
>>
> 
> Right but if we go over the boundary earlier in the bio shouldn't we trip the
> test anyway?  For example, if you have 4k images and the last one happens to be
> 4k aligned but is in a 8 megabyte bio it should fail, and if it doesn't the
> merge function is wrong :).  Once it fails we add all pages starting from the
> first one forward, so we will end up with correct bios if we have to split it.
> Thanks,

You're right.  I finally took a closer look.  You're passing
a bio, and the decision is based on whether, considering the
sector start address provided, adding the last segment in that
bio will cross a boundary.  So if there were another boundary
prior to that in the bio, it too would be crossed and therefore
this check still works.

Thanks for the explanation.

					-Alex


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index a8adf26..eaaf0bf 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4217,6 +4217,113 @@  static noinline void schedule_bio(struct btrfs_root *root,
 				   &device->work);
 }
 
+static int bio_size_ok(struct block_device *bdev, struct bio *bio,
+		       sector_t sector)
+{
+	struct bio_vec *prev;
+	struct request_queue *q = bdev_get_queue(bdev);
+	unsigned short max_sectors = queue_max_sectors(q);
+	struct bvec_merge_data bvm = {
+		.bi_bdev = bdev,
+		.bi_sector = sector,
+		.bi_rw = bio->bi_rw,
+	};
+
+	if (bio->bi_vcnt == 0) {
+		WARN_ON(1);
+		return 1;
+	}
+
+	prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
+	if ((bio->bi_size >> 9) > max_sectors)
+		return 0;
+
+	if (!q->merge_bvec_fn)
+		return 1;
+
+	bvm.bi_size = bio->bi_size - prev->bv_len;
+	if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len)
+		return 0;
+	return 1;
+}
+
+static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
+			      struct bio *bio, u64 physical, int dev_nr,
+			      int rw, int async)
+{
+	struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
+
+	bio->bi_private = bbio;
+	bio->bi_private = merge_stripe_index_into_bio_private(
+			bio->bi_private, (unsigned int)dev_nr);
+	bio->bi_end_io = btrfs_end_bio;
+	bio->bi_sector = physical >> 9;
+#ifdef DEBUG
+	{
+		struct rcu_string *name;
+
+		rcu_read_lock();
+		name = rcu_dereference(dev->name);
+		pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu "
+			 "(%s id %llu), size=%u\n", rw,
+			 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
+			 name->str, dev->devid, bio->bi_size);
+		rcu_read_unlock();
+	}
+#endif
+	bio->bi_bdev = dev->bdev;
+	if (async)
+		schedule_bio(root, dev, rw, bio);
+	else
+		btrfsic_submit_bio(rw, bio);
+}
+
+static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
+			      struct bio *first_bio, struct btrfs_device *dev,
+			      int dev_nr, int rw, int async)
+{
+	struct bio_vec *bvec = first_bio->bi_io_vec;
+	struct bio *bio;
+	int nr_vecs = bio_get_nr_vecs(dev->bdev);
+	u64 physical = bbio->stripes[dev_nr].physical;
+
+again:
+	bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS);
+	if (!bio)
+		return -ENOMEM;
+
+	while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) {
+		if (bio_add_page(bio, bvec->bv_page, bvec->bv_len,
+				 bvec->bv_offset) < bvec->bv_len) {
+			u64 len = bio->bi_size;
+
+			atomic_inc(&bbio->stripes_pending);
+			submit_stripe_bio(root, bbio, bio, physical, dev_nr,
+					  rw, async);
+			physical += len;
+			goto again;
+		}
+		bvec++;
+	}
+
+	submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async);
+	return 0;
+}
+
+static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
+{
+	atomic_inc(&bbio->error);
+	if (atomic_dec_and_test(&bbio->stripes_pending)) {
+		bio->bi_private = bbio->private;
+		bio->bi_end_io = bbio->end_io;
+		bio->bi_bdev = (struct block_device *)
+			(unsigned long)bbio->mirror_num;
+		bio->bi_sector = logical >> 9;
+		kfree(bbio);
+		bio_endio(bio, -EIO);
+	}
+}
+
 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		  int mirror_num, int async_submit)
 {
@@ -4255,40 +4362,36 @@  int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	atomic_set(&bbio->stripes_pending, bbio->num_stripes);
 
 	while (dev_nr < total_devs) {
+		dev = bbio->stripes[dev_nr].dev;
+		if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
+			bbio_error(bbio, first_bio, logical);
+			dev_nr++;
+			continue;
+		}
+
+		/*
+		 * Check and see if we're ok with this bio based on it's size
+		 * and offset with the given device.
+		 */
+		if (!bio_size_ok(dev->bdev, first_bio,
+				 bbio->stripes[dev_nr].physical >> 9)) {
+			ret = breakup_stripe_bio(root, bbio, first_bio, dev,
+						 dev_nr, rw, async_submit);
+			BUG_ON(ret);
+			dev_nr++;
+			continue;
+		}
+
 		if (dev_nr < total_devs - 1) {
 			bio = bio_clone(first_bio, GFP_NOFS);
 			BUG_ON(!bio); /* -ENOMEM */
 		} else {
 			bio = first_bio;
 		}
-		bio->bi_private = bbio;
-		bio->bi_private = merge_stripe_index_into_bio_private(
-				bio->bi_private, (unsigned int)dev_nr);
-		bio->bi_end_io = btrfs_end_bio;
-		bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
-		dev = bbio->stripes[dev_nr].dev;
-		if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
-#ifdef DEBUG
-			struct rcu_string *name;
-
-			rcu_read_lock();
-			name = rcu_dereference(dev->name);
-			pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu "
-				 "(%s id %llu), size=%u\n", rw,
-				 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
-				 name->str, dev->devid, bio->bi_size);
-			rcu_read_unlock();
-#endif
-			bio->bi_bdev = dev->bdev;
-			if (async_submit)
-				schedule_bio(root, dev, rw, bio);
-			else
-				btrfsic_submit_bio(rw, bio);
-		} else {
-			bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
-			bio->bi_sector = logical >> 9;
-			bio_endio(bio, -EIO);
-		}
+
+		submit_stripe_bio(root, bbio, bio,
+				  bbio->stripes[dev_nr].physical, dev_nr, rw,
+				  async_submit);
 		dev_nr++;
 	}
 	return 0;