
[RFC,v2,05/10] dm thin: add methods to set and get reserved space

Message ID: 1460479373-63317-6-git-send-email-bfoster@redhat.com (mailing list archive)
State: Deferred, archived
Delegated to: Mike Snitzer

Commit Message

Brian Foster April 12, 2016, 4:42 p.m. UTC
From: Joe Thornber <ejt@redhat.com>

Experimental reserve interface for XFS guys to play with.

I have big reservations (no pun intended) about this patch.

[BF:
 - Support for reservation reduction.
 - Support for space provisioning.
 - Condensed to a single function.]

Not-Signed-off-by: Joe Thornber <ejt@redhat.com>
Not-Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-thin.c | 181 ++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 171 insertions(+), 10 deletions(-)

Comments

Darrick J. Wong April 13, 2016, 5:44 p.m. UTC | #1
On Tue, Apr 12, 2016 at 12:42:48PM -0400, Brian Foster wrote:
> From: Joe Thornber <ejt@redhat.com>
> 
> Experimental reserve interface for XFS guys to play with.
> 
> I have big reservations (no pun intended) about this patch.
> 
> [BF:
>  - Support for reservation reduction.
>  - Support for space provisioning.
>  - Condensed to a single function.]
> 
> Not-Signed-off-by: Joe Thornber <ejt@redhat.com>
> Not-Signed-off-by: Mike Snitzer <snitzer@redhat.com>
> ---
>  drivers/md/dm-thin.c | 181 ++++++++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 171 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
> index 92237b6..32bc5bd 100644
> --- a/drivers/md/dm-thin.c
> +++ b/drivers/md/dm-thin.c
> @@ -271,6 +271,8 @@ struct pool {
>  	process_mapping_fn process_prepared_discard;
>  
>  	struct dm_bio_prison_cell **cell_sort_array;
> +
> +	dm_block_t reserve_count;
>  };
>  
>  static enum pool_mode get_pool_mode(struct pool *pool);
> @@ -318,6 +320,8 @@ struct thin_c {
>  	 */
>  	atomic_t refcount;
>  	struct completion can_destroy;
> +
> +	dm_block_t reserve_count;
>  };
>  
>  /*----------------------------------------------------------------*/
> @@ -1359,24 +1363,19 @@ static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
>  	}
>  }
>  
> -static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
> +static int get_free_blocks(struct pool *pool, dm_block_t *free_blocks)
>  {
>  	int r;
> -	dm_block_t free_blocks;
> -	struct pool *pool = tc->pool;
> -
> -	if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
> -		return -EINVAL;
>  
> -	r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
> +	r = dm_pool_get_free_block_count(pool->pmd, free_blocks);
>  	if (r) {
>  		metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
>  		return r;
>  	}
>  
> -	check_low_water_mark(pool, free_blocks);
> +	check_low_water_mark(pool, *free_blocks);
>  
> -	if (!free_blocks) {
> +	if (!*free_blocks) {
>  		/*
>  		 * Try to commit to see if that will free up some
>  		 * more space.
> @@ -1385,7 +1384,7 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
>  		if (r)
>  			return r;
>  
> -		r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
> +		r = dm_pool_get_free_block_count(pool->pmd, free_blocks);
>  		if (r) {
>  			metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
>  			return r;
> @@ -1397,6 +1396,76 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
>  		}
>  	}
>  
> +	return r;
> +}
> +
> +/*
> + * Returns true iff either:
> + * i) decrement succeeded (ie. there was reserve left)
> + * ii) there is extra space in the pool
> + */
> +static bool dec_reserve_count(struct thin_c *tc, dm_block_t free_blocks)
> +{
> +	bool r = false;
> +	unsigned long flags;
> +
> +	if (!free_blocks)
> +		return false;
> +
> +	spin_lock_irqsave(&tc->pool->lock, flags);
> +	if (tc->reserve_count > 0) {
> +		tc->reserve_count--;
> +		tc->pool->reserve_count--;
> +		r = true;
> +	} else {
> +		if (free_blocks > tc->pool->reserve_count)
> +			r = true;
> +	}
> +	spin_unlock_irqrestore(&tc->pool->lock, flags);
> +
> +	return r;
> +}
> +
> +static int set_reserve_count(struct thin_c *tc, dm_block_t count)
> +{
> +	int r;
> +	dm_block_t free_blocks;
> +	int64_t delta;
> +	unsigned long flags;
> +
> +	r = get_free_blocks(tc->pool, &free_blocks);
> +	if (r)
> +		return r;
> +
> +	spin_lock_irqsave(&tc->pool->lock, flags);
> +	delta = count - tc->reserve_count;
> +	if (tc->pool->reserve_count + delta > free_blocks)
> +		r = -ENOSPC;
> +	else {
> +		tc->reserve_count = count;
> +		tc->pool->reserve_count += delta;
> +	}
> +	spin_unlock_irqrestore(&tc->pool->lock, flags);
> +
> +	return r;
> +}
> +
> +static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
> +{
> +	int r;
> +	dm_block_t free_blocks;
> +	struct pool *pool = tc->pool;
> +
> +	if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
> +		return -EINVAL;
> +
> +	r = get_free_blocks(tc->pool, &free_blocks);
> +	if (r)
> +		return r;
> +
> +	if (!dec_reserve_count(tc, free_blocks))
> +		return -ENOSPC;
> +
>  	r = dm_pool_alloc_data_block(pool->pmd, result);
>  	if (r) {
>  		metadata_operation_failed(pool, "dm_pool_alloc_data_block", r);
> @@ -2880,6 +2949,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
>  	pool->last_commit_jiffies = jiffies;
>  	pool->pool_md = pool_md;
>  	pool->md_dev = metadata_dev;
> +	pool->reserve_count = 0;
>  	__pool_table_insert(pool);
>  
>  	return pool;
> @@ -3936,6 +4006,7 @@ static void thin_dtr(struct dm_target *ti)
>  
>  	spin_lock_irqsave(&tc->pool->lock, flags);
>  	list_del_rcu(&tc->list);
> +	tc->pool->reserve_count -= tc->reserve_count;
>  	spin_unlock_irqrestore(&tc->pool->lock, flags);
>  	synchronize_rcu();
>  
> @@ -4074,6 +4145,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
>  	init_completion(&tc->can_destroy);
>  	list_add_tail_rcu(&tc->list, &tc->pool->active_thins);
>  	spin_unlock_irqrestore(&tc->pool->lock, flags);
> +	tc->reserve_count = 0;
>  	/*
>  	 * This synchronize_rcu() call is needed here otherwise we risk a
>  	 * wake_worker() call finding no bios to process (because the newly
> @@ -4271,6 +4343,94 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
>  	limits->max_discard_sectors = 2048 * 1024 * 16; /* 16G */
>  }
>  
> +static int thin_provision_space(struct dm_target *ti, sector_t offset,
> +				sector_t len, sector_t *res)
> +{
> +	struct thin_c *tc = ti->private;
> +	struct pool *pool = tc->pool;
> +	sector_t end;
> +	dm_block_t pblock;
> +	dm_block_t vblock;
> +	int error;
> +	struct dm_thin_lookup_result lookup;
> +
> +	if (!is_factor(offset, pool->sectors_per_block))
> +		return -EINVAL;
> +
> +	if (!len || !is_factor(len, pool->sectors_per_block))
> +		return -EINVAL;
> +
> +	if (res && !is_factor(*res, pool->sectors_per_block))
> +		return -EINVAL;
> +
> +	end = offset + len;
> +
> +	while (offset < end) {
> +		vblock = offset;
> +		do_div(vblock, pool->sectors_per_block);
> +
> +		error = dm_thin_find_block(tc->td, vblock, true, &lookup);
> +		if (error == 0)
> +			goto next;
> +		if (error != -ENODATA)
> +			return error;
> +
> +		error = alloc_data_block(tc, &pblock);

So this means that if fallocate wants to BDEV_RES_PROVISION N blocks, it must
first increase the reservation (BDEV_RES_MOD) by N blocks to avoid using up
space that was previously reserved by some other caller.  I think?

> +		if (error)
> +			return error;
> +
> +		error = dm_thin_insert_block(tc->td, vblock, pblock);

Having reserved and mapped blocks, what happens when we try to read them?
Do we actually get zeroes, or does the read go straight through to whatever
happens to be in the disk blocks?  I don't think it's correct that we could
BDEV_RES_PROVISION and end up with stale credit card numbers from some other
thin device.

(PS: I don't know enough about thinp to know if this has already been taken
care of.  I didn't see anything, but who knows what I missed. :))

--D

> +		if (error)
> +			return error;
> +
> +		if (res && *res)
> +			*res -= pool->sectors_per_block;
> +next:
> +		offset += pool->sectors_per_block;
> +	}
> +
> +	return 0;
> +}
> +
> +static int thin_reserve_space(struct dm_target *ti, int mode, sector_t offset,
> +			      sector_t len, sector_t *res)
> +{
> +	struct thin_c *tc = ti->private;
> +	struct pool *pool = tc->pool;
> +	sector_t blocks;
> +	unsigned long flags;
> +	int error;
> +
> +	if (mode == BDEV_RES_PROVISION)
> +		return thin_provision_space(ti, offset, len, res);
> +
> +	/* res required for get/set */
> +	error = -EINVAL;
> +	if (!res)
> +		return error;
> +
> +	if (mode == BDEV_RES_GET) {
> +		spin_lock_irqsave(&tc->pool->lock, flags);
> +		*res = tc->reserve_count * pool->sectors_per_block;
> +		spin_unlock_irqrestore(&tc->pool->lock, flags);
> +		error = 0;
> +	} else if (mode == BDEV_RES_MOD) {
> +		/*
> +		* @res must always be a factor of the pool's blocksize; upper
> +		* layers can rely on the bdev's minimum_io_size for this.
> +		*/
> +		if (!is_factor(*res, pool->sectors_per_block))
> +			return error;
> +
> +		blocks = *res;
> +		(void) sector_div(blocks, pool->sectors_per_block);
> +
> +		error = set_reserve_count(tc, blocks);
> +	}
> +
> +	return error;
> +}
> +
>  static struct target_type thin_target = {
>  	.name = "thin",
>  	.version = {1, 18, 0},
> @@ -4285,6 +4445,7 @@ static struct target_type thin_target = {
>  	.status = thin_status,
>  	.iterate_devices = thin_iterate_devices,
>  	.io_hints = thin_io_hints,
> +	.reserve_space = thin_reserve_space,
>  };
>  
>  /*----------------------------------------------------------------*/
> -- 
> 2.4.11
> 

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
Brian Foster April 13, 2016, 6:33 p.m. UTC | #2
On Wed, Apr 13, 2016 at 10:44:42AM -0700, Darrick J. Wong wrote:
> On Tue, Apr 12, 2016 at 12:42:48PM -0400, Brian Foster wrote:
> > From: Joe Thornber <ejt@redhat.com>
> > 
> > Experimental reserve interface for XFS guys to play with.
> > 
> > I have big reservations (no pun intended) about this patch.
> > 
> > [BF:
> >  - Support for reservation reduction.
> >  - Support for space provisioning.
> >  - Condensed to a single function.]
> > 
> > Not-Signed-off-by: Joe Thornber <ejt@redhat.com>
> > Not-Signed-off-by: Mike Snitzer <snitzer@redhat.com>
> > ---
> >  drivers/md/dm-thin.c | 181 ++++++++++++++++++++++++++++++++++++++++++++++++---
> >  1 file changed, 171 insertions(+), 10 deletions(-)
> > 
> > diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
> > index 92237b6..32bc5bd 100644
> > --- a/drivers/md/dm-thin.c
> > +++ b/drivers/md/dm-thin.c
...
> > @@ -4271,6 +4343,94 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
> >  	limits->max_discard_sectors = 2048 * 1024 * 16; /* 16G */
> >  }
> >  
> > +static int thin_provision_space(struct dm_target *ti, sector_t offset,
> > +				sector_t len, sector_t *res)
> > +{
> > +	struct thin_c *tc = ti->private;
> > +	struct pool *pool = tc->pool;
> > +	sector_t end;
> > +	dm_block_t pblock;
> > +	dm_block_t vblock;
> > +	int error;
> > +	struct dm_thin_lookup_result lookup;
> > +
> > +	if (!is_factor(offset, pool->sectors_per_block))
> > +		return -EINVAL;
> > +
> > +	if (!len || !is_factor(len, pool->sectors_per_block))
> > +		return -EINVAL;
> > +
> > +	if (res && !is_factor(*res, pool->sectors_per_block))
> > +		return -EINVAL;
> > +
> > +	end = offset + len;
> > +
> > +	while (offset < end) {
> > +		vblock = offset;
> > +		do_div(vblock, pool->sectors_per_block);
> > +
> > +		error = dm_thin_find_block(tc->td, vblock, true, &lookup);
> > +		if (error == 0)
> > +			goto next;
> > +		if (error != -ENODATA)
> > +			return error;
> > +
> > +		error = alloc_data_block(tc, &pblock);
> 
> So this means that if fallocate wants to BDEV_RES_PROVISION N blocks, it must
> first increase the reservation (BDEV_RES_MOD) by N blocks to avoid using up
> space that was previously reserved by some other caller.  I think?
> 

Yes, assuming this is being called from a filesystem using the
reservation mechanism.
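
To make the accounting concrete, here is a rough userspace model of the
dec_reserve_count() logic from this patch (plain C, untested, everything
outside the two counters is my own naming): an unreserved allocation can
never dip into the pool-wide reserve, so a caller that wants N blocks
guaranteed really does have to grow its own reservation first.

#include <stdbool.h>
#include <stdio.h>

/* Toy model of the pool/thin counters used by this patch. */
struct pool { unsigned long long free_blocks, reserve_count; };
struct thin { struct pool *pool; unsigned long long reserve_count; };

/*
 * Mirrors dec_reserve_count(): consume our own reserve first, otherwise
 * only succeed while free space exceeds the pool-wide reserve.
 */
static bool alloc_one_block(struct thin *tc)
{
	struct pool *pool = tc->pool;

	if (!pool->free_blocks)
		return false;
	if (tc->reserve_count > 0) {
		tc->reserve_count--;
		pool->reserve_count--;
	} else if (pool->free_blocks <= pool->reserve_count) {
		return false;	/* the rest is spoken for by other devices */
	}
	pool->free_blocks--;
	return true;
}

int main(void)
{
	struct pool pool = { .free_blocks = 10, .reserve_count = 8 };
	struct thin a = { .pool = &pool, .reserve_count = 8 };	/* reserved 8 */
	struct thin b = { .pool = &pool, .reserve_count = 0 };	/* reserved 0 */
	int i, got;

	for (i = 0, got = 0; i < 5; i++)
		got += alloc_one_block(&b);
	printf("b allocated %d of 5\n", got);	/* 2: only the unreserved space */

	for (i = 0, got = 0; i < 8; i++)
		got += alloc_one_block(&a);
	printf("a allocated %d of 8\n", got);	/* 8: its reserve stayed intact */
	return 0;
}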

> > +		if (error)
> > +			return error;
> > +
> > +		error = dm_thin_insert_block(tc->td, vblock, pblock);
> 
> Having reserved and mapped blocks, what happens when we try to read them?
> Do we actually get zeroes, or does the read go straight through to whatever
> happens to be in the disk blocks?  I don't think it's correct that we could
> BDEV_RES_PROVISION and end up with stale credit card numbers from some other
> thin device.
> 

Agree, but I'm not really sure how this works in thinp tbh. fallocate
wasn't really on my mind when doing this. I was simply trying to cobble
together what I could to facilitate making progress on the fs parts
(e.g., I just needed a call that allocated blocks and consumed
reservation in the process).

Skimming through the dm-thin code, it looks like a (configurable) block
zeroing mechanism can be triggered from somewhere around
provision_block()->schedule_zero(), depending on whether the incoming
write overwrites the newly allocated block. If that's the case, then I
suspect that means reads would just fall through to the block and return
whatever was on disk. This code would probably need to tie into that
zeroing mechanism one way or another to deal with that issue. (Though
somebody who actually knows something about dm-thin should verify that.
:)

Brian

> (PS: I don't know enough about thinp to know if this has already been taken
> care of.  I didn't see anything, but who knows what I missed. :))
> 
> --D
> 
> > +		if (error)
> > +			return error;
> > +
> > +		if (res && *res)
> > +			*res -= pool->sectors_per_block;
> > +next:
> > +		offset += pool->sectors_per_block;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +static int thin_reserve_space(struct dm_target *ti, int mode, sector_t offset,
> > +			      sector_t len, sector_t *res)
> > +{
> > +	struct thin_c *tc = ti->private;
> > +	struct pool *pool = tc->pool;
> > +	sector_t blocks;
> > +	unsigned long flags;
> > +	int error;
> > +
> > +	if (mode == BDEV_RES_PROVISION)
> > +		return thin_provision_space(ti, offset, len, res);
> > +
> > +	/* res required for get/set */
> > +	error = -EINVAL;
> > +	if (!res)
> > +		return error;
> > +
> > +	if (mode == BDEV_RES_GET) {
> > +		spin_lock_irqsave(&tc->pool->lock, flags);
> > +		*res = tc->reserve_count * pool->sectors_per_block;
> > +		spin_unlock_irqrestore(&tc->pool->lock, flags);
> > +		error = 0;
> > +	} else if (mode == BDEV_RES_MOD) {
> > +		/*
> > +		* @res must always be a factor of the pool's blocksize; upper
> > +		* layers can rely on the bdev's minimum_io_size for this.
> > +		*/
> > +		if (!is_factor(*res, pool->sectors_per_block))
> > +			return error;
> > +
> > +		blocks = *res;
> > +		(void) sector_div(blocks, pool->sectors_per_block);
> > +
> > +		error = set_reserve_count(tc, blocks);
> > +	}
> > +
> > +	return error;
> > +}
> > +
> >  static struct target_type thin_target = {
> >  	.name = "thin",
> >  	.version = {1, 18, 0},
> > @@ -4285,6 +4445,7 @@ static struct target_type thin_target = {
> >  	.status = thin_status,
> >  	.iterate_devices = thin_iterate_devices,
> >  	.io_hints = thin_io_hints,
> > +	.reserve_space = thin_reserve_space,
> >  };
> >  
> >  /*----------------------------------------------------------------*/
> > -- 
> > 2.4.11
> > 

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
Brian Foster April 13, 2016, 8:41 p.m. UTC | #3
On Wed, Apr 13, 2016 at 02:33:52PM -0400, Brian Foster wrote:
> On Wed, Apr 13, 2016 at 10:44:42AM -0700, Darrick J. Wong wrote:
> > On Tue, Apr 12, 2016 at 12:42:48PM -0400, Brian Foster wrote:
> > > From: Joe Thornber <ejt@redhat.com>
> > > 
> > > Experimental reserve interface for XFS guys to play with.
> > > 
> > > I have big reservations (no pun intended) about this patch.
> > > 
> > > [BF:
> > >  - Support for reservation reduction.
> > >  - Support for space provisioning.
> > >  - Condensed to a single function.]
> > > 
> > > Not-Signed-off-by: Joe Thornber <ejt@redhat.com>
> > > Not-Signed-off-by: Mike Snitzer <snitzer@redhat.com>
> > > ---
> > >  drivers/md/dm-thin.c | 181 ++++++++++++++++++++++++++++++++++++++++++++++++---
> > >  1 file changed, 171 insertions(+), 10 deletions(-)
> > > 
> > > diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
> > > index 92237b6..32bc5bd 100644
> > > --- a/drivers/md/dm-thin.c
> > > +++ b/drivers/md/dm-thin.c
> ...
> > > @@ -4271,6 +4343,94 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
> > >  	limits->max_discard_sectors = 2048 * 1024 * 16; /* 16G */
> > >  }
> > >  
> > > +static int thin_provision_space(struct dm_target *ti, sector_t offset,
> > > +				sector_t len, sector_t *res)
> > > +{
> > > +	struct thin_c *tc = ti->private;
> > > +	struct pool *pool = tc->pool;
> > > +	sector_t end;
> > > +	dm_block_t pblock;
> > > +	dm_block_t vblock;
> > > +	int error;
> > > +	struct dm_thin_lookup_result lookup;
> > > +
> > > +	if (!is_factor(offset, pool->sectors_per_block))
> > > +		return -EINVAL;
> > > +
> > > +	if (!len || !is_factor(len, pool->sectors_per_block))
> > > +		return -EINVAL;
> > > +
> > > +	if (res && !is_factor(*res, pool->sectors_per_block))
> > > +		return -EINVAL;
> > > +
> > > +	end = offset + len;
> > > +
> > > +	while (offset < end) {
> > > +		vblock = offset;
> > > +		do_div(vblock, pool->sectors_per_block);
> > > +
> > > +		error = dm_thin_find_block(tc->td, vblock, true, &lookup);
> > > +		if (error == 0)
> > > +			goto next;
> > > +		if (error != -ENODATA)
> > > +			return error;
> > > +
> > > +		error = alloc_data_block(tc, &pblock);
> > 
> > So this means that if fallocate wants to BDEV_RES_PROVISION N blocks, it must
> > first increase the reservation (BDEV_RES_MOD) by N blocks to avoid using up
> > space that was previously reserved by some other caller.  I think?
> > 
> 
> Yes, assuming this is being called from a filesystem using the
> reservation mechanism.
> 
> > > +		if (error)
> > > +			return error;
> > > +
> > > +		error = dm_thin_insert_block(tc->td, vblock, pblock);
> > 
> > Having reserved and mapped blocks, what happens when we try to read them?
> > Do we actually get zeroes, or does the read go straight through to whatever
> > happens to be in the disk blocks?  I don't think it's correct that we could
> > BDEV_RES_PROVISION and end up with stale credit card numbers from some other
> > thin device.
> > 
> 
> Agree, but I'm not really sure how this works in thinp tbh. fallocate
> wasn't really on my mind when doing this. I was simply trying to cobble
> together what I could to facilitate making progress on the fs parts
> (e.g., I just needed a call that allocated blocks and consumed
> reservation in the process).
> 
> Skimming through the dm-thin code, it looks like a (configurable) block
> zeroing mechanism can be triggered from somewhere around
> provision_block()->schedule_zero(), depending on whether the incoming
> write overwrites the newly allocated block. If that's the case, then I
> suspect that means reads would just fall through to the block and return
> whatever was on disk. This code would probably need to tie into that
> zeroing mechanism one way or another to deal with that issue. (Though
> somebody who actually knows something about dm-thin should verify that.
> :)
> 

BTW, if that mechanism is in fact doing I/O, that might not be the
appropriate solution for fallocate. Perhaps we'd have to consider an
unwritten flag or some such in dm-thin, if possible.

Brian

> Brian
> 
> > (PS: I don't know enough about thinp to know if this has already been taken
> > care of.  I didn't see anything, but who knows what I missed. :))
> > 
> > --D
> > 
> > > +		if (error)
> > > +			return error;
> > > +
> > > +		if (res && *res)
> > > +			*res -= pool->sectors_per_block;
> > > +next:
> > > +		offset += pool->sectors_per_block;
> > > +	}
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +static int thin_reserve_space(struct dm_target *ti, int mode, sector_t offset,
> > > +			      sector_t len, sector_t *res)
> > > +{
> > > +	struct thin_c *tc = ti->private;
> > > +	struct pool *pool = tc->pool;
> > > +	sector_t blocks;
> > > +	unsigned long flags;
> > > +	int error;
> > > +
> > > +	if (mode == BDEV_RES_PROVISION)
> > > +		return thin_provision_space(ti, offset, len, res);
> > > +
> > > +	/* res required for get/set */
> > > +	error = -EINVAL;
> > > +	if (!res)
> > > +		return error;
> > > +
> > > +	if (mode == BDEV_RES_GET) {
> > > +		spin_lock_irqsave(&tc->pool->lock, flags);
> > > +		*res = tc->reserve_count * pool->sectors_per_block;
> > > +		spin_unlock_irqrestore(&tc->pool->lock, flags);
> > > +		error = 0;
> > > +	} else if (mode == BDEV_RES_MOD) {
> > > +		/*
> > > +		* @res must always be a factor of the pool's blocksize; upper
> > > +		* layers can rely on the bdev's minimum_io_size for this.
> > > +		*/
> > > +		if (!is_factor(*res, pool->sectors_per_block))
> > > +			return error;
> > > +
> > > +		blocks = *res;
> > > +		(void) sector_div(blocks, pool->sectors_per_block);
> > > +
> > > +		error = set_reserve_count(tc, blocks);
> > > +	}
> > > +
> > > +	return error;
> > > +}
> > > +
> > >  static struct target_type thin_target = {
> > >  	.name = "thin",
> > >  	.version = {1, 18, 0},
> > > @@ -4285,6 +4445,7 @@ static struct target_type thin_target = {
> > >  	.status = thin_status,
> > >  	.iterate_devices = thin_iterate_devices,
> > >  	.io_hints = thin_io_hints,
> > > +	.reserve_space = thin_reserve_space,
> > >  };
> > >  
> > >  /*----------------------------------------------------------------*/
> > > -- 
> > > 2.4.11
> > > 

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
Darrick J. Wong April 13, 2016, 9:01 p.m. UTC | #4
On Wed, Apr 13, 2016 at 04:41:18PM -0400, Brian Foster wrote:
> On Wed, Apr 13, 2016 at 02:33:52PM -0400, Brian Foster wrote:
> > On Wed, Apr 13, 2016 at 10:44:42AM -0700, Darrick J. Wong wrote:
> > > On Tue, Apr 12, 2016 at 12:42:48PM -0400, Brian Foster wrote:
> > > > From: Joe Thornber <ejt@redhat.com>
> > > > 
> > > > Experimental reserve interface for XFS guys to play with.
> > > > 
> > > > I have big reservations (no pun intended) about this patch.
> > > > 
> > > > [BF:
> > > >  - Support for reservation reduction.
> > > >  - Support for space provisioning.
> > > >  - Condensed to a single function.]
> > > > 
> > > > Not-Signed-off-by: Joe Thornber <ejt@redhat.com>
> > > > Not-Signed-off-by: Mike Snitzer <snitzer@redhat.com>
> > > > ---
> > > >  drivers/md/dm-thin.c | 181 ++++++++++++++++++++++++++++++++++++++++++++++++---
> > > >  1 file changed, 171 insertions(+), 10 deletions(-)
> > > > 
> > > > diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
> > > > index 92237b6..32bc5bd 100644
> > > > --- a/drivers/md/dm-thin.c
> > > > +++ b/drivers/md/dm-thin.c
> > ...
> > > > @@ -4271,6 +4343,94 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
> > > >  	limits->max_discard_sectors = 2048 * 1024 * 16; /* 16G */
> > > >  }
> > > >  
> > > > +static int thin_provision_space(struct dm_target *ti, sector_t offset,
> > > > +				sector_t len, sector_t *res)
> > > > +{
> > > > +	struct thin_c *tc = ti->private;
> > > > +	struct pool *pool = tc->pool;
> > > > +	sector_t end;
> > > > +	dm_block_t pblock;
> > > > +	dm_block_t vblock;
> > > > +	int error;
> > > > +	struct dm_thin_lookup_result lookup;
> > > > +
> > > > +	if (!is_factor(offset, pool->sectors_per_block))
> > > > +		return -EINVAL;
> > > > +
> > > > +	if (!len || !is_factor(len, pool->sectors_per_block))
> > > > +		return -EINVAL;
> > > > +
> > > > +	if (res && !is_factor(*res, pool->sectors_per_block))
> > > > +		return -EINVAL;
> > > > +
> > > > +	end = offset + len;
> > > > +
> > > > +	while (offset < end) {
> > > > +		vblock = offset;
> > > > +		do_div(vblock, pool->sectors_per_block);
> > > > +
> > > > +		error = dm_thin_find_block(tc->td, vblock, true, &lookup);
> > > > +		if (error == 0)
> > > > +			goto next;
> > > > +		if (error != -ENODATA)
> > > > +			return error;
> > > > +
> > > > +		error = alloc_data_block(tc, &pblock);
> > > 
> > > So this means that if fallocate wants to BDEV_RES_PROVISION N blocks, it must
> > > first increase the reservation (BDEV_RES_MOD) by N blocks to avoid using up
> > > space that was previously reserved by some other caller.  I think?
> > > 
> > 
> > Yes, assuming this is being called from a filesystem using the
> > reservation mechanism.
> > 
> > > > +		if (error)
> > > > +			return error;
> > > > +
> > > > +		error = dm_thin_insert_block(tc->td, vblock, pblock);
> > > 
> > > Having reserved and mapped blocks, what happens when we try to read them?
> > > Do we actually get zeroes, or does the read go straight through to whatever
> > > happens to be in the disk blocks?  I don't think it's correct that we could
> > > BDEV_RES_PROVISION and end up with stale credit card numbers from some other
> > > thin device.
> > > 
> > 
> > Agree, but I'm not really sure how this works in thinp tbh. fallocate
> > wasn't really on my mind when doing this. I was simply trying to cobble
> > together what I could to facilitate making progress on the fs parts
> > (e.g., I just needed a call that allocated blocks and consumed
> > reservation in the process).
> > 
> > Skimming through the dm-thin code, it looks like a (configurable) block
> > zeroing mechanism can be triggered from somewhere around
> > provision_block()->schedule_zero(), depending on whether the incoming
> > write overwrites the newly allocated block. If that's the case, then I
> > suspect that means reads would just fall through to the block and return
> > whatever was on disk. This code would probably need to tie into that
> > zeroing mechanism one way or another to deal with that issue. (Though
> > somebody who actually knows something about dm-thin should verify that.
> > :)
> > 
> 
> BTW, if that mechanism is in fact doing I/O, that might not be the
> appropriate solution for fallocate. Perhaps we'd have to consider an
> unwritten flag or some such in dm-thin, if possible.

The hard part is that we don't know if the caller actually has a way to
prevent userspace from seeing the stale contents (filesystems) or if we'd
be leaking data straight to userspace (user program calling fallocate).

(And yeah, here we go with NO_HIDE_STALE again...)

--D

> 
> Brian
> 
> > Brian
> > 
> > > (PS: I don't know enough about thinp to know if this has already been taken
> > > care of.  I didn't see anything, but who knows what I missed. :))
> > > 
> > > --D
> > > 
> > > > +		if (error)
> > > > +			return error;
> > > > +
> > > > +		if (res && *res)
> > > > +			*res -= pool->sectors_per_block;
> > > > +next:
> > > > +		offset += pool->sectors_per_block;
> > > > +	}
> > > > +
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +static int thin_reserve_space(struct dm_target *ti, int mode, sector_t offset,
> > > > +			      sector_t len, sector_t *res)
> > > > +{
> > > > +	struct thin_c *tc = ti->private;
> > > > +	struct pool *pool = tc->pool;
> > > > +	sector_t blocks;
> > > > +	unsigned long flags;
> > > > +	int error;
> > > > +
> > > > +	if (mode == BDEV_RES_PROVISION)
> > > > +		return thin_provision_space(ti, offset, len, res);
> > > > +
> > > > +	/* res required for get/set */
> > > > +	error = -EINVAL;
> > > > +	if (!res)
> > > > +		return error;
> > > > +
> > > > +	if (mode == BDEV_RES_GET) {
> > > > +		spin_lock_irqsave(&tc->pool->lock, flags);
> > > > +		*res = tc->reserve_count * pool->sectors_per_block;
> > > > +		spin_unlock_irqrestore(&tc->pool->lock, flags);
> > > > +		error = 0;
> > > > +	} else if (mode == BDEV_RES_MOD) {
> > > > +		/*
> > > > +		* @res must always be a factor of the pool's blocksize; upper
> > > > +		* layers can rely on the bdev's minimum_io_size for this.
> > > > +		*/
> > > > +		if (!is_factor(*res, pool->sectors_per_block))
> > > > +			return error;
> > > > +
> > > > +		blocks = *res;
> > > > +		(void) sector_div(blocks, pool->sectors_per_block);
> > > > +
> > > > +		error = set_reserve_count(tc, blocks);
> > > > +	}
> > > > +
> > > > +	return error;
> > > > +}
> > > > +
> > > >  static struct target_type thin_target = {
> > > >  	.name = "thin",
> > > >  	.version = {1, 18, 0},
> > > > @@ -4285,6 +4445,7 @@ static struct target_type thin_target = {
> > > >  	.status = thin_status,
> > > >  	.iterate_devices = thin_iterate_devices,
> > > >  	.io_hints = thin_io_hints,
> > > > +	.reserve_space = thin_reserve_space,
> > > >  };
> > > >  
> > > >  /*----------------------------------------------------------------*/
> > > > -- 
> > > > 2.4.11
> > > > 

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
Mike Snitzer April 14, 2016, 3:10 p.m. UTC | #5
On Wed, Apr 13 2016 at  4:41pm -0400,
Brian Foster <bfoster@redhat.com> wrote:

> On Wed, Apr 13, 2016 at 02:33:52PM -0400, Brian Foster wrote:
> > On Wed, Apr 13, 2016 at 10:44:42AM -0700, Darrick J. Wong wrote:
> > > On Tue, Apr 12, 2016 at 12:42:48PM -0400, Brian Foster wrote:
> > > > From: Joe Thornber <ejt@redhat.com>
> > > > 
> > > > Experimental reserve interface for XFS guys to play with.
> > > > 
> > > > I have big reservations (no pun intended) about this patch.
> > > > 
> > > > [BF:
> > > >  - Support for reservation reduction.
> > > >  - Support for space provisioning.
> > > >  - Condensed to a single function.]
> > > > 
> > > > Not-Signed-off-by: Joe Thornber <ejt@redhat.com>
> > > > Not-Signed-off-by: Mike Snitzer <snitzer@redhat.com>
> > > > ---
> > > >  drivers/md/dm-thin.c | 181 ++++++++++++++++++++++++++++++++++++++++++++++++---
> > > >  1 file changed, 171 insertions(+), 10 deletions(-)
> > > > 
> > > > diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
> > > > index 92237b6..32bc5bd 100644
> > > > --- a/drivers/md/dm-thin.c
> > > > +++ b/drivers/md/dm-thin.c
> > ...
> > > > @@ -4271,6 +4343,94 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
> > > >  	limits->max_discard_sectors = 2048 * 1024 * 16; /* 16G */
> > > >  }
> > > >  
> > > > +static int thin_provision_space(struct dm_target *ti, sector_t offset,
> > > > +				sector_t len, sector_t *res)
> > > > +{
> > > > +	struct thin_c *tc = ti->private;
> > > > +	struct pool *pool = tc->pool;
> > > > +	sector_t end;
> > > > +	dm_block_t pblock;
> > > > +	dm_block_t vblock;
> > > > +	int error;
> > > > +	struct dm_thin_lookup_result lookup;
> > > > +
> > > > +	if (!is_factor(offset, pool->sectors_per_block))
> > > > +		return -EINVAL;
> > > > +
> > > > +	if (!len || !is_factor(len, pool->sectors_per_block))
> > > > +		return -EINVAL;
> > > > +
> > > > +	if (res && !is_factor(*res, pool->sectors_per_block))
> > > > +		return -EINVAL;
> > > > +
> > > > +	end = offset + len;
> > > > +
> > > > +	while (offset < end) {
> > > > +		vblock = offset;
> > > > +		do_div(vblock, pool->sectors_per_block);
> > > > +
> > > > +		error = dm_thin_find_block(tc->td, vblock, true, &lookup);
> > > > +		if (error == 0)
> > > > +			goto next;
> > > > +		if (error != -ENODATA)
> > > > +			return error;
> > > > +
> > > > +		error = alloc_data_block(tc, &pblock);
> > > 
> > > So this means that if fallocate wants to BDEV_RES_PROVISION N blocks, it must
> > > first increase the reservation (BDEV_RES_MOD) by N blocks to avoid using up
> > > space that was previously reserved by some other caller.  I think?
> > > 
> > 
> > Yes, assuming this is being called from a filesystem using the
> > reservation mechanism.

Brian, I need to circle back with you to understand why XFS even needs
reservation as opposed to just using something like fallocate (which
would provision the space before you actually initiate the IO that would
use it).  But we can discuss that in person and then report back to the
list if it makes it easier...
 
> > > > +		if (error)
> > > > +			return error;
> > > > +
> > > > +		error = dm_thin_insert_block(tc->td, vblock, pblock);
> > > 
> > > Having reserved and mapped blocks, what happens when we try to read them?
> > > Do we actually get zeroes, or does the read go straight through to whatever
> > > happens to be in the disk blocks?  I don't think it's correct that we could
> > > BDEV_RES_PROVISION and end up with stale credit card numbers from some other
> > > thin device.
> > > 
> > 
> > Agree, but I'm not really sure how this works in thinp tbh. fallocate
> > wasn't really on my mind when doing this. I was simply trying to cobble
> > together what I could to facilitate making progress on the fs parts
> > (e.g., I just needed a call that allocated blocks and consumed
> > reservation in the process).
> > 
> > Skimming through the dm-thin code, it looks like a (configurable) block
> > zeroing mechanism can be triggered from somewhere around
> > provision_block()->schedule_zero(), depending on whether the incoming
> > write overwrites the newly allocated block. If that's the case, then I
> > suspect that means reads would just fall through to the block and return
> > whatever was on disk. This code would probably need to tie into that
> > zeroing mechanism one way or another to deal with that issue. (Though
> > somebody who actually knows something about dm-thin should verify that.
> > :)
> > 
> 
> BTW, if that mechanism is in fact doing I/O, that might not be the
> appropriate solution for fallocate. Perhaps we'd have to consider an
> unwritten flag or some such in dm-thin, if possible.

DM thinp defaults to enabling 'zero_new_blocks' (can be disabled using
the 'skip_block_zeroing' feature when loading the DM table for the
thin-pool).  With block-zeroing any blocks that are provisioned _will_
be overwritten with zeroes (using dm-kcopyd which is trained to use
WRITE_SAME if supported).
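
(For reference, disabling it means loading a pool table along the lines
of:

  0 2097152 thin-pool /dev/vg/meta /dev/vg/data 128 32768 1 skip_block_zeroing

with made-up device names and sizes; omit the trailing "1
skip_block_zeroing" feature args to keep block zeroing enabled.)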

But yeah, for fallocate.. certainly not something we want as it defeats
the point of fallocate being cheap.

So we probably would need a flag comparable to the
ext4-stale-flag-that-shall-not-be-named ;)

Mike

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
Brian Foster April 14, 2016, 4:23 p.m. UTC | #6
On Thu, Apr 14, 2016 at 11:10:14AM -0400, Mike Snitzer wrote:
> On Wed, Apr 13 2016 at  4:41pm -0400,
> Brian Foster <bfoster@redhat.com> wrote:
> 
> > On Wed, Apr 13, 2016 at 02:33:52PM -0400, Brian Foster wrote:
> > > On Wed, Apr 13, 2016 at 10:44:42AM -0700, Darrick J. Wong wrote:
> > > > On Tue, Apr 12, 2016 at 12:42:48PM -0400, Brian Foster wrote:
> > > > > From: Joe Thornber <ejt@redhat.com>
> > > > > 
> > > > > Experimental reserve interface for XFS guys to play with.
> > > > > 
> > > > > I have big reservations (no pun intended) about this patch.
> > > > > 
> > > > > [BF:
> > > > >  - Support for reservation reduction.
> > > > >  - Support for space provisioning.
> > > > >  - Condensed to a single function.]
> > > > > 
> > > > > Not-Signed-off-by: Joe Thornber <ejt@redhat.com>
> > > > > Not-Signed-off-by: Mike Snitzer <snitzer@redhat.com>
> > > > > ---
> > > > >  drivers/md/dm-thin.c | 181 ++++++++++++++++++++++++++++++++++++++++++++++++---
> > > > >  1 file changed, 171 insertions(+), 10 deletions(-)
> > > > > 
> > > > > diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
> > > > > index 92237b6..32bc5bd 100644
> > > > > --- a/drivers/md/dm-thin.c
> > > > > +++ b/drivers/md/dm-thin.c
> > > ...
> > > > > @@ -4271,6 +4343,94 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
> > > > >  	limits->max_discard_sectors = 2048 * 1024 * 16; /* 16G */
> > > > >  }
> > > > >  
> > > > > +static int thin_provision_space(struct dm_target *ti, sector_t offset,
> > > > > +				sector_t len, sector_t *res)
> > > > > +{
> > > > > +	struct thin_c *tc = ti->private;
> > > > > +	struct pool *pool = tc->pool;
> > > > > +	sector_t end;
> > > > > +	dm_block_t pblock;
> > > > > +	dm_block_t vblock;
> > > > > +	int error;
> > > > > +	struct dm_thin_lookup_result lookup;
> > > > > +
> > > > > +	if (!is_factor(offset, pool->sectors_per_block))
> > > > > +		return -EINVAL;
> > > > > +
> > > > > +	if (!len || !is_factor(len, pool->sectors_per_block))
> > > > > +		return -EINVAL;
> > > > > +
> > > > > +	if (res && !is_factor(*res, pool->sectors_per_block))
> > > > > +		return -EINVAL;
> > > > > +
> > > > > +	end = offset + len;
> > > > > +
> > > > > +	while (offset < end) {
> > > > > +		vblock = offset;
> > > > > +		do_div(vblock, pool->sectors_per_block);
> > > > > +
> > > > > +		error = dm_thin_find_block(tc->td, vblock, true, &lookup);
> > > > > +		if (error == 0)
> > > > > +			goto next;
> > > > > +		if (error != -ENODATA)
> > > > > +			return error;
> > > > > +
> > > > > +		error = alloc_data_block(tc, &pblock);
> > > > 
> > > > So this means that if fallocate wants to BDEV_RES_PROVISION N blocks, it must
> > > > first increase the reservation (BDEV_RES_MOD) by N blocks to avoid using up
> > > > space that was previously reserved by some other caller.  I think?
> > > > 
> > > 
> > > Yes, assuming this is being called from a filesystem using the
> > > reservation mechanism.
> 
> Brian, I need to circle back with you to understand why XFS even needs
> reservation as opposed to just using something like fallocate (which
> would provision the space before you actually initiate the IO that would
> use it).  But we can discuss that in person and then report back to the
> list if it makes it easier...
>  

The primary reason is delayed allocation. Buffered writes to the fs copy
data into the pagecache before the physical space has been allocated.
E.g., we only modify the free blocks counters at write() time in order
to guarantee that we have space somewhere in the fs. The physical
extents aren't allocated until later at writeback time.

So reservation from dm-thin basically extends the mechanism to also
guarantee that the underlying thin volume has space for writes that
we've received but haven't written back yet.
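
Stripped of all the XFS details, the flow the prototype is aiming for is
roughly the following (toy model in plain C, untested, helper names are
made up; in the patch the "consume" side is what alloc_data_block() does
via dec_reserve_count()):

#include <stdbool.h>
#include <stdio.h>

static unsigned long long fs_free_blocks = 1000;	/* fs free space */
static unsigned long long bdev_reserved;	/* blocks reserved from dm-thin */

/*
 * write() time: nothing is physically allocated, but both the fs free
 * space counter and the thin device reservation are charged up front so
 * that writeback can't fail with ENOSPC later.
 */
static bool buffered_write(unsigned long long blocks)
{
	if (fs_free_blocks < blocks)
		return false;
	fs_free_blocks -= blocks;
	bdev_reserved += blocks;	/* stand-in for a BDEV_RES_MOD call */
	return true;
}

/* writeback time: the real allocation consumes the reservation made above. */
static void writeback(unsigned long long blocks)
{
	bdev_reserved -= blocks;
}

int main(void)
{
	if (buffered_write(100))	/* delalloc: space promised, nothing mapped */
		writeback(100);		/* extents allocated, reservation released */
	printf("fs free %llu, bdev reserved %llu\n",
	       fs_free_blocks, bdev_reserved);
	return 0;
}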

> > > > > +		if (error)
> > > > > +			return error;
> > > > > +
> > > > > +		error = dm_thin_insert_block(tc->td, vblock, pblock);
> > > > 
> > > > Having reserved and mapped blocks, what happens when we try to read them?
> > > > Do we actually get zeroes, or does the read go straight through to whatever
> > > > happens to be in the disk blocks?  I don't think it's correct that we could
> > > > BDEV_RES_PROVISION and end up with stale credit card numbers from some other
> > > > thin device.
> > > > 
> > > 
> > > Agree, but I'm not really sure how this works in thinp tbh. fallocate
> > > wasn't really on my mind when doing this. I was simply trying to cobble
> > > together what I could to facilitate making progress on the fs parts
> > > (e.g., I just needed a call that allocated blocks and consumed
> > > reservation in the process).
> > > 
> > > Skimming through the dm-thin code, it looks like a (configurable) block
> > > zeroing mechanism can be triggered from somewhere around
> > > provision_block()->schedule_zero(), depending on whether the incoming
> > > write overwrites the newly allocated block. If that's the case, then I
> > > suspect that means reads would just fall through to the block and return
> > > whatever was on disk. This code would probably need to tie into that
> > > zeroing mechanism one way or another to deal with that issue. (Though
> > > somebody who actually knows something about dm-thin should verify that.
> > > :)
> > > 
> > 
> > BTW, if that mechanism is in fact doing I/O, that might not be the
> > appropriate solution for fallocate. Perhaps we'd have to consider an
> > unwritten flag or some such in dm-thin, if possible.
> 
> DM thinp defaults to enabling 'zero_new_blocks' (can be disabled using
> the 'skip_block_zeroing' feature when loading the DM table for the
> thin-pool).  With block-zeroing any blocks that are provisioned _will_
> be overwritten with zeroes (using dm-kcopyd which is trained to use
> WRITE_SAME if supported).
> 

Ok, thanks.

> But yeah, for fallocate.. certainly not something we want as it defeats
> the point of fallocate being cheap.
> 

Indeed.

> So we probably would need a flag comparable to the
> ext4-stale-flag-that-shall-not-be-named ;)
> 

Any chance to support an unwritten flag for all blocks that are
allocated via fallocate? E.g., subsequent reads detect the flag and
return zeroes as if the block wasn't there and a subsequent write clears
the flag (doing any partial block zeroing that might be necessary as
well).
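
In case it helps, the semantics I have in mind amount to no more than
this (toy model, plain C, untested, nothing to do with how thinp would
actually store such a flag):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define BLOCK_SIZE 8	/* absurdly small, just for the demo */

struct block {
	bool unwritten;		/* provisioned via fallocate, never written */
	char data[BLOCK_SIZE];	/* whatever happened to be on disk */
};

static void read_block(const struct block *b, char *out)
{
	if (b->unwritten)
		memset(out, 0, BLOCK_SIZE);	/* never expose stale contents */
	else
		memcpy(out, b->data, BLOCK_SIZE);
}

static void write_block(struct block *b, unsigned int off,
			const char *buf, unsigned int len)
{
	if (b->unwritten) {
		memset(b->data, 0, BLOCK_SIZE);	/* zero around a partial write */
		b->unwritten = false;
	}
	memcpy(b->data + off, buf, len);
}

int main(void)
{
	struct block b = { .unwritten = true, .data = "stale!!" };
	char out[BLOCK_SIZE];

	read_block(&b, out);
	printf("stale data hidden: %d\n", out[0] == 0);		/* 1 */
	write_block(&b, 2, "hi", 2);				/* clears the flag */
	read_block(&b, out);
	printf("after write: %d %c%c\n", out[0], out[2], out[3]);	/* 0 h i */
	return 0;
}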

Brian

> Mike
> 

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
Mike Snitzer April 14, 2016, 8:18 p.m. UTC | #7
On Thu, Apr 14 2016 at 12:23pm -0400,
Brian Foster <bfoster@redhat.com> wrote:

> On Thu, Apr 14, 2016 at 11:10:14AM -0400, Mike Snitzer wrote:
> > 
> > Brian, I need to circle back with you to understand why XFS even needs
> > reservation as opposed to just using something like fallocate (which
> > would provision the space before you actually initiate the IO that would
> > use it).  But we can discuss that in person and then report back to the
> > list if it makes it easier...
> >  
> 
> The primary reason is delayed allocation. Buffered writes to the fs copy
> data into the pagecache before the physical space has been allocated.
> E.g., we only modify the free blocks counters at write() time in order
> to guarantee that we have space somewhere in the fs. The physical
> extents aren't allocated until later at writeback time.
> 
> So reservation from dm-thin basically extends the mechanism to also
> guarantee that the underlying thin volume has space for writes that
> we've received but haven't written back yet.

OK, so even if/when we have bdev_fallocate support that would be more
rigid than XFS would like.

As you've said, the XFS established reservation is larger than is really
needed.  Whereas regularly provisioning more than is actually needed is
a recipe for disaster.

> > > > > > +		if (error)
> > > > > > +			return error;
> > > > > > +
> > > > > > +		error = dm_thin_insert_block(tc->td, vblock, pblock);
> > > > > 
> > > > > Having reserved and mapped blocks, what happens when we try to read them?
> > > > > Do we actually get zeroes, or does the read go straight through to whatever
> > > > > happens to be in the disk blocks?  I don't think it's correct that we could
> > > > > BDEV_RES_PROVISION and end up with stale credit card numbers from some other
> > > > > thin device.
> > > > > 
> > > > 
> > > > Agree, but I'm not really sure how this works in thinp tbh. fallocate
> > > > wasn't really on my mind when doing this. I was simply trying to cobble
> > > > together what I could to facilitate making progress on the fs parts
> > > > (e.g., I just needed a call that allocated blocks and consumed
> > > > reservation in the process).
> > > > 
> > > > Skimming through the dm-thin code, it looks like a (configurable) block
> > > > zeroing mechanism can be triggered from somewhere around
> > > > provision_block()->schedule_zero(), depending on whether the incoming
> > > > write overwrites the newly allocated block. If that's the case, then I
> > > > suspect that means reads would just fall through to the block and return
> > > > whatever was on disk. This code would probably need to tie into that
> > > > zeroing mechanism one way or another to deal with that issue. (Though
> > > > somebody who actually knows something about dm-thin should verify that.
> > > > :)
> > > > 
> > > 
> > > BTW, if that mechanism is in fact doing I/O, that might not be the
> > > appropriate solution for fallocate. Perhaps we'd have to consider an
> > > unwritten flag or some such in dm-thin, if possible.
> > 
> > DM thinp defaults to enabling 'zero_new_blocks' (can be disabled using
> > the 'skip_block_zeroing' feature when loading the DM table for the
> > thin-pool).  With block-zeroing any blocks that are provisioned _will_
> > be overwritten with zeroes (using dm-kcopyd which is trained to use
> > WRITE_SAME if supported).
> > 
> 
> Ok, thanks.
> 
> > But yeah, for fallocate.. certainly not something we want as it defeats
> > the point of fallocate being cheap.
> > 
> 
> Indeed.
> 
> > So we probably would need a flag comparable to the
> > ext4-stale-flag-that-shall-not-be-named ;)
> > 
> 
> Any chance to support an unwritten flag for all blocks that are
> allocated via fallocate? E.g., subsequent reads detect the flag and
> return zeroes as if the block wasn't there and a subsequent write clears
> the flag (doing any partial block zeroing that might be necessary as
> well).

Yeah, I've already started talking to Joe about doing exactly that.
Without it we cannot securely provide fallocate support in DM thinp.

I'll keep discussing with Joe... he doesn't like this requirement but
we'll work through it.

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
Brian Foster April 15, 2016, 11:48 a.m. UTC | #8
On Thu, Apr 14, 2016 at 04:18:12PM -0400, Mike Snitzer wrote:
> On Thu, Apr 14 2016 at 12:23pm -0400,
> Brian Foster <bfoster@redhat.com> wrote:
> 
> > On Thu, Apr 14, 2016 at 11:10:14AM -0400, Mike Snitzer wrote:
> > > 
> > > Brian, I need to circle back with you to understand why XFS even needs
> > > reservation as opposed to just using something like fallocate (which
> > > would provision the space before you actually initiate the IO that would
> > > use it).  But we can discuss that in person and then report back to the
> > > list if it makes it easier...
> > >  
> > 
> > The primary reason is delayed allocation. Buffered writes to the fs copy
> > data into the pagecache before the physical space has been allocated.
> > E.g., we only modify the free blocks counters at write() time in order
> > to guarantee that we have space somewhere in the fs. The physical
> > extents aren't allocated until later at writeback time.
> > 
> > So reservation from dm-thin basically extends the mechanism to also
> > guarantee that the underlying thin volume has space for writes that
> > we've received but haven't written back yet.
> 
> OK, so even if/when we have bdev_fallocate support that would be more
> rigid than XFS would like.
> 

Yeah, fallocate is still useful on its own. For example, we could still
invoke bdev_fallocate() in response to userspace fallocate to ensure the
space is physically allocated (i.e., provide the no -ENOSPC guarantee).

That just doesn't help us avoid the overprovisioned situation where we
have data in pagecache and nowhere to write it back to (w/o setting the
volume read-only). The only way I'm aware of to handle that is to
account for the space at write time.

> As you've said, the XFS established reservation is larger than is really
> needed.  Whereas regularly provisioning more than is actually needed is
> a recipe for disaster.
> 

Indeed, this prototype ties right into XFS' existing transaction
reservation mechanism. It basically adds bdev reservation to the blocks
that we already locally reserve during creation of a transaction or a
delalloc write. This already does worst case reservation. What's
interesting is that the worst case 1-1 fs-dm block reservation doesn't
appear to be as much of a functional impediment as anticipated. I think
that's because XFS is already designed for such worst case reservations
and has mechanisms in place to handle it in appropriate situations.

For example, if an incoming write fails to reserve blocks due to too
much outstanding reservation, xfs_file_buffered_aio_write() will do
things like flush inodes and run our post-eof (for speculatively
preallocated space) scanner to reclaim some of that space and retry
before it gives up.

I'm sure there's a performance issue in there somewhere when that whole
sequence occurs more frequently than normal due to the amplified
reservation (a 4k fs block reserving 64k or more in the dm vol), but I
don't think that's necessarily a disaster scenario.
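
(For a rough sense of the amplification: with 4k fs blocks and a 64k
thin-pool block size (sectors_per_block = 128), a worst-case 1-1
reservation for a 1MB delalloc write is 256 fs blocks -> 256 thin
blocks, i.e. 16MB of reserved pool space for 1MB of dirty data, a 16x
blow-up. The numbers are just an illustration; the real reservation
depends on the transaction.)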

Brian


> > > > > > > +		if (error)
> > > > > > > +			return error;
> > > > > > > +
> > > > > > > +		error = dm_thin_insert_block(tc->td, vblock, pblock);
> > > > > > 
> > > > > > Having reserved and mapped blocks, what happens when we try to read them?
> > > > > > Do we actually get zeroes, or does the read go straight through to whatever
> > > > > > happens to be in the disk blocks?  I don't think it's correct that we could
> > > > > > BDEV_RES_PROVISION and end up with stale credit card numbers from some other
> > > > > > thin device.
> > > > > > 
> > > > > 
> > > > > Agree, but I'm not really sure how this works in thinp tbh. fallocate
> > > > > wasn't really on my mind when doing this. I was simply trying to cobble
> > > > > together what I could to facilitate making progress on the fs parts
> > > > > (e.g., I just needed a call that allocated blocks and consumed
> > > > > reservation in the process).
> > > > > 
> > > > > Skimming through the dm-thin code, it looks like a (configurable) block
> > > > > zeroing mechanism can be triggered from somewhere around
> > > > > provision_block()->schedule_zero(), depending on whether the incoming
> > > > > write overwrites the newly allocated block. If that's the case, then I
> > > > > suspect that means reads would just fall through to the block and return
> > > > > whatever was on disk. This code would probably need to tie into that
> > > > > zeroing mechanism one way or another to deal with that issue. (Though
> > > > > somebody who actually knows something about dm-thin should verify that.
> > > > > :)
> > > > > 
> > > > 
> > > > BTW, if that mechanism is in fact doing I/O, that might not be the
> > > > appropriate solution for fallocate. Perhaps we'd have to consider an
> > > > unwritten flag or some such in dm-thin, if possible.
> > > 
> > > DM thinp defaults to enabling 'zero_new_blocks' (can be disabled using
> > > the 'skip_block_zeroing' feature when loading the DM table for the
> > > thin-pool).  With block-zeroing any blocks that are provisioned _will_
> > > be overwritten with zeroes (using dm-kcopyd which is trained to use
> > > WRITE_SAME if supported).
> > > 
> > 
> > Ok, thanks.
> > 
> > > But yeah, for fallocate.. certainly not something we want as it defeats
> > > the point of fallocate being cheap.
> > > 
> > 
> > Indeed.
> > 
> > > So we probably would need a flag comparable to the
> > > ext4-stale-flag-that-shall-not-be-named ;)
> > > 
> > 
> > Any chance to support an unwritten flag for all blocks that are
> > allocated via fallocate? E.g., subsequent reads detect the flag and
> > return zeroes as if the block wasn't there and a subsequent write clears
> > the flag (doing any partial block zeroing that might be necessary as
> > well).
> 
> Yeah, I've already started talking to Joe about doing exactly that.
> Without it we cannot securely provide fallocate support in DM thinp.
> 
> I'll keep discussing with Joe... he doesn't like this requirement but
> we'll work through it.
> 

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel

Patch

diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 92237b6..32bc5bd 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -271,6 +271,8 @@  struct pool {
 	process_mapping_fn process_prepared_discard;
 
 	struct dm_bio_prison_cell **cell_sort_array;
+
+	dm_block_t reserve_count;
 };
 
 static enum pool_mode get_pool_mode(struct pool *pool);
@@ -318,6 +320,8 @@  struct thin_c {
 	 */
 	atomic_t refcount;
 	struct completion can_destroy;
+
+	dm_block_t reserve_count;
 };
 
 /*----------------------------------------------------------------*/
@@ -1359,24 +1363,19 @@  static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
 	}
 }
 
-static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
+static int get_free_blocks(struct pool *pool, dm_block_t *free_blocks)
 {
 	int r;
-	dm_block_t free_blocks;
-	struct pool *pool = tc->pool;
-
-	if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
-		return -EINVAL;
 
-	r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
+	r = dm_pool_get_free_block_count(pool->pmd, free_blocks);
 	if (r) {
 		metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
 		return r;
 	}
 
-	check_low_water_mark(pool, free_blocks);
+	check_low_water_mark(pool, *free_blocks);
 
-	if (!free_blocks) {
+	if (!*free_blocks) {
 		/*
 		 * Try to commit to see if that will free up some
 		 * more space.
@@ -1385,7 +1384,7 @@  static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
 		if (r)
 			return r;
 
-		r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
+		r = dm_pool_get_free_block_count(pool->pmd, free_blocks);
 		if (r) {
 			metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
 			return r;
@@ -1397,6 +1396,76 @@  static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
 		}
 	}
 
+	return r;
+}
+
+/*
+ * Returns true iff either:
+ * i) decrement succeeded (ie. there was reserve left)
+ * ii) there is extra space in the pool
+ */
+static bool dec_reserve_count(struct thin_c *tc, dm_block_t free_blocks)
+{
+	bool r = false;
+	unsigned long flags;
+
+	if (!free_blocks)
+		return false;
+
+	spin_lock_irqsave(&tc->pool->lock, flags);
+	if (tc->reserve_count > 0) {
+		tc->reserve_count--;
+		tc->pool->reserve_count--;
+		r = true;
+	} else {
+		if (free_blocks > tc->pool->reserve_count)
+			r = true;
+	}
+	spin_unlock_irqrestore(&tc->pool->lock, flags);
+
+	return r;
+}
+
+static int set_reserve_count(struct thin_c *tc, dm_block_t count)
+{
+	int r;
+	dm_block_t free_blocks;
+	int64_t delta;
+	unsigned long flags;
+
+	r = get_free_blocks(tc->pool, &free_blocks);
+	if (r)
+		return r;
+
+	spin_lock_irqsave(&tc->pool->lock, flags);
+	delta = count - tc->reserve_count;
+	if (tc->pool->reserve_count + delta > free_blocks)
+		r = -ENOSPC;
+	else {
+		tc->reserve_count = count;
+		tc->pool->reserve_count += delta;
+	}
+	spin_unlock_irqrestore(&tc->pool->lock, flags);
+
+	return r;
+}
+
+static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
+{
+	int r;
+	dm_block_t free_blocks;
+	struct pool *pool = tc->pool;
+
+	if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
+		return -EINVAL;
+
+	r = get_free_blocks(tc->pool, &free_blocks);
+	if (r)
+		return r;
+
+	if (!dec_reserve_count(tc, free_blocks))
+		return -ENOSPC;
+
 	r = dm_pool_alloc_data_block(pool->pmd, result);
 	if (r) {
 		metadata_operation_failed(pool, "dm_pool_alloc_data_block", r);
@@ -2880,6 +2949,7 @@  static struct pool *pool_create(struct mapped_device *pool_md,
 	pool->last_commit_jiffies = jiffies;
 	pool->pool_md = pool_md;
 	pool->md_dev = metadata_dev;
+	pool->reserve_count = 0;
 	__pool_table_insert(pool);
 
 	return pool;
@@ -3936,6 +4006,7 @@  static void thin_dtr(struct dm_target *ti)
 
 	spin_lock_irqsave(&tc->pool->lock, flags);
 	list_del_rcu(&tc->list);
+	tc->pool->reserve_count -= tc->reserve_count;
 	spin_unlock_irqrestore(&tc->pool->lock, flags);
 	synchronize_rcu();
 
@@ -4074,6 +4145,7 @@  static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	init_completion(&tc->can_destroy);
 	list_add_tail_rcu(&tc->list, &tc->pool->active_thins);
 	spin_unlock_irqrestore(&tc->pool->lock, flags);
+	tc->reserve_count = 0;
 	/*
 	 * This synchronize_rcu() call is needed here otherwise we risk a
 	 * wake_worker() call finding no bios to process (because the newly
@@ -4271,6 +4343,94 @@  static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
 	limits->max_discard_sectors = 2048 * 1024 * 16; /* 16G */
 }
 
+static int thin_provision_space(struct dm_target *ti, sector_t offset,
+				sector_t len, sector_t *res)
+{
+	struct thin_c *tc = ti->private;
+	struct pool *pool = tc->pool;
+	sector_t end;
+	dm_block_t pblock;
+	dm_block_t vblock;
+	int error;
+	struct dm_thin_lookup_result lookup;
+
+	if (!is_factor(offset, pool->sectors_per_block))
+		return -EINVAL;
+
+	if (!len || !is_factor(len, pool->sectors_per_block))
+		return -EINVAL;
+
+	if (res && !is_factor(*res, pool->sectors_per_block))
+		return -EINVAL;
+
+	end = offset + len;
+
+	while (offset < end) {
+		vblock = offset;
+		do_div(vblock, pool->sectors_per_block);
+
+		error = dm_thin_find_block(tc->td, vblock, true, &lookup);
+		if (error == 0)
+			goto next;
+		if (error != -ENODATA)
+			return error;
+
+		error = alloc_data_block(tc, &pblock);
+		if (error)
+			return error;
+
+		error = dm_thin_insert_block(tc->td, vblock, pblock);
+		if (error)
+			return error;
+
+		if (res && *res)
+			*res -= pool->sectors_per_block;
+next:
+		offset += pool->sectors_per_block;
+	}
+
+	return 0;
+}
+
+static int thin_reserve_space(struct dm_target *ti, int mode, sector_t offset,
+			      sector_t len, sector_t *res)
+{
+	struct thin_c *tc = ti->private;
+	struct pool *pool = tc->pool;
+	sector_t blocks;
+	unsigned long flags;
+	int error;
+
+	if (mode == BDEV_RES_PROVISION)
+		return thin_provision_space(ti, offset, len, res);
+
+	/* res required for get/set */
+	error = -EINVAL;
+	if (!res)
+		return error;
+
+	if (mode == BDEV_RES_GET) {
+		spin_lock_irqsave(&tc->pool->lock, flags);
+		*res = tc->reserve_count * pool->sectors_per_block;
+		spin_unlock_irqrestore(&tc->pool->lock, flags);
+		error = 0;
+	} else if (mode == BDEV_RES_MOD) {
+		/*
+		* @res must always be a factor of the pool's blocksize; upper
+		* layers can rely on the bdev's minimum_io_size for this.
+		*/
+		if (!is_factor(*res, pool->sectors_per_block))
+			return error;
+
+		blocks = *res;
+		(void) sector_div(blocks, pool->sectors_per_block);
+
+		error = set_reserve_count(tc, blocks);
+	}
+
+	return error;
+}
+
 static struct target_type thin_target = {
 	.name = "thin",
 	.version = {1, 18, 0},
@@ -4285,6 +4445,7 @@  static struct target_type thin_target = {
 	.status = thin_status,
 	.iterate_devices = thin_iterate_devices,
 	.io_hints = thin_io_hints,
+	.reserve_space = thin_reserve_space,
 };
 
 /*----------------------------------------------------------------*/