diff mbox

Btrfs: incompatible format change to remove hole extents V2

Message ID 1383054986-21707-1-git-send-email-jbacik@fusionio.com (mailing list archive)
State Superseded, archived
Headers show

Commit Message

Josef Bacik Oct. 29, 2013, 1:56 p.m. UTC
Btrfs has always had these filler extent data items for holes in inodes.  This
has made somethings very easy, like logging hole punches and sending hole
punches.  However for large holey files these extent data items are pure
overhead.  So add an incompatible feature to no longer add hole extents to
reduce the amount of metadata used by these sort of files.  This has a few
changes for logging and send obviously since they will need to detect holes and
log/send the holes if there are any.  I've tested this thoroughly with xfstests
and it doesn't cause any issues with and without the incompat format set.
Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
V1->V2: my snapshot exerciser pointed out I don't understand how INLINE extents
work, fixed this.

 fs/btrfs/ctree.h    |   4 +-
 fs/btrfs/file.c     |  13 ++++--
 fs/btrfs/inode.c    |  78 ++++++++++++++++++++-------------
 fs/btrfs/send.c     | 122 +++++++++++++++++++++++++++++++++++++++++++++-------
 fs/btrfs/tree-log.c |  56 +++++++++++++++++++++---
 5 files changed, 217 insertions(+), 56 deletions(-)

Comments

Jeff Mahoney Oct. 29, 2013, 6:15 p.m. UTC | #1
On 10/29/13, 9:56 AM, Josef Bacik wrote:
> Btrfs has always had these filler extent data items for holes in inodes.  This
> has made somethings very easy, like logging hole punches and sending hole
> punches.  However for large holey files these extent data items are pure
> overhead.  So add an incompatible feature to no longer add hole extents to
> reduce the amount of metadata used by these sort of files.  This has a few
> changes for logging and send obviously since they will need to detect holes and
> log/send the holes if there are any.  I've tested this thoroughly with xfstests
> and it doesn't cause any issues with and without the incompat format set.
> Thanks,

This sounds like it could be a candidate for the online-enable mask in
my feature ioctl patchset.

-Jeff

> Signed-off-by: Josef Bacik <jbacik@fusionio.com>
> ---
> V1->V2: my snapshot exerciser pointed out I don't understand how INLINE extents
> work, fixed this.
> 
>  fs/btrfs/ctree.h    |   4 +-
>  fs/btrfs/file.c     |  13 ++++--
>  fs/btrfs/inode.c    |  78 ++++++++++++++++++++-------------
>  fs/btrfs/send.c     | 122 +++++++++++++++++++++++++++++++++++++++++++++-------
>  fs/btrfs/tree-log.c |  56 +++++++++++++++++++++---
>  5 files changed, 217 insertions(+), 56 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 9f5e1cf..0a008d8 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -521,6 +521,7 @@ struct btrfs_super_block {
>  #define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF	(1ULL << 6)
>  #define BTRFS_FEATURE_INCOMPAT_RAID56		(1ULL << 7)
>  #define BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA	(1ULL << 8)
> +#define BTRFS_FEATURE_INCOMPAT_NO_HOLES		(1ULL << 9)
>  
>  #define BTRFS_FEATURE_COMPAT_SUPP		0ULL
>  #define BTRFS_FEATURE_COMPAT_RO_SUPP		0ULL
> @@ -532,7 +533,8 @@ struct btrfs_super_block {
>  	 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO |		\
>  	 BTRFS_FEATURE_INCOMPAT_RAID56 |		\
>  	 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF |		\
> -	 BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
> +	 BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA |	\
> +	 BTRFS_FEATURE_INCOMPAT_NO_HOLES)
>  
>  /*
>   * A leaf is full of items. offset and size tell us where to find
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index 728f56f..ec0a95a 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -1963,11 +1963,13 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
>  	struct btrfs_key key;
>  	int ret;
>  
> +	if (btrfs_fs_incompat(root->fs_info, NO_HOLES))
> +		goto out;
> +
>  	key.objectid = btrfs_ino(inode);
>  	key.type = BTRFS_EXTENT_DATA_KEY;
>  	key.offset = offset;
>  
> -
>  	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
>  	if (ret < 0)
>  		return ret;
> @@ -2064,8 +2066,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>  	u64 drop_end;
>  	int ret = 0;
>  	int err = 0;
> +	int rsv_count;
>  	bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
>  			  ((offset + len - 1) >> PAGE_CACHE_SHIFT));
> +	bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
>  
>  	btrfs_wait_ordered_range(inode, offset, len);
>  
> @@ -2157,9 +2161,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>  	/*
>  	 * 1 - update the inode
>  	 * 1 - removing the extents in the range
> -	 * 1 - adding the hole extent
> +	 * 1 - adding the hole extent if no_holes isn't set
>  	 */
> -	trans = btrfs_start_transaction(root, 3);
> +	rsv_count = no_holes ? 2 : 3;
> +	trans = btrfs_start_transaction(root, rsv_count);
>  	if (IS_ERR(trans)) {
>  		err = PTR_ERR(trans);
>  		goto out_free;
> @@ -2196,7 +2201,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>  		btrfs_end_transaction(trans, root);
>  		btrfs_btree_balance_dirty(root);
>  
> -		trans = btrfs_start_transaction(root, 3);
> +		trans = btrfs_start_transaction(root, rsv_count);
>  		if (IS_ERR(trans)) {
>  			ret = PTR_ERR(trans);
>  			trans = NULL;
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 30bdacd..38fa301 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -4204,6 +4204,49 @@ out:
>  	return ret;
>  }
>  
> +static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
> +			     u64 offset, u64 len)
> +{
> +	struct btrfs_trans_handle *trans;
> +	int ret;
> +
> +	/*
> +	 * Still need to make sure the inode looks like it's been updated so
> +	 * that any holes get logged if we fsync.
> +	 */
> +	if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) {
> +		BTRFS_I(inode)->last_trans = root->fs_info->generation;
> +		BTRFS_I(inode)->last_sub_trans = root->log_transid;
> +		BTRFS_I(inode)->last_log_commit = root->last_log_commit;
> +		return 0;
> +	}
> +
> +	/*
> +	 * 1 - for the one we're dropping
> +	 * 1 - for the one we're adding
> +	 * 1 - for updating the inode.
> +	 */
> +	trans = btrfs_start_transaction(root, 3);
> +	if (IS_ERR(trans))
> +		return PTR_ERR(trans);
> +
> +	ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1);
> +	if (ret) {
> +		btrfs_abort_transaction(trans, root, ret);
> +		btrfs_end_transaction(trans, root);
> +		return ret;
> +	}
> +
> +	ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
> +				       0, 0, len, 0, len, 0, 0, 0);
> +	if (ret)
> +		btrfs_abort_transaction(trans, root, ret);
> +	else
> +		btrfs_update_inode(trans, root, inode);
> +	btrfs_end_transaction(trans, root);
> +	return ret;
> +}
> +
>  /*
>   * This function puts in dummy file extents for the area we're creating a hole
>   * for.  So if we are truncating this file to a larger size we need to insert
> @@ -4212,7 +4255,6 @@ out:
>   */
>  int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
>  {
> -	struct btrfs_trans_handle *trans;
>  	struct btrfs_root *root = BTRFS_I(inode)->root;
>  	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
>  	struct extent_map *em = NULL;
> @@ -4267,31 +4309,10 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
>  			struct extent_map *hole_em;
>  			hole_size = last_byte - cur_offset;
>  
> -			trans = btrfs_start_transaction(root, 3);
> -			if (IS_ERR(trans)) {
> -				err = PTR_ERR(trans);
> -				break;
> -			}
> -
> -			err = btrfs_drop_extents(trans, root, inode,
> -						 cur_offset,
> -						 cur_offset + hole_size, 1);
> -			if (err) {
> -				btrfs_abort_transaction(trans, root, err);
> -				btrfs_end_transaction(trans, root);
> -				break;
> -			}
> -
> -			err = btrfs_insert_file_extent(trans, root,
> -					btrfs_ino(inode), cur_offset, 0,
> -					0, hole_size, 0, hole_size,
> -					0, 0, 0);
> -			if (err) {
> -				btrfs_abort_transaction(trans, root, err);
> -				btrfs_end_transaction(trans, root);
> +			err = maybe_insert_hole(root, inode, cur_offset,
> +						hole_size);
> +			if (err)
>  				break;
> -			}
> -
>  			btrfs_drop_extent_cache(inode, cur_offset,
>  						cur_offset + hole_size - 1, 0);
>  			hole_em = alloc_extent_map();
> @@ -4310,7 +4331,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
>  			hole_em->ram_bytes = hole_size;
>  			hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
>  			hole_em->compress_type = BTRFS_COMPRESS_NONE;
> -			hole_em->generation = trans->transid;
> +			hole_em->generation = root->fs_info->generation;
>  
>  			while (1) {
>  				write_lock(&em_tree->lock);
> @@ -4323,17 +4344,14 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
>  							hole_size - 1, 0);
>  			}
>  			free_extent_map(hole_em);
> -next:
> -			btrfs_update_inode(trans, root, inode);
> -			btrfs_end_transaction(trans, root);
>  		}
> +next:
>  		free_extent_map(em);
>  		em = NULL;
>  		cur_offset = last_byte;
>  		if (cur_offset >= block_end)
>  			break;
>  	}
> -
>  	free_extent_map(em);
>  	unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
>  			     GFP_NOFS);
> diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
> index 5a3874c..2a2677b 100644
> --- a/fs/btrfs/send.c
> +++ b/fs/btrfs/send.c
> @@ -111,6 +111,7 @@ struct send_ctx {
>  	int cur_inode_deleted;
>  	u64 cur_inode_size;
>  	u64 cur_inode_mode;
> +	u64 cur_inode_last_extent;
>  
>  	u64 send_progress;
>  
> @@ -146,6 +147,13 @@ struct name_cache_entry {
>  	char name[];
>  };
>  
> +static int need_send_hole(struct send_ctx *sctx)
> +{
> +	return (sctx->parent_root && !sctx->cur_inode_new &&
> +		!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted &&
> +		S_ISREG(sctx->cur_inode_mode));
> +}
> +
>  static void fs_path_reset(struct fs_path *p)
>  {
>  	if (p->reversed) {
> @@ -3776,6 +3784,43 @@ out:
>  	return ret;
>  }
>  
> +static int send_hole(struct send_ctx *sctx, u64 end)
> +{
> +	struct fs_path *p = NULL;
> +	u64 offset = sctx->cur_inode_last_extent;
> +	u64 len;
> +	int ret;
> +
> +	p = fs_path_alloc();
> +	if (!p)
> +		return -ENOMEM;
> +	ret = open_cur_inode_file(sctx);
> +	if (ret < 0)
> +		goto out;
> +	memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE);
> +	while (offset < end) {
> +		len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE);
> +
> +		ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
> +		if (ret < 0)
> +			break;
> +		ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
> +		if (ret < 0)
> +			break;
> +		TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
> +		TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
> +		TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len);
> +		ret = send_cmd(sctx);
> +		if (ret < 0)
> +			break;
> +		offset += len;
> +	}
> +tlv_put_failure:
> +out:
> +	fs_path_free(p);
> +	return ret;
> +}
> +
>  static int send_write_or_clone(struct send_ctx *sctx,
>  			       struct btrfs_path *path,
>  			       struct btrfs_key *key,
> @@ -4008,26 +4053,32 @@ static int process_extent(struct send_ctx *sctx,
>  			  struct btrfs_key *key)
>  {
>  	struct clone_root *found_clone = NULL;
> +	struct btrfs_file_extent_item *ei;
> +	u64 len;
> +	u8 type;
>  	int ret = 0;
>  
>  	if (S_ISLNK(sctx->cur_inode_mode))
>  		return 0;
>  
> +	ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
> +			    struct btrfs_file_extent_item);
> +	type = btrfs_file_extent_type(path->nodes[0], ei);
> +	if (type == BTRFS_FILE_EXTENT_INLINE)
> +		len = ALIGN(btrfs_file_extent_inline_len(path->nodes[0], ei),
> +			    sctx->send_root->sectorsize);
> +	else
> +		len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
> +
>  	if (sctx->parent_root && !sctx->cur_inode_new) {
>  		ret = is_extent_unchanged(sctx, path, key);
>  		if (ret < 0)
>  			goto out;
>  		if (ret) {
>  			ret = 0;
> -			goto out;
> +			goto out_hole;
>  		}
>  	} else {
> -		struct btrfs_file_extent_item *ei;
> -		u8 type;
> -
> -		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
> -				    struct btrfs_file_extent_item);
> -		type = btrfs_file_extent_type(path->nodes[0], ei);
>  		if (type == BTRFS_FILE_EXTENT_PREALLOC ||
>  		    type == BTRFS_FILE_EXTENT_REG) {
>  			/*
> @@ -4055,7 +4106,12 @@ static int process_extent(struct send_ctx *sctx,
>  		goto out;
>  
>  	ret = send_write_or_clone(sctx, path, key, found_clone);
> -
> +	if (ret)
> +		goto out;
> +out_hole:
> +	if (need_send_hole(sctx) && sctx->cur_inode_last_extent != key->offset)
> +		ret = send_hole(sctx, key->offset);
> +	sctx->cur_inode_last_extent = key->offset + len;
>  out:
>  	return ret;
>  }
> @@ -4181,6 +4237,12 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
>  	}
>  
>  	if (S_ISREG(sctx->cur_inode_mode)) {
> +		if (need_send_hole(sctx) &&
> +		    sctx->cur_inode_last_extent < sctx->cur_inode_size) {
> +			ret = send_hole(sctx, sctx->cur_inode_size);
> +			if (ret)
> +				goto out;
> +		}
>  		ret = send_truncate(sctx, sctx->cur_ino, sctx->cur_inode_gen,
>  				sctx->cur_inode_size);
>  		if (ret < 0)
> @@ -4228,6 +4290,7 @@ static int changed_inode(struct send_ctx *sctx,
>  
>  	sctx->cur_ino = key->objectid;
>  	sctx->cur_inode_new_gen = 0;
> +	sctx->cur_inode_last_extent = 0;
>  
>  	/*
>  	 * Set send_progress to current inode. This will tell all get_cur_xxx
> @@ -4492,6 +4555,31 @@ out:
>  	return ret;
>  }
>  
> +static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path,
> +			   struct btrfs_key *key)
> +{
> +	struct btrfs_file_extent_item *fi;
> +	u64 len;
> +	u8 type;
> +	int ret = 0;
> +
> +	if (sctx->cur_ino != key->objectid || !need_send_hole(sctx))
> +		return 0;
> +
> +	fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
> +			    struct btrfs_file_extent_item);
> +	type = btrfs_file_extent_type(path->nodes[0], fi);
> +	if (type == BTRFS_FILE_EXTENT_INLINE)
> +		len = ALIGN(btrfs_file_extent_inline_len(path->nodes[0], fi),
> +			    sctx->send_root->sectorsize);
> +	else
> +		len = btrfs_file_extent_num_bytes(path->nodes[0], fi);
> +	if (sctx->cur_inode_last_extent != key->offset)
> +		ret = send_hole(sctx, key->offset);
> +	sctx->cur_inode_last_extent = key->offset + len;
> +	return ret;
> +}
> +
>  /*
>   * Updates compare related fields in sctx and simply forwards to the actual
>   * changed_xxx functions.
> @@ -4508,14 +4596,18 @@ static int changed_cb(struct btrfs_root *left_root,
>  	struct send_ctx *sctx = ctx;
>  
>  	if (result == BTRFS_COMPARE_TREE_SAME) {
> -		if (key->type != BTRFS_INODE_REF_KEY &&
> -		    key->type != BTRFS_INODE_EXTREF_KEY)
> -			return 0;
> -		ret = compare_refs(sctx, left_path, key);
> -		if (!ret)
> +		if (key->type == BTRFS_INODE_REF_KEY ||
> +		    key->type == BTRFS_INODE_EXTREF_KEY) {
> +			ret = compare_refs(sctx, left_path, key);
> +			if (!ret)
> +				return 0;
> +			if (ret < 0)
> +				return ret;
> +		} else if (key->type == BTRFS_EXTENT_DATA_KEY) {
> +			return maybe_send_hole(sctx, left_path, key);
> +		} else {
>  			return 0;
> -		if (ret < 0)
> -			return ret;
> +		}
>  		result = BTRFS_COMPARE_TREE_CHANGED;
>  		ret = 0;
>  	}
> diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
> index 1134aa4..3c8564a 100644
> --- a/fs/btrfs/tree-log.c
> +++ b/fs/btrfs/tree-log.c
> @@ -3188,7 +3188,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans,
>  static noinline int copy_items(struct btrfs_trans_handle *trans,
>  			       struct inode *inode,
>  			       struct btrfs_path *dst_path,
> -			       struct extent_buffer *src,
> +			       struct extent_buffer *src, u64 *last_extent,
>  			       int start_slot, int nr, int inode_only)
>  {
>  	unsigned long src_offset;
> @@ -3203,6 +3203,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
>  	int i;
>  	struct list_head ordered_sums;
>  	int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
> +	bool has_extents = false;
>  
>  	INIT_LIST_HEAD(&ordered_sums);
>  
> @@ -3242,6 +3243,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
>  					   src_offset, ins_sizes[i]);
>  		}
>  
> +		if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY)
> +			has_extents = true;
>  		/* take a reference on file data extents so that truncates
>  		 * or deletes of this inode don't have to relog the inode
>  		 * again
> @@ -3306,6 +3309,46 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
>  		list_del(&sums->list);
>  		kfree(sums);
>  	}
> +
> +	if (!has_extents)
> +		return ret;
> +
> +	/*
> +	 * Ok so here we need to go through and fill in any holes we may have
> +	 * to make sure that holes are punched for those areas in case they had
> +	 * extents previously.
> +	 */
> +	for (i = start_slot; i < start_slot + nr; i++) {
> +		struct btrfs_key key;
> +		u64 offset, len;
> +		u64 extent_end;
> +
> +		btrfs_item_key_to_cpu(src, &key, i);
> +		if (key.type != BTRFS_EXTENT_DATA_KEY)
> +			continue;
> +		extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item);
> +		if (btrfs_file_extent_type(src, extent) ==
> +		    BTRFS_FILE_EXTENT_INLINE) {
> +			len = btrfs_file_extent_inline_len(src, extent);
> +			extent_end = ALIGN(key.offset + len, log->sectorsize);
> +		} else {
> +			len = btrfs_file_extent_num_bytes(src, extent);
> +			extent_end = key.offset + len;
> +		}
> +
> +		if (*last_extent == key.offset) {
> +			*last_extent = extent_end;
> +			continue;
> +		}
> +		offset = *last_extent;
> +		len = key.offset - *last_extent;
> +		ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode),
> +					       offset, 0, 0, len, 0, len, 0,
> +					       0, 0);
> +		if (ret)
> +			break;
> +		*last_extent = extent_end;
> +	}
>  	return ret;
>  }
>  
> @@ -3624,6 +3667,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
>  	struct btrfs_key max_key;
>  	struct btrfs_root *log = root->log_root;
>  	struct extent_buffer *src = NULL;
> +	u64 last_extent = 0;
>  	int err = 0;
>  	int ret;
>  	int nritems;
> @@ -3738,8 +3782,8 @@ again:
>  			goto next_slot;
>  		}
>  
> -		ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
> -				 ins_nr, inode_only);
> +		ret = copy_items(trans, inode, dst_path, src, &last_extent,
> +				 ins_start_slot, ins_nr, inode_only);
>  		if (ret) {
>  			err = ret;
>  			goto out_unlock;
> @@ -3757,7 +3801,7 @@ next_slot:
>  		}
>  		if (ins_nr) {
>  			ret = copy_items(trans, inode, dst_path, src,
> -					 ins_start_slot,
> +					 &last_extent, ins_start_slot,
>  					 ins_nr, inode_only);
>  			if (ret) {
>  				err = ret;
> @@ -3777,8 +3821,8 @@ next_slot:
>  		}
>  	}
>  	if (ins_nr) {
> -		ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
> -				 ins_nr, inode_only);
> +		ret = copy_items(trans, inode, dst_path, src, &last_extent,
> +				 ins_start_slot, ins_nr, inode_only);
>  		if (ret) {
>  			err = ret;
>  			goto out_unlock;
>
Josef Bacik Oct. 29, 2013, 6:30 p.m. UTC | #2
On Tue, Oct 29, 2013 at 02:15:20PM -0400, Jeff Mahoney wrote:
> On 10/29/13, 9:56 AM, Josef Bacik wrote:
> > Btrfs has always had these filler extent data items for holes in inodes.  This
> > has made somethings very easy, like logging hole punches and sending hole
> > punches.  However for large holey files these extent data items are pure
> > overhead.  So add an incompatible feature to no longer add hole extents to
> > reduce the amount of metadata used by these sort of files.  This has a few
> > changes for logging and send obviously since they will need to detect holes and
> > log/send the holes if there are any.  I've tested this thoroughly with xfstests
> > and it doesn't cause any issues with and without the incompat format set.
> > Thanks,
> 
> This sounds like it could be a candidate for the online-enable mask in
> my feature ioctl patchset.

Yeah once I pull your stuff in I'll add it.  Thanks,

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9f5e1cf..0a008d8 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -521,6 +521,7 @@  struct btrfs_super_block {
 #define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF	(1ULL << 6)
 #define BTRFS_FEATURE_INCOMPAT_RAID56		(1ULL << 7)
 #define BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA	(1ULL << 8)
+#define BTRFS_FEATURE_INCOMPAT_NO_HOLES		(1ULL << 9)
 
 #define BTRFS_FEATURE_COMPAT_SUPP		0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SUPP		0ULL
@@ -532,7 +533,8 @@  struct btrfs_super_block {
 	 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO |		\
 	 BTRFS_FEATURE_INCOMPAT_RAID56 |		\
 	 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF |		\
-	 BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
+	 BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA |	\
+	 BTRFS_FEATURE_INCOMPAT_NO_HOLES)
 
 /*
  * A leaf is full of items. offset and size tell us where to find
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 728f56f..ec0a95a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1963,11 +1963,13 @@  static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
 	struct btrfs_key key;
 	int ret;
 
+	if (btrfs_fs_incompat(root->fs_info, NO_HOLES))
+		goto out;
+
 	key.objectid = btrfs_ino(inode);
 	key.type = BTRFS_EXTENT_DATA_KEY;
 	key.offset = offset;
 
-
 	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
 	if (ret < 0)
 		return ret;
@@ -2064,8 +2066,10 @@  static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 	u64 drop_end;
 	int ret = 0;
 	int err = 0;
+	int rsv_count;
 	bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
 			  ((offset + len - 1) >> PAGE_CACHE_SHIFT));
+	bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
 
 	btrfs_wait_ordered_range(inode, offset, len);
 
@@ -2157,9 +2161,10 @@  static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 	/*
 	 * 1 - update the inode
 	 * 1 - removing the extents in the range
-	 * 1 - adding the hole extent
+	 * 1 - adding the hole extent if no_holes isn't set
 	 */
-	trans = btrfs_start_transaction(root, 3);
+	rsv_count = no_holes ? 2 : 3;
+	trans = btrfs_start_transaction(root, rsv_count);
 	if (IS_ERR(trans)) {
 		err = PTR_ERR(trans);
 		goto out_free;
@@ -2196,7 +2201,7 @@  static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 		btrfs_end_transaction(trans, root);
 		btrfs_btree_balance_dirty(root);
 
-		trans = btrfs_start_transaction(root, 3);
+		trans = btrfs_start_transaction(root, rsv_count);
 		if (IS_ERR(trans)) {
 			ret = PTR_ERR(trans);
 			trans = NULL;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 30bdacd..38fa301 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4204,6 +4204,49 @@  out:
 	return ret;
 }
 
+static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
+			     u64 offset, u64 len)
+{
+	struct btrfs_trans_handle *trans;
+	int ret;
+
+	/*
+	 * Still need to make sure the inode looks like it's been updated so
+	 * that any holes get logged if we fsync.
+	 */
+	if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) {
+		BTRFS_I(inode)->last_trans = root->fs_info->generation;
+		BTRFS_I(inode)->last_sub_trans = root->log_transid;
+		BTRFS_I(inode)->last_log_commit = root->last_log_commit;
+		return 0;
+	}
+
+	/*
+	 * 1 - for the one we're dropping
+	 * 1 - for the one we're adding
+	 * 1 - for updating the inode.
+	 */
+	trans = btrfs_start_transaction(root, 3);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+
+	ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		btrfs_end_transaction(trans, root);
+		return ret;
+	}
+
+	ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
+				       0, 0, len, 0, len, 0, 0, 0);
+	if (ret)
+		btrfs_abort_transaction(trans, root, ret);
+	else
+		btrfs_update_inode(trans, root, inode);
+	btrfs_end_transaction(trans, root);
+	return ret;
+}
+
 /*
  * This function puts in dummy file extents for the area we're creating a hole
  * for.  So if we are truncating this file to a larger size we need to insert
@@ -4212,7 +4255,6 @@  out:
  */
 int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 {
-	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct extent_map *em = NULL;
@@ -4267,31 +4309,10 @@  int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 			struct extent_map *hole_em;
 			hole_size = last_byte - cur_offset;
 
-			trans = btrfs_start_transaction(root, 3);
-			if (IS_ERR(trans)) {
-				err = PTR_ERR(trans);
-				break;
-			}
-
-			err = btrfs_drop_extents(trans, root, inode,
-						 cur_offset,
-						 cur_offset + hole_size, 1);
-			if (err) {
-				btrfs_abort_transaction(trans, root, err);
-				btrfs_end_transaction(trans, root);
-				break;
-			}
-
-			err = btrfs_insert_file_extent(trans, root,
-					btrfs_ino(inode), cur_offset, 0,
-					0, hole_size, 0, hole_size,
-					0, 0, 0);
-			if (err) {
-				btrfs_abort_transaction(trans, root, err);
-				btrfs_end_transaction(trans, root);
+			err = maybe_insert_hole(root, inode, cur_offset,
+						hole_size);
+			if (err)
 				break;
-			}
-
 			btrfs_drop_extent_cache(inode, cur_offset,
 						cur_offset + hole_size - 1, 0);
 			hole_em = alloc_extent_map();
@@ -4310,7 +4331,7 @@  int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 			hole_em->ram_bytes = hole_size;
 			hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
 			hole_em->compress_type = BTRFS_COMPRESS_NONE;
-			hole_em->generation = trans->transid;
+			hole_em->generation = root->fs_info->generation;
 
 			while (1) {
 				write_lock(&em_tree->lock);
@@ -4323,17 +4344,14 @@  int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 							hole_size - 1, 0);
 			}
 			free_extent_map(hole_em);
-next:
-			btrfs_update_inode(trans, root, inode);
-			btrfs_end_transaction(trans, root);
 		}
+next:
 		free_extent_map(em);
 		em = NULL;
 		cur_offset = last_byte;
 		if (cur_offset >= block_end)
 			break;
 	}
-
 	free_extent_map(em);
 	unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
 			     GFP_NOFS);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 5a3874c..2a2677b 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -111,6 +111,7 @@  struct send_ctx {
 	int cur_inode_deleted;
 	u64 cur_inode_size;
 	u64 cur_inode_mode;
+	u64 cur_inode_last_extent;
 
 	u64 send_progress;
 
@@ -146,6 +147,13 @@  struct name_cache_entry {
 	char name[];
 };
 
+static int need_send_hole(struct send_ctx *sctx)
+{
+	return (sctx->parent_root && !sctx->cur_inode_new &&
+		!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted &&
+		S_ISREG(sctx->cur_inode_mode));
+}
+
 static void fs_path_reset(struct fs_path *p)
 {
 	if (p->reversed) {
@@ -3776,6 +3784,43 @@  out:
 	return ret;
 }
 
+static int send_hole(struct send_ctx *sctx, u64 end)
+{
+	struct fs_path *p = NULL;
+	u64 offset = sctx->cur_inode_last_extent;
+	u64 len;
+	int ret;
+
+	p = fs_path_alloc();
+	if (!p)
+		return -ENOMEM;
+	ret = open_cur_inode_file(sctx);
+	if (ret < 0)
+		goto out;
+	memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE);
+	while (offset < end) {
+		len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE);
+
+		ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
+		if (ret < 0)
+			break;
+		ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+		if (ret < 0)
+			break;
+		TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+		TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
+		TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len);
+		ret = send_cmd(sctx);
+		if (ret < 0)
+			break;
+		offset += len;
+	}
+tlv_put_failure:
+out:
+	fs_path_free(p);
+	return ret;
+}
+
 static int send_write_or_clone(struct send_ctx *sctx,
 			       struct btrfs_path *path,
 			       struct btrfs_key *key,
@@ -4008,26 +4053,32 @@  static int process_extent(struct send_ctx *sctx,
 			  struct btrfs_key *key)
 {
 	struct clone_root *found_clone = NULL;
+	struct btrfs_file_extent_item *ei;
+	u64 len;
+	u8 type;
 	int ret = 0;
 
 	if (S_ISLNK(sctx->cur_inode_mode))
 		return 0;
 
+	ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+			    struct btrfs_file_extent_item);
+	type = btrfs_file_extent_type(path->nodes[0], ei);
+	if (type == BTRFS_FILE_EXTENT_INLINE)
+		len = ALIGN(btrfs_file_extent_inline_len(path->nodes[0], ei),
+			    sctx->send_root->sectorsize);
+	else
+		len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
+
 	if (sctx->parent_root && !sctx->cur_inode_new) {
 		ret = is_extent_unchanged(sctx, path, key);
 		if (ret < 0)
 			goto out;
 		if (ret) {
 			ret = 0;
-			goto out;
+			goto out_hole;
 		}
 	} else {
-		struct btrfs_file_extent_item *ei;
-		u8 type;
-
-		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
-				    struct btrfs_file_extent_item);
-		type = btrfs_file_extent_type(path->nodes[0], ei);
 		if (type == BTRFS_FILE_EXTENT_PREALLOC ||
 		    type == BTRFS_FILE_EXTENT_REG) {
 			/*
@@ -4055,7 +4106,12 @@  static int process_extent(struct send_ctx *sctx,
 		goto out;
 
 	ret = send_write_or_clone(sctx, path, key, found_clone);
-
+	if (ret)
+		goto out;
+out_hole:
+	if (need_send_hole(sctx) && sctx->cur_inode_last_extent != key->offset)
+		ret = send_hole(sctx, key->offset);
+	sctx->cur_inode_last_extent = key->offset + len;
 out:
 	return ret;
 }
@@ -4181,6 +4237,12 @@  static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
 	}
 
 	if (S_ISREG(sctx->cur_inode_mode)) {
+		if (need_send_hole(sctx) &&
+		    sctx->cur_inode_last_extent < sctx->cur_inode_size) {
+			ret = send_hole(sctx, sctx->cur_inode_size);
+			if (ret)
+				goto out;
+		}
 		ret = send_truncate(sctx, sctx->cur_ino, sctx->cur_inode_gen,
 				sctx->cur_inode_size);
 		if (ret < 0)
@@ -4228,6 +4290,7 @@  static int changed_inode(struct send_ctx *sctx,
 
 	sctx->cur_ino = key->objectid;
 	sctx->cur_inode_new_gen = 0;
+	sctx->cur_inode_last_extent = 0;
 
 	/*
 	 * Set send_progress to current inode. This will tell all get_cur_xxx
@@ -4492,6 +4555,31 @@  out:
 	return ret;
 }
 
+static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path,
+			   struct btrfs_key *key)
+{
+	struct btrfs_file_extent_item *fi;
+	u64 len;
+	u8 type;
+	int ret = 0;
+
+	if (sctx->cur_ino != key->objectid || !need_send_hole(sctx))
+		return 0;
+
+	fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
+			    struct btrfs_file_extent_item);
+	type = btrfs_file_extent_type(path->nodes[0], fi);
+	if (type == BTRFS_FILE_EXTENT_INLINE)
+		len = ALIGN(btrfs_file_extent_inline_len(path->nodes[0], fi),
+			    sctx->send_root->sectorsize);
+	else
+		len = btrfs_file_extent_num_bytes(path->nodes[0], fi);
+	if (sctx->cur_inode_last_extent != key->offset)
+		ret = send_hole(sctx, key->offset);
+	sctx->cur_inode_last_extent = key->offset + len;
+	return ret;
+}
+
 /*
  * Updates compare related fields in sctx and simply forwards to the actual
  * changed_xxx functions.
@@ -4508,14 +4596,18 @@  static int changed_cb(struct btrfs_root *left_root,
 	struct send_ctx *sctx = ctx;
 
 	if (result == BTRFS_COMPARE_TREE_SAME) {
-		if (key->type != BTRFS_INODE_REF_KEY &&
-		    key->type != BTRFS_INODE_EXTREF_KEY)
-			return 0;
-		ret = compare_refs(sctx, left_path, key);
-		if (!ret)
+		if (key->type == BTRFS_INODE_REF_KEY ||
+		    key->type == BTRFS_INODE_EXTREF_KEY) {
+			ret = compare_refs(sctx, left_path, key);
+			if (!ret)
+				return 0;
+			if (ret < 0)
+				return ret;
+		} else if (key->type == BTRFS_EXTENT_DATA_KEY) {
+			return maybe_send_hole(sctx, left_path, key);
+		} else {
 			return 0;
-		if (ret < 0)
-			return ret;
+		}
 		result = BTRFS_COMPARE_TREE_CHANGED;
 		ret = 0;
 	}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 1134aa4..3c8564a 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3188,7 +3188,7 @@  static int log_inode_item(struct btrfs_trans_handle *trans,
 static noinline int copy_items(struct btrfs_trans_handle *trans,
 			       struct inode *inode,
 			       struct btrfs_path *dst_path,
-			       struct extent_buffer *src,
+			       struct extent_buffer *src, u64 *last_extent,
 			       int start_slot, int nr, int inode_only)
 {
 	unsigned long src_offset;
@@ -3203,6 +3203,7 @@  static noinline int copy_items(struct btrfs_trans_handle *trans,
 	int i;
 	struct list_head ordered_sums;
 	int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+	bool has_extents = false;
 
 	INIT_LIST_HEAD(&ordered_sums);
 
@@ -3242,6 +3243,8 @@  static noinline int copy_items(struct btrfs_trans_handle *trans,
 					   src_offset, ins_sizes[i]);
 		}
 
+		if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY)
+			has_extents = true;
 		/* take a reference on file data extents so that truncates
 		 * or deletes of this inode don't have to relog the inode
 		 * again
@@ -3306,6 +3309,46 @@  static noinline int copy_items(struct btrfs_trans_handle *trans,
 		list_del(&sums->list);
 		kfree(sums);
 	}
+
+	if (!has_extents)
+		return ret;
+
+	/*
+	 * Ok so here we need to go through and fill in any holes we may have
+	 * to make sure that holes are punched for those areas in case they had
+	 * extents previously.
+	 */
+	for (i = start_slot; i < start_slot + nr; i++) {
+		struct btrfs_key key;
+		u64 offset, len;
+		u64 extent_end;
+
+		btrfs_item_key_to_cpu(src, &key, i);
+		if (key.type != BTRFS_EXTENT_DATA_KEY)
+			continue;
+		extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item);
+		if (btrfs_file_extent_type(src, extent) ==
+		    BTRFS_FILE_EXTENT_INLINE) {
+			len = btrfs_file_extent_inline_len(src, extent);
+			extent_end = ALIGN(key.offset + len, log->sectorsize);
+		} else {
+			len = btrfs_file_extent_num_bytes(src, extent);
+			extent_end = key.offset + len;
+		}
+
+		if (*last_extent == key.offset) {
+			*last_extent = extent_end;
+			continue;
+		}
+		offset = *last_extent;
+		len = key.offset - *last_extent;
+		ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode),
+					       offset, 0, 0, len, 0, len, 0,
+					       0, 0);
+		if (ret)
+			break;
+		*last_extent = extent_end;
+	}
 	return ret;
 }
 
@@ -3624,6 +3667,7 @@  static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 	struct btrfs_key max_key;
 	struct btrfs_root *log = root->log_root;
 	struct extent_buffer *src = NULL;
+	u64 last_extent = 0;
 	int err = 0;
 	int ret;
 	int nritems;
@@ -3738,8 +3782,8 @@  again:
 			goto next_slot;
 		}
 
-		ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
-				 ins_nr, inode_only);
+		ret = copy_items(trans, inode, dst_path, src, &last_extent,
+				 ins_start_slot, ins_nr, inode_only);
 		if (ret) {
 			err = ret;
 			goto out_unlock;
@@ -3757,7 +3801,7 @@  next_slot:
 		}
 		if (ins_nr) {
 			ret = copy_items(trans, inode, dst_path, src,
-					 ins_start_slot,
+					 &last_extent, ins_start_slot,
 					 ins_nr, inode_only);
 			if (ret) {
 				err = ret;
@@ -3777,8 +3821,8 @@  next_slot:
 		}
 	}
 	if (ins_nr) {
-		ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
-				 ins_nr, inode_only);
+		ret = copy_items(trans, inode, dst_path, src, &last_extent,
+				 ins_start_slot, ins_nr, inode_only);
 		if (ret) {
 			err = ret;
 			goto out_unlock;