
[RFC,V11,01/21] Btrfs: subpagesize-blocksize: Fix whole page read.

Message ID 1433172176-8742-2-git-send-email-chandan@linux.vnet.ibm.com (mailing list archive)
State New, archived

Commit Message

Chandan Rajendra June 1, 2015, 3:22 p.m. UTC
For the subpagesize-blocksize scenario, a page can contain multiple
blocks. In such cases, this patch handles reading data from files.

To track the status of individual blocks of a page, this patch makes use of a
bitmap pointed to by page->private.

Signed-off-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
---
 fs/btrfs/extent_io.c | 301 +++++++++++++++++++++++++++++++++------------------
 fs/btrfs/extent_io.h |  28 ++++-
 fs/btrfs/inode.c     |  13 +--
 3 files changed, 224 insertions(+), 118 deletions(-)

Comments

Liu Bo June 19, 2015, 4:45 a.m. UTC | #1
On Mon, Jun 01, 2015 at 08:52:36PM +0530, Chandan Rajendra wrote:
> For the subpagesize-blocksize scenario, a page can contain multiple
> blocks. In such cases, this patch handles reading data from files.
> 
> To track the status of individual blocks of a page, this patch makes use of a
> bitmap pointed to by page->private.

Start going through the patchset, it's not easy though.

Several comments are following.

> 
> Signed-off-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
> ---
>  fs/btrfs/extent_io.c | 301 +++++++++++++++++++++++++++++++++------------------
>  fs/btrfs/extent_io.h |  28 ++++-
>  fs/btrfs/inode.c     |  13 +--
>  3 files changed, 224 insertions(+), 118 deletions(-)
> 
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index 782f3bc..d37badb 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -1325,6 +1325,88 @@ int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
>  				cached_state, mask);
>  }
>  
> +static int modify_page_blks_state(struct page *page,
> +				unsigned long blk_states,
> +				u64 start, u64 end, int set)
> +{
> +	struct inode *inode = page->mapping->host;
> +	unsigned long *bitmap;
> +	unsigned long state;
> +	u64 nr_blks;
> +	u64 blk;
> +
> +	BUG_ON(!PagePrivate(page));
> +
> +	bitmap = ((struct btrfs_page_private *)page->private)->bstate;
> +
> +	blk = (start & (PAGE_CACHE_SIZE - 1)) >> inode->i_blkbits;
> +	nr_blks = (end - start + 1) >> inode->i_blkbits;
> +
> +	while (nr_blks--) {
> +		state = find_next_bit(&blk_states, BLK_NR_STATE, 0);

Looks like we don't need to do find_next_bit for every block.

> +
> +		while (state < BLK_NR_STATE) {
> +			if (set)
> +				set_bit((blk * BLK_NR_STATE) + state, bitmap);
> +			else
> +				clear_bit((blk * BLK_NR_STATE) + state, bitmap);
> +
> +			state = find_next_bit(&blk_states, BLK_NR_STATE,
> +					state + 1);
> +		}
> +
> +		++blk;
> +	}
> +
> +	return 0;
> +}
> +
> +int set_page_blks_state(struct page *page, unsigned long blk_states,
> +			u64 start, u64 end)
> +{
> +	return modify_page_blks_state(page, blk_states, start, end, 1);
> +}
> +
> +int clear_page_blks_state(struct page *page, unsigned long blk_states,
> +			u64 start, u64 end)
> +{
> +	return modify_page_blks_state(page, blk_states, start, end, 0);
> +}
> +
> +int test_page_blks_state(struct page *page, enum blk_state blk_state,
> +			u64 start, u64 end, int check_all)
> +{
> +	struct inode *inode = page->mapping->host;
> +	unsigned long *bitmap;
> +	unsigned long blk;
> +	u64 nr_blks;
> +	int found = 0;
> +
> +	BUG_ON(!PagePrivate(page));
> +
> +	bitmap = ((struct btrfs_page_private *)page->private)->bstate;
> +
> +	blk = (start & (PAGE_CACHE_SIZE - 1)) >> inode->i_blkbits;
> +	nr_blks = (end - start + 1) >> inode->i_blkbits;
> +
> +	while (nr_blks--) {
> +		if (test_bit((blk * BLK_NR_STATE) + blk_state, bitmap)) {
> +			if (!check_all)
> +				return 1;
> +			found = 1;
> +		} else if (check_all) {
> +			return 0;
> +		}
> +
> +		++blk;
> +	}
> +
> +	if (!check_all && !found)
> +		return 0;
> +
> +	return 1;
> +}
> +
>  /*
>   * either insert or lock state struct between start and end use mask to tell
>   * us if waiting is desired.
> @@ -1982,14 +2064,22 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
>   * helper function to set a given page up to date if all the
>   * extents in the tree for that page are up to date
>   */
> -static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
> +static void check_page_uptodate(struct page *page)
>  {
>  	u64 start = page_offset(page);
>  	u64 end = start + PAGE_CACHE_SIZE - 1;
> -	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
> +	if (test_page_blks_state(page, BLK_STATE_UPTODATE, start, end, 1))
>  		SetPageUptodate(page);
>  }
>  
> +static int page_read_complete(struct page *page)
> +{
> +	u64 start = page_offset(page);
> +	u64 end = start + PAGE_CACHE_SIZE - 1;
> +
> +	return !test_page_blks_state(page, BLK_STATE_IO, start, end, 0);
> +}
> +
>  int free_io_failure(struct inode *inode, struct io_failure_record *rec)
>  {
>  	int ret;
> @@ -2311,7 +2401,9 @@ int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
>  	 *	a) deliver good data to the caller
>  	 *	b) correct the bad sectors on disk
>  	 */
> -	if (failed_bio->bi_vcnt > 1) {
> +	if ((failed_bio->bi_vcnt > 1)
> +		|| (failed_bio->bi_io_vec->bv_len
> +			> BTRFS_I(inode)->root->sectorsize)) {
>  		/*
>  		 * to fulfill b), we need to know the exact failing sectors, as
>  		 * we don't want to rewrite any more than the failed ones. thus,
> @@ -2520,18 +2612,6 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
>  	bio_put(bio);
>  }
>  
> -static void
> -endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
> -			      int uptodate)
> -{
> -	struct extent_state *cached = NULL;
> -	u64 end = start + len - 1;
> -
> -	if (uptodate && tree->track_uptodate)
> -		set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC);
> -	unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
> -}
> -
>  /*
>   * after a readpage IO is done, we need to:
>   * clear the uptodate bits on error
> @@ -2548,14 +2628,16 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
>  	struct bio_vec *bvec;
>  	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
>  	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
> +	struct extent_state *cached = NULL;
> +	struct btrfs_page_private *pg_private;
>  	struct extent_io_tree *tree;
> +	unsigned long flags;
>  	u64 offset = 0;
>  	u64 start;
>  	u64 end;
> -	u64 len;
> -	u64 extent_start = 0;
> -	u64 extent_len = 0;
> +	int nr_sectors;
>  	int mirror;
> +	int unlock;
>  	int ret;
>  	int i;
>  
> @@ -2565,54 +2647,31 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
>  	bio_for_each_segment_all(bvec, bio, i) {
>  		struct page *page = bvec->bv_page;
>  		struct inode *inode = page->mapping->host;
> +		struct btrfs_root *root = BTRFS_I(inode)->root;
>  
>  		pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
>  			 "mirror=%u\n", (u64)bio->bi_iter.bi_sector, err,
>  			 io_bio->mirror_num);
>  		tree = &BTRFS_I(inode)->io_tree;
>  
> -		/* We always issue full-page reads, but if some block
> -		 * in a page fails to read, blk_update_request() will
> -		 * advance bv_offset and adjust bv_len to compensate.
> -		 * Print a warning for nonzero offsets, and an error
> -		 * if they don't add up to a full page.  */
> -		if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) {
> -			if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE)
> -				btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info,
> -				   "partial page read in btrfs with offset %u and length %u",
> -					bvec->bv_offset, bvec->bv_len);
> -			else
> -				btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info,
> -				   "incomplete page read in btrfs with offset %u and "
> -				   "length %u",
> -					bvec->bv_offset, bvec->bv_len);
> -		}
> -
> -		start = page_offset(page);
> -		end = start + bvec->bv_offset + bvec->bv_len - 1;
> -		len = bvec->bv_len;
> -
> +		start = page_offset(page) + bvec->bv_offset;
> +		end = start + bvec->bv_len - 1;
> +		nr_sectors = bvec->bv_len >> inode->i_sb->s_blocksize_bits;
>  		mirror = io_bio->mirror_num;
> -		if (likely(uptodate && tree->ops &&
> -			   tree->ops->readpage_end_io_hook)) {
> +
> +next_block:
> +		if (likely(uptodate)) {

Any reason of killing (tree->ops && tree->ops->readpage_end_io_hook)?

>  			ret = tree->ops->readpage_end_io_hook(io_bio, offset,
> -							      page, start, end,
> -							      mirror);
> +							page, start,
> +							start + root->sectorsize - 1,
> +							mirror);
>  			if (ret)
>  				uptodate = 0;
>  			else
>  				clean_io_failure(inode, start, page, 0);
>  		}
>  
> -		if (likely(uptodate))
> -			goto readpage_ok;
> -
> -		if (tree->ops && tree->ops->readpage_io_failed_hook) {
> -			ret = tree->ops->readpage_io_failed_hook(page, mirror);
> -			if (!ret && !err &&
> -			    test_bit(BIO_UPTODATE, &bio->bi_flags))
> -				uptodate = 1;
> -		} else {
> +		if (!uptodate) {
>  			/*
>  			 * The generic bio_readpage_error handles errors the
>  			 * following way: If possible, new read requests are
> @@ -2623,61 +2682,63 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
>  			 * can't handle the error it will return -EIO and we
>  			 * remain responsible for that page.
>  			 */
> -			ret = bio_readpage_error(bio, offset, page, start, end,
> -						 mirror);
> +			ret = bio_readpage_error(bio, offset, page,
> +						start, start + root->sectorsize - 1,
> +						mirror);
>  			if (ret == 0) {
> -				uptodate =
> -					test_bit(BIO_UPTODATE, &bio->bi_flags);
> +				uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
>  				if (err)
>  					uptodate = 0;
> -				offset += len;
> -				continue;
> +				offset += root->sectorsize;
> +				if (--nr_sectors) {
> +					start += root->sectorsize;
> +					goto next_block;
> +				} else {
> +					continue;
> +				}
>  			}
>  		}
> -readpage_ok:
> -		if (likely(uptodate)) {
> -			loff_t i_size = i_size_read(inode);
> -			pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
> -			unsigned off;
> -
> -			/* Zero out the end if this page straddles i_size */
> -			off = i_size & (PAGE_CACHE_SIZE-1);
> -			if (page->index == end_index && off)
> -				zero_user_segment(page, off, PAGE_CACHE_SIZE);
> -			SetPageUptodate(page);
> +
> +		if (uptodate) {
> +			set_page_blks_state(page, 1 << BLK_STATE_UPTODATE, start,
> +					start + root->sectorsize - 1);
> +			check_page_uptodate(page);
>  		} else {
>  			ClearPageUptodate(page);
>  			SetPageError(page);
>  		}
> -		unlock_page(page);
> -		offset += len;
> -
> -		if (unlikely(!uptodate)) {
> -			if (extent_len) {
> -				endio_readpage_release_extent(tree,
> -							      extent_start,
> -							      extent_len, 1);
> -				extent_start = 0;
> -				extent_len = 0;
> -			}
> -			endio_readpage_release_extent(tree, start,
> -						      end - start + 1, 0);
> -		} else if (!extent_len) {
> -			extent_start = start;
> -			extent_len = end + 1 - start;
> -		} else if (extent_start + extent_len == start) {
> -			extent_len += end + 1 - start;
> -		} else {
> -			endio_readpage_release_extent(tree, extent_start,
> -						      extent_len, uptodate);
> -			extent_start = start;
> -			extent_len = end + 1 - start;
> +
> +		offset += root->sectorsize;
> +
> +		if (--nr_sectors) {
> +			clear_page_blks_state(page, 1 << BLK_STATE_IO,
> +					start, start + root->sectorsize - 1);

private->io_lock is not acquired here but it is below.

IIUC, this can be protected by EXTENT_LOCKED.

Thanks,

-liubo

> +			clear_extent_bit(tree, start, start + root->sectorsize - 1,
> +					EXTENT_LOCKED, 1, 0, &cached, GFP_ATOMIC);
> +			start += root->sectorsize;
> +			goto next_block;
>  		}
> +
> +		WARN_ON(!PagePrivate(page));
> +
> +		pg_private = (struct btrfs_page_private *)page->private;
> +
> +		spin_lock_irqsave(&pg_private->io_lock, flags);
> +
> +		clear_page_blks_state(page, 1 << BLK_STATE_IO,
> +				start, start + root->sectorsize - 1);
> +
> +		unlock = page_read_complete(page);
> +
> +		spin_unlock_irqrestore(&pg_private->io_lock, flags);
> +
> +		clear_extent_bit(tree, start, start + root->sectorsize - 1,
> +				EXTENT_LOCKED, 1, 0, &cached, GFP_ATOMIC);
> +
> +		if (unlock)
> +			unlock_page(page);
>  	}
>  
> -	if (extent_len)
> -		endio_readpage_release_extent(tree, extent_start, extent_len,
> -					      uptodate);
>  	if (io_bio->end_io)
>  		io_bio->end_io(io_bio, err);
>  	bio_put(bio);
> @@ -2859,13 +2920,36 @@ static void attach_extent_buffer_page(struct extent_buffer *eb,
>  	}
>  }
>  
> -void set_page_extent_mapped(struct page *page)
> +int set_page_extent_mapped(struct page *page)
>  {
> +	struct btrfs_page_private *pg_private;
> +
>  	if (!PagePrivate(page)) {
> +		pg_private = kzalloc(sizeof(*pg_private), GFP_NOFS);
> +		if (!pg_private)
> +			return -ENOMEM;
> +
> +		spin_lock_init(&pg_private->io_lock);
> +
>  		SetPagePrivate(page);
>  		page_cache_get(page);
> -		set_page_private(page, EXTENT_PAGE_PRIVATE);
> +
> +		set_page_private(page, (unsigned long)pg_private);
> +	}
> +
> +	return 0;
> +}
> +
> +int clear_page_extent_mapped(struct page *page)
> +{
> +	if (PagePrivate(page)) {
> +		kfree((struct btrfs_page_private *)(page->private));
> +		ClearPagePrivate(page);
> +		set_page_private(page, 0);
> +		page_cache_release(page);
>  	}
> +
> +	return 0;
>  }
>  
>  static struct extent_map *
> @@ -2909,6 +2993,7 @@ static int __do_readpage(struct extent_io_tree *tree,
>  			 unsigned long *bio_flags, int rw)
>  {
>  	struct inode *inode = page->mapping->host;
> +	struct extent_state *cached = NULL;
>  	u64 start = page_offset(page);
>  	u64 page_end = start + PAGE_CACHE_SIZE - 1;
>  	u64 end;
> @@ -2964,8 +3049,8 @@ static int __do_readpage(struct extent_io_tree *tree,
>  			memset(userpage + pg_offset, 0, iosize);
>  			flush_dcache_page(page);
>  			kunmap_atomic(userpage);
> -			set_extent_uptodate(tree, cur, cur + iosize - 1,
> -					    &cached, GFP_NOFS);
> +			set_page_blks_state(page, 1 << BLK_STATE_UPTODATE, cur,
> +					cur + iosize - 1);
>  			if (!parent_locked)
>  				unlock_extent_cached(tree, cur,
>  						     cur + iosize - 1,
> @@ -3017,8 +3102,8 @@ static int __do_readpage(struct extent_io_tree *tree,
>  			flush_dcache_page(page);
>  			kunmap_atomic(userpage);
>  
> -			set_extent_uptodate(tree, cur, cur + iosize - 1,
> -					    &cached, GFP_NOFS);
> +			set_page_blks_state(page, 1 << BLK_STATE_UPTODATE, cur,
> +					cur + iosize - 1);
>  			unlock_extent_cached(tree, cur, cur + iosize - 1,
>  			                     &cached, GFP_NOFS);
>  			cur = cur + iosize;
> @@ -3026,9 +3111,9 @@ static int __do_readpage(struct extent_io_tree *tree,
>  			continue;
>  		}
>  		/* the get_extent function already copied into the page */
> -		if (test_range_bit(tree, cur, cur_end,
> -				   EXTENT_UPTODATE, 1, NULL)) {
> -			check_page_uptodate(tree, page);
> +		if (test_page_blks_state(page, BLK_STATE_UPTODATE, cur,
> +						cur_end, 1)) {
> +			check_page_uptodate(page);
>  			if (!parent_locked)
>  				unlock_extent(tree, cur, cur + iosize - 1);
>  			cur = cur + iosize;
> @@ -3048,6 +3133,9 @@ static int __do_readpage(struct extent_io_tree *tree,
>  		}
>  
>  		pnr -= page->index;
> +
> +		set_page_blks_state(page, 1 << BLK_STATE_IO, cur,
> +				cur + iosize - 1);
>  		ret = submit_extent_page(rw, tree, page,
>  					 sector, disk_io_size, pg_offset,
>  					 bdev, bio, pnr,
> @@ -3059,8 +3147,11 @@ static int __do_readpage(struct extent_io_tree *tree,
>  			*bio_flags = this_bio_flag;
>  		} else {
>  			SetPageError(page);
> +			clear_page_blks_state(page, 1 << BLK_STATE_IO, cur,
> +					cur + iosize - 1);
>  			if (!parent_locked)
> -				unlock_extent(tree, cur, cur + iosize - 1);
> +				unlock_extent_cached(tree, cur, cur + iosize - 1,
> +						&cached, GFP_NOFS);
>  		}
>  		cur = cur + iosize;
>  		pg_offset += iosize;
> diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
> index c668f36..541b40a 100644
> --- a/fs/btrfs/extent_io.h
> +++ b/fs/btrfs/extent_io.h
> @@ -51,11 +51,22 @@
>  #define PAGE_SET_PRIVATE2	(1 << 4)
>  #define PAGE_SET_ERROR		(1 << 5)
>  
> +enum blk_state {
> +	BLK_STATE_UPTODATE,
> +	BLK_STATE_DIRTY,
> +	BLK_STATE_IO,
> +	BLK_NR_STATE,
> +};
> +
>  /*
> - * page->private values.  Every page that is controlled by the extent
> - * map has page->private set to one.
> - */
> -#define EXTENT_PAGE_PRIVATE 1
> +  The maximum number of blocks per page (i.e. 32) occurs when using 2k
> +  as the block size and having 64k as the page size.
> +*/
> +#define BLK_STATE_NR_LONGS DIV_ROUND_UP(BLK_NR_STATE * 32, BITS_PER_LONG)
> +struct btrfs_page_private {
> +	spinlock_t io_lock;
> +	unsigned long bstate[BLK_STATE_NR_LONGS];
> +};
>  
>  struct extent_state;
>  struct btrfs_root;
> @@ -259,7 +270,14 @@ int extent_readpages(struct extent_io_tree *tree,
>  int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
>  		__u64 start, __u64 len, get_extent_t *get_extent);
>  int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
> -void set_page_extent_mapped(struct page *page);
> +int set_page_extent_mapped(struct page *page);
> +int clear_page_extent_mapped(struct page *page);
> +int set_page_blks_state(struct page *page, unsigned long blk_states,
> + 			u64 start, u64 end);
> +int clear_page_blks_state(struct page *page, unsigned long blk_states,
> + 			u64 start, u64 end);
> +int test_page_blks_state(struct page *page, enum blk_state blk_state,
> +			u64 start, u64 end, int check_all);
>  
>  struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
>  					  u64 start);
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 0020b56..8262f83 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -6622,7 +6622,6 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
>  	struct btrfs_key found_key;
>  	struct extent_map *em = NULL;
>  	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
> -	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
>  	struct btrfs_trans_handle *trans = NULL;
>  	const bool new_inline = !page || create;
>  
> @@ -6800,8 +6799,8 @@ next:
>  			kunmap(page);
>  			btrfs_mark_buffer_dirty(leaf);
>  		}
> -		set_extent_uptodate(io_tree, em->start,
> -				    extent_map_end(em) - 1, NULL, GFP_NOFS);
> +		set_page_blks_state(page, 1 << BLK_STATE_UPTODATE, em->start,
> +				extent_map_end(em) - 1);
>  		goto insert;
>  	}
>  not_found:
> @@ -8392,11 +8391,9 @@ static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
>  	tree = &BTRFS_I(page->mapping->host)->io_tree;
>  	map = &BTRFS_I(page->mapping->host)->extent_tree;
>  	ret = try_release_extent_mapping(map, tree, page, gfp_flags);
> -	if (ret == 1) {
> -		ClearPagePrivate(page);
> -		set_page_private(page, 0);
> -		page_cache_release(page);
> -	}
> +	if (ret == 1)
> +		clear_page_extent_mapped(page);
> +
>  	return ret;
>  }
>  
> -- 
> 2.1.0
> 
Chandan Rajendra June 19, 2015, 9:45 a.m. UTC | #2
On Friday 19 Jun 2015 12:45:37 Liu Bo wrote:
> On Mon, Jun 01, 2015 at 08:52:36PM +0530, Chandan Rajendra wrote:
> > For the subpagesize-blocksize scenario, a page can contain multiple
> > blocks. In such cases, this patch handles reading data from files.
> > 
> > To track the status of individual blocks of a page, this patch makes use
> > of a bitmap pointed to by page->private.
> 
> Start going through the patchset, it's not easy though.
> 
> Several comments are following.

Thanks for the review comments Liu.

> > +static int modify_page_blks_state(struct page *page,
> > +				unsigned long blk_states,
> > +				u64 start, u64 end, int set)
> > +{
> > +	struct inode *inode = page->mapping->host;
> > +	unsigned long *bitmap;
> > +	unsigned long state;
> > +	u64 nr_blks;
> > +	u64 blk;
> > +
> > +	BUG_ON(!PagePrivate(page));
> > +
> > +	bitmap = ((struct btrfs_page_private *)page->private)->bstate;
> > +
> > +	blk = (start & (PAGE_CACHE_SIZE - 1)) >> inode->i_blkbits;
> > +	nr_blks = (end - start + 1) >> inode->i_blkbits;
> > +
> > +	while (nr_blks--) {
> > +		state = find_next_bit(&blk_states, BLK_NR_STATE, 0);
> 
> Looks like we don't need to do find_next_bit for every block.

Yes, I agree. The find_next_bit() invocation in the outer loop can be moved
outside the loop.
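
E.g., an untested sketch of modify_page_blks_state() with that lookup hoisted
(only the new first_state local differs from the code quoted above):

static int modify_page_blks_state(struct page *page,
				unsigned long blk_states,
				u64 start, u64 end, int set)
{
	struct inode *inode = page->mapping->host;
	unsigned long *bitmap;
	unsigned long first_state;
	unsigned long state;
	u64 nr_blks;
	u64 blk;

	BUG_ON(!PagePrivate(page));

	bitmap = ((struct btrfs_page_private *)page->private)->bstate;

	blk = (start & (PAGE_CACHE_SIZE - 1)) >> inode->i_blkbits;
	nr_blks = (end - start + 1) >> inode->i_blkbits;

	/* The set of requested states is the same for every block. */
	first_state = find_next_bit(&blk_states, BLK_NR_STATE, 0);

	while (nr_blks--) {
		state = first_state;

		while (state < BLK_NR_STATE) {
			if (set)
				set_bit((blk * BLK_NR_STATE) + state, bitmap);
			else
				clear_bit((blk * BLK_NR_STATE) + state, bitmap);

			state = find_next_bit(&blk_states, BLK_NR_STATE,
					state + 1);
		}

		++blk;
	}

	return 0;
}
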
> 
> > +
> > +		while (state < BLK_NR_STATE) {
> > +			if (set)
> > +				set_bit((blk * BLK_NR_STATE) + state, bitmap);
> > +			else
> > +				clear_bit((blk * BLK_NR_STATE) + state, bitmap);
> > +
> > +			state = find_next_bit(&blk_states, BLK_NR_STATE,
> > +					state + 1);
> > +		}
> > +
> > +		++blk;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > 
> >  /*
> >  
> >   * after a readpage IO is done, we need to:
> >   * clear the uptodate bits on error
> > 
> > @@ -2548,14 +2628,16 @@ static void end_bio_extent_readpage(struct bio
> > *bio, int err)> 
> >  	struct bio_vec *bvec;
> >  	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
> >  	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
> > 
> > +	struct extent_state *cached = NULL;
> > +	struct btrfs_page_private *pg_private;
> > 
> >  	struct extent_io_tree *tree;
> > 
> > +	unsigned long flags;
> > 
> >  	u64 offset = 0;
> >  	u64 start;
> >  	u64 end;
> > 
> > -	u64 len;
> > -	u64 extent_start = 0;
> > -	u64 extent_len = 0;
> > +	int nr_sectors;
> > 
> >  	int mirror;
> > 
> > +	int unlock;
> > 
> >  	int ret;
> >  	int i;
> > 
> > @@ -2565,54 +2647,31 @@ static void end_bio_extent_readpage(struct bio
> > *bio, int err)> 
> >  	bio_for_each_segment_all(bvec, bio, i) {
> >  	
> >  		struct page *page = bvec->bv_page;
> >  		struct inode *inode = page->mapping->host;
> > 
> > +		struct btrfs_root *root = BTRFS_I(inode)->root;
> > 
> >  		pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
> >  		
> >  			 "mirror=%u\n", (u64)bio->bi_iter.bi_sector, err,
> >  			 io_bio->mirror_num);
> >  		
> >  		tree = &BTRFS_I(inode)->io_tree;
> > 
> > -		/* We always issue full-page reads, but if some block
> > -		 * in a page fails to read, blk_update_request() will
> > -		 * advance bv_offset and adjust bv_len to compensate.
> > -		 * Print a warning for nonzero offsets, and an error
> > -		 * if they don't add up to a full page.  */
> > -		if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) {
> > -			if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE)
> > -				btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info,
> > -				   "partial page read in btrfs with offset %u and length %u",
> > -					bvec->bv_offset, bvec->bv_len);
> > -			else
> > -				btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info,
> > -				   "incomplete page read in btrfs with offset %u and "
> > -				   "length %u",
> > -					bvec->bv_offset, bvec->bv_len);
> > -		}
> > -
> > -		start = page_offset(page);
> > -		end = start + bvec->bv_offset + bvec->bv_len - 1;
> > -		len = bvec->bv_len;
> > -
> > +		start = page_offset(page) + bvec->bv_offset;
> > +		end = start + bvec->bv_len - 1;
> > +		nr_sectors = bvec->bv_len >> inode->i_sb->s_blocksize_bits;
> > 
> >  		mirror = io_bio->mirror_num;
> > 
> > -		if (likely(uptodate && tree->ops &&
> > -			   tree->ops->readpage_end_io_hook)) {
> > +
> > +next_block:
> > +		if (likely(uptodate)) {
> 
> Any reason of killing (tree->ops && tree->ops->readpage_end_io_hook)?

In the subpagesize-blocksize scenario, for extent buffers we need the ability
to read just a single extent buffer rather than the complete contents of the
page containing the extent buffer. Similarly, in the corresponding endio
function we need to verify a single extent buffer rather than the contents of
the full page. Hence I ended up removing the btree_readpage_end_io_hook() and
btree_io_failed_hook() functions and had the verification functions invoked
directly by the endio function.

So since the data "read page" code was the only one left to set
extent_io_tree->ops->readpage_end_io_hook, I removed the code that checks for
its existence. Now I realize that this is not the right thing to do; I will
restore the condition check to its original state.
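
For reference, a rough sketch of what the restored check would look like in
end_bio_extent_readpage() (the same guard as the deleted code, applied to the
per-block range used by this patch):

		if (likely(uptodate && tree->ops &&
			   tree->ops->readpage_end_io_hook)) {
			ret = tree->ops->readpage_end_io_hook(io_bio, offset,
							page, start,
							start + root->sectorsize - 1,
							mirror);
			if (ret)
				uptodate = 0;
			else
				clean_io_failure(inode, start, page, 0);
		}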

> 
> >  			ret = tree->ops->readpage_end_io_hook(io_bio, offset,
> > 
> > -							      page, start, end,
> > -							      mirror);
> > +							page, start,
> > +							start + root->sectorsize - 1,
> > +							mirror);
> > 
> >  			if (ret)
> >  			
> >  				uptodate = 0;
> >  			
> >  			else
> >  			
> >  				clean_io_failure(inode, start, page, 0);
> >  		
> >  		}
> > 
> > -		if (likely(uptodate))
> > -			goto readpage_ok;
> > -
> > -		if (tree->ops && tree->ops->readpage_io_failed_hook) {
> > -			ret = tree->ops->readpage_io_failed_hook(page, mirror);
> > -			if (!ret && !err &&
> > -			    test_bit(BIO_UPTODATE, &bio->bi_flags))
> > -				uptodate = 1;
> > -		} else {
> > +		if (!uptodate) {
> > 
> >  			/*
> >  			
> >  			 * The generic bio_readpage_error handles errors the
> >  			 * following way: If possible, new read requests are
> > 
> > @@ -2623,61 +2682,63 @@ static void end_bio_extent_readpage(struct bio
> > *bio, int err)> 
> >  			 * can't handle the error it will return -EIO and we
> >  			 * remain responsible for that page.
> >  			 */
> > 
> > -			ret = bio_readpage_error(bio, offset, page, start, end,
> > -						 mirror);
> > +			ret = bio_readpage_error(bio, offset, page,
> > +						start, start + root->sectorsize - 1,
> > +						mirror);
> > 
> >  			if (ret == 0) {
> > 
> > -				uptodate =
> > -					test_bit(BIO_UPTODATE, &bio->bi_flags);
> > +				uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
> > 
> >  				if (err)
> >  				
> >  					uptodate = 0;
> > 
> > -				offset += len;
> > -				continue;
> > +				offset += root->sectorsize;
> > +				if (--nr_sectors) {
> > +					start += root->sectorsize;
> > +					goto next_block;
> > +				} else {
> > +					continue;
> > +				}
> > 
> >  			}
> >  		
> >  		}
> > 
> > -readpage_ok:
> > -		if (likely(uptodate)) {
> > -			loff_t i_size = i_size_read(inode);
> > -			pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
> > -			unsigned off;
> > -
> > -			/* Zero out the end if this page straddles i_size */
> > -			off = i_size & (PAGE_CACHE_SIZE-1);
> > -			if (page->index == end_index && off)
> > -				zero_user_segment(page, off, PAGE_CACHE_SIZE);
> > -			SetPageUptodate(page);
> > +
> > +		if (uptodate) {
> > +			set_page_blks_state(page, 1 << BLK_STATE_UPTODATE, start,
> > +					start + root->sectorsize - 1);
> > +			check_page_uptodate(page);
> > 
> >  		} else {
> >  		
> >  			ClearPageUptodate(page);
> >  			SetPageError(page);
> >  		
> >  		}
> > 
> > -		unlock_page(page);
> > -		offset += len;
> > -
> > -		if (unlikely(!uptodate)) {
> > -			if (extent_len) {
> > -				endio_readpage_release_extent(tree,
> > -							      extent_start,
> > -							      extent_len, 1);
> > -				extent_start = 0;
> > -				extent_len = 0;
> > -			}
> > -			endio_readpage_release_extent(tree, start,
> > -						      end - start + 1, 0);
> > -		} else if (!extent_len) {
> > -			extent_start = start;
> > -			extent_len = end + 1 - start;
> > -		} else if (extent_start + extent_len == start) {
> > -			extent_len += end + 1 - start;
> > -		} else {
> > -			endio_readpage_release_extent(tree, extent_start,
> > -						      extent_len, uptodate);
> > -			extent_start = start;
> > -			extent_len = end + 1 - start;
> > +
> > +		offset += root->sectorsize;
> > +
> > +		if (--nr_sectors) {
> > +			clear_page_blks_state(page, 1 << BLK_STATE_IO,
> > +					start, start + root->sectorsize - 1);
> 
> private->io_lock is not acquired here but it is below.
> 
> IIUC, this can be protected by EXTENT_LOCKED.
>

private->io_lock plays the same role as BH_Uptodate_Lock (see
end_buffer_async_read()) i.e. without the io_lock we may end up in the
following situation,

NOTE: Assume 64k page size and 4k block size. Also assume that the first 12
blocks of the page are contiguous while the next 4 blocks are contiguous. When
reading the page we end up submitting two "logical address space" bios. So
end_bio_extent_readpage function is invoked twice (once for each bio).

|-------------------------+-------------------------+-------------|
| Task A                  | Task B                  | Task C      |
|-------------------------+-------------------------+-------------|
| end_bio_extent_readpage |                         |             |
| process block 0         |                         |             |
| - clear BLK_STATE_IO    |                         |             |
| - page_read_complete    |                         |             |
| process block 1         |                         |             |
| ...                     |                         |             |
| ...                     |                         |             |
| ...                     | end_bio_extent_readpage |             |
| ...                     | process block 0         |             |
| ...                     | - clear BLK_STATE_IO    |             |
| ...                     | - page_read_complete    |             |
| ...                     | process block 1         |             |
| ...                     | ...                     |             |
| process block 11        | process block 3         |             |
| - clear BLK_STATE_IO    | - clear BLK_STATE_IO    |             |
| - page_read_complete    | - page_read_complete    |             |
|   - returns true        |   - returns true        |             |
|   - unlock_page()       |                         |             |
|                         |                         | lock_page() |
|                         |   - unlock_page()       |             |
|-------------------------+-------------------------+-------------|

So we end up incorrectly unlocking the page twice and "Task C" ends up working
on an unlocked page. So private->io_lock makes sure that only one of the tasks
gets "true" as the return value when page_read_complete() is invoked. As an
optimization the patch gets the io_lock only when nr_sectors counter reaches
the value 0 (i.e. when the last block of the bio_vec is being processed).
Please let me know if my analysis was incorrect.

Also, I noticed that page_read_complete() and page_write_complete() can be
replaced by just one function i.e. page_io_complete().
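
E.g., a rough sketch of the combined helper; it assumes that writes also track
in-flight blocks with BLK_STATE_IO, so both paths can share the same check:

/* Sketch only: assumes writeback uses BLK_STATE_IO for in-flight blocks too. */
static int page_io_complete(struct page *page)
{
	u64 start = page_offset(page);
	u64 end = start + PAGE_CACHE_SIZE - 1;

	return !test_page_blks_state(page, BLK_STATE_IO, start, end, 0);
}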
Liu Bo June 23, 2015, 8:37 a.m. UTC | #3
On Fri, Jun 19, 2015 at 03:15:01PM +0530, Chandan Rajendra wrote:
> On Friday 19 Jun 2015 12:45:37 Liu Bo wrote:
> > On Mon, Jun 01, 2015 at 08:52:36PM +0530, Chandan Rajendra wrote:
> > > For the subpagesize-blocksize scenario, a page can contain multiple
> > > blocks. In such cases, this patch handles reading data from files.
> > > 
> > > To track the status of individual blocks of a page, this patch makes use
> > > of a bitmap pointed to by page->private.
> > 
> > Start going through the patchset, it's not easy though.
> > 
> > Several comments are following.
> 
> Thanks for the review comments Liu.
> 
> > > +static int modify_page_blks_state(struct page *page,
> > > +				unsigned long blk_states,
> > > +				u64 start, u64 end, int set)
> > > +{
> > > +	struct inode *inode = page->mapping->host;
> > > +	unsigned long *bitmap;
> > > +	unsigned long state;
> > > +	u64 nr_blks;
> > > +	u64 blk;
> > > +
> > > +	BUG_ON(!PagePrivate(page));
> > > +
> > > +	bitmap = ((struct btrfs_page_private *)page->private)->bstate;
> > > +
> > > +	blk = (start & (PAGE_CACHE_SIZE - 1)) >> inode->i_blkbits;
> > > +	nr_blks = (end - start + 1) >> inode->i_blkbits;
> > > +
> > > +	while (nr_blks--) {
> > > +		state = find_next_bit(&blk_states, BLK_NR_STATE, 0);
> > 
> > Looks like we don't need to do find_next_bit for every block.
> 
> Yes, I agree. The find_next_bit() invocation in the outer loop can be moved
> outside the loop.
> > 
> > > +
> > > +		while (state < BLK_NR_STATE) {
> > > +			if (set)
> > > +				set_bit((blk * BLK_NR_STATE) + state, bitmap);
> > > +			else
> > > +				clear_bit((blk * BLK_NR_STATE) + state, bitmap);
> > > +
> > > +			state = find_next_bit(&blk_states, BLK_NR_STATE,
> > > +					state + 1);
> > > +		}
> > > +
> > > +		++blk;
> > > +	}
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > 
> > >  /*
> > >  
> > >   * after a readpage IO is done, we need to:
> > >   * clear the uptodate bits on error
> > > 
> > > @@ -2548,14 +2628,16 @@ static void end_bio_extent_readpage(struct bio
> > > *bio, int err)> 
> > >  	struct bio_vec *bvec;
> > >  	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
> > >  	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
> > > 
> > > +	struct extent_state *cached = NULL;
> > > +	struct btrfs_page_private *pg_private;
> > > 
> > >  	struct extent_io_tree *tree;
> > > 
> > > +	unsigned long flags;
> > > 
> > >  	u64 offset = 0;
> > >  	u64 start;
> > >  	u64 end;
> > > 
> > > -	u64 len;
> > > -	u64 extent_start = 0;
> > > -	u64 extent_len = 0;
> > > +	int nr_sectors;
> > > 
> > >  	int mirror;
> > > 
> > > +	int unlock;
> > > 
> > >  	int ret;
> > >  	int i;
> > > 
> > > @@ -2565,54 +2647,31 @@ static void end_bio_extent_readpage(struct bio
> > > *bio, int err)> 
> > >  	bio_for_each_segment_all(bvec, bio, i) {
> > >  	
> > >  		struct page *page = bvec->bv_page;
> > >  		struct inode *inode = page->mapping->host;
> > > 
> > > +		struct btrfs_root *root = BTRFS_I(inode)->root;
> > > 
> > >  		pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
> > >  		
> > >  			 "mirror=%u\n", (u64)bio->bi_iter.bi_sector, err,
> > >  			 io_bio->mirror_num);
> > >  		
> > >  		tree = &BTRFS_I(inode)->io_tree;
> > > 
> > > -		/* We always issue full-page reads, but if some block
> > > -		 * in a page fails to read, blk_update_request() will
> > > -		 * advance bv_offset and adjust bv_len to compensate.
> > > -		 * Print a warning for nonzero offsets, and an error
> > > -		 * if they don't add up to a full page.  */
> > > -		if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) {
> > > -			if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE)
> > > -				btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info,
> > > -				   "partial page read in btrfs with offset %u and length %u",
> > > -					bvec->bv_offset, bvec->bv_len);
> > > -			else
> > > -				btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info,
> > > -				   "incomplete page read in btrfs with offset %u and "
> > > -				   "length %u",
> > > -					bvec->bv_offset, bvec->bv_len);
> > > -		}
> > > -
> > > -		start = page_offset(page);
> > > -		end = start + bvec->bv_offset + bvec->bv_len - 1;
> > > -		len = bvec->bv_len;
> > > -
> > > +		start = page_offset(page) + bvec->bv_offset;
> > > +		end = start + bvec->bv_len - 1;
> > > +		nr_sectors = bvec->bv_len >> inode->i_sb->s_blocksize_bits;
> > > 
> > >  		mirror = io_bio->mirror_num;
> > > 
> > > -		if (likely(uptodate && tree->ops &&
> > > -			   tree->ops->readpage_end_io_hook)) {
> > > +
> > > +next_block:
> > > +		if (likely(uptodate)) {
> > 
> > Any reason of killing (tree->ops && tree->ops->readpage_end_io_hook)?
> 
> In the subpagesize-blocksize scenario, for extent buffers we need the ability
> to read just a single extent buffer rather than the complete contents of the
> page containing the extent buffer. Similarly, in the corresponding endio
> function we need to verify a single extent buffer rather than the contents of
> the full page. Hence I ended up removing the btree_readpage_end_io_hook() and
> btree_io_failed_hook() functions and had the verification functions invoked
> directly by the endio function.
> 
> So since the data "read page" code was the only one left to set
> extent_io_tree->ops->readpage_end_io_hook, I removed the code that checks for
> its existence. Now I realize that this is not the right thing to do; I will
> restore the condition check to its original state.
> 
> > 
> > >  			ret = tree->ops->readpage_end_io_hook(io_bio, offset,
> > > 
> > > -							      page, start, end,
> > > -							      mirror);
> > > +							page, start,
> > > +							start + root->sectorsize - 1,
> > > +							mirror);
> > > 
> > >  			if (ret)
> > >  			
> > >  				uptodate = 0;
> > >  			
> > >  			else
> > >  			
> > >  				clean_io_failure(inode, start, page, 0);
> > >  		
> > >  		}
> > > 
> > > -		if (likely(uptodate))
> > > -			goto readpage_ok;
> > > -
> > > -		if (tree->ops && tree->ops->readpage_io_failed_hook) {
> > > -			ret = tree->ops->readpage_io_failed_hook(page, mirror);
> > > -			if (!ret && !err &&
> > > -			    test_bit(BIO_UPTODATE, &bio->bi_flags))
> > > -				uptodate = 1;
> > > -		} else {
> > > +		if (!uptodate) {
> > > 
> > >  			/*
> > >  			
> > >  			 * The generic bio_readpage_error handles errors the
> > >  			 * following way: If possible, new read requests are
> > > 
> > > @@ -2623,61 +2682,63 @@ static void end_bio_extent_readpage(struct bio
> > > *bio, int err)> 
> > >  			 * can't handle the error it will return -EIO and we
> > >  			 * remain responsible for that page.
> > >  			 */
> > > 
> > > -			ret = bio_readpage_error(bio, offset, page, start, end,
> > > -						 mirror);
> > > +			ret = bio_readpage_error(bio, offset, page,
> > > +						start, start + root->sectorsize - 1,
> > > +						mirror);
> > > 
> > >  			if (ret == 0) {
> > > 
> > > -				uptodate =
> > > -					test_bit(BIO_UPTODATE, &bio->bi_flags);
> > > +				uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
> > > 
> > >  				if (err)
> > >  				
> > >  					uptodate = 0;
> > > 
> > > -				offset += len;
> > > -				continue;
> > > +				offset += root->sectorsize;
> > > +				if (--nr_sectors) {
> > > +					start += root->sectorsize;
> > > +					goto next_block;
> > > +				} else {
> > > +					continue;
> > > +				}
> > > 
> > >  			}
> > >  		
> > >  		}
> > > 
> > > -readpage_ok:
> > > -		if (likely(uptodate)) {
> > > -			loff_t i_size = i_size_read(inode);
> > > -			pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
> > > -			unsigned off;
> > > -
> > > -			/* Zero out the end if this page straddles i_size */
> > > -			off = i_size & (PAGE_CACHE_SIZE-1);
> > > -			if (page->index == end_index && off)
> > > -				zero_user_segment(page, off, PAGE_CACHE_SIZE);
> > > -			SetPageUptodate(page);
> > > +
> > > +		if (uptodate) {
> > > +			set_page_blks_state(page, 1 << BLK_STATE_UPTODATE, start,
> > > +					start + root->sectorsize - 1);
> > > +			check_page_uptodate(page);
> > > 
> > >  		} else {
> > >  		
> > >  			ClearPageUptodate(page);
> > >  			SetPageError(page);
> > >  		
> > >  		}
> > > 
> > > -		unlock_page(page);
> > > -		offset += len;
> > > -
> > > -		if (unlikely(!uptodate)) {
> > > -			if (extent_len) {
> > > -				endio_readpage_release_extent(tree,
> > > -							      extent_start,
> > > -							      extent_len, 1);
> > > -				extent_start = 0;
> > > -				extent_len = 0;
> > > -			}
> > > -			endio_readpage_release_extent(tree, start,
> > > -						      end - start + 1, 0);
> > > -		} else if (!extent_len) {
> > > -			extent_start = start;
> > > -			extent_len = end + 1 - start;
> > > -		} else if (extent_start + extent_len == start) {
> > > -			extent_len += end + 1 - start;
> > > -		} else {
> > > -			endio_readpage_release_extent(tree, extent_start,
> > > -						      extent_len, uptodate);
> > > -			extent_start = start;
> > > -			extent_len = end + 1 - start;
> > > +
> > > +		offset += root->sectorsize;
> > > +
> > > +		if (--nr_sectors) {
> > > +			clear_page_blks_state(page, 1 << BLK_STATE_IO,
> > > +					start, start + root->sectorsize - 1);
> > 
> > private->io_lock is not acquired here but it is below.
> > 
> > IIUC, this can be protected by EXTENT_LOCKED.
> >
> 
> private->io_lock plays the same role as BH_Uptodate_Lock (see
> end_buffer_async_read()) i.e. without the io_lock we may end up in the
> following situation,
> 
> NOTE: Assume 64k page size and 4k block size. Also assume that the first 12
> blocks of the page are contiguous while the next 4 blocks are contiguous. When
> reading the page we end up submitting two "logical address space" bios. So
> end_bio_extent_readpage function is invoked twice (once for each bio).
> 
> |-------------------------+-------------------------+-------------|
> | Task A                  | Task B                  | Task C      |
> |-------------------------+-------------------------+-------------|
> | end_bio_extent_readpage |                         |             |
> | process block 0         |                         |             |
> | - clear BLK_STATE_IO    |                         |             |
> | - page_read_complete    |                         |             |
> | process block 1         |                         |             |
> | ...                     |                         |             |
> | ...                     |                         |             |
> | ...                     | end_bio_extent_readpage |             |
> | ...                     | process block 0         |             |
> | ...                     | - clear BLK_STATE_IO    |             |
> | ...                     | - page_read_complete    |             |
> | ...                     | process block 1         |             |
> | ...                     | ...                     |             |
> | process block 11        | process block 3         |             |
> | - clear BLK_STATE_IO    | - clear BLK_STATE_IO    |             |
> | - page_read_complete    | - page_read_complete    |             |
> |   - returns true        |   - returns true        |             |
> |   - unlock_page()       |                         |             |
> |                         |                         | lock_page() |
> |                         |   - unlock_page()       |             |
> |-------------------------+-------------------------+-------------|
> 
> So we end up incorrectly unlocking the page twice and "Task C" ends up working
> on an unlocked page. So private->io_lock makes sure that only one of the tasks
> gets "true" as the return value when page_read_complete() is invoked. As an
> optimization the patch gets the io_lock only when nr_sectors counter reaches
> the value 0 (i.e. when the last block of the bio_vec is being processed).
> Please let me know if my analysis was incorrect.

Thanks for the nice explanation, it looks reasonable to me.

Thanks,

-liubo

> 
> Also, I noticed that page_read_complete() and page_write_complete() can be
> replaced by just one function i.e. page_io_complete().
> 
> 
> -- 
> chandan
> 
David Sterba Feb. 10, 2016, 10:39 a.m. UTC | #4
On Fri, Jun 19, 2015 at 03:15:01PM +0530, Chandan Rajendra wrote:
> > private->io_lock is not acquired here but it is below.
> > 
> > IIUC, this can be protected by EXTENT_LOCKED.
> >
> 
> private->io_lock plays the same role as BH_Uptodate_Lock (see
> end_buffer_async_read()) i.e. without the io_lock we may end up in the
> following situation,
> 
> NOTE: Assume 64k page size and 4k block size. Also assume that the first 12
> blocks of the page are contiguous while the next 4 blocks are contiguous. When
> reading the page we end up submitting two "logical address space" bios. So
> end_bio_extent_readpage function is invoked twice (once for each bio).
> 
> |-------------------------+-------------------------+-------------|
> | Task A                  | Task B                  | Task C      |
> |-------------------------+-------------------------+-------------|
> | end_bio_extent_readpage |                         |             |
> | process block 0         |                         |             |
> | - clear BLK_STATE_IO    |                         |             |
> | - page_read_complete    |                         |             |
> | process block 1         |                         |             |
> | ...                     |                         |             |
> | ...                     |                         |             |
> | ...                     | end_bio_extent_readpage |             |
> | ...                     | process block 0         |             |
> | ...                     | - clear BLK_STATE_IO    |             |
> | ...                     | - page_read_complete    |             |
> | ...                     | process block 1         |             |
> | ...                     | ...                     |             |
> | process block 11        | process block 3         |             |
> | - clear BLK_STATE_IO    | - clear BLK_STATE_IO    |             |
> | - page_read_complete    | - page_read_complete    |             |
> |   - returns true        |   - returns true        |             |
> |   - unlock_page()       |                         |             |
> |                         |                         | lock_page() |
> |                         |   - unlock_page()       |             |
> |-------------------------+-------------------------+-------------|
> 
> So we end up incorrectly unlocking the page twice and "Task C" ends up working
> on an unlocked page. So private->io_lock makes sure that only one of the tasks
> gets "true" as the return value when page_read_complete() is invoked. As an
> optimization the patch gets the io_lock only when nr_sectors counter reaches
> the value 0 (i.e. when the last block of the bio_vec is being processed).
> Please let me know if my analysis was incorrect.
> 
> Also, I noticed that page_read_complete() and page_write_complete() can be
> replaced by just one function i.e. page_io_complete().

The explanations and the table would be good in the changelog and as
comments. I think we'll need to consider the smaller blocks more often,
so some examples and locking rules would be useful, e.g. documented in
this file.
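
For example, the rule described earlier in this thread could be captured in
extent_io.c with a comment along these lines (wording is only a suggestion):

/*
 * Block state bits live in the bitmap pointed to by page->private
 * (struct btrfs_page_private->bstate) and are updated from bio endio
 * context.  pg_private->io_lock serializes the final "is all I/O on
 * this page done?" check: the task that clears the last BLK_STATE_IO
 * bit while holding io_lock is the only one allowed to unlock the
 * page, so the page is never unlocked twice when several bios
 * complete against it.
 */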
David Sterba Feb. 10, 2016, 10:44 a.m. UTC | #5
On Tue, Jun 23, 2015 at 04:37:48PM +0800, Liu Bo wrote:
...
> > | - clear BLK_STATE_IO    | - clear BLK_STATE_IO    |             |
> > | - page_read_complete    | - page_read_complete    |             |
> > |   - returns true        |   - returns true        |             |
> > |   - unlock_page()       |                         |             |
> > |                         |                         | lock_page() |
> > |                         |   - unlock_page()       |             |
> > |-------------------------+-------------------------+-------------|
> > 
> > So we end up incorrectly unlocking the page twice and "Task C" ends up working
> > on an unlocked page. So private->io_lock makes sure that only one of the tasks
> > gets "true" as the return value when page_read_complete() is invoked. As an
> > optimization the patch gets the io_lock only when nr_sectors counter reaches
> > the value 0 (i.e. when the last block of the bio_vec is being processed).
> > Please let me know if my analysis was incorrect.
> 
> Thanks for the nice explanation, it looks reasonable to me.

Please don't hesitate to add your Reviewed-by if you spent time on that
and think it's OK; this really helps to make decisions about merging.
Chandan Rajendra Feb. 11, 2016, 5:42 a.m. UTC | #6
On Wednesday 10 Feb 2016 11:39:25 David Sterba wrote:
> 
> The explanations and the table would be good in the changelog and as
> comments. I think we'll need to consider the smaller blocks more often,
> so some examples and locking rules would be useful, e.g. documented in
> this file.

David, I agree.  As suggested, I will add the documentation to the commit
message and as comments in the code.

Patch

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 782f3bc..d37badb 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1325,6 +1325,88 @@  int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
 				cached_state, mask);
 }
 
+static int modify_page_blks_state(struct page *page,
+				unsigned long blk_states,
+				u64 start, u64 end, int set)
+{
+	struct inode *inode = page->mapping->host;
+	unsigned long *bitmap;
+	unsigned long state;
+	u64 nr_blks;
+	u64 blk;
+
+	BUG_ON(!PagePrivate(page));
+
+	bitmap = ((struct btrfs_page_private *)page->private)->bstate;
+
+	blk = (start & (PAGE_CACHE_SIZE - 1)) >> inode->i_blkbits;
+	nr_blks = (end - start + 1) >> inode->i_blkbits;
+
+	while (nr_blks--) {
+		state = find_next_bit(&blk_states, BLK_NR_STATE, 0);
+
+		while (state < BLK_NR_STATE) {
+			if (set)
+				set_bit((blk * BLK_NR_STATE) + state, bitmap);
+			else
+				clear_bit((blk * BLK_NR_STATE) + state, bitmap);
+
+			state = find_next_bit(&blk_states, BLK_NR_STATE,
+					state + 1);
+		}
+
+		++blk;
+	}
+
+	return 0;
+}
+
+int set_page_blks_state(struct page *page, unsigned long blk_states,
+			u64 start, u64 end)
+{
+	return modify_page_blks_state(page, blk_states, start, end, 1);
+}
+
+int clear_page_blks_state(struct page *page, unsigned long blk_states,
+			u64 start, u64 end)
+{
+	return modify_page_blks_state(page, blk_states, start, end, 0);
+}
+
+int test_page_blks_state(struct page *page, enum blk_state blk_state,
+			u64 start, u64 end, int check_all)
+{
+	struct inode *inode = page->mapping->host;
+	unsigned long *bitmap;
+	unsigned long blk;
+	u64 nr_blks;
+	int found = 0;
+
+	BUG_ON(!PagePrivate(page));
+
+	bitmap = ((struct btrfs_page_private *)page->private)->bstate;
+
+	blk = (start & (PAGE_CACHE_SIZE - 1)) >> inode->i_blkbits;
+	nr_blks = (end - start + 1) >> inode->i_blkbits;
+
+	while (nr_blks--) {
+		if (test_bit((blk * BLK_NR_STATE) + blk_state, bitmap)) {
+			if (!check_all)
+				return 1;
+			found = 1;
+		} else if (check_all) {
+			return 0;
+		}
+
+		++blk;
+	}
+
+	if (!check_all && !found)
+		return 0;
+
+	return 1;
+}
+
 /*
  * either insert or lock state struct between start and end use mask to tell
  * us if waiting is desired.
@@ -1982,14 +2064,22 @@  int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
  * helper function to set a given page up to date if all the
  * extents in the tree for that page are up to date
  */
-static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
+static void check_page_uptodate(struct page *page)
 {
 	u64 start = page_offset(page);
 	u64 end = start + PAGE_CACHE_SIZE - 1;
-	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
+	if (test_page_blks_state(page, BLK_STATE_UPTODATE, start, end, 1))
 		SetPageUptodate(page);
 }
 
+static int page_read_complete(struct page *page)
+{
+	u64 start = page_offset(page);
+	u64 end = start + PAGE_CACHE_SIZE - 1;
+
+	return !test_page_blks_state(page, BLK_STATE_IO, start, end, 0);
+}
+
 int free_io_failure(struct inode *inode, struct io_failure_record *rec)
 {
 	int ret;
@@ -2311,7 +2401,9 @@  int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
 	 *	a) deliver good data to the caller
 	 *	b) correct the bad sectors on disk
 	 */
-	if (failed_bio->bi_vcnt > 1) {
+	if ((failed_bio->bi_vcnt > 1)
+		|| (failed_bio->bi_io_vec->bv_len
+			> BTRFS_I(inode)->root->sectorsize)) {
 		/*
 		 * to fulfill b), we need to know the exact failing sectors, as
 		 * we don't want to rewrite any more than the failed ones. thus,
@@ -2520,18 +2612,6 @@  static void end_bio_extent_writepage(struct bio *bio, int err)
 	bio_put(bio);
 }
 
-static void
-endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
-			      int uptodate)
-{
-	struct extent_state *cached = NULL;
-	u64 end = start + len - 1;
-
-	if (uptodate && tree->track_uptodate)
-		set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC);
-	unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
-}
-
 /*
  * after a readpage IO is done, we need to:
  * clear the uptodate bits on error
@@ -2548,14 +2628,16 @@  static void end_bio_extent_readpage(struct bio *bio, int err)
 	struct bio_vec *bvec;
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+	struct extent_state *cached = NULL;
+	struct btrfs_page_private *pg_private;
 	struct extent_io_tree *tree;
+	unsigned long flags;
 	u64 offset = 0;
 	u64 start;
 	u64 end;
-	u64 len;
-	u64 extent_start = 0;
-	u64 extent_len = 0;
+	int nr_sectors;
 	int mirror;
+	int unlock;
 	int ret;
 	int i;
 
@@ -2565,54 +2647,31 @@  static void end_bio_extent_readpage(struct bio *bio, int err)
 	bio_for_each_segment_all(bvec, bio, i) {
 		struct page *page = bvec->bv_page;
 		struct inode *inode = page->mapping->host;
+		struct btrfs_root *root = BTRFS_I(inode)->root;
 
 		pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
 			 "mirror=%u\n", (u64)bio->bi_iter.bi_sector, err,
 			 io_bio->mirror_num);
 		tree = &BTRFS_I(inode)->io_tree;
 
-		/* We always issue full-page reads, but if some block
-		 * in a page fails to read, blk_update_request() will
-		 * advance bv_offset and adjust bv_len to compensate.
-		 * Print a warning for nonzero offsets, and an error
-		 * if they don't add up to a full page.  */
-		if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) {
-			if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE)
-				btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info,
-				   "partial page read in btrfs with offset %u and length %u",
-					bvec->bv_offset, bvec->bv_len);
-			else
-				btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info,
-				   "incomplete page read in btrfs with offset %u and "
-				   "length %u",
-					bvec->bv_offset, bvec->bv_len);
-		}
-
-		start = page_offset(page);
-		end = start + bvec->bv_offset + bvec->bv_len - 1;
-		len = bvec->bv_len;
-
+		start = page_offset(page) + bvec->bv_offset;
+		end = start + bvec->bv_len - 1;
+		nr_sectors = bvec->bv_len >> inode->i_sb->s_blocksize_bits;
 		mirror = io_bio->mirror_num;
-		if (likely(uptodate && tree->ops &&
-			   tree->ops->readpage_end_io_hook)) {
+
+next_block:
+		if (likely(uptodate)) {
 			ret = tree->ops->readpage_end_io_hook(io_bio, offset,
-							      page, start, end,
-							      mirror);
+							page, start,
+							start + root->sectorsize - 1,
+							mirror);
 			if (ret)
 				uptodate = 0;
 			else
 				clean_io_failure(inode, start, page, 0);
 		}
 
-		if (likely(uptodate))
-			goto readpage_ok;
-
-		if (tree->ops && tree->ops->readpage_io_failed_hook) {
-			ret = tree->ops->readpage_io_failed_hook(page, mirror);
-			if (!ret && !err &&
-			    test_bit(BIO_UPTODATE, &bio->bi_flags))
-				uptodate = 1;
-		} else {
+		if (!uptodate) {
 			/*
 			 * The generic bio_readpage_error handles errors the
 			 * following way: If possible, new read requests are
@@ -2623,61 +2682,63 @@  static void end_bio_extent_readpage(struct bio *bio, int err)
 			 * can't handle the error it will return -EIO and we
 			 * remain responsible for that page.
 			 */
-			ret = bio_readpage_error(bio, offset, page, start, end,
-						 mirror);
+			ret = bio_readpage_error(bio, offset, page,
+						start, start + root->sectorsize - 1,
+						mirror);
 			if (ret == 0) {
-				uptodate =
-					test_bit(BIO_UPTODATE, &bio->bi_flags);
+				uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 				if (err)
 					uptodate = 0;
-				offset += len;
-				continue;
+				offset += root->sectorsize;
+				if (--nr_sectors) {
+					start += root->sectorsize;
+					goto next_block;
+				} else {
+					continue;
+				}
 			}
 		}
-readpage_ok:
-		if (likely(uptodate)) {
-			loff_t i_size = i_size_read(inode);
-			pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
-			unsigned off;
-
-			/* Zero out the end if this page straddles i_size */
-			off = i_size & (PAGE_CACHE_SIZE-1);
-			if (page->index == end_index && off)
-				zero_user_segment(page, off, PAGE_CACHE_SIZE);
-			SetPageUptodate(page);
+
+		if (uptodate) {
+			set_page_blks_state(page, 1 << BLK_STATE_UPTODATE, start,
+					start + root->sectorsize - 1);
+			check_page_uptodate(page);
 		} else {
 			ClearPageUptodate(page);
 			SetPageError(page);
 		}
-		unlock_page(page);
-		offset += len;
-
-		if (unlikely(!uptodate)) {
-			if (extent_len) {
-				endio_readpage_release_extent(tree,
-							      extent_start,
-							      extent_len, 1);
-				extent_start = 0;
-				extent_len = 0;
-			}
-			endio_readpage_release_extent(tree, start,
-						      end - start + 1, 0);
-		} else if (!extent_len) {
-			extent_start = start;
-			extent_len = end + 1 - start;
-		} else if (extent_start + extent_len == start) {
-			extent_len += end + 1 - start;
-		} else {
-			endio_readpage_release_extent(tree, extent_start,
-						      extent_len, uptodate);
-			extent_start = start;
-			extent_len = end + 1 - start;
+
+		offset += root->sectorsize;
+
+		if (--nr_sectors) {
+			clear_page_blks_state(page, 1 << BLK_STATE_IO,
+					start, start + root->sectorsize - 1);
+			clear_extent_bit(tree, start, start + root->sectorsize - 1,
+					EXTENT_LOCKED, 1, 0, &cached, GFP_ATOMIC);
+			start += root->sectorsize;
+			goto next_block;
 		}
+
+		WARN_ON(!PagePrivate(page));
+
+		pg_private = (struct btrfs_page_private *)page->private;
+
+		spin_lock_irqsave(&pg_private->io_lock, flags);
+
+		clear_page_blks_state(page, 1 << BLK_STATE_IO,
+				start, start + root->sectorsize - 1);
+
+		unlock = page_read_complete(page);
+
+		spin_unlock_irqrestore(&pg_private->io_lock, flags);
+
+		clear_extent_bit(tree, start, start + root->sectorsize - 1,
+				EXTENT_LOCKED, 1, 0, &cached, GFP_ATOMIC);
+
+		if (unlock)
+			unlock_page(page);
 	}
 
-	if (extent_len)
-		endio_readpage_release_extent(tree, extent_start, extent_len,
-					      uptodate);
 	if (io_bio->end_io)
 		io_bio->end_io(io_bio, err);
 	bio_put(bio);
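
page_read_complete() above is a new helper introduced earlier in this patch; its definition is not part of the hunks shown here. Under the per-block state scheme it is expected to report whether any block of the page is still under I/O, so a minimal sketch (illustrative only, the real helper may differ in detail) would be:

static int page_read_complete(struct page *page)
{
	u64 start = page_offset(page);
	u64 end = start + PAGE_CACHE_SIZE - 1;

	/* The read of the whole page is done once no block in it still
	 * carries BLK_STATE_IO. */
	return !test_page_blks_state(page, BLK_STATE_IO, start, end, 0);
}

The caller holds pg_private->io_lock across clearing its own block's BLK_STATE_IO and this check, so exactly one of the bio completions racing on a page observes "last block done" and performs the unlock_page().
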
@@ -2859,13 +2920,36 @@  static void attach_extent_buffer_page(struct extent_buffer *eb,
 	}
 }
 
-void set_page_extent_mapped(struct page *page)
+int set_page_extent_mapped(struct page *page)
 {
+	struct btrfs_page_private *pg_private;
+
 	if (!PagePrivate(page)) {
+		pg_private = kzalloc(sizeof(*pg_private), GFP_NOFS);
+		if (!pg_private)
+			return -ENOMEM;
+
+		spin_lock_init(&pg_private->io_lock);
+
 		SetPagePrivate(page);
 		page_cache_get(page);
-		set_page_private(page, EXTENT_PAGE_PRIVATE);
+
+		set_page_private(page, (unsigned long)pg_private);
+	}
+
+	return 0;
+}
+
+int clear_page_extent_mapped(struct page *page)
+{
+	if (PagePrivate(page)) {
+		kfree((struct btrfs_page_private *)(page->private));
+		ClearPagePrivate(page);
+		set_page_private(page, 0);
+		page_cache_release(page);
 	}
+
+	return 0;
 }
 
 static struct extent_map *
@@ -2909,6 +2993,7 @@  static int __do_readpage(struct extent_io_tree *tree,
 			 unsigned long *bio_flags, int rw)
 {
 	struct inode *inode = page->mapping->host;
+	struct extent_state *cached = NULL;
 	u64 start = page_offset(page);
 	u64 page_end = start + PAGE_CACHE_SIZE - 1;
 	u64 end;
@@ -2964,8 +3049,8 @@  static int __do_readpage(struct extent_io_tree *tree,
 			memset(userpage + pg_offset, 0, iosize);
 			flush_dcache_page(page);
 			kunmap_atomic(userpage);
-			set_extent_uptodate(tree, cur, cur + iosize - 1,
-					    &cached, GFP_NOFS);
+			set_page_blks_state(page, 1 << BLK_STATE_UPTODATE, cur,
+					cur + iosize - 1);
 			if (!parent_locked)
 				unlock_extent_cached(tree, cur,
 						     cur + iosize - 1,
@@ -3017,8 +3102,8 @@  static int __do_readpage(struct extent_io_tree *tree,
 			flush_dcache_page(page);
 			kunmap_atomic(userpage);
 
-			set_extent_uptodate(tree, cur, cur + iosize - 1,
-					    &cached, GFP_NOFS);
+			set_page_blks_state(page, 1 << BLK_STATE_UPTODATE, cur,
+					cur + iosize - 1);
 			unlock_extent_cached(tree, cur, cur + iosize - 1,
 			                     &cached, GFP_NOFS);
 			cur = cur + iosize;
@@ -3026,9 +3111,9 @@  static int __do_readpage(struct extent_io_tree *tree,
 			continue;
 		}
 		/* the get_extent function already copied into the page */
-		if (test_range_bit(tree, cur, cur_end,
-				   EXTENT_UPTODATE, 1, NULL)) {
-			check_page_uptodate(tree, page);
+		if (test_page_blks_state(page, BLK_STATE_UPTODATE, cur,
+						cur_end, 1)) {
+			check_page_uptodate(page);
 			if (!parent_locked)
 				unlock_extent(tree, cur, cur + iosize - 1);
 			cur = cur + iosize;
@@ -3048,6 +3133,9 @@  static int __do_readpage(struct extent_io_tree *tree,
 		}
 
 		pnr -= page->index;
+
+		set_page_blks_state(page, 1 << BLK_STATE_IO, cur,
+				cur + iosize - 1);
 		ret = submit_extent_page(rw, tree, page,
 					 sector, disk_io_size, pg_offset,
 					 bdev, bio, pnr,
@@ -3059,8 +3147,11 @@  static int __do_readpage(struct extent_io_tree *tree,
 			*bio_flags = this_bio_flag;
 		} else {
 			SetPageError(page);
+			clear_page_blks_state(page, 1 << BLK_STATE_IO, cur,
+					cur + iosize - 1);
 			if (!parent_locked)
-				unlock_extent(tree, cur, cur + iosize - 1);
+				unlock_extent_cached(tree, cur, cur + iosize - 1,
+						&cached, GFP_NOFS);
 		}
 		cur = cur + iosize;
 		pg_offset += iosize;
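
check_page_uptodate() likewise loses its io_tree argument in this patch; the reworked helper sits in an earlier hunk. With per-block state it presumably promotes the page to uptodate only when every block in it is, roughly along these lines (a sketch, not the patch's actual body):

static void check_page_uptodate(struct page *page)
{
	u64 start = page_offset(page);
	u64 end = start + PAGE_CACHE_SIZE - 1;

	/* check_all = 1: every block in the page must have
	 * BLK_STATE_UPTODATE before SetPageUptodate() is allowed. */
	if (test_page_blks_state(page, BLK_STATE_UPTODATE, start, end, 1))
		SetPageUptodate(page);
}
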
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index c668f36..541b40a 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -51,11 +51,22 @@ 
 #define PAGE_SET_PRIVATE2	(1 << 4)
 #define PAGE_SET_ERROR		(1 << 5)
 
+enum blk_state {
+	BLK_STATE_UPTODATE,
+	BLK_STATE_DIRTY,
+	BLK_STATE_IO,
+	BLK_NR_STATE,
+};
+
 /*
- * page->private values.  Every page that is controlled by the extent
- * map has page->private set to one.
- */
-#define EXTENT_PAGE_PRIVATE 1
+  The maximum number of blocks per page (i.e. 32) occurs when using 2k
+  as the block size and having 64k as the page size.
+*/
+#define BLK_STATE_NR_LONGS DIV_ROUND_UP(BLK_NR_STATE * 32, BITS_PER_LONG)
+struct btrfs_page_private {
+	spinlock_t io_lock;
+	unsigned long bstate[BLK_STATE_NR_LONGS];
+};
 
 struct extent_state;
 struct btrfs_root;
@@ -259,7 +270,14 @@  int extent_readpages(struct extent_io_tree *tree,
 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		__u64 start, __u64 len, get_extent_t *get_extent);
 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
-void set_page_extent_mapped(struct page *page);
+int set_page_extent_mapped(struct page *page);
+int clear_page_extent_mapped(struct page *page);
+int set_page_blks_state(struct page *page, unsigned long blk_states,
+ 			u64 start, u64 end);
+int clear_page_blks_state(struct page *page, unsigned long blk_states,
+ 			u64 start, u64 end);
+int test_page_blks_state(struct page *page, enum blk_state blk_state,
+			u64 start, u64 end, int check_all);
 
 struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 					  u64 start);
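
For reference, the worst case called out in the comment above works out as follows (assuming 2k blocks on a 64k page and a 64-bit unsigned long):

/*
 * blocks per page      = 64k / 2k              = 32
 * state bits per page  = BLK_NR_STATE * 32     = 3 * 32 = 96
 * BLK_STATE_NR_LONGS   = DIV_ROUND_UP(96, 64)  = 2
 *
 * i.e. struct btrfs_page_private costs a spinlock plus two unsigned
 * longs of block state per data page (three longs on 32-bit, since
 * DIV_ROUND_UP(96, 32) = 3), and page->private now points at it
 * instead of holding the old EXTENT_PAGE_PRIVATE magic value.
 */
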
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0020b56..8262f83 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6622,7 +6622,6 @@  struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 	struct btrfs_key found_key;
 	struct extent_map *em = NULL;
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
-	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct btrfs_trans_handle *trans = NULL;
 	const bool new_inline = !page || create;
 
@@ -6800,8 +6799,8 @@  next:
 			kunmap(page);
 			btrfs_mark_buffer_dirty(leaf);
 		}
-		set_extent_uptodate(io_tree, em->start,
-				    extent_map_end(em) - 1, NULL, GFP_NOFS);
+		set_page_blks_state(page, 1 << BLK_STATE_UPTODATE, em->start,
+				extent_map_end(em) - 1);
 		goto insert;
 	}
 not_found:
@@ -8392,11 +8391,9 @@  static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
 	map = &BTRFS_I(page->mapping->host)->extent_tree;
 	ret = try_release_extent_mapping(map, tree, page, gfp_flags);
-	if (ret == 1) {
-		ClearPagePrivate(page);
-		set_page_private(page, 0);
-		page_cache_release(page);
-	}
+	if (ret == 1)
+		clear_page_extent_mapped(page);
+
 	return ret;
 }
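
One knock-on effect of set_page_extent_mapped() changing from void to int is that callers gain an allocation failure to handle, since the helper now kzalloc()s the per-page btrfs_page_private. A hypothetical caller (for illustration only, not taken from this patch) would look like:

static int example_prepare_page(struct page *page)
{
	int ret;

	/* May fail with -ENOMEM while allocating btrfs_page_private. */
	ret = set_page_extent_mapped(page);
	if (ret)
		return ret;

	/*
	 * page->private now points at a zeroed btrfs_page_private with
	 * io_lock initialized; block state can be tracked through
	 * set_page_blks_state()/clear_page_blks_state(), and
	 * clear_page_extent_mapped() tears it down again on release.
	 */
	return 0;
}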