Message ID | 1433172176-8742-2-git-send-email-chandan@linux.vnet.ibm.com (mailing list archive)
---|---
State | New, archived
On Mon, Jun 01, 2015 at 08:52:36PM +0530, Chandan Rajendra wrote: > For the subpagesize-blocksize scenario, a page can contain multiple > blocks. In such cases, this patch handles reading data from files. > > To track the status of individual blocks of a page, this patch makes use of a > bitmap pointed to by page->private. Start going through the patchset, it's not easy though. Several comments are following. > > Signed-off-by: Chandan Rajendra <chandan@linux.vnet.ibm.com> > --- > fs/btrfs/extent_io.c | 301 +++++++++++++++++++++++++++++++++------------------ > fs/btrfs/extent_io.h | 28 ++++- > fs/btrfs/inode.c | 13 +-- > 3 files changed, 224 insertions(+), 118 deletions(-) > > diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c > index 782f3bc..d37badb 100644 > --- a/fs/btrfs/extent_io.c > +++ b/fs/btrfs/extent_io.c > @@ -1325,6 +1325,88 @@ int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, > cached_state, mask); > } > > +static int modify_page_blks_state(struct page *page, > + unsigned long blk_states, > + u64 start, u64 end, int set) > +{ > + struct inode *inode = page->mapping->host; > + unsigned long *bitmap; > + unsigned long state; > + u64 nr_blks; > + u64 blk; > + > + BUG_ON(!PagePrivate(page)); > + > + bitmap = ((struct btrfs_page_private *)page->private)->bstate; > + > + blk = (start & (PAGE_CACHE_SIZE - 1)) >> inode->i_blkbits; > + nr_blks = (end - start + 1) >> inode->i_blkbits; > + > + while (nr_blks--) { > + state = find_next_bit(&blk_states, BLK_NR_STATE, 0); Looks like we don't need to do find_next_bit for every block. > + > + while (state < BLK_NR_STATE) { > + if (set) > + set_bit((blk * BLK_NR_STATE) + state, bitmap); > + else > + clear_bit((blk * BLK_NR_STATE) + state, bitmap); > + > + state = find_next_bit(&blk_states, BLK_NR_STATE, > + state + 1); > + } > + > + ++blk; > + } > + > + return 0; > +} > + > +int set_page_blks_state(struct page *page, unsigned long blk_states, > + u64 start, u64 end) > +{ > + return modify_page_blks_state(page, blk_states, start, end, 1); > +} > + > +int clear_page_blks_state(struct page *page, unsigned long blk_states, > + u64 start, u64 end) > +{ > + return modify_page_blks_state(page, blk_states, start, end, 0); > +} > + > +int test_page_blks_state(struct page *page, enum blk_state blk_state, > + u64 start, u64 end, int check_all) > +{ > + struct inode *inode = page->mapping->host; > + unsigned long *bitmap; > + unsigned long blk; > + u64 nr_blks; > + int found = 0; > + > + BUG_ON(!PagePrivate(page)); > + > + bitmap = ((struct btrfs_page_private *)page->private)->bstate; > + > + blk = (start & (PAGE_CACHE_SIZE - 1)) >> inode->i_blkbits; > + nr_blks = (end - start + 1) >> inode->i_blkbits; > + > + while (nr_blks--) { > + if (test_bit((blk * BLK_NR_STATE) + blk_state, bitmap)) { > + if (!check_all) > + return 1; > + found = 1; > + } else if (check_all) { > + return 0; > + } > + > + ++blk; > + } > + > + if (!check_all && !found) > + return 0; > + > + return 1; > +} > + > /* > * either insert or lock state struct between start and end use mask to tell > * us if waiting is desired. 
> @@ -1982,14 +2064,22 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, > * helper function to set a given page up to date if all the > * extents in the tree for that page are up to date > */ > -static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) > +static void check_page_uptodate(struct page *page) > { > u64 start = page_offset(page); > u64 end = start + PAGE_CACHE_SIZE - 1; > - if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) > + if (test_page_blks_state(page, BLK_STATE_UPTODATE, start, end, 1)) > SetPageUptodate(page); > } > > +static int page_read_complete(struct page *page) > +{ > + u64 start = page_offset(page); > + u64 end = start + PAGE_CACHE_SIZE - 1; > + > + return !test_page_blks_state(page, BLK_STATE_IO, start, end, 0); > +} > + > int free_io_failure(struct inode *inode, struct io_failure_record *rec) > { > int ret; > @@ -2311,7 +2401,9 @@ int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio, > * a) deliver good data to the caller > * b) correct the bad sectors on disk > */ > - if (failed_bio->bi_vcnt > 1) { > + if ((failed_bio->bi_vcnt > 1) > + || (failed_bio->bi_io_vec->bv_len > + > BTRFS_I(inode)->root->sectorsize)) { > /* > * to fulfill b), we need to know the exact failing sectors, as > * we don't want to rewrite any more than the failed ones. thus, > @@ -2520,18 +2612,6 @@ static void end_bio_extent_writepage(struct bio *bio, int err) > bio_put(bio); > } > > -static void > -endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len, > - int uptodate) > -{ > - struct extent_state *cached = NULL; > - u64 end = start + len - 1; > - > - if (uptodate && tree->track_uptodate) > - set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC); > - unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); > -} > - > /* > * after a readpage IO is done, we need to: > * clear the uptodate bits on error > @@ -2548,14 +2628,16 @@ static void end_bio_extent_readpage(struct bio *bio, int err) > struct bio_vec *bvec; > int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); > struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); > + struct extent_state *cached = NULL; > + struct btrfs_page_private *pg_private; > struct extent_io_tree *tree; > + unsigned long flags; > u64 offset = 0; > u64 start; > u64 end; > - u64 len; > - u64 extent_start = 0; > - u64 extent_len = 0; > + int nr_sectors; > int mirror; > + int unlock; > int ret; > int i; > > @@ -2565,54 +2647,31 @@ static void end_bio_extent_readpage(struct bio *bio, int err) > bio_for_each_segment_all(bvec, bio, i) { > struct page *page = bvec->bv_page; > struct inode *inode = page->mapping->host; > + struct btrfs_root *root = BTRFS_I(inode)->root; > > pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, " > "mirror=%u\n", (u64)bio->bi_iter.bi_sector, err, > io_bio->mirror_num); > tree = &BTRFS_I(inode)->io_tree; > > - /* We always issue full-page reads, but if some block > - * in a page fails to read, blk_update_request() will > - * advance bv_offset and adjust bv_len to compensate. > - * Print a warning for nonzero offsets, and an error > - * if they don't add up to a full page. 
*/ > - if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) { > - if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE) > - btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info, > - "partial page read in btrfs with offset %u and length %u", > - bvec->bv_offset, bvec->bv_len); > - else > - btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info, > - "incomplete page read in btrfs with offset %u and " > - "length %u", > - bvec->bv_offset, bvec->bv_len); > - } > - > - start = page_offset(page); > - end = start + bvec->bv_offset + bvec->bv_len - 1; > - len = bvec->bv_len; > - > + start = page_offset(page) + bvec->bv_offset; > + end = start + bvec->bv_len - 1; > + nr_sectors = bvec->bv_len >> inode->i_sb->s_blocksize_bits; > mirror = io_bio->mirror_num; > - if (likely(uptodate && tree->ops && > - tree->ops->readpage_end_io_hook)) { > + > +next_block: > + if (likely(uptodate)) { Any reason of killing (tree->ops && tree->ops->readpage_end_io_hook)? > ret = tree->ops->readpage_end_io_hook(io_bio, offset, > - page, start, end, > - mirror); > + page, start, > + start + root->sectorsize - 1, > + mirror); > if (ret) > uptodate = 0; > else > clean_io_failure(inode, start, page, 0); > } > > - if (likely(uptodate)) > - goto readpage_ok; > - > - if (tree->ops && tree->ops->readpage_io_failed_hook) { > - ret = tree->ops->readpage_io_failed_hook(page, mirror); > - if (!ret && !err && > - test_bit(BIO_UPTODATE, &bio->bi_flags)) > - uptodate = 1; > - } else { > + if (!uptodate) { > /* > * The generic bio_readpage_error handles errors the > * following way: If possible, new read requests are > @@ -2623,61 +2682,63 @@ static void end_bio_extent_readpage(struct bio *bio, int err) > * can't handle the error it will return -EIO and we > * remain responsible for that page. 
> */ > - ret = bio_readpage_error(bio, offset, page, start, end, > - mirror); > + ret = bio_readpage_error(bio, offset, page, > + start, start + root->sectorsize - 1, > + mirror); > if (ret == 0) { > - uptodate = > - test_bit(BIO_UPTODATE, &bio->bi_flags); > + uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); > if (err) > uptodate = 0; > - offset += len; > - continue; > + offset += root->sectorsize; > + if (--nr_sectors) { > + start += root->sectorsize; > + goto next_block; > + } else { > + continue; > + } > } > } > -readpage_ok: > - if (likely(uptodate)) { > - loff_t i_size = i_size_read(inode); > - pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; > - unsigned off; > - > - /* Zero out the end if this page straddles i_size */ > - off = i_size & (PAGE_CACHE_SIZE-1); > - if (page->index == end_index && off) > - zero_user_segment(page, off, PAGE_CACHE_SIZE); > - SetPageUptodate(page); > + > + if (uptodate) { > + set_page_blks_state(page, 1 << BLK_STATE_UPTODATE, start, > + start + root->sectorsize - 1); > + check_page_uptodate(page); > } else { > ClearPageUptodate(page); > SetPageError(page); > } > - unlock_page(page); > - offset += len; > - > - if (unlikely(!uptodate)) { > - if (extent_len) { > - endio_readpage_release_extent(tree, > - extent_start, > - extent_len, 1); > - extent_start = 0; > - extent_len = 0; > - } > - endio_readpage_release_extent(tree, start, > - end - start + 1, 0); > - } else if (!extent_len) { > - extent_start = start; > - extent_len = end + 1 - start; > - } else if (extent_start + extent_len == start) { > - extent_len += end + 1 - start; > - } else { > - endio_readpage_release_extent(tree, extent_start, > - extent_len, uptodate); > - extent_start = start; > - extent_len = end + 1 - start; > + > + offset += root->sectorsize; > + > + if (--nr_sectors) { > + clear_page_blks_state(page, 1 << BLK_STATE_IO, > + start, start + root->sectorsize - 1); private->io_lock is not acquired here but not in below. IIUC, this can be protected by EXTENT_LOCKED. 
Thanks, -liubo > + clear_extent_bit(tree, start, start + root->sectorsize - 1, > + EXTENT_LOCKED, 1, 0, &cached, GFP_ATOMIC); > + start += root->sectorsize; > + goto next_block; > } > + > + WARN_ON(!PagePrivate(page)); > + > + pg_private = (struct btrfs_page_private *)page->private; > + > + spin_lock_irqsave(&pg_private->io_lock, flags); > + > + clear_page_blks_state(page, 1 << BLK_STATE_IO, > + start, start + root->sectorsize - 1); > + > + unlock = page_read_complete(page); > + > + spin_unlock_irqrestore(&pg_private->io_lock, flags); > + > + clear_extent_bit(tree, start, start + root->sectorsize - 1, > + EXTENT_LOCKED, 1, 0, &cached, GFP_ATOMIC); > + > + if (unlock) > + unlock_page(page); > } > > - if (extent_len) > - endio_readpage_release_extent(tree, extent_start, extent_len, > - uptodate); > if (io_bio->end_io) > io_bio->end_io(io_bio, err); > bio_put(bio); > @@ -2859,13 +2920,36 @@ static void attach_extent_buffer_page(struct extent_buffer *eb, > } > } > > -void set_page_extent_mapped(struct page *page) > +int set_page_extent_mapped(struct page *page) > { > + struct btrfs_page_private *pg_private; > + > if (!PagePrivate(page)) { > + pg_private = kzalloc(sizeof(*pg_private), GFP_NOFS); > + if (!pg_private) > + return -ENOMEM; > + > + spin_lock_init(&pg_private->io_lock); > + > SetPagePrivate(page); > page_cache_get(page); > - set_page_private(page, EXTENT_PAGE_PRIVATE); > + > + set_page_private(page, (unsigned long)pg_private); > + } > + > + return 0; > +} > + > +int clear_page_extent_mapped(struct page *page) > +{ > + if (PagePrivate(page)) { > + kfree((struct btrfs_page_private *)(page->private)); > + ClearPagePrivate(page); > + set_page_private(page, 0); > + page_cache_release(page); > } > + > + return 0; > } > > static struct extent_map * > @@ -2909,6 +2993,7 @@ static int __do_readpage(struct extent_io_tree *tree, > unsigned long *bio_flags, int rw) > { > struct inode *inode = page->mapping->host; > + struct extent_state *cached = NULL; > u64 start = page_offset(page); > u64 page_end = start + PAGE_CACHE_SIZE - 1; > u64 end; > @@ -2964,8 +3049,8 @@ static int __do_readpage(struct extent_io_tree *tree, > memset(userpage + pg_offset, 0, iosize); > flush_dcache_page(page); > kunmap_atomic(userpage); > - set_extent_uptodate(tree, cur, cur + iosize - 1, > - &cached, GFP_NOFS); > + set_page_blks_state(page, 1 << BLK_STATE_UPTODATE, cur, > + cur + iosize - 1); > if (!parent_locked) > unlock_extent_cached(tree, cur, > cur + iosize - 1, > @@ -3017,8 +3102,8 @@ static int __do_readpage(struct extent_io_tree *tree, > flush_dcache_page(page); > kunmap_atomic(userpage); > > - set_extent_uptodate(tree, cur, cur + iosize - 1, > - &cached, GFP_NOFS); > + set_page_blks_state(page, 1 << BLK_STATE_UPTODATE, cur, > + cur + iosize - 1); > unlock_extent_cached(tree, cur, cur + iosize - 1, > &cached, GFP_NOFS); > cur = cur + iosize; > @@ -3026,9 +3111,9 @@ static int __do_readpage(struct extent_io_tree *tree, > continue; > } > /* the get_extent function already copied into the page */ > - if (test_range_bit(tree, cur, cur_end, > - EXTENT_UPTODATE, 1, NULL)) { > - check_page_uptodate(tree, page); > + if (test_page_blks_state(page, BLK_STATE_UPTODATE, cur, > + cur_end, 1)) { > + check_page_uptodate(page); > if (!parent_locked) > unlock_extent(tree, cur, cur + iosize - 1); > cur = cur + iosize; > @@ -3048,6 +3133,9 @@ static int __do_readpage(struct extent_io_tree *tree, > } > > pnr -= page->index; > + > + set_page_blks_state(page, 1 << BLK_STATE_IO, cur, > + cur + iosize - 1); > ret = 
submit_extent_page(rw, tree, page, > sector, disk_io_size, pg_offset, > bdev, bio, pnr, > @@ -3059,8 +3147,11 @@ static int __do_readpage(struct extent_io_tree *tree, > *bio_flags = this_bio_flag; > } else { > SetPageError(page); > + clear_page_blks_state(page, 1 << BLK_STATE_IO, cur, > + cur + iosize - 1); > if (!parent_locked) > - unlock_extent(tree, cur, cur + iosize - 1); > + unlock_extent_cached(tree, cur, cur + iosize - 1, > + &cached, GFP_NOFS); > } > cur = cur + iosize; > pg_offset += iosize; > diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h > index c668f36..541b40a 100644 > --- a/fs/btrfs/extent_io.h > +++ b/fs/btrfs/extent_io.h > @@ -51,11 +51,22 @@ > #define PAGE_SET_PRIVATE2 (1 << 4) > #define PAGE_SET_ERROR (1 << 5) > > +enum blk_state { > + BLK_STATE_UPTODATE, > + BLK_STATE_DIRTY, > + BLK_STATE_IO, > + BLK_NR_STATE, > +}; > + > /* > - * page->private values. Every page that is controlled by the extent > - * map has page->private set to one. > - */ > -#define EXTENT_PAGE_PRIVATE 1 > + The maximum number of blocks per page (i.e. 32) occurs when using 2k > + as the block size and having 64k as the page size. > +*/ > +#define BLK_STATE_NR_LONGS DIV_ROUND_UP(BLK_NR_STATE * 32, BITS_PER_LONG) > +struct btrfs_page_private { > + spinlock_t io_lock; > + unsigned long bstate[BLK_STATE_NR_LONGS]; > +}; > > struct extent_state; > struct btrfs_root; > @@ -259,7 +270,14 @@ int extent_readpages(struct extent_io_tree *tree, > int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, > __u64 start, __u64 len, get_extent_t *get_extent); > int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); > -void set_page_extent_mapped(struct page *page); > +int set_page_extent_mapped(struct page *page); > +int clear_page_extent_mapped(struct page *page); > +int set_page_blks_state(struct page *page, unsigned long blk_states, > + u64 start, u64 end); > +int clear_page_blks_state(struct page *page, unsigned long blk_states, > + u64 start, u64 end); > +int test_page_blks_state(struct page *page, enum blk_state blk_state, > + u64 start, u64 end, int check_all); > > struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, > u64 start); > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c > index 0020b56..8262f83 100644 > --- a/fs/btrfs/inode.c > +++ b/fs/btrfs/inode.c > @@ -6622,7 +6622,6 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, > struct btrfs_key found_key; > struct extent_map *em = NULL; > struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; > - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; > struct btrfs_trans_handle *trans = NULL; > const bool new_inline = !page || create; > > @@ -6800,8 +6799,8 @@ next: > kunmap(page); > btrfs_mark_buffer_dirty(leaf); > } > - set_extent_uptodate(io_tree, em->start, > - extent_map_end(em) - 1, NULL, GFP_NOFS); > + set_page_blks_state(page, 1 << BLK_STATE_UPTODATE, em->start, > + extent_map_end(em) - 1); > goto insert; > } > not_found: > @@ -8392,11 +8391,9 @@ static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) > tree = &BTRFS_I(page->mapping->host)->io_tree; > map = &BTRFS_I(page->mapping->host)->extent_tree; > ret = try_release_extent_mapping(map, tree, page, gfp_flags); > - if (ret == 1) { > - ClearPagePrivate(page); > - set_page_private(page, 0); > - page_cache_release(page); > - } > + if (ret == 1) > + clear_page_extent_mapped(page); > + > return ret; > } > > -- > 2.1.0 > -- To unsubscribe from this list: send the line "unsubscribe 
linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Friday 19 Jun 2015 12:45:37 Liu Bo wrote: > On Mon, Jun 01, 2015 at 08:52:36PM +0530, Chandan Rajendra wrote: > > For the subpagesize-blocksize scenario, a page can contain multiple > > blocks. In such cases, this patch handles reading data from files. > > > > To track the status of individual blocks of a page, this patch makes use > > of a bitmap pointed to by page->private. > > Start going through the patchset, it's not easy though. > > Several comments are following. Thanks for the review comments Liu. > > +static int modify_page_blks_state(struct page *page, > > + unsigned long blk_states, > > + u64 start, u64 end, int set) > > +{ > > + struct inode *inode = page->mapping->host; > > + unsigned long *bitmap; > > + unsigned long state; > > + u64 nr_blks; > > + u64 blk; > > + > > + BUG_ON(!PagePrivate(page)); > > + > > + bitmap = ((struct btrfs_page_private *)page->private)->bstate; > > + > > + blk = (start & (PAGE_CACHE_SIZE - 1)) >> inode->i_blkbits; > > + nr_blks = (end - start + 1) >> inode->i_blkbits; > > + > > + while (nr_blks--) { > > + state = find_next_bit(&blk_states, BLK_NR_STATE, 0); > > Looks like we don't need to do find_next_bit for every block. Yes, I agree. The find_next_bit() invocation in the outer loop can be moved outside the loop. > > > + > > + while (state < BLK_NR_STATE) { > > + if (set) > > + set_bit((blk * BLK_NR_STATE) + state, bitmap); > > + else > > + clear_bit((blk * BLK_NR_STATE) + state, bitmap); > > + > > + state = find_next_bit(&blk_states, BLK_NR_STATE, > > + state + 1); > > + } > > + > > + ++blk; > > + } > > + > > + return 0; > > +} > > + > > > > /* > > > > * after a readpage IO is done, we need to: > > * clear the uptodate bits on error > > > > @@ -2548,14 +2628,16 @@ static void end_bio_extent_readpage(struct bio > > *bio, int err)> > > struct bio_vec *bvec; > > int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); > > struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); > > > > + struct extent_state *cached = NULL; > > + struct btrfs_page_private *pg_private; > > > > struct extent_io_tree *tree; > > > > + unsigned long flags; > > > > u64 offset = 0; > > u64 start; > > u64 end; > > > > - u64 len; > > - u64 extent_start = 0; > > - u64 extent_len = 0; > > + int nr_sectors; > > > > int mirror; > > > > + int unlock; > > > > int ret; > > int i; > > > > @@ -2565,54 +2647,31 @@ static void end_bio_extent_readpage(struct bio > > *bio, int err)> > > bio_for_each_segment_all(bvec, bio, i) { > > > > struct page *page = bvec->bv_page; > > struct inode *inode = page->mapping->host; > > > > + struct btrfs_root *root = BTRFS_I(inode)->root; > > > > pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, " > > > > "mirror=%u\n", (u64)bio->bi_iter.bi_sector, err, > > io_bio->mirror_num); > > > > tree = &BTRFS_I(inode)->io_tree; > > > > - /* We always issue full-page reads, but if some block > > - * in a page fails to read, blk_update_request() will > > - * advance bv_offset and adjust bv_len to compensate. > > - * Print a warning for nonzero offsets, and an error > > - * if they don't add up to a full page. 
*/ > > - if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) { > > - if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE) > > - btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info, > > - "partial page read in btrfs with offset %u > and length %u", > > - bvec->bv_offset, bvec->bv_len); > > - else > > - btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info, > > - "incomplete page read in btrfs with offset > %u and " > > - "length %u", > > - bvec->bv_offset, bvec->bv_len); > > - } > > - > > - start = page_offset(page); > > - end = start + bvec->bv_offset + bvec->bv_len - 1; > > - len = bvec->bv_len; > > - > > + start = page_offset(page) + bvec->bv_offset; > > + end = start + bvec->bv_len - 1; > > + nr_sectors = bvec->bv_len >> inode->i_sb->s_blocksize_bits; > > > > mirror = io_bio->mirror_num; > > > > - if (likely(uptodate && tree->ops && > > - tree->ops->readpage_end_io_hook)) { > > + > > +next_block: > > + if (likely(uptodate)) { > > Any reason of killing (tree->ops && tree->ops->readpage_end_io_hook)?

In the subpagesize-blocksize scenario, for extent buffers we need the ability to read just a single extent buffer rather than reading the complete contents of the page containing the extent buffer. Similarly, in the corresponding endio function we need to verify a single extent buffer rather than the contents of the full page. Hence I ended up removing the btree_readpage_end_io_hook() and btree_io_failed_hook() functions and had verification functions being invoked directly by the endio function.

So since the data "read page code" was the only one left to have extent_io_tree->ops->readpage_end_io_hook set, I removed the code to check for its existence. Now I realize that it is not the right thing to do. I will restore the condition check to its original state.

> > > ret = tree->ops->readpage_end_io_hook(io_bio, offset, > > > > - page, start, > end, > > - mirror); > > + page, start, > > + start + root->sectorsize - 1, > > + mirror); > > > > if (ret) > > > > uptodate = 0; > > > > else > > > > clean_io_failure(inode, start, page, 0); > > > > } > > > > - if (likely(uptodate)) > > - goto readpage_ok; > > - > > - if (tree->ops && tree->ops->readpage_io_failed_hook) { > > - ret = tree->ops->readpage_io_failed_hook(page, > mirror); > > - if (!ret && !err && > > - test_bit(BIO_UPTODATE, &bio->bi_flags)) > > - uptodate = 1; > > - } else { > > + if (!uptodate) { > > > > /* > > > > * The generic bio_readpage_error handles errors the > > * following way: If possible, new read requests are > > > > @@ -2623,61 +2682,63 @@ static void end_bio_extent_readpage(struct bio > > *bio, int err)> > > * can't handle the error it will return -EIO and we > > * remain responsible for that page.
> > */ > > - ret = bio_readpage_error(bio, offset, page, start, end, > - mirror); > > + ret = bio_readpage_error(bio, offset, page, > > + start, start + root->sectorsize - 1, > > + mirror); > > if (ret == 0) { > > - uptodate = > > - test_bit(BIO_UPTODATE, &bio->bi_flags); > > + uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); > > if (err) > > uptodate = 0; > > - offset += len; > > - continue; > > + offset += root->sectorsize; > > + if (--nr_sectors) { > > + start += root->sectorsize; > > + goto next_block; > > + } else { > > + continue; > > + } > > } > > } > > -readpage_ok: > > - if (likely(uptodate)) { > > - loff_t i_size = i_size_read(inode); > > - pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; > > - unsigned off; > > - > > - /* Zero out the end if this page straddles i_size */ > > - off = i_size & (PAGE_CACHE_SIZE-1); > > - if (page->index == end_index && off) > > - zero_user_segment(page, off, PAGE_CACHE_SIZE); > > - SetPageUptodate(page); > > + > > + if (uptodate) { > > + set_page_blks_state(page, 1 << BLK_STATE_UPTODATE, start, > > + start + root->sectorsize - 1); > > + check_page_uptodate(page); > > } else { > > ClearPageUptodate(page); > > SetPageError(page); > > } > > - unlock_page(page); > > - offset += len; > > - > > - if (unlikely(!uptodate)) { > > - if (extent_len) { > > - endio_readpage_release_extent(tree, > > - extent_start, > > - extent_len, 1); > > - extent_start = 0; > > - extent_len = 0; > > - } > > - endio_readpage_release_extent(tree, start, > > - end - start + 1, 0); > > - } else if (!extent_len) { > > - extent_start = start; > > - extent_len = end + 1 - start; > > - } else if (extent_start + extent_len == start) { > > - extent_len += end + 1 - start; > > - } else { > > - endio_readpage_release_extent(tree, extent_start, > > - extent_len, uptodate); > > - extent_start = start; > > - extent_len = end + 1 - start; > > + > > + offset += root->sectorsize; > > + > > + if (--nr_sectors) { > > + clear_page_blks_state(page, 1 << BLK_STATE_IO, > > + start, start + root->sectorsize - 1);

> private->io_lock is not acquired here but not in below.
>
> IIUC, this can be protected by EXTENT_LOCKED.

private->io_lock plays the same role as BH_Uptodate_Lock (see end_buffer_async_read()), i.e. without the io_lock we may end up in the following situation.

NOTE: Assume a 64k page size and a 4k block size. Also assume that the first 12 blocks of the page are contiguous while the next 4 blocks are contiguous. When reading the page we end up submitting two "logical address space" bios, so end_bio_extent_readpage() is invoked twice (once for each bio).

|--------------------------+--------------------------+-------------|
| Task A                   | Task B                   | Task C      |
|--------------------------+--------------------------+-------------|
| end_bio_extent_readpage  |                          |             |
| process block 0          |                          |             |
| - clear BLK_STATE_IO     |                          |             |
| - page_read_complete     |                          |             |
| process block 1          |                          |             |
| ...                      |                          |             |
| ...                      |                          |             |
| ...                      | end_bio_extent_readpage  |             |
| ...                      | process block 0          |             |
| ...                      | - clear BLK_STATE_IO     |             |
| ...                      | - page_read_complete     |             |
| ...                      | process block 1          |             |
| ...                      | ...                      |             |
| process block 11         | process block 3          |             |
| - clear BLK_STATE_IO     | - clear BLK_STATE_IO     |             |
| - page_read_complete     | - page_read_complete     |             |
| - returns true           | - returns true           |             |
| - unlock_page()          |                          |             |
|                          |                          | lock_page() |
|                          | - unlock_page()          |             |
|--------------------------+--------------------------+-------------|

So we end up incorrectly unlocking the page twice, and "Task C" ends up working on an unlocked page. So private->io_lock makes sure that only one of the tasks gets "true" as the return value when page_read_complete() is invoked. As an optimization, the patch takes the io_lock only when the nr_sectors counter reaches the value 0 (i.e. when the last block of the bio_vec is being processed). Please let me know if my analysis was incorrect.

Also, I noticed that page_read_complete() and page_write_complete() can be replaced by just one function, i.e. page_io_complete().
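The merge Chandan mentions at the end — page_read_complete() and page_write_complete() collapsing into one helper — could look roughly like this, assuming the write path tracks in-flight blocks with the same BLK_STATE_IO bit (a sketch, not code from the posted series):

static int page_io_complete(struct page *page)
{
	u64 start = page_offset(page);
	u64 end = start + PAGE_CACHE_SIZE - 1;

	/* True once no block of the page has BLK_STATE_IO set. */
	return !test_page_blks_state(page, BLK_STATE_IO, start, end, 0);
}

Both endio paths would then call it under pg_private->io_lock, and only the caller that sees it return true would unlock the page (or end page writeback).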
On Fri, Jun 19, 2015 at 03:15:01PM +0530, Chandan Rajendra wrote: > On Friday 19 Jun 2015 12:45:37 Liu Bo wrote: > > On Mon, Jun 01, 2015 at 08:52:36PM +0530, Chandan Rajendra wrote: > > > For the subpagesize-blocksize scenario, a page can contain multiple > > > blocks. In such cases, this patch handles reading data from files. > > > > > > To track the status of individual blocks of a page, this patch makes use > > > of a bitmap pointed to by page->private. > > > > Start going through the patchset, it's not easy though. > > > > Several comments are following. > > Thanks for the review comments Liu. > > > > +static int modify_page_blks_state(struct page *page, > > > + unsigned long blk_states, > > > + u64 start, u64 end, int set) > > > +{ > > > + struct inode *inode = page->mapping->host; > > > + unsigned long *bitmap; > > > + unsigned long state; > > > + u64 nr_blks; > > > + u64 blk; > > > + > > > + BUG_ON(!PagePrivate(page)); > > > + > > > + bitmap = ((struct btrfs_page_private *)page->private)->bstate; > > > + > > > + blk = (start & (PAGE_CACHE_SIZE - 1)) >> inode->i_blkbits; > > > + nr_blks = (end - start + 1) >> inode->i_blkbits; > > > + > > > + while (nr_blks--) { > > > + state = find_next_bit(&blk_states, BLK_NR_STATE, 0); > > > > Looks like we don't need to do find_next_bit for every block. > > Yes, I agree. The find_next_bit() invocation in the outer loop can be moved > outside the loop. > > > > > + > > > + while (state < BLK_NR_STATE) { > > > + if (set) > > > + set_bit((blk * BLK_NR_STATE) + state, bitmap); > > > + else > > > + clear_bit((blk * BLK_NR_STATE) + state, > bitmap); > > > + > > > + state = find_next_bit(&blk_states, BLK_NR_STATE, > > > + state + 1); > > > + } > > > + > > > + ++blk; > > > + } > > > + > > > + return 0; > > > +} > > > + > > > > > > /* > > > > > > * after a readpage IO is done, we need to: > > > * clear the uptodate bits on error > > > > > > @@ -2548,14 +2628,16 @@ static void end_bio_extent_readpage(struct bio > > > *bio, int err)> > > > struct bio_vec *bvec; > > > int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); > > > struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); > > > > > > + struct extent_state *cached = NULL; > > > + struct btrfs_page_private *pg_private; > > > > > > struct extent_io_tree *tree; > > > > > > + unsigned long flags; > > > > > > u64 offset = 0; > > > u64 start; > > > u64 end; > > > > > > - u64 len; > > > - u64 extent_start = 0; > > > - u64 extent_len = 0; > > > + int nr_sectors; > > > > > > int mirror; > > > > > > + int unlock; > > > > > > int ret; > > > int i; > > > > > > @@ -2565,54 +2647,31 @@ static void end_bio_extent_readpage(struct bio > > > *bio, int err)> > > > bio_for_each_segment_all(bvec, bio, i) { > > > > > > struct page *page = bvec->bv_page; > > > struct inode *inode = page->mapping->host; > > > > > > + struct btrfs_root *root = BTRFS_I(inode)->root; > > > > > > pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, " > > > > > > "mirror=%u\n", (u64)bio->bi_iter.bi_sector, err, > > > io_bio->mirror_num); > > > > > > tree = &BTRFS_I(inode)->io_tree; > > > > > > - /* We always issue full-page reads, but if some block > > > - * in a page fails to read, blk_update_request() will > > > - * advance bv_offset and adjust bv_len to compensate. > > > - * Print a warning for nonzero offsets, and an error > > > - * if they don't add up to a full page. 
*/ > > > - if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) { > > > - if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE) > > > - btrfs_err(BTRFS_I(page->mapping->host)->root- > >fs_info, > > > - "partial page read in btrfs with offset %u > and length %u", > > > - bvec->bv_offset, bvec->bv_len); > > > - else > > > - btrfs_info(BTRFS_I(page->mapping->host)->root- > >fs_info, > > > - "incomplete page read in btrfs with offset > %u and " > > > - "length %u", > > > - bvec->bv_offset, bvec->bv_len); > > > - } > > > - > > > - start = page_offset(page); > > > - end = start + bvec->bv_offset + bvec->bv_len - 1; > > > - len = bvec->bv_len; > > > - > > > + start = page_offset(page) + bvec->bv_offset; > > > + end = start + bvec->bv_len - 1; > > > + nr_sectors = bvec->bv_len >> inode->i_sb->s_blocksize_bits; > > > > > > mirror = io_bio->mirror_num; > > > > > > - if (likely(uptodate && tree->ops && > > > - tree->ops->readpage_end_io_hook)) { > > > + > > > +next_block: > > > + if (likely(uptodate)) { > > > > Any reason of killing (tree->ops && tree->ops->readpage_end_io_hook)? > > In subpagesize-blocksize scenario, For extent buffers we need the ability to > read just a single extent buffer rather than reading the complete contents of > the page containing the extent buffer. Similarly in the corresponding endio > function we need to verify a single extent buffer rather than the contents of > the full page. Hence I ended up removing btree_readpage_end_io_hook() and > btree_io_failed_hook() functions and had verfication functions being > invoked directly by the endio function. > > So since data "read page code" was the only one left to have > extent_io_tree->ops->readpage_end_io_hook set, I removed the code to check for > its existance. Now i realize that it is not the right thing to do. I will > restore back the condition check to its original state. > > > > > > ret = tree->ops->readpage_end_io_hook(io_bio, offset, > > > > > > - page, start, > end, > > > - mirror); > > > + page, start, > > > + start + root- > >sectorsize - 1, > > > + mirror); > > > > > > if (ret) > > > > > > uptodate = 0; > > > > > > else > > > > > > clean_io_failure(inode, start, page, 0); > > > > > > } > > > > > > - if (likely(uptodate)) > > > - goto readpage_ok; > > > - > > > - if (tree->ops && tree->ops->readpage_io_failed_hook) { > > > - ret = tree->ops->readpage_io_failed_hook(page, > mirror); > > > - if (!ret && !err && > > > - test_bit(BIO_UPTODATE, &bio->bi_flags)) > > > - uptodate = 1; > > > - } else { > > > + if (!uptodate) { > > > > > > /* > > > > > > * The generic bio_readpage_error handles errors the > > > * following way: If possible, new read requests are > > > > > > @@ -2623,61 +2682,63 @@ static void end_bio_extent_readpage(struct bio > > > *bio, int err)> > > > * can't handle the error it will return -EIO and we > > > * remain responsible for that page. 
> > > */ > > > > > > - ret = bio_readpage_error(bio, offset, page, start, > end, > > > - mirror); > > > + ret = bio_readpage_error(bio, offset, page, > > > + start, start + root- > >sectorsize - 1, > > > + mirror); > > > > > > if (ret == 0) { > > > > > > - uptodate = > > > - test_bit(BIO_UPTODATE, &bio- > >bi_flags); > > > + uptodate = test_bit(BIO_UPTODATE, &bio- > >bi_flags); > > > > > > if (err) > > > > > > uptodate = 0; > > > > > > - offset += len; > > > - continue; > > > + offset += root->sectorsize; > > > + if (--nr_sectors) { > > > + start += root->sectorsize; > > > + goto next_block; > > > + } else { > > > + continue; > > > + } > > > > > > } > > > > > > } > > > > > > -readpage_ok: > > > - if (likely(uptodate)) { > > > - loff_t i_size = i_size_read(inode); > > > - pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; > > > - unsigned off; > > > - > > > - /* Zero out the end if this page straddles i_size */ > > > - off = i_size & (PAGE_CACHE_SIZE-1); > > > - if (page->index == end_index && off) > > > - zero_user_segment(page, off, PAGE_CACHE_SIZE); > > > - SetPageUptodate(page); > > > + > > > + if (uptodate) { > > > + set_page_blks_state(page, 1 << BLK_STATE_UPTODATE, > start, > > > + start + root->sectorsize - 1); > > > + check_page_uptodate(page); > > > > > > } else { > > > > > > ClearPageUptodate(page); > > > SetPageError(page); > > > > > > } > > > > > > - unlock_page(page); > > > - offset += len; > > > - > > > - if (unlikely(!uptodate)) { > > > - if (extent_len) { > > > - endio_readpage_release_extent(tree, > > > - extent_start, > > > - extent_len, 1); > > > - extent_start = 0; > > > - extent_len = 0; > > > - } > > > - endio_readpage_release_extent(tree, start, > > > - end - start + 1, 0); > > > - } else if (!extent_len) { > > > - extent_start = start; > > > - extent_len = end + 1 - start; > > > - } else if (extent_start + extent_len == start) { > > > - extent_len += end + 1 - start; > > > - } else { > > > - endio_readpage_release_extent(tree, extent_start, > > > - extent_len, uptodate); > > > - extent_start = start; > > > - extent_len = end + 1 - start; > > > + > > > + offset += root->sectorsize; > > > + > > > + if (--nr_sectors) { > > > + clear_page_blks_state(page, 1 << BLK_STATE_IO, > > > + start, start + root->sectorsize - 1); > > > > private->io_lock is not acquired here but not in below. > > > > IIUC, this can be protected by EXTENT_LOCKED. > > > > private->io_lock plays the same role as BH_Uptodate_Lock (see > end_buffer_async_read()) i.e. without the io_lock we may end up in the > following situation, > > NOTE: Assume 64k page size and 4k block size. Also assume that the first 12 > blocks of the page are contiguous while the next 4 blocks are contiguous. When > reading the page we end up submitting two "logical address space" bios. So > end_bio_extent_readpage function is invoked twice (once for each bio). > > |-------------------------+-------------------------+-------------| > | Task A | Task B | Task C | > |-------------------------+-------------------------+-------------| > | end_bio_extent_readpage | | | > | process block 0 | | | > | - clear BLK_STATE_IO | | | > | - page_read_complete | | | > | process block 1 | | | > | ... | | | > | ... | | | > | ... | end_bio_extent_readpage | | > | ... | process block 0 | | > | ... | - clear BLK_STATE_IO | | > | ... | - page_read_complete | | > | ... | process block 1 | | > | ... | ... 
| | > | process block 11 | process block 3 | | > | - clear BLK_STATE_IO | - clear BLK_STATE_IO | | > | - page_read_complete | - page_read_complete | | > | - returns true | - returns true | | > | - unlock_page() | | | > | | | lock_page() | > | | - unlock_page() | | > |-------------------------+-------------------------+-------------| > > So we end up incorrectly unlocking the page twice and "Task C" ends up working > on an unlocked page. So private->io_lock makes sure that only one of the tasks > gets "true" as the return value when page_read_complete() is invoked. As an > optimization the patch gets the io_lock only when nr_sectors counter reaches > the value 0 (i.e. when the last block of the bio_vec is being processed). > Please let me know if my analysis was incorrect. Thanks for the nice explanation, it looks reasonable to me. Thanks, -liubo > > Also, I noticed that page_read_complete() and page_write_complete() can be > replaced by just one function i.e. page_io_complete(). > > > -- > chandan > > -- > To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
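The two-bio setup that the table assumes comes from __do_readpage(): contiguity is decided block by block, so a page whose blocks are backed by two different extents is read with two bios, and end_bio_extent_readpage() runs once per bio. Using the illustrative numbers from the NOTE above (64k page, 4k blocks):

/*
 * Illustration of the NOTE above (not code from the series):
 *
 *   64k page = 16 blocks of 4k
 *   blocks  0..11 -> extent A -> bio 1 -> endio invocation 1
 *   blocks 12..15 -> extent B -> bio 2 -> endio invocation 2
 *
 * Both endio invocations clear BLK_STATE_IO bits in the same
 * page->private bitmap, hence the need for io_lock around the
 * "clear my bits, then test page_read_complete()" sequence.
 */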
On Fri, Jun 19, 2015 at 03:15:01PM +0530, Chandan Rajendra wrote:
> > private->io_lock is not acquired here but not in below.
> >
> > IIUC, this can be protected by EXTENT_LOCKED.
>
> private->io_lock plays the same role as BH_Uptodate_Lock (see
> end_buffer_async_read()) i.e. without the io_lock we may end up in the
> following situation,
>
> NOTE: Assume 64k page size and 4k block size. Also assume that the first 12
> blocks of the page are contiguous while the next 4 blocks are contiguous. When
> reading the page we end up submitting two "logical address space" bios. So
> end_bio_extent_readpage function is invoked twice (once for each bio).
>
> |--------------------------+--------------------------+-------------|
> | Task A                   | Task B                   | Task C      |
> |--------------------------+--------------------------+-------------|
> | end_bio_extent_readpage  |                          |             |
> | process block 0          |                          |             |
> | - clear BLK_STATE_IO     |                          |             |
> | - page_read_complete     |                          |             |
> | process block 1          |                          |             |
> | ...                      |                          |             |
> | ...                      |                          |             |
> | ...                      | end_bio_extent_readpage  |             |
> | ...                      | process block 0          |             |
> | ...                      | - clear BLK_STATE_IO     |             |
> | ...                      | - page_read_complete     |             |
> | ...                      | process block 1          |             |
> | ...                      | ...                      |             |
> | process block 11         | process block 3          |             |
> | - clear BLK_STATE_IO     | - clear BLK_STATE_IO     |             |
> | - page_read_complete     | - page_read_complete     |             |
> | - returns true           | - returns true           |             |
> | - unlock_page()          |                          |             |
> |                          |                          | lock_page() |
> |                          | - unlock_page()          |             |
> |--------------------------+--------------------------+-------------|
>
> So we end up incorrectly unlocking the page twice and "Task C" ends up working
> on an unlocked page. So private->io_lock makes sure that only one of the tasks
> gets "true" as the return value when page_read_complete() is invoked. As an
> optimization the patch gets the io_lock only when nr_sectors counter reaches
> the value 0 (i.e. when the last block of the bio_vec is being processed).
> Please let me know if my analysis was incorrect.
>
> Also, I noticed that page_read_complete() and page_write_complete() can be
> replaced by just one function i.e. page_io_complete().

The explanations and the table would be good in the changelog and as comments. I think we'll need to consider the smaller blocks more often, so some examples and locking rules would be useful, e.g. documented in this file.
On Tue, Jun 23, 2015 at 04:37:48PM +0800, Liu Bo wrote:
...
> > | - clear BLK_STATE_IO     | - clear BLK_STATE_IO     |             |
> > | - page_read_complete     | - page_read_complete     |             |
> > | - returns true           | - returns true           |             |
> > | - unlock_page()          |                          |             |
> > |                          |                          | lock_page() |
> > |                          | - unlock_page()          |             |
> > |--------------------------+--------------------------+-------------|
> >
> > So we end up incorrectly unlocking the page twice and "Task C" ends up working
> > on an unlocked page. So private->io_lock makes sure that only one of the tasks
> > gets "true" as the return value when page_read_complete() is invoked. As an
> > optimization the patch gets the io_lock only when nr_sectors counter reaches
> > the value 0 (i.e. when the last block of the bio_vec is being processed).
> > Please let me know if my analysis was incorrect.
>
> Thanks for the nice explanation, it looks reasonable to me.

Please don't hesitate to add your reviewed-by if you spent time on that and think it's ok; this really helps to make decisions about merging.
On Wednesday 10 Feb 2016 11:39:25 David Sterba wrote:
> The explanations and the table would be good in the changelog and as
> comments. I think we'll need to consider the smaller blocks more often
> so some examples and locking rules would be useful, e.g. documented in
> this file.

David, I agree. As suggested, I will add the documentation to the commit message and as comments in the code.
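Distilled from the table and the discussion above, the kind of comment block being proposed for extent_io.c might read as follows (wording is illustrative, not from a posted patch):

/*
 * Locking for the per-page block-state bitmap:
 *
 * - page->private points to a btrfs_page_private whose bstate bitmap
 *   tracks BLK_STATE_UPTODATE/DIRTY/IO for each block of the page.
 * - A page can be covered by several bios, so two endio handlers may
 *   race while clearing BLK_STATE_IO bits of the same page.
 * - pg_private->io_lock (analogous to BH_Uptodate_Lock for buffer
 *   heads) serializes "clear this block's IO bits, then test whether
 *   the whole page has finished I/O"; only the task that observes the
 *   last IO bit clear may unlock the page.
 * - The block ranges themselves remain serialized by EXTENT_LOCKED in
 *   the inode's io_tree.
 */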
For the subpagesize-blocksize scenario, a page can contain multiple blocks. In such cases, this patch handles reading data from files.

To track the status of individual blocks of a page, this patch makes use of a bitmap pointed to by page->private.

Signed-off-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
---
 fs/btrfs/extent_io.c | 301 +++++++++++++++++++++++++++++++++------------------
 fs/btrfs/extent_io.h |  28 ++++-
 fs/btrfs/inode.c     |  13 +--
 3 files changed, 224 insertions(+), 118 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 782f3bc..d37badb 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1325,6 +1325,88 @@ int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
 				cached_state, mask);
 }
 
+static int modify_page_blks_state(struct page *page,
+				unsigned long blk_states,
+				u64 start, u64 end, int set)
+{
+	struct inode *inode = page->mapping->host;
+	unsigned long *bitmap;
+	unsigned long state;
+	u64 nr_blks;
+	u64 blk;
+
+	BUG_ON(!PagePrivate(page));
+
+	bitmap = ((struct btrfs_page_private *)page->private)->bstate;
+
+	blk = (start & (PAGE_CACHE_SIZE - 1)) >> inode->i_blkbits;
+	nr_blks = (end - start + 1) >> inode->i_blkbits;
+
+	while (nr_blks--) {
+		state = find_next_bit(&blk_states, BLK_NR_STATE, 0);
+
+		while (state < BLK_NR_STATE) {
+			if (set)
+				set_bit((blk * BLK_NR_STATE) + state, bitmap);
+			else
+				clear_bit((blk * BLK_NR_STATE) + state, bitmap);
+
+			state = find_next_bit(&blk_states, BLK_NR_STATE,
+					state + 1);
+		}
+
+		++blk;
+	}
+
+	return 0;
+}
+
+int set_page_blks_state(struct page *page, unsigned long blk_states,
+			u64 start, u64 end)
+{
+	return modify_page_blks_state(page, blk_states, start, end, 1);
+}
+
+int clear_page_blks_state(struct page *page, unsigned long blk_states,
+			u64 start, u64 end)
+{
+	return modify_page_blks_state(page, blk_states, start, end, 0);
+}
+
+int test_page_blks_state(struct page *page, enum blk_state blk_state,
+			u64 start, u64 end, int check_all)
+{
+	struct inode *inode = page->mapping->host;
+	unsigned long *bitmap;
+	unsigned long blk;
+	u64 nr_blks;
+	int found = 0;
+
+	BUG_ON(!PagePrivate(page));
+
+	bitmap = ((struct btrfs_page_private *)page->private)->bstate;
+
+	blk = (start & (PAGE_CACHE_SIZE - 1)) >> inode->i_blkbits;
+	nr_blks = (end - start + 1) >> inode->i_blkbits;
+
+	while (nr_blks--) {
+		if (test_bit((blk * BLK_NR_STATE) + blk_state, bitmap)) {
+			if (!check_all)
+				return 1;
+			found = 1;
+		} else if (check_all) {
+			return 0;
+		}
+
+		++blk;
+	}
+
+	if (!check_all && !found)
+		return 0;
+
+	return 1;
+}
+
 /*
  * either insert or lock state struct between start and end use mask to tell
  * us if waiting is desired.
@@ -1982,14 +2064,22 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
  * helper function to set a given page up to date if all the
  * extents in the tree for that page are up to date
  */
-static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
+static void check_page_uptodate(struct page *page)
 {
 	u64 start = page_offset(page);
 	u64 end = start + PAGE_CACHE_SIZE - 1;
-	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
+	if (test_page_blks_state(page, BLK_STATE_UPTODATE, start, end, 1))
 		SetPageUptodate(page);
 }
 
+static int page_read_complete(struct page *page)
+{
+	u64 start = page_offset(page);
+	u64 end = start + PAGE_CACHE_SIZE - 1;
+
+	return !test_page_blks_state(page, BLK_STATE_IO, start, end, 0);
+}
+
 int free_io_failure(struct inode *inode, struct io_failure_record *rec)
 {
 	int ret;
@@ -2311,7 +2401,9 @@ int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
 	 * a)	deliver good data to the caller
 	 * b)	correct the bad sectors on disk
 	 */
-	if (failed_bio->bi_vcnt > 1) {
+	if ((failed_bio->bi_vcnt > 1)
+		|| (failed_bio->bi_io_vec->bv_len
+			> BTRFS_I(inode)->root->sectorsize)) {
 		/*
 		 * to fulfill b), we need to know the exact failing sectors, as
 		 * we don't want to rewrite any more than the failed ones. thus,
@@ -2520,18 +2612,6 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
 	bio_put(bio);
 }
 
-static void
-endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
-			      int uptodate)
-{
-	struct extent_state *cached = NULL;
-	u64 end = start + len - 1;
-
-	if (uptodate && tree->track_uptodate)
-		set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC);
-	unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
-}
-
 /*
  * after a readpage IO is done, we need to:
  * clear the uptodate bits on error
@@ -2548,14 +2628,16 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 	struct bio_vec *bvec;
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+	struct extent_state *cached = NULL;
+	struct btrfs_page_private *pg_private;
 	struct extent_io_tree *tree;
+	unsigned long flags;
 	u64 offset = 0;
 	u64 start;
 	u64 end;
-	u64 len;
-	u64 extent_start = 0;
-	u64 extent_len = 0;
+	int nr_sectors;
 	int mirror;
+	int unlock;
 	int ret;
 	int i;
 
@@ -2565,54 +2647,31 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 	bio_for_each_segment_all(bvec, bio, i) {
 		struct page *page = bvec->bv_page;
 		struct inode *inode = page->mapping->host;
+		struct btrfs_root *root = BTRFS_I(inode)->root;
 
 		pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
 			 "mirror=%u\n", (u64)bio->bi_iter.bi_sector, err,
 			 io_bio->mirror_num);
 		tree = &BTRFS_I(inode)->io_tree;
 
-		/* We always issue full-page reads, but if some block
-		 * in a page fails to read, blk_update_request() will
-		 * advance bv_offset and adjust bv_len to compensate.
-		 * Print a warning for nonzero offsets, and an error
-		 * if they don't add up to a full page.
-		 */
-		if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) {
-			if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE)
-				btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info,
-					"partial page read in btrfs with offset %u and length %u",
-					bvec->bv_offset, bvec->bv_len);
-			else
-				btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info,
-					"incomplete page read in btrfs with offset %u and "
-					"length %u",
-					bvec->bv_offset, bvec->bv_len);
-		}
-
-		start = page_offset(page);
-		end = start + bvec->bv_offset + bvec->bv_len - 1;
-		len = bvec->bv_len;
-
+		start = page_offset(page) + bvec->bv_offset;
+		end = start + bvec->bv_len - 1;
+		nr_sectors = bvec->bv_len >> inode->i_sb->s_blocksize_bits;
 		mirror = io_bio->mirror_num;
-		if (likely(uptodate && tree->ops &&
-			   tree->ops->readpage_end_io_hook)) {
+
+next_block:
+		if (likely(uptodate)) {
 			ret = tree->ops->readpage_end_io_hook(io_bio, offset,
-							      page, start, end,
-							      mirror);
+						page, start,
+						start + root->sectorsize - 1,
+						mirror);
 			if (ret)
 				uptodate = 0;
 			else
 				clean_io_failure(inode, start, page, 0);
 		}
 
-		if (likely(uptodate))
-			goto readpage_ok;
-
-		if (tree->ops && tree->ops->readpage_io_failed_hook) {
-			ret = tree->ops->readpage_io_failed_hook(page, mirror);
-			if (!ret && !err &&
-			    test_bit(BIO_UPTODATE, &bio->bi_flags))
-				uptodate = 1;
-		} else {
+		if (!uptodate) {
 			/*
 			 * The generic bio_readpage_error handles errors the
 			 * following way: If possible, new read requests are
@@ -2623,61 +2682,63 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 			 * can't handle the error it will return -EIO and we
 			 * remain responsible for that page.
 			 */
-			ret = bio_readpage_error(bio, offset, page, start, end,
-						 mirror);
+			ret = bio_readpage_error(bio, offset, page,
						start, start + root->sectorsize - 1,
						mirror);
 			if (ret == 0) {
-				uptodate =
-					test_bit(BIO_UPTODATE, &bio->bi_flags);
+				uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 				if (err)
 					uptodate = 0;
-				offset += len;
-				continue;
+				offset += root->sectorsize;
+				if (--nr_sectors) {
+					start += root->sectorsize;
+					goto next_block;
+				} else {
+					continue;
+				}
 			}
 		}
-readpage_ok:
-		if (likely(uptodate)) {
-			loff_t i_size = i_size_read(inode);
-			pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
-			unsigned off;
-
-			/* Zero out the end if this page straddles i_size */
-			off = i_size & (PAGE_CACHE_SIZE-1);
-			if (page->index == end_index && off)
-				zero_user_segment(page, off, PAGE_CACHE_SIZE);
-			SetPageUptodate(page);
+
+		if (uptodate) {
+			set_page_blks_state(page, 1 << BLK_STATE_UPTODATE, start,
+					start + root->sectorsize - 1);
+			check_page_uptodate(page);
 		} else {
 			ClearPageUptodate(page);
 			SetPageError(page);
 		}
-		unlock_page(page);
-		offset += len;
-
-		if (unlikely(!uptodate)) {
-			if (extent_len) {
-				endio_readpage_release_extent(tree,
-							      extent_start,
-							      extent_len, 1);
-				extent_start = 0;
-				extent_len = 0;
-			}
-			endio_readpage_release_extent(tree, start,
-						      end - start + 1, 0);
-		} else if (!extent_len) {
-			extent_start = start;
-			extent_len = end + 1 - start;
-		} else if (extent_start + extent_len == start) {
-			extent_len += end + 1 - start;
-		} else {
-			endio_readpage_release_extent(tree, extent_start,
-						      extent_len, uptodate);
-			extent_start = start;
-			extent_len = end + 1 - start;
+
+		offset += root->sectorsize;
+
+		if (--nr_sectors) {
+			clear_page_blks_state(page, 1 << BLK_STATE_IO,
+					start, start + root->sectorsize - 1);
+			clear_extent_bit(tree, start, start + root->sectorsize - 1,
+					EXTENT_LOCKED, 1, 0, &cached, GFP_ATOMIC);
+			start += root->sectorsize;
+			goto next_block;
 		}
+
+		WARN_ON(!PagePrivate(page));
+
+		pg_private = (struct btrfs_page_private *)page->private;
+
+		spin_lock_irqsave(&pg_private->io_lock, flags);
+
+		clear_page_blks_state(page, 1 << BLK_STATE_IO,
+				start, start + root->sectorsize - 1);
+
+		unlock = page_read_complete(page);
+
+		spin_unlock_irqrestore(&pg_private->io_lock, flags);
+
+		clear_extent_bit(tree, start, start + root->sectorsize - 1,
+				EXTENT_LOCKED, 1, 0, &cached, GFP_ATOMIC);
+
+		if (unlock)
+			unlock_page(page);
 	}
 
-	if (extent_len)
-		endio_readpage_release_extent(tree, extent_start, extent_len,
-					      uptodate);
 	if (io_bio->end_io)
 		io_bio->end_io(io_bio, err);
 	bio_put(bio);
@@ -2859,13 +2920,36 @@ static void attach_extent_buffer_page(struct extent_buffer *eb,
 	}
 }
 
-void set_page_extent_mapped(struct page *page)
+int set_page_extent_mapped(struct page *page)
 {
+	struct btrfs_page_private *pg_private;
+
 	if (!PagePrivate(page)) {
+		pg_private = kzalloc(sizeof(*pg_private), GFP_NOFS);
+		if (!pg_private)
+			return -ENOMEM;
+
+		spin_lock_init(&pg_private->io_lock);
+
 		SetPagePrivate(page);
 		page_cache_get(page);
-		set_page_private(page, EXTENT_PAGE_PRIVATE);
+
+		set_page_private(page, (unsigned long)pg_private);
+	}
+
+	return 0;
+}
+
+int clear_page_extent_mapped(struct page *page)
+{
+	if (PagePrivate(page)) {
+		kfree((struct btrfs_page_private *)(page->private));
+		ClearPagePrivate(page);
+		set_page_private(page, 0);
+		page_cache_release(page);
 	}
+
+	return 0;
 }
 
 static struct extent_map *
@@ -2909,6 +2993,7 @@ static int __do_readpage(struct extent_io_tree *tree,
			 unsigned long *bio_flags, int rw)
 {
 	struct inode *inode = page->mapping->host;
+	struct extent_state *cached = NULL;
 	u64 start = page_offset(page);
 	u64 page_end = start + PAGE_CACHE_SIZE - 1;
 	u64 end;
@@ -2964,8 +3049,8 @@ static int __do_readpage(struct extent_io_tree *tree,
 			memset(userpage + pg_offset, 0, iosize);
 			flush_dcache_page(page);
 			kunmap_atomic(userpage);
-			set_extent_uptodate(tree, cur, cur + iosize - 1,
-					    &cached, GFP_NOFS);
+			set_page_blks_state(page, 1 << BLK_STATE_UPTODATE, cur,
+					cur + iosize - 1);
 			if (!parent_locked)
 				unlock_extent_cached(tree, cur,
						     cur + iosize - 1,
@@ -3017,8 +3102,8 @@ static int __do_readpage(struct extent_io_tree *tree,
 			flush_dcache_page(page);
 			kunmap_atomic(userpage);
 
-			set_extent_uptodate(tree, cur, cur + iosize - 1,
-					    &cached, GFP_NOFS);
+			set_page_blks_state(page, 1 << BLK_STATE_UPTODATE, cur,
+					cur + iosize - 1);
 			unlock_extent_cached(tree, cur, cur + iosize - 1,
					     &cached, GFP_NOFS);
 			cur = cur + iosize;
@@ -3026,9 +3111,9 @@ static int __do_readpage(struct extent_io_tree *tree,
 			continue;
 		}
 		/* the get_extent function already copied into the page */
-		if (test_range_bit(tree, cur, cur_end,
-				   EXTENT_UPTODATE, 1, NULL)) {
-			check_page_uptodate(tree, page);
+		if (test_page_blks_state(page, BLK_STATE_UPTODATE, cur,
+					cur_end, 1)) {
+			check_page_uptodate(page);
 			if (!parent_locked)
 				unlock_extent(tree, cur, cur + iosize - 1);
 			cur = cur + iosize;
@@ -3048,6 +3133,9 @@ static int __do_readpage(struct extent_io_tree *tree,
 		}
 
 		pnr -= page->index;
+
+		set_page_blks_state(page, 1 << BLK_STATE_IO, cur,
+				cur + iosize - 1);
 		ret = submit_extent_page(rw, tree, page,
					 sector, disk_io_size, pg_offset,
					 bdev, bio, pnr,
@@ -3059,8 +3147,11 @@ static int __do_readpage(struct extent_io_tree *tree,
 			*bio_flags = this_bio_flag;
 		} else {
 			SetPageError(page);
+			clear_page_blks_state(page, 1 << BLK_STATE_IO, cur,
+					cur + iosize - 1);
 			if (!parent_locked)
-				unlock_extent(tree, cur, cur + iosize - 1);
+				unlock_extent_cached(tree, cur, cur + iosize - 1,
						&cached, GFP_NOFS);
 		}
 		cur = cur + iosize;
 		pg_offset += iosize;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index c668f36..541b40a 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -51,11 +51,22 @@
 #define PAGE_SET_PRIVATE2	(1 << 4)
 #define PAGE_SET_ERROR		(1 << 5)
 
+enum blk_state {
+	BLK_STATE_UPTODATE,
+	BLK_STATE_DIRTY,
+	BLK_STATE_IO,
+	BLK_NR_STATE,
+};
+
 /*
- * page->private values.  Every page that is controlled by the extent
- * map has page->private set to one.
- */
-#define EXTENT_PAGE_PRIVATE 1
+  The maximum number of blocks per page (i.e. 32) occurs when using 2k
+  as the block size and having 64k as the page size.
+*/
+#define BLK_STATE_NR_LONGS DIV_ROUND_UP(BLK_NR_STATE * 32, BITS_PER_LONG)
+struct btrfs_page_private {
+	spinlock_t io_lock;
+	unsigned long bstate[BLK_STATE_NR_LONGS];
+};
 
 struct extent_state;
 struct btrfs_root;
@@ -259,7 +270,14 @@ int extent_readpages(struct extent_io_tree *tree,
 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		__u64 start, __u64 len, get_extent_t *get_extent);
 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
-void set_page_extent_mapped(struct page *page);
+int set_page_extent_mapped(struct page *page);
+int clear_page_extent_mapped(struct page *page);
+int set_page_blks_state(struct page *page, unsigned long blk_states,
+			u64 start, u64 end);
+int clear_page_blks_state(struct page *page, unsigned long blk_states,
+			u64 start, u64 end);
+int test_page_blks_state(struct page *page, enum blk_state blk_state,
+			u64 start, u64 end, int check_all);
 
 struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
					  u64 start);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0020b56..8262f83 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6622,7 +6622,6 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 	struct btrfs_key found_key;
 	struct extent_map *em = NULL;
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
-	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct btrfs_trans_handle *trans = NULL;
 	const bool new_inline = !page || create;
 
@@ -6800,8 +6799,8 @@ next:
 			kunmap(page);
 			btrfs_mark_buffer_dirty(leaf);
 		}
-		set_extent_uptodate(io_tree, em->start,
-			extent_map_end(em) - 1, NULL, GFP_NOFS);
+		set_page_blks_state(page, 1 << BLK_STATE_UPTODATE, em->start,
+				extent_map_end(em) - 1);
 		goto insert;
 	}
 not_found:
@@ -8392,11 +8391,9 @@ static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
 	map = &BTRFS_I(page->mapping->host)->extent_tree;
 	ret = try_release_extent_mapping(map, tree, page, gfp_flags);
-	if (ret == 1) {
-		ClearPagePrivate(page);
-		set_page_private(page, 0);
-		page_cache_release(page);
-	}
+	if (ret == 1)
+		clear_page_extent_mapped(page);
+
 	return ret;
 }
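To make the bstate layout in the patch concrete: every block of a page owns BLK_NR_STATE consecutive bits, so the bit for a given (block, state) pair sits at blk * BLK_NR_STATE + state. A worked example under the assumption of 4k blocks on a 64k page (the helper below is hypothetical, for illustration only):

/*
 * Illustration only (assumes 4k blocks, 64k pages, BLK_NR_STATE == 3):
 *
 *   16 blocks per page, 3 state bits per block
 *   bit index = blk * BLK_NR_STATE + state
 *   e.g. BLK_STATE_IO (2) of block 5 -> bit 5 * 3 + 2 = 17
 *
 * BLK_STATE_NR_LONGS is sized for the worst case of 32 blocks
 * (2k blocks on a 64k page): DIV_ROUND_UP(3 * 32, 64) = 2 longs
 * on a 64-bit machine.
 */
static inline unsigned long blk_state_bit(u64 blk, enum blk_state state)
{
	return blk * BLK_NR_STATE + state;	/* hypothetical helper */
}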