Message ID | 1433172176-8742-3-git-send-email-chandan@linux.vnet.ibm.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Mon, Jun 01, 2015 at 08:52:37PM +0530, Chandan Rajendra wrote: > For the subpagesize-blocksize scenario, a page can contain multiple > blocks. In such cases, this patch handles writing data to files. > > Also, When setting EXTENT_DELALLOC, we no longer set EXTENT_UPTODATE bit on > the extent_io_tree since uptodate status is being tracked by the bitmap > pointed to by page->private. To be honestly, I'm not sure why we set EXTENT_UPTODATE bit for data as we don't check for that bit at all for now, correct me if I'm wrong. > > Signed-off-by: Chandan Rajendra <chandan@linux.vnet.ibm.com> > --- > fs/btrfs/extent_io.c | 141 +++++++++++++++++++++++---------------------------- > fs/btrfs/file.c | 16 ++++++ > fs/btrfs/inode.c | 58 ++++++++++++++++----- > 3 files changed, 125 insertions(+), 90 deletions(-) > > diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c > index d37badb..3736ab5 100644 > --- a/fs/btrfs/extent_io.c > +++ b/fs/btrfs/extent_io.c > @@ -1283,9 +1283,8 @@ int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, > int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, > struct extent_state **cached_state, gfp_t mask) > { > - return set_extent_bit(tree, start, end, > - EXTENT_DELALLOC | EXTENT_UPTODATE, > - NULL, cached_state, mask); > + return set_extent_bit(tree, start, end, EXTENT_DELALLOC, > + NULL, cached_state, mask); > } > > int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end, > @@ -1498,25 +1497,6 @@ int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) > return 0; > } > > -/* > - * helper function to set both pages and extents in the tree writeback > - */ > -static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) > -{ > - unsigned long index = start >> PAGE_CACHE_SHIFT; > - unsigned long end_index = end >> PAGE_CACHE_SHIFT; > - struct page *page; > - > - while (index <= end_index) { > - page = find_get_page(tree->mapping, index); > - BUG_ON(!page); /* 
Pages should be in the extent_io_tree */ > - set_page_writeback(page); > - page_cache_release(page); > - index++; > - } > - return 0; > -} > - > /* find the first state struct with 'bits' set after 'start', and > * return it. tree->lock must be held. NULL will returned if > * nothing was found after 'start' > @@ -2080,6 +2060,14 @@ static int page_read_complete(struct page *page) > return !test_page_blks_state(page, BLK_STATE_IO, start, end, 0); > } > > +static int page_write_complete(struct page *page) > +{ > + u64 start = page_offset(page); > + u64 end = start + PAGE_CACHE_SIZE - 1; > + > + return !test_page_blks_state(page, BLK_STATE_IO, start, end, 0); > +} > + > int free_io_failure(struct inode *inode, struct io_failure_record *rec) > { > int ret; > @@ -2575,38 +2563,37 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end) > */ > static void end_bio_extent_writepage(struct bio *bio, int err) > { > + struct btrfs_page_private *pg_private; > struct bio_vec *bvec; > + unsigned long flags; > u64 start; > u64 end; > + int clear_writeback; > int i; > > bio_for_each_segment_all(bvec, bio, i) { > struct page *page = bvec->bv_page; > > - /* We always issue full-page reads, but if some block > - * in a page fails to read, blk_update_request() will > - * advance bv_offset and adjust bv_len to compensate. > - * Print a warning for nonzero offsets, and an error > - * if they don't add up to a full page. 
*/ > - if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) { > - if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE) > - btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info, > - "partial page write in btrfs with offset %u and length %u", > - bvec->bv_offset, bvec->bv_len); > - else > - btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info, > - "incomplete page write in btrfs with offset %u and " > - "length %u", > - bvec->bv_offset, bvec->bv_len); > - } > + start = page_offset(page) + bvec->bv_offset; > + end = start + bvec->bv_len - 1; > > - start = page_offset(page); > - end = start + bvec->bv_offset + bvec->bv_len - 1; > + pg_private = (struct btrfs_page_private *)page->private; > + > + spin_lock_irqsave(&pg_private->io_lock, flags); > > - if (end_extent_writepage(page, err, start, end)) > + if (end_extent_writepage(page, err, start, end)) { > + spin_unlock_irqrestore(&pg_private->io_lock, flags); > continue; > + } > > - end_page_writeback(page); > + clear_page_blks_state(page, 1 << BLK_STATE_IO, start, end); > + > + clear_writeback = page_write_complete(page); > + > + spin_unlock_irqrestore(&pg_private->io_lock, flags); > + > + if (clear_writeback) > + end_page_writeback(page); > } > > bio_put(bio); > @@ -3417,10 +3404,9 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, > u64 block_start; > u64 iosize; > sector_t sector; > - struct extent_state *cached_state = NULL; > struct extent_map *em; > struct block_device *bdev; > - size_t pg_offset = 0; > + size_t pg_offset; > size_t blocksize; > int ret = 0; > int nr = 0; > @@ -3467,8 +3453,16 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, > page_end, NULL, 1); > break; > } > - em = epd->get_extent(inode, page, pg_offset, cur, > - end - cur + 1, 1); > + > + pg_offset = cur & (PAGE_CACHE_SIZE - 1); > + > + if (!test_page_blks_state(page, BLK_STATE_DIRTY, cur, > + cur + blocksize - 1, 1)) { > + cur += blocksize; > + continue; > + } If we don't check this, 
the below get_extent() will return a HOLE (block_start == EXTENT_MAP_HOLE) and we can still go on to the next block, then we don't need to maintain this BLK_STATE_DIRTY bit all the while. > + > + em = epd->get_extent(inode, page, pg_offset, cur, blocksize, 1); > if (IS_ERR_OR_NULL(em)) { > SetPageError(page); > ret = PTR_ERR_OR_ZERO(em); > @@ -3479,7 +3473,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, > em_end = extent_map_end(em); > BUG_ON(em_end <= cur); > BUG_ON(end < cur); > - iosize = min(em_end - cur, end - cur + 1); > + iosize = min_t(u64, em_end - cur, blocksize); > iosize = ALIGN(iosize, blocksize); This limits us to do one block per loop, if two blocks are contiguous, it should be fine to write them along. > sector = (em->block_start + extent_offset) >> 9; > bdev = em->bdev; > @@ -3488,32 +3482,20 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, > free_extent_map(em); > em = NULL; > > - /* > - * compressed and inline extents are written through other > - * paths in the FS > - */ > - if (compressed || block_start == EXTENT_MAP_HOLE || > - block_start == EXTENT_MAP_INLINE) { > - /* > - * end_io notification does not happen here for > - * compressed extents > - */ > - if (!compressed && tree->ops && > - tree->ops->writepage_end_io_hook) > - tree->ops->writepage_end_io_hook(page, cur, > - cur + iosize - 1, > - NULL, 1); > - else if (compressed) { > - /* we don't want to end_page_writeback on > - * a compressed extent. 
this happens > - * elsewhere > - */ > - nr++; > - } > + BUG_ON(compressed); > + BUG_ON(block_start == EXTENT_MAP_INLINE); > > - cur += iosize; > - pg_offset += iosize; > - continue; > + if (block_start == EXTENT_MAP_HOLE) { > + if (test_page_blks_state(page, BLK_STATE_UPTODATE, cur, > + cur + iosize - 1, 1)) { > + clear_page_blks_state(page, > + 1 << BLK_STATE_DIRTY, cur, > + cur + iosize - 1); > + cur += iosize; > + continue; > + } else { > + BUG(); > + } > } > > if (tree->ops && tree->ops->writepage_io_hook) { > @@ -3527,7 +3509,13 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, > } else { > unsigned long max_nr = (i_size >> PAGE_CACHE_SHIFT) + 1; > > - set_range_writeback(tree, cur, cur + iosize - 1); > + clear_page_blks_state(page, 1 << BLK_STATE_DIRTY, cur, > + cur + iosize - 1); > + set_page_writeback(page); > + > + set_page_blks_state(page, 1 << BLK_STATE_IO, cur, > + cur + iosize - 1); > + > if (!PageWriteback(page)) { > btrfs_err(BTRFS_I(inode)->root->fs_info, > "page %lu not writeback, cur %llu end %llu", > @@ -3542,17 +3530,14 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, > if (ret) > SetPageError(page); > } > - cur = cur + iosize; > - pg_offset += iosize; > + > + cur += iosize; > nr++; > } > done: > *nr_ret = nr; > > done_unlocked: > - > - /* drop our reference on any cached states */ > - free_extent_state(cached_state); > return ret; > } > > diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c > index 23b6e03..cbe6381 100644 > --- a/fs/btrfs/file.c > +++ b/fs/btrfs/file.c > @@ -495,6 +495,9 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, > u64 num_bytes; > u64 start_pos; > u64 end_of_last_block; > + u64 start; > + u64 end; > + u64 page_end; > u64 end_pos = pos + write_bytes; > loff_t isize = i_size_read(inode); > > @@ -507,11 +510,24 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, > if (err) > return err; > > + start = start_pos; > + > for (i = 0; i < 
num_pages; i++) { > struct page *p = pages[i]; > SetPageUptodate(p); > ClearPageChecked(p); > + > + end = page_end = page_offset(p) + PAGE_CACHE_SIZE - 1; > + > + if (i == num_pages - 1) > + end = min_t(u64, page_end, end_of_last_block); > + > + set_page_blks_state(p, > + 1 << BLK_STATE_DIRTY | 1 << BLK_STATE_UPTODATE, > + start, end); > set_page_dirty(p); > + > + start = page_end + 1; This is not the usual way, page_end is unnecessary, (start += PAGE_CACHE_SIZE) should work. > } > > /* > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c > index 8262f83..ac6a3f3 100644 > --- a/fs/btrfs/inode.c > +++ b/fs/btrfs/inode.c > @@ -1995,6 +1995,11 @@ again: > } > > btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state); > + > + set_page_blks_state(page, > + 1 << BLK_STATE_DIRTY | 1 << BLK_STATE_UPTODATE, > + page_start, page_end); > + > ClearPageChecked(page); > set_page_dirty(page); > out: > @@ -2984,26 +2989,48 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, > struct btrfs_ordered_extent *ordered_extent = NULL; > struct btrfs_workqueue *wq; > btrfs_work_func_t func; > + u64 ordered_start, ordered_end; > + int done; > > trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); > > ClearPagePrivate2(page); > - if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, > - end - start + 1, uptodate)) > - return 0; > +loop: > + ordered_extent = btrfs_lookup_ordered_range(inode, start, > + end - start + 1); > + if (!ordered_extent) > + goto out; > > - if (btrfs_is_free_space_inode(inode)) { > - wq = root->fs_info->endio_freespace_worker; > - func = btrfs_freespace_write_helper; > - } else { > - wq = root->fs_info->endio_write_workers; > - func = btrfs_endio_write_helper; > + ordered_start = max_t(u64, start, ordered_extent->file_offset); > + ordered_end = min_t(u64, end, > + ordered_extent->file_offset + ordered_extent->len - 1); > + > + done = btrfs_dec_test_ordered_pending(inode, &ordered_extent, > + ordered_start, 
> + ordered_end - ordered_start + 1, > + uptodate); > + if (done) { > + if (btrfs_is_free_space_inode(inode)) { > + wq = root->fs_info->endio_freespace_worker; > + func = btrfs_freespace_write_helper; > + } else { > + wq = root->fs_info->endio_write_workers; > + func = btrfs_endio_write_helper; > + } > + > + btrfs_init_work(&ordered_extent->work, func, > + finish_ordered_fn, NULL, NULL); > + btrfs_queue_work(wq, &ordered_extent->work); > } > > - btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL, > - NULL); > - btrfs_queue_work(wq, &ordered_extent->work); > + btrfs_put_ordered_extent(ordered_extent); > + > + start = ordered_end + 1; > + > + if (start < end) > + goto loop; > > +out: I saw this's put a BUG_ON(block_start == EXTENT_MAP_INLINE); in writepage(), but I didn't see the code of disabling inline data in patch 01 and patch 02, but anyway I think we can avoid above searching for ordered_extents in a single page if we enable inline data. Thanks, -liubo > return 0; > } > > @@ -4601,6 +4628,9 @@ again: > goto out_unlock; > } > > + set_page_blks_state(page, 1 << BLK_STATE_DIRTY | 1 << BLK_STATE_UPTODATE, > + page_start, page_end); > + > if (offset != PAGE_CACHE_SIZE) { > if (!len) > len = PAGE_CACHE_SIZE - offset; > @@ -8590,6 +8620,10 @@ again: > ret = VM_FAULT_SIGBUS; > goto out_unlock; > } > + > + set_page_blks_state(page, 1 << BLK_STATE_DIRTY | 1 << BLK_STATE_UPTODATE, > + page_start, end); > + > ret = 0; > > /* page is wholly or partially inside EOF */ > -- > 2.1.0 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Friday 26 Jun 2015 17:50:54 Liu Bo wrote: > On Mon, Jun 01, 2015 at 08:52:37PM +0530, Chandan Rajendra wrote: > > For the subpagesize-blocksize scenario, a page can contain multiple > > blocks. In such cases, this patch handles writing data to files. > > > > Also, When setting EXTENT_DELALLOC, we no longer set EXTENT_UPTODATE bit > > on > > the extent_io_tree since uptodate status is being tracked by the bitmap > > pointed to by page->private. > > To be honestly, I'm not sure why we set EXTENT_UPTODATE bit for data as we > don't check for that bit at all for now, correct me if I'm wrong. Yes, I didn't find any code using EXTENT_UPTODATE flag. That is probably because we could get away by referring to the page's PG_uptodate flag in blocksize == Pagesize scenario. But for the subpagesize-blocksize scenario we need BLK_STATE_UPTODATE to determine if a page's PG_uptodate flag can be set. > > > Signed-off-by: Chandan Rajendra <chandan@linux.vnet.ibm.com> > > --- > > > > fs/btrfs/extent_io.c | 141 > > +++++++++++++++++++++++---------------------------- fs/btrfs/file.c > > | 16 ++++++ > > fs/btrfs/inode.c | 58 ++++++++++++++++----- > > 3 files changed, 125 insertions(+), 90 deletions(-) > > > > diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c > > index d37badb..3736ab5 100644 > > --- a/fs/btrfs/extent_io.c > > +++ b/fs/btrfs/extent_io.c > > @@ -1283,9 +1283,8 @@ int clear_extent_bits(struct extent_io_tree *tree, > > u64 start, u64 end,> > > int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, > > > > struct extent_state **cached_state, gfp_t mask) > > > > { > > > > - return set_extent_bit(tree, start, end, > > - EXTENT_DELALLOC | EXTENT_UPTODATE, > > - NULL, cached_state, mask); > > + return set_extent_bit(tree, start, end, EXTENT_DELALLOC, > > + NULL, cached_state, mask); > > > > } > > > > int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end, > > > > @@ -1498,25 +1497,6 @@ int extent_range_redirty_for_io(struct inode > > 
*inode, u64 start, u64 end)> > > return 0; > > > > } > > > > -/* > > - * helper function to set both pages and extents in the tree writeback > > - */ > > -static int set_range_writeback(struct extent_io_tree *tree, u64 start, > > u64 end) -{ > > - unsigned long index = start >> PAGE_CACHE_SHIFT; > > - unsigned long end_index = end >> PAGE_CACHE_SHIFT; > > - struct page *page; > > - > > - while (index <= end_index) { > > - page = find_get_page(tree->mapping, index); > > - BUG_ON(!page); /* Pages should be in the extent_io_tree */ > > - set_page_writeback(page); > > - page_cache_release(page); > > - index++; > > - } > > - return 0; > > -} > > - > > > > /* find the first state struct with 'bits' set after 'start', and > > > > * return it. tree->lock must be held. NULL will returned if > > * nothing was found after 'start' > > > > @@ -2080,6 +2060,14 @@ static int page_read_complete(struct page *page) > > > > return !test_page_blks_state(page, BLK_STATE_IO, start, end, 0); > > > > } > > > > +static int page_write_complete(struct page *page) > > +{ > > + u64 start = page_offset(page); > > + u64 end = start + PAGE_CACHE_SIZE - 1; > > + > > + return !test_page_blks_state(page, BLK_STATE_IO, start, end, 0); > > +} > > + > > > > int free_io_failure(struct inode *inode, struct io_failure_record *rec) > > { > > > > int ret; > > > > @@ -2575,38 +2563,37 @@ int end_extent_writepage(struct page *page, int > > err, u64 start, u64 end)> > > */ > > > > static void end_bio_extent_writepage(struct bio *bio, int err) > > { > > > > + struct btrfs_page_private *pg_private; > > > > struct bio_vec *bvec; > > > > + unsigned long flags; > > > > u64 start; > > u64 end; > > > > + int clear_writeback; > > > > int i; > > > > bio_for_each_segment_all(bvec, bio, i) { > > > > struct page *page = bvec->bv_page; > > > > - /* We always issue full-page reads, but if some block > > - * in a page fails to read, blk_update_request() will > > - * advance bv_offset and adjust bv_len to compensate. 
> > - * Print a warning for nonzero offsets, and an error > > - * if they don't add up to a full page. */ > > - if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) { > > - if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE) > > - btrfs_err(BTRFS_I(page->mapping->host)->root- >fs_info, > > - "partial page write in btrfs with offset %u and length %u", > > - bvec->bv_offset, bvec->bv_len); > > - else > > - btrfs_info(BTRFS_I(page->mapping->host)->root- >fs_info, > > - "incomplete page write in btrfs with offset %u and " > > - "length %u", > > - bvec->bv_offset, bvec->bv_len); > > - } > > + start = page_offset(page) + bvec->bv_offset; > > + end = start + bvec->bv_len - 1; > > > > - start = page_offset(page); > > - end = start + bvec->bv_offset + bvec->bv_len - 1; > > + pg_private = (struct btrfs_page_private *)page->private; > > + > > + spin_lock_irqsave(&pg_private->io_lock, flags); > > > > - if (end_extent_writepage(page, err, start, end)) > > + if (end_extent_writepage(page, err, start, end)) { > > + spin_unlock_irqrestore(&pg_private->io_lock, flags); > > > > continue; > > > > + } > > > > - end_page_writeback(page); > > + clear_page_blks_state(page, 1 << BLK_STATE_IO, start, end); > > + > > + clear_writeback = page_write_complete(page); > > + > > + spin_unlock_irqrestore(&pg_private->io_lock, flags); > > + > > + if (clear_writeback) > > + end_page_writeback(page); > > > > } > > > > bio_put(bio); > > > > @@ -3417,10 +3404,9 @@ static noinline_for_stack int > > __extent_writepage_io(struct inode *inode,> > > u64 block_start; > > u64 iosize; > > sector_t sector; > > > > - struct extent_state *cached_state = NULL; > > > > struct extent_map *em; > > struct block_device *bdev; > > > > - size_t pg_offset = 0; > > + size_t pg_offset; > > > > size_t blocksize; > > int ret = 0; > > int nr = 0; > > > > @@ -3467,8 +3453,16 @@ static noinline_for_stack int > > __extent_writepage_io(struct inode *inode,> > > page_end, NULL, 1); > > > > break; > > > > } > > > > - em = 
epd->get_extent(inode, page, pg_offset, cur, > > - end - cur + 1, 1); > > + > > + pg_offset = cur & (PAGE_CACHE_SIZE - 1); > > + > > + if (!test_page_blks_state(page, BLK_STATE_DIRTY, cur, > > + cur + blocksize - 1, 1)) { > > + cur += blocksize; > > + continue; > > + } > > If we don't check this, the below get_extent() will return a HOLE > (block_start == EXTENT_MAP_HOLE) and we can still go on to the next block, > then we don't need to maintain this BLK_STATE_DIRTY bit all the while. Sorry, I am not sure if I understood your comment correctly. Are you suggesting that *page blocks* that are not dirty are always holes? Let's assume a 64k page whose contents are within i_size and none of the blocks of the page map to a file hole. Also assume 4k as the block size. Say, the userspace writes to the "block 0" of the page. The corresponding code in __btrfs_buffered_write() reads up the complete page into the inode's page cache and then marks "block 0" of the page as BLK_STATE_DIRTY. Next, the userspace seeks and writes to "block 4" of the page. In this case, since the page has PG_uptodate flag already set we don't read the data from the disk again. We simply go ahead and mark "block 4" as BLK_STATE_DIRTY. As can be seen in the example scenario, the blocks 1, 2 and 3 are not holes and hence btrfs_get_extent() would end up returning values other than EXTENT_MAP_HOLE for em->block_start. 
> > > + > > + em = epd->get_extent(inode, page, pg_offset, cur, blocksize, 1); > > > > if (IS_ERR_OR_NULL(em)) { > > > > SetPageError(page); > > ret = PTR_ERR_OR_ZERO(em); > > > > @@ -3479,7 +3473,7 @@ static noinline_for_stack int > > __extent_writepage_io(struct inode *inode,> > > em_end = extent_map_end(em); > > BUG_ON(em_end <= cur); > > BUG_ON(end < cur); > > > > - iosize = min(em_end - cur, end - cur + 1); > > + iosize = min_t(u64, em_end - cur, blocksize); > > > > iosize = ALIGN(iosize, blocksize); > > This limits us to do one block per loop, if two blocks are contiguous, > it should be fine to write them along. Yes, I agree. I will fix this up in one of the next versions of the patchset. Thanks for pointing it out. > > > sector = (em->block_start + extent_offset) >> 9; > > bdev = em->bdev; > > > > @@ -3488,32 +3482,20 @@ static noinline_for_stack int > > __extent_writepage_io(struct inode *inode,> > > free_extent_map(em); > > em = NULL; > > > > - /* > > - * compressed and inline extents are written through other > > - * paths in the FS > > - */ > > - if (compressed || block_start == EXTENT_MAP_HOLE || > > - block_start == EXTENT_MAP_INLINE) { > > - /* > > - * end_io notification does not happen here for > > - * compressed extents > > - */ > > - if (!compressed && tree->ops && > > - tree->ops->writepage_end_io_hook) > > - tree->ops->writepage_end_io_hook(page, cur, > > - cur + iosize - 1, > > - NULL, 1); > > - else if (compressed) { > > - /* we don't want to end_page_writeback on > > - * a compressed extent. 
this happens > > - * elsewhere > > - */ > > - nr++; > > - } > > + BUG_ON(compressed); > > + BUG_ON(block_start == EXTENT_MAP_INLINE); > > > > - cur += iosize; > > - pg_offset += iosize; > > - continue; > > + if (block_start == EXTENT_MAP_HOLE) { > > + if (test_page_blks_state(page, BLK_STATE_UPTODATE, cur, > > + cur + iosize - 1, 1)) { > > + clear_page_blks_state(page, > > + 1 << BLK_STATE_DIRTY, cur, > > + cur + iosize - 1); > > + cur += iosize; > > + continue; > > + } else { > > + BUG(); > > + } > > > > } > > > > if (tree->ops && tree->ops->writepage_io_hook) { > > > > @@ -3527,7 +3509,13 @@ static noinline_for_stack int > > __extent_writepage_io(struct inode *inode,> > > } else { > > > > unsigned long max_nr = (i_size >> PAGE_CACHE_SHIFT) + 1; > > > > - set_range_writeback(tree, cur, cur + iosize - 1); > > + clear_page_blks_state(page, 1 << BLK_STATE_DIRTY, cur, > > + cur + iosize - 1); > > + set_page_writeback(page); > > + > > + set_page_blks_state(page, 1 << BLK_STATE_IO, cur, > > + cur + iosize - 1); > > + > > > > if (!PageWriteback(page)) { > > > > btrfs_err(BTRFS_I(inode)->root->fs_info, > > > > "page %lu not writeback, cur %llu end %llu", > > > > @@ -3542,17 +3530,14 @@ static noinline_for_stack int > > __extent_writepage_io(struct inode *inode,> > > if (ret) > > > > SetPageError(page); > > > > } > > > > - cur = cur + iosize; > > - pg_offset += iosize; > > + > > + cur += iosize; > > > > nr++; > > > > } > > > > done: > > *nr_ret = nr; > > > > done_unlocked: > > - > > - /* drop our reference on any cached states */ > > - free_extent_state(cached_state); > > > > return ret; > > > > } > > > > diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c > > index 23b6e03..cbe6381 100644 > > --- a/fs/btrfs/file.c > > +++ b/fs/btrfs/file.c > > @@ -495,6 +495,9 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct > > inode *inode,> > > u64 num_bytes; > > u64 start_pos; > > u64 end_of_last_block; > > > > + u64 start; > > + u64 end; > > + u64 page_end; > > > > u64 end_pos = 
pos + write_bytes; > > loff_t isize = i_size_read(inode); > > > > @@ -507,11 +510,24 @@ int btrfs_dirty_pages(struct btrfs_root *root, > > struct inode *inode,> > > if (err) > > > > return err; > > > > + start = start_pos; > > + > > > > for (i = 0; i < num_pages; i++) { > > > > struct page *p = pages[i]; > > SetPageUptodate(p); > > ClearPageChecked(p); > > > > + > > + end = page_end = page_offset(p) + PAGE_CACHE_SIZE - 1; > > + > > + if (i == num_pages - 1) > > + end = min_t(u64, page_end, end_of_last_block); > > + > > + set_page_blks_state(p, > > + 1 << BLK_STATE_DIRTY | 1 << BLK_STATE_UPTODATE, > > + start, end); > > > > set_page_dirty(p); > > > > + > > + start = page_end + 1; > > This is not the usual way, page_end is unnecessary, (start += > PAGE_CACHE_SIZE) should work. "start" may not always be set to a file offset that is a multiple of page size. If the userspace dirties say "block 4" of 64k page, then start will be set to 16384. Hence in such cases, "start += PAGE_CACHE_SIZE" would yield an incorrect value. 
> > } > > > > /* > > > > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c > > index 8262f83..ac6a3f3 100644 > > --- a/fs/btrfs/inode.c > > +++ b/fs/btrfs/inode.c > > > > @@ -1995,6 +1995,11 @@ again: > > } > > > > btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state); > > > > + > > + set_page_blks_state(page, > > + 1 << BLK_STATE_DIRTY | 1 << BLK_STATE_UPTODATE, > > + page_start, page_end); > > + > > > > ClearPageChecked(page); > > set_page_dirty(page); > > > > out: > > @@ -2984,26 +2989,48 @@ static int btrfs_writepage_end_io_hook(struct page > > *page, u64 start, u64 end,> > > struct btrfs_ordered_extent *ordered_extent = NULL; > > struct btrfs_workqueue *wq; > > btrfs_work_func_t func; > > > > + u64 ordered_start, ordered_end; > > + int done; > > > > trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); > > > > ClearPagePrivate2(page); > > > > - if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, > > - end - start + 1, uptodate)) > > - return 0; > > +loop: > > + ordered_extent = btrfs_lookup_ordered_range(inode, start, > > + end - start + 1); > > + if (!ordered_extent) > > + goto out; > > > > - if (btrfs_is_free_space_inode(inode)) { > > - wq = root->fs_info->endio_freespace_worker; > > - func = btrfs_freespace_write_helper; > > - } else { > > - wq = root->fs_info->endio_write_workers; > > - func = btrfs_endio_write_helper; > > + ordered_start = max_t(u64, start, ordered_extent->file_offset); > > + ordered_end = min_t(u64, end, > > + ordered_extent->file_offset + ordered_extent->len - 1); > > + > > + done = btrfs_dec_test_ordered_pending(inode, &ordered_extent, > > + ordered_start, > > + ordered_end - ordered_start + 1, > > + uptodate); > > + if (done) { > > + if (btrfs_is_free_space_inode(inode)) { > > + wq = root->fs_info->endio_freespace_worker; > > + func = btrfs_freespace_write_helper; > > + } else { > > + wq = root->fs_info->endio_write_workers; > > + func = btrfs_endio_write_helper; > > + } > > + > > + 
btrfs_init_work(&ordered_extent->work, func, > > + finish_ordered_fn, NULL, NULL); > > + btrfs_queue_work(wq, &ordered_extent->work); > > > > } > > > > - btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL, > > - NULL); > > - btrfs_queue_work(wq, &ordered_extent->work); > > + btrfs_put_ordered_extent(ordered_extent); > > + > > + start = ordered_end + 1; > > + > > + if (start < end) > > + goto loop; > > > +out: > I saw this's put a BUG_ON(block_start == EXTENT_MAP_INLINE); in writepage(), > but I didn't see the code of disabling inline data in patch 01 and patch > 02, but anyway I think we can avoid above searching for ordered_extents in > a single page if we enable inline data. For inline extents, The call to __extent_writepage => writepage_delalloc => run_delalloc_range => cow_file_range => cow_file_range_inline should write the block's content into the appropriate location in the btree leaf. Hence __extent_writepage_io() should never get invoked for files with inline extents. The call to BUG_ON(block_start == EXTENT_MAP_INLINE) just makes this explicit and also helps in debugging. Liu, However I am not sure if we could avoid looping across ordered extents in the above code. Could you please elaborate on that?
On Mon, Jun 29, 2015 at 02:24:18PM +0530, Chandan Rajendra wrote: > On Friday 26 Jun 2015 17:50:54 Liu Bo wrote: > > On Mon, Jun 01, 2015 at 08:52:37PM +0530, Chandan Rajendra wrote: > > > For the subpagesize-blocksize scenario, a page can contain multiple > > > blocks. In such cases, this patch handles writing data to files. > > > > > > Also, When setting EXTENT_DELALLOC, we no longer set EXTENT_UPTODATE bit > > > on > > > the extent_io_tree since uptodate status is being tracked by the bitmap > > > pointed to by page->private. > > > > To be honestly, I'm not sure why we set EXTENT_UPTODATE bit for data as we > > don't check for that bit at all for now, correct me if I'm wrong. > > Yes, I didn't find any code using EXTENT_UPTODATE flag. That is probably > because we could get away by referring to the page's PG_uptodate flag in > blocksize == Pagesize scenario. But for the subpagesize-blocksize scenario we > need BLK_STATE_UPTODATE to determine if a page's PG_uptodate flag can be set. > > > > > > Signed-off-by: Chandan Rajendra <chandan@linux.vnet.ibm.com> > > > --- > > > > > > fs/btrfs/extent_io.c | 141 > > > +++++++++++++++++++++++---------------------------- fs/btrfs/file.c > > > | 16 ++++++ > > > fs/btrfs/inode.c | 58 ++++++++++++++++----- > > > 3 files changed, 125 insertions(+), 90 deletions(-) > > > > > > diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c > > > index d37badb..3736ab5 100644 > > > --- a/fs/btrfs/extent_io.c > > > +++ b/fs/btrfs/extent_io.c > > > @@ -1283,9 +1283,8 @@ int clear_extent_bits(struct extent_io_tree *tree, > > > u64 start, u64 end,> > > > int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, > > > > > > struct extent_state **cached_state, gfp_t mask) > > > > > > { > > > > > > - return set_extent_bit(tree, start, end, > > > - EXTENT_DELALLOC | EXTENT_UPTODATE, > > > - NULL, cached_state, mask); > > > + return set_extent_bit(tree, start, end, EXTENT_DELALLOC, > > > + NULL, cached_state, mask); > > > > > > } > 
> > > > > int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end, > > > > > > @@ -1498,25 +1497,6 @@ int extent_range_redirty_for_io(struct inode > > > *inode, u64 start, u64 end)> > > > return 0; > > > > > > } > > > > > > -/* > > > - * helper function to set both pages and extents in the tree writeback > > > - */ > > > -static int set_range_writeback(struct extent_io_tree *tree, u64 start, > > > u64 end) -{ > > > - unsigned long index = start >> PAGE_CACHE_SHIFT; > > > - unsigned long end_index = end >> PAGE_CACHE_SHIFT; > > > - struct page *page; > > > - > > > - while (index <= end_index) { > > > - page = find_get_page(tree->mapping, index); > > > - BUG_ON(!page); /* Pages should be in the extent_io_tree */ > > > - set_page_writeback(page); > > > - page_cache_release(page); > > > - index++; > > > - } > > > - return 0; > > > -} > > > - > > > > > > /* find the first state struct with 'bits' set after 'start', and > > > > > > * return it. tree->lock must be held. NULL will returned if > > > * nothing was found after 'start' > > > > > > @@ -2080,6 +2060,14 @@ static int page_read_complete(struct page *page) > > > > > > return !test_page_blks_state(page, BLK_STATE_IO, start, end, 0); > > > > > > } > > > > > > +static int page_write_complete(struct page *page) > > > +{ > > > + u64 start = page_offset(page); > > > + u64 end = start + PAGE_CACHE_SIZE - 1; > > > + > > > + return !test_page_blks_state(page, BLK_STATE_IO, start, end, 0); > > > +} > > > + > > > > > > int free_io_failure(struct inode *inode, struct io_failure_record *rec) > > > { > > > > > > int ret; > > > > > > @@ -2575,38 +2563,37 @@ int end_extent_writepage(struct page *page, int > > > err, u64 start, u64 end)> > > > */ > > > > > > static void end_bio_extent_writepage(struct bio *bio, int err) > > > { > > > > > > + struct btrfs_page_private *pg_private; > > > > > > struct bio_vec *bvec; > > > > > > + unsigned long flags; > > > > > > u64 start; > > > u64 end; > > > > > > + int 
clear_writeback; > > > > > > int i; > > > > > > bio_for_each_segment_all(bvec, bio, i) { > > > > > > struct page *page = bvec->bv_page; > > > > > > - /* We always issue full-page reads, but if some block > > > - * in a page fails to read, blk_update_request() will > > > - * advance bv_offset and adjust bv_len to compensate. > > > - * Print a warning for nonzero offsets, and an error > > > - * if they don't add up to a full page. */ > > > - if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) { > > > - if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE) > > > - btrfs_err(BTRFS_I(page->mapping->host)->root- > >fs_info, > > > - "partial page write in btrfs with offset %u > and length %u", > > > - bvec->bv_offset, bvec->bv_len); > > > - else > > > - btrfs_info(BTRFS_I(page->mapping->host)->root- > >fs_info, > > > - "incomplete page write in btrfs with offset > %u and " > > > - "length %u", > > > - bvec->bv_offset, bvec->bv_len); > > > - } > > > + start = page_offset(page) + bvec->bv_offset; > > > + end = start + bvec->bv_len - 1; > > > > > > - start = page_offset(page); > > > - end = start + bvec->bv_offset + bvec->bv_len - 1; > > > + pg_private = (struct btrfs_page_private *)page->private; > > > + > > > + spin_lock_irqsave(&pg_private->io_lock, flags); > > > > > > - if (end_extent_writepage(page, err, start, end)) > > > + if (end_extent_writepage(page, err, start, end)) { > > > + spin_unlock_irqrestore(&pg_private->io_lock, flags); > > > > > > continue; > > > > > > + } > > > > > > - end_page_writeback(page); > > > + clear_page_blks_state(page, 1 << BLK_STATE_IO, start, end); > > > + > > > + clear_writeback = page_write_complete(page); > > > + > > > + spin_unlock_irqrestore(&pg_private->io_lock, flags); > > > + > > > + if (clear_writeback) > > > + end_page_writeback(page); > > > > > > } > > > > > > bio_put(bio); > > > > > > @@ -3417,10 +3404,9 @@ static noinline_for_stack int > > > __extent_writepage_io(struct inode *inode,> > > > u64 block_start; > > > u64 
iosize; > > > sector_t sector; > > > > > > - struct extent_state *cached_state = NULL; > > > > > > struct extent_map *em; > > > struct block_device *bdev; > > > > > > - size_t pg_offset = 0; > > > + size_t pg_offset; > > > > > > size_t blocksize; > > > int ret = 0; > > > int nr = 0; > > > > > > @@ -3467,8 +3453,16 @@ static noinline_for_stack int > > > __extent_writepage_io(struct inode *inode,> > > > page_end, NULL, 1); > > > > > > break; > > > > > > } > > > > > > - em = epd->get_extent(inode, page, pg_offset, cur, > > > - end - cur + 1, 1); > > > + > > > + pg_offset = cur & (PAGE_CACHE_SIZE - 1); > > > + > > > + if (!test_page_blks_state(page, BLK_STATE_DIRTY, cur, > > > + cur + blocksize - 1, 1)) { > > > + cur += blocksize; > > > + continue; > > > + } > > > > If we don't check this, the below get_extent() will return a HOLE > > (block_start == EXTENT_MAP_HOLE) and we can still go on to the next block, > > then we don't need to maintain this BLK_STATE_DIRTY bit all the while. > > Sorry, I am not sure if I understood your comment correctly. Are you > suggesting that *page blocks* that are not dirty are always holes? > > Let's assume a 64k page whose contents are within i_size and none of the > blocks of the page map to a file hole. Also assume 4k as the block size. Say, > the userspace writes to the "block 0" of the page. The corresponding code in > __btrfs_buffered_write() reads up the complete page into the inode's page > cache and then marks "block 0" of the page as BLK_STATE_DIRTY. Next, the > userspace seeks and writes to "block 4" of the page. In this case, since the > page has PG_uptodate flag already set we don't read the data from the disk > again. We simply go ahead and mark "block 4" as BLK_STATE_DIRTY. As can be > seen in the example scenario, the blocks 1, 2 and 3 are not holes and hence > btrfs_get_extent() would end up returning values other than EXTENT_MAP_HOLE > for em->block_start. I see it now, this is a bit subtle at the first glance. 
> > > > > > + > > > + em = epd->get_extent(inode, page, pg_offset, cur, blocksize, > 1); > > > > > > if (IS_ERR_OR_NULL(em)) { > > > > > > SetPageError(page); > > > ret = PTR_ERR_OR_ZERO(em); > > > > > > @@ -3479,7 +3473,7 @@ static noinline_for_stack int > > > __extent_writepage_io(struct inode *inode,> > > > em_end = extent_map_end(em); > > > BUG_ON(em_end <= cur); > > > BUG_ON(end < cur); > > > > > > - iosize = min(em_end - cur, end - cur + 1); > > > + iosize = min_t(u64, em_end - cur, blocksize); > > > > > > iosize = ALIGN(iosize, blocksize); > > > > This limits us to do one block per loop, if two blocks are contiguous, > > it should be fine to write them along. > > Yes, I agree. I will fix this up in one of the next versions of the > patchset. Thanks for pointing it out. OK. > > > > > > sector = (em->block_start + extent_offset) >> 9; > > > bdev = em->bdev; > > > > > > @@ -3488,32 +3482,20 @@ static noinline_for_stack int > > > __extent_writepage_io(struct inode *inode,> > > > free_extent_map(em); > > > em = NULL; > > > > > > - /* > > > - * compressed and inline extents are written through other > > > - * paths in the FS > > > - */ > > > - if (compressed || block_start == EXTENT_MAP_HOLE || > > > - block_start == EXTENT_MAP_INLINE) { > > > - /* > > > - * end_io notification does not happen here for > > > - * compressed extents > > > - */ > > > - if (!compressed && tree->ops && > > > - tree->ops->writepage_end_io_hook) > > > - tree->ops->writepage_end_io_hook(page, cur, > > > - cur + iosize - 1, > > > - NULL, 1); > > > - else if (compressed) { > > > - /* we don't want to end_page_writeback on > > > - * a compressed extent. 
this happens > > > - * elsewhere > > > - */ > > > - nr++; > > > - } > > > + BUG_ON(compressed); > > > + BUG_ON(block_start == EXTENT_MAP_INLINE); > > > > > > - cur += iosize; > > > - pg_offset += iosize; > > > - continue; > > > + if (block_start == EXTENT_MAP_HOLE) { > > > + if (test_page_blks_state(page, BLK_STATE_UPTODATE, > cur, > > > + cur + iosize - 1, 1)) > { > > > + clear_page_blks_state(page, > > > + 1 << BLK_STATE_DIRTY, cur, > > > + cur + iosize - 1); > > > + cur += iosize; > > > + continue; > > > + } else { > > > + BUG(); > > > + } > > > > > > } > > > > > > if (tree->ops && tree->ops->writepage_io_hook) { > > > > > > @@ -3527,7 +3509,13 @@ static noinline_for_stack int > > > __extent_writepage_io(struct inode *inode,> > > > } else { > > > > > > unsigned long max_nr = (i_size >> PAGE_CACHE_SHIFT) + > 1; > > > > > > - set_range_writeback(tree, cur, cur + iosize - 1); > > > + clear_page_blks_state(page, 1 << BLK_STATE_DIRTY, cur, > > > + cur + iosize - 1); > > > + set_page_writeback(page); > > > + > > > + set_page_blks_state(page, 1 << BLK_STATE_IO, cur, > > > + cur + iosize - 1); > > > + > > > > > > if (!PageWriteback(page)) { > > > > > > btrfs_err(BTRFS_I(inode)->root->fs_info, > > > > > > "page %lu not writeback, cur %llu > end %llu", > > > > > > @@ -3542,17 +3530,14 @@ static noinline_for_stack int > > > __extent_writepage_io(struct inode *inode,> > > > if (ret) > > > > > > SetPageError(page); > > > > > > } > > > > > > - cur = cur + iosize; > > > - pg_offset += iosize; > > > + > > > + cur += iosize; > > > > > > nr++; > > > > > > } > > > > > > done: > > > *nr_ret = nr; > > > > > > done_unlocked: > > > - > > > - /* drop our reference on any cached states */ > > > - free_extent_state(cached_state); > > > > > > return ret; > > > > > > } > > > > > > diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c > > > index 23b6e03..cbe6381 100644 > > > --- a/fs/btrfs/file.c > > > +++ b/fs/btrfs/file.c > > > @@ -495,6 +495,9 @@ int btrfs_dirty_pages(struct btrfs_root *root, 
struct > > > inode *inode,> > > > u64 num_bytes; > > > u64 start_pos; > > > u64 end_of_last_block; > > > > > > + u64 start; > > > + u64 end; > > > + u64 page_end; > > > > > > u64 end_pos = pos + write_bytes; > > > loff_t isize = i_size_read(inode); > > > > > > @@ -507,11 +510,24 @@ int btrfs_dirty_pages(struct btrfs_root *root, > > > struct inode *inode,> > > > if (err) > > > > > > return err; > > > > > > + start = start_pos; > > > + > > > > > > for (i = 0; i < num_pages; i++) { > > > > > > struct page *p = pages[i]; > > > SetPageUptodate(p); > > > ClearPageChecked(p); > > > > > > + > > > + end = page_end = page_offset(p) + PAGE_CACHE_SIZE - 1; > > > + > > > + if (i == num_pages - 1) > > > + end = min_t(u64, page_end, end_of_last_block); > > > + > > > + set_page_blks_state(p, > > > + 1 << BLK_STATE_DIRTY | 1 << > BLK_STATE_UPTODATE, > > > + start, end); > > > > > > set_page_dirty(p); > > > > > > + > > > + start = page_end + 1; > > > > This is not the usual way, page_end is unnecessary, (start += > > PAGE_CACHE_SIZE) should work. > > "start" may not always be set to a file offset that is a multiple of page > size. If the userspace dirties say "block 4" of 64k page, then start will be > set to 16384. Hence in such cases, "start += PAGE_CACHE_SIZE" would yield an > incorrect value. Right. 
> > > > } > > > > > > /* > > > > > > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c > > > index 8262f83..ac6a3f3 100644 > > > --- a/fs/btrfs/inode.c > > > +++ b/fs/btrfs/inode.c > > > > > > @@ -1995,6 +1995,11 @@ again: > > > } > > > > > > btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state); > > > > > > + > > > + set_page_blks_state(page, > > > + 1 << BLK_STATE_DIRTY | 1 << BLK_STATE_UPTODATE, > > > + page_start, page_end); > > > + > > > > > > ClearPageChecked(page); > > > set_page_dirty(page); > > > > > > out: > > > @@ -2984,26 +2989,48 @@ static int btrfs_writepage_end_io_hook(struct page > > > *page, u64 start, u64 end,> > > > struct btrfs_ordered_extent *ordered_extent = NULL; > > > struct btrfs_workqueue *wq; > > > btrfs_work_func_t func; > > > > > > + u64 ordered_start, ordered_end; > > > + int done; > > > > > > trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); > > > > > > ClearPagePrivate2(page); > > > > > > - if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, > > > - end - start + 1, uptodate)) > > > - return 0; > > > +loop: > > > + ordered_extent = btrfs_lookup_ordered_range(inode, start, > > > + end - start + 1); > > > + if (!ordered_extent) > > > + goto out; > > > > > > - if (btrfs_is_free_space_inode(inode)) { > > > - wq = root->fs_info->endio_freespace_worker; > > > - func = btrfs_freespace_write_helper; > > > - } else { > > > - wq = root->fs_info->endio_write_workers; > > > - func = btrfs_endio_write_helper; > > > + ordered_start = max_t(u64, start, ordered_extent->file_offset); > > > + ordered_end = min_t(u64, end, > > > + ordered_extent->file_offset + ordered_extent->len - > 1); > > > + > > > + done = btrfs_dec_test_ordered_pending(inode, &ordered_extent, > > > + ordered_start, > > > + ordered_end - ordered_start + 1, > > > + uptodate); > > > + if (done) { > > > + if (btrfs_is_free_space_inode(inode)) { > > > + wq = root->fs_info->endio_freespace_worker; > > > + func = btrfs_freespace_write_helper; > 
> > + } else { > > > + wq = root->fs_info->endio_write_workers; > > > + func = btrfs_endio_write_helper; > > > + } > > > + > > > + btrfs_init_work(&ordered_extent->work, func, > > > + finish_ordered_fn, NULL, NULL); > > > + btrfs_queue_work(wq, &ordered_extent->work); > > > > > > } > > > > > > - btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL, > > > - NULL); > > > - btrfs_queue_work(wq, &ordered_extent->work); > > > + btrfs_put_ordered_extent(ordered_extent); > > > + > > > + start = ordered_end + 1; > > > + > > > + if (start < end) > > > + goto loop; > > > > > +out: > > I saw this's put a BUG_ON(block_start == EXTENT_MAP_INLINE); in writepage(), > > but I didn't see the code of disabling inline data in patch 01 and patch > > 02, but anyway I think we can avoid above searching for ordered_extents in > > a single page if we enable inline data. > > For inline extents, The call to __extent_writepage => writepage_delalloc => > run_delalloc_range => cow_file_range => cow_file_range_inline should write the > block's content into the appropriate location in the btree leaf. Hence > __extent_writepage_io() should never get invoked for files with inline > extents. The call to BUG_ON(block_start == EXTENT_MAP_INLINE) just makes this > explicit and also helps in debugging. Yes, that's right, thanks for the explanation.. > > Liu, However I am not sure if we could avoid looping across ordered > extents in the above code. Could you please elaborate on that? Given that a page may span two ordered extents(in cow_file_range(), a ENOSPC can split contiguous range into two ordered extents), the above loop can make sure we don't miss any of the two. Thanks, -liubo > > -- > chandan > -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d37badb..3736ab5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1283,9 +1283,8 @@ int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask) { - return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_UPTODATE, - NULL, cached_state, mask); + return set_extent_bit(tree, start, end, EXTENT_DELALLOC, + NULL, cached_state, mask); } int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end, @@ -1498,25 +1497,6 @@ int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) return 0; } -/* - * helper function to set both pages and extents in the tree writeback - */ -static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) -{ - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = end >> PAGE_CACHE_SHIFT; - struct page *page; - - while (index <= end_index) { - page = find_get_page(tree->mapping, index); - BUG_ON(!page); /* Pages should be in the extent_io_tree */ - set_page_writeback(page); - page_cache_release(page); - index++; - } - return 0; -} - /* find the first state struct with 'bits' set after 'start', and * return it. tree->lock must be held. 
NULL will returned if * nothing was found after 'start' @@ -2080,6 +2060,14 @@ static int page_read_complete(struct page *page) return !test_page_blks_state(page, BLK_STATE_IO, start, end, 0); } +static int page_write_complete(struct page *page) +{ + u64 start = page_offset(page); + u64 end = start + PAGE_CACHE_SIZE - 1; + + return !test_page_blks_state(page, BLK_STATE_IO, start, end, 0); +} + int free_io_failure(struct inode *inode, struct io_failure_record *rec) { int ret; @@ -2575,38 +2563,37 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end) */ static void end_bio_extent_writepage(struct bio *bio, int err) { + struct btrfs_page_private *pg_private; struct bio_vec *bvec; + unsigned long flags; u64 start; u64 end; + int clear_writeback; int i; bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; - /* We always issue full-page reads, but if some block - * in a page fails to read, blk_update_request() will - * advance bv_offset and adjust bv_len to compensate. - * Print a warning for nonzero offsets, and an error - * if they don't add up to a full page. 
*/ - if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) { - if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE) - btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info, - "partial page write in btrfs with offset %u and length %u", - bvec->bv_offset, bvec->bv_len); - else - btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info, - "incomplete page write in btrfs with offset %u and " - "length %u", - bvec->bv_offset, bvec->bv_len); - } + start = page_offset(page) + bvec->bv_offset; + end = start + bvec->bv_len - 1; - start = page_offset(page); - end = start + bvec->bv_offset + bvec->bv_len - 1; + pg_private = (struct btrfs_page_private *)page->private; + + spin_lock_irqsave(&pg_private->io_lock, flags); - if (end_extent_writepage(page, err, start, end)) + if (end_extent_writepage(page, err, start, end)) { + spin_unlock_irqrestore(&pg_private->io_lock, flags); continue; + } - end_page_writeback(page); + clear_page_blks_state(page, 1 << BLK_STATE_IO, start, end); + + clear_writeback = page_write_complete(page); + + spin_unlock_irqrestore(&pg_private->io_lock, flags); + + if (clear_writeback) + end_page_writeback(page); } bio_put(bio); @@ -3417,10 +3404,9 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, u64 block_start; u64 iosize; sector_t sector; - struct extent_state *cached_state = NULL; struct extent_map *em; struct block_device *bdev; - size_t pg_offset = 0; + size_t pg_offset; size_t blocksize; int ret = 0; int nr = 0; @@ -3467,8 +3453,16 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, page_end, NULL, 1); break; } - em = epd->get_extent(inode, page, pg_offset, cur, - end - cur + 1, 1); + + pg_offset = cur & (PAGE_CACHE_SIZE - 1); + + if (!test_page_blks_state(page, BLK_STATE_DIRTY, cur, + cur + blocksize - 1, 1)) { + cur += blocksize; + continue; + } + + em = epd->get_extent(inode, page, pg_offset, cur, blocksize, 1); if (IS_ERR_OR_NULL(em)) { SetPageError(page); ret = PTR_ERR_OR_ZERO(em); @@ -3479,7 
+3473,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, em_end = extent_map_end(em); BUG_ON(em_end <= cur); BUG_ON(end < cur); - iosize = min(em_end - cur, end - cur + 1); + iosize = min_t(u64, em_end - cur, blocksize); iosize = ALIGN(iosize, blocksize); sector = (em->block_start + extent_offset) >> 9; bdev = em->bdev; @@ -3488,32 +3482,20 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, free_extent_map(em); em = NULL; - /* - * compressed and inline extents are written through other - * paths in the FS - */ - if (compressed || block_start == EXTENT_MAP_HOLE || - block_start == EXTENT_MAP_INLINE) { - /* - * end_io notification does not happen here for - * compressed extents - */ - if (!compressed && tree->ops && - tree->ops->writepage_end_io_hook) - tree->ops->writepage_end_io_hook(page, cur, - cur + iosize - 1, - NULL, 1); - else if (compressed) { - /* we don't want to end_page_writeback on - * a compressed extent. this happens - * elsewhere - */ - nr++; - } + BUG_ON(compressed); + BUG_ON(block_start == EXTENT_MAP_INLINE); - cur += iosize; - pg_offset += iosize; - continue; + if (block_start == EXTENT_MAP_HOLE) { + if (test_page_blks_state(page, BLK_STATE_UPTODATE, cur, + cur + iosize - 1, 1)) { + clear_page_blks_state(page, + 1 << BLK_STATE_DIRTY, cur, + cur + iosize - 1); + cur += iosize; + continue; + } else { + BUG(); + } } if (tree->ops && tree->ops->writepage_io_hook) { @@ -3527,7 +3509,13 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, } else { unsigned long max_nr = (i_size >> PAGE_CACHE_SHIFT) + 1; - set_range_writeback(tree, cur, cur + iosize - 1); + clear_page_blks_state(page, 1 << BLK_STATE_DIRTY, cur, + cur + iosize - 1); + set_page_writeback(page); + + set_page_blks_state(page, 1 << BLK_STATE_IO, cur, + cur + iosize - 1); + if (!PageWriteback(page)) { btrfs_err(BTRFS_I(inode)->root->fs_info, "page %lu not writeback, cur %llu end %llu", @@ -3542,17 +3530,14 @@ static 
noinline_for_stack int __extent_writepage_io(struct inode *inode, if (ret) SetPageError(page); } - cur = cur + iosize; - pg_offset += iosize; + + cur += iosize; nr++; } done: *nr_ret = nr; done_unlocked: - - /* drop our reference on any cached states */ - free_extent_state(cached_state); return ret; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 23b6e03..cbe6381 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -495,6 +495,9 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, u64 num_bytes; u64 start_pos; u64 end_of_last_block; + u64 start; + u64 end; + u64 page_end; u64 end_pos = pos + write_bytes; loff_t isize = i_size_read(inode); @@ -507,11 +510,24 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, if (err) return err; + start = start_pos; + for (i = 0; i < num_pages; i++) { struct page *p = pages[i]; SetPageUptodate(p); ClearPageChecked(p); + + end = page_end = page_offset(p) + PAGE_CACHE_SIZE - 1; + + if (i == num_pages - 1) + end = min_t(u64, page_end, end_of_last_block); + + set_page_blks_state(p, + 1 << BLK_STATE_DIRTY | 1 << BLK_STATE_UPTODATE, + start, end); set_page_dirty(p); + + start = page_end + 1; } /* diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8262f83..ac6a3f3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1995,6 +1995,11 @@ again: } btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state); + + set_page_blks_state(page, + 1 << BLK_STATE_DIRTY | 1 << BLK_STATE_UPTODATE, + page_start, page_end); + ClearPageChecked(page); set_page_dirty(page); out: @@ -2984,26 +2989,48 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, struct btrfs_ordered_extent *ordered_extent = NULL; struct btrfs_workqueue *wq; btrfs_work_func_t func; + u64 ordered_start, ordered_end; + int done; trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); ClearPagePrivate2(page); - if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, - end - start + 
1, uptodate)) - return 0; +loop: + ordered_extent = btrfs_lookup_ordered_range(inode, start, + end - start + 1); + if (!ordered_extent) + goto out; - if (btrfs_is_free_space_inode(inode)) { - wq = root->fs_info->endio_freespace_worker; - func = btrfs_freespace_write_helper; - } else { - wq = root->fs_info->endio_write_workers; - func = btrfs_endio_write_helper; + ordered_start = max_t(u64, start, ordered_extent->file_offset); + ordered_end = min_t(u64, end, + ordered_extent->file_offset + ordered_extent->len - 1); + + done = btrfs_dec_test_ordered_pending(inode, &ordered_extent, + ordered_start, + ordered_end - ordered_start + 1, + uptodate); + if (done) { + if (btrfs_is_free_space_inode(inode)) { + wq = root->fs_info->endio_freespace_worker; + func = btrfs_freespace_write_helper; + } else { + wq = root->fs_info->endio_write_workers; + func = btrfs_endio_write_helper; + } + + btrfs_init_work(&ordered_extent->work, func, + finish_ordered_fn, NULL, NULL); + btrfs_queue_work(wq, &ordered_extent->work); } - btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL, - NULL); - btrfs_queue_work(wq, &ordered_extent->work); + btrfs_put_ordered_extent(ordered_extent); + + start = ordered_end + 1; + + if (start < end) + goto loop; +out: return 0; } @@ -4601,6 +4628,9 @@ again: goto out_unlock; } + set_page_blks_state(page, 1 << BLK_STATE_DIRTY | 1 << BLK_STATE_UPTODATE, + page_start, page_end); + if (offset != PAGE_CACHE_SIZE) { if (!len) len = PAGE_CACHE_SIZE - offset; @@ -8590,6 +8620,10 @@ again: ret = VM_FAULT_SIGBUS; goto out_unlock; } + + set_page_blks_state(page, 1 << BLK_STATE_DIRTY | 1 << BLK_STATE_UPTODATE, + page_start, end); + ret = 0; /* page is wholly or partially inside EOF */
For the subpagesize-blocksize scenario, a page can contain multiple blocks. In such cases, this patch handles writing data to files. Also, when setting EXTENT_DELALLOC, we no longer set EXTENT_UPTODATE bit on the extent_io_tree since uptodate status is being tracked by the bitmap pointed to by page->private. Signed-off-by: Chandan Rajendra <chandan@linux.vnet.ibm.com> --- fs/btrfs/extent_io.c | 141 +++++++++++++++++++++++---------------------------- fs/btrfs/file.c | 16 ++++++ fs/btrfs/inode.c | 58 ++++++++++++++++----- 3 files changed, 125 insertions(+), 90 deletions(-)