diff mbox series

[04/23] page-writeback: Convert write_cache_pages() to use filemap_get_folios_tag()

Message ID 20220901220138.182896-5-vishal.moola@gmail.com (mailing list archive)
State New, archived
Headers show
Series Convert to filemap_get_folios_tag() | expand

Commit Message

Vishal Moola (Oracle) Sept. 1, 2022, 10:01 p.m. UTC
Converted function to use folios throughout. This is in preparation for
the removal of find_get_pages_range_tag().

Signed-off-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
---
 mm/page-writeback.c | 44 +++++++++++++++++++++++---------------------
 1 file changed, 23 insertions(+), 21 deletions(-)

Comments

Dave Chinner Oct. 18, 2022, 9:01 p.m. UTC | #1
On Thu, Sep 01, 2022 at 03:01:19PM -0700, Vishal Moola (Oracle) wrote:
> Converted function to use folios throughout. This is in preparation for
> the removal of find_get_pages_range_tag().
> 
> Signed-off-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
> ---
>  mm/page-writeback.c | 44 +++++++++++++++++++++++---------------------
>  1 file changed, 23 insertions(+), 21 deletions(-)
> 
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index 032a7bf8d259..087165357a5a 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -2285,15 +2285,15 @@ int write_cache_pages(struct address_space *mapping,
>  	int ret = 0;
>  	int done = 0;
>  	int error;
> -	struct pagevec pvec;
> -	int nr_pages;
> +	struct folio_batch fbatch;
> +	int nr_folios;
>  	pgoff_t index;
>  	pgoff_t end;		/* Inclusive */
>  	pgoff_t done_index;
>  	int range_whole = 0;
>  	xa_mark_t tag;
>  
> -	pagevec_init(&pvec);
> +	folio_batch_init(&fbatch);
>  	if (wbc->range_cyclic) {
>  		index = mapping->writeback_index; /* prev offset */
>  		end = -1;
> @@ -2313,17 +2313,18 @@ int write_cache_pages(struct address_space *mapping,
>  	while (!done && (index <= end)) {
>  		int i;
>  
> -		nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
> -				tag);
> -		if (nr_pages == 0)
> +		nr_folios = filemap_get_folios_tag(mapping, &index, end,
> +				tag, &fbatch);

This can find and return dirty multi-page folios if the filesystem
enables them in the mapping at instantiation time, right?

> +
> +		if (nr_folios == 0)
>  			break;
>  
> -		for (i = 0; i < nr_pages; i++) {
> -			struct page *page = pvec.pages[i];
> +		for (i = 0; i < nr_folios; i++) {
> +			struct folio *folio = fbatch.folios[i];
>  
> -			done_index = page->index;
> +			done_index = folio->index;
>  
> -			lock_page(page);
> +			folio_lock(folio);
>  
>  			/*
>  			 * Page truncated or invalidated. We can freely skip it
> @@ -2333,30 +2334,30 @@ int write_cache_pages(struct address_space *mapping,
>  			 * even if there is now a new, dirty page at the same
>  			 * pagecache address.
>  			 */
> -			if (unlikely(page->mapping != mapping)) {
> +			if (unlikely(folio->mapping != mapping)) {
>  continue_unlock:
> -				unlock_page(page);
> +				folio_unlock(folio);
>  				continue;
>  			}
>  
> -			if (!PageDirty(page)) {
> +			if (!folio_test_dirty(folio)) {
>  				/* someone wrote it for us */
>  				goto continue_unlock;
>  			}
>  
> -			if (PageWriteback(page)) {
> +			if (folio_test_writeback(folio)) {
>  				if (wbc->sync_mode != WB_SYNC_NONE)
> -					wait_on_page_writeback(page);
> +					folio_wait_writeback(folio);
>  				else
>  					goto continue_unlock;
>  			}
>  
> -			BUG_ON(PageWriteback(page));
> -			if (!clear_page_dirty_for_io(page))
> +			BUG_ON(folio_test_writeback(folio));
> +			if (!folio_clear_dirty_for_io(folio))
>  				goto continue_unlock;
>  
>  			trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
> -			error = (*writepage)(page, wbc, data);
> +			error = writepage(&folio->page, wbc, data);

Yet, IIUC, this treats all folios as if they are single page folios.
i.e. it passes the head page of a multi-page folio to a callback
that will treat it as a single PAGE_SIZE page, because that's all
the writepage callbacks are currently expected to be passed...

So won't this break writeback of dirty multipage folios?

-Dave.
Vishal Moola (Oracle) Nov. 3, 2022, 10:28 p.m. UTC | #2
On Wed, Oct 19, 2022 at 08:01:52AM +1100, Dave Chinner wrote:
> On Thu, Sep 01, 2022 at 03:01:19PM -0700, Vishal Moola (Oracle) wrote:
> > Converted function to use folios throughout. This is in preparation for
> > the removal of find_get_pages_range_tag().
> > 
> > Signed-off-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
> > ---
> >  mm/page-writeback.c | 44 +++++++++++++++++++++++---------------------
> >  1 file changed, 23 insertions(+), 21 deletions(-)
> > 
> > diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> > index 032a7bf8d259..087165357a5a 100644
> > --- a/mm/page-writeback.c
> > +++ b/mm/page-writeback.c
> > @@ -2285,15 +2285,15 @@ int write_cache_pages(struct address_space *mapping,
> >  	int ret = 0;
> >  	int done = 0;
> >  	int error;
> > -	struct pagevec pvec;
> > -	int nr_pages;
> > +	struct folio_batch fbatch;
> > +	int nr_folios;
> >  	pgoff_t index;
> >  	pgoff_t end;		/* Inclusive */
> >  	pgoff_t done_index;
> >  	int range_whole = 0;
> >  	xa_mark_t tag;
> >  
> > -	pagevec_init(&pvec);
> > +	folio_batch_init(&fbatch);
> >  	if (wbc->range_cyclic) {
> >  		index = mapping->writeback_index; /* prev offset */
> >  		end = -1;
> > @@ -2313,17 +2313,18 @@ int write_cache_pages(struct address_space *mapping,
> >  	while (!done && (index <= end)) {
> >  		int i;
> >  
> > -		nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
> > -				tag);
> > -		if (nr_pages == 0)
> > +		nr_folios = filemap_get_folios_tag(mapping, &index, end,
> > +				tag, &fbatch);
> 
> This can find and return dirty multi-page folios if the filesystem
> enables them in the mapping at instantiation time, right?

Yup, it will.

> > +
> > +		if (nr_folios == 0)
> >  			break;
> >  
> > -		for (i = 0; i < nr_pages; i++) {
> > -			struct page *page = pvec.pages[i];
> > +		for (i = 0; i < nr_folios; i++) {
> > +			struct folio *folio = fbatch.folios[i];
> >  
> > -			done_index = page->index;
> > +			done_index = folio->index;
> >  
> > -			lock_page(page);
> > +			folio_lock(folio);
> >  
> >  			/*
> >  			 * Page truncated or invalidated. We can freely skip it
> > @@ -2333,30 +2334,30 @@ int write_cache_pages(struct address_space *mapping,
> >  			 * even if there is now a new, dirty page at the same
> >  			 * pagecache address.
> >  			 */
> > -			if (unlikely(page->mapping != mapping)) {
> > +			if (unlikely(folio->mapping != mapping)) {
> >  continue_unlock:
> > -				unlock_page(page);
> > +				folio_unlock(folio);
> >  				continue;
> >  			}
> >  
> > -			if (!PageDirty(page)) {
> > +			if (!folio_test_dirty(folio)) {
> >  				/* someone wrote it for us */
> >  				goto continue_unlock;
> >  			}
> >  
> > -			if (PageWriteback(page)) {
> > +			if (folio_test_writeback(folio)) {
> >  				if (wbc->sync_mode != WB_SYNC_NONE)
> > -					wait_on_page_writeback(page);
> > +					folio_wait_writeback(folio);
> >  				else
> >  					goto continue_unlock;
> >  			}
> >  
> > -			BUG_ON(PageWriteback(page));
> > -			if (!clear_page_dirty_for_io(page))
> > +			BUG_ON(folio_test_writeback(folio));
> > +			if (!folio_clear_dirty_for_io(folio))
> >  				goto continue_unlock;
> >  
> >  			trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
> > -			error = (*writepage)(page, wbc, data);
> > +			error = writepage(&folio->page, wbc, data);
> 
> Yet, IIUC, this treats all folios as if they are single page folios.
> i.e. it passes the head page of a multi-page folio to a callback
> that will treat it as a single PAGE_SIZE page, because that's all
> the writepage callbacks are currently expected to be passed...
> 
> So won't this break writeback of dirty multipage folios?

Yes, it appears it would. But it wouldn't because its already 'broken'.

The current find_get_pages_range_tag() actually has the exact same
issue. The current code to fill up the pages array is:

		pages[ret] = &folio->page;
		if (++ret == nr_pages) {
			*index = folio->index + folio_nr_pages(folio);
			goto out;

which behaves the same way as the issue you pointed out (both break
large folios). When I spoke to Matthew about this earlier, we decided
to go ahead with replacing the function and leave it up to the callers
to fix/handle large folios when the filesystem gets to it.

Its not great to leave it 'broken' but its something that isn't - or at
least shouldn't be - creating any problems at present. And I believe Matthew
has plans to address them at some point before they actually become problems?
Dave Chinner Nov. 4, 2022, 12:32 a.m. UTC | #3
On Thu, Nov 03, 2022 at 03:28:05PM -0700, Vishal Moola wrote:
> On Wed, Oct 19, 2022 at 08:01:52AM +1100, Dave Chinner wrote:
> > On Thu, Sep 01, 2022 at 03:01:19PM -0700, Vishal Moola (Oracle) wrote:
> > > Converted function to use folios throughout. This is in preparation for
> > > the removal of find_get_pages_range_tag().
> > > 
> > > Signed-off-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
> > > ---
> > >  mm/page-writeback.c | 44 +++++++++++++++++++++++---------------------
> > >  1 file changed, 23 insertions(+), 21 deletions(-)
> > > 
> > > diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> > > index 032a7bf8d259..087165357a5a 100644
> > > --- a/mm/page-writeback.c
> > > +++ b/mm/page-writeback.c
> > > @@ -2285,15 +2285,15 @@ int write_cache_pages(struct address_space *mapping,
> > >  	int ret = 0;
> > >  	int done = 0;
> > >  	int error;
> > > -	struct pagevec pvec;
> > > -	int nr_pages;
> > > +	struct folio_batch fbatch;
> > > +	int nr_folios;
> > >  	pgoff_t index;
> > >  	pgoff_t end;		/* Inclusive */
> > >  	pgoff_t done_index;
> > >  	int range_whole = 0;
> > >  	xa_mark_t tag;
> > >  
> > > -	pagevec_init(&pvec);
> > > +	folio_batch_init(&fbatch);
> > >  	if (wbc->range_cyclic) {
> > >  		index = mapping->writeback_index; /* prev offset */
> > >  		end = -1;
> > > @@ -2313,17 +2313,18 @@ int write_cache_pages(struct address_space *mapping,
> > >  	while (!done && (index <= end)) {
> > >  		int i;
> > >  
> > > -		nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
> > > -				tag);
> > > -		if (nr_pages == 0)
> > > +		nr_folios = filemap_get_folios_tag(mapping, &index, end,
> > > +				tag, &fbatch);
> > 
> > This can find and return dirty multi-page folios if the filesystem
> > enables them in the mapping at instantiation time, right?
> 
> Yup, it will.
> 
> > > +
> > > +		if (nr_folios == 0)
> > >  			break;
> > >  
> > > -		for (i = 0; i < nr_pages; i++) {
> > > -			struct page *page = pvec.pages[i];
> > > +		for (i = 0; i < nr_folios; i++) {
> > > +			struct folio *folio = fbatch.folios[i];
> > >  
> > > -			done_index = page->index;
> > > +			done_index = folio->index;
> > >  
> > > -			lock_page(page);
> > > +			folio_lock(folio);
> > >  
> > >  			/*
> > >  			 * Page truncated or invalidated. We can freely skip it
> > > @@ -2333,30 +2334,30 @@ int write_cache_pages(struct address_space *mapping,
> > >  			 * even if there is now a new, dirty page at the same
> > >  			 * pagecache address.
> > >  			 */
> > > -			if (unlikely(page->mapping != mapping)) {
> > > +			if (unlikely(folio->mapping != mapping)) {
> > >  continue_unlock:
> > > -				unlock_page(page);
> > > +				folio_unlock(folio);
> > >  				continue;
> > >  			}
> > >  
> > > -			if (!PageDirty(page)) {
> > > +			if (!folio_test_dirty(folio)) {
> > >  				/* someone wrote it for us */
> > >  				goto continue_unlock;
> > >  			}
> > >  
> > > -			if (PageWriteback(page)) {
> > > +			if (folio_test_writeback(folio)) {
> > >  				if (wbc->sync_mode != WB_SYNC_NONE)
> > > -					wait_on_page_writeback(page);
> > > +					folio_wait_writeback(folio);
> > >  				else
> > >  					goto continue_unlock;
> > >  			}
> > >  
> > > -			BUG_ON(PageWriteback(page));
> > > -			if (!clear_page_dirty_for_io(page))
> > > +			BUG_ON(folio_test_writeback(folio));
> > > +			if (!folio_clear_dirty_for_io(folio))
> > >  				goto continue_unlock;
> > >  
> > >  			trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
> > > -			error = (*writepage)(page, wbc, data);
> > > +			error = writepage(&folio->page, wbc, data);
> > 
> > Yet, IIUC, this treats all folios as if they are single page folios.
> > i.e. it passes the head page of a multi-page folio to a callback
> > that will treat it as a single PAGE_SIZE page, because that's all
> > the writepage callbacks are currently expected to be passed...
> > 
> > So won't this break writeback of dirty multipage folios?
> 
> Yes, it appears it would. But it wouldn't because its already 'broken'.

It is? Then why isn't XFS broken on existing kernels? Oh, we don't
know because it hasn't been tested?

Seriously - if this really is broken, and this patchset further
propagating the brokeness, then somebody needs to explain to me why
this is not corrupting data in XFS.

I get it that page/folios are in transition, but passing a
multi-page folio page to an interface that expects a PAGE_SIZE
struct page is a pretty nasty landmine, regardless of how broken the
higher level iteration code already might be.

At minimum, it needs to be documented, though I'd much prefer that
we explicitly duplicate write_cache_pages() as write_cache_folios()
with a callback that takes a folio and change the code to be fully
multi-page folio safe. Then filesystems that support folios (and
large folios) natively can be passed folios without going through
this crappy "folio->page, page->folio" dance because the writepage
APIs are unaware of multi-page folio constructs.

Then you can convert the individual filesystems using
write_cache_pages() to call write_cache_folios() one at a time,
updating the filesystem callback to do the conversion from folio to
struct page and checking that it an order-0 page that it has been
handed....

> The current find_get_pages_range_tag() actually has the exact same
> issue. The current code to fill up the pages array is:
> 
> 		pages[ret] = &folio->page;
> 		if (++ret == nr_pages) {
> 			*index = folio->index + folio_nr_pages(folio);
> 			goto out;

"It's already broken so we can make it more broken" isn't an
acceptible answer....

> Its not great to leave it 'broken' but its something that isn't - or at
> least shouldn't be - creating any problems at present. And I believe Matthew
> has plans to address them at some point before they actually become problems?

You are modifying the interfaces and doing folio conversions that
expose and propagate the brokenness. The brokeness needs to be
either avoided or fixed and not propagated further. Doing the above
write_cache_folios() conversion avoids the propagating the
brokenness, adds runtime detection of brokenness, and provides the
right interface for writeback iteration of folios.

Fixing the generic writeback iterator properly is not much extra
work, and it sets the model for filesytsems that have copy-pasted
write_cache_pages() and then hacked it around for their own purposes
(e.g. ext4, btrfs) to follow.

-Dave.
Darrick J. Wong Nov. 4, 2022, 2:45 a.m. UTC | #4
On Fri, Nov 04, 2022 at 11:32:35AM +1100, Dave Chinner wrote:
> On Thu, Nov 03, 2022 at 03:28:05PM -0700, Vishal Moola wrote:
> > On Wed, Oct 19, 2022 at 08:01:52AM +1100, Dave Chinner wrote:
> > > On Thu, Sep 01, 2022 at 03:01:19PM -0700, Vishal Moola (Oracle) wrote:
> > > > Converted function to use folios throughout. This is in preparation for
> > > > the removal of find_get_pages_range_tag().
> > > > 
> > > > Signed-off-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
> > > > ---
> > > >  mm/page-writeback.c | 44 +++++++++++++++++++++++---------------------
> > > >  1 file changed, 23 insertions(+), 21 deletions(-)
> > > > 
> > > > diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> > > > index 032a7bf8d259..087165357a5a 100644
> > > > --- a/mm/page-writeback.c
> > > > +++ b/mm/page-writeback.c
> > > > @@ -2285,15 +2285,15 @@ int write_cache_pages(struct address_space *mapping,
> > > >  	int ret = 0;
> > > >  	int done = 0;
> > > >  	int error;
> > > > -	struct pagevec pvec;
> > > > -	int nr_pages;
> > > > +	struct folio_batch fbatch;
> > > > +	int nr_folios;
> > > >  	pgoff_t index;
> > > >  	pgoff_t end;		/* Inclusive */
> > > >  	pgoff_t done_index;
> > > >  	int range_whole = 0;
> > > >  	xa_mark_t tag;
> > > >  
> > > > -	pagevec_init(&pvec);
> > > > +	folio_batch_init(&fbatch);
> > > >  	if (wbc->range_cyclic) {
> > > >  		index = mapping->writeback_index; /* prev offset */
> > > >  		end = -1;
> > > > @@ -2313,17 +2313,18 @@ int write_cache_pages(struct address_space *mapping,
> > > >  	while (!done && (index <= end)) {
> > > >  		int i;
> > > >  
> > > > -		nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
> > > > -				tag);
> > > > -		if (nr_pages == 0)
> > > > +		nr_folios = filemap_get_folios_tag(mapping, &index, end,
> > > > +				tag, &fbatch);
> > > 
> > > This can find and return dirty multi-page folios if the filesystem
> > > enables them in the mapping at instantiation time, right?
> > 
> > Yup, it will.
> > 
> > > > +
> > > > +		if (nr_folios == 0)
> > > >  			break;
> > > >  
> > > > -		for (i = 0; i < nr_pages; i++) {
> > > > -			struct page *page = pvec.pages[i];
> > > > +		for (i = 0; i < nr_folios; i++) {
> > > > +			struct folio *folio = fbatch.folios[i];
> > > >  
> > > > -			done_index = page->index;
> > > > +			done_index = folio->index;
> > > >  
> > > > -			lock_page(page);
> > > > +			folio_lock(folio);
> > > >  
> > > >  			/*
> > > >  			 * Page truncated or invalidated. We can freely skip it
> > > > @@ -2333,30 +2334,30 @@ int write_cache_pages(struct address_space *mapping,
> > > >  			 * even if there is now a new, dirty page at the same
> > > >  			 * pagecache address.
> > > >  			 */
> > > > -			if (unlikely(page->mapping != mapping)) {
> > > > +			if (unlikely(folio->mapping != mapping)) {
> > > >  continue_unlock:
> > > > -				unlock_page(page);
> > > > +				folio_unlock(folio);
> > > >  				continue;
> > > >  			}
> > > >  
> > > > -			if (!PageDirty(page)) {
> > > > +			if (!folio_test_dirty(folio)) {
> > > >  				/* someone wrote it for us */
> > > >  				goto continue_unlock;
> > > >  			}
> > > >  
> > > > -			if (PageWriteback(page)) {
> > > > +			if (folio_test_writeback(folio)) {
> > > >  				if (wbc->sync_mode != WB_SYNC_NONE)
> > > > -					wait_on_page_writeback(page);
> > > > +					folio_wait_writeback(folio);
> > > >  				else
> > > >  					goto continue_unlock;
> > > >  			}
> > > >  
> > > > -			BUG_ON(PageWriteback(page));
> > > > -			if (!clear_page_dirty_for_io(page))
> > > > +			BUG_ON(folio_test_writeback(folio));
> > > > +			if (!folio_clear_dirty_for_io(folio))
> > > >  				goto continue_unlock;
> > > >  
> > > >  			trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
> > > > -			error = (*writepage)(page, wbc, data);
> > > > +			error = writepage(&folio->page, wbc, data);
> > > 
> > > Yet, IIUC, this treats all folios as if they are single page folios.
> > > i.e. it passes the head page of a multi-page folio to a callback
> > > that will treat it as a single PAGE_SIZE page, because that's all
> > > the writepage callbacks are currently expected to be passed...
> > > 
> > > So won't this break writeback of dirty multipage folios?
> > 
> > Yes, it appears it would. But it wouldn't because its already 'broken'.
> 
> It is? Then why isn't XFS broken on existing kernels? Oh, we don't
> know because it hasn't been tested?
> 
> Seriously - if this really is broken, and this patchset further
> propagating the brokeness, then somebody needs to explain to me why
> this is not corrupting data in XFS.

It looks like iomap_do_writepage finds the folio size correctly

	end_pos = folio_pos(folio) + folio_size(folio);

and iomap_writpage_map will map out the correct number of blocks

	unsigned nblocks = i_blocks_per_folio(inode, folio);

	for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {

right?  The interface is dangerous because anyone who enables multipage
folios has to be aware that ->writepage can be handed a multipage folio.

(That said, the lack of mention of xfs in the testing plan doesn't give
me much confidence anyone has checked this...)

> I get it that page/folios are in transition, but passing a
> multi-page folio page to an interface that expects a PAGE_SIZE
> struct page is a pretty nasty landmine, regardless of how broken the
> higher level iteration code already might be.
> 
> At minimum, it needs to be documented, though I'd much prefer that
> we explicitly duplicate write_cache_pages() as write_cache_folios()
> with a callback that takes a folio and change the code to be fully
> multi-page folio safe. Then filesystems that support folios (and
> large folios) natively can be passed folios without going through
> this crappy "folio->page, page->folio" dance because the writepage
> APIs are unaware of multi-page folio constructs.

Agree.  Build the new one, move callers over, and kill the old one.

> Then you can convert the individual filesystems using
> write_cache_pages() to call write_cache_folios() one at a time,
> updating the filesystem callback to do the conversion from folio to
> struct page and checking that it an order-0 page that it has been
> handed....
> 
> > The current find_get_pages_range_tag() actually has the exact same
> > issue. The current code to fill up the pages array is:
> > 
> > 		pages[ret] = &folio->page;
> > 		if (++ret == nr_pages) {
> > 			*index = folio->index + folio_nr_pages(folio);
> > 			goto out;
> 
> "It's already broken so we can make it more broken" isn't an
> acceptible answer....
> 
> > Its not great to leave it 'broken' but its something that isn't - or at
> > least shouldn't be - creating any problems at present. And I believe Matthew
> > has plans to address them at some point before they actually become problems?
> 
> You are modifying the interfaces and doing folio conversions that
> expose and propagate the brokenness. The brokeness needs to be
> either avoided or fixed and not propagated further. Doing the above
> write_cache_folios() conversion avoids the propagating the
> brokenness, adds runtime detection of brokenness, and provides the
> right interface for writeback iteration of folios.
> 
> Fixing the generic writeback iterator properly is not much extra
> work, and it sets the model for filesytsems that have copy-pasted
> write_cache_pages() and then hacked it around for their own purposes
> (e.g. ext4, btrfs) to follow.
> 
> -Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
Dave Chinner Nov. 4, 2022, 3:36 a.m. UTC | #5
On Thu, Nov 03, 2022 at 07:45:01PM -0700, Darrick J. Wong wrote:
> On Fri, Nov 04, 2022 at 11:32:35AM +1100, Dave Chinner wrote:
> > On Thu, Nov 03, 2022 at 03:28:05PM -0700, Vishal Moola wrote:
> > > On Wed, Oct 19, 2022 at 08:01:52AM +1100, Dave Chinner wrote:
> > > > On Thu, Sep 01, 2022 at 03:01:19PM -0700, Vishal Moola (Oracle) wrote:
> > > > > -			BUG_ON(PageWriteback(page));
> > > > > -			if (!clear_page_dirty_for_io(page))
> > > > > +			BUG_ON(folio_test_writeback(folio));
> > > > > +			if (!folio_clear_dirty_for_io(folio))
> > > > >  				goto continue_unlock;
> > > > >  
> > > > >  			trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
> > > > > -			error = (*writepage)(page, wbc, data);
> > > > > +			error = writepage(&folio->page, wbc, data);
> > > > 
> > > > Yet, IIUC, this treats all folios as if they are single page folios.
> > > > i.e. it passes the head page of a multi-page folio to a callback
> > > > that will treat it as a single PAGE_SIZE page, because that's all
> > > > the writepage callbacks are currently expected to be passed...
> > > > 
> > > > So won't this break writeback of dirty multipage folios?
> > > 
> > > Yes, it appears it would. But it wouldn't because its already 'broken'.
> > 
> > It is? Then why isn't XFS broken on existing kernels? Oh, we don't
> > know because it hasn't been tested?
> > 
> > Seriously - if this really is broken, and this patchset further
> > propagating the brokeness, then somebody needs to explain to me why
> > this is not corrupting data in XFS.
> 
> It looks like iomap_do_writepage finds the folio size correctly
> 
> 	end_pos = folio_pos(folio) + folio_size(folio);
> 
> and iomap_writpage_map will map out the correct number of blocks
> 
> 	unsigned nblocks = i_blocks_per_folio(inode, folio);
> 
> 	for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
> 
> right?

Yup, that's how I read it, too.

But my recent experience with folios involved being repeatedly
burnt by edge case corruptions due to multipage folios showing up
when and where I least expected them.

Hence doing a 1:1 conversion of page based code to folio based code
and just assuming large folios will work without any testing seems
akin to playing russian roulette with loose cannons that have been
doused with napalm and then set on fire by an air-dropped barrel
bomb...

Cheers,

Dave.
Matthew Wilcox Nov. 4, 2022, 3:27 p.m. UTC | #6
On Wed, Oct 19, 2022 at 08:01:52AM +1100, Dave Chinner wrote:
> On Thu, Sep 01, 2022 at 03:01:19PM -0700, Vishal Moola (Oracle) wrote:
> > @@ -2313,17 +2313,18 @@ int write_cache_pages(struct address_space *mapping,
> >  	while (!done && (index <= end)) {
> >  		int i;
> >  
> > -		nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
> > -				tag);
> > -		if (nr_pages == 0)
> > +		nr_folios = filemap_get_folios_tag(mapping, &index, end,
> > +				tag, &fbatch);
> 
> This can find and return dirty multi-page folios if the filesystem
> enables them in the mapping at instantiation time, right?

Correct.  Just like before the patch.  pagevec_lookup_range_tag() has
only ever returned head pages, never tail pages.  This is probably
because shmem (which was our only fs that supported compound pages)
never supported writeback, so never looked up pages by tag.

> >  			trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
> > -			error = (*writepage)(page, wbc, data);
> > +			error = writepage(&folio->page, wbc, data);
> 
> Yet, IIUC, this treats all folios as if they are single page folios.
> i.e. it passes the head page of a multi-page folio to a callback
> that will treat it as a single PAGE_SIZE page, because that's all
> the writepage callbacks are currently expected to be passed...
> 
> So won't this break writeback of dirty multipage folios?

No.  A filesystem only sets the flag to create multipage folios once its
writeback callback handles multipage folios correctly (amongst many other
things that have to be fixed and tested).  I haven't written down all
the things that a filesystem maintainer needs to check at least partly
because I don't know how representative XFS/iomap are of all filesystems.
Matthew Wilcox Nov. 4, 2022, 8:06 p.m. UTC | #7
On Fri, Nov 04, 2022 at 11:32:35AM +1100, Dave Chinner wrote:
> At minimum, it needs to be documented, though I'd much prefer that
> we explicitly duplicate write_cache_pages() as write_cache_folios()
> with a callback that takes a folio and change the code to be fully
> multi-page folio safe. Then filesystems that support folios (and
> large folios) natively can be passed folios without going through
> this crappy "folio->page, page->folio" dance because the writepage
> APIs are unaware of multi-page folio constructs.

There are a lot of places which go through the folio->page->folio
dance, and this one wasn't even close to the top of my list.  That
said, it has a fairly small number of callers -- ext4, fuse, iomap,
mpage, nfs, orangefs.  So Vishal, this seems like a good project for you
to take on next -- convert write_cache_pages() to write_cache_folios()
and writepage_t to write_folio_t.
diff mbox series

Patch

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 032a7bf8d259..087165357a5a 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2285,15 +2285,15 @@  int write_cache_pages(struct address_space *mapping,
 	int ret = 0;
 	int done = 0;
 	int error;
-	struct pagevec pvec;
-	int nr_pages;
+	struct folio_batch fbatch;
+	int nr_folios;
 	pgoff_t index;
 	pgoff_t end;		/* Inclusive */
 	pgoff_t done_index;
 	int range_whole = 0;
 	xa_mark_t tag;
 
-	pagevec_init(&pvec);
+	folio_batch_init(&fbatch);
 	if (wbc->range_cyclic) {
 		index = mapping->writeback_index; /* prev offset */
 		end = -1;
@@ -2313,17 +2313,18 @@  int write_cache_pages(struct address_space *mapping,
 	while (!done && (index <= end)) {
 		int i;
 
-		nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
-				tag);
-		if (nr_pages == 0)
+		nr_folios = filemap_get_folios_tag(mapping, &index, end,
+				tag, &fbatch);
+
+		if (nr_folios == 0)
 			break;
 
-		for (i = 0; i < nr_pages; i++) {
-			struct page *page = pvec.pages[i];
+		for (i = 0; i < nr_folios; i++) {
+			struct folio *folio = fbatch.folios[i];
 
-			done_index = page->index;
+			done_index = folio->index;
 
-			lock_page(page);
+			folio_lock(folio);
 
 			/*
 			 * Page truncated or invalidated. We can freely skip it
@@ -2333,30 +2334,30 @@  int write_cache_pages(struct address_space *mapping,
 			 * even if there is now a new, dirty page at the same
 			 * pagecache address.
 			 */
-			if (unlikely(page->mapping != mapping)) {
+			if (unlikely(folio->mapping != mapping)) {
 continue_unlock:
-				unlock_page(page);
+				folio_unlock(folio);
 				continue;
 			}
 
-			if (!PageDirty(page)) {
+			if (!folio_test_dirty(folio)) {
 				/* someone wrote it for us */
 				goto continue_unlock;
 			}
 
-			if (PageWriteback(page)) {
+			if (folio_test_writeback(folio)) {
 				if (wbc->sync_mode != WB_SYNC_NONE)
-					wait_on_page_writeback(page);
+					folio_wait_writeback(folio);
 				else
 					goto continue_unlock;
 			}
 
-			BUG_ON(PageWriteback(page));
-			if (!clear_page_dirty_for_io(page))
+			BUG_ON(folio_test_writeback(folio));
+			if (!folio_clear_dirty_for_io(folio))
 				goto continue_unlock;
 
 			trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
-			error = (*writepage)(page, wbc, data);
+			error = writepage(&folio->page, wbc, data);
 			if (unlikely(error)) {
 				/*
 				 * Handle errors according to the type of
@@ -2371,11 +2372,12 @@  int write_cache_pages(struct address_space *mapping,
 				 * the first error.
 				 */
 				if (error == AOP_WRITEPAGE_ACTIVATE) {
-					unlock_page(page);
+					folio_unlock(folio);
 					error = 0;
 				} else if (wbc->sync_mode != WB_SYNC_ALL) {
 					ret = error;
-					done_index = page->index + 1;
+					done_index = folio->index +
+						folio_nr_pages(folio);
 					done = 1;
 					break;
 				}
@@ -2395,7 +2397,7 @@  int write_cache_pages(struct address_space *mapping,
 				break;
 			}
 		}
-		pagevec_release(&pvec);
+		folio_batch_release(&fbatch);
 		cond_resched();
 	}