
[14/17] gup: Convert for_each_compound_head() to gup_for_each_folio()

Message ID: 20220102215729.2943705-15-willy@infradead.org
State: New
Series: Convert GUP to folios

Commit Message

Matthew Wilcox Jan. 2, 2022, 9:57 p.m. UTC
This macro can be considerably simplified by returning the folio from
gup_folio_next() instead of void from compound_next().  Convert both
callers to work on folios instead of pages.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 mm/gup.c | 47 ++++++++++++++++++++++++-----------------------
 1 file changed, 24 insertions(+), 23 deletions(-)

Comments

Christoph Hellwig Jan. 4, 2022, 8:32 a.m. UTC | #1
On Sun, Jan 02, 2022 at 09:57:26PM +0000, Matthew Wilcox (Oracle) wrote:
> This macro can be considerably simplified by returning the folio from
> gup_folio_next() instead of void from compound_next().  Convert both
> callers to work on folios instead of pages.

This looks sensible, but looking at the macro I wonder if an open
coded while loop (using your new calling conventions) wouldn't make
for more readable code than the macro:

	unsigned long i = 0;

	...

	while ((folio = gup_folio_next(i, npages, list, &ntails)) != NULL) {
		...
		i += ntails;
	}
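
Spelled out against unpin_user_pages(), the open-coded loop might look
something like this - just a sketch reusing the series' gup_folio_next()
and gup_put_folio() helpers, not code from the patch:

	void unpin_user_pages(struct page **pages, unsigned long npages)
	{
		unsigned long i = 0;
		unsigned int ntails;
		struct folio *folio;

		/* gup_folio_next() returns NULL once i reaches npages. */
		while ((folio = gup_folio_next(i, npages, pages, &ntails)) != NULL) {
			gup_put_folio(folio, ntails, FOLL_PIN);
			i += ntails;
		}
	}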
John Hubbard Jan. 5, 2022, 8:17 a.m. UTC | #2
On 1/2/22 13:57, Matthew Wilcox (Oracle) wrote:
> This macro can be considerably simplified by returning the folio from
> gup_folio_next() instead of void from compound_next().  Convert both
> callers to work on folios instead of pages.
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> ---
>   mm/gup.c | 47 ++++++++++++++++++++++++-----------------------
>   1 file changed, 24 insertions(+), 23 deletions(-)
> 
> diff --git a/mm/gup.c b/mm/gup.c
> index 7bd1e4a2648a..eaffa6807609 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -239,31 +239,29 @@ static inline void compound_range_next(unsigned long i, unsigned long npages,
>   	     __i < __npages; __i += __ntails, \
>   	     compound_range_next(__i, __npages, __list, &(__head), &(__ntails)))
>   
> -static inline void compound_next(unsigned long i, unsigned long npages,
> -				 struct page **list, struct page **head,
> -				 unsigned int *ntails)
> +static inline struct folio *gup_folio_next(unsigned long i,
> +		unsigned long npages, struct page **list, unsigned int *ntails)
>   {
> -	struct page *page;
> +	struct folio *folio;
>   	unsigned int nr;
>   
>   	if (i >= npages)
> -		return;
> +		return NULL;
>   
> -	page = compound_head(list[i]);
> +	folio = page_folio(list[i]);
>   	for (nr = i + 1; nr < npages; nr++) {
> -		if (compound_head(list[nr]) != page)
> +		if (page_folio(list[nr]) != folio)
>   			break;
>   	}
>   
> -	*head = page;
>   	*ntails = nr - i;
> +	return folio;
>   }
>   
> -#define for_each_compound_head(__i, __list, __npages, __head, __ntails) \
> -	for (__i = 0, \
> -	     compound_next(__i, __npages, __list, &(__head), &(__ntails)); \
> -	     __i < __npages; __i += __ntails, \
> -	     compound_next(__i, __npages, __list, &(__head), &(__ntails)))
> +#define gup_for_each_folio(__i, __list, __npages, __folio, __ntails) \
> +	for (__i = 0; \
> +	     (__folio = gup_folio_next(__i, __npages, __list, &(__ntails))) != NULL; \
> +	     __i += __ntails)


This is nice. I find these pre-existing macros to be really quite
horrible, but I was unable to suggest anything better at the time, so
it's good to see the simplification. :)

>   
>   /**
>    * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
> @@ -291,15 +289,15 @@ void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
>   				 bool make_dirty)
>   {
>   	unsigned long index;
> -	struct page *head;
> -	unsigned int ntails;
> +	struct folio *folio;
> +	unsigned int nr;
>   
>   	if (!make_dirty) {
>   		unpin_user_pages(pages, npages);
>   		return;
>   	}
>   
> -	for_each_compound_head(index, pages, npages, head, ntails) {
> +	gup_for_each_folio(index, pages, npages, folio, nr) {
>   		/*
>   		 * Checking PageDirty at this point may race with
>   		 * clear_page_dirty_for_io(), but that's OK. Two key
> @@ -320,9 +318,12 @@ void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
>   		 * written back, so it gets written back again in the
>   		 * next writeback cycle. This is harmless.
>   		 */
> -		if (!PageDirty(head))
> -			set_page_dirty_lock(head);
> -		put_compound_head(head, ntails, FOLL_PIN);
> +		if (!folio_test_dirty(folio)) {
> +			folio_lock(folio);
> +			folio_mark_dirty(folio);
> +			folio_unlock(folio);

At some point, maybe even here, I suspect that creating the folio
version of set_page_dirty_lock() would help. I'm sure you have
a better feel for whether it helps, after doing all of this conversion
work, but it just sort of jumped out at me as surprising to see it
in this form.
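
Something like this, perhaps - the name and exact semantics are
hypothetical, just a sketch of what a folio analogue of
set_page_dirty_lock() could look like:

	static bool folio_mark_dirty_lock(struct folio *folio)
	{
		bool ret;

		folio_lock(folio);
		ret = folio_mark_dirty(folio);
		folio_unlock(folio);
		return ret;
	}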

In any case, this all looks correct, so


Reviewed-by: John Hubbard <jhubbard@nvidia.com>

thanks,
Matthew Wilcox Jan. 9, 2022, 4:39 a.m. UTC | #3
On Wed, Jan 05, 2022 at 12:17:46AM -0800, John Hubbard wrote:
> > +		if (!folio_test_dirty(folio)) {
> > +			folio_lock(folio);
> > +			folio_mark_dirty(folio);
> > +			folio_unlock(folio);
> 
> At some point, maybe even here, I suspect that creating the folio
> version of set_page_dirty_lock() would help. I'm sure you have
> a better feel for whether it helps, after doing all of this conversion
> work, but it just sort of jumped out at me as surprising to see it
> in this form.

I really hate set_page_dirty_lock().  It smacks of "there is a locking
rule here which we're violating, so we'll just take the lock to fix it"
without understanding why there's a locking problem here.

As far as I can tell, originally, the intent was that you would lock
the page before modifying any of the data in the page.  ie you would
do:

	gup()
	lock_page()
	addr = kmap_page()
	*addr = 1;
	kunmap_page()
	set_page_dirty()
	unlock_page()
	put_page()

and that would prevent races between modifying the page and (starting)
writeback, not to mention truncate() and various other operations.

Clearly we can't do that for DMA-pinned pages.  There's only one lock
bit.  But do we even need to take the lock if we have the page pinned?
What are we protecting against?

If it's truncate(), I think we must already have protection against
that as we could easily have:

	pup()
				truncate()
	lock_page()
	set_page_dirty()
	unlock_page()
	unpin
John Hubbard Jan. 9, 2022, 8:01 a.m. UTC | #4
On 1/8/22 20:39, Matthew Wilcox wrote:
> On Wed, Jan 05, 2022 at 12:17:46AM -0800, John Hubbard wrote:
>>> +		if (!folio_test_dirty(folio)) {
>>> +			folio_lock(folio);
>>> +			folio_mark_dirty(folio);
>>> +			folio_unlock(folio);
>>
>> At some point, maybe even here, I suspect that creating the folio
>> version of set_page_dirty_lock() would help. I'm sure you have
>> a better feel for whether it helps, after doing all of this conversion
>> work, but it just sort of jumped out at me as surprising to see it
>> in this form.
> 
> I really hate set_page_dirty_lock().  It smacks of "there is a locking
> rule here which we're violating, so we'll just take the lock to fix it"
> without understanding why there's a locking problem here.
> 
> As far as I can tell, originally, the intent was that you would lock
> the page before modifying any of the data in the page.  ie you would
> do:
> 
> 	gup()
> 	lock_page()
> 	addr = kmap_page()
> 	*addr = 1;
> 	kunmap_page()
> 	set_page_dirty()
> 	unlock_page()
> 	put_page()
> 
> and that would prevent races between modifying the page and (starting)
> writeback, not to mention truncate() and various other operations.
> 
> Clearly we can't do that for DMA-pinned pages.  There's only one lock
> bit.  But do we even need to take the lock if we have the page pinned?
> What are we protecting against?

This is a fun question, because you're asking it at a point when the
overall problem remains unsolved. That is, the interaction between
file-backed pages and gup/pup is still completely broken.

And I don't have an answer for you: it does seem like lock_page() is
completely pointless here. Looking back, there are some 25 callers of
unpin_user_pages_dirty_lock(), and during all those patch reviews, no
one noticed this point!

Anyway...so maybe most or even all of the unpin_user_pages_dirty_lock()
callers do not really need a _lock variant, after all.

Or, maybe it really is required and we're overlooking some subtle
filesystem-related point. It would be nice to get a second look from
Christoph Hellwig and Jan Kara (+CC).

> 
> If it's truncate(), I think we must already have protection against
> that as we could easily have:
> 
> 	pup()
> 				truncate()
> 	lock_page()
> 	set_page_dirty()
> 	unlock_page()
> 	unpin
> 

I think truncate vs. gup/pup() is no more and no less broken in this
regard than it's ever been. We need another LSF/MM conference with some
more shouting, and/or file leases, for that. :)


thanks,
Jan Kara Jan. 10, 2022, 3:22 p.m. UTC | #5
On Sun 09-01-22 00:01:49, John Hubbard wrote:
> On 1/8/22 20:39, Matthew Wilcox wrote:
> > On Wed, Jan 05, 2022 at 12:17:46AM -0800, John Hubbard wrote:
> > > > +		if (!folio_test_dirty(folio)) {
> > > > +			folio_lock(folio);
> > > > +			folio_mark_dirty(folio);
> > > > +			folio_unlock(folio);
> > > 
> > > At some point, maybe even here, I suspect that creating the folio
> > > version of set_page_dirty_lock() would help. I'm sure you have
> > > a better feel for whether it helps, after doing all of this conversion
> > > work, but it just sort of jumped out at me as surprising to see it
> > > in this form.
> > 
> > I really hate set_page_dirty_lock().  It smacks of "there is a locking
> > rule here which we're violating, so we'll just take the lock to fix it"
> > without understanding why there's a locking problem here.
> > 
> > As far as I can tell, originally, the intent was that you would lock
> > the page before modifying any of the data in the page.  ie you would
> > do:
> > 
> > 	gup()
> > 	lock_page()
> > 	addr = kmap_page()
> > 	*addr = 1;
> > 	kunmap_page()
> > 	set_page_dirty()
> > 	unlock_page()
> > 	put_page()
> > 
> > and that would prevent races between modifying the page and (starting)
> > writeback, not to mention truncate() and various other operations.
> > 
> > Clearly we can't do that for DMA-pinned pages.  There's only one lock
> > bit.  But do we even need to take the lock if we have the page pinned?
> > What are we protecting against?
> 
> This is a fun question, because you're asking it at a point when the
> overall problem remains unsolved. That is, the interaction between
> file-backed pages and gup/pup is still completely broken.
> 
> And I don't have an answer for you: it does seem like lock_page() is
> completely pointless here. Looking back, there are some 25 callers of
> unpin_user_pages_dirty_lock(), and during all those patch reviews, no
> one noticed this point!

I'd say it is underdocumented but not obviously pointless :) AFAIR (and
Christoph or Andrew may well correct me) the page lock in
set_page_dirty_lock() is there to protect metadata associated with the page
through page->private. Otherwise truncate could free these (e.g.
block_invalidatepage()) while ->set_page_dirty() callback (e.g.
__set_page_dirty_buffers()) works on this metadata.
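
I.e. without the page lock, an interleaving like this would be possible
(purely schematic, assuming a filesystem keeping buffer heads in
page->private):

	unpinning task				truncate
	set_page_dirty()
	  __set_page_dirty_buffers()
						lock_page()
						block_invalidatepage()
						  (frees the buffer heads)
	    walks the buffer list hanging
	    off page->private - use after free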

								Honza
Matthew Wilcox Jan. 10, 2022, 3:52 p.m. UTC | #6
On Mon, Jan 10, 2022 at 04:22:08PM +0100, Jan Kara wrote:
> On Sun 09-01-22 00:01:49, John Hubbard wrote:
> > On 1/8/22 20:39, Matthew Wilcox wrote:
> > > On Wed, Jan 05, 2022 at 12:17:46AM -0800, John Hubbard wrote:
> > > > > +		if (!folio_test_dirty(folio)) {
> > > > > +			folio_lock(folio);
> > > > > +			folio_mark_dirty(folio);
> > > > > +			folio_unlock(folio);
> > > > 
> > > > At some point, maybe even here, I suspect that creating the folio
> > > > version of set_page_dirty_lock() would help. I'm sure you have
> > > > a better feel for whether it helps, after doing all of this conversion
> > > > work, but it just sort of jumped out at me as surprising to see it
> > > > in this form.
> > > 
> > > I really hate set_page_dirty_lock().  It smacks of "there is a locking
> > > rule here which we're violating, so we'll just take the lock to fix it"
> > > without understanding why there's a locking problem here.
> > > 
> > > As far as I can tell, originally, the intent was that you would lock
> > > the page before modifying any of the data in the page.  ie you would
> > > do:
> > > 
> > > 	gup()
> > > 	lock_page()
> > > 	addr = kmap_page()
> > > 	*addr = 1;
> > > 	kunmap_page()
> > > 	set_page_dirty()
> > > 	unlock_page()
> > > 	put_page()
> > > 
> > > and that would prevent races between modifying the page and (starting)
> > > writeback, not to mention truncate() and various other operations.
> > > 
> > > Clearly we can't do that for DMA-pinned pages.  There's only one lock
> > > bit.  But do we even need to take the lock if we have the page pinned?
> > > What are we protecting against?
> > 
> > This is a fun question, because you're asking it at a point when the
> > overall problem remains unsolved. That is, the interaction between
> > file-backed pages and gup/pup is still completely broken.
> > 
> > And I don't have an answer for you: it does seem like lock_page() is
> > completely pointless here. Looking back, there are some 25 callers of
> > unpin_user_pages_dirty_lock(), and during all those patch reviews, no
> > one noticed this point!
> 
> I'd say it is underdocumented but not obviously pointless :) AFAIR (and
> Christoph or Andrew may well correct me) the page lock in
> set_page_dirty_lock() is there to protect metadata associated with the page
> through page->private. Otherwise truncate could free these (e.g.
> block_invalidatepage()) while ->set_page_dirty() callback (e.g.
> __set_page_dirty_buffers()) works on this metadata.

Yes, but ... we have an inconsistency between DMA writes to the page and
CPU writes to the page.

	fd = open(file)
	write(fd, 1024 * 1024)
	mmap(NULL, 1024 * 1024, PROT_RW, MAP_SHARED, fd, 0)
	register-memory-with-RDMA
	ftruncate(fd, 0);	// page is removed from page cache
	ftruncate(fd, 1024 * 1024) 

Now if we do a store from the CPU, we instantiate a new page in the
page cache and the store will be written back to the file.  If we do
an RDMA-write, the write goes to the old page and will be lost.  Indeed,
it's no longer visible to the CPU (but is visible to other RDMA reads!)

Which is fine if the program did it itself because it's doing something
clearly bonkers, but another program might be the one doing the
two truncate() steps, and this would surprise an innocent program.

I still favour blocking the truncate-down (or holepunch) until there
are no pinned pages in the inode.  But I know this is a change in
behaviour since for some reason, truncate() gets to override mmap().
Jan Kara Jan. 10, 2022, 8:36 p.m. UTC | #7
On Mon 10-01-22 15:52:51, Matthew Wilcox wrote:
> On Mon, Jan 10, 2022 at 04:22:08PM +0100, Jan Kara wrote:
> > On Sun 09-01-22 00:01:49, John Hubbard wrote:
> > > On 1/8/22 20:39, Matthew Wilcox wrote:
> > > > On Wed, Jan 05, 2022 at 12:17:46AM -0800, John Hubbard wrote:
> > > > > > +		if (!folio_test_dirty(folio)) {
> > > > > > +			folio_lock(folio);
> > > > > > +			folio_mark_dirty(folio);
> > > > > > +			folio_unlock(folio);
> > > > > 
> > > > > At some point, maybe even here, I suspect that creating the folio
> > > > > version of set_page_dirty_lock() would help. I'm sure you have
> > > > > a better feel for whether it helps, after doing all of this conversion
> > > > > work, but it just sort of jumped out at me as surprising to see it
> > > > > in this form.
> > > > 
> > > > I really hate set_page_dirty_lock().  It smacks of "there is a locking
> > > > rule here which we're violating, so we'll just take the lock to fix it"
> > > > without understanding why there's a locking problem here.
> > > > 
> > > > As far as I can tell, originally, the intent was that you would lock
> > > > the page before modifying any of the data in the page.  ie you would
> > > > do:
> > > > 
> > > > 	gup()
> > > > 	lock_page()
> > > > 	addr = kmap_page()
> > > > 	*addr = 1;
> > > > 	kunmap_page()
> > > > 	set_page_dirty()
> > > > 	unlock_page()
> > > > 	put_page()
> > > > 
> > > > and that would prevent races between modifying the page and (starting)
> > > > writeback, not to mention truncate() and various other operations.
> > > > 
> > > > Clearly we can't do that for DMA-pinned pages.  There's only one lock
> > > > bit.  But do we even need to take the lock if we have the page pinned?
> > > > What are we protecting against?
> > > 
> > > This is a fun question, because you're asking it at a point when the
> > > overall problem remains unsolved. That is, the interaction between
> > > file-backed pages and gup/pup is still completely broken.
> > > 
> > > And I don't have an answer for you: it does seem like lock_page() is
> > > completely pointless here. Looking back, there are some 25 callers of
> > > unpin_user_pages_dirty_lock(), and during all those patch reviews, no
> > > one noticed this point!
> > 
> > I'd say it is underdocumented but not obviously pointless :) AFAIR (and
> > Christoph or Andrew may well correct me) the page lock in
> > set_page_dirty_lock() is there to protect metadata associated with the page
> > through page->private. Otherwise truncate could free these (e.g.
> > block_invalidatepage()) while ->set_page_dirty() callback (e.g.
> > __set_page_dirty_buffers()) works on this metadata.
> 
> Yes, but ... we have an inconsistency between DMA writes to the page and
> CPU writes to the page.
> 
> 	fd = open(file)
> 	write(fd, 1024 * 1024)
> 	mmap(NULL, 1024 * 1024, PROT_RW, MAP_SHARED, fd, 0)
> 	register-memory-with-RDMA
> 	ftruncate(fd, 0);	// page is removed from page cache
> 	ftruncate(fd, 1024 * 1024) 
> 
> Now if we do a store from the CPU, we instantiate a new page in the
> page cache and the store will be written back to the file.  If we do
> an RDMA-write, the write goes to the old page and will be lost.  Indeed,
> it's no longer visible to the CPU (but is visible to other RDMA reads!)
> 
> Which is fine if the program did it itself because it's doing something
> clearly bonkers, but another program might be the one doing the
> two truncate() steps, and this would surprise an innocent program.
> 
> I still favour blocking the truncate-down (or holepunch) until there
> are no pinned pages in the inode.  But I know this is a change in
> behaviour since for some reason, truncate() gets to override mmap().

I agree although this is unrelated to the page lock discussion above. In
principle we can consider such a change (after all we chose this solution
for DAX) but it has some consequences - e.g. that disk space cannot be
reclaimed when someone has pagecache pages pinned (which may be unexpected
from sysadmin POV) or that we have to be careful, or an eager application
doing DIO (once it is converted to pinning) can block truncate
indefinitely.

								Honza
Matthew Wilcox Jan. 10, 2022, 9:10 p.m. UTC | #8
On Mon, Jan 10, 2022 at 09:36:11PM +0100, Jan Kara wrote:
> On Mon 10-01-22 15:52:51, Matthew Wilcox wrote:
> > On Mon, Jan 10, 2022 at 04:22:08PM +0100, Jan Kara wrote:
> > > On Sun 09-01-22 00:01:49, John Hubbard wrote:
> > > > On 1/8/22 20:39, Matthew Wilcox wrote:
> > > > > On Wed, Jan 05, 2022 at 12:17:46AM -0800, John Hubbard wrote:
> > > > > > > +		if (!folio_test_dirty(folio)) {
> > > > > > > +			folio_lock(folio);
> > > > > > > +			folio_mark_dirty(folio);
> > > > > > > +			folio_unlock(folio);
> > > > > > 
> > > > > > At some point, maybe even here, I suspect that creating the folio
> > > > > > version of set_page_dirty_lock() would help. I'm sure you have
> > > > > > a better feel for whether it helps, after doing all of this conversion
> > > > > > work, but it just sort of jumped out at me as surprising to see it
> > > > > > in this form.
> > > > > 
> > > > > I really hate set_page_dirty_lock().  It smacks of "there is a locking
> > > > > rule here which we're violating, so we'll just take the lock to fix it"
> > > > > without understanding why there's a locking problem here.
> > > > > 
> > > > > As far as I can tell, originally, the intent was that you would lock
> > > > > the page before modifying any of the data in the page.  ie you would
> > > > > do:
> > > > > 
> > > > > 	gup()
> > > > > 	lock_page()
> > > > > 	addr = kmap_page()
> > > > > 	*addr = 1;
> > > > > 	kunmap_page()
> > > > > 	set_page_dirty()
> > > > > 	unlock_page()
> > > > > 	put_page()
> > > > > 
> > > > > and that would prevent races between modifying the page and (starting)
> > > > > writeback, not to mention truncate() and various other operations.
> > > > > 
> > > > > Clearly we can't do that for DMA-pinned pages.  There's only one lock
> > > > > bit.  But do we even need to take the lock if we have the page pinned?
> > > > > What are we protecting against?
> > > > 
> > > > This is a fun question, because you're asking it at a point when the
> > > > overall problem remains unsolved. That is, the interaction between
> > > > file-backed pages and gup/pup is still completely broken.
> > > > 
> > > > And I don't have an answer for you: it does seem like lock_page() is
> > > > completely pointless here. Looking back, there are some 25 callers of
> > > > unpin_user_pages_dirty_lock(), and during all those patch reviews, no
> > > > one noticed this point!
> > > 
> > > I'd say it is underdocumented but not obviously pointless :) AFAIR (and
> > > Christoph or Andrew may well correct me) the page lock in
> > > set_page_dirty_lock() is there to protect metadata associated with the page
> > > through page->private. Otherwise truncate could free these (e.g.
> > > block_invalidatepage()) while ->set_page_dirty() callback (e.g.
> > > __set_page_dirty_buffers()) works on this metadata.
> > 
> > Yes, but ... we have an inconsistency between DMA writes to the page and
> > CPU writes to the page.
> > 
> > 	fd = open(file)
> > 	write(fd, 1024 * 1024)
> > 	mmap(NULL, 1024 * 1024, PROT_RW, MAP_SHARED, fd, 0)
> > 	register-memory-with-RDMA
> > 	ftruncate(fd, 0);	// page is removed from page cache
> > 	ftruncate(fd, 1024 * 1024) 
> > 
> > Now if we do a store from the CPU, we instantiate a new page in the
> > page cache and the store will be written back to the file.  If we do
> > an RDMA-write, the write goes to the old page and will be lost.  Indeed,
> > it's no longer visible to the CPU (but is visible to other RDMA reads!)
> > 
> > Which is fine if the program did it itself because it's doing something
> > clearly bonkers, but another program might be the one doing the
> > two truncate() steps, and this would surprise an innocent program.
> > 
> > I still favour blocking the truncate-down (or holepunch) until there
> > are no pinned pages in the inode.  But I know this is a change in
> > behaviour since for some reason, truncate() gets to override mmap().
> 
> I agree although this is unrelated to the page lock discussion above. In
> principle we can consider such a change (after all we chose this solution
> for DAX) but it has some consequences - e.g. that disk space cannot be
> reclaimed when someone has pagecache pages pinned (which may be unexpected
> from sysadmin POV) or that we have to be careful, or an eager application
> doing DIO (once it is converted to pinning) can block truncate
> indefinitely.

It's not unrelated ... once we figure out how to solve this problem,
the set_page_dirty() call happens while the page is still DMA-pinned,
so any solution can be applicable to both places.  Maybe the solution
to truncate vs DMA-pin won't be applicable to both ...

As far as badly behaved applications doing DMA-pinning blocking truncate()
goes, have we considered the possibility of declining the DMA pin if the
process does not own the mmaped file?  That would limit the amount of
trouble it can cause, but maybe it would break some interesting use cases.
Jan Kara Jan. 17, 2022, 12:07 p.m. UTC | #9
On Mon 10-01-22 21:10:36, Matthew Wilcox wrote:
> On Mon, Jan 10, 2022 at 09:36:11PM +0100, Jan Kara wrote:
> > On Mon 10-01-22 15:52:51, Matthew Wilcox wrote:
> > > On Mon, Jan 10, 2022 at 04:22:08PM +0100, Jan Kara wrote:
> > > > On Sun 09-01-22 00:01:49, John Hubbard wrote:
> > > > > On 1/8/22 20:39, Matthew Wilcox wrote:
> > > > > > On Wed, Jan 05, 2022 at 12:17:46AM -0800, John Hubbard wrote:
> > > > > > > > +		if (!folio_test_dirty(folio)) {
> > > > > > > > +			folio_lock(folio);
> > > > > > > > +			folio_mark_dirty(folio);
> > > > > > > > +			folio_unlock(folio);
> > > > > > > 
> > > > > > > At some point, maybe even here, I suspect that creating the folio
> > > > > > > version of set_page_dirty_lock() would help. I'm sure you have
> > > > > > > a better feel for whether it helps, after doing all of this conversion
> > > > > > > work, but it just sort of jumped out at me as surprising to see it
> > > > > > > in this form.
> > > > > > 
> > > > > > I really hate set_page_dirty_lock().  It smacks of "there is a locking
> > > > > > rule here which we're violating, so we'll just take the lock to fix it"
> > > > > > without understanding why there's a locking problem here.
> > > > > > 
> > > > > > As far as I can tell, originally, the intent was that you would lock
> > > > > > the page before modifying any of the data in the page.  ie you would
> > > > > > do:
> > > > > > 
> > > > > > 	gup()
> > > > > > 	lock_page()
> > > > > > 	addr = kmap_page()
> > > > > > 	*addr = 1;
> > > > > > 	kunmap_page()
> > > > > > 	set_page_dirty()
> > > > > > 	unlock_page()
> > > > > > 	put_page()
> > > > > > 
> > > > > > and that would prevent races between modifying the page and (starting)
> > > > > > writeback, not to mention truncate() and various other operations.
> > > > > > 
> > > > > > Clearly we can't do that for DMA-pinned pages.  There's only one lock
> > > > > > bit.  But do we even need to take the lock if we have the page pinned?
> > > > > > What are we protecting against?
> > > > > 
> > > > > This is a fun question, because you're asking it at a point when the
> > > > > overall problem remains unsolved. That is, the interaction between
> > > > > file-backed pages and gup/pup is still completely broken.
> > > > > 
> > > > > And I don't have an answer for you: it does seem like lock_page() is
> > > > > completely pointless here. Looking back, there are some 25 callers of
> > > > > unpin_user_pages_dirty_lock(), and during all those patch reviews, no
> > > > > one noticed this point!
> > > > 
> > > > I'd say it is underdocumented but not obviously pointless :) AFAIR (and
> > > > Christoph or Andrew may well correct me) the page lock in
> > > > set_page_dirty_lock() is there to protect metadata associated with the page
> > > > through page->private. Otherwise truncate could free these (e.g.
> > > > block_invalidatepage()) while ->set_page_dirty() callback (e.g.
> > > > __set_page_dirty_buffers()) works on this metadata.
> > > 
> > > Yes, but ... we have an inconsistency between DMA writes to the page and
> > > CPU writes to the page.
> > > 
> > > 	fd = open(file)
> > > 	write(fd, 1024 * 1024)
> > > 	mmap(NULL, 1024 * 1024, PROT_RW, MAP_SHARED, fd, 0)
> > > 	register-memory-with-RDMA
> > > 	ftruncate(fd, 0);	// page is removed from page cache
> > > 	ftruncate(fd, 1024 * 1024) 
> > > 
> > > Now if we do a store from the CPU, we instantiate a new page in the
> > > page cache and the store will be written back to the file.  If we do
> > > an RDMA-write, the write goes to the old page and will be lost.  Indeed,
> > > it's no longer visible to the CPU (but is visible to other RDMA reads!)
> > > 
> > > Which is fine if the program did it itself because it's doing something
> > > clearly bonkers, but another program might be the one doing the
> > > two truncate() steps, and this would surprise an innocent program.
> > > 
> > > I still favour blocking the truncate-down (or holepunch) until there
> > > are no pinned pages in the inode.  But I know this is a change in
> > > behaviour since for some reason, truncate() gets to override mmap().
> > 
> > I agree although this is unrelated to the page lock discussion above. In
> > principle we can consider such a change (after all we chose this solution
> > for DAX) but it has some consequences - e.g. that disk space cannot be
> > reclaimed when someone has pagecache pages pinned (which may be unexpected
> > from sysadmin POV) or that we have to be careful, or an eager application
> > doing DIO (once it is converted to pinning) can block truncate
> > indefinitely.
> 
> It's not unrelated ... once we figure out how to solve this problem,
> the set_page_dirty() call happens while the page is still DMA-pinned,
> so any solution can be applicable to both places.  Maybe the solution
> to truncate vs DMA-pin won't be applicable to both ...
> 
> As far as badly behaved applications doing DMA-pinning blocking truncate()
> goes, have we considered the possibility of declining the DMA pin if the
> process does not own the mmaped file?  That would limit the amount of
> trouble it can cause, but maybe it would break some interesting use cases.

Sorry for the delayed reply, this fell through the cracks. IMO this isn't
going to fly.
1) Direct IO needs to use DMA pinning for its buffers and you cannot
regress that by changing required permissions.
2) Also requiring file ownership for such an operation looks a bit weird
from userspace POV. Userspace is just using mmaped files, why should it
require extra privileges?
3) It would be a definitive goodbye to GUP-fast for pinning.

								Honza

Patch

diff --git a/mm/gup.c b/mm/gup.c
index 7bd1e4a2648a..eaffa6807609 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -239,31 +239,29 @@  static inline void compound_range_next(unsigned long i, unsigned long npages,
 	     __i < __npages; __i += __ntails, \
 	     compound_range_next(__i, __npages, __list, &(__head), &(__ntails)))
 
-static inline void compound_next(unsigned long i, unsigned long npages,
-				 struct page **list, struct page **head,
-				 unsigned int *ntails)
+static inline struct folio *gup_folio_next(unsigned long i,
+		unsigned long npages, struct page **list, unsigned int *ntails)
 {
-	struct page *page;
+	struct folio *folio;
 	unsigned int nr;
 
 	if (i >= npages)
-		return;
+		return NULL;
 
-	page = compound_head(list[i]);
+	folio = page_folio(list[i]);
 	for (nr = i + 1; nr < npages; nr++) {
-		if (compound_head(list[nr]) != page)
+		if (page_folio(list[nr]) != folio)
 			break;
 	}
 
-	*head = page;
 	*ntails = nr - i;
+	return folio;
 }
 
-#define for_each_compound_head(__i, __list, __npages, __head, __ntails) \
-	for (__i = 0, \
-	     compound_next(__i, __npages, __list, &(__head), &(__ntails)); \
-	     __i < __npages; __i += __ntails, \
-	     compound_next(__i, __npages, __list, &(__head), &(__ntails)))
+#define gup_for_each_folio(__i, __list, __npages, __folio, __ntails) \
+	for (__i = 0; \
+	     (__folio = gup_folio_next(__i, __npages, __list, &(__ntails))) != NULL; \
+	     __i += __ntails)
 
 /**
  * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
@@ -291,15 +289,15 @@  void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
 				 bool make_dirty)
 {
 	unsigned long index;
-	struct page *head;
-	unsigned int ntails;
+	struct folio *folio;
+	unsigned int nr;
 
 	if (!make_dirty) {
 		unpin_user_pages(pages, npages);
 		return;
 	}
 
-	for_each_compound_head(index, pages, npages, head, ntails) {
+	gup_for_each_folio(index, pages, npages, folio, nr) {
 		/*
 		 * Checking PageDirty at this point may race with
 		 * clear_page_dirty_for_io(), but that's OK. Two key
@@ -320,9 +318,12 @@  void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
 		 * written back, so it gets written back again in the
 		 * next writeback cycle. This is harmless.
 		 */
-		if (!PageDirty(head))
-			set_page_dirty_lock(head);
-		put_compound_head(head, ntails, FOLL_PIN);
+		if (!folio_test_dirty(folio)) {
+			folio_lock(folio);
+			folio_mark_dirty(folio);
+			folio_unlock(folio);
+		}
+		gup_put_folio(folio, nr, FOLL_PIN);
 	}
 }
 EXPORT_SYMBOL(unpin_user_pages_dirty_lock);
@@ -375,8 +376,8 @@  EXPORT_SYMBOL(unpin_user_page_range_dirty_lock);
 void unpin_user_pages(struct page **pages, unsigned long npages)
 {
 	unsigned long index;
-	struct page *head;
-	unsigned int ntails;
+	struct folio *folio;
+	unsigned int nr;
 
 	/*
 	 * If this WARN_ON() fires, then the system *might* be leaking pages (by
@@ -386,8 +387,8 @@  void unpin_user_pages(struct page **pages, unsigned long npages)
 	if (WARN_ON(IS_ERR_VALUE(npages)))
 		return;
 
-	for_each_compound_head(index, pages, npages, head, ntails)
-		put_compound_head(head, ntails, FOLL_PIN);
+	gup_for_each_folio(index, pages, npages, folio, nr)
+		gup_put_folio(folio, nr, FOLL_PIN);
 }
 EXPORT_SYMBOL(unpin_user_pages);