
[2/3] mm: provide mapping_wrprotect_page() function

Message ID c802c17cdba59e3455f3d7db07659d5da0ed6dc1.1738347308.git.lorenzo.stoakes@oracle.com (mailing list archive)
State New
Series: expose mapping wrprotect, fix fb_defio use

Commit Message

Lorenzo Stoakes Jan. 31, 2025, 6:28 p.m. UTC
In the fb_defio video driver, page dirty state is used to determine when
frame buffer pages have been changed, allowing for batched, deferred I/O to
be performed for efficiency.

This implementation had only one means of doing so effectively - the use of
the folio_mkclean() function.

However, this use of the function is inappropriate, as the fb_defio
implementation allocates kernel memory to back the framebuffer, and then is
forced to specify page->index, mapping fields in order to permit the
folio_mkclean() rmap traversal to proceed correctly.

It is not correct to specify these fields on kernel-allocated memory, and
moreover since these are not folios, page->index, mapping are deprecated
fields, soon to be removed.

We therefore need to provide a means by which we can correctly traverse the
reverse mapping and write-protect mappings for a page backing an
address_space page cache object at a given offset.

This patch provides this - mapping_wrprotect_page() allows for this
operation to be performed for a specified address_space, offset and page,
without requiring a folio nor, of course, an inappropriate use of
page->index, mapping.
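
As a purely illustrative sketch (not part of this patch, helper and
parameter names assumed), a deferred I/O flush path might invoke it
roughly as follows:

	/* Hypothetical helper: write-protect one flushed framebuffer page so
	 * that the next userland write faults and re-marks it dirty. */
	static void fbdefio_wrprotect_one(struct address_space *mapping,
					  struct page *page, pgoff_t pgoff)
	{
		int cleaned;

		/* Write-protect every shared userland PTE mapping this page. */
		cleaned = mapping_wrprotect_page(mapping, pgoff, 1, page);
		if (cleaned < 0)
			pr_warn("fbdefio: wrprotect failed: %d\n", cleaned);
	}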

With this provided, we can subsequently adjust the fb_defio implementation
to make use of this function and avoid incorrect invocation of
folio_mkclean() and more importantly, incorrect manipulation of
page->index, mapping fields.

Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
 include/linux/rmap.h |  3 ++
 mm/rmap.c            | 73 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 76 insertions(+)

Comments

Simona Vetter Feb. 3, 2025, 3:49 p.m. UTC | #1
On Fri, Jan 31, 2025 at 06:28:57PM +0000, Lorenzo Stoakes wrote:
> In the fb_defio video driver, page dirty state is used to determine when
> frame buffer pages have been changed, allowing for batched, deferred I/O to
> be performed for efficiency.
> 
> This implementation had only one means of doing so effectively - the use of
> the folio_mkclean() function.
> 
> However, this use of the function is inappropriate, as the fb_defio
> implementation allocates kernel memory to back the framebuffer, and then is
> forced to specify page->index, mapping fields in order to permit the
> folio_mkclean() rmap traversal to proceed correctly.
> 
> It is not correct to specify these fields on kernel-allocated memory, and
> moreover since these are not folios, page->index, mapping are deprecated
> fields, soon to be removed.
> 
> We therefore need to provide a means by which we can correctly traverse the
> reverse mapping and write-protect mappings for a page backing an
> address_space page cache object at a given offset.
> 
> This patch provides this - mapping_wrprotect_page() allows for this
> operation to be performed for a specified address_space, offset and page,
> without requiring a folio nor, of course, an inappropriate use of
> page->index, mapping.
> 
> With this provided, we can subsequently adjust the fb_defio implementation
> to make use of this function and avoid incorrect invocation of
> folio_mkclean() and more importantly, incorrect manipulation of
> page->index, mapping fields.
> 
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> ---
>  include/linux/rmap.h |  3 ++
>  mm/rmap.c            | 73 ++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 76 insertions(+)
> 
> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
> index 683a04088f3f..0bf5f64884df 100644
> --- a/include/linux/rmap.h
> +++ b/include/linux/rmap.h
> @@ -739,6 +739,9 @@ unsigned long page_address_in_vma(const struct folio *folio,
>   */
>  int folio_mkclean(struct folio *);
>  
> +int mapping_wrprotect_page(struct address_space *mapping, pgoff_t pgoff,
> +		unsigned long nr_pages, struct page *page);
> +
>  int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
>  		      struct vm_area_struct *vma);
>  
> diff --git a/mm/rmap.c b/mm/rmap.c
> index a2ff20c2eccd..bb5a42d95c48 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1127,6 +1127,79 @@ int folio_mkclean(struct folio *folio)
>  }
>  EXPORT_SYMBOL_GPL(folio_mkclean);
>  
> +struct wrprotect_file_state {
> +	int cleaned;
> +	pgoff_t pgoff;
> +	unsigned long pfn;
> +	unsigned long nr_pages;
> +};
> +
> +static bool mapping_wrprotect_page_one(struct folio *folio,
> +		struct vm_area_struct *vma, unsigned long address, void *arg)
> +{
> +	struct wrprotect_file_state *state = (struct wrprotect_file_state *)arg;
> +	struct page_vma_mapped_walk pvmw = {
> +		.pfn		= state->pfn,
> +		.nr_pages	= state->nr_pages,
> +		.pgoff		= state->pgoff,
> +		.vma		= vma,
> +		.address	= address,
> +		.flags		= PVMW_SYNC,
> +	};
> +
> +	state->cleaned += page_vma_mkclean_one(&pvmw);
> +
> +	return true;
> +}
> +
> +static void __rmap_walk_file(struct folio *folio, struct address_space *mapping,
> +			     pgoff_t pgoff_start, unsigned long nr_pages,
> +			     struct rmap_walk_control *rwc, bool locked);
> +
> +/**
> + * mapping_wrprotect_page() - Write protect all mappings of this page.
> + *
> + * @mapping:	The mapping whose reverse mapping should be traversed.
> + * @pgoff:	The page offset at which @page is mapped within @mapping.
> + * @nr_pages:	The number of physically contiguous base pages spanned.
> + * @page:	The page mapped in @mapping at @pgoff.
> + *
> + * Traverses the reverse mapping, finding all VMAs which contain a shared
> + * mapping of the single @page in @mapping at offset @pgoff and write-protecting
> + * the mappings.
> + *
> + * The page does not have to be a folio, but rather can be a kernel allocation
> + * that is mapped into userland. We therefore do not require that the page maps
> + * to a folio with a valid mapping or index field, rather these are specified in
> + * @mapping and @pgoff.
> + *
> + * Return: the number of write-protected PTEs, or an error.
> + */
> +int mapping_wrprotect_page(struct address_space *mapping, pgoff_t pgoff,
> +		unsigned long nr_pages, struct page *page)
> +{
> +	struct wrprotect_file_state state = {
> +		.cleaned = 0,
> +		.pgoff = pgoff,
> +		.pfn = page_to_pfn(page),

Could we go one step further and entirely drop the struct page? Similar to
unmap_mapping_range for VM_SPECIAL mappings, except it only updates the
write protection. The reason is that ideally we'd like fbdev defio to
entirely get rid of any struct page usage, because with some dma_alloc()
memory regions there's simply no struct page for them (it's a carveout).
See e.g.  Sa498d4d06d6 ("drm/fbdev-dma: Only install deferred I/O if
necessary") for some of the pain this has caused.

So entirely struct page less way to write protect a pfn would be best. And
it doesn't look like you need the page here at all?

Cheers, Sima


> +		.nr_pages = nr_pages,
> +	};
> +	struct rmap_walk_control rwc = {
> +		.arg = (void *)&state,
> +		.rmap_one = mapping_wrprotect_page_one,
> +		.invalid_vma = invalid_mkclean_vma,
> +	};
> +
> +	if (!mapping)
> +		return 0;
> +
> +	__rmap_walk_file(/* folio = */NULL, mapping, pgoff, nr_pages, &rwc,
> +			 /* locked = */false);
> +
> +	return state.cleaned;
> +}
> +EXPORT_SYMBOL_GPL(mapping_wrprotect_page);
> +
>  /**
>   * pfn_mkclean_range - Cleans the PTEs (including PMDs) mapped with range of
>   *                     [@pfn, @pfn + @nr_pages) at the specific offset (@pgoff)
> -- 
> 2.48.1
>
Lorenzo Stoakes Feb. 3, 2025, 4:30 p.m. UTC | #2
On Mon, Feb 03, 2025 at 04:49:34PM +0100, Simona Vetter wrote:
> On Fri, Jan 31, 2025 at 06:28:57PM +0000, Lorenzo Stoakes wrote:
> > In the fb_defio video driver, page dirty state is used to determine when
> > frame buffer pages have been changed, allowing for batched, deferred I/O to
> > be performed for efficiency.
> >
> > This implementation had only one means of doing so effectively - the use of
> > the folio_mkclean() function.
> >
> > However, this use of the function is inappropriate, as the fb_defio
> > implementation allocates kernel memory to back the framebuffer, and then is
> > forced to specify page->index, mapping fields in order to permit the
> > folio_mkclean() rmap traversal to proceed correctly.
> >
> > It is not correct to specify these fields on kernel-allocated memory, and
> > moreover since these are not folios, page->index, mapping are deprecated
> > fields, soon to be removed.
> >
> > We therefore need to provide a means by which we can correctly traverse the
> > reverse mapping and write-protect mappings for a page backing an
> > address_space page cache object at a given offset.
> >
> > This patch provides this - mapping_wrprotect_page() allows for this
> > operation to be performed for a specified address_space, offset and page,
> > without requiring a folio nor, of course, an inappropriate use of
> > page->index, mapping.
> >
> > With this provided, we can subsequently adjust the fb_defio implementation
> > to make use of this function and avoid incorrect invocation of
> > folio_mkclean() and more importantly, incorrect manipulation of
> > page->index, mapping fields.
> >
> > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > ---
> >  include/linux/rmap.h |  3 ++
> >  mm/rmap.c            | 73 ++++++++++++++++++++++++++++++++++++++++++++
> >  2 files changed, 76 insertions(+)
> >
> > diff --git a/include/linux/rmap.h b/include/linux/rmap.h
> > index 683a04088f3f..0bf5f64884df 100644
> > --- a/include/linux/rmap.h
> > +++ b/include/linux/rmap.h
> > @@ -739,6 +739,9 @@ unsigned long page_address_in_vma(const struct folio *folio,
> >   */
> >  int folio_mkclean(struct folio *);
> >
> > +int mapping_wrprotect_page(struct address_space *mapping, pgoff_t pgoff,
> > +		unsigned long nr_pages, struct page *page);
> > +
> >  int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
> >  		      struct vm_area_struct *vma);
> >
> > diff --git a/mm/rmap.c b/mm/rmap.c
> > index a2ff20c2eccd..bb5a42d95c48 100644
> > --- a/mm/rmap.c
> > +++ b/mm/rmap.c
> > @@ -1127,6 +1127,79 @@ int folio_mkclean(struct folio *folio)
> >  }
> >  EXPORT_SYMBOL_GPL(folio_mkclean);
> >
> > +struct wrprotect_file_state {
> > +	int cleaned;
> > +	pgoff_t pgoff;
> > +	unsigned long pfn;
> > +	unsigned long nr_pages;
> > +};
> > +
> > +static bool mapping_wrprotect_page_one(struct folio *folio,
> > +		struct vm_area_struct *vma, unsigned long address, void *arg)
> > +{
> > +	struct wrprotect_file_state *state = (struct wrprotect_file_state *)arg;
> > +	struct page_vma_mapped_walk pvmw = {
> > +		.pfn		= state->pfn,
> > +		.nr_pages	= state->nr_pages,
> > +		.pgoff		= state->pgoff,
> > +		.vma		= vma,
> > +		.address	= address,
> > +		.flags		= PVMW_SYNC,
> > +	};
> > +
> > +	state->cleaned += page_vma_mkclean_one(&pvmw);
> > +
> > +	return true;
> > +}
> > +
> > +static void __rmap_walk_file(struct folio *folio, struct address_space *mapping,
> > +			     pgoff_t pgoff_start, unsigned long nr_pages,
> > +			     struct rmap_walk_control *rwc, bool locked);
> > +
> > +/**
> > + * mapping_wrprotect_page() - Write protect all mappings of this page.
> > + *
> > + * @mapping:	The mapping whose reverse mapping should be traversed.
> > + * @pgoff:	The page offset at which @page is mapped within @mapping.
> > + * @nr_pages:	The number of physically contiguous base pages spanned.
> > + * @page:	The page mapped in @mapping at @pgoff.
> > + *
> > + * Traverses the reverse mapping, finding all VMAs which contain a shared
> > + * mapping of the single @page in @mapping at offset @pgoff and write-protecting
> > + * the mappings.
> > + *
> > + * The page does not have to be a folio, but rather can be a kernel allocation
> > + * that is mapped into userland. We therefore do not require that the page maps
> > + * to a folio with a valid mapping or index field, rather these are specified in
> > + * @mapping and @pgoff.
> > + *
> > + * Return: the number of write-protected PTEs, or an error.
> > + */
> > +int mapping_wrprotect_page(struct address_space *mapping, pgoff_t pgoff,
> > +		unsigned long nr_pages, struct page *page)
> > +{
> > +	struct wrprotect_file_state state = {
> > +		.cleaned = 0,
> > +		.pgoff = pgoff,
> > +		.pfn = page_to_pfn(page),
>
> Could we go one step further and entirely drop the struct page? Similar to
> unmap_mapping_range for VM_SPECIAL mappings, except it only updates the
> write protection. The reason is that ideally we'd like fbdev defio to
> entirely get rid of any struct page usage, because with some dma_alloc()
> memory regions there's simply no struct page for them (it's a carveout).
> See e.g.  Sa498d4d06d6 ("drm/fbdev-dma: Only install deferred I/O if
> necessary") for some of the pain this has caused.
>
> So entirely struct page less way to write protect a pfn would be best. And
> it doesn't look like you need the page here at all?

In the original version [1] we did indeed take a PFN, so this shouldn't be
a problem to change.

Since we make it possible here to explicitly reference the address_space
object mapping the range, and from that can find all the VMAs that map the
page range [pgoff, pgoff + nr_pages), I don't think we do need to think
about a struct page here at all.

The defio code does seem to have some questionable assumptions in place, or
at least ones I couldn't explain away re: attempting to folio-lock (the
non-folios...), so there'd need to be changes on that side, which I suggest
would probably be best for a follow-up series given this one's urgency.

But I'm more than happy to make this interface work with that by doing
another revision where we export PFN only, I think something like:

int mapping_wrprotect_range(struct address_space *mapping, pgoff_t pgoff,
		unsigned long pfn, unsigned long nr_pages);

Should work?
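
For concreteness, a minimal, untested sketch of that variant, reusing the
walk from this patch unchanged and simply taking the PFN directly (the
_page naming of the internal helpers would want updating too):

int mapping_wrprotect_range(struct address_space *mapping, pgoff_t pgoff,
		unsigned long pfn, unsigned long nr_pages)
{
	struct wrprotect_file_state state = {
		.cleaned = 0,
		.pgoff = pgoff,
		.pfn = pfn,		/* caller supplies the PFN directly */
		.nr_pages = nr_pages,
	};
	struct rmap_walk_control rwc = {
		.arg = (void *)&state,
		.rmap_one = mapping_wrprotect_page_one,
		.invalid_vma = invalid_mkclean_vma,
	};

	if (!mapping)
		return 0;

	__rmap_walk_file(/* folio = */NULL, mapping, pgoff, nr_pages, &rwc,
			 /* locked = */false);

	return state.cleaned;
}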

[1]:https://lore.kernel.org/all/cover.1736352361.git.lorenzo.stoakes@oracle.com/

>
> Cheers, Sima

Thanks!

>
>
> > +		.nr_pages = nr_pages,
> > +	};
> > +	struct rmap_walk_control rwc = {
> > +		.arg = (void *)&state,
> > +		.rmap_one = mapping_wrprotect_page_one,
> > +		.invalid_vma = invalid_mkclean_vma,
> > +	};
> > +
> > +	if (!mapping)
> > +		return 0;
> > +
> > +	__rmap_walk_file(/* folio = */NULL, mapping, pgoff, nr_pages, &rwc,
> > +			 /* locked = */false);
> > +
> > +	return state.cleaned;
> > +}
> > +EXPORT_SYMBOL_GPL(mapping_wrprotect_page);
> > +
> >  /**
> >   * pfn_mkclean_range - Cleans the PTEs (including PMDs) mapped with range of
> >   *                     [@pfn, @pfn + @nr_pages) at the specific offset (@pgoff)
> > --
> > 2.48.1
> >
>
> --
> Simona Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch
Christoph Hellwig Feb. 4, 2025, 5:36 a.m. UTC | #3
Hi Simona,


On Mon, Feb 03, 2025 at 04:49:34PM +0100, Simona Vetter wrote:
> 
> Could we go one step further and entirely drop the struct page? Similar to
> unmap_mapping_range for VM_SPECIAL mappings, except it only updates the
> write protection. The reason is that ideally we'd like fbdev defio to
> entirely get rid of any struct page usage, because with some dma_alloc()
> memory regions there's simply no struct page for them (it's a carveout).

Umm, for dma_alloc* where * is not _pages you never can get a page or
PFN from them.  They are black boxes and drivers must not attempt to
translate them into either a page or PFN or things will go wrong.
Only the kernel virtual address and dma_address may be used.
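
For instance, with dma_alloc_coherent() all the caller ever gets back is a
kernel virtual address and a dma_addr_t:

	void *vaddr;
	dma_addr_t dma_handle;

	/* The API hands back a kernel virtual address and a DMA address;
	 * there is deliberately no struct page or PFN that may be derived
	 * from either of them. */
	vaddr = dma_alloc_coherent(dev, size, &dma_handle, GFP_KERNEL);
	if (!vaddr)
		return -ENOMEM;

	/* ... CPU access via vaddr, device access via dma_handle ... */

	dma_free_coherent(dev, size, vaddr, dma_handle);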

> See e.g.  Sa498d4d06d6 ("drm/fbdev-dma: Only install deferred I/O if
> necessary") for some of the pain this has caused.

The commit hash is corrupted, I guess this is 5a498d4d06d6 as the
subject line matches.  And that commit (just like the code it is trying
to fix) is completely broken as it violates the above.
Thomas Zimmermann Feb. 4, 2025, 8:16 a.m. UTC | #4
Hi


On 04.02.25 06:36, Christoph Hellwig wrote:
> Hi Simona,
>
>
> On Mon, Feb 03, 2025 at 04:49:34PM +0100, Simona Vetter wrote:
>> Could we go one step further and entirely drop the struct page? Similar to
>> unmap_mapping_range for VM_SPECIAL mappings, except it only updates the
>> write protection. The reason is that ideally we'd like fbdev defio to
>> entirely get rid of any struct page usage, because with some dma_alloc()
>> memory regions there's simply no struct page for them (it's a carveout).
> Umm, for dma_alloc* where * is not _pages you never can get a page or
> PFN from them.  They are black boxes and drivers must not attempt to
> translate them into either a page or PFN or things will go wrong.
> Only the kernel virtual address and dma_address may be used.
>
>> See e.g.  Sa498d4d06d6 ("drm/fbdev-dma: Only install deferred I/O if
>> necessary") for some of the pain this has caused.
> The commit hash is corrupted, I guess this is 5a498d4d06d6 as the
> subject line matches.  And that commit (just like the code it is trying
> to fix) is completely broken as it violates the above.

As the author of these commits, I have no doubt that the code would 
break on some systems. It works in practice for the current use cases 
though. Is it possible to create something that tracks framebuffer 
writes without requiring pages?

Best regards
Thomas

>
>
Simona Vetter Feb. 4, 2025, 10:19 a.m. UTC | #5
On Mon, Feb 03, 2025 at 04:30:04PM +0000, Lorenzo Stoakes wrote:
> On Mon, Feb 03, 2025 at 04:49:34PM +0100, Simona Vetter wrote:
> > On Fri, Jan 31, 2025 at 06:28:57PM +0000, Lorenzo Stoakes wrote:
> > > In the fb_defio video driver, page dirty state is used to determine when
> > > frame buffer pages have been changed, allowing for batched, deferred I/O to
> > > be performed for efficiency.
> > >
> > > This implementation had only one means of doing so effectively - the use of
> > > the folio_mkclean() function.
> > >
> > > However, this use of the function is inappropriate, as the fb_defio
> > > implementation allocates kernel memory to back the framebuffer, and then is
> > > forced to specify page->index, mapping fields in order to permit the
> > > folio_mkclean() rmap traversal to proceed correctly.
> > >
> > > It is not correct to specify these fields on kernel-allocated memory, and
> > > moreover since these are not folios, page->index, mapping are deprecated
> > > fields, soon to be removed.
> > >
> > > We therefore need to provide a means by which we can correctly traverse the
> > > reverse mapping and write-protect mappings for a page backing an
> > > address_space page cache object at a given offset.
> > >
> > > This patch provides this - mapping_wrprotect_page() allows for this
> > > operation to be performed for a specified address_space, offset and page,
> > > without requiring a folio nor, of course, an inappropriate use of
> > > page->index, mapping.
> > >
> > > With this provided, we can subsequently adjust the fb_defio implementation
> > > to make use of this function and avoid incorrect invocation of
> > > folio_mkclean() and more importantly, incorrect manipulation of
> > > page->index, mapping fields.
> > >
> > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > > ---
> > >  include/linux/rmap.h |  3 ++
> > >  mm/rmap.c            | 73 ++++++++++++++++++++++++++++++++++++++++++++
> > >  2 files changed, 76 insertions(+)
> > >
> > > diff --git a/include/linux/rmap.h b/include/linux/rmap.h
> > > index 683a04088f3f..0bf5f64884df 100644
> > > --- a/include/linux/rmap.h
> > > +++ b/include/linux/rmap.h
> > > @@ -739,6 +739,9 @@ unsigned long page_address_in_vma(const struct folio *folio,
> > >   */
> > >  int folio_mkclean(struct folio *);
> > >
> > > +int mapping_wrprotect_page(struct address_space *mapping, pgoff_t pgoff,
> > > +		unsigned long nr_pages, struct page *page);
> > > +
> > >  int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
> > >  		      struct vm_area_struct *vma);
> > >
> > > diff --git a/mm/rmap.c b/mm/rmap.c
> > > index a2ff20c2eccd..bb5a42d95c48 100644
> > > --- a/mm/rmap.c
> > > +++ b/mm/rmap.c
> > > @@ -1127,6 +1127,79 @@ int folio_mkclean(struct folio *folio)
> > >  }
> > >  EXPORT_SYMBOL_GPL(folio_mkclean);
> > >
> > > +struct wrprotect_file_state {
> > > +	int cleaned;
> > > +	pgoff_t pgoff;
> > > +	unsigned long pfn;
> > > +	unsigned long nr_pages;
> > > +};
> > > +
> > > +static bool mapping_wrprotect_page_one(struct folio *folio,
> > > +		struct vm_area_struct *vma, unsigned long address, void *arg)
> > > +{
> > > +	struct wrprotect_file_state *state = (struct wrprotect_file_state *)arg;
> > > +	struct page_vma_mapped_walk pvmw = {
> > > +		.pfn		= state->pfn,
> > > +		.nr_pages	= state->nr_pages,
> > > +		.pgoff		= state->pgoff,
> > > +		.vma		= vma,
> > > +		.address	= address,
> > > +		.flags		= PVMW_SYNC,
> > > +	};
> > > +
> > > +	state->cleaned += page_vma_mkclean_one(&pvmw);
> > > +
> > > +	return true;
> > > +}
> > > +
> > > +static void __rmap_walk_file(struct folio *folio, struct address_space *mapping,
> > > +			     pgoff_t pgoff_start, unsigned long nr_pages,
> > > +			     struct rmap_walk_control *rwc, bool locked);
> > > +
> > > +/**
> > > + * mapping_wrprotect_page() - Write protect all mappings of this page.
> > > + *
> > > + * @mapping:	The mapping whose reverse mapping should be traversed.
> > > + * @pgoff:	The page offset at which @page is mapped within @mapping.
> > > + * @nr_pages:	The number of physically contiguous base pages spanned.
> > > + * @page:	The page mapped in @mapping at @pgoff.
> > > + *
> > > + * Traverses the reverse mapping, finding all VMAs which contain a shared
> > > + * mapping of the single @page in @mapping at offset @pgoff and write-protecting
> > > + * the mappings.
> > > + *
> > > + * The page does not have to be a folio, but rather can be a kernel allocation
> > > + * that is mapped into userland. We therefore do not require that the page maps
> > > + * to a folio with a valid mapping or index field, rather these are specified in
> > > + * @mapping and @pgoff.
> > > + *
> > > + * Return: the number of write-protected PTEs, or an error.
> > > + */
> > > +int mapping_wrprotect_page(struct address_space *mapping, pgoff_t pgoff,
> > > +		unsigned long nr_pages, struct page *page)
> > > +{
> > > +	struct wrprotect_file_state state = {
> > > +		.cleaned = 0,
> > > +		.pgoff = pgoff,
> > > +		.pfn = page_to_pfn(page),
> >
> > Could we go one step further and entirely drop the struct page? Similar to
> > unmap_mapping_range for VM_SPECIAL mappings, except it only updates the
> > write protection. The reason is that ideally we'd like fbdev defio to
> > entirely get rid of any struct page usage, because with some dma_alloc()
> > memory regions there's simply no struct page for them (it's a carveout).
> > See e.g.  Sa498d4d06d6 ("drm/fbdev-dma: Only install deferred I/O if
> > necessary") for some of the pain this has caused.
> >
> > So entirely struct page less way to write protect a pfn would be best. And
> > it doesn't look like you need the page here at all?
> 
> In the original version [1] we did indeed take a PFN, so this shouldn't be
> a problem to change.
> 
> Since we make it possible here to explicitly reference the address_space
> object mapping the range, and from that can find all the VMAs that map the
> page range [pgoff, pgoff + nr_pages), I don't think we do need to think
> about a struct page here at all.
> 
> The defio code does seem to have some questionable assumptions in place, or
> at least ones I couldn't explain away re: attempting to folio-lock (the
> non-folios...), so there'd need to be changes on that side, which I suggest
> would probably be best for a follow-up series given this one's urgency.

Yeah there's a bunch more things we need to do to get there. It was the
lack of a pfn-based core mm function that stopped us from doing that thus
far, plus also fbdev defio being very low priority. But it would
definitely avoid a bunch of corner cases and duplication in fbdev
emulation code in drivers/gpu/drm.

> But I'm more than happy to make this interface work with that by doing
> another revision where we export PFN only, I think something like:
> 
> int mapping_wrprotect_range(struct address_space *mapping, pgoff_t pgoff,
> 		unsigned long pfn, unsigned long nr_pages);
> 
> Should work?
> 
> [1]:https://lore.kernel.org/all/cover.1736352361.git.lorenzo.stoakes@oracle.com/


Yup that looks like the thing we'll need to wean defio of all that
questionable folio/page wrangling. But like you say, should be easy to
add/update when we get there.

Thanks, Sima

> 
> >
> > Cheers, Sima
> 
> Thanks!
> 
> >
> >
> > > +		.nr_pages = nr_pages,
> > > +	};
> > > +	struct rmap_walk_control rwc = {
> > > +		.arg = (void *)&state,
> > > +		.rmap_one = mapping_wrprotect_page_one,
> > > +		.invalid_vma = invalid_mkclean_vma,
> > > +	};
> > > +
> > > +	if (!mapping)
> > > +		return 0;
> > > +
> > > +	__rmap_walk_file(/* folio = */NULL, mapping, pgoff, nr_pages, &rwc,
> > > +			 /* locked = */false);
> > > +
> > > +	return state.cleaned;
> > > +}
> > > +EXPORT_SYMBOL_GPL(mapping_wrprotect_page);
> > > +
> > >  /**
> > >   * pfn_mkclean_range - Cleans the PTEs (including PMDs) mapped with range of
> > >   *                     [@pfn, @pfn + @nr_pages) at the specific offset (@pgoff)
> > > --
> > > 2.48.1
> > >
> >
> > --
> > Simona Vetter
> > Software Engineer, Intel Corporation
> > http://blog.ffwll.ch

Patch

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 683a04088f3f..0bf5f64884df 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -739,6 +739,9 @@  unsigned long page_address_in_vma(const struct folio *folio,
  */
 int folio_mkclean(struct folio *);
 
+int mapping_wrprotect_page(struct address_space *mapping, pgoff_t pgoff,
+		unsigned long nr_pages, struct page *page);
+
 int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
 		      struct vm_area_struct *vma);
 
diff --git a/mm/rmap.c b/mm/rmap.c
index a2ff20c2eccd..bb5a42d95c48 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1127,6 +1127,79 @@  int folio_mkclean(struct folio *folio)
 }
 EXPORT_SYMBOL_GPL(folio_mkclean);
 
+struct wrprotect_file_state {
+	int cleaned;
+	pgoff_t pgoff;
+	unsigned long pfn;
+	unsigned long nr_pages;
+};
+
+static bool mapping_wrprotect_page_one(struct folio *folio,
+		struct vm_area_struct *vma, unsigned long address, void *arg)
+{
+	struct wrprotect_file_state *state = (struct wrprotect_file_state *)arg;
+	struct page_vma_mapped_walk pvmw = {
+		.pfn		= state->pfn,
+		.nr_pages	= state->nr_pages,
+		.pgoff		= state->pgoff,
+		.vma		= vma,
+		.address	= address,
+		.flags		= PVMW_SYNC,
+	};
+
+	state->cleaned += page_vma_mkclean_one(&pvmw);
+
+	return true;
+}
+
+static void __rmap_walk_file(struct folio *folio, struct address_space *mapping,
+			     pgoff_t pgoff_start, unsigned long nr_pages,
+			     struct rmap_walk_control *rwc, bool locked);
+
+/**
+ * mapping_wrprotect_page() - Write protect all mappings of this page.
+ *
+ * @mapping:	The mapping whose reverse mapping should be traversed.
+ * @pgoff:	The page offset at which @page is mapped within @mapping.
+ * @nr_pages:	The number of physically contiguous base pages spanned.
+ * @page:	The page mapped in @mapping at @pgoff.
+ *
+ * Traverses the reverse mapping, finding all VMAs which contain a shared
+ * mapping of the single @page in @mapping at offset @pgoff and write-protecting
+ * the mappings.
+ *
+ * The page does not have to be a folio, but rather can be a kernel allocation
+ * that is mapped into userland. We therefore do not require that the page maps
+ * to a folio with a valid mapping or index field, rather these are specified in
+ * @mapping and @pgoff.
+ *
+ * Return: the number of write-protected PTEs, or an error.
+ */
+int mapping_wrprotect_page(struct address_space *mapping, pgoff_t pgoff,
+		unsigned long nr_pages, struct page *page)
+{
+	struct wrprotect_file_state state = {
+		.cleaned = 0,
+		.pgoff = pgoff,
+		.pfn = page_to_pfn(page),
+		.nr_pages = nr_pages,
+	};
+	struct rmap_walk_control rwc = {
+		.arg = (void *)&state,
+		.rmap_one = mapping_wrprotect_page_one,
+		.invalid_vma = invalid_mkclean_vma,
+	};
+
+	if (!mapping)
+		return 0;
+
+	__rmap_walk_file(/* folio = */NULL, mapping, pgoff, nr_pages, &rwc,
+			 /* locked = */false);
+
+	return state.cleaned;
+}
+EXPORT_SYMBOL_GPL(mapping_wrprotect_page);
+
 /**
  * pfn_mkclean_range - Cleans the PTEs (including PMDs) mapped with range of
  *                     [@pfn, @pfn + @nr_pages) at the specific offset (@pgoff)