diff mbox series

[v4,09/10] userfaultfd/shmem: modify shmem_mcopy_atomic_pte to use install_pte()

Message ID 20210420220804.486803-10-axelrasmussen@google.com (mailing list archive)
State New
Headers show
Series userfaultfd: add minor fault handling for shmem | expand

Commit Message

Axel Rasmussen April 20, 2021, 10:08 p.m. UTC
In a previous commit, we added the mcopy_atomic_install_pte() helper.
This helper does the job of setting up PTEs for an existing page, to map
it into a given VMA. It deals with both the anon and shmem cases, as
well as the shared and private cases.

In other words, shmem_mcopy_atomic_pte() duplicates a case it already
handles. So, expose it, and let shmem_mcopy_atomic_pte() use it
directly, to reduce code duplication.

This requires that we refactor shmem_mcopy_atomic_pte() a bit:

Instead of doing accounting (shmem_recalc_inode() et al) part-way
through the PTE setup, do it beforehand. This frees up
mcopy_atomic_install_pte() from having to care about this accounting,
but it does mean we need to clean it up if we get a failure afterwards
(shmem_uncharge()).

We can *almost* use shmem_charge() to do this, reducing code
duplication. But, it does `inode->i_mapping->nrpages++`, which would
double-count since shmem_add_to_page_cache() also does this.

Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
---
 include/linux/userfaultfd_k.h |  5 ++++
 mm/shmem.c                    | 53 ++++++++---------------------------
 mm/userfaultfd.c              | 17 ++++-------
 3 files changed, 22 insertions(+), 53 deletions(-)

Comments

Peter Xu April 21, 2021, 1:02 a.m. UTC | #1
On Tue, Apr 20, 2021 at 03:08:03PM -0700, Axel Rasmussen wrote:
> In a previous commit, we added the mcopy_atomic_install_pte() helper.
> This helper does the job of setting up PTEs for an existing page, to map
> it into a given VMA. It deals with both the anon and shmem cases, as
> well as the shared and private cases.
> 
> In other words, shmem_mcopy_atomic_pte() duplicates a case it already
> handles. So, expose it, and let shmem_mcopy_atomic_pte() use it
> directly, to reduce code duplication.
> 
> This requires that we refactor shmem_mcopy_atomic_pte() a bit:
> 
> Instead of doing accounting (shmem_recalc_inode() et al) part-way
> through the PTE setup, do it beforehand. This frees up
> mcopy_atomic_install_pte() from having to care about this accounting,
> but it does mean we need to clean it up if we get a failure afterwards
> (shmem_uncharge()).
> 
> We can *almost* use shmem_charge() to do this, reducing code
> duplication. But, it does `inode->i_mapping->nrpages++`, which would
> double-count since shmem_add_to_page_cache() also does this.

Missing to mention the lru_cache_add() replacement comment as Hugh commented on
this?

> 
> Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
> ---
>  include/linux/userfaultfd_k.h |  5 ++++
>  mm/shmem.c                    | 53 ++++++++---------------------------
>  mm/userfaultfd.c              | 17 ++++-------
>  3 files changed, 22 insertions(+), 53 deletions(-)
> 
> diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
> index 794d1538b8ba..39c094cc6641 100644
> --- a/include/linux/userfaultfd_k.h
> +++ b/include/linux/userfaultfd_k.h
> @@ -53,6 +53,11 @@ enum mcopy_atomic_mode {
>  	MCOPY_ATOMIC_CONTINUE,
>  };
>  
> +extern int mcopy_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
> +				    struct vm_area_struct *dst_vma,
> +				    unsigned long dst_addr, struct page *page,
> +				    bool newly_allocated, bool wp_copy);
> +
>  extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
>  			    unsigned long src_start, unsigned long len,
>  			    bool *mmap_changing, __u64 mode);
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 30c0bb501dc9..9bfa80fcd414 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -2378,10 +2378,8 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
>  	struct address_space *mapping = inode->i_mapping;
>  	gfp_t gfp = mapping_gfp_mask(mapping);
>  	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
> -	spinlock_t *ptl;
>  	void *page_kaddr;
>  	struct page *page;
> -	pte_t _dst_pte, *dst_pte;
>  	int ret;
>  	pgoff_t max_off;
>  
> @@ -2391,8 +2389,10 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
>  
>  	if (!*pagep) {
>  		page = shmem_alloc_page(gfp, info, pgoff);
> -		if (!page)
> -			goto out_unacct_blocks;
> +		if (!page) {
> +			shmem_inode_unacct_blocks(inode, 1);
> +			goto out;
> +		}
>  
>  		if (!zeropage) {	/* COPY */
>  			page_kaddr = kmap_atomic(page);
> @@ -2432,59 +2432,28 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
>  	if (ret)
>  		goto out_release;
>  
> -	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
> -	if (dst_vma->vm_flags & VM_WRITE)
> -		_dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
> -	else {
> -		/*
> -		 * We don't set the pte dirty if the vma has no
> -		 * VM_WRITE permission, so mark the page dirty or it
> -		 * could be freed from under us. We could do it
> -		 * unconditionally before unlock_page(), but doing it
> -		 * only if VM_WRITE is not set is faster.
> -		 */
> -		set_page_dirty(page);
> -	}
> -
> -	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
> -
> -	ret = -EFAULT;
> -	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
> -	if (unlikely(pgoff >= max_off))
> -		goto out_release_unlock;
> -
> -	ret = -EEXIST;
> -	if (!pte_none(*dst_pte))
> -		goto out_release_unlock;
> -
> -	lru_cache_add(page);
> -
>  	spin_lock_irq(&info->lock);
>  	info->alloced++;
>  	inode->i_blocks += BLOCKS_PER_PAGE;
>  	shmem_recalc_inode(inode);
>  	spin_unlock_irq(&info->lock);
>  
> -	inc_mm_counter(dst_mm, mm_counter_file(page));
> -	page_add_file_rmap(page, false);
> -	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
> +	ret = mcopy_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
> +				       page, true, false);
> +	if (ret)
> +		goto out_release_uncharge;
>  
> -	/* No need to invalidate - it was non-present before */
> -	update_mmu_cache(dst_vma, dst_addr, dst_pte);
> -	pte_unmap_unlock(dst_pte, ptl);
> +	SetPageDirty(page);
>  	unlock_page(page);
>  	ret = 0;
>  out:
>  	return ret;
> -out_release_unlock:
> -	pte_unmap_unlock(dst_pte, ptl);
> -	ClearPageDirty(page);
> +out_release_uncharge:
>  	delete_from_page_cache(page);
> +	shmem_uncharge(inode, 1);
>  out_release:
>  	unlock_page(page);
>  	put_page(page);
> -out_unacct_blocks:

Will all the "goto out_release" miss one call to shmem_inode_unacct_blocks()?

> -	shmem_inode_unacct_blocks(inode, 1);
>  	goto out;
>  }
>  #endif /* CONFIG_USERFAULTFD */
> diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> index 51d8c0127161..3a9ddbb2dbbd 100644
> --- a/mm/userfaultfd.c
> +++ b/mm/userfaultfd.c
> @@ -51,18 +51,13 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
>  /*
>   * Install PTEs, to map dst_addr (within dst_vma) to page.
>   *
> - * This function handles MCOPY_ATOMIC_CONTINUE (which is always file-backed),
> - * whether or not dst_vma is VM_SHARED. It also handles the more general
> - * MCOPY_ATOMIC_NORMAL case, when dst_vma is *not* VM_SHARED (it may be file
> - * backed, or not).
> - *
> - * Note that MCOPY_ATOMIC_NORMAL for a VM_SHARED dst_vma is handled by
> - * shmem_mcopy_atomic_pte instead.
> + * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem
> + * and anon, and for both shared and private VMAs.
>   */
> -static int mcopy_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
> -				    struct vm_area_struct *dst_vma,
> -				    unsigned long dst_addr, struct page *page,
> -				    bool newly_allocated, bool wp_copy)
> +int mcopy_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
> +			     struct vm_area_struct *dst_vma,
> +			     unsigned long dst_addr, struct page *page,
> +			     bool newly_allocated, bool wp_copy)
>  {
>  	int ret;
>  	pte_t _dst_pte, *dst_pte;
> -- 
> 2.31.1.368.gbe11c130af-goog
>
Hugh Dickins April 27, 2021, 2:59 a.m. UTC | #2
On Tue, 20 Apr 2021, Peter Xu wrote:
> On Tue, Apr 20, 2021 at 03:08:03PM -0700, Axel Rasmussen wrote:
> > In a previous commit, we added the mcopy_atomic_install_pte() helper.
> > This helper does the job of setting up PTEs for an existing page, to map
> > it into a given VMA. It deals with both the anon and shmem cases, as
> > well as the shared and private cases.
> > 
> > In other words, shmem_mcopy_atomic_pte() duplicates a case it already
> > handles. So, expose it, and let shmem_mcopy_atomic_pte() use it
> > directly, to reduce code duplication.
> > 
> > This requires that we refactor shmem_mcopy_atomic_pte() a bit:
> > 
> > Instead of doing accounting (shmem_recalc_inode() et al) part-way
> > through the PTE setup, do it beforehand. This frees up
> > mcopy_atomic_install_pte() from having to care about this accounting,
> > but it does mean we need to clean it up if we get a failure afterwards
> > (shmem_uncharge()).
> > 
> > We can *almost* use shmem_charge() to do this, reducing code
> > duplication. But, it does `inode->i_mapping->nrpages++`, which would
> > double-count since shmem_add_to_page_cache() also does this.
> 
> Missing to mention the lru_cache_add() replacement comment as Hugh commented on
> this?

Yes, please add that mention.

And personally, I'd much prefer this and the Doc commit,
the non-selftest commits, to be moved up before the selftests.

The selftest situation is confusing at present, since Peter's series
got dropped as [to-be-updated] from mmotm, so we're waiting for those
updates to be added back to mmotm before this series can go in.

(But akpm will be busy with merge window stuff at present:
now is not the time to be adding or re-adding a series.)

> 
> > 
> > Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>

Peter raised a good point below, so no Ack from me yet.

> > ---
> >  include/linux/userfaultfd_k.h |  5 ++++
> >  mm/shmem.c                    | 53 ++++++++---------------------------
> >  mm/userfaultfd.c              | 17 ++++-------
> >  3 files changed, 22 insertions(+), 53 deletions(-)
> > 
> > diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
> > index 794d1538b8ba..39c094cc6641 100644
> > --- a/include/linux/userfaultfd_k.h
> > +++ b/include/linux/userfaultfd_k.h
> > @@ -53,6 +53,11 @@ enum mcopy_atomic_mode {
> >  	MCOPY_ATOMIC_CONTINUE,
> >  };
> >  
> > +extern int mcopy_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
> > +				    struct vm_area_struct *dst_vma,
> > +				    unsigned long dst_addr, struct page *page,
> > +				    bool newly_allocated, bool wp_copy);
> > +
> >  extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
> >  			    unsigned long src_start, unsigned long len,
> >  			    bool *mmap_changing, __u64 mode);
> > diff --git a/mm/shmem.c b/mm/shmem.c
> > index 30c0bb501dc9..9bfa80fcd414 100644
> > --- a/mm/shmem.c
> > +++ b/mm/shmem.c
> > @@ -2378,10 +2378,8 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
> >  	struct address_space *mapping = inode->i_mapping;
> >  	gfp_t gfp = mapping_gfp_mask(mapping);
> >  	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
> > -	spinlock_t *ptl;
> >  	void *page_kaddr;
> >  	struct page *page;
> > -	pte_t _dst_pte, *dst_pte;
> >  	int ret;
> >  	pgoff_t max_off;
> >  
> > @@ -2391,8 +2389,10 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
> >  
> >  	if (!*pagep) {
> >  		page = shmem_alloc_page(gfp, info, pgoff);
> > -		if (!page)
> > -			goto out_unacct_blocks;
> > +		if (!page) {
> > +			shmem_inode_unacct_blocks(inode, 1);
> > +			goto out;
> > +		}
> >  
> >  		if (!zeropage) {	/* COPY */
> >  			page_kaddr = kmap_atomic(page);
> > @@ -2432,59 +2432,28 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
> >  	if (ret)
> >  		goto out_release;
> >  
> > -	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
> > -	if (dst_vma->vm_flags & VM_WRITE)
> > -		_dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
> > -	else {
> > -		/*
> > -		 * We don't set the pte dirty if the vma has no
> > -		 * VM_WRITE permission, so mark the page dirty or it
> > -		 * could be freed from under us. We could do it
> > -		 * unconditionally before unlock_page(), but doing it
> > -		 * only if VM_WRITE is not set is faster.
> > -		 */
> > -		set_page_dirty(page);
> > -	}
> > -
> > -	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
> > -
> > -	ret = -EFAULT;
> > -	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
> > -	if (unlikely(pgoff >= max_off))
> > -		goto out_release_unlock;
> > -
> > -	ret = -EEXIST;
> > -	if (!pte_none(*dst_pte))
> > -		goto out_release_unlock;
> > -
> > -	lru_cache_add(page);
> > -
> >  	spin_lock_irq(&info->lock);
> >  	info->alloced++;
> >  	inode->i_blocks += BLOCKS_PER_PAGE;
> >  	shmem_recalc_inode(inode);
> >  	spin_unlock_irq(&info->lock);

I think it's best to move that info->locked block down to above the
new SetPageDirty(), where we know we have succeeded, because...

> >  
> > -	inc_mm_counter(dst_mm, mm_counter_file(page));
> > -	page_add_file_rmap(page, false);
> > -	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
> > +	ret = mcopy_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
> > +				       page, true, false);
> > +	if (ret)
> > +		goto out_release_uncharge;
> >  
> > -	/* No need to invalidate - it was non-present before */
> > -	update_mmu_cache(dst_vma, dst_addr, dst_pte);
> > -	pte_unmap_unlock(dst_pte, ptl);
> > +	SetPageDirty(page);
> >  	unlock_page(page);
> >  	ret = 0;
> >  out:
> >  	return ret;
> > -out_release_unlock:
> > -	pte_unmap_unlock(dst_pte, ptl);
> > -	ClearPageDirty(page);
> > +out_release_uncharge:
> >  	delete_from_page_cache(page);
> > +	shmem_uncharge(inode, 1);
> >  out_release:
> >  	unlock_page(page);
> >  	put_page(page);
> > -out_unacct_blocks:
> 
> Will all the "goto out_release" miss one call to shmem_inode_unacct_blocks()?

Good catch, yes.

Which is why I suggest above to move the info->locked block down:
then you can forget about using shmem_uncharge(), and just use
shmem_inode_unacct_blocks() directly in all cases that need it here.

(But I notice the commit message refers to shmem_uncharge(), so that
will need adjusting. shmem_uncharge() is more of a hack than a proper
interface, introduced to manage certain THP split/collapse cases.)

> 
> > -	shmem_inode_unacct_blocks(inode, 1);
> >  	goto out;
> >  }
> >  #endif /* CONFIG_USERFAULTFD */
diff mbox series

Patch

diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 794d1538b8ba..39c094cc6641 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -53,6 +53,11 @@  enum mcopy_atomic_mode {
 	MCOPY_ATOMIC_CONTINUE,
 };
 
+extern int mcopy_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
+				    struct vm_area_struct *dst_vma,
+				    unsigned long dst_addr, struct page *page,
+				    bool newly_allocated, bool wp_copy);
+
 extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
 			    unsigned long src_start, unsigned long len,
 			    bool *mmap_changing, __u64 mode);
diff --git a/mm/shmem.c b/mm/shmem.c
index 30c0bb501dc9..9bfa80fcd414 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2378,10 +2378,8 @@  int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
 	struct address_space *mapping = inode->i_mapping;
 	gfp_t gfp = mapping_gfp_mask(mapping);
 	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
-	spinlock_t *ptl;
 	void *page_kaddr;
 	struct page *page;
-	pte_t _dst_pte, *dst_pte;
 	int ret;
 	pgoff_t max_off;
 
@@ -2391,8 +2389,10 @@  int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
 
 	if (!*pagep) {
 		page = shmem_alloc_page(gfp, info, pgoff);
-		if (!page)
-			goto out_unacct_blocks;
+		if (!page) {
+			shmem_inode_unacct_blocks(inode, 1);
+			goto out;
+		}
 
 		if (!zeropage) {	/* COPY */
 			page_kaddr = kmap_atomic(page);
@@ -2432,59 +2432,28 @@  int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
 	if (ret)
 		goto out_release;
 
-	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
-	if (dst_vma->vm_flags & VM_WRITE)
-		_dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
-	else {
-		/*
-		 * We don't set the pte dirty if the vma has no
-		 * VM_WRITE permission, so mark the page dirty or it
-		 * could be freed from under us. We could do it
-		 * unconditionally before unlock_page(), but doing it
-		 * only if VM_WRITE is not set is faster.
-		 */
-		set_page_dirty(page);
-	}
-
-	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
-
-	ret = -EFAULT;
-	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
-	if (unlikely(pgoff >= max_off))
-		goto out_release_unlock;
-
-	ret = -EEXIST;
-	if (!pte_none(*dst_pte))
-		goto out_release_unlock;
-
-	lru_cache_add(page);
-
 	spin_lock_irq(&info->lock);
 	info->alloced++;
 	inode->i_blocks += BLOCKS_PER_PAGE;
 	shmem_recalc_inode(inode);
 	spin_unlock_irq(&info->lock);
 
-	inc_mm_counter(dst_mm, mm_counter_file(page));
-	page_add_file_rmap(page, false);
-	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+	ret = mcopy_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
+				       page, true, false);
+	if (ret)
+		goto out_release_uncharge;
 
-	/* No need to invalidate - it was non-present before */
-	update_mmu_cache(dst_vma, dst_addr, dst_pte);
-	pte_unmap_unlock(dst_pte, ptl);
+	SetPageDirty(page);
 	unlock_page(page);
 	ret = 0;
 out:
 	return ret;
-out_release_unlock:
-	pte_unmap_unlock(dst_pte, ptl);
-	ClearPageDirty(page);
+out_release_uncharge:
 	delete_from_page_cache(page);
+	shmem_uncharge(inode, 1);
 out_release:
 	unlock_page(page);
 	put_page(page);
-out_unacct_blocks:
-	shmem_inode_unacct_blocks(inode, 1);
 	goto out;
 }
 #endif /* CONFIG_USERFAULTFD */
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 51d8c0127161..3a9ddbb2dbbd 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -51,18 +51,13 @@  struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
 /*
  * Install PTEs, to map dst_addr (within dst_vma) to page.
  *
- * This function handles MCOPY_ATOMIC_CONTINUE (which is always file-backed),
- * whether or not dst_vma is VM_SHARED. It also handles the more general
- * MCOPY_ATOMIC_NORMAL case, when dst_vma is *not* VM_SHARED (it may be file
- * backed, or not).
- *
- * Note that MCOPY_ATOMIC_NORMAL for a VM_SHARED dst_vma is handled by
- * shmem_mcopy_atomic_pte instead.
+ * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem
+ * and anon, and for both shared and private VMAs.
  */
-static int mcopy_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
-				    struct vm_area_struct *dst_vma,
-				    unsigned long dst_addr, struct page *page,
-				    bool newly_allocated, bool wp_copy)
+int mcopy_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
+			     struct vm_area_struct *dst_vma,
+			     unsigned long dst_addr, struct page *page,
+			     bool newly_allocated, bool wp_copy)
 {
 	int ret;
 	pte_t _dst_pte, *dst_pte;