
[RFC,v4,6/8] hugetlb: add vma based lock for pmd sharing synchronization

Message ID 20220706202347.95150-7-mike.kravetz@oracle.com (mailing list archive)
State New
Series hugetlb: Change huge pmd sharing synchronization again

Commit Message

Mike Kravetz July 6, 2022, 8:23 p.m. UTC
Allocate a rw semaphore and hang off vm_private_data for
synchronization use by vmas that could be involved in pmd sharing.  Only
add infrastructure for the new lock here.  Actual use will be added in
subsequent patch.
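
As a rough illustration only (not part of this patch; actual call sites are
added later in the series, and the fault-path details here are simplified), a
caller would bracket a page table walk that can race with pmd unsharing like:

	/* illustrative sketch, not from the posted series */
	hugetlb_vma_lock_read(vma);
	ptep = huge_pte_offset(mm, address, huge_page_size(h));
	if (ptep)
		entry = huge_ptep_get(ptep);
	hugetlb_vma_unlock_read(vma);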

Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
---
 include/linux/hugetlb.h |  36 +++++++++-
 kernel/fork.c           |   6 +-
 mm/hugetlb.c            | 150 ++++++++++++++++++++++++++++++++++++----
 mm/rmap.c               |   8 ++-
 4 files changed, 178 insertions(+), 22 deletions(-)

Comments

Miaohe Lin July 29, 2022, 2:55 a.m. UTC | #1
On 2022/7/7 4:23, Mike Kravetz wrote:
> Allocate a rw semaphore and hang off vm_private_data for
> synchronization use by vmas that could be involved in pmd sharing.  Only
> add infrastructure for the new lock here.  Actual use will be added in
> subsequent patch.
> 
> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
> ---
>  include/linux/hugetlb.h |  36 +++++++++-
>  kernel/fork.c           |   6 +-
>  mm/hugetlb.c            | 150 ++++++++++++++++++++++++++++++++++++----
>  mm/rmap.c               |   8 ++-
>  4 files changed, 178 insertions(+), 22 deletions(-)
> 

<snip>

>  
>  /* Forward declaration */
>  static int hugetlb_acct_memory(struct hstate *h, long delta);
> +static bool vma_pmd_shareable(struct vm_area_struct *vma);
>  
>  static inline bool subpool_is_free(struct hugepage_subpool *spool)
>  {
> @@ -904,6 +905,89 @@ resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map,
>  #endif
>  }
>  
> +static bool __vma_shareable_flags_pmd(struct vm_area_struct *vma)
> +{
> +	return vma->vm_flags & (VM_MAYSHARE | VM_SHARED) &&

Should we make __vma_aligned_range_pmd_shareable check (VM_MAYSHARE | VM_SHARED) like above
instead of VM_MAYSHARE, to make the code more consistent?

> +		vma->vm_private_data;
> +}
> +
> +void hugetlb_vma_lock_read(struct vm_area_struct *vma)
> +{
> +	if (__vma_shareable_flags_pmd(vma))
> +		down_read((struct rw_semaphore *)vma->vm_private_data);
> +}
> +
> +void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
> +{
> +	if (__vma_shareable_flags_pmd(vma))
> +		up_read((struct rw_semaphore *)vma->vm_private_data);
> +}
> +
> +void hugetlb_vma_lock_write(struct vm_area_struct *vma)
> +{
> +	if (__vma_shareable_flags_pmd(vma))
> +		down_write((struct rw_semaphore *)vma->vm_private_data);
> +}
> +
> +void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
> +{
> +	if (__vma_shareable_flags_pmd(vma))
> +		up_write((struct rw_semaphore *)vma->vm_private_data);
> +}
> +
> +int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
> +{
> +	if (!__vma_shareable_flags_pmd(vma))
> +		return 1;
> +
> +	return down_write_trylock((struct rw_semaphore *)vma->vm_private_data);
> +}
> +
> +void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
> +{
> +	if (__vma_shareable_flags_pmd(vma))
> +		lockdep_assert_held((struct rw_semaphore *)
> +				vma->vm_private_data);
> +}
> +
> +static void hugetlb_free_vma_lock(struct vm_area_struct *vma)
> +{
> +	/* Only present in sharable vmas */
> +	if (!vma || !(vma->vm_flags & (VM_MAYSHARE | VM_SHARED)))
> +		return;
> +
> +	if (vma->vm_private_data) {
> +		kfree(vma->vm_private_data);
> +		vma->vm_private_data = NULL;
> +	}
> +}
> +
> +static void hugetlb_alloc_vma_lock(struct vm_area_struct *vma)
> +{
> +	struct rw_semaphore *vma_sema;
> +
> +	/* Only establish in (flags) sharable vmas */
> +	if (!vma || !(vma->vm_flags & (VM_MAYSHARE | VM_SHARED)))
> +		return;
> +
> +	if (!vma_pmd_shareable(vma)) {
> +		vma->vm_private_data = NULL;
> +		return;
> +	}
> +
> +	vma_sema = kmalloc(sizeof(*vma_sema), GFP_KERNEL);
> +	if (!vma_sema) {
> +		/*
> +		 * If we can not allocate semaphore, then vma can not
> +		 * participate in pmd sharing.
> +		 */
> +		vma->vm_private_data = NULL;
> +	} else {
> +		init_rwsem(vma_sema);
> +		vma->vm_private_data = vma_sema;
> +	}

This code is really subtle. If it's called from hugetlb_vm_op_open during fork after
hugetlb_dup_vma_private is done, there should already be a kmalloc-ed vma_sema for this
vma (because hugetlb_alloc_vma_lock is also called by hugetlb_dup_vma_private). So we
can't simply change the value of vm_private_data here, or vma_sema will be leaked? But
when hugetlb_alloc_vma_lock is called from hugetlb_reserve_pages, it should work fine.
Or am I missing something?
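
To make the sequence concrete, the fork-time call chain being described
(reconstructed for illustration, heavily simplified):

	/*
	 *   dup_mmap()
	 *     hugetlb_dup_vma_private(tmp)
	 *       hugetlb_alloc_vma_lock(tmp)   kmallocs vma_sema #1
	 *     ...
	 *     tmp->vm_ops->open(tmp)          i.e. hugetlb_vm_op_open()
	 *       hugetlb_alloc_vma_lock(tmp)   kmallocs vma_sema #2 and
	 *                                     overwrites #1, leaking it
	 */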

Thanks.
Mike Kravetz July 29, 2022, 6 p.m. UTC | #2
On 07/29/22 10:55, Miaohe Lin wrote:
> On 2022/7/7 4:23, Mike Kravetz wrote:
> > Allocate a rw semaphore and hang off vm_private_data for
> > synchronization use by vmas that could be involved in pmd sharing.  Only
> > add infrastructure for the new lock here.  Actual use will be added in
> > subsequent patch.
> > 
> > Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
> > ---
> >  include/linux/hugetlb.h |  36 +++++++++-
> >  kernel/fork.c           |   6 +-
> >  mm/hugetlb.c            | 150 ++++++++++++++++++++++++++++++++++++----
> >  mm/rmap.c               |   8 ++-
> >  4 files changed, 178 insertions(+), 22 deletions(-)
> > 
> 
> <snip>
> 
> >  
> >  /* Forward declaration */
> >  static int hugetlb_acct_memory(struct hstate *h, long delta);
> > +static bool vma_pmd_shareable(struct vm_area_struct *vma);
> >  
> >  static inline bool subpool_is_free(struct hugepage_subpool *spool)
> >  {
> > @@ -904,6 +905,89 @@ resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map,
> >  #endif
> >  }
> >  
> > +static bool __vma_shareable_flags_pmd(struct vm_area_struct *vma)
> > +{
> > +	return vma->vm_flags & (VM_MAYSHARE | VM_SHARED) &&
> 
> Should we make __vma_aligned_range_pmd_shareable check (VM_MAYSHARE | VM_SHARED) like above
> instead of VM_MAYSHARE, to make the code more consistent?
> 

I 'think' we want them to be different.  Note this subtle code and
explanation in __unmap_hugepage_range_final().

	/*
	 * Clear this flag so that x86's huge_pmd_share page_table_shareable
	 * test will fail on a vma being torn down, and not grab a page table
	 * on its way out.  We're lucky that the flag has such an appropriate
	 * name, and can in fact be safely cleared here. We could clear it
	 * before the __unmap_hugepage_range above, but all that's necessary
	 * is to clear it before releasing the i_mmap_rwsem. This works
	 * because in the context this is called, the VMA is about to be
	 * destroyed and the i_mmap_rwsem is held.
	 */
	vma->vm_flags &= ~VM_MAYSHARE;

So, when making a decision to share or not, we need to check only VM_MAYSHARE.
When making decisions about the vma_lock, we need to check both.  In most
cases, just VM_MAYSHARE would be sufficient, but we need to handle the case
where VM_SHARED is set and VM_MAYSHARE is not, mostly in the unmap/cleanup paths.
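
To make the distinction concrete, a minimal sketch (illustrative only;
try_to_share_pmd() and handle_vma_lock() are placeholder names, not functions
from this series):

	/*
	 * Sharing decision: VM_MAYSHARE only.  A vma being torn down by
	 * __unmap_hugepage_range_final() has already had VM_MAYSHARE
	 * cleared, so it will never start sharing here.
	 */
	if (vma->vm_flags & VM_MAYSHARE)
		try_to_share_pmd(vma);

	/*
	 * vma_lock decision: VM_MAYSHARE or VM_SHARED, so the lock can
	 * still be taken and freed for that same vma, which keeps
	 * VM_SHARED until it is destroyed.
	 */
	if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
		handle_vma_lock(vma);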

> > +		vma->vm_private_data;
> > +}
> > +
> > +void hugetlb_vma_lock_read(struct vm_area_struct *vma)
> > +{
> > +	if (__vma_shareable_flags_pmd(vma))
> > +		down_read((struct rw_semaphore *)vma->vm_private_data);
> > +}
> > +
> > +void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
> > +{
> > +	if (__vma_shareable_flags_pmd(vma))
> > +		up_read((struct rw_semaphore *)vma->vm_private_data);
> > +}
> > +
> > +void hugetlb_vma_lock_write(struct vm_area_struct *vma)
> > +{
> > +	if (__vma_shareable_flags_pmd(vma))
> > +		down_write((struct rw_semaphore *)vma->vm_private_data);
> > +}
> > +
> > +void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
> > +{
> > +	if (__vma_shareable_flags_pmd(vma))
> > +		up_write((struct rw_semaphore *)vma->vm_private_data);
> > +}
> > +
> > +int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
> > +{
> > +	if (!__vma_shareable_flags_pmd(vma))
> > +		return 1;
> > +
> > +	return down_write_trylock((struct rw_semaphore *)vma->vm_private_data);
> > +}
> > +
> > +void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
> > +{
> > +	if (__vma_shareable_flags_pmd(vma))
> > +		lockdep_assert_held((struct rw_semaphore *)
> > +				vma->vm_private_data);
> > +}
> > +
> > +static void hugetlb_free_vma_lock(struct vm_area_struct *vma)
> > +{
> > +	/* Only present in sharable vmas */
> > +	if (!vma || !(vma->vm_flags & (VM_MAYSHARE | VM_SHARED)))
> > +		return;
> > +
> > +	if (vma->vm_private_data) {
> > +		kfree(vma->vm_private_data);
> > +		vma->vm_private_data = NULL;
> > +	}
> > +}
> > +
> > +static void hugetlb_alloc_vma_lock(struct vm_area_struct *vma)
> > +{
> > +	struct rw_semaphore *vma_sema;
> > +
> > +	/* Only establish in (flags) sharable vmas */
> > +	if (!vma || !(vma->vm_flags & (VM_MAYSHARE | VM_SHARED)))

Based on my explanation above, I think this should only check VM_MAYSHARE.

> > +		return;
> > +
> > +	if (!vma_pmd_shareable(vma)) {
> > +		vma->vm_private_data = NULL;
> > +		return;
> > +	}
> > +
> > +	vma_sema = kmalloc(sizeof(*vma_sema), GFP_KERNEL);
> > +	if (!vma_sema) {
> > +		/*
> > +		 * If we can not allocate semaphore, then vma can not
> > +		 * participate in pmd sharing.
> > +		 */
> > +		vma->vm_private_data = NULL;
> > +	} else {
> > +		init_rwsem(vma_sema);
> > +		vma->vm_private_data = vma_sema;
> > +	}
> 
> This code is really subtle. If it's called from hugetlb_vm_op_open during fork after
> hugetlb_dup_vma_private is done, there should already be a kmalloc-ed vma_sema for this
> vma (because hugetlb_alloc_vma_lock is also called by hugetlb_dup_vma_private). So we
> can't simply change the value of vm_private_data here, or vma_sema will be leaked?

Yes, I believe your analysis is correct.

>                                                                                    But
> when hugetlb_alloc_vma_lock is called from hugetlb_reserve_pages, it should work fine.
> Or am I missing something?

You are right.  This is an issue in the current code.  I will address it in
the next version.
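
For reference, the general idea (a sketch only, not necessarily the shape of
the actual fix in the next version) is to make the second call a no-op when a
semaphore has already been attached during fork:

	/*
	 * Sketch only: in the fork path, hugetlb_dup_vma_private() has
	 * already allocated the semaphore, so a later call from
	 * hugetlb_vm_op_open() should not allocate (and overwrite) a
	 * second one.
	 */
	if (vma->vm_private_data)
		return;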

Thanks for all your comments on this series!
Miaohe Lin July 30, 2022, 2:12 a.m. UTC | #3
On 2022/7/30 2:00, Mike Kravetz wrote:
> On 07/29/22 10:55, Miaohe Lin wrote:
>> On 2022/7/7 4:23, Mike Kravetz wrote:
>>> Allocate a rw semaphore and hang off vm_private_data for
>>> synchronization use by vmas that could be involved in pmd sharing.  Only
>>> add infrastructure for the new lock here.  Actual use will be added in
>>> subsequent patch.
>>>
>>> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
>>> ---
>>>  include/linux/hugetlb.h |  36 +++++++++-
>>>  kernel/fork.c           |   6 +-
>>>  mm/hugetlb.c            | 150 ++++++++++++++++++++++++++++++++++++----
>>>  mm/rmap.c               |   8 ++-
>>>  4 files changed, 178 insertions(+), 22 deletions(-)
>>>
>>
>> <snip>
>>
>>>  
>>>  /* Forward declaration */
>>>  static int hugetlb_acct_memory(struct hstate *h, long delta);
>>> +static bool vma_pmd_shareable(struct vm_area_struct *vma);
>>>  
>>>  static inline bool subpool_is_free(struct hugepage_subpool *spool)
>>>  {
>>> @@ -904,6 +905,89 @@ resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map,
>>>  #endif
>>>  }
>>>  
>>> +static bool __vma_shareable_flags_pmd(struct vm_area_struct *vma)
>>> +{
>>> +	return vma->vm_flags & (VM_MAYSHARE | VM_SHARED) &&
>>
>> Should we make __vma_aligned_range_pmd_shareable check (VM_MAYSHARE | VM_SHARED) like above
>> instead of VM_MAYSHARE, to make the code more consistent?
>>
> 
> I 'think' we want them to be different.  Note this subtle code and
> explanation in __unmap_hugepage_range_final().
> 
> 	/*
> 	 * Clear this flag so that x86's huge_pmd_share page_table_shareable
> 	 * test will fail on a vma being torn down, and not grab a page table
> 	 * on its way out.  We're lucky that the flag has such an appropriate
> 	 * name, and can in fact be safely cleared here. We could clear it
> 	 * before the __unmap_hugepage_range above, but all that's necessary
> 	 * is to clear it before releasing the i_mmap_rwsem. This works
> 	 * because in the context this is called, the VMA is about to be
> 	 * destroyed and the i_mmap_rwsem is held.
> 	 */
> 	vma->vm_flags &= ~VM_MAYSHARE;
> 
> So, when making a decision to share or not, we need to check only VM_MAYSHARE.
> When making decisions about the vma_lock, we need to check both.  In most
> cases, just VM_MAYSHARE would be sufficient, but we need to handle the case
> where VM_SHARED is set and VM_MAYSHARE is not, mostly in the unmap/cleanup paths.

Many thanks for your explanation. :)

> 
>>> +		vma->vm_private_data;
>>> +}
>>> +
>>> +void hugetlb_vma_lock_read(struct vm_area_struct *vma)
>>> +{
>>> +	if (__vma_shareable_flags_pmd(vma))
>>> +		down_read((struct rw_semaphore *)vma->vm_private_data);
>>> +}
>>> +
>>> +void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
>>> +{
>>> +	if (__vma_shareable_flags_pmd(vma))
>>> +		up_read((struct rw_semaphore *)vma->vm_private_data);
>>> +}
>>> +
>>> +void hugetlb_vma_lock_write(struct vm_area_struct *vma)
>>> +{
>>> +	if (__vma_shareable_flags_pmd(vma))
>>> +		down_write((struct rw_semaphore *)vma->vm_private_data);
>>> +}
>>> +
>>> +void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
>>> +{
>>> +	if (__vma_shareable_flags_pmd(vma))
>>> +		up_write((struct rw_semaphore *)vma->vm_private_data);
>>> +}
>>> +
>>> +int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
>>> +{
>>> +	if (!__vma_shareable_flags_pmd(vma))
>>> +		return 1;
>>> +
>>> +	return down_write_trylock((struct rw_semaphore *)vma->vm_private_data);
>>> +}
>>> +
>>> +void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
>>> +{
>>> +	if (__vma_shareable_flags_pmd(vma))
>>> +		lockdep_assert_held((struct rw_semaphore *)
>>> +				vma->vm_private_data);
>>> +}
>>> +
>>> +static void hugetlb_free_vma_lock(struct vm_area_struct *vma)
>>> +{
>>> +	/* Only present in sharable vmas */
>>> +	if (!vma || !(vma->vm_flags & (VM_MAYSHARE | VM_SHARED)))

Does the above check need a comment? VM_SHARED is checked here but not in hugetlb_alloc_vma_lock below.

Thanks.
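
For example, based on the explanation above, wording along these lines
(a suggestion only) would capture it:

	/*
	 * The vma_lock must be handled for both VM_MAYSHARE and VM_SHARED
	 * vmas: __unmap_hugepage_range_final() clears VM_MAYSHARE on a vma
	 * being torn down, but a lock allocated while VM_MAYSHARE was set
	 * must still be found and freed here.
	 */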

>>> +		return;
>>> +
>>> +	if (vma->vm_private_data) {
>>> +		kfree(vma->vm_private_data);
>>> +		vma->vm_private_data = NULL;
>>> +	}
>>> +}
>>> +
>>> +static void hugetlb_alloc_vma_lock(struct vm_area_struct *vma)
>>> +{
>>> +	struct rw_semaphore *vma_sema;
>>> +
>>> +	/* Only establish in (flags) sharable vmas */
>>> +	if (!vma || !(vma->vm_flags & (VM_MAYSHARE | VM_SHARED)))
> 
> Based on my explanation above, I think this should only check VM_MAYSHARE.
> 
>>> +		return;
>>> +
>>> +	if (!vma_pmd_shareable(vma)) {
>>> +		vma->vm_private_data = NULL;
>>> +		return;
>>> +	}
>>> +
>>> +	vma_sema = kmalloc(sizeof(*vma_sema), GFP_KERNEL);
>>> +	if (!vma_sema) {
>>> +		/*
>>> +		 * If we can not allocate semaphore, then vma can not
>>> +		 * participate in pmd sharing.
>>> +		 */
>>> +		vma->vm_private_data = NULL;
>>> +	} else {
>>> +		init_rwsem(vma_sema);
>>> +		vma->vm_private_data = vma_sema;
>>> +	}
>>
>> This code is really subtle. If it's called from hugetlb_vm_op_open during fork after
>> hugetlb_dup_vma_private is done, there should already be a kmalloc-ed vma_sema for this
>> vma (because hugetlb_alloc_vma_lock is also called by hugetlb_dup_vma_private). So we
>> can't simply change the value of vm_private_data here, or vma_sema will be leaked?
> 
> Yes, I believe your analysis is correct.
> 
>>                                                                                    But
>> when hugetlb_alloc_vma_lock is called from hugetlb_reserve_pages, it should work fine.
>> Or am I missing something?
> 
> You are right.  This is an issue in the current code.  I will address it in
> the next version.
> 
> Thanks for all your comments on this series!
>

Patch

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 05c3a293dab2..248331c0f140 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -126,7 +126,7 @@  struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
 						long min_hpages);
 void hugepage_put_subpool(struct hugepage_subpool *spool);
 
-void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
+void hugetlb_dup_vma_private(struct vm_area_struct *vma);
 void clear_vma_resv_huge_pages(struct vm_area_struct *vma);
 int hugetlb_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *);
 int hugetlb_overcommit_handler(struct ctl_table *, int, void *, size_t *,
@@ -214,6 +214,13 @@  struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
 struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address,
 			     pgd_t *pgd, int flags);
 
+void hugetlb_vma_lock_read(struct vm_area_struct *vma);
+void hugetlb_vma_unlock_read(struct vm_area_struct *vma);
+void hugetlb_vma_lock_write(struct vm_area_struct *vma);
+void hugetlb_vma_unlock_write(struct vm_area_struct *vma);
+int hugetlb_vma_trylock_write(struct vm_area_struct *vma);
+void hugetlb_vma_assert_locked(struct vm_area_struct *vma);
+
 int pmd_huge(pmd_t pmd);
 int pud_huge(pud_t pud);
 unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
@@ -225,7 +232,7 @@  void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
 
 #else /* !CONFIG_HUGETLB_PAGE */
 
-static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
+static inline void hugetlb_dup_vma_private(struct vm_area_struct *vma)
 {
 }
 
@@ -336,6 +343,31 @@  static inline int prepare_hugepage_range(struct file *file,
 	return -EINVAL;
 }
 
+static inline void hugetlb_vma_lock_read(struct vm_area_struct *vma)
+{
+}
+
+static inline void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
+{
+}
+
+static inline void hugetlb_vma_lock_write(struct vm_area_struct *vma)
+{
+}
+
+static inline void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
+{
+}
+
+static inline int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
+{
+	return 1;
+}
+
+static inline void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
+{
+}
+
 static inline int pmd_huge(pmd_t pmd)
 {
 	return 0;
diff --git a/kernel/fork.c b/kernel/fork.c
index 23f0ba3affe5..ec6e7ddaae12 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -674,12 +674,10 @@  static __latent_entropy int dup_mmap(struct mm_struct *mm,
 		}
 
 		/*
-		 * Clear hugetlb-related page reserves for children. This only
-		 * affects MAP_PRIVATE mappings. Faults generated by the child
-		 * are not guaranteed to succeed, even if read-only
+		 * Copy/update hugetlb private vma information.
 		 */
 		if (is_vm_hugetlb_page(tmp))
-			reset_vma_resv_huge_pages(tmp);
+			hugetlb_dup_vma_private(tmp);
 
 		/* Link the vma into the MT */
 		mas.index = tmp->vm_start;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3d5f3c103927..2eca89bb08ab 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -90,6 +90,7 @@  struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
 
 /* Forward declaration */
 static int hugetlb_acct_memory(struct hstate *h, long delta);
+static bool vma_pmd_shareable(struct vm_area_struct *vma);
 
 static inline bool subpool_is_free(struct hugepage_subpool *spool)
 {
@@ -904,6 +905,89 @@  resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map,
 #endif
 }
 
+static bool __vma_shareable_flags_pmd(struct vm_area_struct *vma)
+{
+	return vma->vm_flags & (VM_MAYSHARE | VM_SHARED) &&
+		vma->vm_private_data;
+}
+
+void hugetlb_vma_lock_read(struct vm_area_struct *vma)
+{
+	if (__vma_shareable_flags_pmd(vma))
+		down_read((struct rw_semaphore *)vma->vm_private_data);
+}
+
+void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
+{
+	if (__vma_shareable_flags_pmd(vma))
+		up_read((struct rw_semaphore *)vma->vm_private_data);
+}
+
+void hugetlb_vma_lock_write(struct vm_area_struct *vma)
+{
+	if (__vma_shareable_flags_pmd(vma))
+		down_write((struct rw_semaphore *)vma->vm_private_data);
+}
+
+void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
+{
+	if (__vma_shareable_flags_pmd(vma))
+		up_write((struct rw_semaphore *)vma->vm_private_data);
+}
+
+int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
+{
+	if (!__vma_shareable_flags_pmd(vma))
+		return 1;
+
+	return down_write_trylock((struct rw_semaphore *)vma->vm_private_data);
+}
+
+void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
+{
+	if (__vma_shareable_flags_pmd(vma))
+		lockdep_assert_held((struct rw_semaphore *)
+				vma->vm_private_data);
+}
+
+static void hugetlb_free_vma_lock(struct vm_area_struct *vma)
+{
+	/* Only present in sharable vmas */
+	if (!vma || !(vma->vm_flags & (VM_MAYSHARE | VM_SHARED)))
+		return;
+
+	if (vma->vm_private_data) {
+		kfree(vma->vm_private_data);
+		vma->vm_private_data = NULL;
+	}
+}
+
+static void hugetlb_alloc_vma_lock(struct vm_area_struct *vma)
+{
+	struct rw_semaphore *vma_sema;
+
+	/* Only establish in (flags) sharable vmas */
+	if (!vma || !(vma->vm_flags & (VM_MAYSHARE | VM_SHARED)))
+		return;
+
+	if (!vma_pmd_shareable(vma)) {
+		vma->vm_private_data = NULL;
+		return;
+	}
+
+	vma_sema = kmalloc(sizeof(*vma_sema), GFP_KERNEL);
+	if (!vma_sema) {
+		/*
+		 * If we can not allocate semaphore, then vma can not
+		 * participate in pmd sharing.
+		 */
+		vma->vm_private_data = NULL;
+	} else {
+		init_rwsem(vma_sema);
+		vma->vm_private_data = vma_sema;
+	}
+}
+
 struct resv_map *resv_map_alloc(void)
 {
 	struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
@@ -1007,12 +1091,22 @@  static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
 	return (get_vma_private_data(vma) & flag) != 0;
 }
 
-/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
-void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
+void hugetlb_dup_vma_private(struct vm_area_struct *vma)
 {
+	/*
+	 * Clear hugetlb-related page reserves for children. This only
+	 * affects MAP_PRIVATE mappings. Faults generated by the child
+	 * are not guaranteed to succeed, even if read-only
+	 */
 	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
 	if (!(vma->vm_flags & VM_MAYSHARE))
 		vma->vm_private_data = (void *)0;
+
+	/*
+	 * Allocate semaphore if pmd sharing is possible.  Private mappings
+	 * are ignored.
+	 */
+	hugetlb_alloc_vma_lock(vma);
 }
 
 /*
@@ -1043,7 +1137,7 @@  void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
 		kref_put(&reservations->refs, resv_map_release);
 	}
 
-	reset_vma_resv_huge_pages(vma);
+	hugetlb_dup_vma_private(vma);
 }
 
 /* Returns true if the VMA has associated reserve pages */
@@ -4591,16 +4685,21 @@  static void hugetlb_vm_op_open(struct vm_area_struct *vma)
 		resv_map_dup_hugetlb_cgroup_uncharge_info(resv);
 		kref_get(&resv->refs);
 	}
+
+	hugetlb_alloc_vma_lock(vma);
 }
 
 static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 {
 	struct hstate *h = hstate_vma(vma);
-	struct resv_map *resv = vma_resv_map(vma);
+	struct resv_map *resv;
 	struct hugepage_subpool *spool = subpool_vma(vma);
 	unsigned long reserve, start, end;
 	long gbl_reserve;
 
+	hugetlb_free_vma_lock(vma);
+
+	resv = vma_resv_map(vma);
 	if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
 		return;
 
@@ -6438,6 +6537,11 @@  bool hugetlb_reserve_pages(struct inode *inode,
 		return false;
 	}
 
+	/*
+	 * vma specific semaphore used for pmd sharing synchronization
+	 */
+	hugetlb_alloc_vma_lock(vma);
+
 	/*
 	 * Only apply hugepage reservation if asked. At fault time, an
 	 * attempt will be made for VM_NORESERVE to allocate a page
@@ -6461,12 +6565,11 @@  bool hugetlb_reserve_pages(struct inode *inode,
 		resv_map = inode_resv_map(inode);
 
 		chg = region_chg(resv_map, from, to, &regions_needed);
-
 	} else {
 		/* Private mapping. */
 		resv_map = resv_map_alloc();
 		if (!resv_map)
-			return false;
+			goto out_err;
 
 		chg = to - from;
 
@@ -6561,6 +6664,7 @@  bool hugetlb_reserve_pages(struct inode *inode,
 	hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
 					    chg * pages_per_huge_page(h), h_cg);
 out_err:
+	hugetlb_free_vma_lock(vma);
 	if (!vma || vma->vm_flags & VM_MAYSHARE)
 		/* Only call region_abort if the region_chg succeeded but the
 		 * region_add failed or didn't run.
@@ -6640,14 +6744,34 @@  static unsigned long page_table_shareable(struct vm_area_struct *svma,
 }
 
 static bool __vma_aligned_range_pmd_shareable(struct vm_area_struct *vma,
-				unsigned long start, unsigned long end)
+				unsigned long start, unsigned long end,
+				bool check_vma_lock)
 {
+#ifdef CONFIG_USERFAULTFD
+	if (uffd_disable_huge_pmd_share(vma))
+		return false;
+#endif
 	/*
 	 * check on proper vm_flags and page table alignment
 	 */
-	if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, start, end))
-		return true;
-	return false;
+	if (!(vma->vm_flags & VM_MAYSHARE))
+		return false;
+	if (check_vma_lock && !vma->vm_private_data)
+		return false;
+	if (!range_in_vma(vma, start, end))
+		return false;
+	return true;
+}
+
+static bool vma_pmd_shareable(struct vm_area_struct *vma)
+{
+	unsigned long start = ALIGN(vma->vm_start, PUD_SIZE),
+		      end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
+
+	if (start >= end)
+		return false;
+
+	return __vma_aligned_range_pmd_shareable(vma, start, end, false);
 }
 
 static bool vma_addr_pmd_shareable(struct vm_area_struct *vma,
@@ -6656,15 +6780,11 @@  static bool vma_addr_pmd_shareable(struct vm_area_struct *vma,
 	unsigned long start = addr & PUD_MASK;
 	unsigned long end = start + PUD_SIZE;
 
-	return __vma_aligned_range_pmd_shareable(vma, start, end);
+	return __vma_aligned_range_pmd_shareable(vma, start, end, true);
 }
 
 bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
 {
-#ifdef CONFIG_USERFAULTFD
-	if (uffd_disable_huge_pmd_share(vma))
-		return false;
-#endif
 	return vma_addr_pmd_shareable(vma, addr);
 }
 
diff --git a/mm/rmap.c b/mm/rmap.c
index 6593299d3b18..64076c2a49c1 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -24,7 +24,7 @@ 
  *   mm->mmap_lock
  *     mapping->invalidate_lock (in filemap_fault)
  *       page->flags PG_locked (lock_page)
- *         hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
+ *         hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below)
  *           mapping->i_mmap_rwsem
  *             anon_vma->rwsem
  *               mm->page_table_lock or pte_lock
@@ -44,6 +44,12 @@ 
  * anon_vma->rwsem,mapping->i_mmap_rwsem   (memory_failure, collect_procs_anon)
  *   ->tasklist_lock
  *     pte map lock
+ *
+ * hugetlbfs PageHuge() take locks in this order:
+ *   hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
+ *     vma_lock (hugetlb specific lock for pmd_sharing)
+ *       mapping->i_mmap_rwsem (also used for hugetlb pmd sharing)
+ *         page->flags PG_locked (lock_page)
  */
 
 #include <linux/mm.h>