
[RFC,02/14] s390/mm: Improve locking for huge page backings

Message ID: 20180919084802.183381-3-frankja@linux.ibm.com (mailing list archive)
State: New, archived
Series: KVM: s390: Huge page splitting and shadowing

Commit Message

Janosch Frank Sept. 19, 2018, 8:47 a.m. UTC
The gmap guest_table_lock is used to protect changes to the guest's
DAT tables from region 1 down to the segment level. Therefore it also
protects the host-to-guest radix tree, where each new segment mapping
created by gmap_link() is tracked. Changes to ptes are synchronized
through the pte lock, which is easily retrievable because the gmap
shares the page tables with userspace.

With huge pages the story changes. PMD tables are not shared, and we
are left with the pmd lock on the userspace side and the
guest_table_lock on the gmap side. Having two locks for one object is
a recipe for locking problems.

Therefore the guest_table_lock will only be used for population of the
gmap tables, and hence for protecting the host_to_guest tree, while the
pmd lock will be used for all changes to the pmd from both userspace
and the gmap.

This means we need to retrieve the vmaddr in order to retrieve a gmap
pmd, which takes a bit longer than before. But we can now operate on
multiple pmds in disjoint segment tables concurrently instead of
serializing on a global lock.

Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
---
 arch/s390/include/asm/pgtable.h |  1 +
 arch/s390/mm/gmap.c             | 70 +++++++++++++++++++++++++----------------
 arch/s390/mm/pgtable.c          |  2 +-
 3 files changed, 45 insertions(+), 28 deletions(-)
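
For illustration, a minimal sketch of the caller pattern this change
introduces (modeled on gmap_protect_range() below; the -EAGAIN retry and
fault handling are omitted):

	spinlock_t *ptl = NULL;
	pmd_t *pmdp;

	/*
	 * gmap_pmd_op_walk() now takes the userspace pmd lock instead of
	 * the guest_table_lock, but only if huge pages are allowed and
	 * the backing pmd is large; otherwise *ptl stays NULL.
	 */
	pmdp = gmap_pmd_op_walk(gmap, gaddr, &ptl);
	if (pmdp) {
		if (!pmd_large(*pmdp)) {
			/* 4k mappings: ptes stay protected by the pte lock */
		} else {
			/* huge mapping: modify the pmd under the pmd lock */
		}
		/* Drops the pmd lock if it was taken by the walk */
		gmap_pmd_op_end(ptl);
	}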

Comments

David Hildenbrand Sept. 20, 2018, 11:14 a.m. UTC | #1
On 19/09/2018 10:47, Janosch Frank wrote:
> The gmap guest_table_lock is used to protect changes to the guest's
> DAT tables from region 1 down to the segment level. Therefore it also
> protects the host-to-guest radix tree, where each new segment mapping
> created by gmap_link() is tracked. Changes to ptes are synchronized
> through the pte lock, which is easily retrievable because the gmap
> shares the page tables with userspace.
> 
> With huge pages the story changes. PMD tables are not shared, and we
> are left with the pmd lock on the userspace side and the
> guest_table_lock on the gmap side. Having two locks for one object is
> a recipe for locking problems.
> 
> Therefore the guest_table_lock will only be used for population of the
> gmap tables, and hence for protecting the host_to_guest tree, while the
> pmd lock will be used for all changes to the pmd from both userspace
> and the gmap.
> 
> This means we need to retrieve the vmaddr in order to retrieve a gmap
> pmd, which takes a bit longer than before. But we can now operate on
> multiple pmds in disjoint segment tables concurrently instead of
> serializing on a global lock.
> 

Has the time come to document how locking works? (especially also for
gmap shadows)

> Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
> ---
>  arch/s390/include/asm/pgtable.h |  1 +
>  arch/s390/mm/gmap.c             | 70 +++++++++++++++++++++++++----------------
>  arch/s390/mm/pgtable.c          |  2 +-
>  3 files changed, 45 insertions(+), 28 deletions(-)
> 
> diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
> index 0e7cb0dc9c33..c0abd57c5a21 100644
> --- a/arch/s390/include/asm/pgtable.h
> +++ b/arch/s390/include/asm/pgtable.h
> @@ -1420,6 +1420,7 @@ static inline void __pudp_idte(unsigned long addr, pud_t *pudp,
>  	}
>  }
>  
> +pmd_t *pmd_alloc_map(struct mm_struct *mm, unsigned long addr);
>  pmd_t pmdp_xchg_direct(struct mm_struct *, unsigned long, pmd_t *, pmd_t);
>  pmd_t pmdp_xchg_lazy(struct mm_struct *, unsigned long, pmd_t *, pmd_t);
>  pud_t pudp_xchg_direct(struct mm_struct *, unsigned long, pud_t *, pud_t);
> diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
> index 9ccd62cc7f37..04c24a284113 100644
> --- a/arch/s390/mm/gmap.c
> +++ b/arch/s390/mm/gmap.c
> @@ -895,47 +895,62 @@ static void gmap_pte_op_end(spinlock_t *ptl)
>  }
>  
>  /**
> - * gmap_pmd_op_walk - walk the gmap tables, get the guest table lock
> - *		      and return the pmd pointer
> + * gmap_pmd_op_walk - walk the gmap tables, get the pmd_lock if needed
> + *		      and return the pmd pointer or NULL
>   * @gmap: pointer to guest mapping meta data structure
>   * @gaddr: virtual address in the guest address space
>   *
>   * Returns a pointer to the pmd for a guest address, or NULL
>   */
> -static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr)
> +static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr,
> +				      spinlock_t **ptl)
>  {
> -	pmd_t *pmdp;
> +	pmd_t *pmdp, *hpmdp;
> +	unsigned long vmaddr;
> +
>  
>  	BUG_ON(gmap_is_shadow(gmap));
> -	pmdp = (pmd_t *) gmap_table_walk(gmap, gaddr, 1);
> -	if (!pmdp)
> -		return NULL;
>  
> -	/* without huge pages, there is no need to take the table lock */
> -	if (!gmap->mm->context.allow_gmap_hpage_1m)
> -		return pmd_none(*pmdp) ? NULL : pmdp;
> -
> -	spin_lock(&gmap->guest_table_lock);
> -	if (pmd_none(*pmdp)) {
> -		spin_unlock(&gmap->guest_table_lock);
> -		return NULL;
> +	*ptl = NULL;
> +	if (gmap->mm->context.allow_gmap_hpage_1m) {
> +		vmaddr = __gmap_translate(gmap, gaddr);
> +		if (IS_ERR_VALUE(vmaddr))
> +			return NULL;
> +		hpmdp = pmd_alloc_map(gmap->mm, vmaddr);
> +		if (!hpmdp)
> +			return NULL;
> +		*ptl = pmd_lock(gmap->mm, hpmdp);
> +		if (pmd_none(*hpmdp)) {
> +			spin_unlock(*ptl);
> +			*ptl = NULL;
> +			return NULL;
> +		}
> +		if (!pmd_large(*hpmdp)) {
> +			spin_unlock(*ptl);
> +			*ptl = NULL;
> +		}
> +	}
> +
> +	pmdp = (pmd_t *) gmap_table_walk(gmap, gaddr, 1);
> +	if (!pmdp || pmd_none(*pmdp)) {
> +		if (*ptl)
> +			spin_unlock(*ptl);
> +		pmdp = NULL;
> +		*ptl = NULL;
>  	}
>  
> -	/* 4k page table entries are locked via the pte (pte_alloc_map_lock). */
> -	if (!pmd_large(*pmdp))
> -		spin_unlock(&gmap->guest_table_lock);
>  	return pmdp;
>  }
>  
>  /**
> - * gmap_pmd_op_end - release the guest_table_lock if needed
> + * gmap_pmd_op_end - release the pmd lock if needed
>   * @gmap: pointer to the guest mapping meta data structure
>   * @pmdp: pointer to the pmd
>   */
> -static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp)
> +static inline void gmap_pmd_op_end(spinlock_t *ptl)
>  {
> -	if (pmd_large(*pmdp))
> -		spin_unlock(&gmap->guest_table_lock);
> +	if (ptl)
> +		spin_unlock(ptl);
>  }
>  
>  /*
> @@ -1037,13 +1052,14 @@ static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr,
>  			      unsigned long len, int prot, unsigned long bits)
>  {
>  	unsigned long vmaddr, dist;
> +	spinlock_t *ptl = NULL;
>  	pmd_t *pmdp;
>  	int rc;
>  
>  	BUG_ON(gmap_is_shadow(gmap));
>  	while (len) {
>  		rc = -EAGAIN;
> -		pmdp = gmap_pmd_op_walk(gmap, gaddr);
> +		pmdp = gmap_pmd_op_walk(gmap, gaddr, &ptl);
>  		if (pmdp) {
>  			if (!pmd_large(*pmdp)) {
>  				rc = gmap_protect_pte(gmap, gaddr, pmdp, prot,
> @@ -1061,7 +1077,7 @@ static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr,
>  					gaddr = (gaddr & HPAGE_MASK) + HPAGE_SIZE;
>  				}
>  			}
> -			gmap_pmd_op_end(gmap, pmdp);
> +			gmap_pmd_op_end(ptl);
>  		}
>  		if (rc) {
>  			if (rc == -EINVAL)
> @@ -2457,9 +2473,9 @@ void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4],
>  	int i;
>  	pmd_t *pmdp;
>  	pte_t *ptep;
> -	spinlock_t *ptl;
> +	spinlock_t *ptl = NULL;
>  
> -	pmdp = gmap_pmd_op_walk(gmap, gaddr);
> +	pmdp = gmap_pmd_op_walk(gmap, gaddr, &ptl);
>  	if (!pmdp)
>  		return;
>  
> @@ -2476,7 +2492,7 @@ void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4],
>  			spin_unlock(ptl);
>  		}
>  	}
> -	gmap_pmd_op_end(gmap, pmdp);
> +	gmap_pmd_op_end(ptl);
>  }
>  EXPORT_SYMBOL_GPL(gmap_sync_dirty_log_pmd);
>  
> diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
> index 16d35b881a11..4b184744350b 100644
> --- a/arch/s390/mm/pgtable.c
> +++ b/arch/s390/mm/pgtable.c
> @@ -410,7 +410,7 @@ static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm,
>  	return old;
>  }
>  
> -static pmd_t *pmd_alloc_map(struct mm_struct *mm, unsigned long addr)
> +pmd_t *pmd_alloc_map(struct mm_struct *mm, unsigned long addr)
>  {
>  	pgd_t *pgd;
>  	p4d_t *p4d;
>
Janosch Frank Sept. 20, 2018, 1:16 p.m. UTC | #2
On 9/20/18 1:14 PM, David Hildenbrand wrote:
> On 19/09/2018 10:47, Janosch Frank wrote:
>> The gmap guest_table_lock is used to protect changes to the guest's
>> DAT tables from region 1 down to the segment level. Therefore it also
>> protects the host-to-guest radix tree, where each new segment mapping
>> created by gmap_link() is tracked. Changes to ptes are synchronized
>> through the pte lock, which is easily retrievable because the gmap
>> shares the page tables with userspace.
>>
>> With huge pages the story changes. PMD tables are not shared, and we
>> are left with the pmd lock on the userspace side and the
>> guest_table_lock on the gmap side. Having two locks for one object is
>> a recipe for locking problems.
>>
>> Therefore the guest_table_lock will only be used for population of the
>> gmap tables, and hence for protecting the host_to_guest tree, while the
>> pmd lock will be used for all changes to the pmd from both userspace
>> and the gmap.
>>
>> This means we need to retrieve the vmaddr in order to retrieve a gmap
>> pmd, which takes a bit longer than before. But we can now operate on
>> multiple pmds in disjoint segment tables concurrently instead of
>> serializing on a global lock.
>>
> 
> Has the time come to document how locking works? (especially also for
> gmap shadows)

I'll add more locking documentation to the function comments for v1.

Also, I'll write something longer up after KVM Forum and once this
patch set has been included. Maybe I'll also add some basic GMAP
documentation; after all, I have ~30 pages of KVM mm documentation
flying around.
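
For reference, a rough sketch of what such a locking note in the
function comments could look like (suggested wording only, summarizing
this patch; not part of the series):

	/*
	 * Locking for non-shadow gmaps:
	 *  - gmap->guest_table_lock protects population of the gmap DAT
	 *    tables (region 1 down to segment level) and the
	 *    host_to_guest radix tree.
	 *  - The userspace pmd lock, taken via gmap_pmd_op_walk(),
	 *    protects all changes to a large pmd, from both the
	 *    userspace and the gmap side.
	 *  - 4k page table entries remain protected by the pte lock
	 *    (pte_alloc_map_lock), which is shared with userspace.
	 */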

Patch

diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 0e7cb0dc9c33..c0abd57c5a21 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1420,6 +1420,7 @@  static inline void __pudp_idte(unsigned long addr, pud_t *pudp,
 	}
 }
 
+pmd_t *pmd_alloc_map(struct mm_struct *mm, unsigned long addr);
 pmd_t pmdp_xchg_direct(struct mm_struct *, unsigned long, pmd_t *, pmd_t);
 pmd_t pmdp_xchg_lazy(struct mm_struct *, unsigned long, pmd_t *, pmd_t);
 pud_t pudp_xchg_direct(struct mm_struct *, unsigned long, pud_t *, pud_t);
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 9ccd62cc7f37..04c24a284113 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -895,47 +895,62 @@  static void gmap_pte_op_end(spinlock_t *ptl)
 }
 
 /**
- * gmap_pmd_op_walk - walk the gmap tables, get the guest table lock
- *		      and return the pmd pointer
+ * gmap_pmd_op_walk - walk the gmap tables, get the pmd_lock if needed
+ *		      and return the pmd pointer or NULL
  * @gmap: pointer to guest mapping meta data structure
  * @gaddr: virtual address in the guest address space
  *
  * Returns a pointer to the pmd for a guest address, or NULL
  */
-static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr)
+static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr,
+				      spinlock_t **ptl)
 {
-	pmd_t *pmdp;
+	pmd_t *pmdp, *hpmdp;
+	unsigned long vmaddr;
+
 
 	BUG_ON(gmap_is_shadow(gmap));
-	pmdp = (pmd_t *) gmap_table_walk(gmap, gaddr, 1);
-	if (!pmdp)
-		return NULL;
 
-	/* without huge pages, there is no need to take the table lock */
-	if (!gmap->mm->context.allow_gmap_hpage_1m)
-		return pmd_none(*pmdp) ? NULL : pmdp;
-
-	spin_lock(&gmap->guest_table_lock);
-	if (pmd_none(*pmdp)) {
-		spin_unlock(&gmap->guest_table_lock);
-		return NULL;
+	*ptl = NULL;
+	if (gmap->mm->context.allow_gmap_hpage_1m) {
+		vmaddr = __gmap_translate(gmap, gaddr);
+		if (IS_ERR_VALUE(vmaddr))
+			return NULL;
+		hpmdp = pmd_alloc_map(gmap->mm, vmaddr);
+		if (!hpmdp)
+			return NULL;
+		*ptl = pmd_lock(gmap->mm, hpmdp);
+		if (pmd_none(*hpmdp)) {
+			spin_unlock(*ptl);
+			*ptl = NULL;
+			return NULL;
+		}
+		if (!pmd_large(*hpmdp)) {
+			spin_unlock(*ptl);
+			*ptl = NULL;
+		}
+	}
+
+	pmdp = (pmd_t *) gmap_table_walk(gmap, gaddr, 1);
+	if (!pmdp || pmd_none(*pmdp)) {
+		if (*ptl)
+			spin_unlock(*ptl);
+		pmdp = NULL;
+		*ptl = NULL;
 	}
 
-	/* 4k page table entries are locked via the pte (pte_alloc_map_lock). */
-	if (!pmd_large(*pmdp))
-		spin_unlock(&gmap->guest_table_lock);
 	return pmdp;
 }
 
 /**
- * gmap_pmd_op_end - release the guest_table_lock if needed
+ * gmap_pmd_op_end - release the pmd lock if needed
  * @gmap: pointer to the guest mapping meta data structure
  * @pmdp: pointer to the pmd
  */
-static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp)
+static inline void gmap_pmd_op_end(spinlock_t *ptl)
 {
-	if (pmd_large(*pmdp))
-		spin_unlock(&gmap->guest_table_lock);
+	if (ptl)
+		spin_unlock(ptl);
 }
 
 /*
@@ -1037,13 +1052,14 @@  static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr,
 			      unsigned long len, int prot, unsigned long bits)
 {
 	unsigned long vmaddr, dist;
+	spinlock_t *ptl = NULL;
 	pmd_t *pmdp;
 	int rc;
 
 	BUG_ON(gmap_is_shadow(gmap));
 	while (len) {
 		rc = -EAGAIN;
-		pmdp = gmap_pmd_op_walk(gmap, gaddr);
+		pmdp = gmap_pmd_op_walk(gmap, gaddr, &ptl);
 		if (pmdp) {
 			if (!pmd_large(*pmdp)) {
 				rc = gmap_protect_pte(gmap, gaddr, pmdp, prot,
@@ -1061,7 +1077,7 @@  static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr,
 					gaddr = (gaddr & HPAGE_MASK) + HPAGE_SIZE;
 				}
 			}
-			gmap_pmd_op_end(gmap, pmdp);
+			gmap_pmd_op_end(ptl);
 		}
 		if (rc) {
 			if (rc == -EINVAL)
@@ -2457,9 +2473,9 @@  void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4],
 	int i;
 	pmd_t *pmdp;
 	pte_t *ptep;
-	spinlock_t *ptl;
+	spinlock_t *ptl = NULL;
 
-	pmdp = gmap_pmd_op_walk(gmap, gaddr);
+	pmdp = gmap_pmd_op_walk(gmap, gaddr, &ptl);
 	if (!pmdp)
 		return;
 
@@ -2476,7 +2492,7 @@  void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4],
 			spin_unlock(ptl);
 		}
 	}
-	gmap_pmd_op_end(gmap, pmdp);
+	gmap_pmd_op_end(ptl);
 }
 EXPORT_SYMBOL_GPL(gmap_sync_dirty_log_pmd);
 
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 16d35b881a11..4b184744350b 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -410,7 +410,7 @@  static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm,
 	return old;
 }
 
-static pmd_t *pmd_alloc_map(struct mm_struct *mm, unsigned long addr)
+pmd_t *pmd_alloc_map(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
 	p4d_t *p4d;