[v4,3/9] s390/mm: add gmap pmd invalidation notification
diff mbox

Message ID 20180627135510.117945-4-frankja@linux.ibm.com
State New
Headers show

Commit Message

Janosch Frank June 27, 2018, 1:55 p.m. UTC
From: Janosch Frank <frankja@linux.vnet.ibm.com>

As with ptes, we also need invalidation notification for pmds, both to
remove the fake page tables when they are split and for the later
addition of shadowed pmds.

With PMDs we do not have PGSTEs or some other bits we could use in the
host PMD. Instead we pick one of the free bits in the gmap PMD. Every
time a host pmd is invalidated, we check whether the respective
gmap PMD has the bit set and, in that case, fire the notifier.

Signed-off-by: Janosch Frank <frankja@linux.vnet.ibm.com>
---
 arch/s390/include/asm/gmap.h    |  15 +++
 arch/s390/include/asm/pgtable.h |   9 +-
 arch/s390/mm/gmap.c             | 213 +++++++++++++++++++++++++++++++++++-----
 arch/s390/mm/pgtable.c          |   4 +
 4 files changed, 217 insertions(+), 24 deletions(-)

Comments

David Hildenbrand June 28, 2018, 12:55 p.m. UTC | #1
On 27.06.2018 15:55, Janosch Frank wrote:
> From: Janosch Frank <frankja@linux.vnet.ibm.com>
> 
> Like for ptes, we also need invalidation notification for pmds, to
> remove the fake page tables when they are split and later addition of
> shadowed pmds.

I think the subject should rather be

"s390/mm: split huge pages in GMAP when protecting"

It would be helpful to explain why we have to split huge pages when
protecting. (complicated stuff we discussed). The pmdp_notify()
introduction could be moved to a separate patch (and keep this subject).

AFAICS, transparent huge page handling could be fairly easy, no? Do you
know what exactly we are missing to make it work? (assuming
CMMA=SKEY=PFMFI=OFF - so PGSTE don't matter)

> 
> With PMDs we do not have PGSTEs or some other bits we could use in the
> host PMD. Instead we pick one of the free bits in the gmap PMD. Every
> time a host pmd will be invalidated, we will check if the respective
> gmap PMD has the bit set and in that case fire up the notifier.
> 
> Signed-off-by: Janosch Frank <frankja@linux.vnet.ibm.com>
> ---
>  arch/s390/include/asm/gmap.h    |  15 +++
>  arch/s390/include/asm/pgtable.h |   9 +-
>  arch/s390/mm/gmap.c             | 213 +++++++++++++++++++++++++++++++++++-----
>  arch/s390/mm/pgtable.c          |   4 +
>  4 files changed, 217 insertions(+), 24 deletions(-)
> 
> diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h
> index c1bc5633fc6e..4324b2a55aa3 100644
> --- a/arch/s390/include/asm/gmap.h
> +++ b/arch/s390/include/asm/gmap.h
> @@ -13,6 +13,9 @@
>  #define GMAP_NOTIFY_SHADOW	0x2
>  #define GMAP_NOTIFY_MPROT	0x1
>  
> +/* Status bits in the gmap segment entry. */
> +#define _SEGMENT_ENTRY_GMAP_SPLIT	0x0001  /* split huge pmd */
> +
>  /**
>   * struct gmap_struct - guest address space
>   * @list: list head for the mm->context gmap list
> @@ -52,6 +55,7 @@ struct gmap {
>  	struct radix_tree_root host_to_rmap;
>  	struct list_head children;
>  	struct list_head pt_list;
> +	struct list_head split_list;
>  	spinlock_t shadow_lock;
>  	struct gmap *parent;
>  	unsigned long orig_asce;
> @@ -92,6 +96,17 @@ static inline int gmap_is_shadow(struct gmap *gmap)
>  	return !!gmap->parent;
>  }
>  
> +/**
> + * gmap_pmd_is_split - Returns if a huge gmap pmd has been split.
> + * @pmdp: pointer to the pmd
> + *
> + * Returns true if the passed huge gmap pmd has been split.
> + */
> +static inline bool gmap_pmd_is_split(pmd_t *pmdp)
> +{
> +	return !!(pmd_val(*pmdp) & _SEGMENT_ENTRY_GMAP_SPLIT);
> +}
> +
>  struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit);
>  void gmap_remove(struct gmap *gmap);
>  struct gmap *gmap_get(struct gmap *gmap);
> diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
> index 5ab636089c60..34a5ff928cd4 100644
> --- a/arch/s390/include/asm/pgtable.h
> +++ b/arch/s390/include/asm/pgtable.h
> @@ -268,8 +268,10 @@ static inline int is_module_addr(void *addr)
>  #define _REGION_ENTRY_BITS_LARGE 0xffffffff8000fe2fUL
>  
>  /* Bits in the segment table entry */
> -#define _SEGMENT_ENTRY_BITS	0xfffffffffffffe33UL
> -#define _SEGMENT_ENTRY_BITS_LARGE 0xfffffffffff0ff33UL
> +#define _SEGMENT_ENTRY_BITS			0xfffffffffffffe33UL
> +#define _SEGMENT_ENTRY_BITS_LARGE		0xfffffffffff0ff33UL
> +#define _SEGMENT_ENTRY_HARDWARE_BITS		0xfffffffffffffe30UL
> +#define _SEGMENT_ENTRY_HARDWARE_BITS_LARGE	0xfffffffffff00730UL
>  #define _SEGMENT_ENTRY_ORIGIN_LARGE ~0xfffffUL /* large page address	    */
>  #define _SEGMENT_ENTRY_ORIGIN	~0x7ffUL/* page table origin		    */
>  #define _SEGMENT_ENTRY_PROTECT	0x200	/* segment protection bit	    */
> @@ -1092,6 +1094,9 @@ void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr,
>  void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
>  void ptep_notify(struct mm_struct *mm, unsigned long addr,
>  		 pte_t *ptep, unsigned long bits);
> +void ptep_notify_gmap(struct mm_struct *mm, unsigned long vmaddr,
> +		      pte_t *pte, unsigned long bits);
> +void pmdp_notify(struct mm_struct *mm, unsigned long addr);
>  int ptep_force_prot(struct mm_struct *mm, unsigned long gaddr,
>  		    pte_t *ptep, int prot, unsigned long bit);
>  void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
> diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
> index f5b48426dde8..5ba43ef8ff40 100644
> --- a/arch/s390/mm/gmap.c
> +++ b/arch/s390/mm/gmap.c
> @@ -63,6 +63,7 @@ static struct gmap *gmap_alloc(unsigned long limit)
>  	INIT_LIST_HEAD(&gmap->crst_list);
>  	INIT_LIST_HEAD(&gmap->children);
>  	INIT_LIST_HEAD(&gmap->pt_list);
> +	INIT_LIST_HEAD(&gmap->split_list);
>  	INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL);
>  	INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC);
>  	INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC);
> @@ -194,6 +195,10 @@ static void gmap_free(struct gmap *gmap)
>  	gmap_radix_tree_free(&gmap->guest_to_host);
>  	gmap_radix_tree_free(&gmap->host_to_guest);
>  
> +	/* Free split pmd page tables */
> +	list_for_each_entry_safe(page, next, &gmap->split_list, lru)
> +		page_table_free_pgste(page);
> +
>  	/* Free additional data for a shadow gmap */
>  	if (gmap_is_shadow(gmap)) {
>  		/* Free all page tables. */
> @@ -599,10 +604,15 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
>  	if (*table == _SEGMENT_ENTRY_EMPTY) {
>  		rc = radix_tree_insert(&gmap->host_to_guest,
>  				       vmaddr >> PMD_SHIFT, table);
> -		if (!rc)
> -			*table = pmd_val(*pmd);
> -	} else
> -		rc = 0;
> +		if (!rc) {
> +			if (pmd_large(*pmd)) {
> +				*table = pmd_val(*pmd) &
> +					_SEGMENT_ENTRY_HARDWARE_BITS_LARGE;
> +			} else
> +				*table = pmd_val(*pmd) &
> +					_SEGMENT_ENTRY_HARDWARE_BITS;
> +		}
> +	}

Does this part really belong in this patch? *confused*

>  	spin_unlock(&gmap->guest_table_lock);
>  	spin_unlock(ptl);
>  	radix_tree_preload_end();
> @@ -833,7 +843,7 @@ static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr,
>  }
>  
>  /**
> - * gmap_pte_op_fixup - force a page in and connect the gmap page table
> + * gmap_fixup - force memory in and connect the gmap table entry
>   * @gmap: pointer to guest mapping meta data structure
>   * @gaddr: virtual address in the guest address space
>   * @vmaddr: address in the host process address space
> @@ -841,10 +851,10 @@ static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr,
>   *
>   * Returns 0 if the caller can retry __gmap_translate (might fail again),
>   * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing
> - * up or connecting the gmap page table.
> + * up or connecting the gmap table entry.
>   */
> -static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr,
> -			     unsigned long vmaddr, int prot)
> +static int gmap_fixup(struct gmap *gmap, unsigned long gaddr,
> +		      unsigned long vmaddr, int prot)
>  {
>  	struct mm_struct *mm = gmap->mm;
>  	unsigned int fault_flags;
> @@ -892,8 +902,11 @@ static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr)
>  		return NULL;
>  	}
>  
> -	/* 4k page table entries are locked via the pte (pte_alloc_map_lock). */
> -	if (!pmd_large(*pmdp))
> +	/*
> +	 * Non-split 4k page table entries are locked via the pte
> +	 * (pte_alloc_map_lock).
> +	 */
> +	if (!gmap_pmd_is_split(pmdp) && !pmd_large(*pmdp))
>  		spin_unlock(&gmap->guest_table_lock);
>  	return pmdp;
>  }
> @@ -905,10 +918,77 @@ static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr)
>   */
>  static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp)
>  {
> -	if (pmd_large(*pmdp))
> +	if (pmd_large(*pmdp) || gmap_pmd_is_split(pmdp))
>  		spin_unlock(&gmap->guest_table_lock);
>  }
>  
> +static pte_t *gmap_pte_from_pmd(struct gmap *gmap, pmd_t *pmdp,
> +				unsigned long addr, spinlock_t **ptl)
> +{
> +	if (likely(!gmap_pmd_is_split(pmdp)))
> +		return pte_alloc_map_lock(gmap->mm, pmdp, addr, ptl);
> +
> +	*ptl = NULL;
> +	return pte_offset_map(pmdp, addr);
> +}
> +
> +/**
> + * gmap_pmd_split_free - Free a split pmd's page table
> + * @pmdp The split pmd that we free of its page table
> + *
> + * If the userspace pmds are exchanged, we'll remove the gmap pmds as
> + * well, so we fault on them and link them again. We would leak
> + * memory, if we didn't free split pmds here.
> + */
> +static inline void gmap_pmd_split_free(pmd_t *pmdp)
> +{
> +	unsigned long pgt = pmd_val(*pmdp) & _SEGMENT_ENTRY_ORIGIN;
> +	struct page *page;
> +
> +	if (gmap_pmd_is_split(pmdp)) {

can this ever not be the case? This function is not used in this patch.

> +		page = pfn_to_page(pgt >> PAGE_SHIFT);
> +		list_del(&page->lru);
> +		page_table_free_pgste(page);
> +	}
> +}
> +
> +/**
> + * gmap_pmd_split - Split a huge gmap pmd and use a page table instead
> + * @gmap: pointer to guest mapping meta data structure
> + * @gaddr: virtual address in the guest address space
> + * @pmdp: pointer to the pmd that will be split
> + *
> + * When splitting gmap pmds, we have to make the resulting page table
> + * look like it's a normal one to be able to use the common pte
> + * handling functions. Also we need to track these new tables as they
> + * aren't tracked anywhere else.
> + */
> +static int gmap_pmd_split(struct gmap *gmap, unsigned long gaddr, pmd_t *pmdp)
> +{
> +	unsigned long *table;
> +	struct page *page;
> +	pmd_t new;
> +	int i;
> +
> +	page = page_table_alloc_pgste(gmap->mm);
> +	if (!page)
> +		return -ENOMEM;
> +	table = (unsigned long *) page_to_phys(page);
> +	for (i = 0; i < 256; i++) {
> +		table[i] = (pmd_val(*pmdp) & HPAGE_MASK) + i * PAGE_SIZE;
> +		/* pmd_large() implies pmd/pte_present() */
> +		table[i] |=  _PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE;
> +		/* ptes are directly marked as dirty */
> +		table[i + PTRS_PER_PTE] |= PGSTE_UC_BIT;
> +	}
> +
> +	pmd_val(new) = ((unsigned long)table | _SEGMENT_ENTRY |
> +			(_SEGMENT_ENTRY_GMAP_SPLIT));
> +	list_add(&page->lru, &gmap->split_list);
> +	gmap_pmdp_xchg(gmap, pmdp, new, gaddr);
> +	return 0;
> +}
> +
>  /*
>   * gmap_protect_pte - remove access rights to memory and set pgste bits
>   * @gmap: pointer to guest mapping meta data structure
> @@ -930,7 +1010,7 @@ static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr,
>  	spinlock_t *ptl = NULL;
>  	unsigned long pbits = 0;
>  
> -	ptep = pte_alloc_map_lock(gmap->mm, pmdp, gaddr, &ptl);
> +	ptep = gmap_pte_from_pmd(gmap, pmdp, gaddr, &ptl);
>  	if (!ptep)
>  		return -ENOMEM;
>  
> @@ -967,19 +1047,25 @@ static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr,
>  		rc = -EAGAIN;
>  		pmdp = gmap_pmd_op_walk(gmap, gaddr);
>  		if (pmdp && !(pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)) {
> -			rc = gmap_protect_pte(gmap, gaddr, pmdp, prot,
> -					      bits);
> -			if (!rc) {
> -				len -= PAGE_SIZE;
> -				gaddr += PAGE_SIZE;
> +			if (!pmd_large(*pmdp)) {
> +				rc = gmap_protect_pte(gmap, gaddr, pmdp, prot,
> +						      bits);
> +				if (!rc) {
> +					len -= PAGE_SIZE;
> +					gaddr += PAGE_SIZE;
> +				}
> +			} else {
> +				rc = gmap_pmd_split(gmap, gaddr, pmdp);
> +				if (!rc)
> +					rc = -EFAULT;
>  			}
>  			gmap_pmd_op_end(gmap, pmdp);
>  		}
> -		if (rc) {
> +		if (rc && rc != -EFAULT) {
>  			vmaddr = __gmap_translate(gmap, gaddr);
>  			if (IS_ERR_VALUE(vmaddr))
>  				return vmaddr;
> -			rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot);
> +			rc = gmap_fixup(gmap, gaddr, vmaddr, prot);
>  			if (rc)
>  				return rc;
>  		}
> @@ -1062,7 +1148,7 @@ int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
>  			rc = vmaddr;
>  			break;
>  		}
> -		rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ);
> +		rc = gmap_fixup(gmap, gaddr, vmaddr, PROT_READ);
>  		if (rc)
>  			break;
>  	}
> @@ -1145,7 +1231,7 @@ static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
>  		radix_tree_preload_end();
>  		if (rc) {
>  			kfree(rmap);
> -			rc = gmap_pte_op_fixup(parent, paddr, vmaddr, PROT_READ);
> +			rc = gmap_fixup(parent, paddr, vmaddr, PROT_READ);
>  			if (rc)
>  				return rc;
>  			continue;
> @@ -2058,7 +2144,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
>  		radix_tree_preload_end();
>  		if (!rc)
>  			break;
> -		rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot);
> +		rc = gmap_fixup(parent, paddr, vmaddr, prot);
>  		if (rc)
>  			break;
>  	}
> @@ -2124,6 +2210,39 @@ static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr,
>  	spin_unlock(&sg->guest_table_lock);
>  }
>  
> +/*
> + * ptep_notify_gmap - call all invalidation callbacks for a specific pte of a gmap
> + * @mm: pointer to the process mm_struct
> + * @addr: virtual address in the process address space
> + * @pte: pointer to the page table entry
> + * @bits: bits from the pgste that caused the notify call
> + *
> + * This function is assumed to be called with the guest_table_lock held.
> + */
> +void ptep_notify_gmap(struct mm_struct *mm, unsigned long vmaddr,
> +		      pte_t *pte, unsigned long bits)
> +{
> +	unsigned long offset, gaddr = 0;
> +	unsigned long *table;
> +	struct gmap *gmap;
> +
> +	offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
> +	offset = offset * (4096 / sizeof(pte_t));
> +	rcu_read_lock();
> +	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
> +		table = radix_tree_lookup(&gmap->host_to_guest,
> +					  vmaddr >> PMD_SHIFT);
> +		if (table)
> +			gaddr = __gmap_segment_gaddr(table) + offset;
> +		else
> +			continue;
> +
> +		if (bits & PGSTE_IN_BIT)
> +			gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1);
> +	}
> +	rcu_read_unlock();
> +}
> +
>  /**
>   * ptep_notify - call all invalidation callbacks for a specific pte.
>   * @mm: pointer to the process mm_struct
> @@ -2168,6 +2287,23 @@ void ptep_notify(struct mm_struct *mm, unsigned long vmaddr,
>  }
>  EXPORT_SYMBOL_GPL(ptep_notify);
>  
> +static void pmdp_notify_split(struct mm_struct *mm, unsigned long vmaddr,
> +			      unsigned long *table)

The function name should contain "gmap", as this is gmap specific.

> +{
> +	int i = 0;
> +	unsigned long bits;
> +	unsigned long *ptep = (unsigned long *)(*table & PAGE_MASK);
> +	unsigned long *pgste = ptep + PTRS_PER_PTE;
> +
> +	for (; i < 256; i++, vmaddr += PAGE_SIZE, ptep++, pgste++) {
> +		bits = *pgste & (PGSTE_IN_BIT | PGSTE_VSIE_BIT);
> +		if (bits) {
> +			*pgste ^= bits;
> +			ptep_notify_gmap(mm, vmaddr, (pte_t *)ptep, bits);
> +		}
> +	}
> +}
> +
>  /**
>   * gmap_pmdp_xchg - exchange a gmap pmd with another
>   * @gmap: pointer to the guest address space structure
> @@ -2191,6 +2327,39 @@ static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new,
>  	*pmdp = new;
>  }
>  
> +/**
> + * pmdp_notify - call all invalidation callbacks for a specific pmd
> + * @mm: pointer to the process mm_struct
> + * @vmaddr: virtual address in the process address space
> + *
> + * This function is expected to be called with mmap_sem held in read.
> + */
> +void pmdp_notify(struct mm_struct *mm, unsigned long vmaddr)
> +{
> +	unsigned long *table, gaddr;
> +	struct gmap *gmap;
> +
> +	rcu_read_lock();
> +	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
> +		spin_lock(&gmap->guest_table_lock);
> +		table = radix_tree_lookup(&gmap->host_to_guest,
> +					  vmaddr >> PMD_SHIFT);
> +		if (!table) {
> +			spin_unlock(&gmap->guest_table_lock);
> +			continue;
> +		}
> +		gaddr = __gmap_segment_gaddr(table);
> +		if (gmap_pmd_is_split((pmd_t *)table)) {
> +			pmdp_notify_split(mm, vmaddr, table);
> +			spin_unlock(&gmap->guest_table_lock);
> +			continue;
> +		}
> +		spin_unlock(&gmap->guest_table_lock);
> +	}
> +	rcu_read_unlock();
> +}
> +EXPORT_SYMBOL_GPL(pmdp_notify);
> +
>  static inline void thp_split_mm(struct mm_struct *mm)
>  {
>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
> index 301e466e4263..7e1c17b1a24a 100644
> --- a/arch/s390/mm/pgtable.c
> +++ b/arch/s390/mm/pgtable.c
> @@ -405,6 +405,8 @@ pmd_t pmdp_xchg_direct(struct mm_struct *mm, unsigned long addr,
>  	pmd_t old;
>  
>  	preempt_disable();
> +	if (mm_has_pgste(mm))

I am starting to wonder if mm_has_pgste(mm) is the right thing to check
for. With huge pages we might even be able to start VMs completely
without PGSTE. Right now this is an indication that "this is used by KVM".

Would something like "mm_has_gmap()" be clearer?

> +		pmdp_notify(mm, addr);
>  	old = pmdp_flush_direct(mm, addr, pmdp);
>  	*pmdp = new;
>  	preempt_enable();
> @@ -418,6 +420,8 @@ pmd_t pmdp_xchg_lazy(struct mm_struct *mm, unsigned long addr,
>  	pmd_t old;
>  
>  	preempt_disable();
> +	if (mm_has_pgste(mm))
> +		pmdp_notify(mm, addr);
>  	old = pmdp_flush_lazy(mm, addr, pmdp);
>  	*pmdp = new;
>  	preempt_enable();
>
Janosch Frank June 28, 2018, 1:47 p.m. UTC | #2
On 28.06.2018 14:55, David Hildenbrand wrote:
> On 27.06.2018 15:55, Janosch Frank wrote:
>> From: Janosch Frank <frankja@linux.vnet.ibm.com>
>>
>> Like for ptes, we also need invalidation notification for pmds, to
>> remove the fake page tables when they are split and later addition of
>> shadowed pmds.
> 
> I think the subject should rather be
> 
> "s390/mm: split huge pages in GMAP when protecting"
> 
> It would be helpful to explain why we have to split huge pages when
> protecting. (complicated stuff we discussed). The pmdp_notify()
> introduction could be moved to a separate patch (and keep this subject).

I'll revise the commit message and have a look into splitting.

> 
> AFAICS, transparent huge page handling could be fairly easy, no? Do you
> know what exactly we are missing to make it work? (assuming
> CMMA=SKEY=PFMFI=OFF - so PGSTE don't matter)

I have not looked into THP; Martin has had a look.
One step at a time :)

> 
>>
>> With PMDs we do not have PGSTEs or some other bits we could use in the
>> host PMD. Instead we pick one of the free bits in the gmap PMD. Every
>> time a host pmd will be invalidated, we will check if the respective
>> gmap PMD has the bit set and in that case fire up the notifier.
>>
>> Signed-off-by: Janosch Frank <frankja@linux.vnet.ibm.com>
[...]
>> @@ -63,6 +63,7 @@ static struct gmap *gmap_alloc(unsigned long limit)
>>  	INIT_LIST_HEAD(&gmap->crst_list);
>>  	INIT_LIST_HEAD(&gmap->children);
>>  	INIT_LIST_HEAD(&gmap->pt_list);
>> +	INIT_LIST_HEAD(&gmap->split_list);
>>  	INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL);
>>  	INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC);
>>  	INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC);
>> @@ -194,6 +195,10 @@ static void gmap_free(struct gmap *gmap)
>>  	gmap_radix_tree_free(&gmap->guest_to_host);
>>  	gmap_radix_tree_free(&gmap->host_to_guest);
>>  
>> +	/* Free split pmd page tables */
>> +	list_for_each_entry_safe(page, next, &gmap->split_list, lru)
>> +		page_table_free_pgste(page);
>> +
>>  	/* Free additional data for a shadow gmap */
>>  	if (gmap_is_shadow(gmap)) {
>>  		/* Free all page tables. */
>> @@ -599,10 +604,15 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
>>  	if (*table == _SEGMENT_ENTRY_EMPTY) {
>>  		rc = radix_tree_insert(&gmap->host_to_guest,
>>  				       vmaddr >> PMD_SHIFT, table);
>> -		if (!rc)
>> -			*table = pmd_val(*pmd);
>> -	} else
>> -		rc = 0;
>> +		if (!rc) {
>> +			if (pmd_large(*pmd)) {
>> +				*table = pmd_val(*pmd) &
>> +					_SEGMENT_ENTRY_HARDWARE_BITS_LARGE;
>> +			} else
>> +				*table = pmd_val(*pmd) &
>> +					_SEGMENT_ENTRY_HARDWARE_BITS;
>> +		}
>> +	}
> 
> Does this part really belong into this patch *confused*

Hmm, I'll move this to the enablement patch where we also remove the
EFAULT on huge pmds.

> 
>>  	spin_unlock(&gmap->guest_table_lock);
>>  	spin_unlock(ptl);
>>  	radix_tree_preload_end();
>> @@ -833,7 +843,7 @@ static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr,
>>  }
>>  
>>  /**
>> - * gmap_pte_op_fixup - force a page in and connect the gmap page table
>> + * gmap_fixup - force memory in and connect the gmap table entry
>>   * @gmap: pointer to guest mapping meta data structure
>>   * @gaddr: virtual address in the guest address space
>>   * @vmaddr: address in the host process address space
>> @@ -841,10 +851,10 @@ static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr,
>>   *
>>   * Returns 0 if the caller can retry __gmap_translate (might fail again),
>>   * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing
>> - * up or connecting the gmap page table.
>> + * up or connecting the gmap table entry.
>>   */
>> -static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr,
>> -			     unsigned long vmaddr, int prot)
>> +static int gmap_fixup(struct gmap *gmap, unsigned long gaddr,
>> +		      unsigned long vmaddr, int prot)
>>  {
>>  	struct mm_struct *mm = gmap->mm;
>>  	unsigned int fault_flags;
>> @@ -892,8 +902,11 @@ static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr)
>>  		return NULL;
>>  	}
>>  
>> -	/* 4k page table entries are locked via the pte (pte_alloc_map_lock). */
>> -	if (!pmd_large(*pmdp))
>> +	/*
>> +	 * Non-split 4k page table entries are locked via the pte
>> +	 * (pte_alloc_map_lock).
>> +	 */
>> +	if (!gmap_pmd_is_split(pmdp) && !pmd_large(*pmdp))
>>  		spin_unlock(&gmap->guest_table_lock);
>>  	return pmdp;
>>  }
>> @@ -905,10 +918,77 @@ static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr)
>>   */
>>  static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp)
>>  {
>> -	if (pmd_large(*pmdp))
>> +	if (pmd_large(*pmdp) || gmap_pmd_is_split(pmdp))
>>  		spin_unlock(&gmap->guest_table_lock);
>>  }
>>  
>> +static pte_t *gmap_pte_from_pmd(struct gmap *gmap, pmd_t *pmdp,
>> +				unsigned long addr, spinlock_t **ptl)
>> +{
>> +	if (likely(!gmap_pmd_is_split(pmdp)))
>> +		return pte_alloc_map_lock(gmap->mm, pmdp, addr, ptl);
>> +
>> +	*ptl = NULL;
>> +	return pte_offset_map(pmdp, addr);
>> +}
>> +
>> +/**
>> + * gmap_pmd_split_free - Free a split pmd's page table
>> + * @pmdp The split pmd that we free of its page table
>> + *
>> + * If the userspace pmds are exchanged, we'll remove the gmap pmds as
>> + * well, so we fault on them and link them again. We would leak
>> + * memory, if we didn't free split pmds here.
>> + */
>> +static inline void gmap_pmd_split_free(pmd_t *pmdp)
>> +{
>> +	unsigned long pgt = pmd_val(*pmdp) & _SEGMENT_ENTRY_ORIGIN;
>> +	struct page *page;
>> +
>> +	if (gmap_pmd_is_split(pmdp)) {
> 
> can this ever not be the case? This function is not used in this patch.

Look into the next one.

[...]
>>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>> diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
>> index 301e466e4263..7e1c17b1a24a 100644
>> --- a/arch/s390/mm/pgtable.c
>> +++ b/arch/s390/mm/pgtable.c
>> @@ -405,6 +405,8 @@ pmd_t pmdp_xchg_direct(struct mm_struct *mm, unsigned long addr,
>>  	pmd_t old;
>>  
>>  	preempt_disable();
>> +	if (mm_has_pgste(mm))
> 
> I am starting to wonder if mm_has_pgste(mm) is the right thing to check
> for. With huge pages we might even be able to start VMs completely
> without PGSTE. Right now this is an indication that "this is used by KVM".
> 
> Would something like "mm_has_gmap()" be clearer?

Yes, after your allocate_pgste patch and considering hlp we should
rename the function and the mm context variable.

Still, such a patch will not be part of this patchset, and I'd appreciate
it if we could schedule it after this patchset has been integrated.

> 
>> +		pmdp_notify(mm, addr);
>>  	old = pmdp_flush_direct(mm, addr, pmdp);
>>  	*pmdp = new;
>>  	preempt_enable();
>> @@ -418,6 +420,8 @@ pmd_t pmdp_xchg_lazy(struct mm_struct *mm, unsigned long addr,
>>  	pmd_t old;
>>  
>>  	preempt_disable();
>> +	if (mm_has_pgste(mm))
>> +		pmdp_notify(mm, addr);
>>  	old = pmdp_flush_lazy(mm, addr, pmdp);
>>  	*pmdp = new;
>>  	preempt_enable();
>>
> 
>
David Hildenbrand June 28, 2018, 2:03 p.m. UTC | #3
>>> - * gmap_pte_op_fixup - force a page in and connect the gmap page table
>>> + * gmap_fixup - force memory in and connect the gmap table entry
>>>   * @gmap: pointer to guest mapping meta data structure
>>>   * @gaddr: virtual address in the guest address space
>>>   * @vmaddr: address in the host process address space
>>> @@ -841,10 +851,10 @@ static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr,
>>>   *
>>>   * Returns 0 if the caller can retry __gmap_translate (might fail again),
>>>   * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing
>>> - * up or connecting the gmap page table.
>>> + * up or connecting the gmap table entry.
>>>   */
>>> -static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr,
>>> -			     unsigned long vmaddr, int prot)
>>> +static int gmap_fixup(struct gmap *gmap, unsigned long gaddr,
>>> +		      unsigned long vmaddr, int prot)
>>>  {
>>>  	struct mm_struct *mm = gmap->mm;
>>>  	unsigned int fault_flags;
>>> @@ -892,8 +902,11 @@ static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr)
>>>  		return NULL;
>>>  	}
>>>  
>>> -	/* 4k page table entries are locked via the pte (pte_alloc_map_lock). */
>>> -	if (!pmd_large(*pmdp))
>>> +	/*
>>> +	 * Non-split 4k page table entries are locked via the pte
>>> +	 * (pte_alloc_map_lock).
>>> +	 */
>>> +	if (!gmap_pmd_is_split(pmdp) && !pmd_large(*pmdp))
>>>  		spin_unlock(&gmap->guest_table_lock);
>>>  	return pmdp;
>>>  }
>>> @@ -905,10 +918,77 @@ static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr)
>>>   */
>>>  static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp)
>>>  {
>>> -	if (pmd_large(*pmdp))
>>> +	if (pmd_large(*pmdp) || gmap_pmd_is_split(pmdp))
>>>  		spin_unlock(&gmap->guest_table_lock);
>>>  }
>>>  
>>> +static pte_t *gmap_pte_from_pmd(struct gmap *gmap, pmd_t *pmdp,
>>> +				unsigned long addr, spinlock_t **ptl)
>>> +{
>>> +	if (likely(!gmap_pmd_is_split(pmdp)))
>>> +		return pte_alloc_map_lock(gmap->mm, pmdp, addr, ptl);
>>> +
>>> +	*ptl = NULL;
>>> +	return pte_offset_map(pmdp, addr);
>>> +}
>>> +
>>> +/**
>>> + * gmap_pmd_split_free - Free a split pmd's page table
>>> + * @pmdp The split pmd that we free of its page table
>>> + *
>>> + * If the userspace pmds are exchanged, we'll remove the gmap pmds as
>>> + * well, so we fault on them and link them again. We would leak
>>> + * memory, if we didn't free split pmds here.
>>> + */
>>> +static inline void gmap_pmd_split_free(pmd_t *pmdp)
>>> +{
>>> +	unsigned long pgt = pmd_val(*pmdp) & _SEGMENT_ENTRY_ORIGIN;
>>> +	struct page *page;
>>> +
>>> +	if (gmap_pmd_is_split(pmdp)) {
>>
>> can this ever not be the case? This function is not used in this patch.
> 
> Look into the next one.

Move it to the next one :)
Janosch Frank June 28, 2018, 2:23 p.m. UTC | #4
On 28.06.2018 15:47, Janosch Frank wrote:
> On 28.06.2018 14:55, David Hildenbrand wrote:
>> On 27.06.2018 15:55, Janosch Frank wrote:
>>> From: Janosch Frank <frankja@linux.vnet.ibm.com>
>>>

>>> @@ -63,6 +63,7 @@ static struct gmap *gmap_alloc(unsigned long limit)
>>>  	INIT_LIST_HEAD(&gmap->crst_list);
>>>  	INIT_LIST_HEAD(&gmap->children);
>>>  	INIT_LIST_HEAD(&gmap->pt_list);
>>> +	INIT_LIST_HEAD(&gmap->split_list);
>>>  	INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL);
>>>  	INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC);
>>>  	INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC);
>>> @@ -194,6 +195,10 @@ static void gmap_free(struct gmap *gmap)
>>>  	gmap_radix_tree_free(&gmap->guest_to_host);
>>>  	gmap_radix_tree_free(&gmap->host_to_guest);
>>>  
>>> +	/* Free split pmd page tables */
>>> +	list_for_each_entry_safe(page, next, &gmap->split_list, lru)
>>> +		page_table_free_pgste(page);
>>> +
>>>  	/* Free additional data for a shadow gmap */
>>>  	if (gmap_is_shadow(gmap)) {
>>>  		/* Free all page tables. */
>>> @@ -599,10 +604,15 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
>>>  	if (*table == _SEGMENT_ENTRY_EMPTY) {
>>>  		rc = radix_tree_insert(&gmap->host_to_guest,
>>>  				       vmaddr >> PMD_SHIFT, table);
>>> -		if (!rc)
>>> -			*table = pmd_val(*pmd);
>>> -	} else
>>> -		rc = 0;
>>> +		if (!rc) {
>>> +			if (pmd_large(*pmd)) {
>>> +				*table = pmd_val(*pmd) &
>>> +					_SEGMENT_ENTRY_HARDWARE_BITS_LARGE;
>>> +			} else
>>> +				*table = pmd_val(*pmd) &
>>> +					_SEGMENT_ENTRY_HARDWARE_BITS;
>>> +		}
>>> +	}
>>
>> Does this part really belong into this patch *confused*
> 
> Hmm, I'll move this to the enablement patch where we also remove the
> EFAULT on huge pmds.

It doesn't really fit into this patch, but it'll stay, as I make
additions to this afterwards with the dirty sync and split handling.

Maybe I'll find a suitable patch after splitting this one.

Patch
diff mbox

diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h
index c1bc5633fc6e..4324b2a55aa3 100644
--- a/arch/s390/include/asm/gmap.h
+++ b/arch/s390/include/asm/gmap.h
@@ -13,6 +13,9 @@ 
 #define GMAP_NOTIFY_SHADOW	0x2
 #define GMAP_NOTIFY_MPROT	0x1
 
+/* Status bits in the gmap segment entry. */
+#define _SEGMENT_ENTRY_GMAP_SPLIT	0x0001  /* split huge pmd */
+
 /**
  * struct gmap_struct - guest address space
  * @list: list head for the mm->context gmap list
@@ -52,6 +55,7 @@  struct gmap {
 	struct radix_tree_root host_to_rmap;
 	struct list_head children;
 	struct list_head pt_list;
+	struct list_head split_list;
 	spinlock_t shadow_lock;
 	struct gmap *parent;
 	unsigned long orig_asce;
@@ -92,6 +96,17 @@  static inline int gmap_is_shadow(struct gmap *gmap)
 	return !!gmap->parent;
 }
 
+/**
+ * gmap_pmd_is_split - Returns if a huge gmap pmd has been split.
+ * @pmdp: pointer to the pmd
+ *
+ * Returns true if the passed huge gmap pmd has been split.
+ */
+static inline bool gmap_pmd_is_split(pmd_t *pmdp)
+{
+	return !!(pmd_val(*pmdp) & _SEGMENT_ENTRY_GMAP_SPLIT);
+}
+
 struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit);
 void gmap_remove(struct gmap *gmap);
 struct gmap *gmap_get(struct gmap *gmap);
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 5ab636089c60..34a5ff928cd4 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -268,8 +268,10 @@  static inline int is_module_addr(void *addr)
 #define _REGION_ENTRY_BITS_LARGE 0xffffffff8000fe2fUL
 
 /* Bits in the segment table entry */
-#define _SEGMENT_ENTRY_BITS	0xfffffffffffffe33UL
-#define _SEGMENT_ENTRY_BITS_LARGE 0xfffffffffff0ff33UL
+#define _SEGMENT_ENTRY_BITS			0xfffffffffffffe33UL
+#define _SEGMENT_ENTRY_BITS_LARGE		0xfffffffffff0ff33UL
+#define _SEGMENT_ENTRY_HARDWARE_BITS		0xfffffffffffffe30UL
+#define _SEGMENT_ENTRY_HARDWARE_BITS_LARGE	0xfffffffffff00730UL
 #define _SEGMENT_ENTRY_ORIGIN_LARGE ~0xfffffUL /* large page address	    */
 #define _SEGMENT_ENTRY_ORIGIN	~0x7ffUL/* page table origin		    */
 #define _SEGMENT_ENTRY_PROTECT	0x200	/* segment protection bit	    */
@@ -1092,6 +1094,9 @@  void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr,
 void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
 void ptep_notify(struct mm_struct *mm, unsigned long addr,
 		 pte_t *ptep, unsigned long bits);
+void ptep_notify_gmap(struct mm_struct *mm, unsigned long vmaddr,
+		      pte_t *pte, unsigned long bits);
+void pmdp_notify(struct mm_struct *mm, unsigned long addr);
 int ptep_force_prot(struct mm_struct *mm, unsigned long gaddr,
 		    pte_t *ptep, int prot, unsigned long bit);
 void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index f5b48426dde8..5ba43ef8ff40 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -63,6 +63,7 @@  static struct gmap *gmap_alloc(unsigned long limit)
 	INIT_LIST_HEAD(&gmap->crst_list);
 	INIT_LIST_HEAD(&gmap->children);
 	INIT_LIST_HEAD(&gmap->pt_list);
+	INIT_LIST_HEAD(&gmap->split_list);
 	INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL);
 	INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC);
 	INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC);
@@ -194,6 +195,10 @@  static void gmap_free(struct gmap *gmap)
 	gmap_radix_tree_free(&gmap->guest_to_host);
 	gmap_radix_tree_free(&gmap->host_to_guest);
 
+	/* Free split pmd page tables */
+	list_for_each_entry_safe(page, next, &gmap->split_list, lru)
+		page_table_free_pgste(page);
+
 	/* Free additional data for a shadow gmap */
 	if (gmap_is_shadow(gmap)) {
 		/* Free all page tables. */
@@ -599,10 +604,15 @@  int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
 	if (*table == _SEGMENT_ENTRY_EMPTY) {
 		rc = radix_tree_insert(&gmap->host_to_guest,
 				       vmaddr >> PMD_SHIFT, table);
-		if (!rc)
-			*table = pmd_val(*pmd);
-	} else
-		rc = 0;
+		if (!rc) {
+			if (pmd_large(*pmd)) {
+				*table = pmd_val(*pmd) &
+					_SEGMENT_ENTRY_HARDWARE_BITS_LARGE;
+			} else
+				*table = pmd_val(*pmd) &
+					_SEGMENT_ENTRY_HARDWARE_BITS;
+		}
+	}
 	spin_unlock(&gmap->guest_table_lock);
 	spin_unlock(ptl);
 	radix_tree_preload_end();
@@ -833,7 +843,7 @@  static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr,
 }
 
 /**
- * gmap_pte_op_fixup - force a page in and connect the gmap page table
+ * gmap_fixup - force memory in and connect the gmap table entry
  * @gmap: pointer to guest mapping meta data structure
  * @gaddr: virtual address in the guest address space
  * @vmaddr: address in the host process address space
@@ -841,10 +851,10 @@  static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr,
  *
  * Returns 0 if the caller can retry __gmap_translate (might fail again),
  * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing
- * up or connecting the gmap page table.
+ * up or connecting the gmap table entry.
  */
-static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr,
-			     unsigned long vmaddr, int prot)
+static int gmap_fixup(struct gmap *gmap, unsigned long gaddr,
+		      unsigned long vmaddr, int prot)
 {
 	struct mm_struct *mm = gmap->mm;
 	unsigned int fault_flags;
@@ -892,8 +902,11 @@  static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr)
 		return NULL;
 	}
 
-	/* 4k page table entries are locked via the pte (pte_alloc_map_lock). */
-	if (!pmd_large(*pmdp))
+	/*
+	 * Non-split 4k page table entries are locked via the pte
+	 * (pte_alloc_map_lock).
+	 */
+	if (!gmap_pmd_is_split(pmdp) && !pmd_large(*pmdp))
 		spin_unlock(&gmap->guest_table_lock);
 	return pmdp;
 }
@@ -905,10 +918,77 @@  static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr)
  */
 static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp)
 {
-	if (pmd_large(*pmdp))
+	if (pmd_large(*pmdp) || gmap_pmd_is_split(pmdp))
 		spin_unlock(&gmap->guest_table_lock);
 }
 
+static pte_t *gmap_pte_from_pmd(struct gmap *gmap, pmd_t *pmdp,
+				unsigned long addr, spinlock_t **ptl)
+{
+	if (unlikely(gmap_pmd_is_split(pmdp))) {
+		*ptl = NULL;	/* no pte lock is taken for a split pmd */
+		return pte_offset_map(pmdp, addr);
+	}
+	return pte_alloc_map_lock(gmap->mm, pmdp, addr, ptl);
+}
+
+/**
+ * gmap_pmd_split_free - release the page table backing a split pmd
+ * @pmdp: gmap pmd whose page table is freed if the pmd was split
+ *
+ * Exchanged userspace pmds drop the gmap pmds too, which get faulted and
+ * linked again later; freeing here keeps split page tables from leaking.
+ */
+static inline void gmap_pmd_split_free(pmd_t *pmdp)
+{
+	unsigned long origin;
+	struct page *page;
+
+	if (!gmap_pmd_is_split(pmdp))
+		return;
+	origin = pmd_val(*pmdp) & _SEGMENT_ENTRY_ORIGIN;
+	page = pfn_to_page(origin >> PAGE_SHIFT);
+	list_del(&page->lru);
+	page_table_free_pgste(page);
+}
+
+/**
+ * gmap_pmd_split - Split a huge gmap pmd and use a page table instead
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @pmdp: pointer to the pmd that will be split
+ *
+ * The new page table is made to look like a normal one so the common pte
+ * handling functions work on it, and it is tracked on gmap->split_list.
+ * Returns 0 on success, -ENOMEM if no page table could be allocated.
+ */
+static int gmap_pmd_split(struct gmap *gmap, unsigned long gaddr, pmd_t *pmdp)
+{
+	unsigned long *table, origin;
+	struct page *page;
+	pmd_t new;
+	int i;
+
+	page = page_table_alloc_pgste(gmap->mm);
+	if (!page)
+		return -ENOMEM;
+	table = (unsigned long *) page_to_phys(page);
+	origin = pmd_val(*pmdp) & HPAGE_MASK;
+	for (i = 0; i < PTRS_PER_PTE; i++) {
+		/* pmd_large() implies pmd/pte_present() */
+		table[i] = (origin + i * PAGE_SIZE) |
+			   _PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE;
+		/* ptes are directly marked as dirty */
+		table[i + PTRS_PER_PTE] |= PGSTE_UC_BIT;
+	}
+
+	pmd_val(new) = (unsigned long)table | _SEGMENT_ENTRY |
+		       _SEGMENT_ENTRY_GMAP_SPLIT;
+	list_add(&page->lru, &gmap->split_list);
+	gmap_pmdp_xchg(gmap, pmdp, new, gaddr);
+	return 0;
+}
+
 /*
  * gmap_protect_pte - remove access rights to memory and set pgste bits
  * @gmap: pointer to guest mapping meta data structure
@@ -930,7 +1010,7 @@  static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr,
 	spinlock_t *ptl = NULL;
 	unsigned long pbits = 0;
 
-	ptep = pte_alloc_map_lock(gmap->mm, pmdp, gaddr, &ptl);
+	ptep = gmap_pte_from_pmd(gmap, pmdp, gaddr, &ptl);
 	if (!ptep)
 		return -ENOMEM;
 
@@ -967,19 +1047,25 @@  static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr,
 		rc = -EAGAIN;
 		pmdp = gmap_pmd_op_walk(gmap, gaddr);
 		if (pmdp && !(pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)) {
-			rc = gmap_protect_pte(gmap, gaddr, pmdp, prot,
-					      bits);
-			if (!rc) {
-				len -= PAGE_SIZE;
-				gaddr += PAGE_SIZE;
+			if (!pmd_large(*pmdp)) {
+				rc = gmap_protect_pte(gmap, gaddr, pmdp, prot,
+						      bits);
+				if (!rc) {
+					len -= PAGE_SIZE;
+					gaddr += PAGE_SIZE;
+				}
+			} else {
+				rc = gmap_pmd_split(gmap, gaddr, pmdp);
+				if (!rc)
+					rc = -EFAULT;
 			}
 			gmap_pmd_op_end(gmap, pmdp);
 		}
-		if (rc) {
+		if (rc && rc != -EFAULT) {
 			vmaddr = __gmap_translate(gmap, gaddr);
 			if (IS_ERR_VALUE(vmaddr))
 				return vmaddr;
-			rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot);
+			rc = gmap_fixup(gmap, gaddr, vmaddr, prot);
 			if (rc)
 				return rc;
 		}
@@ -1062,7 +1148,7 @@  int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
 			rc = vmaddr;
 			break;
 		}
-		rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ);
+		rc = gmap_fixup(gmap, gaddr, vmaddr, PROT_READ);
 		if (rc)
 			break;
 	}
@@ -1145,7 +1231,7 @@  static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
 		radix_tree_preload_end();
 		if (rc) {
 			kfree(rmap);
-			rc = gmap_pte_op_fixup(parent, paddr, vmaddr, PROT_READ);
+			rc = gmap_fixup(parent, paddr, vmaddr, PROT_READ);
 			if (rc)
 				return rc;
 			continue;
@@ -2058,7 +2144,7 @@  int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
 		radix_tree_preload_end();
 		if (!rc)
 			break;
-		rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot);
+		rc = gmap_fixup(parent, paddr, vmaddr, prot);
 		if (rc)
 			break;
 	}
@@ -2124,6 +2210,39 @@  static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr,
 	spin_unlock(&sg->guest_table_lock);
 }
 
+/**
+ * ptep_notify_gmap - call the invalidation callbacks for a pte of a gmap
+ * @mm: pointer to the process mm_struct
+ * @vmaddr: virtual address in the process address space
+ * @pte: pointer to the page table entry
+ * @bits: bits from the pgste that caused the notify call
+ *
+ * This function is assumed to be called with the guest_table_lock held.
+ */
+void ptep_notify_gmap(struct mm_struct *mm, unsigned long vmaddr,
+		      pte_t *pte, unsigned long bits)
+{
+	unsigned long offset, gaddr;
+	unsigned long *table;
+	struct gmap *gmap;
+
+	/* Byte offset of the pte in its table, scaled to a page offset. */
+	offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
+	offset = offset * (4096 / sizeof(pte_t));
+	rcu_read_lock();
+	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
+		table = radix_tree_lookup(&gmap->host_to_guest,
+					  vmaddr >> PMD_SHIFT);
+		if (!table)
+			continue;
+
+		gaddr = __gmap_segment_gaddr(table) + offset;
+		if (bits & PGSTE_IN_BIT)
+			gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1);
+	}
+	rcu_read_unlock();
+}
+
 /**
  * ptep_notify - call all invalidation callbacks for a specific pte.
  * @mm: pointer to the process mm_struct
@@ -2168,6 +2287,23 @@  void ptep_notify(struct mm_struct *mm, unsigned long vmaddr,
 }
 EXPORT_SYMBOL_GPL(ptep_notify);
 
+/* Clear set IN/VSIE pgste bits in a split pmd's table and notify on each. */
+static void pmdp_notify_split(struct mm_struct *mm, unsigned long vmaddr,
+			      unsigned long *table)
+{
+	unsigned long *ptep = (unsigned long *)(*table & PAGE_MASK);
+	unsigned long bits, *pgste = ptep + PTRS_PER_PTE;
+	int i;
+
+	for (i = 0; i < 256; i++, vmaddr += PAGE_SIZE) {
+		bits = pgste[i] & (PGSTE_IN_BIT | PGSTE_VSIE_BIT);
+		if (!bits)
+			continue;
+		pgste[i] &= ~bits;
+		ptep_notify_gmap(mm, vmaddr, (pte_t *)&ptep[i], bits);
+	}
+}
+
 /**
  * gmap_pmdp_xchg - exchange a gmap pmd with another
  * @gmap: pointer to the guest address space structure
@@ -2191,6 +2327,39 @@  static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new,
 	*pmdp = new;
 }
 
+/**
+ * pmdp_notify - call all invalidation callbacks for a specific pmd
+ * @mm: pointer to the process mm_struct
+ * @vmaddr: virtual address in the process address space
+ *
+ * Walks every gmap of @mm and looks up the gmap segment entry linked to
+ * the host segment containing @vmaddr. If that entry is a split huge
+ * pmd, the notifier bits kept in the pgstes of its page table are
+ * cleared and the invalidation callbacks are called for them. Entries
+ * that are not split are left untouched.
+ *
+ * This function is expected to be called with mmap_sem held in read.
+ * The guest_table_lock of each gmap is taken while its entry is examined.
+ */
+void pmdp_notify(struct mm_struct *mm, unsigned long vmaddr)
+{
+	unsigned long *table;
+	struct gmap *gmap;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
+		spin_lock(&gmap->guest_table_lock);
+		table = radix_tree_lookup(&gmap->host_to_guest,
+					  vmaddr >> PMD_SHIFT);
+		/* Only split pmds are handled: their pgstes hold the bits. */
+		if (table && gmap_pmd_is_split((pmd_t *)table))
+			pmdp_notify_split(mm, vmaddr, table);
+		spin_unlock(&gmap->guest_table_lock);
+	}
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(pmdp_notify);
+
 static inline void thp_split_mm(struct mm_struct *mm)
 {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 301e466e4263..7e1c17b1a24a 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -405,6 +405,8 @@  pmd_t pmdp_xchg_direct(struct mm_struct *mm, unsigned long addr,
 	pmd_t old;
 
 	preempt_disable();
+	if (mm_has_pgste(mm))
+		pmdp_notify(mm, addr);
 	old = pmdp_flush_direct(mm, addr, pmdp);
 	*pmdp = new;
 	preempt_enable();
@@ -418,6 +420,8 @@  pmd_t pmdp_xchg_lazy(struct mm_struct *mm, unsigned long addr,
 	pmd_t old;
 
 	preempt_disable();
+	if (mm_has_pgste(mm))
+		pmdp_notify(mm, addr);
 	old = pmdp_flush_lazy(mm, addr, pmdp);
 	*pmdp = new;
 	preempt_enable();