diff mbox

[RFC] KVM: PPC: BOOK3S: HV: THP support for guest

Message ID 1399224616-25142-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com (mailing list archive)
State New, archived
Headers show

Commit Message

Aneesh Kumar K.V May 4, 2014, 5:30 p.m. UTC
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/kvm_book3s_64.h | 146 ++++++++++++++++++++++++++-----
 arch/powerpc/kvm/book3s_hv.c             |   7 ++
 2 files changed, 130 insertions(+), 23 deletions(-)

Comments

Aneesh Kumar K.V May 4, 2014, 5:36 p.m. UTC | #1
"Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com> writes:

> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> ---
>  arch/powerpc/include/asm/kvm_book3s_64.h | 146 ++++++++++++++++++++++++++-----
>  arch/powerpc/kvm/book3s_hv.c             |   7 ++
>  2 files changed, 130 insertions(+), 23 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
> index 51388befeddb..f03ea8f90576 100644
> --- a/arch/powerpc/include/asm/kvm_book3s_64.h
> +++ b/arch/powerpc/include/asm/kvm_book3s_64.h
> @@ -77,34 +77,122 @@ static inline long try_lock_hpte(unsigned long *hpte, unsigned long bits)
>  	return old == 0;
>  }
>
> +static inline int __hpte_actual_psize(unsigned int lp, int psize)
> +{
> +	int i, shift;
> +	unsigned int mask;
> +
> +	/* start from 1 ignoring MMU_PAGE_4K */
> +	for (i = 1; i < MMU_PAGE_COUNT; i++) {
> +
> +		/* invalid penc */
> +		if (mmu_psize_defs[psize].penc[i] == -1)
> +			continue;
> +		/*
> +		 * encoding bits per actual page size
> +		 *        PTE LP     actual page size
> +		 *    rrrr rrrz		>=8KB
> +		 *    rrrr rrzz		>=16KB
> +		 *    rrrr rzzz		>=32KB
> +		 *    rrrr zzzz		>=64KB
> +		 * .......
> +		 */
> +		shift = mmu_psize_defs[i].shift - LP_SHIFT;
> +		if (shift > LP_BITS)
> +			shift = LP_BITS;
> +		mask = (1 << shift) - 1;
> +		if ((lp & mask) == mmu_psize_defs[psize].penc[i])
> +			return i;
> +	}
> +	return -1;
> +}
> +
>  static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
>  					     unsigned long pte_index)
>  {
> -	unsigned long rb, va_low;
> +	int b_size, a_size;
> +	unsigned int penc;
> +	unsigned long rb = 0, va_low, sllp;
> +	unsigned int lp = (r >> LP_SHIFT) & ((1 << LP_BITS) - 1);
> +
> +	if (!(v & HPTE_V_LARGE)) {
> +		/* both base and actual psize is 4k */
> +		b_size = MMU_PAGE_4K;
> +		a_size = MMU_PAGE_4K;
> +	} else {
> +		for (b_size = 0; b_size < MMU_PAGE_COUNT; b_size++) {
> +
> +			/* valid entries have a shift value */
> +			if (!mmu_psize_defs[b_size].shift)
> +				continue;
>
> +			a_size = __hpte_actual_psize(lp, b_size);
> +			if (a_size != -1)
> +				break;
> +		}
> +	}
> +	/*
> +	 * Ignore the top 14 bits of va
> +	 * v have top two bits covering segment size, hence move
> +	 * by 16 bits, Also clear the lower HPTE_V_AVPN_SHIFT (7) bits.
> +	 * AVA field in v also have the lower 23 bits ignored.
> +	 * For base page size 4K we need 14 .. 65 bits (so need to
> +	 * collect extra 11 bits)
> +	 * For others we need 14..14+i
> +	 */
> +	/* This covers 14..54 bits of va*/
>  	rb = (v & ~0x7fUL) << 16;		/* AVA field */
> +	/*
> +	 * AVA in v had cleared lower 23 bits. We need to derive
> +	 * that from pteg index
> +	 */
>  	va_low = pte_index >> 3;
>  	if (v & HPTE_V_SECONDARY)
>  		va_low = ~va_low;
> -	/* xor vsid from AVA */
> +	/*
> +	 * get the vpn bits from va_low using reverse of hashing.
> +	 * In v we have va with 23 bits dropped and then left shifted
> +	 * HPTE_V_AVPN_SHIFT (7) bits. Now to find vsid we need
> +	 * right shift it with (SID_SHIFT - (23 - 7))
> +	 */
>  	if (!(v & HPTE_V_1TB_SEG))
> -		va_low ^= v >> 12;
> +		va_low ^= v >> (SID_SHIFT - 16);
>  	else
> -		va_low ^= v >> 24;
> +		va_low ^= v >> (SID_SHIFT_1T - 16);
>  	va_low &= 0x7ff;
> -	if (v & HPTE_V_LARGE) {
> -		rb |= 1;			/* L field */
> -		if (cpu_has_feature(CPU_FTR_ARCH_206) &&
> -		    (r & 0xff000)) {
> -			/* non-16MB large page, must be 64k */
> -			/* (masks depend on page size) */
> -			rb |= 0x1000;		/* page encoding in LP field */
> -			rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */
> -			rb |= ((va_low << 4) & 0xf0);	/* AVAL field (P7 doesn't seem to care) */
> -		}
> -	} else {
> -		/* 4kB page */
> -		rb |= (va_low & 0x7ff) << 12;	/* remaining 11b of VA */
> +
> +	switch (b_size) {
> +	case MMU_PAGE_4K:
> +		sllp = ((mmu_psize_defs[a_size].sllp & SLB_VSID_L) >> 6) |
> +			((mmu_psize_defs[a_size].sllp & SLB_VSID_LP) >> 4);
> +		rb |= sllp << 5;	/*  AP field */
> +		rb |= (va_low & 0x7ff) << 12;	/* remaining 11 bits of AVA */
> +		break;
> +	default:
> +	{
> +		int aval_shift;
> +		/*
> +		 * remaining 7bits of AVA/LP fields
> +		 * Also contain the rr bits of LP
> +		 */
> +		rb |= (va_low & 0x7f) << 16;
> +		/*
> +		 * Now clear not needed LP bits based on actual psize
> +		 */
> +		rb &= ~((1ul << mmu_psize_defs[a_size].shift) - 1);
> +		/*
> +		 * AVAL field 58..77 - base_page_shift bits of va
> +		 * we have space for 58..64 bits, Missing bits should
> +		 * be zero filled. +1 is to take care of L bit shift
> +		 */
> +		aval_shift = 64 - (77 - mmu_psize_defs[b_size].shift) + 1;
> +		rb |= ((va_low << aval_shift) & 0xfe);
> +
> +		rb |= 1;		/* L field */
> +		penc = mmu_psize_defs[b_size].penc[a_size];
> +		rb |= penc << 12;	/* LP field */
> +		break;
> +	}
>  	}
>  	rb |= (v >> 54) & 0x300;		/* B field */
>  	return rb;
> @@ -112,14 +200,26 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
>
>  static inline unsigned long hpte_page_size(unsigned long h, unsigned long l)
>  {
> +	int size, a_size;
> +	/* Look at the 8 bit LP value */
> +	unsigned int lp = (l >> LP_SHIFT) & ((1 << LP_BITS) - 1);
> +
>  	/* only handle 4k, 64k and 16M pages for now */
>  	if (!(h & HPTE_V_LARGE))
> -		return 1ul << 12;		/* 4k page */
> -	if ((l & 0xf000) == 0x1000 && cpu_has_feature(CPU_FTR_ARCH_206))
> -		return 1ul << 16;		/* 64k page */
> -	if ((l & 0xff000) == 0)
> -		return 1ul << 24;		/* 16M page */
> -	return 0;				/* error */
> +		return 1ul << 12;
> +	else {
> +		for (size = 0; size < MMU_PAGE_COUNT; size++) {
> +			/* valid entries have a shift value */
> +			if (!mmu_psize_defs[size].shift)
> +				continue;
> +
> +			a_size = __hpte_actual_psize(lp, size);
> +			if (a_size != -1)
> +				return 1ul << mmu_psize_defs[a_size].shift;
> +		}
> +
> +	}
> +	return 0;
>  }
>
>  static inline unsigned long hpte_rpn(unsigned long ptel, unsigned long psize)
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 8227dba5af0f..a38d3289320a 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -1949,6 +1949,13 @@ static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps,
>  	 * support pte_enc here
>  	 */
>  	(*sps)->enc[0].pte_enc = def->penc[linux_psize];
> +	/*
> +	 * Add 16MB MPSS support
> +	 */
> +	if (linux_psize != MMU_PAGE_16M) {
> +		(*sps)->enc[1].page_shift = 24;
> +		(*sps)->enc[1].pte_enc = def->penc[MMU_PAGE_16M];
> +	}

Ideally we want to do this only when the guest memory is backed by
hugetlbfs. I was thinking qemu should ensure that, but I am not sure
existing qemu works that way. So we may want to look at how to
enable MPSS.

-aneesh

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Graf May 5, 2014, 11:38 a.m. UTC | #2
On 05/04/2014 07:30 PM, Aneesh Kumar K.V wrote:
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>

No patch description, no proper explanations anywhere why you're doing 
what. All of that in a pretty sensitive piece of code. There's no way 
this patch can go upstream in its current form.


Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Aneesh Kumar K.V May 5, 2014, 2:47 p.m. UTC | #3
Alexander Graf <agraf@suse.de> writes:

> On 05/04/2014 07:30 PM, Aneesh Kumar K.V wrote:
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
>
> No patch description, no proper explanations anywhere why you're doing 
> what. All of that in a pretty sensitive piece of code. There's no way 
> this patch can go upstream in its current form.
>

Sorry about being vague. I will add a better commit message. The goal is
to export MPSS support to the guest if the host supports it. MPSS
support is exported via the penc encoding in "ibm,segment-page-sizes". The
actual format can be found in htab_dt_scan_page_sizes. When the guest
memory is backed by hugetlbfs, we expose the penc encodings the host
supports to the guest via kvmppc_add_seg_page_size.

Now the challenge for THP support is to make sure that our henter,
hremove, etc. decode the base page size and actual page size correctly
from the hash table entry values. Most of the changes are there to do that.
The rest is already handled by kvm.

NOTE: It is much easier to read the code after applying the patch than
to read the diff. I have added comments around each step in the
code.

-aneesh

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paul Mackerras May 6, 2014, 4:20 a.m. UTC | #4
On Mon, May 05, 2014 at 08:17:00PM +0530, Aneesh Kumar K.V wrote:
> Alexander Graf <agraf@suse.de> writes:
> 
> > On 05/04/2014 07:30 PM, Aneesh Kumar K.V wrote:
> >> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> >
> > No patch description, no proper explanations anywhere why you're doing 
> > what. All of that in a pretty sensitive piece of code. There's no way 
> > this patch can go upstream in its current form.
> >
> 
> Sorry about being vague. Will add a better commit message. The goal is
> to export MPSS support to guest if the host support the same. MPSS
> support is exported via penc encoding in "ibm,segment-page-sizes". The
> actual format can be found at htab_dt_scan_page_sizes. When the guest
> memory is backed by hugetlbfs we expose the penc encoding the host
> support to guest via kvmppc_add_seg_page_size. 

In a case like this it's good to assume the reader doesn't know very
much about Power CPUs, and probably isn't familiar with acronyms such
as MPSS.  The patch needs an introductory paragraph explaining that on
recent IBM Power CPUs, while the hashed page table is looked up using
the page size from the segmentation hardware (i.e. the SLB), it is
possible to have the HPT entry indicate a larger page size.  Thus for
example it is possible to put a 16MB page in a 64kB segment, but since
the hash lookup is done using a 64kB page size, it may be necessary to
put multiple entries in the HPT for a single 16MB page.  This
capability is called mixed page-size segment (MPSS).  With MPSS,
there are two relevant page sizes: the base page size, which is the
size used in searching the HPT, and the actual page size, which is the
size indicated in the HPT entry.  Note that the actual page size is
always >= base page size.

> Now the challenge to THP support is to make sure that our henter,
> hremove etc decode base page size and actual page size correctly
> from the hash table entry values. Most of the changes is to do that.
> Rest of the stuff is already handled by kvm. 
> 
> NOTE: It is much easier to read the code after applying the patch rather
> than reading the diff. I have added comments around each steps in the
> code.

Paul.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Graf May 6, 2014, 9:12 a.m. UTC | #5
On 05/04/2014 07:30 PM, Aneesh Kumar K.V wrote:
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> ---
>   arch/powerpc/include/asm/kvm_book3s_64.h | 146 ++++++++++++++++++++++++++-----
>   arch/powerpc/kvm/book3s_hv.c             |   7 ++
>   2 files changed, 130 insertions(+), 23 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
> index 51388befeddb..f03ea8f90576 100644
> --- a/arch/powerpc/include/asm/kvm_book3s_64.h
> +++ b/arch/powerpc/include/asm/kvm_book3s_64.h
> @@ -77,34 +77,122 @@ static inline long try_lock_hpte(unsigned long *hpte, unsigned long bits)
>   	return old == 0;
>   }
>   
> +static inline int __hpte_actual_psize(unsigned int lp, int psize)
> +{
> +	int i, shift;
> +	unsigned int mask;
> +
> +	/* start from 1 ignoring MMU_PAGE_4K */
> +	for (i = 1; i < MMU_PAGE_COUNT; i++) {
> +
> +		/* invalid penc */
> +		if (mmu_psize_defs[psize].penc[i] == -1)
> +			continue;
> +		/*
> +		 * encoding bits per actual page size
> +		 *        PTE LP     actual page size
> +		 *    rrrr rrrz		>=8KB
> +		 *    rrrr rrzz		>=16KB
> +		 *    rrrr rzzz		>=32KB
> +		 *    rrrr zzzz		>=64KB
> +		 * .......
> +		 */
> +		shift = mmu_psize_defs[i].shift - LP_SHIFT;
> +		if (shift > LP_BITS)
> +			shift = LP_BITS;
> +		mask = (1 << shift) - 1;
> +		if ((lp & mask) == mmu_psize_defs[psize].penc[i])
> +			return i;
> +	}
> +	return -1;
> +}
> +
>   static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
>   					     unsigned long pte_index)
>   {
> -	unsigned long rb, va_low;
> +	int b_size, a_size;
> +	unsigned int penc;
> +	unsigned long rb = 0, va_low, sllp;
> +	unsigned int lp = (r >> LP_SHIFT) & ((1 << LP_BITS) - 1);
> +
> +	if (!(v & HPTE_V_LARGE)) {
> +		/* both base and actual psize is 4k */
> +		b_size = MMU_PAGE_4K;
> +		a_size = MMU_PAGE_4K;
> +	} else {
> +		for (b_size = 0; b_size < MMU_PAGE_COUNT; b_size++) {
> +
> +			/* valid entries have a shift value */
> +			if (!mmu_psize_defs[b_size].shift)
> +				continue;
>   
> +			a_size = __hpte_actual_psize(lp, b_size);
> +			if (a_size != -1)
> +				break;
> +		}
> +	}
> +	/*
> +	 * Ignore the top 14 bits of va
> +	 * v have top two bits covering segment size, hence move
> +	 * by 16 bits, Also clear the lower HPTE_V_AVPN_SHIFT (7) bits.
> +	 * AVA field in v also have the lower 23 bits ignored.
> +	 * For base page size 4K we need 14 .. 65 bits (so need to
> +	 * collect extra 11 bits)
> +	 * For others we need 14..14+i
> +	 */
> +	/* This covers 14..54 bits of va*/
>   	rb = (v & ~0x7fUL) << 16;		/* AVA field */
> +	/*
> +	 * AVA in v had cleared lower 23 bits. We need to derive
> +	 * that from pteg index
> +	 */
>   	va_low = pte_index >> 3;
>   	if (v & HPTE_V_SECONDARY)
>   		va_low = ~va_low;
> -	/* xor vsid from AVA */
> +	/*
> +	 * get the vpn bits from va_low using reverse of hashing.
> +	 * In v we have va with 23 bits dropped and then left shifted
> +	 * HPTE_V_AVPN_SHIFT (7) bits. Now to find vsid we need
> +	 * right shift it with (SID_SHIFT - (23 - 7))
> +	 */
>   	if (!(v & HPTE_V_1TB_SEG))
> -		va_low ^= v >> 12;
> +		va_low ^= v >> (SID_SHIFT - 16);
>   	else
> -		va_low ^= v >> 24;
> +		va_low ^= v >> (SID_SHIFT_1T - 16);
>   	va_low &= 0x7ff;
> -	if (v & HPTE_V_LARGE) {
> -		rb |= 1;			/* L field */
> -		if (cpu_has_feature(CPU_FTR_ARCH_206) &&
> -		    (r & 0xff000)) {
> -			/* non-16MB large page, must be 64k */
> -			/* (masks depend on page size) */
> -			rb |= 0x1000;		/* page encoding in LP field */
> -			rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */
> -			rb |= ((va_low << 4) & 0xf0);	/* AVAL field (P7 doesn't seem to care) */
> -		}
> -	} else {
> -		/* 4kB page */
> -		rb |= (va_low & 0x7ff) << 12;	/* remaining 11b of VA */
> +
> +	switch (b_size) {
> +	case MMU_PAGE_4K:
> +		sllp = ((mmu_psize_defs[a_size].sllp & SLB_VSID_L) >> 6) |
> +			((mmu_psize_defs[a_size].sllp & SLB_VSID_LP) >> 4);
> +		rb |= sllp << 5;	/*  AP field */
> +		rb |= (va_low & 0x7ff) << 12;	/* remaining 11 bits of AVA */
> +		break;
> +	default:
> +	{
> +		int aval_shift;
> +		/*
> +		 * remaining 7bits of AVA/LP fields
> +		 * Also contain the rr bits of LP
> +		 */
> +		rb |= (va_low & 0x7f) << 16;
> +		/*
> +		 * Now clear not needed LP bits based on actual psize
> +		 */
> +		rb &= ~((1ul << mmu_psize_defs[a_size].shift) - 1);
> +		/*
> +		 * AVAL field 58..77 - base_page_shift bits of va
> +		 * we have space for 58..64 bits, Missing bits should
> +		 * be zero filled. +1 is to take care of L bit shift
> +		 */
> +		aval_shift = 64 - (77 - mmu_psize_defs[b_size].shift) + 1;
> +		rb |= ((va_low << aval_shift) & 0xfe);
> +
> +		rb |= 1;		/* L field */
> +		penc = mmu_psize_defs[b_size].penc[a_size];
> +		rb |= penc << 12;	/* LP field */
> +		break;
> +	}
>   	}
>   	rb |= (v >> 54) & 0x300;		/* B field */
>   	return rb;
> @@ -112,14 +200,26 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
>   
>   static inline unsigned long hpte_page_size(unsigned long h, unsigned long l)
>   {
> +	int size, a_size;
> +	/* Look at the 8 bit LP value */
> +	unsigned int lp = (l >> LP_SHIFT) & ((1 << LP_BITS) - 1);
> +
>   	/* only handle 4k, 64k and 16M pages for now */
>   	if (!(h & HPTE_V_LARGE))
> -		return 1ul << 12;		/* 4k page */
> -	if ((l & 0xf000) == 0x1000 && cpu_has_feature(CPU_FTR_ARCH_206))
> -		return 1ul << 16;		/* 64k page */
> -	if ((l & 0xff000) == 0)
> -		return 1ul << 24;		/* 16M page */
> -	return 0;				/* error */
> +		return 1ul << 12;
> +	else {
> +		for (size = 0; size < MMU_PAGE_COUNT; size++) {
> +			/* valid entries have a shift value */
> +			if (!mmu_psize_defs[size].shift)
> +				continue;
> +
> +			a_size = __hpte_actual_psize(lp, size);

a_size as psize is probably a slightly confusing name. Just call it 
a_psize.

So if I understand this patch correctly, it simply introduces logic to 
handle page sizes other than 4k, 64k, 16M by analyzing the actual page 
size field in the HPTE. Mind to explain why exactly that enables us to 
use THP?

What exactly is the flow if the pages are not backed by huge pages? What 
is the flow when they start to get backed by huge pages?

> +			if (a_size != -1)
> +				return 1ul << mmu_psize_defs[a_size].shift;
> +		}
> +
> +	}
> +	return 0;
>   }
>   
>   static inline unsigned long hpte_rpn(unsigned long ptel, unsigned long psize)
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 8227dba5af0f..a38d3289320a 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -1949,6 +1949,13 @@ static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps,
>   	 * support pte_enc here
>   	 */
>   	(*sps)->enc[0].pte_enc = def->penc[linux_psize];
> +	/*
> +	 * Add 16MB MPSS support
> +	 */
> +	if (linux_psize != MMU_PAGE_16M) {
> +		(*sps)->enc[1].page_shift = 24;
> +		(*sps)->enc[1].pte_enc = def->penc[MMU_PAGE_16M];
> +	}

So this basically indicates that every segment (except for the 16MB one) 
can also handle 16MB MPSS page sizes? I suppose you want to remove the 
comment in kvm_vm_ioctl_get_smmu_info_hv() that says we don't do MPSS here.

Can we also ensure that every system we run on can do MPSS?


Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Benjamin Herrenschmidt May 6, 2014, 9:26 a.m. UTC | #6
On Tue, 2014-05-06 at 11:12 +0200, Alexander Graf wrote:

> So if I understand this patch correctly, it simply introduces logic to 
> handle page sizes other than 4k, 64k, 16M by analyzing the actual page 
> size field in the HPTE. Mind to explain why exactly that enables us to 
> use THP?
>
> What exactly is the flow if the pages are not backed by huge pages? What 
> is the flow when they start to get backed by huge pages?

The hypervisor doesn't care about segments ... but it needs to properly
decode the page size requested by the guest, if anything, to issue the
right form of tlbie instruction.

The encoding in the HPTE for a 16M page inside a 64K segment is
different than the encoding for a 16M in a 16M segment, this is done so
that the encoding carries both information, which allows broadcast
tlbie to properly find the right set in the TLB for invalidations among
others.

So from a KVM perspective, we don't know whether the guest is doing THP
or something else (Linux calls it THP but all we care here is that this
is MPSS, another guest than Linux might exploit that differently).

What we do know is that if we advertise MPSS, we need to decode the page
sizes encoded in the HPTE so that we know what we are dealing with in
H_ENTER and can do the appropriate TLB invalidations in H_REMOVE &
evictions.

> > +			if (a_size != -1)
> > +				return 1ul << mmu_psize_defs[a_size].shift;
> > +		}
> > +
> > +	}
> > +	return 0;
> >   }
> >   
> >   static inline unsigned long hpte_rpn(unsigned long ptel, unsigned long psize)
> > diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> > index 8227dba5af0f..a38d3289320a 100644
> > --- a/arch/powerpc/kvm/book3s_hv.c
> > +++ b/arch/powerpc/kvm/book3s_hv.c
> > @@ -1949,6 +1949,13 @@ static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps,
> >   	 * support pte_enc here
> >   	 */
> >   	(*sps)->enc[0].pte_enc = def->penc[linux_psize];
> > +	/*
> > +	 * Add 16MB MPSS support
> > +	 */
> > +	if (linux_psize != MMU_PAGE_16M) {
> > +		(*sps)->enc[1].page_shift = 24;
> > +		(*sps)->enc[1].pte_enc = def->penc[MMU_PAGE_16M];
> > +	}
> 
> So this basically indicates that every segment (except for the 16MB one) 
> can also handle 16MB MPSS page sizes? I suppose you want to remove the 
> comment in kvm_vm_ioctl_get_smmu_info_hv() that says we don't do MPSS here.

I haven't reviewed the code there, make sure it will indeed do a
different encoding for every combination of segment/actual page size.

> Can we also ensure that every system we run on can do MPSS?

P7 and P8 are identical in that regard. However 970 doesn't do MPSS so
let's make sure we get that right.

Cheers,
Ben.
 

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Graf May 6, 2014, 9:39 a.m. UTC | #7
On 05/06/2014 11:26 AM, Benjamin Herrenschmidt wrote:
> On Tue, 2014-05-06 at 11:12 +0200, Alexander Graf wrote:
>
>> So if I understand this patch correctly, it simply introduces logic to
>> handle page sizes other than 4k, 64k, 16M by analyzing the actual page
>> size field in the HPTE. Mind to explain why exactly that enables us to
>> use THP?
>>
>> What exactly is the flow if the pages are not backed by huge pages? What
>> is the flow when they start to get backed by huge pages?
> The hypervisor doesn't care about segments ... but it needs to properly
> decode the page size requested by the guest, if anything, to issue the
> right form of tlbie instruction.
>
> The encoding in the HPTE for a 16M page inside a 64K segment is
> different than the encoding for a 16M in a 16M segment, this is done so
> that the encoding carries both information, which allows broadcast
> tlbie to properly find the right set in the TLB for invalidations among
> others.
>
> So from a KVM perspective, we don't know whether the guest is doing THP
> or something else (Linux calls it THP but all we care here is that this
> is MPSS, another guest than Linux might exploit that differently).

Ugh. So we're just talking about a guest using MPSS here? Not about the 
host doing THP? I must've missed that part.

>
> What we do know is that if we advertise MPSS, we need to decode the page
> sizes encoded in the HPTE so that we know what we are dealing with in
> H_ENTER and can do the appropriate TLB invalidations in H_REMOVE &
> evictions.

Yes. That makes a lot of sense. So this patch really is all about 
enabling MPSS support for 16MB pages. No more, no less.

>
>>> +			if (a_size != -1)
>>> +				return 1ul << mmu_psize_defs[a_size].shift;
>>> +		}
>>> +
>>> +	}
>>> +	return 0;
>>>    }
>>>    
>>>    static inline unsigned long hpte_rpn(unsigned long ptel, unsigned long psize)
>>> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
>>> index 8227dba5af0f..a38d3289320a 100644
>>> --- a/arch/powerpc/kvm/book3s_hv.c
>>> +++ b/arch/powerpc/kvm/book3s_hv.c
>>> @@ -1949,6 +1949,13 @@ static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps,
>>>    	 * support pte_enc here
>>>    	 */
>>>    	(*sps)->enc[0].pte_enc = def->penc[linux_psize];
>>> +	/*
>>> +	 * Add 16MB MPSS support
>>> +	 */
>>> +	if (linux_psize != MMU_PAGE_16M) {
>>> +		(*sps)->enc[1].page_shift = 24;
>>> +		(*sps)->enc[1].pte_enc = def->penc[MMU_PAGE_16M];
>>> +	}
>> So this basically indicates that every segment (except for the 16MB one)
>> can also handle 16MB MPSS page sizes? I suppose you want to remove the
>> comment in kvm_vm_ioctl_get_smmu_info_hv() that says we don't do MPSS here.
> I haven't reviewed the code there, make sure it will indeed do a
> different encoding for every combination of segment/actual page size.
>
>> Can we also ensure that every system we run on can do MPSS?
> P7 and P8 are identical in that regard. However 970 doesn't do MPSS so
> let's make sure we get that right.

yes. When / if people can easily get their hands on p7/p8 bare metal 
systems I'll be more than happy to remove 970 support as well, but for 
now it's probably good to keep in.


Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Aneesh Kumar K.V May 6, 2014, 2:23 p.m. UTC | #8
Alexander Graf <agraf@suse.de> writes:

> On 05/04/2014 07:30 PM, Aneesh Kumar K.V wrote:
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>

....
....

>>   static inline unsigned long hpte_page_size(unsigned long h, unsigned long l)
>>   {
>> +	int size, a_size;
>> +	/* Look at the 8 bit LP value */
>> +	unsigned int lp = (l >> LP_SHIFT) & ((1 << LP_BITS) - 1);
>> +
>>   	/* only handle 4k, 64k and 16M pages for now */
>>   	if (!(h & HPTE_V_LARGE))
>> -		return 1ul << 12;		/* 4k page */
>> -	if ((l & 0xf000) == 0x1000 && cpu_has_feature(CPU_FTR_ARCH_206))
>> -		return 1ul << 16;		/* 64k page */
>> -	if ((l & 0xff000) == 0)
>> -		return 1ul << 24;		/* 16M page */
>> -	return 0;				/* error */
>> +		return 1ul << 12;
>> +	else {
>> +		for (size = 0; size < MMU_PAGE_COUNT; size++) {
>> +			/* valid entries have a shift value */
>> +			if (!mmu_psize_defs[size].shift)
>> +				continue;
>> +
>> +			a_size = __hpte_actual_psize(lp, size);
>
> a_size as psize is probably a slightly confusing namer. Just call it 
> a_psize.

Will update.

>
> So if I understand this patch correctly, it simply introduces logic to 
> handle page sizes other than 4k, 64k, 16M by analyzing the actual page 
> size field in the HPTE. Mind to explain why exactly that enables us to 
> use THP?
>
> What exactly is the flow if the pages are not backed by huge pages? What 
> is the flow when they start to get backed by huge pages?
>
>> +			if (a_size != -1)
>> +				return 1ul << mmu_psize_defs[a_size].shift;
>> +		}
>> +
>> +	}
>> +	return 0;
>>   }
>>   
>>   static inline unsigned long hpte_rpn(unsigned long ptel, unsigned long psize)
>> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
>> index 8227dba5af0f..a38d3289320a 100644
>> --- a/arch/powerpc/kvm/book3s_hv.c
>> +++ b/arch/powerpc/kvm/book3s_hv.c
>> @@ -1949,6 +1949,13 @@ static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps,
>>   	 * support pte_enc here
>>   	 */
>>   	(*sps)->enc[0].pte_enc = def->penc[linux_psize];
>> +	/*
>> +	 * Add 16MB MPSS support
>> +	 */
>> +	if (linux_psize != MMU_PAGE_16M) {
>> +		(*sps)->enc[1].page_shift = 24;
>> +		(*sps)->enc[1].pte_enc = def->penc[MMU_PAGE_16M];
>> +	}
>
> So this basically indicates that every segment (except for the 16MB one) 
> can also handle 16MB MPSS page sizes? I suppose you want to remove the 
> comment in kvm_vm_ioctl_get_smmu_info_hv() that says we don't do MPSS
> here.

Will do

>
> Can we also ensure that every system we run on can do MPSS?
>

Will do

-aneesh

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Aneesh Kumar K.V May 6, 2014, 2:25 p.m. UTC | #9
Paul Mackerras <paulus@samba.org> writes:

> On Mon, May 05, 2014 at 08:17:00PM +0530, Aneesh Kumar K.V wrote:
>> Alexander Graf <agraf@suse.de> writes:
>> 
>> > On 05/04/2014 07:30 PM, Aneesh Kumar K.V wrote:
>> >> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
>> >
>> > No patch description, no proper explanations anywhere why you're doing 
>> > what. All of that in a pretty sensitive piece of code. There's no way 
>> > this patch can go upstream in its current form.
>> >
>> 
>> Sorry about being vague. Will add a better commit message. The goal is
>> to export MPSS support to guest if the host support the same. MPSS
>> support is exported via penc encoding in "ibm,segment-page-sizes". The
>> actual format can be found at htab_dt_scan_page_sizes. When the guest
>> memory is backed by hugetlbfs we expose the penc encoding the host
>> support to guest via kvmppc_add_seg_page_size. 
>
> In a case like this it's good to assume the reader doesn't know very
> much about Power CPUs, and probably isn't familiar with acronyms such
> as MPSS.  The patch needs an introductory paragraph explaining that on
> recent IBM Power CPUs, while the hashed page table is looked up using
> the page size from the segmentation hardware (i.e. the SLB), it is
> possible to have the HPT entry indicate a larger page size.  Thus for
> example it is possible to put a 16MB page in a 64kB segment, but since
> the hash lookup is done using a 64kB page size, it may be necessary to
> put multiple entries in the HPT for a single 16MB page.  This
> capability is called mixed page-size segment (MPSS).  With MPSS,
> there are two relevant page sizes: the base page size, which is the
> size used in searching the HPT, and the actual page size, which is the
> size indicated in the HPT entry.  Note that the actual page size is
> always >= base page size.

I will update the commit message with the above details

-aneesh

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Aneesh Kumar K.V May 6, 2014, 3:06 p.m. UTC | #10
Alexander Graf <agraf@suse.de> writes:

> On 05/06/2014 11:26 AM, Benjamin Herrenschmidt wrote:
>> On Tue, 2014-05-06 at 11:12 +0200, Alexander Graf wrote:
>>

.....


I updated the commit message as below. Let me know if this is ok.

    KVM: PPC: BOOK3S: HV: THP support for guest
    
    On recent IBM Power CPUs, while the hashed page table is looked up using
    the page size from the segmentation hardware (i.e. the SLB), it is
    possible to have the HPT entry indicate a larger page size.  Thus for
    example it is possible to put a 16MB page in a 64kB segment, but since
    the hash lookup is done using a 64kB page size, it may be necessary to
    put multiple entries in the HPT for a single 16MB page.  This
    capability is called mixed page-size segment (MPSS).  With MPSS,
    there are two relevant page sizes: the base page size, which is the
    size used in searching the HPT, and the actual page size, which is the
    size indicated in the HPT entry. [ Note that the actual page size is
    always >= base page size ].
    
    We advertise the MPSS feature to the guest only if the host CPU
    supports it. We use the "ibm,segment-page-sizes" device tree property
    to advertise MPSS support. The penc encoding indicates whether we
    support a specific combination of base page size and actual page size
    in the same segment. It is also the value used in the L|LP encoding
    of the HPTE entry.
    
    In order to support MPSS in the guest, KVM needs to handle the following:
    * Advertise MPSS via ibm,segment-page-sizes
    * Decode the base and actual page sizes correctly from the HPTE entry
      so that we know what we are dealing with in H_ENTER and can do
      the appropriate TLB invalidation in H_REMOVE and evictions.
    


>
> yes. When / if people can easily get their hands on p7/p8 bare metal 
> systems I'll be more than happy to remove 970 support as well, but for 
> now it's probably good to keep in.
>

This should handle that.

+	/*
+	 * Add 16MB MPSS support if host supports it
+	 */
+	if (linux_psize != MMU_PAGE_16M && def->penc[MMU_PAGE_16M] != -1) {
+		(*sps)->enc[1].page_shift = 24;
+		(*sps)->enc[1].pte_enc = def->penc[MMU_PAGE_16M];
+	}
 	(*sps)++;

-aneesh

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Graf May 6, 2014, 3:23 p.m. UTC | #11
On 05/06/2014 05:06 PM, Aneesh Kumar K.V wrote:
> Alexander Graf <agraf@suse.de> writes:
>
>> On 05/06/2014 11:26 AM, Benjamin Herrenschmidt wrote:
>>> On Tue, 2014-05-06 at 11:12 +0200, Alexander Graf wrote:
>>>
> .....
>
>
> I updated the commit message as below. Let me know if this is ok.
>
>      KVM: PPC: BOOK3S: HV: THP support for guest

This has nothing to do with THP.

>      
>      On recent IBM Power CPUs, while the hashed page table is looked up using
>      the page size from the segmentation hardware (i.e. the SLB), it is
>      possible to have the HPT entry indicate a larger page size.  Thus for
>      example it is possible to put a 16MB page in a 64kB segment, but since
>      the hash lookup is done using a 64kB page size, it may be necessary to
>      put multiple entries in the HPT for a single 16MB page.  This
>      capability is called mixed page-size segment (MPSS).  With MPSS,
>      there are two relevant page sizes: the base page size, which is the
>      size used in searching the HPT, and the actual page size, which is the
>      size indicated in the HPT entry. [ Note that the actual page size is
>      always >= base page size ].
>      
>      We advertise MPSS feature to guest only if the host CPU supports the
>      same. We use "ibm,segment-page-sizes" device tree node to advertise
>      the MPSS support. The penc encoding indicate whether we support
>      a specific combination of base page size and actual page size
>      in the same segment. It is also the value used in the L|LP encoding
>      of HPTE entry.
>      
>      In-order to support MPSS in guest, KVM need to handle the below details
>      * advertise MPSS via ibm,segment-page-sizes
>      * Decode the base and actual page size correctly from the HPTE entry
>        so that we know what we are dealing with in H_ENTER and and can do

Which code path exactly changes for H_ENTER?

>        the appropriate TLB invalidation in H_REMOVE and evictions.

Apart from the grammar (which is pretty broken for the part that is not 
copied from Paul) and the subject line this sounds quite reasonable.


Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Aneesh Kumar K.V May 6, 2014, 4:08 p.m. UTC | #12
Alexander Graf <agraf@suse.de> writes:

> On 05/06/2014 05:06 PM, Aneesh Kumar K.V wrote:
>> Alexander Graf <agraf@suse.de> writes:
>>
>>> On 05/06/2014 11:26 AM, Benjamin Herrenschmidt wrote:
>>>> On Tue, 2014-05-06 at 11:12 +0200, Alexander Graf wrote:
>>>>
>> .....
>>
>>
>> I updated the commit message as below. Let me know if this is ok.
>>
>>      KVM: PPC: BOOK3S: HV: THP support for guest
>
> This has nothing to do with THP.

THP support in the guest depends on KVM advertising the MPSS feature. We
already have the rest of the changes needed to support transparent huge
pages upstream. (We do support THP with PowerVM LPAR already.) The primary
motivation of this patch is to enable THP in powerkvm guests.

>
>>      
>>      On recent IBM Power CPUs, while the hashed page table is looked up using
>>      the page size from the segmentation hardware (i.e. the SLB), it is
>>      possible to have the HPT entry indicate a larger page size.  Thus for
>>      example it is possible to put a 16MB page in a 64kB segment, but since
>>      the hash lookup is done using a 64kB page size, it may be necessary to
>>      put multiple entries in the HPT for a single 16MB page.  This
>>      capability is called mixed page-size segment (MPSS).  With MPSS,
>>      there are two relevant page sizes: the base page size, which is the
>>      size used in searching the HPT, and the actual page size, which is the
>>      size indicated in the HPT entry. [ Note that the actual page size is
>>      always >= base page size ].
>>      
>>      We advertise MPSS feature to guest only if the host CPU supports the
>>      same. We use "ibm,segment-page-sizes" device tree node to advertise
>>      the MPSS support. The penc encoding indicate whether we support
>>      a specific combination of base page size and actual page size
>>      in the same segment. It is also the value used in the L|LP encoding
>>      of HPTE entry.
>>      
>>      In-order to support MPSS in guest, KVM need to handle the below details
>>      * advertise MPSS via ibm,segment-page-sizes
>>      * Decode the base and actual page size correctly from the HPTE entry
>>        so that we know what we are dealing with in H_ENTER and and can do
>
> Which code path exactly changes for H_ENTER?

There are no real code path changes. Any code path that uses
hpte_page_size() is impacted. We return the actual page size there.

>
>>        the appropriate TLB invalidation in H_REMOVE and evictions.
>
> Apart from the grammar (which is pretty broken for the part that is not 
> copied from Paul) and the subject line this sounds quite reasonable.
>

Will try to fix.

-aneesh

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Graf May 6, 2014, 4:18 p.m. UTC | #13
On 05/06/2014 06:08 PM, Aneesh Kumar K.V wrote:
> Alexander Graf <agraf@suse.de> writes:
>
>> On 05/06/2014 05:06 PM, Aneesh Kumar K.V wrote:
>>> Alexander Graf <agraf@suse.de> writes:
>>>
>>>> On 05/06/2014 11:26 AM, Benjamin Herrenschmidt wrote:
>>>>> On Tue, 2014-05-06 at 11:12 +0200, Alexander Graf wrote:
>>>>>
>>> .....
>>>
>>>
>>> I updated the commit message as below. Let me know if this is ok.
>>>
>>>       KVM: PPC: BOOK3S: HV: THP support for guest
>> This has nothing to do with THP.
> THP support in guest depend on KVM advertising MPSS feature. We already
> have rest of the changes needed to support transparent huge pages
> upstream. (We do support THP with PowerVM LPAR already). The primary
> motivation of this patch is to enable THP in powerkvm guest.

But KVM doesn't care. KVM cares about MPSS. It's like saying "Support 
fork()" in a subject line while your patch implements page faults.

>
>>>       
>>>       On recent IBM Power CPUs, while the hashed page table is looked up using
>>>       the page size from the segmentation hardware (i.e. the SLB), it is
>>>       possible to have the HPT entry indicate a larger page size.  Thus for
>>>       example it is possible to put a 16MB page in a 64kB segment, but since
>>>       the hash lookup is done using a 64kB page size, it may be necessary to
>>>       put multiple entries in the HPT for a single 16MB page.  This
>>>       capability is called mixed page-size segment (MPSS).  With MPSS,
>>>       there are two relevant page sizes: the base page size, which is the
>>>       size used in searching the HPT, and the actual page size, which is the
>>>       size indicated in the HPT entry. [ Note that the actual page size is
>>>       always >= base page size ].
>>>       
>>>       We advertise MPSS feature to guest only if the host CPU supports the
>>>       same. We use "ibm,segment-page-sizes" device tree node to advertise
>>>       the MPSS support. The penc encoding indicate whether we support
>>>       a specific combination of base page size and actual page size
>>>       in the same segment. It is also the value used in the L|LP encoding
>>>       of HPTE entry.
>>>       
>>>       In-order to support MPSS in guest, KVM need to handle the below details
>>>       * advertise MPSS via ibm,segment-page-sizes
>>>       * Decode the base and actual page size correctly from the HPTE entry
>>>         so that we know what we are dealing with in H_ENTER and and can do
>> Which code path exactly changes for H_ENTER?
> There is no real code path changes. Any code path that use
> hpte_page_size() is impacted. We return actual page size there.

Ah, I see :).

>
>>>         the appropriate TLB invalidation in H_REMOVE and evictions.
>> Apart from the grammar (which is pretty broken for the part that is not
>> copied from Paul) and the subject line this sounds quite reasonable.
>>
> Wll try to fix.

Awesome. Thanks a lot!


Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Benjamin Herrenschmidt May 6, 2014, 8:35 p.m. UTC | #14
On Tue, 2014-05-06 at 21:38 +0530, Aneesh Kumar K.V wrote:

> >> I updated the commit message as below. Let me know if this is ok.
> >>
> >>      KVM: PPC: BOOK3S: HV: THP support for guest
> >
> > This has nothing to do with THP.
> 
> THP support in guest depend on KVM advertising MPSS feature. We already
> have rest of the changes needed to support transparent huge pages
> upstream. (We do support THP with PowerVM LPAR already). The primary
> motivation of this patch is to enable THP in powerkvm guest. 

I would argue (nit picking, I know ... :-) that the subject should be
"Enable MPSS support for guests", and the description can then explain
that this allows Linux guests to use THP.

Cheers,
Ben.

> >
> >>      
> >>      On recent IBM Power CPUs, while the hashed page table is looked up using
> >>      the page size from the segmentation hardware (i.e. the SLB), it is
> >>      possible to have the HPT entry indicate a larger page size.  Thus for
> >>      example it is possible to put a 16MB page in a 64kB segment, but since
> >>      the hash lookup is done using a 64kB page size, it may be necessary to
> >>      put multiple entries in the HPT for a single 16MB page.  This
> >>      capability is called mixed page-size segment (MPSS).  With MPSS,
> >>      there are two relevant page sizes: the base page size, which is the
> >>      size used in searching the HPT, and the actual page size, which is the
> >>      size indicated in the HPT entry. [ Note that the actual page size is
> >>      always >= base page size ].
> >>      
> >>      We advertise MPSS feature to guest only if the host CPU supports the
> >>      same. We use "ibm,segment-page-sizes" device tree node to advertise
> >>      the MPSS support. The penc encoding indicate whether we support
> >>      a specific combination of base page size and actual page size
> >>      in the same segment. It is also the value used in the L|LP encoding
> >>      of HPTE entry.
> >>      
> >>      In-order to support MPSS in guest, KVM need to handle the below details
> >>      * advertise MPSS via ibm,segment-page-sizes
> >>      * Decode the base and actual page size correctly from the HPTE entry
> >>        so that we know what we are dealing with in H_ENTER and and can do
> >
> > Which code path exactly changes for H_ENTER?
> 
> There is no real code path changes. Any code path that use
> hpte_page_size() is impacted. We return actual page size there. 
> 
> >
> >>        the appropriate TLB invalidation in H_REMOVE and evictions.
> >
> > Apart from the grammar (which is pretty broken for the part that is not 
> > copied from Paul) and the subject line this sounds quite reasonable.
> >
> 
> Wll try to fix.
> 
> -aneesh


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 51388befeddb..f03ea8f90576 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -77,34 +77,122 @@  static inline long try_lock_hpte(unsigned long *hpte, unsigned long bits)
 	return old == 0;
 }
 
+static inline int __hpte_actual_psize(unsigned int lp, int psize)
+{
+	int i, shift;
+	unsigned int mask;
+
+	/* start from 1 ignoring MMU_PAGE_4K */
+	for (i = 1; i < MMU_PAGE_COUNT; i++) {
+
+		/* invalid penc */
+		if (mmu_psize_defs[psize].penc[i] == -1)
+			continue;
+		/*
+		 * encoding bits per actual page size
+		 *        PTE LP     actual page size
+		 *    rrrr rrrz		>=8KB
+		 *    rrrr rrzz		>=16KB
+		 *    rrrr rzzz		>=32KB
+		 *    rrrr zzzz		>=64KB
+		 * .......
+		 */
+		shift = mmu_psize_defs[i].shift - LP_SHIFT;
+		if (shift > LP_BITS)
+			shift = LP_BITS;
+		mask = (1 << shift) - 1;
+		if ((lp & mask) == mmu_psize_defs[psize].penc[i])
+			return i;
+	}
+	return -1;
+}
+
 static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
 					     unsigned long pte_index)
 {
-	unsigned long rb, va_low;
+	int b_size, a_size;
+	unsigned int penc;
+	unsigned long rb = 0, va_low, sllp;
+	unsigned int lp = (r >> LP_SHIFT) & ((1 << LP_BITS) - 1);
+
+	if (!(v & HPTE_V_LARGE)) {
+		/* both base and actual psize is 4k */
+		b_size = MMU_PAGE_4K;
+		a_size = MMU_PAGE_4K;
+	} else {
+		for (b_size = 0; b_size < MMU_PAGE_COUNT; b_size++) {
+
+			/* valid entries have a shift value */
+			if (!mmu_psize_defs[b_size].shift)
+				continue;
 
+			a_size = __hpte_actual_psize(lp, b_size);
+			if (a_size != -1)
+				break;
+		}
+	}
+	/*
+	 * Ignore the top 14 bits of va
+	 * v have top two bits covering segment size, hence move
+	 * by 16 bits, Also clear the lower HPTE_V_AVPN_SHIFT (7) bits.
+	 * AVA field in v also have the lower 23 bits ignored.
+	 * For base page size 4K we need 14 .. 65 bits (so need to
+	 * collect extra 11 bits)
+	 * For others we need 14..14+i
+	 */
+	/* This covers 14..54 bits of va*/
 	rb = (v & ~0x7fUL) << 16;		/* AVA field */
+	/*
+	 * AVA in v had cleared lower 23 bits. We need to derive
+	 * that from pteg index
+	 */
 	va_low = pte_index >> 3;
 	if (v & HPTE_V_SECONDARY)
 		va_low = ~va_low;
-	/* xor vsid from AVA */
+	/*
+	 * get the vpn bits from va_low using reverse of hashing.
+	 * In v we have va with 23 bits dropped and then left shifted
+	 * HPTE_V_AVPN_SHIFT (7) bits. Now to find vsid we need
+	 * right shift it with (SID_SHIFT - (23 - 7))
+	 */
 	if (!(v & HPTE_V_1TB_SEG))
-		va_low ^= v >> 12;
+		va_low ^= v >> (SID_SHIFT - 16);
 	else
-		va_low ^= v >> 24;
+		va_low ^= v >> (SID_SHIFT_1T - 16);
 	va_low &= 0x7ff;
-	if (v & HPTE_V_LARGE) {
-		rb |= 1;			/* L field */
-		if (cpu_has_feature(CPU_FTR_ARCH_206) &&
-		    (r & 0xff000)) {
-			/* non-16MB large page, must be 64k */
-			/* (masks depend on page size) */
-			rb |= 0x1000;		/* page encoding in LP field */
-			rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */
-			rb |= ((va_low << 4) & 0xf0);	/* AVAL field (P7 doesn't seem to care) */
-		}
-	} else {
-		/* 4kB page */
-		rb |= (va_low & 0x7ff) << 12;	/* remaining 11b of VA */
+
+	switch (b_size) {
+	case MMU_PAGE_4K:
+		sllp = ((mmu_psize_defs[a_size].sllp & SLB_VSID_L) >> 6) |
+			((mmu_psize_defs[a_size].sllp & SLB_VSID_LP) >> 4);
+		rb |= sllp << 5;	/*  AP field */
+		rb |= (va_low & 0x7ff) << 12;	/* remaining 11 bits of AVA */
+		break;
+	default:
+	{
+		int aval_shift;
+		/*
+		 * remaining 7bits of AVA/LP fields
+		 * Also contain the rr bits of LP
+		 */
+		rb |= (va_low & 0x7f) << 16;
+		/*
+		 * Now clear not needed LP bits based on actual psize
+		 */
+		rb &= ~((1ul << mmu_psize_defs[a_size].shift) - 1);
+		/*
+		 * AVAL field 58..77 - base_page_shift bits of va
+		 * we have space for 58..64 bits, Missing bits should
+		 * be zero filled. +1 is to take care of L bit shift
+		 */
+		aval_shift = 64 - (77 - mmu_psize_defs[b_size].shift) + 1;
+		rb |= ((va_low << aval_shift) & 0xfe);
+
+		rb |= 1;		/* L field */
+		penc = mmu_psize_defs[b_size].penc[a_size];
+		rb |= penc << 12;	/* LP field */
+		break;
+	}
 	}
 	rb |= (v >> 54) & 0x300;		/* B field */
 	return rb;
@@ -112,14 +200,26 @@  static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
 
 static inline unsigned long hpte_page_size(unsigned long h, unsigned long l)
 {
+	int size, a_size;
+	/* Look at the 8 bit LP value */
+	unsigned int lp = (l >> LP_SHIFT) & ((1 << LP_BITS) - 1);
+
 	/* only handle 4k, 64k and 16M pages for now */
 	if (!(h & HPTE_V_LARGE))
-		return 1ul << 12;		/* 4k page */
-	if ((l & 0xf000) == 0x1000 && cpu_has_feature(CPU_FTR_ARCH_206))
-		return 1ul << 16;		/* 64k page */
-	if ((l & 0xff000) == 0)
-		return 1ul << 24;		/* 16M page */
-	return 0;				/* error */
+		return 1ul << 12;
+	else {
+		for (size = 0; size < MMU_PAGE_COUNT; size++) {
+			/* valid entries have a shift value */
+			if (!mmu_psize_defs[size].shift)
+				continue;
+
+			a_size = __hpte_actual_psize(lp, size);
+			if (a_size != -1)
+				return 1ul << mmu_psize_defs[a_size].shift;
+		}
+
+	}
+	return 0;
 }
 
 static inline unsigned long hpte_rpn(unsigned long ptel, unsigned long psize)
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 8227dba5af0f..a38d3289320a 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1949,6 +1949,13 @@  static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps,
 	 * support pte_enc here
 	 */
 	(*sps)->enc[0].pte_enc = def->penc[linux_psize];
+	/*
+	 * Add 16MB MPSS support
+	 */
+	if (linux_psize != MMU_PAGE_16M) {
+		(*sps)->enc[1].page_shift = 24;
+		(*sps)->enc[1].pte_enc = def->penc[MMU_PAGE_16M];
+	}
 	(*sps)++;
 }