diff mbox series

[v3,26/32] KVM: arm64: Introduce PROT_NONE mappings for stage 2

Message ID 20210302150002.3685113-27-qperret@google.com (mailing list archive)
State New, archived
Headers show
Series KVM: arm64: A stage 2 for the host | expand

Commit Message

Quentin Perret March 2, 2021, 2:59 p.m. UTC
Once we start unmapping portions of memory from the host stage 2 (such
as e.g. the hypervisor memory sections, or pages that belong to
protected guests), we will need a way to track page ownership. And
given that all mappings in the host stage 2 will be identity-mapped, we
can use the host stage 2 page-table itself as a simplistic rmap.

As a first step towards this, introduce a new protection attribute
in the stage 2 page table code, called KVM_PGTABLE_PROT_NONE, which
allows to annotate portions of the IPA space as inaccessible. For
simplicity, PROT_NONE mappings are created as invalid mappings with a
software bit set.

Signed-off-by: Quentin Perret <qperret@google.com>
---
 arch/arm64/include/asm/kvm_pgtable.h |  2 ++
 arch/arm64/kvm/hyp/pgtable.c         | 26 ++++++++++++++++++++++++--
 2 files changed, 26 insertions(+), 2 deletions(-)

Comments

Will Deacon March 4, 2021, 8 p.m. UTC | #1
On Tue, Mar 02, 2021 at 02:59:56PM +0000, Quentin Perret wrote:
> Once we start unmapping portions of memory from the host stage 2 (such
> as e.g. the hypervisor memory sections, or pages that belong to
> protected guests), we will need a way to track page ownership. And
> given that all mappings in the host stage 2 will be identity-mapped, we
> can use the host stage 2 page-table itself as a simplistic rmap.
> 
> As a first step towards this, introduce a new protection attribute
> in the stage 2 page table code, called KVM_PGTABLE_PROT_NONE, which
> allows to annotate portions of the IPA space as inaccessible. For
> simplicity, PROT_NONE mappings are created as invalid mappings with a
> software bit set.

Just as an observation, but given that they're invalid we can use any bit
from [63:2] to indicate that it's a PROT_NONE mapping, and that way we
can keep the real "software bits" for live mappings.

But we can of course change that later when we need the bit for something
else.

> 
> Signed-off-by: Quentin Perret <qperret@google.com>
> ---
>  arch/arm64/include/asm/kvm_pgtable.h |  2 ++
>  arch/arm64/kvm/hyp/pgtable.c         | 26 ++++++++++++++++++++++++--
>  2 files changed, 26 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
> index 9935dbae2cc1..c9f6ed76e0ad 100644
> --- a/arch/arm64/include/asm/kvm_pgtable.h
> +++ b/arch/arm64/include/asm/kvm_pgtable.h
> @@ -80,6 +80,7 @@ struct kvm_pgtable {
>   * @KVM_PGTABLE_PROT_W:		Write permission.
>   * @KVM_PGTABLE_PROT_R:		Read permission.
>   * @KVM_PGTABLE_PROT_DEVICE:	Device attributes.
> + * @KVM_PGTABLE_PROT_NONE:	No permission.
>   */
>  enum kvm_pgtable_prot {
>  	KVM_PGTABLE_PROT_X			= BIT(0),
> @@ -87,6 +88,7 @@ enum kvm_pgtable_prot {
>  	KVM_PGTABLE_PROT_R			= BIT(2),
>  
>  	KVM_PGTABLE_PROT_DEVICE			= BIT(3),
> +	KVM_PGTABLE_PROT_NONE			= BIT(4),

Why do we need an extra entry here? Couldn't we just create PROT_NONE
entries when none of R,W or X are set?

>  };
>  
>  #define PAGE_HYP		(KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W)
> diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> index bdd6e3d4eeb6..8e7059fcfd40 100644
> --- a/arch/arm64/kvm/hyp/pgtable.c
> +++ b/arch/arm64/kvm/hyp/pgtable.c
> @@ -48,6 +48,8 @@
>  					 KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
>  					 KVM_PTE_LEAF_ATTR_HI_S2_XN)
>  
> +#define KVM_PTE_LEAF_SW_BIT_PROT_NONE	BIT(55)
> +
>  struct kvm_pgtable_walk_data {
>  	struct kvm_pgtable		*pgt;
>  	struct kvm_pgtable_walker	*walker;
> @@ -120,6 +122,16 @@ static bool kvm_pte_valid(kvm_pte_t pte)
>  	return pte & KVM_PTE_VALID;
>  }
>  
> +static bool kvm_pte_prot_none(kvm_pte_t pte)
> +{
> +	return pte & KVM_PTE_LEAF_SW_BIT_PROT_NONE;
> +}

I think it would be a good idea to check !kvm_pte_valid() in here too,
since it doesn't make sense to report true for valid (or table) entries.

Will
Quentin Perret March 5, 2021, 9:52 a.m. UTC | #2
On Thursday 04 Mar 2021 at 20:00:45 (+0000), Will Deacon wrote:
> On Tue, Mar 02, 2021 at 02:59:56PM +0000, Quentin Perret wrote:
> > Once we start unmapping portions of memory from the host stage 2 (such
> > as e.g. the hypervisor memory sections, or pages that belong to
> > protected guests), we will need a way to track page ownership. And
> > given that all mappings in the host stage 2 will be identity-mapped, we
> > can use the host stage 2 page-table itself as a simplistic rmap.
> > 
> > As a first step towards this, introduce a new protection attribute
> > in the stage 2 page table code, called KVM_PGTABLE_PROT_NONE, which
> > allows to annotate portions of the IPA space as inaccessible. For
> > simplicity, PROT_NONE mappings are created as invalid mappings with a
> > software bit set.
> 
> Just as an observation, but given that they're invalid we can use any bit
> from [63:2] to indicate that it's a PROT_NONE mapping, and that way we
> can keep the real "software bits" for live mappings.
> 
> But we can of course change that later when we need the bit for something
> else.

Right, so I used this approach for consistency with the kernel's
PROT_NONE mappings:

	#define PTE_PROT_NONE	(_AT(pteval_t, 1) << 58) /* only when !PTE_VALID */

And in fact now that I think about it, it might be worth re-using the
same bit in stage 2.

But yes it would be pretty neat to use the other bits of invalid
mappings to add metadata about the pages. I could even drop the
PROT_NONE stuff straight away in favor of a more extensive mechanism for
tracking page ownership...

Thinking about it, it should be relatively straightforward to construct
the host stage 2 with the following invariants:

  1. everything is identity-mapped in the host stage 2;

  2. all valid mappings imply the underlying PA range belongs to the
     host;

  3. bits [63:32] (say) of all invalid mappings are used to store a
     unique identifier for the owner of the underlying PA range;

  4. the host identifier is 0, such that it owns all of memory by
     default at boot as its pgd is zeroed;

And then I could replace my PROT_NONE permission stuff by an ownership
change. E.g. the hypervisor would have its own identifier, and I could
use it to mark the .hyp memory sections as owned by the hyp (which
implies inaccessible by the host). And that should scale quite easily
when we start running protected guests as we'll assign them their own
identifiers. Sharing pages between guests (or even worse, between host
and guests) is a bit trickier, but maybe that is for later.

Thoughts?

> > 
> > Signed-off-by: Quentin Perret <qperret@google.com>
> > ---
> >  arch/arm64/include/asm/kvm_pgtable.h |  2 ++
> >  arch/arm64/kvm/hyp/pgtable.c         | 26 ++++++++++++++++++++++++--
> >  2 files changed, 26 insertions(+), 2 deletions(-)
> > 
> > diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
> > index 9935dbae2cc1..c9f6ed76e0ad 100644
> > --- a/arch/arm64/include/asm/kvm_pgtable.h
> > +++ b/arch/arm64/include/asm/kvm_pgtable.h
> > @@ -80,6 +80,7 @@ struct kvm_pgtable {
> >   * @KVM_PGTABLE_PROT_W:		Write permission.
> >   * @KVM_PGTABLE_PROT_R:		Read permission.
> >   * @KVM_PGTABLE_PROT_DEVICE:	Device attributes.
> > + * @KVM_PGTABLE_PROT_NONE:	No permission.
> >   */
> >  enum kvm_pgtable_prot {
> >  	KVM_PGTABLE_PROT_X			= BIT(0),
> > @@ -87,6 +88,7 @@ enum kvm_pgtable_prot {
> >  	KVM_PGTABLE_PROT_R			= BIT(2),
> >  
> >  	KVM_PGTABLE_PROT_DEVICE			= BIT(3),
> > +	KVM_PGTABLE_PROT_NONE			= BIT(4),
> 
> Why do we need an extra entry here? Couldn't we just create PROT_NONE
> entries when none of R,W or X are set?

The kernel has an explicit PAGE_NONE permission, so I followed the same
idea, but that could work as well. Now, as per the above discussion that
may not be relevant if we implement the page ownership thing.

> >  };
> >  
> >  #define PAGE_HYP		(KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W)
> > diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> > index bdd6e3d4eeb6..8e7059fcfd40 100644
> > --- a/arch/arm64/kvm/hyp/pgtable.c
> > +++ b/arch/arm64/kvm/hyp/pgtable.c
> > @@ -48,6 +48,8 @@
> >  					 KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
> >  					 KVM_PTE_LEAF_ATTR_HI_S2_XN)
> >  
> > +#define KVM_PTE_LEAF_SW_BIT_PROT_NONE	BIT(55)
> > +
> >  struct kvm_pgtable_walk_data {
> >  	struct kvm_pgtable		*pgt;
> >  	struct kvm_pgtable_walker	*walker;
> > @@ -120,6 +122,16 @@ static bool kvm_pte_valid(kvm_pte_t pte)
> >  	return pte & KVM_PTE_VALID;
> >  }
> >  
> > +static bool kvm_pte_prot_none(kvm_pte_t pte)
> > +{
> > +	return pte & KVM_PTE_LEAF_SW_BIT_PROT_NONE;
> > +}
> 
> I think it would be a good idea to check !kvm_pte_valid() in here too,
> since it doesn't make sense to report true for valid (or table) entries.

Ack.

Thanks,
Quentin
Will Deacon March 5, 2021, 7:03 p.m. UTC | #3
On Fri, Mar 05, 2021 at 09:52:12AM +0000, Quentin Perret wrote:
> On Thursday 04 Mar 2021 at 20:00:45 (+0000), Will Deacon wrote:
> > On Tue, Mar 02, 2021 at 02:59:56PM +0000, Quentin Perret wrote:
> > > Once we start unmapping portions of memory from the host stage 2 (such
> > > as e.g. the hypervisor memory sections, or pages that belong to
> > > protected guests), we will need a way to track page ownership. And
> > > given that all mappings in the host stage 2 will be identity-mapped, we
> > > can use the host stage 2 page-table itself as a simplistic rmap.
> > > 
> > > As a first step towards this, introduce a new protection attribute
> > > in the stage 2 page table code, called KVM_PGTABLE_PROT_NONE, which
> > > allows to annotate portions of the IPA space as inaccessible. For
> > > simplicity, PROT_NONE mappings are created as invalid mappings with a
> > > software bit set.
> > 
> > Just as an observation, but given that they're invalid we can use any bit
> > from [63:2] to indicate that it's a PROT_NONE mapping, and that way we
> > can keep the real "software bits" for live mappings.
> > 
> > But we can of course change that later when we need the bit for something
> > else.
> 
> Right, so I used this approach for consistency with the kernel's
> PROT_NONE mappings:
> 
> 	#define PTE_PROT_NONE	(_AT(pteval_t, 1) << 58) /* only when !PTE_VALID */
> 
> And in fact now that I think about it, it might be worth re-using the
> same bit in stage 2.
> 
> But yes it would be pretty neat to use the other bits of invalid
> mappings to add metadata about the pages. I could even drop the
> PROT_NONE stuff straight away in favor of a more extensive mechanism for
> tracking page ownership...
> 
> Thinking about it, it should be relatively straightforward to construct
> the host stage 2 with the following invariants:
> 
>   1. everything is identity-mapped in the host stage 2;
> 
>   2. all valid mappings imply the underlying PA range belongs to the
>      host;
> 
>   3. bits [63:32] (say) of all invalid mappings are used to store a
>      unique identifier for the owner of the underlying PA range;
> 
>   4. the host identifier is 0, such that it owns all of memory by
>      default at boot as its pgd is zeroed;
> 
> And then I could replace my PROT_NONE permission stuff by an ownership
> change. E.g. the hypervisor would have its own identifier, and I could
> use it to mark the .hyp memory sections as owned by the hyp (which
> implies inaccessible by the host). And that should scale quite easily
> when we start running protected guests as we'll assign them their own
> identifiers. Sharing pages between guests (or even worse, between host
> and guests) is a bit trickier, but maybe that is for later.
> 
> Thoughts?

I think this sounds like a worthwhile generalisation to me, although virtio
brings an immediate need for shared pages and so we'll still need a software
bit for those so that we e.g. prevent the host from donating such a shared
page to the hypervisor.

Will
diff mbox series

Patch

diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index 9935dbae2cc1..c9f6ed76e0ad 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -80,6 +80,7 @@  struct kvm_pgtable {
  * @KVM_PGTABLE_PROT_W:		Write permission.
  * @KVM_PGTABLE_PROT_R:		Read permission.
  * @KVM_PGTABLE_PROT_DEVICE:	Device attributes.
+ * @KVM_PGTABLE_PROT_NONE:	No permission.
  */
 enum kvm_pgtable_prot {
 	KVM_PGTABLE_PROT_X			= BIT(0),
@@ -87,6 +88,7 @@  enum kvm_pgtable_prot {
 	KVM_PGTABLE_PROT_R			= BIT(2),
 
 	KVM_PGTABLE_PROT_DEVICE			= BIT(3),
+	KVM_PGTABLE_PROT_NONE			= BIT(4),
 };
 
 #define PAGE_HYP		(KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W)
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index bdd6e3d4eeb6..8e7059fcfd40 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -48,6 +48,8 @@ 
 					 KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
 					 KVM_PTE_LEAF_ATTR_HI_S2_XN)
 
+#define KVM_PTE_LEAF_SW_BIT_PROT_NONE	BIT(55)
+
 struct kvm_pgtable_walk_data {
 	struct kvm_pgtable		*pgt;
 	struct kvm_pgtable_walker	*walker;
@@ -120,6 +122,16 @@  static bool kvm_pte_valid(kvm_pte_t pte)
 	return pte & KVM_PTE_VALID;
 }
 
+static bool kvm_pte_prot_none(kvm_pte_t pte)
+{
+	return pte & KVM_PTE_LEAF_SW_BIT_PROT_NONE;
+}
+
+static inline bool stage2_is_permanent_mapping(kvm_pte_t pte)
+{
+	return kvm_pte_prot_none(pte);
+}
+
 static bool kvm_pte_table(kvm_pte_t pte, u32 level)
 {
 	if (level == KVM_PGTABLE_MAX_LEVELS - 1)
@@ -182,7 +194,8 @@  static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, u32 level)
 
 	pte |= attr & (KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI);
 	pte |= FIELD_PREP(KVM_PTE_TYPE, type);
-	pte |= KVM_PTE_VALID;
+	if (!kvm_pte_prot_none(pte))
+		pte |= KVM_PTE_VALID;
 
 	return pte;
 }
@@ -317,7 +330,7 @@  static int hyp_map_set_prot_attr(enum kvm_pgtable_prot prot,
 	u32 ap = (prot & KVM_PGTABLE_PROT_W) ? KVM_PTE_LEAF_ATTR_LO_S1_AP_RW :
 					       KVM_PTE_LEAF_ATTR_LO_S1_AP_RO;
 
-	if (!(prot & KVM_PGTABLE_PROT_R))
+	if (!(prot & KVM_PGTABLE_PROT_R) || (prot & KVM_PGTABLE_PROT_NONE))
 		return -EINVAL;
 
 	if (prot & KVM_PGTABLE_PROT_X) {
@@ -489,6 +502,13 @@  static int stage2_map_set_prot_attr(enum kvm_pgtable_prot prot,
 			    PAGE_S2_MEMATTR(NORMAL);
 	u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS;
 
+	if (prot & KVM_PGTABLE_PROT_NONE) {
+		if (prot != KVM_PGTABLE_PROT_NONE)
+			return -EINVAL;
+		attr |= KVM_PTE_LEAF_SW_BIT_PROT_NONE;
+		goto out;
+	}
+
 	if (!(prot & KVM_PGTABLE_PROT_X))
 		attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
 	else if (device)
@@ -502,6 +522,8 @@  static int stage2_map_set_prot_attr(enum kvm_pgtable_prot prot,
 
 	attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh);
 	attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF;
+
+out:
 	data->attr = attr;
 	return 0;
 }