new file mode 100644
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Google LLC
+ * Author: Will Deacon <will@kernel.org>
+ */
+
+#ifndef __ARM64_KVM_PGTABLE_H__
+#define __ARM64_KVM_PGTABLE_H__
+
+#include <linux/bits.h>
+#include <linux/kvm_host.h>
+#include <linux/types.h>
+
+typedef u64 kvm_pte_t;
+
+/**
+ * struct kvm_pgtable - KVM page-table.
+ * @ia_bits: Maximum input address size, in bits.
+ * @start_level: Level at which the page-table walk starts.
+ * @pgd: Pointer to the first top-level entry of the page-table.
+ * @mmu: Stage-2 KVM MMU struct. Unused for stage-1 page-tables.
+ */
+struct kvm_pgtable {
+ u32 ia_bits;
+ u32 start_level;
+ kvm_pte_t *pgd;
+
+ /* Stage-2 only */
+ struct kvm_s2_mmu *mmu;
+};
+
+/**
+ * enum kvm_pgtable_prot - Page-table permissions and attributes.
+ * @KVM_PGTABLE_PROT_R: Read permission.
+ * @KVM_PGTABLE_PROT_W: Write permission.
+ * @KVM_PGTABLE_PROT_X: Execute permission.
+ * @KVM_PGTABLE_PROT_DEVICE: Device attributes.
+ */
+enum kvm_pgtable_prot {
+ KVM_PGTABLE_PROT_R = BIT(0),
+ KVM_PGTABLE_PROT_W = BIT(1),
+ KVM_PGTABLE_PROT_X = BIT(2),
+
+ KVM_PGTABLE_PROT_DEVICE = BIT(3),
+};
+
+/**
+ * enum kvm_pgtable_walk_flags - Flags to control a depth-first page-table walk.
+ * @KVM_PGTABLE_WALK_LEAF: Visit leaf entries, including invalid
+ * entries.
+ * @KVM_PGTABLE_WALK_TABLE_PRE: Visit table entries before their
+ * children.
+ * @KVM_PGTABLE_WALK_TABLE_POST: Visit table entries after their
+ * children.
+ */
+enum kvm_pgtable_walk_flags {
+ KVM_PGTABLE_WALK_LEAF = BIT(0),
+ KVM_PGTABLE_WALK_TABLE_PRE = BIT(1),
+ KVM_PGTABLE_WALK_TABLE_POST = BIT(2),
+};
+
+typedef int (*kvm_pgtable_visitor_fn_t)(u64 addr, u64 end, u32 level,
+ kvm_pte_t *ptep,
+ enum kvm_pgtable_walk_flags flag,
+ void * const arg);
+
+/**
+ * struct kvm_pgtable_walker - Hook into a page-table walk.
+ * @cb: Callback function to invoke during the walk.
+ * @arg: Argument passed to the callback function.
+ * @flags: Bitwise-OR of flags to identify the entry types on which to
+ * invoke the callback function.
+ */
+struct kvm_pgtable_walker {
+ const kvm_pgtable_visitor_fn_t cb;
+ void * const arg;
+ const enum kvm_pgtable_walk_flags flags;
+};
+
+/**
+ * kvm_pgtable_walk() - Walk a page-table.
+ * @pgt: Page-table structure initialised by kvm_pgtable_*_init().
+ * @addr: Input address for the start of the walk.
+ * @size: Size of the range to walk.
+ * @walker: Walker callback description.
+ *
+ * The walker will walk the page-table entries corresponding to the input
+ * address range specified, visiting entries according to the walker flags.
+ * Invalid entries are treated as leaf entries. Leaf entries are reloaded
+ * after invoking the walker callback, allowing the walker to descend into
+ * a newly installed table.
+ *
+ * Returning a negative error code from the walker callback function will
+ * terminate the walk immediately with the same error code.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
+ struct kvm_pgtable_walker *walker);
+
+#endif /* __ARM64_KVM_PGTABLE_H__ */
@@ -10,5 +10,5 @@ subdir-ccflags-y := -I$(incdir) \
-DDISABLE_BRANCH_PROFILING \
$(DISABLE_STACKLEAK_PLUGIN)
-obj-$(CONFIG_KVM) += vhe/ nvhe/
+obj-$(CONFIG_KVM) += vhe/ nvhe/ pgtable.o
obj-$(CONFIG_KVM_INDIRECT_VECTORS) += smccc_wa.o
new file mode 100644
@@ -0,0 +1,290 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Stand-alone page-table allocator for hyp stage-1 and guest stage-2.
+ * No bombay mix was harmed in the writing of this file.
+ *
+ * Copyright (C) 2020 Google LLC
+ * Author: Will Deacon <will@kernel.org>
+ */
+
+#include <linux/bitfield.h>
+#include <asm/kvm_pgtable.h>
+
+#define KVM_PGTABLE_MAX_LEVELS 4U
+
+#define KVM_PTE_VALID BIT(0)
+
+#define KVM_PTE_TYPE BIT(1)
+#define KVM_PTE_TYPE_BLOCK 0
+#define KVM_PTE_TYPE_PAGE 1
+#define KVM_PTE_TYPE_TABLE 1
+
+#define KVM_PTE_ADDR_MASK GENMASK(47, PAGE_SHIFT)
+#define KVM_PTE_ADDR_51_48 GENMASK(15, 12)
+
+#define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2)
+
+#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 51)
+
+struct kvm_pgtable_walk_data {
+ struct kvm_pgtable *pgt;
+ struct kvm_pgtable_walker *walker;
+
+ u64 addr;
+ u64 end;
+};
+
+static u64 kvm_granule_shift(u32 level)
+{
+ return (KVM_PGTABLE_MAX_LEVELS - level) * (PAGE_SHIFT - 3) + 3;
+}
+
+static u64 kvm_granule_size(u32 level)
+{
+ return BIT(kvm_granule_shift(level));
+}
+
+static bool kvm_block_mapping_supported(u64 addr, u64 end, u64 phys, u32 level)
+{
+ u64 granule = kvm_granule_size(level);
+
+ /*
+ * Reject invalid block mappings and don't bother with 4TB mappings for
+ * 52-bit PAs.
+ */
+ if (level == 0 || (PAGE_SIZE != SZ_4K && level == 1))
+ return false;
+
+ if (granule > (end - addr))
+ return false;
+
+ return IS_ALIGNED(addr, granule) && IS_ALIGNED(phys, granule);
+}
+
+static u32 kvm_start_level(u64 ia_bits)
+{
+ u64 levels = DIV_ROUND_UP(ia_bits - PAGE_SHIFT, PAGE_SHIFT - 3);
+ return KVM_PGTABLE_MAX_LEVELS - levels;
+}
+
+static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level)
+{
+ u64 shift = kvm_granule_shift(level);
+ u64 mask = BIT(PAGE_SHIFT - 3) - 1;
+
+ return (data->addr >> shift) & mask;
+}
+
+static u32 __kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr)
+{
+ u64 shift = kvm_granule_shift(pgt->start_level - 1); /* May underflow */
+ u64 mask = BIT(pgt->ia_bits) - 1;
+
+ return (addr & mask) >> shift;
+}
+
+static u32 kvm_pgd_page_idx(struct kvm_pgtable_walk_data *data)
+{
+ return __kvm_pgd_page_idx(data->pgt, data->addr);
+}
+
+static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level)
+{
+ struct kvm_pgtable pgt = {
+ .ia_bits = ia_bits,
+ .start_level = start_level,
+ };
+
+ return __kvm_pgd_page_idx(&pgt, -1ULL) + 1;
+}
+
+static bool kvm_pte_valid(kvm_pte_t pte)
+{
+ return pte & KVM_PTE_VALID;
+}
+
+static bool kvm_pte_table(kvm_pte_t pte, u32 level)
+{
+ if (level == KVM_PGTABLE_MAX_LEVELS - 1)
+ return false;
+
+ if (!kvm_pte_valid(pte))
+ return false;
+
+ return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE;
+}
+
+static u64 kvm_pte_to_phys(kvm_pte_t pte)
+{
+ u64 pa = pte & KVM_PTE_ADDR_MASK;
+
+ if (PAGE_SHIFT == 16)
+ pa |= FIELD_GET(KVM_PTE_ADDR_51_48, pte) << 48;
+
+ return pa;
+}
+
+static kvm_pte_t kvm_phys_to_pte(u64 pa)
+{
+ kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK;
+
+ if (PAGE_SHIFT == 16)
+ pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48);
+
+ return pte;
+}
+
+static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte)
+{
+ return __va(kvm_pte_to_phys(pte));
+}
+
+static void kvm_set_invalid_pte(kvm_pte_t *ptep)
+{
+ kvm_pte_t pte = 0;
+ WRITE_ONCE(*ptep, pte);
+}
+
+static void kvm_set_table_pte(kvm_pte_t *ptep, kvm_pte_t *childp)
+{
+ kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(__pa(childp));
+
+ pte |= FIELD_PREP(KVM_PTE_TYPE, KVM_PTE_TYPE_TABLE);
+ pte |= KVM_PTE_VALID;
+
+ WARN_ON(kvm_pte_valid(old));
+ smp_store_release(ptep, pte);
+}
+
+static bool kvm_set_valid_leaf_pte(kvm_pte_t *ptep, u64 pa, kvm_pte_t attr,
+ u32 level)
+{
+ kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(pa);
+ u64 type = (level == KVM_PGTABLE_MAX_LEVELS - 1) ? KVM_PTE_TYPE_PAGE :
+ KVM_PTE_TYPE_BLOCK;
+
+ pte |= attr & (KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI);
+ pte |= FIELD_PREP(KVM_PTE_TYPE, type);
+ pte |= KVM_PTE_VALID;
+
+ /* Tolerate KVM recreating the exact same mapping. */
+ if (kvm_pte_valid(old))
+ return old == pte;
+
+ smp_store_release(ptep, pte);
+ return true;
+}
+
+static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, u64 addr,
+ u32 level, kvm_pte_t *ptep,
+ enum kvm_pgtable_walk_flags flag)
+{
+ struct kvm_pgtable_walker *walker = data->walker;
+ return walker->cb(addr, data->end, level, ptep, flag, walker->arg);
+}
+
+static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
+ kvm_pte_t *pgtable, u32 level);
+
+static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
+ kvm_pte_t *ptep, u32 level)
+{
+ int ret = 0;
+ u64 addr = data->addr;
+ kvm_pte_t *childp, pte = *ptep;
+ bool table = kvm_pte_table(pte, level);
+ enum kvm_pgtable_walk_flags flags = data->walker->flags;
+
+ if (table && (flags & KVM_PGTABLE_WALK_TABLE_PRE)) {
+ ret = kvm_pgtable_visitor_cb(data, addr, level, ptep,
+ KVM_PGTABLE_WALK_TABLE_PRE);
+ }
+
+ if (!table && (flags & KVM_PGTABLE_WALK_LEAF)) {
+ ret = kvm_pgtable_visitor_cb(data, addr, level, ptep,
+ KVM_PGTABLE_WALK_LEAF);
+ pte = *ptep;
+ table = kvm_pte_table(pte, level);
+ }
+
+ if (ret)
+ goto out;
+
+ if (!table) {
+ data->addr += kvm_granule_size(level);
+ goto out;
+ }
+
+ childp = kvm_pte_follow(pte);
+ ret = __kvm_pgtable_walk(data, childp, level + 1);
+ if (ret)
+ goto out;
+
+ if (flags & KVM_PGTABLE_WALK_TABLE_POST) {
+ ret = kvm_pgtable_visitor_cb(data, addr, level, ptep,
+ KVM_PGTABLE_WALK_TABLE_POST);
+ }
+
+out:
+ return ret;
+}
+
+static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
+ kvm_pte_t *pgtable, u32 level)
+{
+ u32 idx;
+ int ret = 0;
+
+ if (WARN_ON_ONCE(level >= KVM_PGTABLE_MAX_LEVELS))
+ return -EINVAL;
+
+ for (idx = kvm_pgtable_idx(data, level); idx < PTRS_PER_PTE; ++idx) {
+ kvm_pte_t *ptep = &pgtable[idx];
+
+ if (data->addr >= data->end)
+ break;
+
+ ret = __kvm_pgtable_visit(data, ptep, level);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+static int _kvm_pgtable_walk(struct kvm_pgtable_walk_data *data)
+{
+ u32 idx;
+ int ret = 0;
+ struct kvm_pgtable *pgt = data->pgt;
+ u64 limit = BIT(pgt->ia_bits);
+
+ if (data->addr > limit || data->end > limit)
+ return -ERANGE;
+
+ if (!pgt->pgd)
+ return -EINVAL;
+
+ for (idx = kvm_pgd_page_idx(data); data->addr < data->end; ++idx) {
+ kvm_pte_t *ptep = &pgt->pgd[idx * PTRS_PER_PTE];
+
+ ret = __kvm_pgtable_walk(data, ptep, pgt->start_level);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
+ struct kvm_pgtable_walker *walker)
+{
+ struct kvm_pgtable_walk_data walk_data = {
+ .pgt = pgt,
+ .addr = ALIGN_DOWN(addr, PAGE_SIZE),
+ .end = PAGE_ALIGN(walk_data.addr + size),
+ .walker = walker,
+ };
+
+ return _kvm_pgtable_walk(&walk_data);
+}
The KVM page-table code is intricately tied into the kernel page-table code and re-uses the pte/pmd/pud/p4d/pgd macros directly in an attempt to reduce code duplication. Unfortunately, the reality is that there is an awful lot of code required to make this work, and at the end of the day you're limited to creating page-tables with the same configuration as the host kernel. Furthermore, lifting the page-table code to run directly at EL2 on a non-VHE system (as we plan to to do in future patches) is practically impossible due to the number of dependencies it has on the core kernel. Introduce a framework for walking Armv8 page-tables configured independently from the host kernel. Cc: Marc Zyngier <maz@kernel.org> Cc: Quentin Perret <qperret@google.com> Signed-off-by: Will Deacon <will@kernel.org> --- arch/arm64/include/asm/kvm_pgtable.h | 101 ++++++++++ arch/arm64/kvm/hyp/Makefile | 2 +- arch/arm64/kvm/hyp/pgtable.c | 290 +++++++++++++++++++++++++++ 3 files changed, 392 insertions(+), 1 deletion(-) create mode 100644 arch/arm64/include/asm/kvm_pgtable.h create mode 100644 arch/arm64/kvm/hyp/pgtable.c