
[13/16] iommupt: Add the x86 PAE page table format

Message ID 13-v1-01fa10580981+1d-iommu_pt_jgg@nvidia.com (mailing list archive)
State New, archived
Series: Consolidate iommu page table implementations

Commit Message

Jason Gunthorpe Aug. 15, 2024, 3:11 p.m. UTC
This is used by x86 CPUs and can be used in both x86 IOMMUs. When the x86
IOMMU is running SVA it uses this page table format.

This implementation follows the AMD v2 io-pgtable version.

There is nothing remarkable here; the format has a variable top and
limited support for different page sizes and no contiguous page support.

In principle this can support the 32 bit configuration with fewer table
levels.

FIXME: Compare the bits against the VT-D version too.

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/generic_pt/Kconfig            |   6 +
 drivers/iommu/generic_pt/fmt/Makefile       |   2 +
 drivers/iommu/generic_pt/fmt/defs_x86pae.h  |  21 ++
 drivers/iommu/generic_pt/fmt/iommu_x86pae.c |   8 +
 drivers/iommu/generic_pt/fmt/x86pae.h       | 283 ++++++++++++++++++++
 include/linux/generic_pt/common.h           |   4 +
 include/linux/generic_pt/iommu.h            |  12 +
 7 files changed, 336 insertions(+)
 create mode 100644 drivers/iommu/generic_pt/fmt/defs_x86pae.h
 create mode 100644 drivers/iommu/generic_pt/fmt/iommu_x86pae.c
 create mode 100644 drivers/iommu/generic_pt/fmt/x86pae.h

Comments

Sean Christopherson Aug. 16, 2024, 7:21 p.m. UTC | #1
On Thu, Aug 15, 2024, Jason Gunthorpe wrote:
> This is used by x86 CPUs and can be used in both x86 IOMMUs. When the x86
> IOMMU is running SVA it uses this page table format.
> 
> This implementation follows the AMD v2 io-pgtable version.
> 
> There is nothing remarkable here; the format has a variable top and
> limited support for different page sizes and no contiguous page support.
> 
> In principle this can support the 32 bit configuration with fewer table
> levels.

What's "the 32 bit configuration"?

> FIXME: Compare the bits against the VT-D version too.
> 
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> ---
>  drivers/iommu/generic_pt/Kconfig            |   6 +
>  drivers/iommu/generic_pt/fmt/Makefile       |   2 +
>  drivers/iommu/generic_pt/fmt/defs_x86pae.h  |  21 ++
>  drivers/iommu/generic_pt/fmt/iommu_x86pae.c |   8 +
>  drivers/iommu/generic_pt/fmt/x86pae.h       | 283 ++++++++++++++++++++
>  include/linux/generic_pt/common.h           |   4 +
>  include/linux/generic_pt/iommu.h            |  12 +
>  7 files changed, 336 insertions(+)
>  create mode 100644 drivers/iommu/generic_pt/fmt/defs_x86pae.h
>  create mode 100644 drivers/iommu/generic_pt/fmt/iommu_x86pae.c
>  create mode 100644 drivers/iommu/generic_pt/fmt/x86pae.h
> 
> diff --git a/drivers/iommu/generic_pt/Kconfig b/drivers/iommu/generic_pt/Kconfig
> index e34be10cf8bac2..a7c006234fc218 100644
> --- a/drivers/iommu/generic_pt/Kconfig
> +++ b/drivers/iommu/generic_pt/Kconfig
> @@ -70,6 +70,11 @@ config IOMMU_PT_ARMV8_64K
>  
>  	  If unsure, say N here.
>  
> +config IOMMU_PT_X86PAE
> +       tristate "IOMMU page table for x86 PAE"
> +#include "iommu_template.h"
> diff --git a/drivers/iommu/generic_pt/fmt/x86pae.h b/drivers/iommu/generic_pt/fmt/x86pae.h
> new file mode 100644
> index 00000000000000..9e0ee74275fcb3
> --- /dev/null
> +++ b/drivers/iommu/generic_pt/fmt/x86pae.h
> @@ -0,0 +1,283 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
> + *
> + * x86 PAE page table
> + *
> + * This is described in
> + *   Section "4.4 PAE Paging" of the Intel Software Developer's Manual Volume 3

I highly doubt what's implemented here is actually PAE paging, as the SDM (that
is referenced above) and most x86 folks describe PAE paging.  PAE paging is
specifically used when the CPU is in 32-bit mode (NOT including compatibility mode!).

  PAE paging translates 32-bit linear addresses to 52-bit physical addresses.

Presumably what's implemented here is what Intel calls 4-level and 5-level paging.
Those are _really_ similar to PAE paging, e.g. have the same encodings for bits
11:0, and even require CR4.PAE=1, but they aren't 100% identical.  E.g. true PAE
paging doesn't have software-available bits in 62:MAXPHYADDR.
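
(To put numbers on it: PAE paging walks a 4-entry PDPT plus two 512-entry
levels, i.e. 2 + 9 + 9 + 12 = 32 bits of linear address, whereas 4-level and
5-level paging walk 9*4 + 12 = 48 and 9*5 + 12 = 57 bits.)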

Unfortunately, I have no idea what name to use for this flavor.  x86pae is
actually kinda good, but I think it'll be confusing to people who are familiar
with the more canonical version of PAE paging.

> + *   Section "2.2.6 I/O Page Tables for Guest Translations" of the "AMD I/O
> + *   Virtualization Technology (IOMMU) Specification"
> + *
> + * It is used by x86 CPUs and the AMD and VT-D IOMMU HW.
> + *
> + * The named levels in the spec map to the pts->level as:
> + *   Table/PTE - 0
> + *   Directory/PDE - 1
> + *   Directory Ptr/PDPTE - 2
> + *   PML4/PML4E - 3
> + *   PML5/PML5E - 4

Any particular reason not to use x86's (and KVM's) effective 1-based system?
(level '0' is essentially the 4KiB leaf entries in a page table)

Starting at '1' is kinda odd, but it aligns with things like PML4/5, allows using
the pg_level enums from x86, and diverging from both x86 MM and KVM is likely
going to confuse people.
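
For reference, x86's enum is roughly:

	enum pg_level {
		PG_LEVEL_NONE,
		PG_LEVEL_4K,
		PG_LEVEL_2M,
		PG_LEVEL_1G,
		PG_LEVEL_512G,
		PG_LEVEL_256T,
		PG_LEVEL_NUM
	};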
	
> + * FIXME: __sme_set
> + */
> +#ifndef __GENERIC_PT_FMT_X86PAE_H
> +#define __GENERIC_PT_FMT_X86PAE_H
> +
> +#include "defs_x86pae.h"
> +#include "../pt_defs.h"
> +
> +#include <linux/bitfield.h>
> +#include <linux/container_of.h>
> +#include <linux/log2.h>
> +
> +enum {
> +	PT_MAX_OUTPUT_ADDRESS_LG2 = 52,
> +	PT_MAX_VA_ADDRESS_LG2 = 57,
> +	PT_ENTRY_WORD_SIZE = sizeof(u64),
> +	PT_MAX_TOP_LEVEL = 4,
> +	PT_GRANUAL_LG2SZ = 12,
> +	PT_TABLEMEM_LG2SZ = 12,
> +};
> +
> +/* Shared descriptor bits */
> +enum {
> +	X86PAE_FMT_P = BIT(0),
> +	X86PAE_FMT_RW = BIT(1),
> +	X86PAE_FMT_U = BIT(2),
> +	X86PAE_FMT_A = BIT(5),
> +	X86PAE_FMT_D = BIT(6),
> +	X86PAE_FMT_OA = GENMASK_ULL(51, 12),
> +	X86PAE_FMT_XD = BIT_ULL(63),

Any reason not to use the #defines in arch/x86/include/asm/pgtable_types.h?
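
(i.e. _PAGE_PRESENT, _PAGE_RW, _PAGE_USER, _PAGE_ACCESSED, _PAGE_DIRTY,
_PAGE_PSE and _PAGE_NX, which encode the same bit positions as the enum
above.)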

> +static inline bool x86pae_pt_install_table(struct pt_state *pts,
> +					   pt_oaddr_t table_pa,
> +					   const struct pt_write_attrs *attrs)
> +{
> +	u64 *tablep = pt_cur_table(pts, u64);
> +	u64 entry;
> +
> +	/*
> +	 * FIXME according to the SDM D is ignored by HW on table pointers?

Correct, only leaf entries have dirty bits.  

> +	 * io_pgtable_v2 sets it
> +	 */
> +	entry = X86PAE_FMT_P | X86PAE_FMT_RW | X86PAE_FMT_U | X86PAE_FMT_A |

What happens with the USER bit for I/O page tables?  Ignored, I assume?

> +		X86PAE_FMT_D |
> +		FIELD_PREP(X86PAE_FMT_OA, log2_div(table_pa, PT_GRANUAL_LG2SZ));
> +	return pt_table_install64(&tablep[pts->index], entry, pts->entry);
> +}
Jason Gunthorpe Aug. 17, 2024, 12:36 a.m. UTC | #2
On Fri, Aug 16, 2024 at 12:21:18PM -0700, Sean Christopherson wrote:
> On Thu, Aug 15, 2024, Jason Gunthorpe wrote:
> > This is used by x86 CPUs and can be used in both x86 IOMMUs. When the x86
> > IOMMU is running SVA it uses this page table format.
> > 
> > This implementation follows the AMD v2 io-pgtable version.
> > 
> > There is nothing remarkable here; the format has a variable top and
> > limited support for different page sizes and no contiguous page support.
> > 
> > In principle this can support the 32 bit configuration with fewer table
> > levels.
> 
> What's "the 32 bit configuration"?

Oh, the three level version

> > +/* SPDX-License-Identifier: GPL-2.0-only */
> > +/*
> > + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
> > + *
> > + * x86 PAE page table
> > + *
> > + * This is described in
> > + *   Section "4.4 PAE Paging" of the Intel Software Developer's Manual Volume 3
> 
> I highly doubt what's implemented here is actually PAE paging, as the SDM (that
> is referenced above) and most x86 folks describe PAE paging.  PAE paging is
> specifically used when the CPU is in 32-bit mode (NOT including compatibility mode!).
> 
>   PAE paging translates 32-bit linear addresses to 52-bit physical addresses.

> Presumably what's implemented here is what Intel calls 4-level and 5-level paging.
> Those are _really_ similar to PAE paging, 

I *think* this needs to support the three level "PAE" format as well?

I'm not really sure yet, but it looked to me like really old
non-scalable Intel vt-d iommu might need it?? Or maybe it uses the
vtdss format with 3 levels.. It is not something I've checked deeply
into yet.

So the intention was to capture the 32 bit PAE format with three levels
as well as the 4/5 level formats. The idea is that, like arm and others,
you select how many levels you want when you init the table.

If the three level format also needs some bit adjustments it would be
done with a feature bit. I haven't yet got to comparing against the
bit patterns the VT-D driver uses for this file, but I expect any
differences are minor.
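
Very roughly I am imagining something like this sketch for the init path
(hypothetical, the cfg field carrying the requested top level doesn't exist
yet; today's x86pae_pt_iommu_fmt_init() just hard-codes level 3):

	static inline int
	x86pae_pt_iommu_fmt_init(struct pt_iommu_x86pae *iommu_table,
				 struct pt_iommu_x86pae_cfg *cfg)
	{
		struct pt_x86pae *table = &iommu_table->x86pae_pt;

		/*
		 * Hypothetical: top_level 2/3/4 selects the 3/4/5 level
		 * format, using the 0-based numbering described above.
		 */
		if (cfg->top_level < 2 || cfg->top_level > PT_MAX_TOP_LEVEL)
			return -EOPNOTSUPP;
		pt_top_set_level(&table->common, cfg->top_level);
		return 0;
	}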

> Unfortunately, I have no idea what name to use for this flavor.  x86pae is
> actually kinda good, but I think it'll be confusing to people who are familiar
> with the more canonical version of PAE paging.

I struggled too. The name wasn't good to me either. I think if this
is confusing let's just call it x86_64? Sort of a focus on the new.

> > + *   Section "2.2.6 I/O Page Tables for Guest Translations" of the "AMD I/O
> > + *   Virtualization Technology (IOMMU) Specification"
> > + *
> > + * It is used by x86 CPUs and the AMD and VT-D IOMMU HW.
> > + *
> > + * The named levels in the spec map to the pts->level as:
> > + *   Table/PTE - 0
> > + *   Directory/PDE - 1
> > + *   Directory Ptr/PDPTE - 2
> > + *   PML4/PML4E - 3
> > + *   PML5/PML5E - 4
> 
> Any particular reason not to use x86's (and KVM's) effective 1-based system?
> (level '0' is essentially the 4KiB leaf entries in a page table)

Not any super strong one. The math is slightly more natural with 0
based; for instance the most general version is arm 32 bit:

	return PT_GRANUAL_LG2SZ +
	       (PT_TABLEMEM_LG2SZ - ilog2(sizeof(u32))) * pts->level;

This is the only case where PT_GRANUAL_LG2SZ=12 and
PT_TABLEMEM_LG2SZ=10, so the above needs no adjustment to level.
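
Worked through for this format (PT_GRANUAL_LG2SZ = PT_TABLEMEM_LG2SZ = 12,
u64 entries) it comes out as 12 + 9 * level, ie 4K/2M/1G/512G/256T for
levels 0..4, and for arm 32 bit it is 12 + 8 * level, ie 4K pages and
1M sections.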

It also ensures that 0 is not an invalid value that needs to be
considered, and that little detail helps a few micro optimizations.

Every document seems to have its own take on this; the intel/amd ones
all like to start at 1 and go up, while the ARM ones are reversed and
start at 4 and go down to 0.

> Starting at '1' is kinda odd, but it aligns with things like PML4/5,
> allows using the pg_level enums from x86, and diverging from both
> x86 MM and KVM is likely going to confuse people.

And ARM people will say not using their totally different numbers
confuses them. I feel there is no winning here. So I went with
something mathematically clean and assumed we'd have this
discussion :)

At the end of the day the interesting stuff is done using the generic
code and API, so that can't make assumptions about the structure from
any of the documents. In that regard having it be different from
everything else (because it has to be a superset of everything else)
is not necessarily a bad thing.

In truth the number of places where you have to look at level is
really pretty small so I felt this was OK.

> > +/* Shared descriptor bits */
> > +enum {
> > +	X86PAE_FMT_P = BIT(0),
> > +	X86PAE_FMT_RW = BIT(1),
> > +	X86PAE_FMT_U = BIT(2),
> > +	X86PAE_FMT_A = BIT(5),
> > +	X86PAE_FMT_D = BIT(6),
> > +	X86PAE_FMT_OA = GENMASK_ULL(51, 12),
> > +	X86PAE_FMT_XD = BIT_ULL(63),
> 
> Any reason not to use the #defines in arch/x86/include/asm/pgtable_types.h?

This is arch independent code, I don't think I can include that header
from here?? I've seen Linus be negative about wild ../../ includes..

Keeping everything here arch independent is one of the big value adds,
IMHO.

> > +static inline bool x86pae_pt_install_table(struct pt_state *pts,
> > +					   pt_oaddr_t table_pa,
> > +					   const struct pt_write_attrs *attrs)
> > +{
> > +	u64 *tablep = pt_cur_table(pts, u64);
> > +	u64 entry;
> > +
> > +	/*
> > +	 * FIXME according to the SDM D is ignored by HW on table pointers?
> 
> Correct, only leaf entries have dirty bits.  

To add some colour, this logic exactly matches the bit patterns that
the existing amd v2 iommu code creates.

It looks to me like the AMD IOMMU manual also says it ignores this bit
in table levels, so it is possibly a little mistake in the existing
code.

I'll make a little patch fixing it and ask them that way..
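
For this file a sketch of the same fix (not the actual patch) would just
drop X86PAE_FMT_D from the non-leaf entry in x86pae_pt_install_table():

	entry = X86PAE_FMT_P | X86PAE_FMT_RW | X86PAE_FMT_U | X86PAE_FMT_A |
		FIELD_PREP(X86PAE_FMT_OA, log2_div(table_pa, PT_GRANUAL_LG2SZ));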

> > +	 * io_pgtable_v2 sets it
> > +	 */
> > +	entry = X86PAE_FMT_P | X86PAE_FMT_RW | X86PAE_FMT_U | X86PAE_FMT_A |
> 
> What happens with the USER bit for I/O page tables?  Ignored, I assume?

Not ignored, it does stuff. AMD's IOMMU manual says:

 U/S: User/Supervisor. IOMMU uses same meaning as AMD64 processor page tables. 0=access is
 restricted to supervisor level. 1=both user and supervisor access is
 allowed.

 Software Note: For a peripheral not using U/S, software should program
 the bit to signal user mode.  If MMIO Offset 0030h[USSup] = 0, this
 field is ignored

IIRC it also comes out through the ATS replies.

I expect VTD is similar.

Thanks for checking, your remarks in the room at LPC were what inspired
me that this was the right way to go!

Thanks,
Jason

Patch

diff --git a/drivers/iommu/generic_pt/Kconfig b/drivers/iommu/generic_pt/Kconfig
index e34be10cf8bac2..a7c006234fc218 100644
--- a/drivers/iommu/generic_pt/Kconfig
+++ b/drivers/iommu/generic_pt/Kconfig
@@ -70,6 +70,11 @@  config IOMMU_PT_ARMV8_64K
 
 	  If unsure, say N here.
 
+config IOMMU_PT_X86PAE
+       tristate "IOMMU page table for x86 PAE"
+	depends on !GENERIC_ATOMIC64 # for cmpxchg64
+	default n
+
 config IOMMUT_PT_KUNIT_TEST
 	tristate "IOMMU Page Table KUnit Test" if !KUNIT_ALL_TESTS
 	select IOMMU_IO_PGTABLE
@@ -78,6 +83,7 @@  config IOMMUT_PT_KUNIT_TEST
 	depends on IOMMU_PT_ARMV8_4K || !IOMMU_PT_ARMV8_4K
 	depends on IOMMU_PT_ARMV8_16K || !IOMMU_PT_ARMV8_16K
 	depends on IOMMU_PT_ARMV8_64K || !IOMMU_PT_ARMV8_64K
+	depends on IOMMU_PT_X86PAE || !IOMMU_PT_X86PAE
 	default KUNIT_ALL_TESTS
 endif
 endif
diff --git a/drivers/iommu/generic_pt/fmt/Makefile b/drivers/iommu/generic_pt/fmt/Makefile
index 16031fc1270178..fe3d7ae3685468 100644
--- a/drivers/iommu/generic_pt/fmt/Makefile
+++ b/drivers/iommu/generic_pt/fmt/Makefile
@@ -6,6 +6,8 @@  iommu_pt_fmt-$(CONFIG_IOMMU_PT_ARMV8_4K) += armv8_4k
 iommu_pt_fmt-$(CONFIG_IOMMU_PT_ARMV8_16K) += armv8_16k
 iommu_pt_fmt-$(CONFIG_IOMMU_PT_ARMV8_64K) += armv8_64k
 
+iommu_pt_fmt-$(CONFIG_IOMMU_PT_X86PAE) += x86pae
+
 IOMMU_PT_KUNIT_TEST :=
 define create_format
 obj-$(2) += iommu_$(1).o
diff --git a/drivers/iommu/generic_pt/fmt/defs_x86pae.h b/drivers/iommu/generic_pt/fmt/defs_x86pae.h
new file mode 100644
index 00000000000000..0d93454264b5da
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/defs_x86pae.h
@@ -0,0 +1,21 @@ 
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ *
+ */
+#ifndef __GENERIC_PT_FMT_DEFS_X86PAE_H
+#define __GENERIC_PT_FMT_DEFS_X86PAE_H
+
+#include <linux/generic_pt/common.h>
+#include <linux/types.h>
+
+typedef u64 pt_vaddr_t;
+typedef u64 pt_oaddr_t;
+
+struct x86pae_pt_write_attrs {
+	u64 descriptor_bits;
+	gfp_t gfp;
+};
+#define pt_write_attrs x86pae_pt_write_attrs
+
+#endif
diff --git a/drivers/iommu/generic_pt/fmt/iommu_x86pae.c b/drivers/iommu/generic_pt/fmt/iommu_x86pae.c
new file mode 100644
index 00000000000000..f7ec71c61729e3
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/iommu_x86pae.c
@@ -0,0 +1,8 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ */
+#define PT_FMT x86pae
+#define PT_SUPPORTED_FEATURES 0
+
+#include "iommu_template.h"
diff --git a/drivers/iommu/generic_pt/fmt/x86pae.h b/drivers/iommu/generic_pt/fmt/x86pae.h
new file mode 100644
index 00000000000000..9e0ee74275fcb3
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/x86pae.h
@@ -0,0 +1,283 @@ 
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ *
+ * x86 PAE page table
+ *
+ * This is described in
+ *   Section "4.4 PAE Paging" of the Intel Software Developer's Manual Volume 3
+ *   Section "2.2.6 I/O Page Tables for Guest Translations" of the "AMD I/O
+ *   Virtualization Technology (IOMMU) Specification"
+ *
+ * It is used by x86 CPUs and the AMD and VT-D IOMMU HW.
+ *
+ * The named levels in the spec map to the pts->level as:
+ *   Table/PTE - 0
+ *   Directory/PDE - 1
+ *   Directory Ptr/PDPTE - 2
+ *   PML4/PML4E - 3
+ *   PML5/PML5E - 4
+ * FIXME: __sme_set
+ */
+#ifndef __GENERIC_PT_FMT_X86PAE_H
+#define __GENERIC_PT_FMT_X86PAE_H
+
+#include "defs_x86pae.h"
+#include "../pt_defs.h"
+
+#include <linux/bitfield.h>
+#include <linux/container_of.h>
+#include <linux/log2.h>
+
+enum {
+	PT_MAX_OUTPUT_ADDRESS_LG2 = 52,
+	PT_MAX_VA_ADDRESS_LG2 = 57,
+	PT_ENTRY_WORD_SIZE = sizeof(u64),
+	PT_MAX_TOP_LEVEL = 4,
+	PT_GRANUAL_LG2SZ = 12,
+	PT_TABLEMEM_LG2SZ = 12,
+};
+
+/* Shared descriptor bits */
+enum {
+	X86PAE_FMT_P = BIT(0),
+	X86PAE_FMT_RW = BIT(1),
+	X86PAE_FMT_U = BIT(2),
+	X86PAE_FMT_A = BIT(5),
+	X86PAE_FMT_D = BIT(6),
+	X86PAE_FMT_OA = GENMASK_ULL(51, 12),
+	X86PAE_FMT_XD = BIT_ULL(63),
+};
+
+/* PDPTE/PDE */
+enum {
+	X86PAE_FMT_PS = BIT(7),
+};
+
+#define common_to_x86pae_pt(common_ptr) \
+	container_of_const(common_ptr, struct pt_x86pae, common)
+#define to_x86pae_pt(pts) common_to_x86pae_pt((pts)->range->common)
+
+static inline pt_oaddr_t x86pae_pt_table_pa(const struct pt_state *pts)
+{
+	return log2_mul(FIELD_GET(X86PAE_FMT_OA, pts->entry),
+			PT_TABLEMEM_LG2SZ);
+}
+#define pt_table_pa x86pae_pt_table_pa
+
+static inline pt_oaddr_t x86pae_pt_entry_oa(const struct pt_state *pts)
+{
+	return log2_mul(FIELD_GET(X86PAE_FMT_OA, pts->entry), PT_GRANUAL_LG2SZ);
+}
+#define pt_entry_oa x86pae_pt_entry_oa
+
+static inline bool x86pae_pt_can_have_leaf(const struct pt_state *pts)
+{
+	return pts->level <= 2;
+}
+#define pt_can_have_leaf x86pae_pt_can_have_leaf
+
+static inline unsigned int
+x86pae_pt_table_item_lg2sz(const struct pt_state *pts)
+{
+	return PT_GRANUAL_LG2SZ +
+	       (PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64))) * pts->level;
+}
+#define pt_table_item_lg2sz x86pae_pt_table_item_lg2sz
+
+static inline unsigned int x86pae_pt_num_items_lg2(const struct pt_state *pts)
+{
+	return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64));
+}
+#define pt_num_items_lg2 x86pae_pt_num_items_lg2
+
+static inline enum pt_entry_type x86pae_pt_load_entry_raw(struct pt_state *pts)
+{
+	const u64 *tablep = pt_cur_table(pts, u64);
+	u64 entry;
+
+	pts->entry = entry = READ_ONCE(tablep[pts->index]);
+	if (!(entry & X86PAE_FMT_P))
+		return PT_ENTRY_EMPTY;
+	if (pts->level == 0 ||
+	    (x86pae_pt_can_have_leaf(pts) && (pts->entry & X86PAE_FMT_PS)))
+		return PT_ENTRY_OA;
+	return PT_ENTRY_TABLE;
+}
+#define pt_load_entry_raw x86pae_pt_load_entry_raw
+
+static inline void
+x86pae_pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa,
+			     unsigned int oasz_lg2,
+			     const struct pt_write_attrs *attrs)
+{
+	u64 *tablep = pt_cur_table(pts, u64);
+	u64 entry;
+
+	entry = X86PAE_FMT_P |
+		FIELD_PREP(X86PAE_FMT_OA, log2_div(oa, PT_GRANUAL_LG2SZ)) |
+		attrs->descriptor_bits;
+	if (pts->level != 0)
+		entry |= X86PAE_FMT_PS;
+
+	WRITE_ONCE(tablep[pts->index], entry);
+	pts->entry = entry;
+}
+#define pt_install_leaf_entry x86pae_pt_install_leaf_entry
+
+static inline bool x86pae_pt_install_table(struct pt_state *pts,
+					   pt_oaddr_t table_pa,
+					   const struct pt_write_attrs *attrs)
+{
+	u64 *tablep = pt_cur_table(pts, u64);
+	u64 entry;
+
+	/*
+	 * FIXME according to the SDM D is ignored by HW on table pointers?
+	 * io_pgtable_v2 sets it
+	 */
+	entry = X86PAE_FMT_P | X86PAE_FMT_RW | X86PAE_FMT_U | X86PAE_FMT_A |
+		X86PAE_FMT_D |
+		FIELD_PREP(X86PAE_FMT_OA, log2_div(table_pa, PT_GRANUAL_LG2SZ));
+	return pt_table_install64(&tablep[pts->index], entry, pts->entry);
+}
+#define pt_install_table x86pae_pt_install_table
+
+static inline void x86pae_pt_attr_from_entry(const struct pt_state *pts,
+					     struct pt_write_attrs *attrs)
+{
+	attrs->descriptor_bits = pts->entry &
+				 (X86PAE_FMT_RW | X86PAE_FMT_U | X86PAE_FMT_A |
+				  X86PAE_FMT_D | X86PAE_FMT_XD);
+}
+#define pt_attr_from_entry x86pae_pt_attr_from_entry
+
+static inline void x86pae_pt_clear_entry(struct pt_state *pts,
+					 unsigned int num_contig_lg2)
+{
+	u64 *tablep = pt_cur_table(pts, u64);
+
+	WRITE_ONCE(tablep[pts->index], 0);
+}
+#define pt_clear_entry x86pae_pt_clear_entry
+
+/* --- iommu */
+#include <linux/generic_pt/iommu.h>
+#include <linux/iommu.h>
+
+#define pt_iommu_table pt_iommu_x86pae
+
+/* The common struct is in the per-format common struct */
+static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table)
+{
+	return &container_of(iommu_table, struct pt_iommu_table, iommu)
+			->x86pae_pt.common;
+}
+
+static inline struct pt_iommu *iommu_from_common(struct pt_common *common)
+{
+	return &container_of(common, struct pt_iommu_table, x86pae_pt.common)
+			->iommu;
+}
+
+static inline int x86pae_pt_iommu_set_prot(struct pt_common *common,
+					   struct pt_write_attrs *attrs,
+					   unsigned int iommu_prot)
+{
+	u64 pte;
+
+	pte = X86PAE_FMT_U | X86PAE_FMT_A | X86PAE_FMT_D;
+	if (iommu_prot & IOMMU_WRITE)
+		pte |= X86PAE_FMT_RW;
+
+	attrs->descriptor_bits = pte;
+	return 0;
+}
+#define pt_iommu_set_prot x86pae_pt_iommu_set_prot
+
+static inline int x86pae_pt_iommu_fmt_init(struct pt_iommu_x86pae *iommu_table,
+					   struct pt_iommu_x86pae_cfg *cfg)
+{
+	struct pt_x86pae *table = &iommu_table->x86pae_pt;
+
+	pt_top_set_level(&table->common, 3); // FIXME settable
+	return 0;
+}
+#define pt_iommu_fmt_init x86pae_pt_iommu_fmt_init
+
+#if defined(GENERIC_PT_KUNIT)
+static void x86pae_pt_kunit_setup_cfg(struct pt_iommu_x86pae_cfg *cfg)
+{
+}
+#define pt_kunit_setup_cfg x86pae_pt_kunit_setup_cfg
+#endif
+
+#if defined(GENERIC_PT_KUNIT) && IS_ENABLED(CONFIG_AMD_IOMMU)
+#include <linux/io-pgtable.h>
+#include "../../amd/amd_iommu_types.h"
+
+static struct io_pgtable_ops *
+x86pae_pt_iommu_alloc_io_pgtable(struct pt_iommu_x86pae_cfg *cfg,
+				 struct device *iommu_dev,
+				 struct io_pgtable_cfg **pgtbl_cfg)
+{
+	struct amd_io_pgtable *pgtable;
+	struct io_pgtable_ops *pgtbl_ops;
+
+	/*
+	 * AMD expects that io_pgtable_cfg is allocated to its type by the
+	 * caller.
+	 */
+	pgtable = kzalloc(sizeof(*pgtable), GFP_KERNEL);
+	if (!pgtable)
+		return NULL;
+
+	pgtable->iop.cfg.iommu_dev = iommu_dev;
+	pgtable->iop.cfg.amd.nid = NUMA_NO_NODE;
+	pgtbl_ops = alloc_io_pgtable_ops(AMD_IOMMU_V2, &pgtable->iop.cfg, NULL);
+	if (!pgtbl_ops) {
+		kfree(pgtable);
+		return NULL;
+	}
+	*pgtbl_cfg = &pgtable->iop.cfg;
+	return pgtbl_ops;
+}
+#define pt_iommu_alloc_io_pgtable x86pae_pt_iommu_alloc_io_pgtable
+
+static void x86pae_pt_iommu_free_pgtbl_cfg(struct io_pgtable_cfg *pgtbl_cfg)
+{
+	struct amd_io_pgtable *pgtable =
+		container_of(pgtbl_cfg, struct amd_io_pgtable, iop.cfg);
+
+	kfree(pgtable);
+}
+#define pt_iommu_free_pgtbl_cfg x86pae_pt_iommu_free_pgtbl_cfg
+
+static void x86pae_pt_iommu_setup_ref_table(struct pt_iommu_x86pae *iommu_table,
+					    struct io_pgtable_ops *pgtbl_ops)
+{
+	struct io_pgtable_cfg *pgtbl_cfg =
+		&io_pgtable_ops_to_pgtable(pgtbl_ops)->cfg;
+	struct amd_io_pgtable *pgtable =
+		container_of(pgtbl_cfg, struct amd_io_pgtable, iop.cfg);
+	struct pt_common *common = &iommu_table->x86pae_pt.common;
+
+	if (pgtbl_cfg->ias == 52 && PT_MAX_TOP_LEVEL >= 3)
+		pt_top_set(common, (struct pt_table_p *)pgtable->pgd, 3);
+	else if (pgtbl_cfg->ias == 57 && PT_MAX_TOP_LEVEL >= 4)
+		pt_top_set(common, (struct pt_table_p *)pgtable->pgd, 4);
+	else
+		WARN_ON(true);
+}
+#define pt_iommu_setup_ref_table x86pae_pt_iommu_setup_ref_table
+
+static u64 x86pae_pt_kunit_cmp_mask_entry(struct pt_state *pts)
+{
+	if (pts->type == PT_ENTRY_TABLE)
+		return pts->entry & (~(u64)(X86PAE_FMT_OA));
+	return pts->entry;
+}
+#define pt_kunit_cmp_mask_entry x86pae_pt_kunit_cmp_mask_entry
+#endif
+
+#endif
diff --git a/include/linux/generic_pt/common.h b/include/linux/generic_pt/common.h
index e8d489dff756a8..e35fb83657f73b 100644
--- a/include/linux/generic_pt/common.h
+++ b/include/linux/generic_pt/common.h
@@ -126,4 +126,8 @@  enum {
 	PT_FEAT_ARMV8_NS,
 };
 
+struct pt_x86pae {
+	struct pt_common common;
+};
+
 #endif
diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h
index bf139c5657fc06..ca69bb6192d1a7 100644
--- a/include/linux/generic_pt/iommu.h
+++ b/include/linux/generic_pt/iommu.h
@@ -289,4 +289,16 @@  static inline int pt_iommu_armv8_init(struct pt_iommu_armv8 *table,
 	}
 }
 
+struct pt_iommu_x86pae {
+	struct pt_iommu iommu;
+	struct pt_x86pae x86pae_pt;
+};
+
+struct pt_iommu_x86pae_cfg {
+	struct device *iommu_device;
+	unsigned int features;
+};
+int pt_iommu_x86pae_init(struct pt_iommu_x86pae *table,
+			 struct pt_iommu_x86pae_cfg *cfg, gfp_t gfp);
+
 #endif