
[v2] drivers/iommu/amd: support P2P access through IOMMU when SME is enabled

Message ID 20250117071423.469880-1-east.moutain.yang@gmail.com (mailing list archive)
State New
Series [v2] drivers/iommu/amd: support P2P access through IOMMU when SME is enabled

Commit Message

Wencheng Yang Jan. 17, 2025, 7:14 a.m. UTC
When SME is enabled, the memory encryption bit is set in IOMMU
page-table PTEs. This works fine when the pfn in the PTE refers to
memory. However, when the pfn is an MMIO address (for example, when
another device's MMIO space is mapped into a device's IO page table),
setting the memory encryption bit in the PTE causes P2P accesses to
fail.

Clear the memory encryption bit in the IO page table when the mapping
is MMIO rather than memory.
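
For reference, the C-bit helpers the patch relies on are plain mask
operations (paraphrased from include/linux/mem_encrypt.h; sme_me_mask
is zero when SME is disabled, making both no-ops there):

	/* sme_me_mask holds the physical-address encryption (C) bit */
	#define __sme_set(x)	((x) | sme_me_mask)
	#define __sme_clr(x)	((x) & ~sme_me_mask)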

Signed-off-by: Wencheng Yang <east.moutain.yang@gmail.com>
---
 drivers/iommu/amd/amd_iommu_types.h | 7 ++++---
 drivers/iommu/amd/io_pgtable.c      | 2 ++
 drivers/iommu/amd/io_pgtable_v2.c   | 5 ++++-
 drivers/iommu/amd/iommu.c           | 2 ++
 drivers/vfio/vfio_iommu_type1.c     | 4 +++-
 include/uapi/linux/vfio.h           | 1 +
 6 files changed, 16 insertions(+), 5 deletions(-)
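
A minimal userspace sketch of how the new flag would be used, assuming
a kernel with this patch (container_fd, bar_mmap, bar_size and iova
stand in for the caller's VFIO container and the peer device's
mmap()ed BAR):

	#include <stdint.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/vfio.h>

	/* Map a peer device's MMIO BAR through the IOMMU without the C-bit */
	static int map_peer_bar(int container_fd, void *bar_mmap,
				uint64_t iova, uint64_t bar_size)
	{
		struct vfio_iommu_type1_dma_map map = {
			.argsz = sizeof(map),
			.flags = VFIO_DMA_MAP_FLAG_READ |
				 VFIO_DMA_MAP_FLAG_WRITE |
				 VFIO_DMA_MAP_FLAG_MMIO,	/* new flag */
			.vaddr = (uintptr_t)bar_mmap,
			.iova  = iova,
			.size  = bar_size,
		};

		if (ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map)) {
			perror("VFIO_IOMMU_MAP_DMA");
			return -1;
		}
		return 0;
	}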

Comments

Alex Williamson Jan. 17, 2025, 1:44 p.m. UTC | #1
On Fri, 17 Jan 2025 15:14:18 +0800
Wencheng Yang <east.moutain.yang@gmail.com> wrote:

> When SME is enabled, the memory encryption bit is set in IOMMU
> page-table PTEs. This works fine when the pfn in the PTE refers to
> memory. However, when the pfn is an MMIO address (for example, when
> another device's MMIO space is mapped into a device's IO page table),
> setting the memory encryption bit in the PTE causes P2P accesses to
> fail.
> 
> Clear the memory encryption bit in the IO page table when the mapping
> is MMIO rather than memory.
> 
> Signed-off-by: Wencheng Yang <east.moutain.yang@gmail.com>
> ---
>  drivers/iommu/amd/amd_iommu_types.h | 7 ++++---
>  drivers/iommu/amd/io_pgtable.c      | 2 ++
>  drivers/iommu/amd/io_pgtable_v2.c   | 5 ++++-
>  drivers/iommu/amd/iommu.c           | 2 ++
>  drivers/vfio/vfio_iommu_type1.c     | 4 +++-
>  include/uapi/linux/vfio.h           | 1 +
>  6 files changed, 16 insertions(+), 5 deletions(-)

This needs to:

 - Be split into separate IOMMU vs VFIO patches
 - Consider and consolidate with other IOMMU implementations of the same
 - Provide introspection to userspace relative to the availability of
   the resulting mapping option

It's also not clear to me that the user should be responsible for
setting this flag versus something in the VFIO or IOMMU layer.  For
example what are the implications of the user setting this flag
incorrectly (not just failing to set it for MMIO, but using it for RAM)?
Thanks,

Alex

> [full patch quoted here in the original; trimmed]
Jason Gunthorpe Jan. 20, 2025, 1:59 p.m. UTC | #2
On Fri, Jan 17, 2025 at 03:14:18PM +0800, Wencheng Yang wrote:
> When SME is enabled, the memory encryption bit is set in IOMMU
> page-table PTEs. This works fine when the pfn in the PTE refers to
> memory. However, when the pfn is an MMIO address (for example, when
> another device's MMIO space is mapped into a device's IO page table),
> setting the memory encryption bit in the PTE causes P2P accesses to
> fail.

This doesn't seem entirely right to me; the encrypted bit should flow
in from the entity doing the map and be based on more detailed
knowledge about what is happening.

Not be guessed at inside the iommu.

We have non-encrypted CPU memory, and (someday) encrypted MMIO.

Jason
Wencheng Yang Jan. 21, 2025, 9:27 a.m. UTC | #3
On Mon, Jan 20, 2025 at 9:59 PM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>
> On Fri, Jan 17, 2025 at 03:14:18PM +0800, Wencheng Yang wrote:
> > When SME is enabled, the memory encryption bit is set in IOMMU
> > page-table PTEs. This works fine when the pfn in the PTE refers to
> > memory. However, when the pfn is an MMIO address (for example, when
> > another device's MMIO space is mapped into a device's IO page
> > table), setting the memory encryption bit in the PTE causes P2P
> > accesses to fail.
>
> This doesn't seem entirely right to me, the encrypted bit should flow
> in from the entity doing the map and be based on more detailed
> knowledge about what is happening.
>
> Not be guessed at inside the iommu.
>
> We have non-encrypted CPU memory, and (someday) encrypted MMIO.

Hi Jason,

The IOMMU shouldn't, and can't, guess the type of the mapping (memory
vs. device MMIO), so VFIO passes that information in a flag to the
IOMMU to set up the IO page table entry. There is a companion QEMU
patch that sets the flag:
https://lists.nongnu.org/archive/html/qemu-devel/2025-01/msg02837.html
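
Roughly, the QEMU side does something like the sketch below (not the
linked patch verbatim; memory_region_is_ram() is existing QEMU API,
the flag plumbing here is assumed):

	/* Request the MMIO flag only for non-RAM (device MMIO) regions */
	static uint32_t vfio_dma_map_flags(MemoryRegion *mr)
	{
	    uint32_t flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;

	    if (!memory_region_is_ram(mr)) {
	        flags |= VFIO_DMA_MAP_FLAG_MMIO;
	    }
	    return flags;
	}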

Thanks,
Wencheng

Wencheng Yang Jan. 21, 2025, 11:07 a.m. UTC | #4
> This needs to:
>
>  - Be split into separate IOMMU vs VFIO patches
>  - Consider and consolidate with other IOMMU implementations of the same

I will do that in the next version.

>  - Provide introspection to userspace relative to the availability of
>    the resulting mapping option
I don't get your meaning; can you explain in detail?

>
> It's also not clear to me that the user should be responsible for
> setting this flag versus something in the VFIO or IOMMU layer.  For
> example what are the implications of the user setting this flag
> incorrectly (not just failing to set it for MMIO, but using it for RAM)?

If the user sets this flag on a RAM region, it has no effect on
platforms where memory encryption is disabled. If memory encryption is
enabled, the device can't get correct data from RAM: for example, the
CPU writes data that the memory controller encrypts in RAM, but the
device reads back the raw encrypted bytes. Either way, confidential
data is never leaked.

Thanks,
Wencheng

On Fri, Jan 17, 2025 at 9:45 PM Alex Williamson
<alex.williamson@redhat.com> wrote:
> [message and patch quoted in full; trimmed]
Alex Williamson Jan. 21, 2025, 3:34 p.m. UTC | #5
On Tue, 21 Jan 2025 19:07:26 +0800
Wencheng Yang <east.moutain.yang@gmail.com> wrote:

> > This needs to:
> >
> >  - Be split into separate IOMMU vs VFIO patches
> >  - Consider and consolidate with other IOMMU implementations of the same  
> 
> I will do that in the next version.

Clearly the latter bullet is not considered in the most recent posting.

> >  - Provide introspection to userspace relative to the availability of
> >    the resulting mapping option  
> I don't get your meaning; can you explain in detail?

Generally it would be polite to get these sorts of clarifications
before spamming the list with another version of the series.  Userspace
has no ability to determine whether the kernel supports this flag other
than trial and error.  The ability to determine the kernel support for
a new feature is introspection.  For example, if QEMU blindly adds the
MMIO flag the mapping will fail on older kernels.  How does QEMU know
whether support for the flag is available on the underlying kernel?
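
For example, absent real introspection, userspace is reduced to a
trial-and-error probe; a hedged sketch (the MMIO flag comes from this
patch, everything else is existing VFIO uapi):

	#include <stdbool.h>
	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <linux/vfio.h>

	/* Probe kernel support by attempting a one-page MMIO-flagged map;
	 * kernels without the flag reject it with -EINVAL. */
	static bool kernel_supports_mmio_flag(int container_fd, void *page,
					      uint64_t iova)
	{
		struct vfio_iommu_type1_dma_map map = {
			.argsz = sizeof(map),
			.flags = VFIO_DMA_MAP_FLAG_READ |
				 VFIO_DMA_MAP_FLAG_MMIO,
			.vaddr = (uintptr_t)page,
			.iova  = iova,
			.size  = 4096,
		};
		struct vfio_iommu_type1_dma_unmap unmap = {
			.argsz = sizeof(unmap),
			.iova  = iova,
			.size  = 4096,
		};

		if (ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map))
			return false;
		ioctl(container_fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
		return true;
	}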

> > It's also not clear to me that the user should be responsible for
> > setting this flag versus something in the VFIO or IOMMU layer.  For
> > example what are the implications of the user setting this flag
> > incorrectly (not just failing to set it for MMIO, but using it for RAM)?  
> 
> If the user sets this flag on a RAM region, it has no effect on
> platforms where memory encryption is disabled. If memory encryption
> is enabled, the device can't get correct data from RAM: for example,
> the CPU writes data that the memory controller encrypts in RAM, but
> the device reads back the raw encrypted bytes. Either way,
> confidential data is never leaked.

This description is unclear to me.  As others have noted, we probably
need to look at whether the flag should be automatically applied by the
kernel.  We certainly know in the vfio IOMMU layer whether we're
mapping a page or a pfnmap.  In any case, we're in the process of
phasing out the vfio type1 IOMMU backend for iommufd, so whatever the
implementation, and especially if there's a uapi component, it needs to
be implemented in iommufd first.  Thanks,

Alex

> On Fri, Jan 17, 2025 at 9:45 PM Alex Williamson
> <alex.williamson@redhat.com> wrote:
> > [message and patch quoted in full; trimmed]
Jason Gunthorpe Jan. 21, 2025, 8:47 p.m. UTC | #6
On Tue, Jan 21, 2025 at 08:34:43AM -0700, Alex Williamson wrote:

> This description is unclear to me.  As others have noted, we probably
> need to look at whether the flag should be automatically applied by the
> kernel.  We certainly know in the vfio IOMMU layer whether we're
> mapping a page or a pfnmap.  

It is not page vs. pfnmap. When vfio is using follow_pte() it should
extract information from the PTE and then relay it to the IOMMU. The
iommu page table and the CPU page table should have the same PTE
flags.

So, a pte that is pgprot_cached() should be IOMMU_CACHE, otherwise
IOMMU_MMIO.

The encrypted bit in the PTE should be mapped to some new
IOMMU_ENCRYPTED.
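
A minimal sketch of that idea (pte_is_cached() and IOMMU_ENCRYPTED are
hypothetical names; sme_me_mask, IOMMU_CACHE and IOMMU_MMIO exist
today):

	/* Derive IOMMU prot bits from the CPU PTE found via follow_pte().
	 * pte_is_cached() is a hypothetical cacheability test and
	 * IOMMU_ENCRYPTED a hypothetical prot flag; sme_me_mask is the
	 * existing x86 SME C-bit mask. */
	static int iommu_prot_from_pte(pte_t pte, int prot)
	{
		if (pte_is_cached(pte))
			prot |= IOMMU_CACHE;
		else
			prot |= IOMMU_MMIO;

		if (pte_val(pte) & sme_me_mask)
			prot |= IOMMU_ENCRYPTED;

		return prot;
	}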

I suspect AMD has created a troublesome issue in that IOMMU_CACHE
conditionally implies encrypted depending on their platform features
(meaning cacheable-but-decrypted is impossible). Arguably a higher
level should be deciding this and the iommu page table code should
simply follow IOMMU_ENCRYPTED always.

That might be something for later, but I would note it :\

> In any case, we're in the process of phasing out the vfio type1
> IOMMU backend for iommufd, so whatever the implementation, and
> especially if there's a uapi component, it needs to be implemented
> in iommufd first. 

Since iommufd won't be using follow_pte() it will have to get this
meta information from the FD, e.g. through DMABUF, and there is a huge
thread on how to go about doing that.

There should be no uapi component.

Jason

Patch

diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h
index fdb0357e0bb9..b0f055200cf3 100644
--- a/drivers/iommu/amd/amd_iommu_types.h
+++ b/drivers/iommu/amd/amd_iommu_types.h
@@ -434,9 +434,10 @@ 
 #define IOMMU_PTE_PAGE(pte) (iommu_phys_to_virt((pte) & IOMMU_PAGE_MASK))
 #define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07)
 
-#define IOMMU_PROT_MASK 0x03
-#define IOMMU_PROT_IR 0x01
-#define IOMMU_PROT_IW 0x02
+#define IOMMU_PROT_MASK 0x07
+#define IOMMU_PROT_IR   0x01
+#define IOMMU_PROT_IW   0x02
+#define IOMMU_PROT_MMIO 0x04
 
 #define IOMMU_UNITY_MAP_FLAG_EXCL_RANGE	(1 << 2)
 
diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c
index f3399087859f..dff887958a56 100644
--- a/drivers/iommu/amd/io_pgtable.c
+++ b/drivers/iommu/amd/io_pgtable.c
@@ -373,6 +373,8 @@  static int iommu_v1_map_pages(struct io_pgtable_ops *ops, unsigned long iova,
 			__pte |= IOMMU_PTE_IR;
 		if (prot & IOMMU_PROT_IW)
 			__pte |= IOMMU_PTE_IW;
+		if (prot & IOMMU_PROT_MMIO)
+			__pte = __sme_clr(__pte);
 
 		for (i = 0; i < count; ++i)
 			pte[i] = __pte;
diff --git a/drivers/iommu/amd/io_pgtable_v2.c b/drivers/iommu/amd/io_pgtable_v2.c
index c616de2c5926..55f969727dea 100644
--- a/drivers/iommu/amd/io_pgtable_v2.c
+++ b/drivers/iommu/amd/io_pgtable_v2.c
@@ -65,7 +65,10 @@  static u64 set_pte_attr(u64 paddr, u64 pg_size, int prot)
 {
 	u64 pte;
 
-	pte = __sme_set(paddr & PM_ADDR_MASK);
+	pte = paddr & PM_ADDR_MASK;
+	if (!(prot & IOMMU_PROT_MMIO))
+		pte = __sme_set(pte);
+
 	pte |= IOMMU_PAGE_PRESENT | IOMMU_PAGE_USER;
 	pte |= IOMMU_PAGE_ACCESS | IOMMU_PAGE_DIRTY;
 
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 16f40b8000d7..9194ad681504 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -2578,6 +2578,8 @@  static int amd_iommu_map_pages(struct iommu_domain *dom, unsigned long iova,
 		prot |= IOMMU_PROT_IR;
 	if (iommu_prot & IOMMU_WRITE)
 		prot |= IOMMU_PROT_IW;
+	if (iommu_prot & IOMMU_MMIO)
+		prot |= IOMMU_PROT_MMIO;
 
 	if (ops->map_pages) {
 		ret = ops->map_pages(ops, iova, paddr, pgsize,
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 50ebc9593c9d..08be1ef8514b 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -1557,6 +1557,8 @@  static int vfio_dma_do_map(struct vfio_iommu *iommu,
 		prot |= IOMMU_WRITE;
 	if (map->flags & VFIO_DMA_MAP_FLAG_READ)
 		prot |= IOMMU_READ;
+	if (map->flags & VFIO_DMA_MAP_FLAG_MMIO)
+		prot |= IOMMU_MMIO;
 
 	if ((prot && set_vaddr) || (!prot && !set_vaddr))
 		return -EINVAL;
@@ -2801,7 +2803,7 @@  static int vfio_iommu_type1_map_dma(struct vfio_iommu *iommu,
 	struct vfio_iommu_type1_dma_map map;
 	unsigned long minsz;
 	uint32_t mask = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE |
-			VFIO_DMA_MAP_FLAG_VADDR;
+			VFIO_DMA_MAP_FLAG_VADDR | VFIO_DMA_MAP_FLAG_MMIO;
 
 	minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
 
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index c8dbf8219c4f..68002c8f1157 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1560,6 +1560,7 @@  struct vfio_iommu_type1_dma_map {
 #define VFIO_DMA_MAP_FLAG_READ (1 << 0)		/* readable from device */
 #define VFIO_DMA_MAP_FLAG_WRITE (1 << 1)	/* writable from device */
 #define VFIO_DMA_MAP_FLAG_VADDR (1 << 2)
+#define VFIO_DMA_MAP_FLAG_MMIO (1 << 3)     /* mapping is MMIO */
 	__u64	vaddr;				/* Process virtual address */
 	__u64	iova;				/* IO virtual address */
 	__u64	size;				/* Size of mapping (bytes) */