diff mbox series

[hyperv-next,5/6] arch, drivers: Add device struct bitfield to not bounce-buffer

Message ID 20250409000835.285105-6-romank@linux.microsoft.com (mailing list archive)
State New
Headers show
Series Confidential VMBus | expand

Commit Message

Roman Kisel April 9, 2025, 12:08 a.m. UTC
Bounce-buffering makes the system spend more time copying
I/O data. When an I/O transaction takes place between
a confidential and a non-confidential endpoint, there is
no other way around.

Introduce a device bitfield to indicate that the device
doesn't need to perform bounce buffering. The capable
device may employ it to save on copying data around.

Signed-off-by: Roman Kisel <romank@linux.microsoft.com>
---
 arch/x86/mm/mem_encrypt.c  | 3 +++
 include/linux/device.h     | 8 ++++++++
 include/linux/dma-direct.h | 3 +++
 include/linux/swiotlb.h    | 3 +++
 4 files changed, 17 insertions(+)

Comments

Christoph Hellwig April 9, 2025, 10:52 a.m. UTC | #1
On Tue, Apr 08, 2025 at 05:08:34PM -0700, Roman Kisel wrote:
> Bounce-buffering makes the system spend more time copying
> I/O data. When the I/O transaction take place between
> a confidential and a non-confidential endpoints, there is
> no other way around.
> 
> Introduce a device bitfield to indicate that the device
> doesn't need to perform bounce buffering. The capable
> device may employ it to save on copying data around.

I have no idea what this is supposed to mean, you need to explain it
much better.
Roman Kisel April 9, 2025, 3:27 p.m. UTC | #2
On 4/9/2025 3:52 AM, Christoph Hellwig wrote:
> On Tue, Apr 08, 2025 at 05:08:34PM -0700, Roman Kisel wrote:
>> Bounce-buffering makes the system spend more time copying
>> I/O data. When the I/O transaction take place between
>> a confidential and a non-confidential endpoints, there is
>> no other way around.
>>
>> Introduce a device bitfield to indicate that the device
>> doesn't need to perform bounce buffering. The capable
>> device may employ it to save on copying data around.
> 
> I have no idea what this is supposed to mean, you need to explain it
> much better.

Thanks for reviewing! I'll fix the description.

>
Robin Murphy April 9, 2025, 4:03 p.m. UTC | #3
On 2025-04-09 1:08 am, Roman Kisel wrote:
> Bounce-buffering makes the system spend more time copying
> I/O data. When the I/O transaction take place between
> a confidential and a non-confidential endpoints, there is
> no other way around.
> 
> Introduce a device bitfield to indicate that the device
> doesn't need to perform bounce buffering. The capable
> device may employ it to save on copying data around.

It's not so much about bounce buffering, it's more fundamentally about 
whether the device is trusted and able to access private memory at all 
or not. And performance is hardly the biggest concern either - if you do 
trust a device to operate on confidential data in private memory, then 
surely it is crucial to actively *prevent* that data ever getting into 
shared SWIOTLB pages where anyone else could also get at it. At worst 
that means CoCo VMs might need an *additional* non-shared SWIOTLB to 
support trusted devices with addressing limitations (and/or 
"swiotlb=force" debugging, potentially).

Also whatever we do for this really wants to tie in with the nascent 
TDISP stuff as well, since we definitely don't want to end up with more 
than one notion of whether a device is in a trusted/locked/private/etc. 
vs. unlocked/shared/etc. state with respect to DMA (or indeed anything 
else if we can avoid it).

Thanks,
Robin.

> Signed-off-by: Roman Kisel <romank@linux.microsoft.com>
> ---
>   arch/x86/mm/mem_encrypt.c  | 3 +++
>   include/linux/device.h     | 8 ++++++++
>   include/linux/dma-direct.h | 3 +++
>   include/linux/swiotlb.h    | 3 +++
>   4 files changed, 17 insertions(+)
> 
> diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
> index 95bae74fdab2..6349a02a1da3 100644
> --- a/arch/x86/mm/mem_encrypt.c
> +++ b/arch/x86/mm/mem_encrypt.c
> @@ -19,6 +19,9 @@
>   /* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */
>   bool force_dma_unencrypted(struct device *dev)
>   {
> +	if (dev->use_priv_pages_for_io)
> +		return false;
> +
>   	/*
>   	 * For SEV, all DMA must be to unencrypted addresses.
>   	 */
> diff --git a/include/linux/device.h b/include/linux/device.h
> index 80a5b3268986..4aa4a6fd9580 100644
> --- a/include/linux/device.h
> +++ b/include/linux/device.h
> @@ -725,6 +725,8 @@ struct device_physical_location {
>    * @dma_skip_sync: DMA sync operations can be skipped for coherent buffers.
>    * @dma_iommu: Device is using default IOMMU implementation for DMA and
>    *		doesn't rely on dma_ops structure.
> + * @use_priv_pages_for_io: Device is using private pages for I/O, no need to
> + *		bounce-buffer.
>    *
>    * At the lowest level, every device in a Linux system is represented by an
>    * instance of struct device. The device structure contains the information
> @@ -843,6 +845,7 @@ struct device {
>   #ifdef CONFIG_IOMMU_DMA
>   	bool			dma_iommu:1;
>   #endif
> +	bool			use_priv_pages_for_io:1;
>   };
>   
>   /**
> @@ -1079,6 +1082,11 @@ static inline bool dev_removable_is_valid(struct device *dev)
>   	return dev->removable != DEVICE_REMOVABLE_NOT_SUPPORTED;
>   }
>   
> +static inline bool dev_priv_pages_for_io(struct device *dev)
> +{
> +	return dev->use_priv_pages_for_io;
> +}
> +
>   /*
>    * High level routines for use by the bus drivers
>    */
> diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
> index d7e30d4f7503..b096369f847e 100644
> --- a/include/linux/dma-direct.h
> +++ b/include/linux/dma-direct.h
> @@ -94,6 +94,9 @@ static inline dma_addr_t phys_to_dma_unencrypted(struct device *dev,
>    */
>   static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
>   {
> +	if (dev_priv_pages_for_io(dev))
> +		return phys_to_dma_unencrypted(dev, paddr);
> +
>   	return __sme_set(phys_to_dma_unencrypted(dev, paddr));
>   }
>   
> diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
> index 3dae0f592063..35ee10641b42 100644
> --- a/include/linux/swiotlb.h
> +++ b/include/linux/swiotlb.h
> @@ -173,6 +173,9 @@ static inline bool is_swiotlb_force_bounce(struct device *dev)
>   {
>   	struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
>   
> +	if (dev_priv_pages_for_io(dev))
> +		return false;
> +
>   	return mem && mem->force_bounce;
>   }
>
Roman Kisel April 9, 2025, 4:44 p.m. UTC | #4
On 4/9/2025 9:03 AM, Robin Murphy wrote:
> On 2025-04-09 1:08 am, Roman Kisel wrote:
>> Bounce-buffering makes the system spend more time copying
>> I/O data. When the I/O transaction take place between
>> a confidential and a non-confidential endpoints, there is
>> no other way around.
>>
>> Introduce a device bitfield to indicate that the device
>> doesn't need to perform bounce buffering. The capable
>> device may employ it to save on copying data around.
> 
> It's not so much about bounce buffering, it's more fundamentally about 
> whether the device is trusted and able to access private memory at all 
> or not. And performance is hardly the biggest concern either - if you do 
> trust a device to operate on confidential data in private memory, then 
> surely it is crucial to actively *prevent* that data ever getting into 
> shared SWIOTLB pages where anyone else could also get at it. At worst 
> that means CoCo VMs might need an *additional* non-shared SWIOTLB to 
> support trusted devices with addressing limitations (and/or 
> "swiotlb=force" debugging, potentially).

Thanks, I should've highlighted that facet most certainly!

> 
> Also whatever we do for this really wants to tie in with the nascent 
> TDISP stuff as well, since we definitely don't want to end up with more 
> than one notion of whether a device is in a trusted/locked/private/etc. 
> vs. unlocked/shared/etc. state with respect to DMA (or indeed anything 
> else if we can avoid it).

Wouldn't TDISP be per-device as well? In which case, a flag would be
needed just as being added in this patch.

Although, there must be a difference between a device with TDISP where
the flag would be the indication of the feature, and this code where the
driver may flip that back and forth...

Do you feel this is shoehorned in `struct device`? I couldn't find an
appropriate private (== opaque pointer) part in the structure to store
that bit (`struct device_private` wouldn't fit the bill) and looked like
adding it to the struct itself would do no harm. However, my read of the
room is that folks see that as dubious :)

What would be your opinion on where to store that flag to tie together
its usage in the Hyper-V SCSI and not bounce-buffering?

> 
> Thanks,
> Robin.
> 
>> Signed-off-by: Roman Kisel <romank@linux.microsoft.com>
>> ---
>>   arch/x86/mm/mem_encrypt.c  | 3 +++
>>   include/linux/device.h     | 8 ++++++++
>>   include/linux/dma-direct.h | 3 +++
>>   include/linux/swiotlb.h    | 3 +++
>>   4 files changed, 17 insertions(+)
>>
>> diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
>> index 95bae74fdab2..6349a02a1da3 100644
>> --- a/arch/x86/mm/mem_encrypt.c
>> +++ b/arch/x86/mm/mem_encrypt.c
>> @@ -19,6 +19,9 @@
>>   /* Override for DMA direct allocation check - 
>> ARCH_HAS_FORCE_DMA_UNENCRYPTED */
>>   bool force_dma_unencrypted(struct device *dev)
>>   {
>> +    if (dev->use_priv_pages_for_io)
>> +        return false;
>> +
>>       /*
>>        * For SEV, all DMA must be to unencrypted addresses.
>>        */
>> diff --git a/include/linux/device.h b/include/linux/device.h
>> index 80a5b3268986..4aa4a6fd9580 100644
>> --- a/include/linux/device.h
>> +++ b/include/linux/device.h
>> @@ -725,6 +725,8 @@ struct device_physical_location {
>>    * @dma_skip_sync: DMA sync operations can be skipped for coherent 
>> buffers.
>>    * @dma_iommu: Device is using default IOMMU implementation for DMA and
>>    *        doesn't rely on dma_ops structure.
>> + * @use_priv_pages_for_io: Device is using private pages for I/O, no 
>> need to
>> + *        bounce-buffer.
>>    *
>>    * At the lowest level, every device in a Linux system is 
>> represented by an
>>    * instance of struct device. The device structure contains the 
>> information
>> @@ -843,6 +845,7 @@ struct device {
>>   #ifdef CONFIG_IOMMU_DMA
>>       bool            dma_iommu:1;
>>   #endif
>> +    bool            use_priv_pages_for_io:1;
>>   };
>>   /**
>> @@ -1079,6 +1082,11 @@ static inline bool 
>> dev_removable_is_valid(struct device *dev)
>>       return dev->removable != DEVICE_REMOVABLE_NOT_SUPPORTED;
>>   }
>> +static inline bool dev_priv_pages_for_io(struct device *dev)
>> +{
>> +    return dev->use_priv_pages_for_io;
>> +}
>> +
>>   /*
>>    * High level routines for use by the bus drivers
>>    */
>> diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
>> index d7e30d4f7503..b096369f847e 100644
>> --- a/include/linux/dma-direct.h
>> +++ b/include/linux/dma-direct.h
>> @@ -94,6 +94,9 @@ static inline dma_addr_t 
>> phys_to_dma_unencrypted(struct device *dev,
>>    */
>>   static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t 
>> paddr)
>>   {
>> +    if (dev_priv_pages_for_io(dev))
>> +        return phys_to_dma_unencrypted(dev, paddr);
>> +
>>       return __sme_set(phys_to_dma_unencrypted(dev, paddr));
>>   }
>> diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
>> index 3dae0f592063..35ee10641b42 100644
>> --- a/include/linux/swiotlb.h
>> +++ b/include/linux/swiotlb.h
>> @@ -173,6 +173,9 @@ static inline bool is_swiotlb_force_bounce(struct 
>> device *dev)
>>   {
>>       struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
>> +    if (dev_priv_pages_for_io(dev))
>> +        return false;
>> +
>>       return mem && mem->force_bounce;
>>   }
>
Dan Williams April 9, 2025, 11:30 p.m. UTC | #5
[ add linux-coco ]

Roman Kisel wrote:
> 
> 
> On 4/9/2025 9:03 AM, Robin Murphy wrote:
> > On 2025-04-09 1:08 am, Roman Kisel wrote:
> >> Bounce-buffering makes the system spend more time copying
> >> I/O data. When the I/O transaction take place between
> >> a confidential and a non-confidential endpoints, there is
> >> no other way around.
> >>
> >> Introduce a device bitfield to indicate that the device
> >> doesn't need to perform bounce buffering. The capable
> >> device may employ it to save on copying data around.
> > 
> > It's not so much about bounce buffering, it's more fundamentally about 
> > whether the device is trusted and able to access private memory at all 
> > or not. And performance is hardly the biggest concern either - if you do 
> > trust a device to operate on confidential data in private memory, then 
> > surely it is crucial to actively *prevent* that data ever getting into 
> > shared SWIOTLB pages where anyone else could also get at it. At worst 
> > that means CoCo VMs might need an *additional* non-shared SWIOTLB to 
> > support trusted devices with addressing limitations (and/or 
> > "swiotlb=force" debugging, potentially).
> 
> Thanks, I should've highlighted that facet most certainly!

One would hope that no one is building a modern device with trusted I/O
capability, *and* with a swiotlb addressing dependency. However, I agree
that a non-shared swiotlb would be needed in such a scenario.

Otherwise the policy around "a device should not even be allowed to
bounce buffer any private page" is a userspace responsibility to either
not load the driver, not release secrets to this CVM, or otherwise make
sure the device is only ever bounce buffering private memory that does
not contain secrets.

> > Also whatever we do for this really wants to tie in with the nascent 
> > TDISP stuff as well, since we definitely don't want to end up with more 
> > than one notion of whether a device is in a trusted/locked/private/etc. 
> > vs. unlocked/shared/etc. state with respect to DMA (or indeed anything 
> > else if we can avoid it).
> 
> Wouldn't TDISP be per-device as well? In which case, a flag would be
> needed just as being added in this patch.
> 
> Although, there must be a difference between a device with TDISP where
> the flag would be the indication of the feature, and this code where the
> driver may flip that back and forth...
> 
> Do you feel this is shoehorned in `struct device`? I couldn't find an
> appropriate private (== opaque pointer) part in the structure to store
> that bit (`struct device_private` wouldn't fit the bill) and looked like
> adding it to the struct itself would do no harm. However, my read of the
> room is that folks see that as dubious :)
> 
> What would be your opinion on where to store that flag to tie together
> its usage in the Hyper-V SCSI and not bounce-buffering?

The name and location of a flag bit is not the issue, it is the common
expectation of how and when that flag is set.

tl;dr Linux likely needs a "private_accepted" flag for devices

Like Christoph said, a driver really has no business opting itself into
different DMA addressing domains. For TDISP we are also being careful to
make sure that flipping a device from shared to private is a suitably
violent event. This is because the Linux DMA layer does not have a
concept of allowing a device to have mappings from two different
addressing domains simultaneously.

In the current TDISP proposal, a device starts in shared mode and only
after validating all of the launch state of the CVM, device
measurements, and a device interface report is it granted access to
private memory. Without dumping a bunch of golden measurement data into
the kernel that validation can really only be performed by userspace.

Enter this vmbus proposal that wants to emulate devices with a paravisor
that is presumably within the TCB at launch, but the kernel can not
really trust that until a "launch state of the CVM + paravisor"
attestation event.

Like PCIe TDISP the capability of this device to access private memory
is a property of the bus and the iommu. However, acceptance of the
device into private operation is a willful policy action. It needs to
validate not only the device provenance and state, but also the Linux
DMA layer requirements of not holding shared or swiotlb mappings over
the "entry into private mode operation" event.

All that said, I would advocate to require a userspace driven "device
accept" event for all devices, not just TDISP, that want to enter
private operation. Maybe later circle back to figure out if there is a
lingering need for accepting devices via golden measurement, or other
> means, to skip the userspace round-trip dependency.

A "private_capable" flag might also make sense, but that is really a
property of a bus that need not be carried necessarily in 'struct
device'.

So for this confidential vmbus SCSI device to mesh with the mechanisms
needed for TDISP I would expect it continues to launch in swiotlb mode
by default. Export an attribute via hv_bus->dev_groups to indicate that
the device is "private_capable" and then require userspace to twiddle a
private_accepted flag with some safety for in-flight DMA.
Michael Kelley April 10, 2025, 1:16 a.m. UTC | #6
From: Dan Williams <dan.j.williams@intel.com> Sent: Wednesday, April 9, 2025 4:30 PM
> 
> [ add linux-coco ]
> 
> Roman Kisel wrote:
> >
> >
> > On 4/9/2025 9:03 AM, Robin Murphy wrote:
> > > On 2025-04-09 1:08 am, Roman Kisel wrote:
> > >> Bounce-buffering makes the system spend more time copying
> > >> I/O data. When the I/O transaction take place between
> > >> a confidential and a non-confidential endpoints, there is
> > >> no other way around.
> > >>
> > >> Introduce a device bitfield to indicate that the device
> > >> doesn't need to perform bounce buffering. The capable
> > >> device may employ it to save on copying data around.
> > >
> > > It's not so much about bounce buffering, it's more fundamentally about
> > > whether the device is trusted and able to access private memory at all
> > > or not. And performance is hardly the biggest concern either - if you do
> > > trust a device to operate on confidential data in private memory, then
> > > surely it is crucial to actively *prevent* that data ever getting into
> > > shared SWIOTLB pages where anyone else could also get at it. At worst
> > > that means CoCo VMs might need an *additional* non-shared SWIOTLB to
> > > support trusted devices with addressing limitations (and/or
> > > "swiotlb=force" debugging, potentially).
> >
> > Thanks, I should've highlighted that facet most certainly!
> 
> One would hope that no one is building a modern device with trusted I/O
> capability, *and* with a swiotlb addressing dependency. However, I agree
> that a non-shared swiotlb would be needed in such a scenario.
> 
> Otherwise the policy around "a device should not even be allowed to
> bounce buffer any private page" is a userspace responsibility to either
> not load the driver, not release secrets to this CVM, or otherwise make
> sure the device is only ever bounce buffering private memory that does
> not contain secrets.
> 
> > > Also whatever we do for this really wants to tie in with the nascent
> > > TDISP stuff as well, since we definitely don't want to end up with more
> > > than one notion of whether a device is in a trusted/locked/private/etc.
> > > vs. unlocked/shared/etc. state with respect to DMA (or indeed anything
> > > else if we can avoid it).
> >
> > Wouldn't TDISP be per-device as well? In which case, a flag would be
> > needed just as being added in this patch.
> >
> > Although, there must be a difference between a device with TDISP where
> > the flag would be the indication of the feature, and this code where the
> > driver may flip that back and forth...
> >
> > Do you feel this is shoehorned in `struct device`? I couldn't find an
> > appropriate private (== opaque pointer) part in the structure to store
> > that bit (`struct device_private` wouldn't fit the bill) and looked like
> > adding it to the struct itself would do no harm. However, my read of the
> > room is that folks see that as dubious :)
> >
> > What would be your opinion on where to store that flag to tie together
> > its usage in the Hyper-V SCSI and not bounce-buffering?
> 
> The name and location of a flag bit is not the issue, it is the common
> expectation of how and when that flag is set.
> 
> tl;dr Linux likely needs a "private_accepted" flag for devices
> 
> Like Christoph said, a driver really has no business opting itself into
> different DMA addressing domains. For TDISP we are also being careful to
> make sure that flipping a device from shared to private is a suitably
> violent event. This is because the Linux DMA layer does not have a
> concept of allowing a device to have mappings from two different
> addressing domains simultaneously.
> 
> In the current TDISP proposal, a device starts in shared mode and only
> after validating all of the launch state of the CVM, device
> measurements, and a device interface report is it granted access to
> private memory. Without dumping a bunch of golden measurement data into
> the kernel that validation can really only be performed by userspace.
> 
> Enter this vmbus proposal that wants to emulate devices with a paravisor
> that is presumably within the TCB at launch, but the kernel can not
> really trust that until a "launch state of the CVM + paravisor"
> attestation event.
> 
> Like PCIe TDISP the capability of this device to access private memory
> is a property of the bus and the iommu. However, acceptance of the
> device into private operation is a willful policy action. It needs to
> validate not only the device provenance and state, but also the Linux
> DMA layer requirements of not holding shared or swiotlb mappings over
> the "entry into private mode operation" event.

To flesh out the swiotlb aspect a bit, once a TDISP device has
gone private, how does it prevent the DMA layer from ever doing
bounce buffering through the swiotlb? My understanding is that
the DMA layer doesn't make any promises to not do bounce buffering.
Given the vagaries of memory alignment, perhaps add in a virtual
IOMMU, etc., it seems like a device driver can't necessarily predict
what DMA operations might result in bounce buffering. Does TDISP
anticipate needing a formal way to tell the DMA layer "don't bounce
buffer"? (and return an error instead?) Or would there be a separate
swiotlb memory pool that is private memory so that bounce buffer
could be done when necessary and still maintain confidentiality?

Just wondering if there's any thinking on this topic ...

Thanks,

Michael
Christoph Hellwig April 10, 2025, 7:21 a.m. UTC | #7
On Wed, Apr 09, 2025 at 09:44:03AM -0700, Roman Kisel wrote:
> Do you feel this is shoehorned in `struct device`? I couldn't find an
> appropriate private (== opaque pointer) part in the structure to store
> that bit (`struct device_private` wouldn't fit the bill) and looked like
> adding it to the struct itself would do no harm. However, my read of the
> room is that folks see that as dubious :)

We'll need per-device information.  But it is much higher level than a
need bounce buffer flag.
Christoph Hellwig April 10, 2025, 7:23 a.m. UTC | #8
On Wed, Apr 09, 2025 at 04:30:17PM -0700, Dan Williams wrote:
> > Thanks, I should've highlighted that facet most certainly!
> 
> One would hope that no one is building a modern device with trusted I/O
> capability, *and* with a swiotlb addressing dependency. However, I agree
> that a non-shared swiotlb would be needed in such a scenario.

Hope is never a good idea when dealing with hardware :(  PCIe already
requires no addressing limitations, and programming interface specs
like NVMe double down on that.  But at least one big hyperscaler still
managed to build such a device.

Also even if the peripheral device is not addressing limited, the root
port or interconnect might still be, we've seen quite a lot of that.
Roman Kisel April 10, 2025, 3:16 p.m. UTC | #9
On 4/10/2025 12:21 AM, Christoph Hellwig wrote:
> On Wed, Apr 09, 2025 at 09:44:03AM -0700, Roman Kisel wrote:
>> Do you feel this is shoehorned in `struct device`? I couldn't find an
>> appropriate private (== opaque pointer) part in the structure to store
>> that bit (`struct device_private` wouldn't fit the bill) and looked like
>> adding it to the struct itself would do no harm. However, my read of the
>> room is that folks see that as dubious :)
> 
> We'll need per-device information.  But it is much higher level than a
> need bounce buffer flag.
> 

I see, thanks for the explanation!
Jason Gunthorpe April 10, 2025, 11:44 p.m. UTC | #10
On Thu, Apr 10, 2025 at 09:23:54AM +0200, Christoph Hellwig wrote:
> On Wed, Apr 09, 2025 at 04:30:17PM -0700, Dan Williams wrote:
> > > Thanks, I should've highlighted that facet most certainly!
> > 
> > One would hope that no one is building a modern device with trusted I/O
> > capability, *and* with a swiotlb addressing dependency. However, I agree
> > that a non-shared swiotlb would be needed in such a scenario.
> 
> Hope is never a good idea when dealing with hardware :(  PCIe already
> requires no addressing limitations, and programming interface specs
> like NVMe double down on that.  But at least one big hyperscaler still
> managed to build such a device.
> 
> Also even if the periphal device is not addressing limited, the root
> port or interconnect might still be, we've seen quite a lot of that.

Still it would be very obnoxious for someone to build a CC VM platform
where CC DMA devices can't access all the guest physical memory in the
CC address map :\

Keeping in mind that that the CC address map is being created by using
the CPU MMU and the CPU IOMMU so it is entirely virtual and can be
configured to match most problems the devices might have.

Too much memory would be the main issue..

IMHO I wouldn't implement secure SWIOTLB until someone does such a
foolish thing, and I'd make them do the work as some kind of penance
:P

Jason
Jason Gunthorpe April 10, 2025, 11:50 p.m. UTC | #11
On Wed, Apr 09, 2025 at 04:30:17PM -0700, Dan Williams wrote:

> Like Christoph said, a driver really has no business opting itself into
> different DMA addressing domains. For TDISP we are also being careful to
> make sure that flipping a device from shared to private is a suitably
> violent event. This is because the Linux DMA layer does not have a
> concept of allowing a device to have mappings from two different
> addressing domains simultaneously.

And this is a very important point, several of the architectures have
two completely independent iommu tables, and maybe even completely
different IOMMU instances for trusted and non-trusted DMA traffic.

I expect configurations where trusted traffic is translated through
the vIOMMU while non-trusted traffic is locked to an identity
translation.

There are more issue here than just swiotlb :\

> A "private_capable" flag might also make sense, but that is really a
> property of a bus that need not be carried necessarily in 'struct
> device'.

However it works, it should be done before the driver is probed and
remain stable for the duration of the driver attachment. From the
iommu side the correct iommu domain, on the correct IOMMU instance to
handle the expected traffic should be setup as the DMA API's iommu
domain.

Jason
Dan Williams April 11, 2025, 12:03 a.m. UTC | #12
Michael Kelley wrote:
> From: Dan Williams <dan.j.williams@intel.com> Sent: Wednesday, April 9, 2025 4:30 PM
[..]
> > Like PCIe TDISP the capability of this device to access private memory
> > is a property of the bus and the iommu. However, acceptance of the
> > device into private operation is a willful policy action. It needs to
> > validate not only the device provenance and state, but also the Linux
> > DMA layer requirements of not holding shared or swiotlb mappings over
> > the "entry into private mode operation" event.
> 
> To flesh this out the swiotlb aspect a bit, once a TDISP device has
> gone private, how does it prevent the DMA layer from ever doing
> bounce buffering through the swiotlb? My understanding is that
> the DMA layer doesn't make any promises to not do bounce buffering.
> Given the vagaries of memory alignment, perhaps add in a virtual
> IOMMU, etc., it seems like a device driver can't necessarily predict
> what DMA operations might result in bounce buffering. Does TDISP
> anticipate needing a formal way to tell the DMA layer "don't bounce
> buffer"? (and return an error instead?) Or would there be a separate
> swiotlb memory pool that is private memory so that bounce buffer
> could be done when necessary and still maintain confidentiality?

I expect step 1 is just add some rude errors / safety for attempting to
convert the mode of a device while it has any DMA mappings established,
and explicit failures for attempts to fallback to swiotlb for
'private_accepted' devices.

The easiest way to enforce that a device does not cross the
shared/private boundary while DMA mappings are live is to simply not
allow that transition while a driver is bound (i.e. "dev->driver" is
non-NULL).
diff mbox series

Patch

diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 95bae74fdab2..6349a02a1da3 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -19,6 +19,9 @@ 
 /* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */
 bool force_dma_unencrypted(struct device *dev)
 {
+	if (dev->use_priv_pages_for_io)
+		return false;
+
 	/*
 	 * For SEV, all DMA must be to unencrypted addresses.
 	 */
diff --git a/include/linux/device.h b/include/linux/device.h
index 80a5b3268986..4aa4a6fd9580 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -725,6 +725,8 @@  struct device_physical_location {
  * @dma_skip_sync: DMA sync operations can be skipped for coherent buffers.
  * @dma_iommu: Device is using default IOMMU implementation for DMA and
  *		doesn't rely on dma_ops structure.
+ * @use_priv_pages_for_io: Device is using private pages for I/O, no need to
+ *		bounce-buffer.
  *
  * At the lowest level, every device in a Linux system is represented by an
  * instance of struct device. The device structure contains the information
@@ -843,6 +845,7 @@  struct device {
 #ifdef CONFIG_IOMMU_DMA
 	bool			dma_iommu:1;
 #endif
+	bool			use_priv_pages_for_io:1;
 };
 
 /**
@@ -1079,6 +1082,11 @@  static inline bool dev_removable_is_valid(struct device *dev)
 	return dev->removable != DEVICE_REMOVABLE_NOT_SUPPORTED;
 }
 
+static inline bool dev_priv_pages_for_io(struct device *dev)
+{
+	return dev->use_priv_pages_for_io;
+}
+
 /*
  * High level routines for use by the bus drivers
  */
diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
index d7e30d4f7503..b096369f847e 100644
--- a/include/linux/dma-direct.h
+++ b/include/linux/dma-direct.h
@@ -94,6 +94,9 @@  static inline dma_addr_t phys_to_dma_unencrypted(struct device *dev,
  */
 static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
 {
+	if (dev_priv_pages_for_io(dev))
+		return phys_to_dma_unencrypted(dev, paddr);
+
 	return __sme_set(phys_to_dma_unencrypted(dev, paddr));
 }
 
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 3dae0f592063..35ee10641b42 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -173,6 +173,9 @@  static inline bool is_swiotlb_force_bounce(struct device *dev)
 {
 	struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
 
+	if (dev_priv_pages_for_io(dev))
+		return false;
+
 	return mem && mem->force_bounce;
 }