diff mbox series

[v4,11/15] pci: Add pci_iomap_shared{,_range}

Message ID 20210805005218.2912076-12-sathyanarayanan.kuppuswamy@linux.intel.com (mailing list archive)
State Not Applicable
Headers show
Series Add TDX Guest Support (shared-mm support) | expand

Commit Message

Kuppuswamy Sathyanarayanan Aug. 5, 2021, 12:52 a.m. UTC
From: Andi Kleen <ak@linux.intel.com>

Add a new variant of pci_iomap for mapping all PCI resources
of a devices as shared memory with a hypervisor in a confidential
guest.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
---
 include/asm-generic/pci_iomap.h |  6 +++++
 lib/pci_iomap.c                 | 46 +++++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+)

Comments

Christoph Hellwig Aug. 13, 2021, 8:02 a.m. UTC | #1
On Wed, Aug 04, 2021 at 05:52:14PM -0700, Kuppuswamy Sathyanarayanan wrote:
> +extern void __iomem *pci_iomap_shared(struct pci_dev *dev, int bar,
> +				      unsigned long max);
> +extern void __iomem *pci_iomap_shared_range(struct pci_dev *dev, int bar,
> +					    unsigned long offset,
> +					    unsigned long maxlen);

No need for externs here.

> +/**
> + * pci_iomap_shared_range - create a virtual shared mapping cookie for a
> + *                          PCI BAR

This reads like completely garbage from a markow chain.
Michael S. Tsirkin Aug. 23, 2021, 11:56 p.m. UTC | #2
On Wed, Aug 04, 2021 at 05:52:14PM -0700, Kuppuswamy Sathyanarayanan wrote:
> From: Andi Kleen <ak@linux.intel.com>
> 
> Add a new variant of pci_iomap for mapping all PCI resources
> of a devices as shared memory with a hypervisor in a confidential
> guest.
> 
> Signed-off-by: Andi Kleen <ak@linux.intel.com>
> Signed-off-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>

I'm a bit puzzled by this part. So why should the guest *not* map
pci memory as shared? And if the answer is never (as it seems to be)
then why not just make regular pci_iomap DTRT?

Thanks!

> ---
>  include/asm-generic/pci_iomap.h |  6 +++++
>  lib/pci_iomap.c                 | 46 +++++++++++++++++++++++++++++++++
>  2 files changed, 52 insertions(+)
> 
> diff --git a/include/asm-generic/pci_iomap.h b/include/asm-generic/pci_iomap.h
> index d4f16dcc2ed7..0178ddd7ad88 100644
> --- a/include/asm-generic/pci_iomap.h
> +++ b/include/asm-generic/pci_iomap.h
> @@ -18,6 +18,12 @@ extern void __iomem *pci_iomap_range(struct pci_dev *dev, int bar,
>  extern void __iomem *pci_iomap_wc_range(struct pci_dev *dev, int bar,
>  					unsigned long offset,
>  					unsigned long maxlen);
> +extern void __iomem *pci_iomap_shared(struct pci_dev *dev, int bar,
> +				      unsigned long max);
> +extern void __iomem *pci_iomap_shared_range(struct pci_dev *dev, int bar,
> +					    unsigned long offset,
> +					    unsigned long maxlen);
> +
>  /* Create a virtual mapping cookie for a port on a given PCI device.
>   * Do not call this directly, it exists to make it easier for architectures
>   * to override */
> diff --git a/lib/pci_iomap.c b/lib/pci_iomap.c
> index 6251c3f651c6..b04e8689eab3 100644
> --- a/lib/pci_iomap.c
> +++ b/lib/pci_iomap.c
> @@ -25,6 +25,11 @@ static void __iomem *map_ioremap_wc(phys_addr_t addr, size_t size)
>  	return ioremap_wc(addr, size);
>  }
>  
> +static void __iomem *map_ioremap_shared(phys_addr_t addr, size_t size)
> +{
> +	return ioremap_shared(addr, size);
> +}
> +
>  static void __iomem *pci_iomap_range_map(struct pci_dev *dev,
>  					 int bar,
>  					 unsigned long offset,
> @@ -101,6 +106,47 @@ void __iomem *pci_iomap_wc_range(struct pci_dev *dev,
>  }
>  EXPORT_SYMBOL_GPL(pci_iomap_wc_range);
>  
> +/**
> + * pci_iomap_shared_range - create a virtual shared mapping cookie for a
> + *                          PCI BAR
> + * @dev: PCI device that owns the BAR
> + * @bar: BAR number
> + * @offset: map memory at the given offset in BAR
> + * @maxlen: max length of the memory to map
> + *
> + * Remap a pci device's resources shared in a confidential guest.
> + * For more details see pci_iomap_range's documentation.
> + *
> + * @maxlen specifies the maximum length to map. To get access to
> + * the complete BAR from offset to the end, pass %0 here.
> + */
> +void __iomem *pci_iomap_shared_range(struct pci_dev *dev, int bar,
> +				     unsigned long offset, unsigned long maxlen)
> +{
> +	return pci_iomap_range_map(dev, bar, offset, maxlen,
> +				   map_ioremap_shared);
> +}
> +EXPORT_SYMBOL_GPL(pci_iomap_shared_range);
> +
> +/**
> + * pci_iomap_shared - create a virtual shared mapping cookie for a PCI BAR
> + * @dev: PCI device that owns the BAR
> + * @bar: BAR number
> + * @maxlen: length of the memory to map
> + *
> + * See pci_iomap for details. This function creates a shared mapping
> + * with the host for confidential hosts.
> + *
> + * @maxlen specifies the maximum length to map. To get access to the
> + * complete BAR without checking for its length first, pass %0 here.
> + */
> +void __iomem *pci_iomap_shared(struct pci_dev *dev, int bar,
> +			       unsigned long maxlen)
> +{
> +	return pci_iomap_shared_range(dev, bar, 0, maxlen);
> +}
> +EXPORT_SYMBOL_GPL(pci_iomap_shared);
> +
>  /**
>   * pci_iomap - create a virtual mapping cookie for a PCI BAR
>   * @dev: PCI device that owns the BAR
> -- 
> 2.25.1
Kuppuswamy Sathyanarayanan Aug. 24, 2021, 12:30 a.m. UTC | #3
On 8/23/21 4:56 PM, Michael S. Tsirkin wrote:
>> Add a new variant of pci_iomap for mapping all PCI resources
>> of a devices as shared memory with a hypervisor in a confidential
>> guest.
>>
>> Signed-off-by: Andi Kleen<ak@linux.intel.com>
>> Signed-off-by: Kuppuswamy Sathyanarayanan<sathyanarayanan.kuppuswamy@linux.intel.com>
> I'm a bit puzzled by this part. So why should the guest*not*  map
> pci memory as shared? And if the answer is never (as it seems to be)
> then why not just make regular pci_iomap DTRT?

It is in the context of confidential guest (where VMM is un-trusted). So
we don't want to make all PCI resource as shared. It should be allowed
only for hardened drivers/devices.
Dan Williams Aug. 24, 2021, 1:04 a.m. UTC | #4
On Mon, Aug 23, 2021 at 5:31 PM Kuppuswamy, Sathyanarayanan
<sathyanarayanan.kuppuswamy@linux.intel.com> wrote:
>
>
>
> On 8/23/21 4:56 PM, Michael S. Tsirkin wrote:
> >> Add a new variant of pci_iomap for mapping all PCI resources
> >> of a devices as shared memory with a hypervisor in a confidential
> >> guest.
> >>
> >> Signed-off-by: Andi Kleen<ak@linux.intel.com>
> >> Signed-off-by: Kuppuswamy Sathyanarayanan<sathyanarayanan.kuppuswamy@linux.intel.com>
> > I'm a bit puzzled by this part. So why should the guest*not*  map
> > pci memory as shared? And if the answer is never (as it seems to be)
> > then why not just make regular pci_iomap DTRT?
>
> It is in the context of confidential guest (where VMM is un-trusted). So
> we don't want to make all PCI resource as shared. It should be allowed
> only for hardened drivers/devices.

That's confusing, isn't device authorization what keeps unaudited
drivers from loading against untrusted devices? I'm feeling like
Michael that this should be a detail that drivers need not care about
explicitly, in which case it does not need to be exported because the
detail can be buried in lower levels.

Note, I specifically said "unaudited", not "hardened" because as Greg
mentioned the kernel must trust drivers, its devices that may not be
trusted.
Andi Kleen Aug. 24, 2021, 2:14 a.m. UTC | #5
On 8/23/2021 6:04 PM, Dan Williams wrote:
> On Mon, Aug 23, 2021 at 5:31 PM Kuppuswamy, Sathyanarayanan
> <sathyanarayanan.kuppuswamy@linux.intel.com> wrote:
>>
>>
>> On 8/23/21 4:56 PM, Michael S. Tsirkin wrote:
>>>> Add a new variant of pci_iomap for mapping all PCI resources
>>>> of a devices as shared memory with a hypervisor in a confidential
>>>> guest.
>>>>
>>>> Signed-off-by: Andi Kleen<ak@linux.intel.com>
>>>> Signed-off-by: Kuppuswamy Sathyanarayanan<sathyanarayanan.kuppuswamy@linux.intel.com>
>>> I'm a bit puzzled by this part. So why should the guest*not*  map
>>> pci memory as shared? And if the answer is never (as it seems to be)
>>> then why not just make regular pci_iomap DTRT?
>> It is in the context of confidential guest (where VMM is un-trusted). So
>> we don't want to make all PCI resource as shared. It should be allowed
>> only for hardened drivers/devices.
> That's confusing, isn't device authorization what keeps unaudited
> drivers from loading against untrusted devices? I'm feeling like
> Michael that this should be a detail that drivers need not care about
> explicitly, in which case it does not need to be exported because the
> detail can be buried in lower levels.

We originally made it default (similar to AMD), but it during code audit 
we found a lot of drivers who do ioremap early outside the probe 
function. Since it would be difficult to change them all we made it 
opt-in, which ensures that only drivers that have been enabled can talk 
with the host at all and can't be attacked. That made the problem of 
hardening all these drivers a lot more practical.

Currently we only really need virtio and MSI-X shared, so for changing 
two places in the tree you avoid a lot of headache elsewhere.

Note there is still a command line option to override if you want to 
allow and load other drivers.

-Andi
Christoph Hellwig Aug. 24, 2021, 7:07 a.m. UTC | #6
On Mon, Aug 23, 2021 at 05:30:54PM -0700, Kuppuswamy, Sathyanarayanan wrote:
> 
> 
> On 8/23/21 4:56 PM, Michael S. Tsirkin wrote:
> > > Add a new variant of pci_iomap for mapping all PCI resources
> > > of a devices as shared memory with a hypervisor in a confidential
> > > guest.
> > > 
> > > Signed-off-by: Andi Kleen<ak@linux.intel.com>
> > > Signed-off-by: Kuppuswamy Sathyanarayanan<sathyanarayanan.kuppuswamy@linux.intel.com>
> > I'm a bit puzzled by this part. So why should the guest*not*  map
> > pci memory as shared? And if the answer is never (as it seems to be)
> > then why not just make regular pci_iomap DTRT?
> 
> It is in the context of confidential guest (where VMM is un-trusted). So
> we don't want to make all PCI resource as shared. It should be allowed
> only for hardened drivers/devices.

Well, assuming the host can do any damage when mapped shared that also
means not mapping it shared will completely break the drivers.
Michael S. Tsirkin Aug. 24, 2021, 9:12 a.m. UTC | #7
On Mon, Aug 23, 2021 at 05:30:54PM -0700, Kuppuswamy, Sathyanarayanan wrote:
> 
> 
> On 8/23/21 4:56 PM, Michael S. Tsirkin wrote:
> > > Add a new variant of pci_iomap for mapping all PCI resources
> > > of a devices as shared memory with a hypervisor in a confidential
> > > guest.
> > > 
> > > Signed-off-by: Andi Kleen<ak@linux.intel.com>
> > > Signed-off-by: Kuppuswamy Sathyanarayanan<sathyanarayanan.kuppuswamy@linux.intel.com>
> > I'm a bit puzzled by this part. So why should the guest*not*  map
> > pci memory as shared? And if the answer is never (as it seems to be)
> > then why not just make regular pci_iomap DTRT?
> 
> It is in the context of confidential guest (where VMM is un-trusted). So
> we don't want to make all PCI resource as shared. It should be allowed
> only for hardened drivers/devices.

I can't say this answers the question at all. PCI devices are part of
the VMM and so un-trusted. In particular PCI devices do not have
the key to decrypt memory. Therefore as far as I can see PCI resources
should not be encrypted.  I conclude they all should be marked
shared.

If I'm wrong can you please give an example of a PCI resource
that is encrypted?
Michael S. Tsirkin Aug. 24, 2021, 9:47 a.m. UTC | #8
On Mon, Aug 23, 2021 at 07:14:18PM -0700, Andi Kleen wrote:
> 
> On 8/23/2021 6:04 PM, Dan Williams wrote:
> > On Mon, Aug 23, 2021 at 5:31 PM Kuppuswamy, Sathyanarayanan
> > <sathyanarayanan.kuppuswamy@linux.intel.com> wrote:
> > > 
> > > 
> > > On 8/23/21 4:56 PM, Michael S. Tsirkin wrote:
> > > > > Add a new variant of pci_iomap for mapping all PCI resources
> > > > > of a devices as shared memory with a hypervisor in a confidential
> > > > > guest.
> > > > > 
> > > > > Signed-off-by: Andi Kleen<ak@linux.intel.com>
> > > > > Signed-off-by: Kuppuswamy Sathyanarayanan<sathyanarayanan.kuppuswamy@linux.intel.com>
> > > > I'm a bit puzzled by this part. So why should the guest*not*  map
> > > > pci memory as shared? And if the answer is never (as it seems to be)
> > > > then why not just make regular pci_iomap DTRT?
> > > It is in the context of confidential guest (where VMM is un-trusted). So
> > > we don't want to make all PCI resource as shared. It should be allowed
> > > only for hardened drivers/devices.
> > That's confusing, isn't device authorization what keeps unaudited
> > drivers from loading against untrusted devices? I'm feeling like
> > Michael that this should be a detail that drivers need not care about
> > explicitly, in which case it does not need to be exported because the
> > detail can be buried in lower levels.
> 
> We originally made it default (similar to AMD), but it during code audit we
> found a lot of drivers who do ioremap early outside the probe function.
> Since it would be difficult to change them all we made it opt-in, which
> ensures that only drivers that have been enabled can talk with the host at
> all and can't be attacked. That made the problem of hardening all these
> drivers a lot more practical.
> 
> Currently we only really need virtio and MSI-X shared, so for changing two
> places in the tree you avoid a lot of headache elsewhere.
> 
> Note there is still a command line option to override if you want to allow
> and load other drivers.
> 
> -Andi

I see. Hmm. It's a bit of a random thing to do it at the map time
though. E.g. DMA is all handled transparently behind the DMA API.
Hardening is much more than just replacing map with map_shared
and I suspect what you will end up with is basically
vendors replacing map with map shared to make things work
for their users and washing their hands.

I would say an explicit flag in the driver that says "hardened"
and refusing to init a non hardened one would be better.
Andi Kleen Aug. 24, 2021, 5:04 p.m. UTC | #9
On 8/24/2021 12:07 AM, Christoph Hellwig wrote:
> On Mon, Aug 23, 2021 at 05:30:54PM -0700, Kuppuswamy, Sathyanarayanan wrote:
>>
>> On 8/23/21 4:56 PM, Michael S. Tsirkin wrote:
>>>> Add a new variant of pci_iomap for mapping all PCI resources
>>>> of a devices as shared memory with a hypervisor in a confidential
>>>> guest.
>>>>
>>>> Signed-off-by: Andi Kleen<ak@linux.intel.com>
>>>> Signed-off-by: Kuppuswamy Sathyanarayanan<sathyanarayanan.kuppuswamy@linux.intel.com>
>>> I'm a bit puzzled by this part. So why should the guest*not*  map
>>> pci memory as shared? And if the answer is never (as it seems to be)
>>> then why not just make regular pci_iomap DTRT?
>> It is in the context of confidential guest (where VMM is un-trusted). So
>> we don't want to make all PCI resource as shared. It should be allowed
>> only for hardened drivers/devices.
> Well, assuming the host can do any damage when mapped shared that also
> means not mapping it shared will completely break the drivers.

There are several cases:

- We have driver filtering active to protect you against attacks from 
the host against unhardened drivers.

In this case the drivers not working is the intended behavior.

- There is an command allow list override for some new driver, but the 
driver is hardened and shared

The other drivers will still not work, but that's also the intended behavior

- Driver filtering is disabled or the allow list override is used to 
enable some non hardened/enabled driver

There is a command line option to override the ioremap sharing default, 
it will allow all drivers to do ioremap. We would really prefer to make 
it more finegrained, but it's not possible in this case. Other drivers 
are likely attackable.

- Driver filtering is disabled (allowing attacks on the drivers) and the 
command line option for forced sharing is set.

All drivers initialize and can talk to the host through MMIO. Lots of 
unhardened drivers are likely attackable.

-Andi
Andi Kleen Aug. 24, 2021, 5:20 p.m. UTC | #10
> I see. Hmm. It's a bit of a random thing to do it at the map time
> though. E.g. DMA is all handled transparently behind the DMA API.
> Hardening is much more than just replacing map with map_shared
> and I suspect what you will end up with is basically
> vendors replacing map with map shared to make things work
> for their users and washing their hands.

That concept exists too. There is a separate allow list for the drivers. 
So just adding shared to a driver is not enough, until it's also added 
to the allowlist

Users can of course chose to disable the allowlist, but they need to 
understand the security implications.


>
> I would say an explicit flag in the driver that says "hardened"
> and refusing to init a non hardened one would be better.


We have that too (that's the device filtering)

But the problem is that device filtering just stops the probe functions, 
not the initcalls, and lot of legacy drivers do MMIO interactions before 
going into probe. In some cases it's unavoidable because of the device 
doesn't have a separate enumeration mechanism it needs some kind of 
probing to even check for its existence And since we don't want to 
change all of them it's far safer to make the ioremap opt-in.


-Andi
Bjorn Helgaas Aug. 24, 2021, 6:55 p.m. UTC | #11
[+cc Rajat; I still don't know what "shared memory with a hypervisor
in a confidential guest" means, but now we're talking about hardened
drivers and allow lists, which Rajat is interested in]

On Tue, Aug 24, 2021 at 10:20:44AM -0700, Andi Kleen wrote:
> 
> > I see. Hmm. It's a bit of a random thing to do it at the map time
> > though. E.g. DMA is all handled transparently behind the DMA API.
> > Hardening is much more than just replacing map with map_shared
> > and I suspect what you will end up with is basically
> > vendors replacing map with map shared to make things work
> > for their users and washing their hands.
> 
> That concept exists too. There is a separate allow list for the drivers. So
> just adding shared to a driver is not enough, until it's also added to the
> allowlist
> 
> Users can of course chose to disable the allowlist, but they need to
> understand the security implications.
> 
> > I would say an explicit flag in the driver that says "hardened"
> > and refusing to init a non hardened one would be better.
> 
> We have that too (that's the device filtering)
> 
> But the problem is that device filtering just stops the probe functions, not
> the initcalls, and lot of legacy drivers do MMIO interactions before going
> into probe. In some cases it's unavoidable because of the device doesn't
> have a separate enumeration mechanism it needs some kind of probing to even
> check for its existence And since we don't want to change all of them it's
> far safer to make the ioremap opt-in.
> 
> 
> -Andi
>
Andi Kleen Aug. 24, 2021, 8:14 p.m. UTC | #12
On 8/24/2021 11:55 AM, Bjorn Helgaas wrote:
> [+cc Rajat; I still don't know what "shared memory with a hypervisor
> in a confidential guest" means,

A confidential guest is a guest which uses memory encryption to isolate 
itself from the host. It doesn't trust the host. But it still needs to 
communicate with the host for IO, so it has some special memory areas 
that are explicitly marked shared. These are used to do IO with the 
host. All their usage needs to be carefully hardened to avoid any 
security attacks on the guest, that's why we want to limit this 
interaction only to a small set of hardened drivers. For MMIO, the set 
is currently only virtio and MSI-X.

-Andi
Bjorn Helgaas Aug. 24, 2021, 8:31 p.m. UTC | #13
On Tue, Aug 24, 2021 at 01:14:02PM -0700, Andi Kleen wrote:
> 
> On 8/24/2021 11:55 AM, Bjorn Helgaas wrote:
> > [+cc Rajat; I still don't know what "shared memory with a hypervisor
> > in a confidential guest" means,
> 
> A confidential guest is a guest which uses memory encryption to isolate
> itself from the host. It doesn't trust the host. But it still needs to
> communicate with the host for IO, so it has some special memory areas that
> are explicitly marked shared. These are used to do IO with the host. All
> their usage needs to be carefully hardened to avoid any security attacks on
> the guest, that's why we want to limit this interaction only to a small set
> of hardened drivers. For MMIO, the set is currently only virtio and MSI-X.

Good material for the commit log next time around.  Thanks!

Bjorn
Andi Kleen Aug. 24, 2021, 8:50 p.m. UTC | #14
On 8/24/2021 1:31 PM, Bjorn Helgaas wrote:
> On Tue, Aug 24, 2021 at 01:14:02PM -0700, Andi Kleen wrote:
>> On 8/24/2021 11:55 AM, Bjorn Helgaas wrote:
>>> [+cc Rajat; I still don't know what "shared memory with a hypervisor
>>> in a confidential guest" means,
>> A confidential guest is a guest which uses memory encryption to isolate
>> itself from the host. It doesn't trust the host. But it still needs to
>> communicate with the host for IO, so it has some special memory areas that
>> are explicitly marked shared. These are used to do IO with the host. All
>> their usage needs to be carefully hardened to avoid any security attacks on
>> the guest, that's why we want to limit this interaction only to a small set
>> of hardened drivers. For MMIO, the set is currently only virtio and MSI-X.
> Good material for the commit log next time around.  Thanks!

This is all in the patch intro too, which should make it into the merge 
commits.

I don't think we can reexplain the basic concepts for every individual 
patch in a large patch kit.


-Andi
Dan Williams Aug. 24, 2021, 9:05 p.m. UTC | #15
On Tue, Aug 24, 2021 at 1:50 PM Andi Kleen <ak@linux.intel.com> wrote:
>
>
> On 8/24/2021 1:31 PM, Bjorn Helgaas wrote:
> > On Tue, Aug 24, 2021 at 01:14:02PM -0700, Andi Kleen wrote:
> >> On 8/24/2021 11:55 AM, Bjorn Helgaas wrote:
> >>> [+cc Rajat; I still don't know what "shared memory with a hypervisor
> >>> in a confidential guest" means,
> >> A confidential guest is a guest which uses memory encryption to isolate
> >> itself from the host. It doesn't trust the host. But it still needs to
> >> communicate with the host for IO, so it has some special memory areas that
> >> are explicitly marked shared. These are used to do IO with the host. All
> >> their usage needs to be carefully hardened to avoid any security attacks on
> >> the guest, that's why we want to limit this interaction only to a small set
> >> of hardened drivers. For MMIO, the set is currently only virtio and MSI-X.
> > Good material for the commit log next time around.  Thanks!
>
> This is all in the patch intro too, which should make it into the merge
> commits.
>
> I don't think we can reexplain the basic concepts for every individual
> patch in a large patch kit.

Maybe not the whole cover letter, but how about just a line in this
one that says "Recall that 'shared' in this context refers to memory
that lacks confidentiality and integrity protection from the VMM so
that it can communicate with the VM." Although I think
ioremap_noprotect() might be clearer than shared for the protected
guest use case?
Rajat Jain Aug. 24, 2021, 9:55 p.m. UTC | #16
Thanks a lot Bjorn for adding me!


On Tue, Aug 24, 2021 at 11:55 AM Bjorn Helgaas <helgaas@kernel.org> wrote:
>
> [+cc Rajat; I still don't know what "shared memory with a hypervisor
> in a confidential guest" means, but now we're talking about hardened
> drivers and allow lists, which Rajat is interested in]
>
> On Tue, Aug 24, 2021 at 10:20:44AM -0700, Andi Kleen wrote:
> >
> > > I see. Hmm. It's a bit of a random thing to do it at the map time
> > > though. E.g. DMA is all handled transparently behind the DMA API.
> > > Hardening is much more than just replacing map with map_shared
> > > and I suspect what you will end up with is basically
> > > vendors replacing map with map shared to make things work
> > > for their users and washing their hands.
> >
> > That concept exists too. There is a separate allow list for the drivers. So
> > just adding shared to a driver is not enough, until it's also added to the
> > allowlist
> >
> > Users can of course chose to disable the allowlist, but they need to
> > understand the security implications.

This is great. I'd be interested in looking at this allowlist
mechanism. Is this something in-kernel or in userspace? Is this
available upstream or are you maintaining this allowlist elsewhere?
(Background: https://lore.kernel.org/linux-pci/CACK8Z6E8pjVeC934oFgr=VB3pULx_GyT2NkzAogdRQJ9TKSX9A@mail.gmail.com/)

Short Summary: we also have our security team that audits drivers, and
we'd like to enable only audited drivers for the untrusted devices.
Currently, we're carrying this allowlist mechanism on our own since
the idea was Nack'ed by upstream. So if there is something available,
we'd like to use it too.

Thanks,

Rajat


> >
> > > I would say an explicit flag in the driver that says "hardened"
> > > and refusing to init a non hardened one would be better.
> >
> > We have that too (that's the device filtering)
> >
> > But the problem is that device filtering just stops the probe functions, not
> > the initcalls, and lot of legacy drivers do MMIO interactions before going
> > into probe. In some cases it's unavoidable because of the device doesn't
> > have a separate enumeration mechanism it needs some kind of probing to even
> > check for its existence And since we don't want to change all of them it's
> > far safer to make the ioremap opt-in.
> >
> >
> > -Andi
> >
Rajat Jain Aug. 24, 2021, 9:56 p.m. UTC | #17
On Mon, Aug 23, 2021 at 6:06 PM Dan Williams <dan.j.williams@intel.com> wrote:
>
> On Mon, Aug 23, 2021 at 5:31 PM Kuppuswamy, Sathyanarayanan
> <sathyanarayanan.kuppuswamy@linux.intel.com> wrote:
> >
> >
> >
> > On 8/23/21 4:56 PM, Michael S. Tsirkin wrote:
> > >> Add a new variant of pci_iomap for mapping all PCI resources
> > >> of a devices as shared memory with a hypervisor in a confidential
> > >> guest.
> > >>
> > >> Signed-off-by: Andi Kleen<ak@linux.intel.com>
> > >> Signed-off-by: Kuppuswamy Sathyanarayanan<sathyanarayanan.kuppuswamy@linux.intel.com>
> > > I'm a bit puzzled by this part. So why should the guest*not*  map
> > > pci memory as shared? And if the answer is never (as it seems to be)
> > > then why not just make regular pci_iomap DTRT?
> >
> > It is in the context of confidential guest (where VMM is un-trusted). So
> > we don't want to make all PCI resource as shared. It should be allowed
> > only for hardened drivers/devices.
>
> That's confusing, isn't device authorization what keeps unaudited
> drivers from loading against untrusted devices? I'm feeling like
> Michael that this should be a detail that drivers need not care about
> explicitly, in which case it does not need to be exported because the
> detail can be buried in lower levels.
>
> Note, I specifically said "unaudited", not "hardened" because as Greg
> mentioned the kernel must trust drivers, its devices that may not be
> trusted.

Can you please point me to the thread where this discussion with Greg
is ongoing?

Thanks,

Rajat
Dan Williams Aug. 24, 2021, 9:59 p.m. UTC | #18
On Tue, Aug 24, 2021 at 2:57 PM Rajat Jain <rajatja@google.com> wrote:
>
> On Mon, Aug 23, 2021 at 6:06 PM Dan Williams <dan.j.williams@intel.com> wrote:
> >
> > On Mon, Aug 23, 2021 at 5:31 PM Kuppuswamy, Sathyanarayanan
> > <sathyanarayanan.kuppuswamy@linux.intel.com> wrote:
> > >
> > >
> > >
> > > On 8/23/21 4:56 PM, Michael S. Tsirkin wrote:
> > > >> Add a new variant of pci_iomap for mapping all PCI resources
> > > >> of a devices as shared memory with a hypervisor in a confidential
> > > >> guest.
> > > >>
> > > >> Signed-off-by: Andi Kleen<ak@linux.intel.com>
> > > >> Signed-off-by: Kuppuswamy Sathyanarayanan<sathyanarayanan.kuppuswamy@linux.intel.com>
> > > > I'm a bit puzzled by this part. So why should the guest*not*  map
> > > > pci memory as shared? And if the answer is never (as it seems to be)
> > > > then why not just make regular pci_iomap DTRT?
> > >
> > > It is in the context of confidential guest (where VMM is un-trusted). So
> > > we don't want to make all PCI resource as shared. It should be allowed
> > > only for hardened drivers/devices.
> >
> > That's confusing, isn't device authorization what keeps unaudited
> > drivers from loading against untrusted devices? I'm feeling like
> > Michael that this should be a detail that drivers need not care about
> > explicitly, in which case it does not need to be exported because the
> > detail can be buried in lower levels.
> >
> > Note, I specifically said "unaudited", not "hardened" because as Greg
> > mentioned the kernel must trust drivers, its devices that may not be
> > trusted.
>
> Can you please point me to the thread where this discussion with Greg
> is ongoing?

It slowed down to implement the "move to the 'authorized' device
model" recommendation. LWN has a good writeup (as usual) and a link to
the thread:

https://lwn.net/Articles/865918/
Bjorn Helgaas Aug. 25, 2021, 2:52 p.m. UTC | #19
On Tue, Aug 24, 2021 at 01:50:00PM -0700, Andi Kleen wrote:
> 
> On 8/24/2021 1:31 PM, Bjorn Helgaas wrote:
> > On Tue, Aug 24, 2021 at 01:14:02PM -0700, Andi Kleen wrote:
> > > On 8/24/2021 11:55 AM, Bjorn Helgaas wrote:
> > > > [+cc Rajat; I still don't know what "shared memory with a hypervisor
> > > > in a confidential guest" means,
> > > A confidential guest is a guest which uses memory encryption to isolate
> > > itself from the host. It doesn't trust the host. But it still needs to
> > > communicate with the host for IO, so it has some special memory areas that
> > > are explicitly marked shared. These are used to do IO with the host. All
> > > their usage needs to be carefully hardened to avoid any security attacks on
> > > the guest, that's why we want to limit this interaction only to a small set
> > > of hardened drivers. For MMIO, the set is currently only virtio and MSI-X.
> > Good material for the commit log next time around.  Thanks!
> 
> This is all in the patch intro too, which should make it into the merge
> commits.

It's good if the cover letter makes into the merge commit log.

It's probably just because my git foo is lacking, but merge commit
logs don't seem as discoverable as the actual patch commit logs.  Five
years from now, if I want to learn about pci_iomap_shared() history, I
would "git log -p lib/pci_iomap.c" and search for it.  But I don't
think I would see the merge commit then.

Bjorn
Michael S. Tsirkin Aug. 29, 2021, 3:27 p.m. UTC | #20
On Tue, Aug 24, 2021 at 10:20:44AM -0700, Andi Kleen wrote:
> 
> > I see. Hmm. It's a bit of a random thing to do it at the map time
> > though. E.g. DMA is all handled transparently behind the DMA API.
> > Hardening is much more than just replacing map with map_shared
> > and I suspect what you will end up with is basically
> > vendors replacing map with map shared to make things work
> > for their users and washing their hands.
> 
> That concept exists too. There is a separate allow list for the drivers. So
> just adding shared to a driver is not enough, until it's also added to the
> allowlist
> 
> Users can of course chose to disable the allowlist, but they need to
> understand the security implications.

Right. So given that, why do we need to tweak a random API like the map?
If you just make all maps be shared then the user is in control.
Seems sensible to me.

> 
> > 
> > I would say an explicit flag in the driver that says "hardened"
> > and refusing to init a non hardened one would be better.
> 
> 
> We have that too (that's the device filtering)
> 
> But the problem is that device filtering just stops the probe functions, not
> the initcalls, and lot of legacy drivers do MMIO interactions before going
> into probe. In some cases it's unavoidable because of the device doesn't
> have a separate enumeration mechanism it needs some kind of probing to even
> check for its existence And since we don't want to change all of them it's
> far safer to make the ioremap opt-in.
> 
> 
> -Andi

Let's be frank, even without encryption disabling most drivers -
especially weird ones that poke at hardware before probe -
is far safer than keeping them, but one loses a bunch of features.
IOW all this hardening is nice but which security/feature tradeoff
to take it a policy decision, not something kernel should do
imho.
Michael S. Tsirkin Aug. 29, 2021, 3:34 p.m. UTC | #21
On Tue, Aug 24, 2021 at 10:04:26AM -0700, Andi Kleen wrote:
> 
> On 8/24/2021 12:07 AM, Christoph Hellwig wrote:
> > On Mon, Aug 23, 2021 at 05:30:54PM -0700, Kuppuswamy, Sathyanarayanan wrote:
> > > 
> > > On 8/23/21 4:56 PM, Michael S. Tsirkin wrote:
> > > > > Add a new variant of pci_iomap for mapping all PCI resources
> > > > > of a devices as shared memory with a hypervisor in a confidential
> > > > > guest.
> > > > > 
> > > > > Signed-off-by: Andi Kleen<ak@linux.intel.com>
> > > > > Signed-off-by: Kuppuswamy Sathyanarayanan<sathyanarayanan.kuppuswamy@linux.intel.com>
> > > > I'm a bit puzzled by this part. So why should the guest*not*  map
> > > > pci memory as shared? And if the answer is never (as it seems to be)
> > > > then why not just make regular pci_iomap DTRT?
> > > It is in the context of confidential guest (where VMM is un-trusted). So
> > > we don't want to make all PCI resource as shared. It should be allowed
> > > only for hardened drivers/devices.
> > Well, assuming the host can do any damage when mapped shared that also
> > means not mapping it shared will completely break the drivers.
> 
> There are several cases:
> 
> - We have driver filtering active to protect you against attacks from the
> host against unhardened drivers.
> 
> In this case the drivers not working is the intended behavior.
> 
> - There is an command allow list override for some new driver, but the
> driver is hardened and shared
> 
> The other drivers will still not work, but that's also the intended behavior
> 
> - Driver filtering is disabled or the allow list override is used to enable
> some non hardened/enabled driver
> 
> There is a command line option to override the ioremap sharing default, it
> will allow all drivers to do ioremap. We would really prefer to make it more
> finegrained, but it's not possible in this case. Other drivers are likely
> attackable.
> 
> - Driver filtering is disabled (allowing attacks on the drivers) and the
> command line option for forced sharing is set.
> 
> All drivers initialize and can talk to the host through MMIO. Lots of
> unhardened drivers are likely attackable.
> 
> -Andi

All this makes sense but ioremap is such a random place to declare
driver has been audited, and it's baked into the binary with no way for
userspace to set policy.

Again all we will end up with is gradual replacement of all ioremap
calls with ioremap_shared as people discover a given driver does not
work in a VM. How are you going to know driver has actually been
audited? what the quality of the audit was? did the people doing the
auditing understand what they are auditing for?  No way, right?
So IMHO, let it be for now.
Andi Kleen Aug. 29, 2021, 4:17 p.m. UTC | #22
> Let's be frank, even without encryption disabling most drivers -
> especially weird ones that poke at hardware before probe -
> is far safer than keeping them, but one loses a bunch of features.

Usually we don't lose features at all. None of the legacy drivers are 
needed on a guest (or even a modern native system). It's all just all 
for old hardware. Maybe in 20+ years it can be all removed, but we can't 
wait that long.

> IOW all this hardening is nice but which security/feature tradeoff
> to take it a policy decision, not something kernel should do
> imho.

There's no mechanism to push this kind of policy to user space. Users 
don't have control what initcalls run. At the time they execute there 
isn't even any user space yet.

Even if they could somehow control them it's very unlikely they would 
understand them and make an informed decision.

Doing it at build time is not feasible either since we want to run on 
standard distribution kernels.

For modules we have a policy mechanism (prevent udev probing by 
preventing enumeration), and that is implemented, but only handling 
modules is not enough. The compiled in drivers have to be handled too, 
otherwise you have gaping holes in the protection. We don't prevent 
users manually loading modules that might probe, but that is a policy 
decision that users actually sensibly make in user space.

Also I changing this single call really that bad? It's not that we 
changing anything drastic here, just give the low level subsystem a 
better hint about the intention. If you don't like the function name, 
could make it an argument instead?

-Andi



>
Andi Kleen Aug. 29, 2021, 4:43 p.m. UTC | #23
> All this makes sense but ioremap is such a random place to declare
> driver has been audited, and it's baked into the binary with no way for
> userspace to set policy.
>
> Again all we will end up with is gradual replacement of all ioremap
> calls with ioremap_shared as people discover a given driver does not
> work in a VM.

No the device filter will prevent that. They would need to submit 
patches to the central list.

Or they can override it at the command line, but there is already an 
option to force all ioremaps to be shared. So if you just want some 
driver to run without caring about security you can already do that 
without modifying it.

Besides the shared concept only makes sense for virtual devices, so if 
you don't have a device model for a device this will never happen either.

So I don't think your scenario of all ioremaps becoming shared will ever 
happen.


> How are you going to know driver has actually been
> audited? what the quality of the audit was? did the people doing the
> auditing understand what they are auditing for?  No way, right?

First the primary purpose of the ioremap policy is to avoid messing with 
all the legacy drivers (which is 99+% of the code base)

How to handle someone maliciously submitting a driver to the kernel is a 
completely different problem. To some degree there is trust of course. 
If someone says they audited and a maintainer trusts them with their 
statement, but they actually didn't there is a problem, but it's larger 
than just TDX. But in such a case the community code audit will also 
focus on such areas, and people interested in confidential computing 
security will also start taking a look.

And we're also working on fuzzing, so these drivers will be fuzzed at 
some point, so mistakes will be eventually found.

But in any case the ioremap policy is mainly to prevent having to handle 
this for all legacy drivers.

I would rather change the few drivers that are actually needed, than all 
the other drivers.

That's really the fundamental problem this is addressing: how to get 
reasonable security with all the legacy drivers out of the box without 
touching them.


-Andi
Michael S. Tsirkin Aug. 29, 2021, 10:26 p.m. UTC | #24
On Sun, Aug 29, 2021 at 09:17:53AM -0700, Andi Kleen wrote:
> Also I changing this single call really that bad? It's not that we changing
> anything drastic here, just give the low level subsystem a better hint about
> the intention. If you don't like the function name, could make it an
> argument instead?

My point however is that the API should say that the
driver has been audited, not that the mapping has been
done in some special way. For example the mapping can be
in some kind of wrapper, not directly in the driver.
However you want the driver validated, not the wrapper.

Here's an idea:



diff --git a/include/linux/audited.h b/include/linux/audited.h
new file mode 100644
index 000000000000..e23fd6ad50db
--- /dev/null
+++ b/include/linux/audited.h
@@ -0,0 +1,3 @@
+#ifndef AUDITED_MODULE
+#define AUDITED_MODULE
+#endif

Now any audited driver must do
#include <linux/audited.h>
first of all.
Implementation-wise it can do any number of things,
e.g. if you like then sure you can do:

#ifdef AUDITED_MODULE
#define pci_ioremap pci_ioremap_shared
#else
#define pci_ioremap pci_ioremap
#endif

but you can also thinkably do something like (won't work,
but just to give you the idea):

#ifdef AUDITED_MODULE
#define __init __init
#else
#define __init
#endif

or any number of hacks like this.
Andi Kleen Aug. 30, 2021, 5:11 a.m. UTC | #25
On 8/29/2021 3:26 PM, Michael S. Tsirkin wrote:
> On Sun, Aug 29, 2021 at 09:17:53AM -0700, Andi Kleen wrote:
>> Also I changing this single call really that bad? It's not that we changing
>> anything drastic here, just give the low level subsystem a better hint about
>> the intention. If you don't like the function name, could make it an
>> argument instead?
> My point however is that the API should say that the
> driver has been audited,

We have that status in the struct device. If you want to tie the ioremap 
to that we could define a ioremap_device() with a device argument and 
decide based on that.

Or we can add _audited to the name. ioremap_shared_audited?

> not that the mapping has been
> done in some special way. For example the mapping can be
> in some kind of wrapper, not directly in the driver.
> However you want the driver validated, not the wrapper.
>
> Here's an idea:


I don't think magic differences of API behavior based on some define are 
a good idea.  That's easy to miss.

That's a "COME FROM" in API design.

Also it wouldn't handle the case that a driver has both private and 
shared ioremaps, e.g. for BIOS structures.

And we've been avoiding that drivers can self declare auditing, we've 
been trying to have a separate centralized list so that it's easier to 
enforce and avoids any cut'n'paste mistakes.

-Andi
Michael S. Tsirkin Aug. 30, 2021, 8:59 p.m. UTC | #26
On Sun, Aug 29, 2021 at 10:11:46PM -0700, Andi Kleen wrote:
> 
> On 8/29/2021 3:26 PM, Michael S. Tsirkin wrote:
> > On Sun, Aug 29, 2021 at 09:17:53AM -0700, Andi Kleen wrote:
> > > Also I changing this single call really that bad? It's not that we changing
> > > anything drastic here, just give the low level subsystem a better hint about
> > > the intention. If you don't like the function name, could make it an
> > > argument instead?
> > My point however is that the API should say that the
> > driver has been audited,
> 
> We have that status in the struct device. If you want to tie the ioremap to
> that we could define a ioremap_device() with a device argument and decide
> based on that.

But it's not the device that is audited. And it's not the device
that might be secure or insecure. It's the driver.

> Or we can add _audited to the name. ioremap_shared_audited?

But it's not the mapping that has to be done in handled special way.
It's any data we get from device, not all of it coming from IO, e.g.
there's DMA and interrupts that all have to be validated.
Wouldn't you say that what is really wanted is just not running
unaudited drivers in the first place?

> 
> > not that the mapping has been
> > done in some special way. For example the mapping can be
> > in some kind of wrapper, not directly in the driver.
> > However you want the driver validated, not the wrapper.
> > 
> > Here's an idea:
> 
> 
> I don't think magic differences of API behavior based on some define are a
> good idea.  That's easy to miss.

Well ... my point is that actually there is no difference in API
behaviour. the map is the same map, exactly same data goes to device. If
anything any non-shared map is special in that encrypted data goes to
device.

> 
> That's a "COME FROM" in API design.
> 
> Also it wouldn't handle the case that a driver has both private and shared
> ioremaps, e.g. for BIOS structures.

Hmm. Interesting.  It's bios maps that are unusual and need to be private though ...

> And we've been avoiding that drivers can self declare auditing, we've been
> trying to have a separate centralized list so that it's easier to enforce
> and avoids any cut'n'paste mistakes.
> 
> -Andi

Now I'm confused. What is proposed here seems to be basically that,
drivers need to declare auditing by replacing ioremap with
ioremap_shared.
Andi Kleen Aug. 31, 2021, 12:23 a.m. UTC | #27
On 8/30/2021 1:59 PM, Michael S. Tsirkin wrote:
>
>> Or we can add _audited to the name. ioremap_shared_audited?
> But it's not the mapping that has to be done in handled special way.
> It's any data we get from device, not all of it coming from IO, e.g.
> there's DMA and interrupts that all have to be validated.
> Wouldn't you say that what is really wanted is just not running
> unaudited drivers in the first place?


Yes.


>
>> And we've been avoiding that drivers can self declare auditing, we've been
>> trying to have a separate centralized list so that it's easier to enforce
>> and avoids any cut'n'paste mistakes.
>>
>> -Andi
> Now I'm confused. What is proposed here seems to be basically that,
> drivers need to declare auditing by replacing ioremap with
> ioremap_shared.

Auditing is declared on the device model level using a central allow list.

But this cannot do anything to initcalls that run before probe, that's 
why an extra level of defense of ioremap opt-in is useful. But it's not 
the primary mechanism to declare a driver audited, that's the allow 
list. The ioremap is just another mechanism to avoid having to touch a 
lot of legacy drivers.

If we agree on that then the original proposed semantics of 
"ioremap_shared" may be acceptable?

-Andi
Michael S. Tsirkin Sept. 10, 2021, 9:54 a.m. UTC | #28
On Mon, Aug 30, 2021 at 05:23:17PM -0700, Andi Kleen wrote:
> 
> On 8/30/2021 1:59 PM, Michael S. Tsirkin wrote:
> > 
> > > Or we can add _audited to the name. ioremap_shared_audited?
> > But it's not the mapping that has to be done in handled special way.
> > It's any data we get from device, not all of it coming from IO, e.g.
> > there's DMA and interrupts that all have to be validated.
> > Wouldn't you say that what is really wanted is just not running
> > unaudited drivers in the first place?
> 
> 
> Yes.

Then ... let's do just that?

> 
> > 
> > > And we've been avoiding that drivers can self declare auditing, we've been
> > > trying to have a separate centralized list so that it's easier to enforce
> > > and avoids any cut'n'paste mistakes.
> > > 
> > > -Andi
> > Now I'm confused. What is proposed here seems to be basically that,
> > drivers need to declare auditing by replacing ioremap with
> > ioremap_shared.
> 
> Auditing is declared on the device model level using a central allow list.

Can we not have an init call allow list instead of, or in addition to, a
device allow list?

> But this cannot do anything to initcalls that run before probe,

Can't we extend module_init so init calls are validated against the
allow list?

> that's why
> an extra level of defense of ioremap opt-in is useful.

OK even assuming this, why is pci_iomap opt-in useful?
That never happens before probe - there's simply no pci_device then.

> But it's not the
> primary mechanism to declare a driver audited, that's the allow list. The
> ioremap is just another mechanism to avoid having to touch a lot of legacy
> drivers.
> 
> If we agree on that then the original proposed semantics of "ioremap_shared"
> may be acceptable?
> 
> -Andi
> 

It looks suspiciously like drivers self-declaring auditing to me which
we both seem to agree is undesirable. What exactly is the difference?

Or are you just trying to disable anything that runs before probe?
In that case I don't see a reason to touch pci drivers though.
These should be fine with just the device model list.
Andi Kleen Sept. 10, 2021, 4:34 p.m. UTC | #29
>>>> And we've been avoiding that drivers can self declare auditing, we've been
>>>> trying to have a separate centralized list so that it's easier to enforce
>>>> and avoids any cut'n'paste mistakes.
>>>>
>>>> -Andi
>>> Now I'm confused. What is proposed here seems to be basically that,
>>> drivers need to declare auditing by replacing ioremap with
>>> ioremap_shared.
>> Auditing is declared on the device model level using a central allow list.
> Can we not have an init call allow list instead of, or in addition to, a
> device allow list?


That would be quite complicated and intrusive. In fact I'm not even sure 
how to do maintain something like this. There are a lot of needed 
initcalls, they would all need to be marked. How can we distinguish 
them? It would be a giant auditing project. And of course how would you 
prevent it from bitrotting?


Basically it would be hundreds of changes all over the tree, just to 
avoid two changes in virtio and MSI. Approach of just stopping the 
initcalls from doing bad things is much less intrusive.

>
>> But this cannot do anything to initcalls that run before probe,
> Can't we extend module_init so init calls are validated against the
> allow list?

See above.


Also the problem isn't really with modules (we rely on udev not loading 
them), but with builtin initcalls


>
>> that's why
>> an extra level of defense of ioremap opt-in is useful.
> OK even assuming this, why is pci_iomap opt-in useful?
> That never happens before probe - there's simply no pci_device then.


Hmm, yes that's true. I guess we can make it default to opt-in for 
pci_iomap.

It only really matters for device less ioremaps.

>
> It looks suspiciously like drivers self-declaring auditing to me which
> we both seem to agree is undesirable. What exactly is the difference?


Just allow listing the ioremaps is not self declaration because the 
device will still not initialize due to the central device filter. If 
you want to use it that has to be changed.

It's just an additional safety net to contain code running before probe.


>
> Or are you just trying to disable anything that runs before probe?


Well anything that could do dangerous host interactions (like processing 
ioremap data) A lot of things are harmless and can be allowed, or 
already blocked elsewhere (e.g. we have a IO port filter). This just 
handles the ioremap/MMIO case.

> In that case I don't see a reason to touch pci drivers though.
> These should be fine with just the device model list.


That won't stop initcalls.


-Andi


>
Michael S. Tsirkin Sept. 11, 2021, 11:54 p.m. UTC | #30
On Fri, Sep 10, 2021 at 09:34:45AM -0700, Andi Kleen wrote:
> > > that's why
> > > an extra level of defense of ioremap opt-in is useful.
> > OK even assuming this, why is pci_iomap opt-in useful?
> > That never happens before probe - there's simply no pci_device then.
> 
> 
> Hmm, yes that's true. I guess we can make it default to opt-in for
> pci_iomap.
> 
> It only really matters for device less ioremaps.

OK. And same thing for other things with device, such as
devm_platform_ioremap_resource.
If we agree on all that, this will basically remove virtio
changes from the picture ;)
Michael S. Tsirkin Sept. 13, 2021, 5:53 a.m. UTC | #31
On Sat, Sep 11, 2021 at 07:54:43PM -0400, Michael S. Tsirkin wrote:
> On Fri, Sep 10, 2021 at 09:34:45AM -0700, Andi Kleen wrote:
> > > > that's why
> > > > an extra level of defense of ioremap opt-in is useful.
> > > OK even assuming this, why is pci_iomap opt-in useful?
> > > That never happens before probe - there's simply no pci_device then.
> > 
> > 
> > Hmm, yes that's true. I guess we can make it default to opt-in for
> > pci_iomap.
> > 
> > It only really matters for device less ioremaps.
> 
> OK. And same thing for other things with device, such as
> devm_platform_ioremap_resource.
> If we agree on all that, this will basically remove virtio
> changes from the picture ;)


Something else that was pointed out to me:

         fs->window_kaddr = devm_memremap_pages(&vdev->dev, pgmap);
         if (IS_ERR(fs->window_kaddr))
                 return PTR_ERR(fs->window_kaddr);


looks like if we forget to set the shared flag then it will
corrupt the DAX data?


> -- 
> MST
>
Andi Kleen Sept. 24, 2021, 10:43 p.m. UTC | #32
>> Hmm, yes that's true. I guess we can make it default to opt-in for
>> pci_iomap.
>>
>> It only really matters for device less ioremaps.
> OK. And same thing for other things with device, such as
> devm_platform_ioremap_resource.
> If we agree on all that, this will basically remove virtio
> changes from the picture ;)

Hi we revisited this now. One problem with removing the ioremap opt-in 
is that it's still possible for drivers to get at devices without going 
through probe. For example they can walk the PCI device list. Some 
drivers do that for various reasons. So if we remove the opt-in we would 
need to audit and possibly fix all that, which would be potentially a 
lot of churn. That's why I think it's better to keep the opt-in.


-Andi
Michael S. Tsirkin Sept. 27, 2021, 9:07 a.m. UTC | #33
On Fri, Sep 24, 2021 at 03:43:40PM -0700, Andi Kleen wrote:
> 
> > > Hmm, yes that's true. I guess we can make it default to opt-in for
> > > pci_iomap.
> > > 
> > > It only really matters for device less ioremaps.
> > OK. And same thing for other things with device, such as
> > devm_platform_ioremap_resource.
> > If we agree on all that, this will basically remove virtio
> > changes from the picture ;)
> 
> Hi we revisited this now. One problem with removing the ioremap opt-in is
> that it's still possible for drivers to get at devices without going through
> probe. For example they can walk the PCI device list. Some drivers do that
> for various reasons. So if we remove the opt-in we would need to audit and
> possibly fix all that, which would be potentially a lot of churn. That's why
> I think it's better to keep the opt-in.
> 
> 
> -Andi
> 

I've been thinking about why this still feels wrong to me.

Here's what I came up with: at some point someone will want one of these
modules (poking at devices in the initcall) in the encrypted
environment, and will change ioremap to ioremap_shared.
At that point the allowlist will be broken again, and
by that time it will be set in stone and too late to fix.

Isn't the problem that what is actually audited is modules,
but you are trying to add devices to allow list?
So why not have modules/initcalls in the allowlist then?
For built-in modules, we already have initcall_blacklisted, right?
This could be an extension ... no?
diff mbox series

Patch

diff --git a/include/asm-generic/pci_iomap.h b/include/asm-generic/pci_iomap.h
index d4f16dcc2ed7..0178ddd7ad88 100644
--- a/include/asm-generic/pci_iomap.h
+++ b/include/asm-generic/pci_iomap.h
@@ -18,6 +18,12 @@  extern void __iomem *pci_iomap_range(struct pci_dev *dev, int bar,
 extern void __iomem *pci_iomap_wc_range(struct pci_dev *dev, int bar,
 					unsigned long offset,
 					unsigned long maxlen);
+extern void __iomem *pci_iomap_shared(struct pci_dev *dev, int bar,
+				      unsigned long max);
+extern void __iomem *pci_iomap_shared_range(struct pci_dev *dev, int bar,
+					    unsigned long offset,
+					    unsigned long maxlen);
+
 /* Create a virtual mapping cookie for a port on a given PCI device.
  * Do not call this directly, it exists to make it easier for architectures
  * to override */
diff --git a/lib/pci_iomap.c b/lib/pci_iomap.c
index 6251c3f651c6..b04e8689eab3 100644
--- a/lib/pci_iomap.c
+++ b/lib/pci_iomap.c
@@ -25,6 +25,11 @@  static void __iomem *map_ioremap_wc(phys_addr_t addr, size_t size)
 	return ioremap_wc(addr, size);
 }
 
+static void __iomem *map_ioremap_shared(phys_addr_t addr, size_t size)
+{
+	return ioremap_shared(addr, size);
+}
+
 static void __iomem *pci_iomap_range_map(struct pci_dev *dev,
 					 int bar,
 					 unsigned long offset,
@@ -101,6 +106,47 @@  void __iomem *pci_iomap_wc_range(struct pci_dev *dev,
 }
 EXPORT_SYMBOL_GPL(pci_iomap_wc_range);
 
+/**
+ * pci_iomap_shared_range - create a virtual shared mapping cookie for a
+ *                          PCI BAR
+ * @dev: PCI device that owns the BAR
+ * @bar: BAR number
+ * @offset: map memory at the given offset in BAR
+ * @maxlen: max length of the memory to map
+ *
+ * Remap a pci device's resources shared in a confidential guest.
+ * For more details see pci_iomap_range's documentation.
+ *
+ * @maxlen specifies the maximum length to map. To get access to
+ * the complete BAR from offset to the end, pass %0 here.
+ */
+void __iomem *pci_iomap_shared_range(struct pci_dev *dev, int bar,
+				     unsigned long offset, unsigned long maxlen)
+{
+	return pci_iomap_range_map(dev, bar, offset, maxlen,
+				   map_ioremap_shared);
+}
+EXPORT_SYMBOL_GPL(pci_iomap_shared_range);
+
+/**
+ * pci_iomap_shared - create a virtual shared mapping cookie for a PCI BAR
+ * @dev: PCI device that owns the BAR
+ * @bar: BAR number
+ * @maxlen: length of the memory to map
+ *
+ * See pci_iomap for details. This function creates a shared mapping
+ * with the host for confidential hosts.
+ *
+ * @maxlen specifies the maximum length to map. To get access to the
+ * complete BAR without checking for its length first, pass %0 here.
+ */
+void __iomem *pci_iomap_shared(struct pci_dev *dev, int bar,
+			       unsigned long maxlen)
+{
+	return pci_iomap_shared_range(dev, bar, 0, maxlen);
+}
+EXPORT_SYMBOL_GPL(pci_iomap_shared);
+
 /**
  * pci_iomap - create a virtual mapping cookie for a PCI BAR
  * @dev: PCI device that owns the BAR