diff mbox series

[v3,09/23] hw/uefi: add var-service-core.c

Message ID 20250211092324.965440-10-kraxel@redhat.com (mailing list archive)
State New
Headers show
Series hw/uefi: add uefi variable service | expand

Commit Message

Gerd Hoffmann Feb. 11, 2025, 9:23 a.m. UTC
This is the core code for guest <-> host communication.  This accepts
request messages from the guest, dispatches them to the service called,
and sends back the response message.

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
---
 hw/uefi/var-service-core.c | 237 +++++++++++++++++++++++++++++++++++++
 1 file changed, 237 insertions(+)
 create mode 100644 hw/uefi/var-service-core.c

Comments

Alexander Graf Feb. 11, 2025, 9:45 a.m. UTC | #1
On 11.02.25 10:23, Gerd Hoffmann wrote:
> This is the core code for guest <-> host communication.  This accepts
> request messages from the guest, dispatches them to the service called,
> and sends back the response message.
>
> Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
> ---
>   hw/uefi/var-service-core.c | 237 +++++++++++++++++++++++++++++++++++++
>   1 file changed, 237 insertions(+)
>   create mode 100644 hw/uefi/var-service-core.c
>
> diff --git a/hw/uefi/var-service-core.c b/hw/uefi/var-service-core.c
> new file mode 100644
> index 000000000000..78a668e68fa2
> --- /dev/null
> +++ b/hw/uefi/var-service-core.c
> @@ -0,0 +1,237 @@
> +/*
> + * SPDX-License-Identifier: GPL-2.0-or-later
> + *
> + * uefi vars device
> + */
> +#include "qemu/osdep.h"
> +#include "system/dma.h"
> +#include "migration/vmstate.h"
> +
> +#include "hw/uefi/var-service.h"
> +#include "hw/uefi/var-service-api.h"
> +#include "hw/uefi/var-service-edk2.h"
> +
> +#include "trace/trace-hw_uefi.h"
> +
> +static int uefi_vars_pre_load(void *opaque)
> +{
> +    uefi_vars_state *uv = opaque;
> +
> +    uefi_vars_clear_all(uv);
> +    uefi_vars_policies_clear(uv);
> +    g_free(uv->buffer);
> +    return 0;
> +}
> +
> +static int uefi_vars_post_load(void *opaque, int version_id)
> +{
> +    uefi_vars_state *uv = opaque;
> +
> +    uefi_vars_update_storage(uv);
> +    uv->buffer = g_malloc(uv->buf_size);
> +    return 0;
> +}
> +
> +const VMStateDescription vmstate_uefi_vars = {
> +    .name = "uefi-vars",
> +    .pre_load = uefi_vars_pre_load,
> +    .post_load = uefi_vars_post_load,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_UINT16(sts, uefi_vars_state),
> +        VMSTATE_UINT32(buf_size, uefi_vars_state),
> +        VMSTATE_UINT32(buf_addr_lo, uefi_vars_state),
> +        VMSTATE_UINT32(buf_addr_hi, uefi_vars_state),
> +        VMSTATE_BOOL(end_of_dxe, uefi_vars_state),
> +        VMSTATE_BOOL(ready_to_boot, uefi_vars_state),
> +        VMSTATE_BOOL(exit_boot_service, uefi_vars_state),
> +        VMSTATE_BOOL(policy_locked, uefi_vars_state),
> +        VMSTATE_UINT64(used_storage, uefi_vars_state),
> +        VMSTATE_QTAILQ_V(variables, uefi_vars_state, 0,
> +                         vmstate_uefi_variable, uefi_variable, next),
> +        VMSTATE_QTAILQ_V(var_policies, uefi_vars_state, 0,
> +                         vmstate_uefi_var_policy, uefi_var_policy, next),
> +        VMSTATE_END_OF_LIST()
> +    },
> +};
> +
> +static uint32_t uefi_vars_cmd_mm(uefi_vars_state *uv)
> +{
> +    hwaddr    dma;
> +    mm_header *mhdr;
> +    uint64_t  size;
> +    uint32_t  retval;
> +
> +    dma = uv->buf_addr_lo | ((hwaddr)uv->buf_addr_hi << 32);
> +    mhdr = (mm_header *) uv->buffer;
> +
> +    if (!uv->buffer || uv->buf_size < sizeof(*mhdr)) {
> +        return UEFI_VARS_STS_ERR_BAD_BUFFER_SIZE;
> +    }
> +
> +    /* read header */
> +    dma_memory_read(&address_space_memory, dma,
> +                    uv->buffer, sizeof(*mhdr),
> +                    MEMTXATTRS_UNSPECIFIED);


Depending on DMA sounds appealing at first, but can fall apart in corner 
cases. I know of 2 cases where DMA failed for me in the EC2 equivalent 
of this:

1) SEV-SNP. If you want the hypervisor to implement UEFI variable 
services for you, the buffer region must always be in shared state. 
Ensuring that during boot time is tricky but doable. At runtime you no 
longer really have control over the sharability of pages.

2) Mac OS X. MacOS is the only OS I'm aware of that really makes use of 
relocation. They move your physical pages to random locations, give you 
a non-1:1 mapping to that and once you're in real OS land, you have no 
more knowledge at all about the physical location of anything. Maybe you 
can work around that by declaring the buffer region as MMIO space? But 
then it really should be a memory region in the device.


To address the 2 cases above, I ended up implementing a special "PIO 
mode" which does not rely on DMA at all:

https://github.com/aws/uefi/blob/main/edk2-stable202211/0023-edk2-stable202211-ExtVarStore-Add-support-for-PIO-transfer.patch

Also, I'm surprised you cut the variable service off at the SMM boundary 
instead of the RTS callback boundary. Why is that cleaner/better than 
implementing variables completely in QEMU? In the EC2 version, we just 
built a separate variable store implementation that completely replaces 
the edk2 variable store:

https://github.com/aws/uefi/blob/main/edk2-stable202211/0012-edk2-stable202211-nitro-Add-ExtVarStore-for-vmm-based-variable-storage.patch

It would be nice to agree on a single external variable store 
implementation :).


Alex
Gerd Hoffmann Feb. 12, 2025, 10:24 a.m. UTC | #2
Hi,

> > +    /* read header */
> > +    dma_memory_read(&address_space_memory, dma,
> > +                    uv->buffer, sizeof(*mhdr),
> > +                    MEMTXATTRS_UNSPECIFIED);
> 
> Depending on DMA sounds appealing at first, but can fall apart in corner
> cases. I know of 2 cases where DMA failed for me in the EC2 equivalent of
> this:
> 
> 1) SEV-SNP. If you want the hypervisor to implement UEFI variable services
> for you, the buffer region must always be in shared state. Ensuring that
> during boot time is tricky but doable. At runtime you no longer really have
> control over the sharability of pages.

With SEV-SNP I don't see the point in using this.

Why do you use confidential computing in the first place if you trust
the host with your EFI variables?  I'd rather see something similar
running under guest control, in svsm context.

> 2) Mac OS X. MacOS is the only OS I'm aware of that really makes use of
> relocation. They move your physical pages to random locations, give you a
> non-1:1 mapping to that and once you're in real OS land, you have no more
> knowledge at all about the physical location of anything.

On the host side you have no insight into this indeed.

The firmware knows all this very well though.  The OS passes a mapping
table to the firmware, efi runtime drivers can subscribe to mapping
updates and can use RT->ConvertPointer to translate addresses from
physical to virtual.

The edk2 code (https://github.com/tianocore/edk2/pull/10695) does
exactly that.

I see your driver does that too, so in theory it should work just fine.
I'm wondering what exactly the problem with macOS is?

> Also, I'm surprised you cut the variable service off at the SMM boundary
> instead of the RTS callback boundary. Why is that cleaner/better than
> implementing variables completely in QEMU?

Well, the variable service /is/ completely in qemu.  See patch #6 which
implements getvariable & friends.  edk2 serializes the variable calls
into a buffer and sends them over to the SMM side (or to qemu with the
patches).

I didn't feel like inventing a new serialization protocol if we already
have a proven one in the edk2 code base.  Also it is possible to send
over more than just the variable call.  There is a variable policy
protocol implementation (patch #8), and we also get some events
forwarded.  More can easily be added should the need for that arise.

> It would be nice to agree on a single external variable store implementation
> :).

It would be nice to have nitro support merged upstream,
especially with BYOF coming.

take care,
  Gerd
Alexander Graf Feb. 12, 2025, 11:30 a.m. UTC | #3
On 12.02.25 11:24, Gerd Hoffmann wrote:
>    Hi,
>
>>> +    /* read header */
>>> +    dma_memory_read(&address_space_memory, dma,
>>> +                    uv->buffer, sizeof(*mhdr),
>>> +                    MEMTXATTRS_UNSPECIFIED);
>> Depending on DMA sounds appealing at first, but can fall apart in corner
>> cases. I know of 2 cases where DMA failed for me in the EC2 equivalent of
>> this:
>>
>> 1) SEV-SNP. If you want the hypervisor to implement UEFI variable services
>> for you, the buffer region must always be in shared state. Ensuring that
>> during boot time is tricky but doable. At runtime you no longer really have
>> control over the sharability of pages.
> With SEV-SNP I don't see the point in using this.
>
> Why do you use confidential computing in the first place if you trust
> the host with your EFI variables?  I'd rather see something similar
> running under guest control, in svsm context.


That depends heavily on your threat model. You can use a host provided 
variable store to gain variable persistence for things like boot 
variables and then have an ephemeral SVSM based TPM that you use to 
measure the loaded payloads. A malicious host can already replace your 
root volume, so extending the threat to variables is not the end of the 
world.


>
>> 2) Mac OS X. MacOS is the only OS I'm aware of that really makes use of
>> relocation. They move your physical pages to random locations, give you a
>> non-1:1 mapping to that and once you're in real OS land, you have no more
>> knowledge at all about the physical location of anything.
> On the host side you have no insight into this indeed.
>
> The firmware knows all this very well though.  The OS passes a mapping
> table to the firmware, efi runtime drivers can subscribe to mapping
> updates and can use RT->ConvertPointer to translate addresses from
> physical to virtual.
>
> The edk2 code (https://github.com/tianocore/edk2/pull/10695) does
> exactly that.
>
> I see your driver does that too, so in theory it should work just fine.
> I'm wondering what exactly the problem with macOS is?


You get to know the new virtual address, but ConvertPointer never tells 
you what the new *physical* address is. That means you have no idea 
where to DMA from once you're in virtual land. Most OSs just keep a 1:1 
map of virtual to physical, but MacOS does not.


>> Also, I'm surprised you cut the variable service off at the SMM boundary
>> instead of the RTS callback boundary. Why is that cleaner/better than
>> implementing variables completely in QEMU?
> Well, the variable service /is/ completely in qemu.  See patch #6 which
> implements getvariable & friends.  edk2 serializes the variable calls
> into a buffer and sends them over to the SMM side (or to qemu with the
> patches).
>
> I didn't feel like inventing a new serialization protocol if we already
> have a proven one in the edk2 code base.  Also it is possible to send
> over more than just the variable call.  There is a variable policy
> protocol implementation (patch #8), and we also get some events
> forwarded.  More can easily be added should the need for that arise.
>
>> It would be nice to agree on a single external variable store implementation
>> :).
> It would be nice to have nitro support merged upstream,
> especially with BYOF coming.


Yes. Or converge on this protocol instead to simplify the firmware 
implementation so we don't create needless work if someone wants to do 
an actually trivial (and reusable?) UEFI firmware for BYOF.


Alex
Gerd Hoffmann Feb. 12, 2025, 12:28 p.m. UTC | #4
On Wed, Feb 12, 2025 at 12:30:20PM +0100, Alexander Graf wrote:
> 
> On 12.02.25 11:24, Gerd Hoffmann wrote:
> > 
> > Why do you use confidential computing in the first place if you trust
> > the host with your EFI variables?  I'd rather see something similar
> > running under guest control, in svsm context.
> 
> That depends heavily on your threat model. You can use a host provided
> variable store to gain variable persistence for things like boot variables
> and then have an ephemeral SVSM based TPM that you use to measure the loaded
> payloads. A malicious host can already replace your root volume, so
> extending the threat to variables is not the end of the world.

If you go depend on measured boot instead of secure boot then yes, this
might be a workable model.  Should be doable with a small svsm driver
which forwards requests to the host via svsm-controlled bounce buffer
(where the svsm has control over page properties).

> > The firmware knows all this very well though.  The OS passes a mapping
> > table to the firmware, efi runtime drivers can subscribe to mapping
> > updates and can use RT->ConvertPointer to translate addresses from
> > physical to virtual.
> > 
> > The edk2 code (https://github.com/tianocore/edk2/pull/10695) does
> > exactly that.
> > 
> > I see your driver does that too, so in theory it should work just fine.
> > I'm wondering what exactly the problem with macOS is?
> 
> You get to know the new virtual address, but ConvertPointer never tells you
> what the new *physical* address is. That means you have no idea where to DMA
> from once you're in virtual land.

Yes.  Knowing both physical and virtual address works only for memory
you allocated yourself before ExitBootServices.  So you can't pass on
pointers from the OS, you have to copy the data to a buffer where you
know the physical address instead.  Yes, some overhead.  Should still
be much faster than going to pio transfer mode ...

> > It would be nice to have nitro support merged upstream,
> > especially with BYOF coming.
> 
> Yes. Or converge on this protocol instead to simplify the firmware
> implementation

Yes, that works too and would reduce your stack of unmerged patches a
bit.

take care,
  Gerd
Alexander Graf Feb. 12, 2025, 1:45 p.m. UTC | #5
On 12.02.25 13:28, Gerd Hoffmann wrote:
> On Wed, Feb 12, 2025 at 12:30:20PM +0100, Alexander Graf wrote:
>> On 12.02.25 11:24, Gerd Hoffmann wrote:
>>> Why do you use confidential computing in the first place if you trust
>>> the host with your EFI variables?  I'd rather see something similar
>>> running under guest control, in svsm context.
>> That depends heavily on your threat model. You can use a host provided
>> variable store to gain variable persistence for things like boot variables
>> and then have an ephemeral SVSM based TPM that you use to measure the loaded
>> payloads. A malicious host can already replace your root volume, so
>> extending the threat to variables is not the end of the world.
> If you go depend on measured boot instead of secure boot then yes, this
> might be a workable model.  Should be doable with a small svsm driver
> which forwards requests to the host via svsm-controlled bounce buffer
> (where the svsm has control over page properties).


In a BYOF world it's even useful without SVSM at all, because the launch 
digest performs measurement for you, but you still want to find your 
boot variables.


>>> The firmware knows all this very well though.  The OS passes a mapping
>>> table to the firmware, efi runtime drivers can subscribe to mapping
>>> updates and can use RT->ConvertPointer to translate addresses from
>>> physical to virtual.
>>>
>>> The edk2 code (https://github.com/tianocore/edk2/pull/10695) does
>>> exactly that.
>>>
>>> I see your driver does that too, so in theory it should work just fine.
>>> I'm wondering what exactly the problem with macOS is?
>> You get to know the new virtual address, but ConvertPointer never tells you
>> what the new *physical* address is. That means you have no idea where to DMA
>> from once you're in virtual land.
> Yes.  Knowing both physical and virtual address works only for memory
> you allocated yourself before ExitBootServices.  So you can't pass on
> pointers from the OS, you have to copy the data to a buffer where you
> know the physical address instead.  Yes, some overhead.  Should still
> be much faster than going to pio transfer mode ...


MacOS takes over the full physical address map past ExitBootServices: 
Your code no longer has VA access to random code and it literally 
memcpy()'s all preserved (virtual available) code and data to different 
physical addresses. You simply have nothing that is all of 1) RAM 
(mapped as cacheable on ARM), 2) known VA 3) known PA.

So we really really need a fallback mechanism that works without DMA :).


Alex
Gerd Hoffmann Feb. 12, 2025, 3:18 p.m. UTC | #6
Hi,

> > Yes.  Knowing both physical and virtual address works only for memory
> > you allocated yourself before ExitBootServices.  So you can't pass on
> > pointers from the OS, you have to copy the data to a buffer where you
> > know the physical address instead.  Yes, some overhead.  Should still
> > be much faster than going to pio transfer mode ...
> 
> MacOS takes over the full physical address map past ExitBootServices: Your
> code no longer has VA access to random code

That is totally fine.  EFI drivers must register everything they need as
runtime memory.  Anything else can be unmapped by the OS when calling
EFI services.

> and it literally memcpy()'s all preserved (virtual available) code and
> data to different physical addresses.

Uhm.  I have my doubts this copying behavior is blessed by the UEFI spec.

> You simply have nothing that is all of 1) RAM (mapped as cacheable on
> ARM), 2) known VA 3) known PA.

Bummer.

> So we really really need a fallback mechanism that works without DMA
> :).

On arm it should be relatively simple to move the buffer to device
memory.  Just place one more region on the platform bus, advertise
address + size via device tree, done.

Not sure how to do that best on x86 though.  Find 64k unused address
space over ioapic?  Do we have enough free space there?  And how
future-proof would that be?

take care,
  Gerd
Alexander Graf Feb. 12, 2025, 9:26 p.m. UTC | #7
On 12.02.25 16:18, Gerd Hoffmann wrote:
>    Hi,
>
>>> Yes.  Knowing both physical and virtual address works only for memory
>>> you allocated yourself before ExitBootServices.  So you can't pass on
>>> pointers from the OS, you have to copy the data to a buffer where you
>>> know the physical address instead.  Yes, some overhead.  Should still
>>> be much faster than going to pio transfer mode ...
>> MacOS takes over the full physical address map past ExitBootServices: Your
>> code no longer has VA access to random code
> That is totally fine.  EFI drivers must register everything they need as
> runtime memory.  Anything else can be unmapped by the OS when calling
> EFI services.
>
>> and it literally memcpy()'s all preserved (virtual available) code and
>> data to different physical addresses.
> Uhm.  I have my doubts this copying behavior is blessed by the UEFI spec.


I don't remember anything in the spec prohibiting it.


>> You simply have nothing that is all of 1) RAM (mapped as cacheable on
>> ARM), 2) known VA 3) known PA.
> Bummer.
>
>> So we really really need a fallback mechanism that works without DMA
>> :).
> On arm it should be relatively simple to move the buffer to device
> memory.  Just place one more region on the platform bus, advertise
> address + size via device tree, done.


That will bring back all issues with cached vs non-cached memory 
accesses, no? So edk2 will always access that memory as device memory 
which means it bypasses the cache, while QEMU will access it through the 
cache. So that buffer would need to actually be MMIO memory I suppose?


> Not sure how to do that best on x86 though.  Find 64k unused address
> space over ioapic?  Do we have enough free space there?  And how
> future-proof would that be?


I'm not worried yet about where we place that memory, but more about 
ensuring that we actually have a working path to access it. We can 
always find space in the PCI hole, as long as we properly advertise it 
to all stakeholders via ACPI and memory map.


Alex
Ard Biesheuvel Feb. 13, 2025, 9:28 a.m. UTC | #8
On Wed, 12 Feb 2025 at 22:26, Alexander Graf <graf@amazon.com> wrote:
>
>
> On 12.02.25 16:18, Gerd Hoffmann wrote:
> >    Hi,
> >
> >>> Yes.  Knowing both physical and virtual address works only for memory
> >>> you allocated yourself before ExitBootServices.  So you can't pass on
> >>> pointers from the OS, you have to copy the data to a buffer where you
> >>> know the physical address instead.  Yes, some overhead.  Should still
> >>> be much faster than going to pio transfer mode ...
> >> MacOS takes over the full physical address map past ExitBootServices: Your
> >> code no longer has VA access to random code
> > That is totally fine.  EFI drivers must register everything they need as
> > runtime memory.  Anything else can be unmapped by the OS when calling
> > EFI services.
> >
> >> and it literally memcpy()'s all preserved (virtual available) code and
> >> data to different physical addresses.
> > Uhm.  I have my doubts this copying behavior is blessed by the UEFI spec.
>
>
> I don't remember anything in the spec prohibiting it.
>

The UEFI spec clearly states that runtime services must either be
called using a 1:1 mapping, or via a virtual remapping but in that
case, SetVirtualAddressMap() must be called to inform the firmware of
the new virtual mapping.

Even if this is not clearly stated, this violates the intent of the
UEFI spec: the code reasons about mappings of physical memory,
implying that the mapping is the only thing that changes. Moving
memory contents around can only be done safely after
SetVirtualAddressMap(), making it mandatory on these systems, whereas
the spec clearly states that it is entirely optional.

But whatever OSX does on x86 is irrelevant anyway: it is vertically
integrated with the firmware, which is vaguely EFI based but does not
aim for spec compliance. The OSX EULA does not permit running it on
anything other than Apple hardware. And x86 Apple hardware will be
reaching obsolescence pretty soon, at least where future development
is concerned.

My colleague filed a USWG proposal for a EFI_MEMORY_SHARED attribute
that must be honored by the OS when creating runtime mappings, and map
the region in a way that allows access by another observer (typically
the VMM but semantically it could mean other things too)

>
> >> You simply have nothing that is all of 1) RAM (mapped as cacheable on
> >> ARM), 2) known VA 3) known PA.
> > Bummer.
> >
> >> So we really really need a fallback mechanism that works without DMA
> >> :).
> > On arm it should be relatively simple to move the buffer to device
> > memory.  Just place one more region on the platform bus, advertise
> > address + size via device tree, done.
>
>
> That will bring back all issues with cached vs non-cached memory
> accesses, no? So edk2 will always access that memory as device memory
> which means it bypasses the cache, while QEMU will access it through the
> cache. So that buffer would need to actually be MMIO memory I suppose?
>

Indeed. Presenting memory as MMIO just to trick the guest into mapping
it shared is not the right approach here, which is why we need
EFI_MEMORY_SHARED on ARM. On x86, using the EfiMemoryMappedIo type
happens to work, but it is a hack (e.g., you cannot allocate memory of
this type)
Gerd Hoffmann Feb. 13, 2025, 9:52 a.m. UTC | #9
Hi,
 
> That will bring back all issues with cached vs non-cached memory accesses,
> no? So edk2 will always access that memory as device memory which means it
> bypasses the cache, while QEMU will access it through the cache. So that
> buffer would need to actually be MMIO memory I suppose?

I don't think so.  The firmware driver knows this actually is normal ram
and can setup mappings and memory attributes accordingly.  The situation
is a bit different from vga memory bars which are handled by pci bus
management which doesn't know anything about virtualization specifics.

Well, unless macos thinks it knows everything better and goes setup
uncached mappings ...

> > Not sure how to do that best on x86 though.  Find 64k unused address
> > space over ioapic?  Do we have enough free space there?  And how
> > future-proof would that be?
> 
> I'm not worried yet about where we place that memory, but more about
> ensuring that we actually have a working path to access it. We can always
> find space in the PCI hole, as long as we properly advertise it to all
> stakeholders via ACPI and memory map.

Well, the host can't easily place stuff in the pci hole because the
guest will manage that (map pci bars etc).  But making the pci hole
a bit smaller to make room is an option I think.

take care,
  Gerd
Alexander Graf Feb. 13, 2025, 10:06 a.m. UTC | #10
On 13.02.25 10:28, Ard Biesheuvel wrote:
> On Wed, 12 Feb 2025 at 22:26, Alexander Graf<graf@amazon.com> wrote:
>>
>> On 12.02.25 16:18, Gerd Hoffmann wrote:
>>>     Hi,
>>>
>>>>> Yes.  Knowing both physical and virtual address works only for memory
>>>>> you allocated yourself before ExitBootServices.  So you can't pass on
>>>>> pointers from the OS, you have to copy the data to a buffer where you
>>>>> know the physical address instead.  Yes, some overhead.  Should still
>>>>> be much faster than going to pio transfer mode ...
>>>> MacOS takes over the full physical address map past ExitBootServices: Your
>>>> code no longer has VA access to random code
>>> That is totally fine.  EFI drivers must register everything they need as
>>> runtime memory.  Anything else can be unmapped by the OS when calling
>>> EFI services.
>>>
>>>> and it literally memcpy()'s all preserved (virtual available) code and
>>>> data to different physical addresses.
>>> Uhm.  I have my doubts this copying behavior is blessed by the UEFI spec.
>>
>> I don't remember anything in the spec prohibiting it.
>>
> The UEFI spec clearly states that runtime services must either be
> called using a 1:1 mapping, or via a virtual remapping but in that
> case, SetVirtualAddresMap() must be called to inform the firmware of
> the new virtual mapping.


Correct, which is what boot.efi does. That mapping only tells firmware 
about the change from old VA -> new VA though.


> Even if this is not clearly stated, this violates the intent of the
> UEFI spec: the code reasons about mappings of physical memory,
> implying that the mapping is the only thing that changes. Moving
> memory contents around can only be done safely after
> SetVirtualAddressMap(), making it mandatory on these systems, whereas
> the spec clearly states that it is entirely optional.


The spec says that calling SetVirtualAddressMap is optional from the 
firmware's point of view. A boot loader may call it - and even depend on 
its functionality. The spec even goes further and almost endorses the 
case boot.efi does:

> The call to SetVirtualAddressMap() must be done with the physical 
> mappings. On successful return from this function, the system must 
> then make any future calls with the newly assigned virtual mappings. 
> All address space mappings must be done in accordance to the 
> cacheability flags as specified in the original address map. [...]


You can absolutely read that paragraph as "From here on, only virtual 
mappings matter".


> But whatever OSX does on x86 is irrelevant anyway: it is vertically
> integrated with the firmware, which is vaguely EFI based but does not
> aim for spec compliance. The OSX EULA does not permit running it on
> anything other than Apple hardware. And x86 Apple hardware will be
> reaching obsolescence pretty soon, at least where future development
> is concerned.


I halfway agree. It's fairly trivial to make x86 Mac OS X work with 
stock edk2 [1]. All you need are a few boot services, a non-standard way 
(or BootXXX variables) to find the real boot binary and APFS driver 
(binary is included in every APFS FS) load support. The rest is a fairly 
standard compliant UEFI environment.

But I'm not sure how that's relevant to the argument that we need a way 
to perform runtime service calls without relying on DMA? In addition to 
the potential VA/PA mismatch issue, relying on DMA from RTS can quickly 
also get broken for any confidential compute environments where you need 
to take explicit action to make memory visible to the hypervisor.

All I'm asking for is an (optional) viable path that works without DMA :).


> My colleague filed a USWG proposal for a EFI_MEMORY_SHARED attribute
> that must be honored by the OS when creating runtime mappings, and map
> the region in a way that allows access by another observer (typically
> the VMM but semantically it could mean other things too)


For something as core as a UEFI variable service that should become the 
default in a generic platform like QEMU, I don't think we should rely on 
new additions to the UEFI spec :(. Users want to be able to run old 
Operating Systems.


>
>>>> You simply have nothing that is all of 1) RAM (mapped as cacheable on
>>>> ARM), 2) known VA 3) known PA.
>>> Bummer.
>>>
>>>> So we really really need a fallback mechanism that works without DMA
>>>> :).
>>> On arm it should be relatively simple to move the buffer to device
>>> memory.  Just place one more region on the platform bus, advertise
>>> address + size via device tree, done.
>>
>> That will bring back all issues with cached vs non-cached memory
>> accesses, no? So edk2 will always access that memory as device memory
>> which means it bypasses the cache, while QEMU will access it through the
>> cache. So that buffer would need to actually be MMIO memory I suppose?
>>
> Indeed. Presenting memory as MMIO just to trick the guest into mapping
> it shared is not the right approach here, which is why we need
> EFI_MEMORY_SHARED on ARM. On x86, using the EfiMemoryMappedIo type
> happens to work, but it is a hack (e.g., you cannot allocate memory of
> this type)


The non-hacky alternative would be to expose a real 64KiB large MMIO 
window into host memory that goes through full instruction emulation for 
every access. Then we could expose it as real MMIO target memory to RTS 
which should always work. DMA can then be an (optional) optimization on 
top to copy to/from that buffer.


Alex


[1] 
https://github.com/tianocore/edk2/compare/vUDK2018...agraf:edk2:vUDK2018-AppleSupportPkg
Alexander Graf Feb. 13, 2025, 10:14 a.m. UTC | #11
On 13.02.25 10:52, Gerd Hoffmann wrote:
>    Hi,
>
>> That will bring back all issues with cached vs non-cached memory accesses,
>> no? So edk2 will always access that memory as device memory which means it
>> bypasses the cache, while QEMU will access it through the cache. So that
>> buffer would need to actually be MMIO memory I suppose?
> I don't think so.  The firmware driver knows this actually is normal ram
> and can setup mappings and memory attributes accordingly.  The situation
> is a bit different from vga memory bars which are handled by pci bus
> management which doesn't know anything about virtualization specifics.
>
> Well, unless macos thinks it knows everything better and goes setup
> uncached mappings ...


It's not only macOS. After SetVirtualAddressMap, the OS owns the virtual 
address space of Runtime Services. So in theory it also owns 
cacheability attributes of all mappings.

>
>>> Not sure how to do that best on x86 though.  Find 64k unused address
>>> space over ioapic?  Do we have enough free space there?  And how
>>> future-proof would that be?
>> I'm not worried yet about where we place that memory, but more about
>> ensuring that we actually have a working path to access it. We can always
>> find space in the PCI hole, as long as we properly advertise it to all
>> stakeholders via ACPI and memory map.
> Well, the host can't easily place stuff in the pci hole because the
> guest will manage that (map pci bars etc).  But making the pci hole
> a bit smaller to make room is an option I think.


Yes, IIRC we advertise where the hole is. I'm sure we can find a spot. 
Somewhere next to the HPET :).


Alex
Gerd Hoffmann Feb. 13, 2025, 2:54 p.m. UTC | #12
On Thu, Feb 13, 2025 at 11:14:03AM +0100, Alexander Graf wrote:
> 
> > I don't think so.  The firmware driver knows this actually is normal ram
> > and can setup mappings and memory attributes accordingly.  The situation
> > is a bit different from vga memory bars which are handled by pci bus
> > management which doesn't know anything about virtualization specifics.
> > 
> > Well, unless macos thinks it knows everything better and goes setup
> > uncached mappings ...
> 
> It's not only macOS. After SetVirtualAddressMap, the OS owns the virtual
> address space of Runtime Services. So in theory it also owns cacheability
> attributes of all mappings.

Hmm.  Played around with the device memory approach a bit today.  Looks
workable for both arm/sysbus and x86/isa.  Problem is, if that does
leave any unsolved corner cases on the table it doesn't buy us much, and
the arm caching issues start to make me a bit nervous ...

So, maybe allowing pio data transfers is the better approach after all.

How do your patches pick the transfer mode?  Is that dictated by the
host?  Or is the guest free to choose?  In case of the latter:  How does
the guest decide what to do?

> Yes, IIRC we advertise where the hole is. I'm sure we can find a spot.
> Somewhere next to the HPET :).

0xfef1000 seems to be free, which is kinda fun because of the 'ef1' in the
address.

take care,
  Gerd
Alexander Graf Feb. 13, 2025, 10:25 p.m. UTC | #13
On 13.02.25 15:54, Gerd Hoffmann wrote:
> On Thu, Feb 13, 2025 at 11:14:03AM +0100, Alexander Graf wrote:
>>> I don't think so.  The firmware driver knows this actually is normal ram
>>> and can setup mappings and memory attributes accordingly.  The situation
>>> is a bit different from vga memory bars which are handled by pci bus
>>> management which doesn't know anything about virtualization specifics.
>>>
>>> Well, unless macos thinks it knows everything better and goes setup
>>> uncached mappings ...
>> It's not only macOS. After SetVirtualAddressMap, the OS owns the virtual
>> address space of Runtime Services. So in theory it also owns cacheability
>> attributes of all mappings.
> Hmm.  Played around with the device memory approach a bit today.  Looks
> workable for both arm/sysbus and x86/isa.  Problem is, if that does
> leave any unsolved corner cases on the table it doesn't buy us much, and
> the arm caching issues start to make me a bit nervous ...
>
> So, maybe allowing pio data transfers is the better approach after all.
>
> How do your patches pick the transfer mode?  Is that dictated by the
> host?  Or is the guest free to choose?  In case of the latter:  How does
> the guest decide what to do?


In our version, the guest gets to pick. It defaults to the DMA interface 
unless it detects that it's running either the macOS logic (a case you 
can ignore for now) or is running with SEV-SNP.

I think for the upstream interface, it would be best to have the host 
indicate which one it recommends the guest to use. That way you can 
force the fallback path without requiring tedious edk2 changes.


>
>> Yes, IIRC we advertise where the hole is. I'm sure we can find a spot.
>> Somewhere next to the HPET :).
> 0xfef1000 seems to be free, which is kida fun b/c of the 'ef1' in the
> address.


True, I love it! :)

It's not enough address space to fit the full 64k buffer though, right? 
Would all of 0xfef00000 be free by chance? Then you could just direct 
map the transfer buffer there.


Alex
Gerd Hoffmann Feb. 14, 2025, 7:55 a.m. UTC | #14
Hi,

> > How do your patches pick the transfer mode?  Is that dictated by the
> > host?  Or is the guest free to choose?  In case of the latter:  How does
> > the guest decide what to do?
> 
> In our version, the guest gets to pick. It defaults to the DMA interface
> unless it detects that it's running either the macOS logic (a case you can
> ignore for now) or is running with SEV-SNP.
> 
> I think for the upstream interface, it would be best to have the host
> indicate which one it recommends the guest to use. That way you can force
> the fallback path without requiring tedious edk2 changes.

I'm more thinking about a hard switch, i.e. the host would support only
the one or the other.  That way we'll need less register space,
because we'll need either the buffer location register (dma mode) or the
pio transfer register (pio mode) but not both at the same time so they
can share the location.

> > 0xfef1000 seems to be free, which is kida fun b/c of the 'ef1' in the
> > address.
> 
> True, I love it! :)
> 
> It's not enough address space to fit the full 64k buffer though, right?
> Would all of 0xfef00000 be free by chance? Then you could just direct map
> the transfer buffer there.

All of 0xfef00000 is 1M, i.e. 16 x 64k.

take care,
  Gerd
Alexander Graf Feb. 14, 2025, 9:51 a.m. UTC | #15
On 14.02.25 08:55, Gerd Hoffmann wrote:
>    Hi,
>
>>> How do your patches pick the transfer mode?  Is that dictated by the
>>> host?  Or is the guest free to choose?  In case of the latter:  How does
>>> the guest decide what to do?
>> In our version, the guest gets to pick. It defaults to the DMA interface
>> unless it detects that it's running either the macOS logic (a case you can
>> ignore for now) or is running with SEV-SNP.
>>
>> I think for the upstream interface, it would be best to have the host
>> indicate which one it recommends the guest to use. That way you can force
>> the fallback path without requiring tedious edk2 changes.
> I'm more thinking about a hard switch, i.e. the host would support only
> the one or the other.  That way we'll go need less register space,
> because we'll need either the buffer location register (dma mode) or the
> pio transfer register (pio mode) but not both at the same time so they
> can share the location.


The nice thing about supporting both in the hypervisor and advertising 
preference is that a minimal guest firmware could choose to only support 
the safe one. Given the simplicity of the DMA protocol, it's not a hill 
I will die on though :).

I also like to have dedicated register spaces per component. So even if 
you choose to make it a hard split, I think we're better off with 4k at 
0xfef10000 for control and 64k at 0xfef20000 for the buffer for example. 
Even in the buffer case, you need some control registers. And you want 
to leave the door open to making the buffer space be a direct RAM map, 
which means you want to make it a page granule of the largest typical 
page size (64k).


>
>>> 0xfef1000 seems to be free, which is kida fun b/c of the 'ef1' in the
>>> address.
>> True, I love it! :)
>>
>> It's not enough address space to fit the full 64k buffer though, right?
>> Would all of 0xfef00000 be free by chance? Then you could just direct map
>> the transfer buffer there.
> All of 0xfef00000 is 1M, i.e. 16 x 64k.


Oh, there was a missing 0! So the target space is really 64k right now 
(0xfef10000).


Alex
Gerd Hoffmann Feb. 14, 2025, 11:16 a.m. UTC | #16
On Fri, Feb 14, 2025 at 10:51:17AM +0100, Alexander Graf wrote:

> I also like to have dedicated register spaces per component. So even if you
> choose to make it a hard split, I think we're better off with 4k at
> 0xfef10000 for control and 64k at 0xfef20000 for the buffer for example.

Well, if we go for PIO transfer mode instead of device memory we don't
need to map the buffer any more.

The control registers for the x86 variant are in io address space right
now (0x520, next to fw_cfg).  We could place them in a mmio page @
0xfef10000 instead.  Any preference, and if so, why?

First sketch below, on top of this series.  No edk2 counterpart yet, so
untested beyond compiling the code.

take care,
  Gerd

--------------------- cut here ------------------------
diff --git a/include/hw/uefi/var-service-api.h b/include/hw/uefi/var-service-api.h
index 6765925d9ed0..99911e904652 100644
--- a/include/hw/uefi/var-service-api.h
+++ b/include/hw/uefi/var-service-api.h
@@ -21,16 +21,21 @@
 #define UEFI_VARS_REG_MAGIC                  0x00  /* 16 bit */
 #define UEFI_VARS_REG_CMD_STS                0x02  /* 16 bit */
 #define UEFI_VARS_REG_BUFFER_SIZE            0x04  /* 32 bit */
-#define UEFI_VARS_REG_BUFFER_ADDR_LO         0x08  /* 32 bit */
-#define UEFI_VARS_REG_BUFFER_ADDR_HI         0x0c  /* 32 bit */
-#define UEFI_VARS_REGS_SIZE                  0x10
+#define UEFI_VARS_REG_DMA_BUFFER_ADDR_LO     0x08  /* 32 bit */
+#define UEFI_VARS_REG_DMA_BUFFER_ADDR_HI     0x0c  /* 32 bit */
+#define UEFI_VARS_REG_PIO_BUFFER_TRANSFER    0x10  /* 8-64 bit */
+#define UEFI_VARS_REG_PIO_BUFFER_CRC32C      0x18  /* 32 bit (read-only) */
+#define UEFI_VARS_REG_RESERVED               0x1c  /* 32 bit */
+#define UEFI_VARS_REGS_SIZE                  0x20
 
 /* magic value */
 #define UEFI_VARS_MAGIC_VALUE               0xef1
 
 /* command values */
 #define UEFI_VARS_CMD_RESET                  0x01
-#define UEFI_VARS_CMD_MM                     0x02
+#define UEFI_VARS_CMD_DMA_MM                 0x02
+#define UEFI_VARS_CMD_PIO_MM                 0x03
+#define UEFI_VARS_CMD_PIO_ZERO_OFFSET        0x04
 
 /* status values */
 #define UEFI_VARS_STS_SUCCESS                0x00
diff --git a/include/hw/uefi/var-service.h b/include/hw/uefi/var-service.h
index e078d2b0e68f..7dbede659a8f 100644
--- a/include/hw/uefi/var-service.h
+++ b/include/hw/uefi/var-service.h
@@ -56,6 +56,10 @@ struct uefi_vars_state {
     QTAILQ_HEAD(, uefi_variable)      variables;
     QTAILQ_HEAD(, uefi_var_policy)    var_policies;
 
+    /* pio transfer buffer */
+    uint32_t                          pio_xfer_offset;
+    uint8_t                           *pio_xfer_buffer;
+
     /* boot phases */
     bool                              end_of_dxe;
     bool                              ready_to_boot;
diff --git a/hw/uefi/var-service-core.c b/hw/uefi/var-service-core.c
index 78a668e68fa2..a96b66934769 100644
--- a/hw/uefi/var-service-core.c
+++ b/hw/uefi/var-service-core.c
@@ -4,6 +4,7 @@
  * uefi vars device
  */
 #include "qemu/osdep.h"
+#include "qemu/crc32c.h"
 #include "system/dma.h"
 #include "migration/vmstate.h"
 
@@ -41,6 +42,7 @@ const VMStateDescription vmstate_uefi_vars = {
         VMSTATE_UINT32(buf_size, uefi_vars_state),
         VMSTATE_UINT32(buf_addr_lo, uefi_vars_state),
         VMSTATE_UINT32(buf_addr_hi, uefi_vars_state),
+        /* TODO: pio xfer offset + buffer */
         VMSTATE_BOOL(end_of_dxe, uefi_vars_state),
         VMSTATE_BOOL(ready_to_boot, uefi_vars_state),
         VMSTATE_BOOL(exit_boot_service, uefi_vars_state),
@@ -54,7 +56,7 @@ const VMStateDescription vmstate_uefi_vars = {
     },
 };
 
-static uint32_t uefi_vars_cmd_mm(uefi_vars_state *uv)
+static uint32_t uefi_vars_cmd_mm(uefi_vars_state *uv, bool dma_mode)
 {
     hwaddr    dma;
     mm_header *mhdr;
@@ -69,9 +71,13 @@ static uint32_t uefi_vars_cmd_mm(uefi_vars_state *uv)
     }
 
     /* read header */
-    dma_memory_read(&address_space_memory, dma,
-                    uv->buffer, sizeof(*mhdr),
-                    MEMTXATTRS_UNSPECIFIED);
+    if (dma_mode) {
+        dma_memory_read(&address_space_memory, dma,
+                        uv->buffer, sizeof(*mhdr),
+                        MEMTXATTRS_UNSPECIFIED);
+    } else {
+        memcpy(uv->buffer, uv->pio_xfer_buffer, sizeof(*mhdr));
+    }
 
     if (uadd64_overflow(sizeof(*mhdr), mhdr->length, &size)) {
         return UEFI_VARS_STS_ERR_BAD_BUFFER_SIZE;
@@ -81,9 +87,15 @@ static uint32_t uefi_vars_cmd_mm(uefi_vars_state *uv)
     }
 
     /* read buffer (excl header) */
-    dma_memory_read(&address_space_memory, dma + sizeof(*mhdr),
-                    uv->buffer + sizeof(*mhdr), mhdr->length,
-                    MEMTXATTRS_UNSPECIFIED);
+    if (dma_mode) {
+        dma_memory_read(&address_space_memory, dma + sizeof(*mhdr),
+                        uv->buffer + sizeof(*mhdr), mhdr->length,
+                        MEMTXATTRS_UNSPECIFIED);
+    } else {
+        memcpy(uv->buffer + sizeof(*mhdr),
+               uv->pio_xfer_buffer + sizeof(*mhdr),
+               mhdr->length);
+    }
     memset(uv->buffer + size, 0, uv->buf_size - size);
 
     /* dispatch */
@@ -113,9 +125,15 @@ static uint32_t uefi_vars_cmd_mm(uefi_vars_state *uv)
     }
 
     /* write buffer */
-    dma_memory_write(&address_space_memory, dma,
-                     uv->buffer, sizeof(*mhdr) + mhdr->length,
-                     MEMTXATTRS_UNSPECIFIED);
+    if (dma_mode) {
+        dma_memory_write(&address_space_memory, dma,
+                         uv->buffer, sizeof(*mhdr) + mhdr->length,
+                         MEMTXATTRS_UNSPECIFIED);
+    } else {
+        memcpy(uv->pio_xfer_buffer + sizeof(*mhdr),
+               uv->buffer + sizeof(*mhdr),
+               sizeof(*mhdr) + mhdr->length);
+    }
 
     return retval;
 }
@@ -150,8 +168,13 @@ static uint32_t uefi_vars_cmd(uefi_vars_state *uv, uint32_t cmd)
     case UEFI_VARS_CMD_RESET:
         uefi_vars_soft_reset(uv);
         return UEFI_VARS_STS_SUCCESS;
-    case UEFI_VARS_CMD_MM:
-        return uefi_vars_cmd_mm(uv);
+    case UEFI_VARS_CMD_DMA_MM:
+        return uefi_vars_cmd_mm(uv, true);
+    case UEFI_VARS_CMD_PIO_MM:
+        return uefi_vars_cmd_mm(uv, false);
+    case UEFI_VARS_CMD_PIO_ZERO_OFFSET:
+        uv->pio_xfer_offset = 0;
+        return UEFI_VARS_STS_SUCCESS;
     default:
         return UEFI_VARS_STS_ERR_NOT_SUPPORTED;
     }
@@ -161,6 +184,7 @@ static uint64_t uefi_vars_read(void *opaque, hwaddr addr, unsigned size)
 {
     uefi_vars_state *uv = opaque;
     uint64_t retval = -1;
+    void *xfer_ptr;
 
     trace_uefi_reg_read(addr, size);
 
@@ -174,12 +198,37 @@ static uint64_t uefi_vars_read(void *opaque, hwaddr addr, unsigned size)
     case UEFI_VARS_REG_BUFFER_SIZE:
         retval = uv->buf_size;
         break;
-    case UEFI_VARS_REG_BUFFER_ADDR_LO:
+    case UEFI_VARS_REG_DMA_BUFFER_ADDR_LO:
         retval = uv->buf_addr_lo;
         break;
-    case UEFI_VARS_REG_BUFFER_ADDR_HI:
+    case UEFI_VARS_REG_DMA_BUFFER_ADDR_HI:
         retval = uv->buf_addr_hi;
         break;
+    case UEFI_VARS_REG_PIO_BUFFER_TRANSFER:
+        if (uv->pio_xfer_offset + size > uv->buf_size) {
+            retval = 0;
+            break;
+        }
+        xfer_ptr = uv->pio_xfer_buffer + uv->pio_xfer_offset;
+        switch (size) {
+        case 1:
+            retval = *(uint8_t *)xfer_ptr;
+            break;
+        case 2:
+            retval = *(uint16_t *)xfer_ptr;
+            break;
+        case 4:
+            retval = *(uint32_t *)xfer_ptr;
+            break;
+        case 8:
+            retval = *(uint64_t *)xfer_ptr;
+            break;
+        }
+        uv->pio_xfer_offset += size;
+        break;
+    case UEFI_VARS_REG_PIO_BUFFER_CRC32C:
+        retval = crc32c(0xffffffff, uv->pio_xfer_buffer, uv->pio_xfer_offset);
+        break;
     }
     return retval;
 }
@@ -187,6 +236,7 @@ static uint64_t uefi_vars_read(void *opaque, hwaddr addr, unsigned size)
 static void uefi_vars_write(void *opaque, hwaddr addr, uint64_t val, unsigned size)
 {
     uefi_vars_state *uv = opaque;
+    void *xfer_ptr;
 
     trace_uefi_reg_write(addr, val, size);
 
@@ -200,14 +250,40 @@ static void uefi_vars_write(void *opaque, hwaddr addr, uint64_t val, unsigned si
         }
         uv->buf_size = val;
         g_free(uv->buffer);
+        g_free(uv->pio_xfer_buffer);
         uv->buffer = g_malloc(uv->buf_size);
+        uv->pio_xfer_buffer = g_malloc(uv->buf_size);
         break;
-    case UEFI_VARS_REG_BUFFER_ADDR_LO:
+    case UEFI_VARS_REG_DMA_BUFFER_ADDR_LO:
         uv->buf_addr_lo = val;
         break;
-    case UEFI_VARS_REG_BUFFER_ADDR_HI:
+    case UEFI_VARS_REG_DMA_BUFFER_ADDR_HI:
         uv->buf_addr_hi = val;
         break;
+    case UEFI_VARS_REG_PIO_BUFFER_TRANSFER:
+        if (uv->pio_xfer_offset + size > uv->buf_size) {
+            break;
+        }
+        xfer_ptr = uv->pio_xfer_buffer + uv->pio_xfer_offset;
+        switch (size) {
+        case 1:
+            *(uint8_t *)xfer_ptr = val;
+            break;
+        case 2:
+            *(uint16_t *)xfer_ptr = val;
+            break;
+        case 4:
+            *(uint32_t *)xfer_ptr = val;
+            break;
+        case 8:
+            *(uint64_t *)xfer_ptr = val;
+            break;
+        }
+        uv->pio_xfer_offset += size;
+        break;
+    case UEFI_VARS_REG_PIO_BUFFER_CRC32C:
+    default:
+        break;
     }
 }
Alexander Graf Feb. 14, 2025, 12:22 p.m. UTC | #17
On 14.02.25 12:16, Gerd Hoffmann wrote:
> On Fri, Feb 14, 2025 at 10:51:17AM +0100, Alexander Graf wrote:
>
>> I also like to have dedicated register spaces per component. So even if you
>> choose to make it a hard split, I think we're better off with 4k at
>> 0xfef10000 for control and 64k at 0xfef20000 for the buffer for example.
> Well, if we go for PIO transfer mode instead of device memory we don't
> need map the buffer any more.
>
> The control registers for the x86 variant are in io address space right
> now (0x520, next to fw_cfg).  We could place them in a mmio page @
> 0xfef10000 instead.  Any preference, and if so, why?


I made the same mistake in my version and use PIO for x86 but MMIO for 
ARM. In hindsight, I think the same mechanism for both would have 
simplified things a lot: You get better testing coverage of the exact 
same code path. If you split between PIO and MMIO, you always have 
issues that only pop up in one of the implementations. It complicates 
your test matrix for little gain.

Since you need an MMIO avenue anyway, you may as well always use that. 
This is not a high performance interface where the exit latency 
difference between PIO and MMIO really matters.


Alex
diff mbox series

Patch

diff --git a/hw/uefi/var-service-core.c b/hw/uefi/var-service-core.c
new file mode 100644
index 000000000000..78a668e68fa2
--- /dev/null
+++ b/hw/uefi/var-service-core.c
@@ -0,0 +1,237 @@ 
+/*
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * uefi vars device
+ */
+#include "qemu/osdep.h"
+#include "system/dma.h"
+#include "migration/vmstate.h"
+
+#include "hw/uefi/var-service.h"
+#include "hw/uefi/var-service-api.h"
+#include "hw/uefi/var-service-edk2.h"
+
+#include "trace/trace-hw_uefi.h"
+
+static int uefi_vars_pre_load(void *opaque)
+{
+    uefi_vars_state *uv = opaque;
+
+    uefi_vars_clear_all(uv);
+    uefi_vars_policies_clear(uv);
+    g_free(uv->buffer);
+    return 0;
+}
+
+static int uefi_vars_post_load(void *opaque, int version_id)
+{
+    uefi_vars_state *uv = opaque;
+
+    uefi_vars_update_storage(uv);
+    uv->buffer = g_malloc(uv->buf_size);
+    return 0;
+}
+
+const VMStateDescription vmstate_uefi_vars = {
+    .name = "uefi-vars",
+    .pre_load = uefi_vars_pre_load,
+    .post_load = uefi_vars_post_load,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT16(sts, uefi_vars_state),
+        VMSTATE_UINT32(buf_size, uefi_vars_state),
+        VMSTATE_UINT32(buf_addr_lo, uefi_vars_state),
+        VMSTATE_UINT32(buf_addr_hi, uefi_vars_state),
+        VMSTATE_BOOL(end_of_dxe, uefi_vars_state),
+        VMSTATE_BOOL(ready_to_boot, uefi_vars_state),
+        VMSTATE_BOOL(exit_boot_service, uefi_vars_state),
+        VMSTATE_BOOL(policy_locked, uefi_vars_state),
+        VMSTATE_UINT64(used_storage, uefi_vars_state),
+        VMSTATE_QTAILQ_V(variables, uefi_vars_state, 0,
+                         vmstate_uefi_variable, uefi_variable, next),
+        VMSTATE_QTAILQ_V(var_policies, uefi_vars_state, 0,
+                         vmstate_uefi_var_policy, uefi_var_policy, next),
+        VMSTATE_END_OF_LIST()
+    },
+};
+
+static uint32_t uefi_vars_cmd_mm(uefi_vars_state *uv)
+{
+    hwaddr    dma;
+    mm_header *mhdr;
+    uint64_t  size;
+    uint32_t  retval;
+
+    dma = uv->buf_addr_lo | ((hwaddr)uv->buf_addr_hi << 32);
+    mhdr = (mm_header *) uv->buffer;
+
+    if (!uv->buffer || uv->buf_size < sizeof(*mhdr)) {
+        return UEFI_VARS_STS_ERR_BAD_BUFFER_SIZE;
+    }
+
+    /* read header */
+    dma_memory_read(&address_space_memory, dma,
+                    uv->buffer, sizeof(*mhdr),
+                    MEMTXATTRS_UNSPECIFIED);
+
+    if (uadd64_overflow(sizeof(*mhdr), mhdr->length, &size)) {
+        return UEFI_VARS_STS_ERR_BAD_BUFFER_SIZE;
+    }
+    if (uv->buf_size < size) {
+        return UEFI_VARS_STS_ERR_BAD_BUFFER_SIZE;
+    }
+
+    /* read buffer (excl header) */
+    dma_memory_read(&address_space_memory, dma + sizeof(*mhdr),
+                    uv->buffer + sizeof(*mhdr), mhdr->length,
+                    MEMTXATTRS_UNSPECIFIED);
+    memset(uv->buffer + size, 0, uv->buf_size - size);
+
+    /* dispatch */
+    if (qemu_uuid_is_equal(&mhdr->guid, &EfiSmmVariableProtocolGuid)) {
+        retval = uefi_vars_mm_vars_proto(uv);
+
+    } else if (qemu_uuid_is_equal(&mhdr->guid, &VarCheckPolicyLibMmiHandlerGuid)) {
+        retval = uefi_vars_mm_check_policy_proto(uv);
+
+    } else if (qemu_uuid_is_equal(&mhdr->guid, &EfiEndOfDxeEventGroupGuid)) {
+        trace_uefi_event("end-of-dxe");
+        uv->end_of_dxe = true;
+        retval = UEFI_VARS_STS_SUCCESS;
+
+    } else if (qemu_uuid_is_equal(&mhdr->guid, &EfiEventReadyToBootGuid)) {
+        trace_uefi_event("ready-to-boot");
+        uv->ready_to_boot = true;
+        retval = UEFI_VARS_STS_SUCCESS;
+
+    } else if (qemu_uuid_is_equal(&mhdr->guid, &EfiEventExitBootServicesGuid)) {
+        trace_uefi_event("exit-boot-service");
+        uv->exit_boot_service = true;
+        retval = UEFI_VARS_STS_SUCCESS;
+
+    } else {
+        retval = UEFI_VARS_STS_ERR_NOT_SUPPORTED;
+    }
+
+    /* write buffer */
+    dma_memory_write(&address_space_memory, dma,
+                     uv->buffer, sizeof(*mhdr) + mhdr->length,
+                     MEMTXATTRS_UNSPECIFIED);
+
+    return retval;
+}
+
+static void uefi_vars_soft_reset(uefi_vars_state *uv)
+{
+    g_free(uv->buffer);
+    uv->buffer = NULL;
+    uv->buf_size = 0;
+    uv->buf_addr_lo = 0;
+    uv->buf_addr_hi = 0;
+}
+
+void uefi_vars_hard_reset(uefi_vars_state *uv)
+{
+    trace_uefi_hard_reset();
+    uefi_vars_soft_reset(uv);
+
+    uv->end_of_dxe        = false;
+    uv->ready_to_boot     = false;
+    uv->exit_boot_service = false;
+    uv->policy_locked     = false;
+
+    uefi_vars_clear_volatile(uv);
+    uefi_vars_policies_clear(uv);
+    uefi_vars_auth_init(uv);
+}
+
+static uint32_t uefi_vars_cmd(uefi_vars_state *uv, uint32_t cmd)
+{
+    switch (cmd) {
+    case UEFI_VARS_CMD_RESET:
+        uefi_vars_soft_reset(uv);
+        return UEFI_VARS_STS_SUCCESS;
+    case UEFI_VARS_CMD_MM:
+        return uefi_vars_cmd_mm(uv);
+    default:
+        return UEFI_VARS_STS_ERR_NOT_SUPPORTED;
+    }
+}
+
+static uint64_t uefi_vars_read(void *opaque, hwaddr addr, unsigned size)
+{
+    uefi_vars_state *uv = opaque;
+    uint64_t retval = -1;
+
+    trace_uefi_reg_read(addr, size);
+
+    switch (addr) {
+    case UEFI_VARS_REG_MAGIC:
+        retval = UEFI_VARS_MAGIC_VALUE;
+        break;
+    case UEFI_VARS_REG_CMD_STS:
+        retval = uv->sts;
+        break;
+    case UEFI_VARS_REG_BUFFER_SIZE:
+        retval = uv->buf_size;
+        break;
+    case UEFI_VARS_REG_BUFFER_ADDR_LO:
+        retval = uv->buf_addr_lo;
+        break;
+    case UEFI_VARS_REG_BUFFER_ADDR_HI:
+        retval = uv->buf_addr_hi;
+        break;
+    }
+    return retval;
+}
+
+static void uefi_vars_write(void *opaque, hwaddr addr, uint64_t val, unsigned size)
+{
+    uefi_vars_state *uv = opaque;
+
+    trace_uefi_reg_write(addr, val, size);
+
+    switch (addr) {
+    case UEFI_VARS_REG_CMD_STS:
+        uv->sts = uefi_vars_cmd(uv, val);
+        break;
+    case UEFI_VARS_REG_BUFFER_SIZE:
+        if (val > MAX_BUFFER_SIZE) {
+            val = MAX_BUFFER_SIZE;
+        }
+        uv->buf_size = val;
+        g_free(uv->buffer);
+        uv->buffer = g_malloc(uv->buf_size);
+        break;
+    case UEFI_VARS_REG_BUFFER_ADDR_LO:
+        uv->buf_addr_lo = val;
+        break;
+    case UEFI_VARS_REG_BUFFER_ADDR_HI:
+        uv->buf_addr_hi = val;
+        break;
+    }
+}
+
+static const MemoryRegionOps uefi_vars_ops = {
+    .read = uefi_vars_read,
+    .write = uefi_vars_write,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+    .impl = {
+        .min_access_size = 2,
+        .max_access_size = 4,
+    },
+};
+
+void uefi_vars_init(Object *obj, uefi_vars_state *uv)
+{
+    QTAILQ_INIT(&uv->variables);
+    QTAILQ_INIT(&uv->var_policies);
+    uv->jsonfd = -1;
+    memory_region_init_io(&uv->mr, obj, &uefi_vars_ops, uv,
+                          "uefi-vars", UEFI_VARS_REGS_SIZE);
+}
+
+void uefi_vars_realize(uefi_vars_state *uv, Error **errp)
+{
+    uefi_vars_json_init(uv, errp);
+    uefi_vars_json_load(uv, errp);
+}