diff mbox series

[v1] target/i386/host-cpu: Use IOMMU addr width for passthrough devices on Intel platforms

Message ID 20231113073239.270591-1-vivek.kasireddy@intel.com (mailing list archive)
State New, archived
Headers show
Series [v1] target/i386/host-cpu: Use IOMMU addr width for passthrough devices on Intel platforms | expand

Commit Message

Kasireddy, Vivek Nov. 13, 2023, 7:32 a.m. UTC
A recent OVMF update has resulted in MMIO regions being placed at
the upper end of the physical address space. As a result, when a
Host device is assigned to the Guest via VFIO, the following
mapping failures occur when VFIO tries to map the MMIO regions of
the device:
VFIO_MAP_DMA failed: Invalid argument
vfio_dma_map(0x557b2f2736d0, 0x380000000000, 0x1000000, 0x7f98ac400000) = -22 (Invalid argument)

The above failures are mainly seen on some Intel platforms where
the physical address width is larger than the Host's IOMMU
address width. In these cases, VFIO fails to map the MMIO regions
because the IOVAs would be larger than the IOMMU aperture regions.

Therefore, one way to solve this problem would be to ensure that
cpu->phys_bits <= <IOMMU phys_bits>
This can be done by parsing the IOMMU caps value from sysfs,
extracting the address width, and using it to cap the
phys_bits value as shown in this patch.

Previous attempt at solving this issue in OVMF:
https://edk2.groups.io/g/devel/topic/102359124

Cc: Gerd Hoffmann <kraxel@redhat.com>
Cc: Philippe Mathieu-Daudé <philmd@linaro.org>
Cc: Alex Williamson <alex.williamson@redhat.com>
Cc: Laszlo Ersek <lersek@redhat.com>
Cc: Dongwon Kim <dongwon.kim@intel.com>
Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
---
 target/i386/host-cpu.c | 61 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 60 insertions(+), 1 deletion(-)

Comments

Laszlo Ersek Nov. 13, 2023, 9:29 a.m. UTC | #1
On 11/13/23 08:32, Vivek Kasireddy wrote:
> A recent OVMF update has resulted in MMIO regions being placed at
> the upper end of the physical address space. As a result, when a
> Host device is passthrough'd to the Guest via VFIO, the following
> mapping failures occur when VFIO tries to map the MMIO regions of
> the device:
> VFIO_MAP_DMA failed: Invalid argument
> vfio_dma_map(0x557b2f2736d0, 0x380000000000, 0x1000000, 0x7f98ac400000) = -22 (Invalid argument)
> 
> The above failures are mainly seen on some Intel platforms where
> the physical address width is larger than the Host's IOMMU
> address width. In these cases, VFIO fails to map the MMIO regions
> because the IOVAs would be larger than the IOMMU aperture regions.
> 
> Therefore, one way to solve this problem would be to ensure that
> cpu->phys_bits = <IOMMU phys_bits>
> This can be done by parsing the IOMMU caps value from sysfs and
> extracting the address width and using it to override the
> phys_bits value as shown in this patch.
> 
> Previous attempt at solving this issue in OVMF:
> https://edk2.groups.io/g/devel/topic/102359124
> 
> Cc: Gerd Hoffmann <kraxel@redhat.com>
> Cc: Philippe Mathieu-Daudé <philmd@linaro.org>
> Cc: Alex Williamson <alex.williamson@redhat.com>
> Cc: Laszlo Ersek <lersek@redhat.com>
> Cc: Dongwon Kim <dongwon.kim@intel.com>
> Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
> ---
>  target/i386/host-cpu.c | 61 +++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 60 insertions(+), 1 deletion(-)
> 
> diff --git a/target/i386/host-cpu.c b/target/i386/host-cpu.c
> index 92ecb7254b..8326ec95bc 100644
> --- a/target/i386/host-cpu.c
> +++ b/target/i386/host-cpu.c
> @@ -12,6 +12,8 @@
>  #include "host-cpu.h"
>  #include "qapi/error.h"
>  #include "qemu/error-report.h"
> +#include "qemu/config-file.h"
> +#include "qemu/option.h"
>  #include "sysemu/sysemu.h"
>  
>  /* Note: Only safe for use on x86(-64) hosts */
> @@ -51,11 +53,58 @@ static void host_cpu_enable_cpu_pm(X86CPU *cpu)
>      env->features[FEAT_1_ECX] |= CPUID_EXT_MONITOR;
>  }
>  
> +static int intel_iommu_check(void *opaque, QemuOpts *opts, Error **errp)
> +{
> +    g_autofree char *dev_path = NULL, *iommu_path = NULL, *caps = NULL;
> +    const char *driver = qemu_opt_get(opts, "driver");
> +    const char *device = qemu_opt_get(opts, "host");
> +    uint32_t *iommu_phys_bits = opaque;
> +    struct stat st;
> +    uint64_t iommu_caps;
> +
> +    /*
> +     * Check if the user is passthroughing any devices via VFIO. We don't
> +     * have to limit phys_bits if there are no valid passthrough devices.
> +     */
> +    if (g_strcmp0(driver, "vfio-pci") || !device) {
> +        return 0;
> +    }
> +
> +    dev_path = g_strdup_printf("/sys/bus/pci/devices/%s", device);
> +    if (stat(dev_path, &st) < 0) {
> +        return 0;
> +    }
> +
> +    iommu_path = g_strdup_printf("%s/iommu/intel-iommu/cap", dev_path);
> +    if (stat(iommu_path, &st) < 0) {
> +        return 0;
> +    }
> +
> +    if (g_file_get_contents(iommu_path, &caps, NULL, NULL)) {
> +        if (sscanf(caps, "%lx", &iommu_caps) != 1) {
> +            return 0;
> +        }
> +        *iommu_phys_bits = ((iommu_caps >> 16) & 0x3f) + 1;
> +    }
> +
> +    return 0;
> +}
> +
> +static uint32_t host_iommu_phys_bits(void)
> +{
> +    uint32_t iommu_phys_bits = 0;
> +
> +    qemu_opts_foreach(qemu_find_opts("device"),
> +                      intel_iommu_check, &iommu_phys_bits, NULL);
> +    return iommu_phys_bits;
> +}
> +
>  static uint32_t host_cpu_adjust_phys_bits(X86CPU *cpu)
>  {
>      uint32_t host_phys_bits = host_cpu_phys_bits();
> +    uint32_t iommu_phys_bits = host_iommu_phys_bits();
>      uint32_t phys_bits = cpu->phys_bits;
> -    static bool warned;
> +    static bool warned, warned2;
>  
>      /*
>       * Print a warning if the user set it to a value that's not the
> @@ -78,6 +127,16 @@ static uint32_t host_cpu_adjust_phys_bits(X86CPU *cpu)
>          }
>      }
>  
> +    if (iommu_phys_bits && phys_bits > iommu_phys_bits) {
> +        phys_bits = iommu_phys_bits;
> +        if (!warned2) {
> +            warn_report("Using physical bits (%u)"
> +                        " to prevent VFIO mapping failures",
> +                        iommu_phys_bits);
> +            warned2 = true;
> +        }
> +    }
> +
>      return phys_bits;
>  }
>  

I only have very superficial comments here (sorry about that -- I find
it too bad that this QEMU source file seems to have no designated
reviewer or maintainer in QEMU, so I don't want to ignore it).

- Terminology: I think we like to call these devices "assigned", and not
"passed through". Also, in noun form, "device assignment" and not
"device passthrough". Sorry about being pedantic.

- As I (may have) mentioned in my OVMF comments, I'm unsure if narrowing
the VCPU "phys address bits" property due to host IOMMU limitations is a
good design. To me it feels like hacking one piece of information into
another (unrelated) piece of information. It vaguely makes me think
we're going to regret this later. But I don't have any specific, current
counter-argument, admittedly.

Laszlo
Kasireddy, Vivek Nov. 14, 2023, 6:38 a.m. UTC | #2
Hi Laszlo,

> 
> On 11/13/23 08:32, Vivek Kasireddy wrote:
> > A recent OVMF update has resulted in MMIO regions being placed at
> > the upper end of the physical address space. As a result, when a
> > Host device is passthrough'd to the Guest via VFIO, the following
> > mapping failures occur when VFIO tries to map the MMIO regions of
> > the device:
> > VFIO_MAP_DMA failed: Invalid argument
> > vfio_dma_map(0x557b2f2736d0, 0x380000000000, 0x1000000,
> 0x7f98ac400000) = -22 (Invalid argument)
> >
> > The above failures are mainly seen on some Intel platforms where
> > the physical address width is larger than the Host's IOMMU
> > address width. In these cases, VFIO fails to map the MMIO regions
> > because the IOVAs would be larger than the IOMMU aperture regions.
> >
> > Therefore, one way to solve this problem would be to ensure that
> > cpu->phys_bits = <IOMMU phys_bits>
> > This can be done by parsing the IOMMU caps value from sysfs and
> > extracting the address width and using it to override the
> > phys_bits value as shown in this patch.
> >
> > Previous attempt at solving this issue in OVMF:
> > https://edk2.groups.io/g/devel/topic/102359124
> >
> > Cc: Gerd Hoffmann <kraxel@redhat.com>
> > Cc: Philippe Mathieu-Daudé <philmd@linaro.org>
> > Cc: Alex Williamson <alex.williamson@redhat.com>
> > Cc: Laszlo Ersek <lersek@redhat.com>
> > Cc: Dongwon Kim <dongwon.kim@intel.com>
> > Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
> > ---
> >  target/i386/host-cpu.c | 61
> +++++++++++++++++++++++++++++++++++++++++-
> >  1 file changed, 60 insertions(+), 1 deletion(-)
> >
> > diff --git a/target/i386/host-cpu.c b/target/i386/host-cpu.c
> > index 92ecb7254b..8326ec95bc 100644
> > --- a/target/i386/host-cpu.c
> > +++ b/target/i386/host-cpu.c
> > @@ -12,6 +12,8 @@
> >  #include "host-cpu.h"
> >  #include "qapi/error.h"
> >  #include "qemu/error-report.h"
> > +#include "qemu/config-file.h"
> > +#include "qemu/option.h"
> >  #include "sysemu/sysemu.h"
> >
> >  /* Note: Only safe for use on x86(-64) hosts */
> > @@ -51,11 +53,58 @@ static void host_cpu_enable_cpu_pm(X86CPU
> *cpu)
> >      env->features[FEAT_1_ECX] |= CPUID_EXT_MONITOR;
> >  }
> >
> > +static int intel_iommu_check(void *opaque, QemuOpts *opts, Error
> **errp)
> > +{
> > +    g_autofree char *dev_path = NULL, *iommu_path = NULL, *caps = NULL;
> > +    const char *driver = qemu_opt_get(opts, "driver");
> > +    const char *device = qemu_opt_get(opts, "host");
> > +    uint32_t *iommu_phys_bits = opaque;
> > +    struct stat st;
> > +    uint64_t iommu_caps;
> > +
> > +    /*
> > +     * Check if the user is passthroughing any devices via VFIO. We don't
> > +     * have to limit phys_bits if there are no valid passthrough devices.
> > +     */
> > +    if (g_strcmp0(driver, "vfio-pci") || !device) {
> > +        return 0;
> > +    }
> > +
> > +    dev_path = g_strdup_printf("/sys/bus/pci/devices/%s", device);
> > +    if (stat(dev_path, &st) < 0) {
> > +        return 0;
> > +    }
> > +
> > +    iommu_path = g_strdup_printf("%s/iommu/intel-iommu/cap",
> dev_path);
> > +    if (stat(iommu_path, &st) < 0) {
> > +        return 0;
> > +    }
> > +
> > +    if (g_file_get_contents(iommu_path, &caps, NULL, NULL)) {
> > +        if (sscanf(caps, "%lx", &iommu_caps) != 1) {
> > +            return 0;
> > +        }
> > +        *iommu_phys_bits = ((iommu_caps >> 16) & 0x3f) + 1;
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +static uint32_t host_iommu_phys_bits(void)
> > +{
> > +    uint32_t iommu_phys_bits = 0;
> > +
> > +    qemu_opts_foreach(qemu_find_opts("device"),
> > +                      intel_iommu_check, &iommu_phys_bits, NULL);
> > +    return iommu_phys_bits;
> > +}
> > +
> >  static uint32_t host_cpu_adjust_phys_bits(X86CPU *cpu)
> >  {
> >      uint32_t host_phys_bits = host_cpu_phys_bits();
> > +    uint32_t iommu_phys_bits = host_iommu_phys_bits();
> >      uint32_t phys_bits = cpu->phys_bits;
> > -    static bool warned;
> > +    static bool warned, warned2;
> >
> >      /*
> >       * Print a warning if the user set it to a value that's not the
> > @@ -78,6 +127,16 @@ static uint32_t host_cpu_adjust_phys_bits(X86CPU
> *cpu)
> >          }
> >      }
> >
> > +    if (iommu_phys_bits && phys_bits > iommu_phys_bits) {
> > +        phys_bits = iommu_phys_bits;
> > +        if (!warned2) {
> > +            warn_report("Using physical bits (%u)"
> > +                        " to prevent VFIO mapping failures",
> > +                        iommu_phys_bits);
> > +            warned2 = true;
> > +        }
> > +    }
> > +
> >      return phys_bits;
> >  }
> >
> 
> I only have very superficial comments here (sorry about that -- I find
> it too bad that this QEMU source file seems to have no designated
> reviewer or maintainer in QEMU, so I don't want to ignore it).
> 
> - Terminology: I think we like to call these devices "assigned", and not
> "passed through". Also, in noun form, "device assignment" and not
> "device passthrough". Sorry about being pedantic.
No problem; I'll try to start using the right terminology.

> 
> - As I (may have) mentioned in my OVMF comments, I'm unsure if narrowing
> the VCPU "phys address bits" property due to host IOMMU limitations is a
> good design. To me it feels like hacking one piece of information into
> another (unrelated) piece of information. It vaguely makes me think
> we're going to regret this later. But I don't have any specific, current
> counter-argument, admittedly.
The physical address space restriction is applied only if the user requests a
VFIO-assigned device, and not in the general case, which I think is somewhat
reasonable. However, I do agree with you that this solution feels a bit lackluster.

AFAICS, since the main issue here is the placement of the MMIO window, I
am wondering if we can address this specific issue. IIRC, prior to Gerd's patch,
the MMIO base and size were both 32 GB and this seemed to have worked
fine with VFIO-assigned devices. So, I am wondering what the pros and
cons are of keeping this behavior vs. the new one.

Thanks,
Vivek

> 
> Laszlo
Laszlo Ersek Nov. 14, 2023, 3:07 p.m. UTC | #3
On 11/14/23 07:38, Kasireddy, Vivek wrote:
> Hi Laszlo,
> 
>>
>> On 11/13/23 08:32, Vivek Kasireddy wrote:
>>> A recent OVMF update has resulted in MMIO regions being placed at
>>> the upper end of the physical address space. As a result, when a
>>> Host device is passthrough'd to the Guest via VFIO, the following
>>> mapping failures occur when VFIO tries to map the MMIO regions of
>>> the device:
>>> VFIO_MAP_DMA failed: Invalid argument
>>> vfio_dma_map(0x557b2f2736d0, 0x380000000000, 0x1000000,
>> 0x7f98ac400000) = -22 (Invalid argument)
>>>
>>> The above failures are mainly seen on some Intel platforms where
>>> the physical address width is larger than the Host's IOMMU
>>> address width. In these cases, VFIO fails to map the MMIO regions
>>> because the IOVAs would be larger than the IOMMU aperture regions.
>>>
>>> Therefore, one way to solve this problem would be to ensure that
>>> cpu->phys_bits = <IOMMU phys_bits>
>>> This can be done by parsing the IOMMU caps value from sysfs and
>>> extracting the address width and using it to override the
>>> phys_bits value as shown in this patch.
>>>
>>> Previous attempt at solving this issue in OVMF:
>>> https://edk2.groups.io/g/devel/topic/102359124
>>>
>>> Cc: Gerd Hoffmann <kraxel@redhat.com>
>>> Cc: Philippe Mathieu-Daudé <philmd@linaro.org>
>>> Cc: Alex Williamson <alex.williamson@redhat.com>
>>> Cc: Laszlo Ersek <lersek@redhat.com>
>>> Cc: Dongwon Kim <dongwon.kim@intel.com>
>>> Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
>>> ---
>>>  target/i386/host-cpu.c | 61
>> +++++++++++++++++++++++++++++++++++++++++-
>>>  1 file changed, 60 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/target/i386/host-cpu.c b/target/i386/host-cpu.c
>>> index 92ecb7254b..8326ec95bc 100644
>>> --- a/target/i386/host-cpu.c
>>> +++ b/target/i386/host-cpu.c
>>> @@ -12,6 +12,8 @@
>>>  #include "host-cpu.h"
>>>  #include "qapi/error.h"
>>>  #include "qemu/error-report.h"
>>> +#include "qemu/config-file.h"
>>> +#include "qemu/option.h"
>>>  #include "sysemu/sysemu.h"
>>>
>>>  /* Note: Only safe for use on x86(-64) hosts */
>>> @@ -51,11 +53,58 @@ static void host_cpu_enable_cpu_pm(X86CPU
>> *cpu)
>>>      env->features[FEAT_1_ECX] |= CPUID_EXT_MONITOR;
>>>  }
>>>
>>> +static int intel_iommu_check(void *opaque, QemuOpts *opts, Error
>> **errp)
>>> +{
>>> +    g_autofree char *dev_path = NULL, *iommu_path = NULL, *caps = NULL;
>>> +    const char *driver = qemu_opt_get(opts, "driver");
>>> +    const char *device = qemu_opt_get(opts, "host");
>>> +    uint32_t *iommu_phys_bits = opaque;
>>> +    struct stat st;
>>> +    uint64_t iommu_caps;
>>> +
>>> +    /*
>>> +     * Check if the user is passthroughing any devices via VFIO. We don't
>>> +     * have to limit phys_bits if there are no valid passthrough devices.
>>> +     */
>>> +    if (g_strcmp0(driver, "vfio-pci") || !device) {
>>> +        return 0;
>>> +    }
>>> +
>>> +    dev_path = g_strdup_printf("/sys/bus/pci/devices/%s", device);
>>> +    if (stat(dev_path, &st) < 0) {
>>> +        return 0;
>>> +    }
>>> +
>>> +    iommu_path = g_strdup_printf("%s/iommu/intel-iommu/cap",
>> dev_path);
>>> +    if (stat(iommu_path, &st) < 0) {
>>> +        return 0;
>>> +    }
>>> +
>>> +    if (g_file_get_contents(iommu_path, &caps, NULL, NULL)) {
>>> +        if (sscanf(caps, "%lx", &iommu_caps) != 1) {
>>> +            return 0;
>>> +        }
>>> +        *iommu_phys_bits = ((iommu_caps >> 16) & 0x3f) + 1;
>>> +    }
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static uint32_t host_iommu_phys_bits(void)
>>> +{
>>> +    uint32_t iommu_phys_bits = 0;
>>> +
>>> +    qemu_opts_foreach(qemu_find_opts("device"),
>>> +                      intel_iommu_check, &iommu_phys_bits, NULL);
>>> +    return iommu_phys_bits;
>>> +}
>>> +
>>>  static uint32_t host_cpu_adjust_phys_bits(X86CPU *cpu)
>>>  {
>>>      uint32_t host_phys_bits = host_cpu_phys_bits();
>>> +    uint32_t iommu_phys_bits = host_iommu_phys_bits();
>>>      uint32_t phys_bits = cpu->phys_bits;
>>> -    static bool warned;
>>> +    static bool warned, warned2;
>>>
>>>      /*
>>>       * Print a warning if the user set it to a value that's not the
>>> @@ -78,6 +127,16 @@ static uint32_t host_cpu_adjust_phys_bits(X86CPU
>> *cpu)
>>>          }
>>>      }
>>>
>>> +    if (iommu_phys_bits && phys_bits > iommu_phys_bits) {
>>> +        phys_bits = iommu_phys_bits;
>>> +        if (!warned2) {
>>> +            warn_report("Using physical bits (%u)"
>>> +                        " to prevent VFIO mapping failures",
>>> +                        iommu_phys_bits);
>>> +            warned2 = true;
>>> +        }
>>> +    }
>>> +
>>>      return phys_bits;
>>>  }
>>>
>>
>> I only have very superficial comments here (sorry about that -- I find
>> it too bad that this QEMU source file seems to have no designated
>> reviewer or maintainer in QEMU, so I don't want to ignore it).
>>
>> - Terminology: I think we like to call these devices "assigned", and not
>> "passed through". Also, in noun form, "device assignment" and not
>> "device passthrough". Sorry about being pedantic.
> No problem; I'll try to start using the right terminology.
> 
>>
>> - As I (may have) mentioned in my OVMF comments, I'm unsure if narrowing
>> the VCPU "phys address bits" property due to host IOMMU limitations is a
>> good design. To me it feels like hacking one piece of information into
>> another (unrelated) piece of information. It vaguely makes me think
>> we're going to regret this later. But I don't have any specific, current
>> counter-argument, admittedly.
> The physical address space restriction is only applied if the user requests a VFIO
> assigned device but not in other general cases; which I think is somewhat
> reasonable. However, I do agree with you that this solution feels a bit lackluster.
> 
> AFAICS, since the main issue here is the placement of the MMIO window, I
> am wondering if we can address this specific issue. IIRC, prior to Gerd's patch,
> the MMIO base and size were both 32 GB and this seemed to have worked
> fine with VFIO-assigned devices.

This (default) placement and size were the consequence of the default
VCPU phys address width (allowing for a 64GB phys address space) and
that in most cases one wouldn't have large enough guest RAM (or a large
enough guest RAM hotplug range) to disturb this default.

In general, yes, it worked fine, but there were numerous exceptions too
where it didn't work. Especially GPU cards with huge BARs.

> So, I am wondering what are the pros and
> cons of keeping this behavior vs the new one.

The only motivation is user convenience. The explicit (albeit
experimental) fw_cfg works 100% well if we're willing to expose users to
manual tweaking. The alternative is to automate the calculations
*fully*. (It makes no sense, IMO, to unburden users "somewhat", in this
regard.)

In that sense, considering user experience, your patch is actually
great. On the other hand, it's not entirely sound from an engineering
perspective (IMO). I do feel we need a new information channel for this.

Laszlo
Gerd Hoffmann Nov. 15, 2023, 10:52 a.m. UTC | #4
Hi,

> > +    if (iommu_phys_bits && phys_bits > iommu_phys_bits) {
> > +        phys_bits = iommu_phys_bits;
> > +        if (!warned2) {
> > +            warn_report("Using physical bits (%u)"
> > +                        " to prevent VFIO mapping failures",
> > +                        iommu_phys_bits);
> > +            warned2 = true;
> > +        }
> > +    }
> > +
> >      return phys_bits;
> >  }
 
> - As I (may have) mentioned in my OVMF comments, I'm unsure if narrowing
> the VCPU "phys address bits" property due to host IOMMU limitations is a
> good design. To me it feels like hacking one piece of information into
> another (unrelated) piece of information. It vaguely makes me think
> we're going to regret this later. But I don't have any specific, current
> counter-argument, admittedly.

It boils down to:

  (a) do MIN(cpu-phys-bits,iommu-phys-bits) in qemu (this patch)

or

  (b1) communicate iommu-phys-bits to the guest firmware
  (b2) do MIN(cpu-phys-bits,iommu-phys-bits) in the guest.

We certainly had cases in the past where taking shortcuts in the
design to simplify things caused problems later on.  So variant (a)
leaves the somewhat ugly feeling that we might regret this some day.

On the other hand switching from (a) to (b) at some point in the future
(should the need arise) shouldn't be much different from doing (b) now.
And the whole phys-bits situation is already messy enough even without
a new iommu-phys-bits setting for the firmware.

So, all in all I think I'm fine with taking this approach.

Acked-by: Gerd Hoffmann <kraxel@redhat.com>

take care,
  Gerd
Cédric Le Goater Dec. 5, 2023, 5:42 p.m. UTC | #5
On 11/13/23 08:32, Vivek Kasireddy wrote:
> A recent OVMF update has resulted in MMIO regions being placed at
> the upper end of the physical address space. As a result, when a
> Host device is passthrough'd to the Guest via VFIO, the following
> mapping failures occur when VFIO tries to map the MMIO regions of
> the device:
> VFIO_MAP_DMA failed: Invalid argument
> vfio_dma_map(0x557b2f2736d0, 0x380000000000, 0x1000000, 0x7f98ac400000) = -22 (Invalid argument)

Both OVMF and SeaBIOS guests are impacted: SeaBIOS 1.16.3 introduced
the same change to the PCI window placement.

C.

> The above failures are mainly seen on some Intel platforms where
> the physical address width is larger than the Host's IOMMU
> address width. In these cases, VFIO fails to map the MMIO regions
> because the IOVAs would be larger than the IOMMU aperture regions.
> 
> Therefore, one way to solve this problem would be to ensure that
> cpu->phys_bits = <IOMMU phys_bits>
> This can be done by parsing the IOMMU caps value from sysfs and
> extracting the address width and using it to override the
> phys_bits value as shown in this patch.
> 
> Previous attempt at solving this issue in OVMF:
> https://edk2.groups.io/g/devel/topic/102359124
> 
> Cc: Gerd Hoffmann <kraxel@redhat.com>
> Cc: Philippe Mathieu-Daudé <philmd@linaro.org>
> Cc: Alex Williamson <alex.williamson@redhat.com>
> Cc: Laszlo Ersek <lersek@redhat.com>
> Cc: Dongwon Kim <dongwon.kim@intel.com>
> Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
> ---
>   target/i386/host-cpu.c | 61 +++++++++++++++++++++++++++++++++++++++++-
>   1 file changed, 60 insertions(+), 1 deletion(-)
> 
> diff --git a/target/i386/host-cpu.c b/target/i386/host-cpu.c
> index 92ecb7254b..8326ec95bc 100644
> --- a/target/i386/host-cpu.c
> +++ b/target/i386/host-cpu.c
> @@ -12,6 +12,8 @@
>   #include "host-cpu.h"
>   #include "qapi/error.h"
>   #include "qemu/error-report.h"
> +#include "qemu/config-file.h"
> +#include "qemu/option.h"
>   #include "sysemu/sysemu.h"
>   
>   /* Note: Only safe for use on x86(-64) hosts */
> @@ -51,11 +53,58 @@ static void host_cpu_enable_cpu_pm(X86CPU *cpu)
>       env->features[FEAT_1_ECX] |= CPUID_EXT_MONITOR;
>   }
>   
> +static int intel_iommu_check(void *opaque, QemuOpts *opts, Error **errp)
> +{
> +    g_autofree char *dev_path = NULL, *iommu_path = NULL, *caps = NULL;
> +    const char *driver = qemu_opt_get(opts, "driver");
> +    const char *device = qemu_opt_get(opts, "host");
> +    uint32_t *iommu_phys_bits = opaque;
> +    struct stat st;
> +    uint64_t iommu_caps;
> +
> +    /*
> +     * Check if the user is passthroughing any devices via VFIO. We don't
> +     * have to limit phys_bits if there are no valid passthrough devices.
> +     */
> +    if (g_strcmp0(driver, "vfio-pci") || !device) {
> +        return 0;
> +    }
> +
> +    dev_path = g_strdup_printf("/sys/bus/pci/devices/%s", device);
> +    if (stat(dev_path, &st) < 0) {
> +        return 0;
> +    }
> +
> +    iommu_path = g_strdup_printf("%s/iommu/intel-iommu/cap", dev_path);
> +    if (stat(iommu_path, &st) < 0) {
> +        return 0;
> +    }
> +
> +    if (g_file_get_contents(iommu_path, &caps, NULL, NULL)) {
> +        if (sscanf(caps, "%lx", &iommu_caps) != 1) {
> +            return 0;
> +        }
> +        *iommu_phys_bits = ((iommu_caps >> 16) & 0x3f) + 1;
> +    }
> +
> +    return 0;
> +}
> +
> +static uint32_t host_iommu_phys_bits(void)
> +{
> +    uint32_t iommu_phys_bits = 0;
> +
> +    qemu_opts_foreach(qemu_find_opts("device"),
> +                      intel_iommu_check, &iommu_phys_bits, NULL);
> +    return iommu_phys_bits;
> +}
> +
>   static uint32_t host_cpu_adjust_phys_bits(X86CPU *cpu)
>   {
>       uint32_t host_phys_bits = host_cpu_phys_bits();
> +    uint32_t iommu_phys_bits = host_iommu_phys_bits();
>       uint32_t phys_bits = cpu->phys_bits;
> -    static bool warned;
> +    static bool warned, warned2;
>   
>       /*
>        * Print a warning if the user set it to a value that's not the
> @@ -78,6 +127,16 @@ static uint32_t host_cpu_adjust_phys_bits(X86CPU *cpu)
>           }
>       }
>   
> +    if (iommu_phys_bits && phys_bits > iommu_phys_bits) {
> +        phys_bits = iommu_phys_bits;
> +        if (!warned2) {
> +            warn_report("Using physical bits (%u)"
> +                        " to prevent VFIO mapping failures",
> +                        iommu_phys_bits);
> +            warned2 = true;
> +        }
> +    }
> +
>       return phys_bits;
>   }
>
YangHang Liu Dec. 6, 2023, 3:33 a.m. UTC | #6
After applying this patch, the Q35 + OVMF L2 VM with an igbvf device no
longer throws errors like:
[1]VFIO_MAP_DMA failed: Invalid argument.
[2]vfio_dma_map(0x560a1a64e3b0, 0x383000004000, 0x4000,
0x7fcfc4053000) = -22 (Invalid argument)

Tested-by: Yanghang Liu <yanghliu@redhat.com>





On Wed, Dec 6, 2023 at 2:08 AM Cédric Le Goater <clegoate@redhat.com> wrote:
>
> On 11/13/23 08:32, Vivek Kasireddy wrote:
> > A recent OVMF update has resulted in MMIO regions being placed at
> > the upper end of the physical address space. As a result, when a
> > Host device is passthrough'd to the Guest via VFIO, the following
> > mapping failures occur when VFIO tries to map the MMIO regions of
> > the device:
> > VFIO_MAP_DMA failed: Invalid argument
> > vfio_dma_map(0x557b2f2736d0, 0x380000000000, 0x1000000, 0x7f98ac400000) = -22 (Invalid argument)
>
> OVMF and Seabios guests are impacted. Seabios 1.16.3 introduced
> the same change of the pci window placement.
>
> C.
>
> > The above failures are mainly seen on some Intel platforms where
> > the physical address width is larger than the Host's IOMMU
> > address width. In these cases, VFIO fails to map the MMIO regions
> > because the IOVAs would be larger than the IOMMU aperture regions.
> >
> > Therefore, one way to solve this problem would be to ensure that
> > cpu->phys_bits = <IOMMU phys_bits>
> > This can be done by parsing the IOMMU caps value from sysfs and
> > extracting the address width and using it to override the
> > phys_bits value as shown in this patch.
> >
> > Previous attempt at solving this issue in OVMF:
> > https://edk2.groups.io/g/devel/topic/102359124
> >
> > Cc: Gerd Hoffmann <kraxel@redhat.com>
> > Cc: Philippe Mathieu-Daudé <philmd@linaro.org>
> > Cc: Alex Williamson <alex.williamson@redhat.com>
> > Cc: Laszlo Ersek <lersek@redhat.com>
> > Cc: Dongwon Kim <dongwon.kim@intel.com>
> > Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
> > ---
> >   target/i386/host-cpu.c | 61 +++++++++++++++++++++++++++++++++++++++++-
> >   1 file changed, 60 insertions(+), 1 deletion(-)
> >
> > diff --git a/target/i386/host-cpu.c b/target/i386/host-cpu.c
> > index 92ecb7254b..8326ec95bc 100644
> > --- a/target/i386/host-cpu.c
> > +++ b/target/i386/host-cpu.c
> > @@ -12,6 +12,8 @@
> >   #include "host-cpu.h"
> >   #include "qapi/error.h"
> >   #include "qemu/error-report.h"
> > +#include "qemu/config-file.h"
> > +#include "qemu/option.h"
> >   #include "sysemu/sysemu.h"
> >
> >   /* Note: Only safe for use on x86(-64) hosts */
> > @@ -51,11 +53,58 @@ static void host_cpu_enable_cpu_pm(X86CPU *cpu)
> >       env->features[FEAT_1_ECX] |= CPUID_EXT_MONITOR;
> >   }
> >
> > +static int intel_iommu_check(void *opaque, QemuOpts *opts, Error **errp)
> > +{
> > +    g_autofree char *dev_path = NULL, *iommu_path = NULL, *caps = NULL;
> > +    const char *driver = qemu_opt_get(opts, "driver");
> > +    const char *device = qemu_opt_get(opts, "host");
> > +    uint32_t *iommu_phys_bits = opaque;
> > +    struct stat st;
> > +    uint64_t iommu_caps;
> > +
> > +    /*
> > +     * Check if the user is passthroughing any devices via VFIO. We don't
> > +     * have to limit phys_bits if there are no valid passthrough devices.
> > +     */
> > +    if (g_strcmp0(driver, "vfio-pci") || !device) {
> > +        return 0;
> > +    }
> > +
> > +    dev_path = g_strdup_printf("/sys/bus/pci/devices/%s", device);
> > +    if (stat(dev_path, &st) < 0) {
> > +        return 0;
> > +    }
> > +
> > +    iommu_path = g_strdup_printf("%s/iommu/intel-iommu/cap", dev_path);
> > +    if (stat(iommu_path, &st) < 0) {
> > +        return 0;
> > +    }
> > +
> > +    if (g_file_get_contents(iommu_path, &caps, NULL, NULL)) {
> > +        if (sscanf(caps, "%lx", &iommu_caps) != 1) {
> > +            return 0;
> > +        }
> > +        *iommu_phys_bits = ((iommu_caps >> 16) & 0x3f) + 1;
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +static uint32_t host_iommu_phys_bits(void)
> > +{
> > +    uint32_t iommu_phys_bits = 0;
> > +
> > +    qemu_opts_foreach(qemu_find_opts("device"),
> > +                      intel_iommu_check, &iommu_phys_bits, NULL);
> > +    return iommu_phys_bits;
> > +}
> > +
> >   static uint32_t host_cpu_adjust_phys_bits(X86CPU *cpu)
> >   {
> >       uint32_t host_phys_bits = host_cpu_phys_bits();
> > +    uint32_t iommu_phys_bits = host_iommu_phys_bits();
> >       uint32_t phys_bits = cpu->phys_bits;
> > -    static bool warned;
> > +    static bool warned, warned2;
> >
> >       /*
> >        * Print a warning if the user set it to a value that's not the
> > @@ -78,6 +127,16 @@ static uint32_t host_cpu_adjust_phys_bits(X86CPU *cpu)
> >           }
> >       }
> >
> > +    if (iommu_phys_bits && phys_bits > iommu_phys_bits) {
> > +        phys_bits = iommu_phys_bits;
> > +        if (!warned2) {
> > +            warn_report("Using physical bits (%u)"
> > +                        " to prevent VFIO mapping failures",
> > +                        iommu_phys_bits);
> > +            warned2 = true;
> > +        }
> > +    }
> > +
> >       return phys_bits;
> >   }
> >
>
>
diff mbox series

Patch

diff --git a/target/i386/host-cpu.c b/target/i386/host-cpu.c
index 92ecb7254b..8326ec95bc 100644
--- a/target/i386/host-cpu.c
+++ b/target/i386/host-cpu.c
@@ -12,6 +12,8 @@ 
 #include "host-cpu.h"
 #include "qapi/error.h"
 #include "qemu/error-report.h"
+#include "qemu/config-file.h"
+#include "qemu/option.h"
 #include "sysemu/sysemu.h"
 
 /* Note: Only safe for use on x86(-64) hosts */
@@ -51,11 +53,58 @@  static void host_cpu_enable_cpu_pm(X86CPU *cpu)
     env->features[FEAT_1_ECX] |= CPUID_EXT_MONITOR;
 }
 
+static int intel_iommu_check(void *opaque, QemuOpts *opts, Error **errp)
+{
+    g_autofree char *dev_path = NULL, *iommu_path = NULL, *caps = NULL;
+    const char *driver = qemu_opt_get(opts, "driver");
+    const char *device = qemu_opt_get(opts, "host");
+    uint32_t *iommu_phys_bits = opaque;
+    struct stat st;
+    uint64_t iommu_caps;
+
+    /*
+     * Check if the user is passthroughing any devices via VFIO. We don't
+     * have to limit phys_bits if there are no valid passthrough devices.
+     */
+    if (g_strcmp0(driver, "vfio-pci") || !device) {
+        return 0;
+    }
+
+    dev_path = g_strdup_printf("/sys/bus/pci/devices/%s", device);
+    if (stat(dev_path, &st) < 0) {
+        return 0;
+    }
+
+    iommu_path = g_strdup_printf("%s/iommu/intel-iommu/cap", dev_path);
+    if (stat(iommu_path, &st) < 0) {
+        return 0;
+    }
+
+    if (g_file_get_contents(iommu_path, &caps, NULL, NULL)) {
+        if (sscanf(caps, "%lx", &iommu_caps) != 1) {
+            return 0;
+        }
+        *iommu_phys_bits = ((iommu_caps >> 16) & 0x3f) + 1;
+    }
+
+    return 0;
+}
+
+static uint32_t host_iommu_phys_bits(void)
+{
+    uint32_t iommu_phys_bits = 0;
+
+    qemu_opts_foreach(qemu_find_opts("device"),
+                      intel_iommu_check, &iommu_phys_bits, NULL);
+    return iommu_phys_bits;
+}
+
 static uint32_t host_cpu_adjust_phys_bits(X86CPU *cpu)
 {
     uint32_t host_phys_bits = host_cpu_phys_bits();
+    uint32_t iommu_phys_bits = host_iommu_phys_bits();
     uint32_t phys_bits = cpu->phys_bits;
-    static bool warned;
+    static bool warned, warned2;
 
     /*
      * Print a warning if the user set it to a value that's not the
@@ -78,6 +127,16 @@  static uint32_t host_cpu_adjust_phys_bits(X86CPU *cpu)
         }
     }
 
+    if (iommu_phys_bits && phys_bits > iommu_phys_bits) {
+        phys_bits = iommu_phys_bits;
+        if (!warned2) {
+            warn_report("Using physical bits (%u)"
+                        " to prevent VFIO mapping failures",
+                        iommu_phys_bits);
+            warned2 = true;
+        }
+    }
+
     return phys_bits;
 }