diff mbox series

[for-4.18,v2] x86/pvh: fix identity mapping of low 1MB

Message ID 20231013085654.2789-1-roger.pau@citrix.com (mailing list archive)
State Superseded
Headers show
Series [for-4.18,v2] x86/pvh: fix identity mapping of low 1MB | expand

Commit Message

Roger Pau Monné Oct. 13, 2023, 8:56 a.m. UTC
The mapping of memory regions below the 1MB mark was all done by the PVH dom0
builder code, causing the region to be avoided by the arch specific IOMMU
hardware domain initialization code.  That led to the IOMMU being enabled
without reserved regions in the low 1MB identity mapped in the p2m for PVH
hardware domains.  Firmware which happens to be missing RMRR/IVMD ranges
describing E820 reserved regions in the low 1MB would transiently trigger IOMMU
faults until the p2m is populated by the PVH dom0 builder:

AMD-Vi: IO_PAGE_FAULT: 0000:00:13.1 d0 addr 00000000000eb380 flags 0x20 RW
AMD-Vi: IO_PAGE_FAULT: 0000:00:13.1 d0 addr 00000000000eb340 flags 0
AMD-Vi: IO_PAGE_FAULT: 0000:00:13.2 d0 addr 00000000000ea1c0 flags 0
AMD-Vi: IO_PAGE_FAULT: 0000:00:14.5 d0 addr 00000000000eb480 flags 0x20 RW
AMD-Vi: IO_PAGE_FAULT: 0000:00:12.0 d0 addr 00000000000eb080 flags 0x20 RW
AMD-Vi: IO_PAGE_FAULT: 0000:00:14.5 d0 addr 00000000000eb400 flags 0
AMD-Vi: IO_PAGE_FAULT: 0000:00:12.0 d0 addr 00000000000eb040 flags 0

Those errors have been observed on the osstest pinot{0,1} boxes (AMD Fam15h
Opteron(tm) Processor 3350 HE).

Mostly remove the special handling of the low 1MB done by the PVH dom0 builder,
leaving just the data copy between RAM regions.  Otherwise rely on the IOMMU
arch init code to create any identity mappings for reserved regions in that
range (like it already does for reserved regions elsewhere).

Note there's a small difference in behavior, as holes in the low 1MB will no
longer be identity mapped to the p2m.

Fixes: 6b4f6a31ace1 ('x86/PVH: de-duplicate mappings for first Mb of Dom0 memory')
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
---
Changes since v1:
 - Reword commit message.
---
 xen/arch/x86/hvm/dom0_build.c       | 22 ----------------------
 xen/drivers/passthrough/x86/iommu.c |  8 +-------
 2 files changed, 1 insertion(+), 29 deletions(-)

Comments

Henry Wang Oct. 13, 2023, 9:17 a.m. UTC | #1
Hi Roger,

> On Oct 13, 2023, at 16:56, Roger Pau Monne <roger.pau@citrix.com> wrote:
> 
> The mapping of memory regions below the 1MB mark was all done by the PVH dom0
> builder code, causing the region to be avoided by the arch specific IOMMU
> hardware domain initialization code.  That lead to the IOMMU being enabled
> without reserved regions in the low 1MB identity mapped in the p2m for PVH
> hardware domains.  Firmware which happens to be missing RMRR/IVMD ranges
> describing E820 reserved regions in the low 1MB would transiently trigger IOMMU
> faults until the p2m is populated by the PVH dom0 builder:
> 
> AMD-Vi: IO_PAGE_FAULT: 0000:00:13.1 d0 addr 00000000000eb380 flags 0x20 RW
> AMD-Vi: IO_PAGE_FAULT: 0000:00:13.1 d0 addr 00000000000eb340 flags 0
> AMD-Vi: IO_PAGE_FAULT: 0000:00:13.2 d0 addr 00000000000ea1c0 flags 0
> AMD-Vi: IO_PAGE_FAULT: 0000:00:14.5 d0 addr 00000000000eb480 flags 0x20 RW
> AMD-Vi: IO_PAGE_FAULT: 0000:00:12.0 d0 addr 00000000000eb080 flags 0x20 RW
> AMD-Vi: IO_PAGE_FAULT: 0000:00:14.5 d0 addr 00000000000eb400 flags 0
> AMD-Vi: IO_PAGE_FAULT: 0000:00:12.0 d0 addr 00000000000eb040 flags 0
> 
> Those errors have been observed on the osstest pinot{0,1} boxes (AMD Fam15h
> Opteron(tm) Processor 3350 HE).
> 
> Mostly remove the special handling of the low 1MB done by the PVH dom0 builder,
> leaving just the data copy between RAM regions.  Otherwise rely on the IOMMU
> arch init code to create any identity mappings for reserved regions in that
> range (like it already does for reserved regions elsewhere).
> 
> Note there's a small difference in behavior, as holes in the low 1MB will no
> longer be identity mapped to the p2m.
> 
> Fixes: 6b4f6a31ace1 ('x86/PVH: de-duplicate mappings for first Mb of Dom0 memory')
> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>

Release-acked-by: Henry Wang <Henry.Wang@arm.com>

Kind regards,
Henry

> ---
> Changes since v1:
> - Reword commit message.
> ---
> xen/arch/x86/hvm/dom0_build.c       | 22 ----------------------
> xen/drivers/passthrough/x86/iommu.c |  8 +-------
> 2 files changed, 1 insertion(+), 29 deletions(-)
> 
> diff --git a/xen/arch/x86/hvm/dom0_build.c b/xen/arch/x86/hvm/dom0_build.c
> index bc0e290db612..979db7d1ec4d 100644
> --- a/xen/arch/x86/hvm/dom0_build.c
> +++ b/xen/arch/x86/hvm/dom0_build.c
> @@ -449,28 +449,6 @@ static int __init pvh_populate_p2m(struct domain *d)
>         }
>     }
> 
> -    /* Non-RAM regions of space below 1MB get identity mapped. */
> -    for ( i = rc = 0; i < MB1_PAGES; ++i )
> -    {
> -        p2m_type_t p2mt;
> -        mfn_t mfn = get_gfn_query(d, i, &p2mt);
> -
> -        if ( mfn_eq(mfn, INVALID_MFN) )
> -            rc = set_mmio_p2m_entry(d, _gfn(i), _mfn(i), PAGE_ORDER_4K);
> -        else
> -            /*
> -             * If the p2m entry is already set it must belong to a RMRR and
> -             * already be identity mapped, or be a RAM region.
> -             */
> -            ASSERT(p2mt == p2m_ram_rw || mfn_eq(mfn, _mfn(i)));
> -        put_gfn(d, i);
> -        if ( rc )
> -        {
> -            printk("Failed to identity map PFN %x: %d\n", i, rc);
> -            return rc;
> -        }
> -    }
> -
>     if ( cpu_has_vmx && paging_mode_hap(d) && !vmx_unrestricted_guest(v) )
>     {
>         /*
> diff --git a/xen/drivers/passthrough/x86/iommu.c b/xen/drivers/passthrough/x86/iommu.c
> index c85549ccad6e..857dccb6a465 100644
> --- a/xen/drivers/passthrough/x86/iommu.c
> +++ b/xen/drivers/passthrough/x86/iommu.c
> @@ -400,13 +400,7 @@ void __hwdom_init arch_iommu_hwdom_init(struct domain *d)
>     max_pfn = (GB(4) >> PAGE_SHIFT) - 1;
>     top = max(max_pdx, pfn_to_pdx(max_pfn) + 1);
> 
> -    /*
> -     * First Mb will get mapped in one go by pvh_populate_p2m(). Avoid
> -     * setting up potentially conflicting mappings here.
> -     */
> -    start = paging_mode_translate(d) ? PFN_DOWN(MB(1)) : 0;
> -
> -    for ( i = pfn_to_pdx(start), count = 0; i < top; )
> +    for ( i = 0, start = 0, count = 0; i < top; )
>     {
>         unsigned long pfn = pdx_to_pfn(i);
>         unsigned int perms = hwdom_iommu_map(d, pfn, max_pfn);
> -- 
> 2.42.0
>
Jan Beulich Oct. 16, 2023, 1:32 p.m. UTC | #2
On 13.10.2023 10:56, Roger Pau Monne wrote:
> The mapping of memory regions below the 1MB mark was all done by the PVH dom0
> builder code, causing the region to be avoided by the arch specific IOMMU
> hardware domain initialization code.  That lead to the IOMMU being enabled
> without reserved regions in the low 1MB identity mapped in the p2m for PVH
> hardware domains.  Firmware which happens to be missing RMRR/IVMD ranges
> describing E820 reserved regions in the low 1MB would transiently trigger IOMMU
> faults until the p2m is populated by the PVH dom0 builder:
> 
> AMD-Vi: IO_PAGE_FAULT: 0000:00:13.1 d0 addr 00000000000eb380 flags 0x20 RW
> AMD-Vi: IO_PAGE_FAULT: 0000:00:13.1 d0 addr 00000000000eb340 flags 0
> AMD-Vi: IO_PAGE_FAULT: 0000:00:13.2 d0 addr 00000000000ea1c0 flags 0
> AMD-Vi: IO_PAGE_FAULT: 0000:00:14.5 d0 addr 00000000000eb480 flags 0x20 RW
> AMD-Vi: IO_PAGE_FAULT: 0000:00:12.0 d0 addr 00000000000eb080 flags 0x20 RW
> AMD-Vi: IO_PAGE_FAULT: 0000:00:14.5 d0 addr 00000000000eb400 flags 0
> AMD-Vi: IO_PAGE_FAULT: 0000:00:12.0 d0 addr 00000000000eb040 flags 0
> 
> Those errors have been observed on the osstest pinot{0,1} boxes (AMD Fam15h
> Opteron(tm) Processor 3350 HE).
> 
> Mostly remove the special handling of the low 1MB done by the PVH dom0 builder,
> leaving just the data copy between RAM regions.  Otherwise rely on the IOMMU
> arch init code to create any identity mappings for reserved regions in that
> range (like it already does for reserved regions elsewhere).
> 
> Note there's a small difference in behavior, as holes in the low 1MB will no
> longer be identity mapped to the p2m.

I certainly like the simplification, but I'm concerned by this: The BDA
is not normally reserved, yet may want accessing by Dom0 (to see the real
machine contents). We do access that first page of memory ourselves, so
I expect OSes may do so as well (even if the specific aspect I'm thinking
of - the warm/cold reboot field - is under Xen's control).

Jan
Roger Pau Monné Oct. 16, 2023, 1:51 p.m. UTC | #3
On Mon, Oct 16, 2023 at 03:32:54PM +0200, Jan Beulich wrote:
> On 13.10.2023 10:56, Roger Pau Monne wrote:
> > The mapping of memory regions below the 1MB mark was all done by the PVH dom0
> > builder code, causing the region to be avoided by the arch specific IOMMU
> > hardware domain initialization code.  That lead to the IOMMU being enabled
> > without reserved regions in the low 1MB identity mapped in the p2m for PVH
> > hardware domains.  Firmware which happens to be missing RMRR/IVMD ranges
> > describing E820 reserved regions in the low 1MB would transiently trigger IOMMU
> > faults until the p2m is populated by the PVH dom0 builder:
> > 
> > AMD-Vi: IO_PAGE_FAULT: 0000:00:13.1 d0 addr 00000000000eb380 flags 0x20 RW
> > AMD-Vi: IO_PAGE_FAULT: 0000:00:13.1 d0 addr 00000000000eb340 flags 0
> > AMD-Vi: IO_PAGE_FAULT: 0000:00:13.2 d0 addr 00000000000ea1c0 flags 0
> > AMD-Vi: IO_PAGE_FAULT: 0000:00:14.5 d0 addr 00000000000eb480 flags 0x20 RW
> > AMD-Vi: IO_PAGE_FAULT: 0000:00:12.0 d0 addr 00000000000eb080 flags 0x20 RW
> > AMD-Vi: IO_PAGE_FAULT: 0000:00:14.5 d0 addr 00000000000eb400 flags 0
> > AMD-Vi: IO_PAGE_FAULT: 0000:00:12.0 d0 addr 00000000000eb040 flags 0
> > 
> > Those errors have been observed on the osstest pinot{0,1} boxes (AMD Fam15h
> > Opteron(tm) Processor 3350 HE).
> > 
> > Mostly remove the special handling of the low 1MB done by the PVH dom0 builder,
> > leaving just the data copy between RAM regions.  Otherwise rely on the IOMMU
> > arch init code to create any identity mappings for reserved regions in that
> > range (like it already does for reserved regions elsewhere).
> > 
> > Note there's a small difference in behavior, as holes in the low 1MB will no
> > longer be identity mapped to the p2m.
> 
> I certainly like the simplification, but I'm concerned by this: The BDA
> is not normally reserved, yet may want accessing by Dom0 (to see the real
> machine contents). We do access that first page of memory ourselves, so
> I expect OSes may do so as well (even if the specific aspect I'm thinking
> of - the warm/cold reboot field - is under Xen's control).

The BDA on the systems I've checked falls into a RAM area on the
memory map, but if you think it can be problematic I could arrange for
arch_iommu_hwdom_init() to also identity map holes in the low 1MB.

Keep in mind this is only for PVH, it won't affect PV.

Thanks, Roger.
Jan Beulich Oct. 16, 2023, 2:07 p.m. UTC | #4
On 16.10.2023 15:51, Roger Pau Monné wrote:
> On Mon, Oct 16, 2023 at 03:32:54PM +0200, Jan Beulich wrote:
>> On 13.10.2023 10:56, Roger Pau Monne wrote:
>>> The mapping of memory regions below the 1MB mark was all done by the PVH dom0
>>> builder code, causing the region to be avoided by the arch specific IOMMU
>>> hardware domain initialization code.  That lead to the IOMMU being enabled
>>> without reserved regions in the low 1MB identity mapped in the p2m for PVH
>>> hardware domains.  Firmware which happens to be missing RMRR/IVMD ranges
>>> describing E820 reserved regions in the low 1MB would transiently trigger IOMMU
>>> faults until the p2m is populated by the PVH dom0 builder:
>>>
>>> AMD-Vi: IO_PAGE_FAULT: 0000:00:13.1 d0 addr 00000000000eb380 flags 0x20 RW
>>> AMD-Vi: IO_PAGE_FAULT: 0000:00:13.1 d0 addr 00000000000eb340 flags 0
>>> AMD-Vi: IO_PAGE_FAULT: 0000:00:13.2 d0 addr 00000000000ea1c0 flags 0
>>> AMD-Vi: IO_PAGE_FAULT: 0000:00:14.5 d0 addr 00000000000eb480 flags 0x20 RW
>>> AMD-Vi: IO_PAGE_FAULT: 0000:00:12.0 d0 addr 00000000000eb080 flags 0x20 RW
>>> AMD-Vi: IO_PAGE_FAULT: 0000:00:14.5 d0 addr 00000000000eb400 flags 0
>>> AMD-Vi: IO_PAGE_FAULT: 0000:00:12.0 d0 addr 00000000000eb040 flags 0
>>>
>>> Those errors have been observed on the osstest pinot{0,1} boxes (AMD Fam15h
>>> Opteron(tm) Processor 3350 HE).
>>>
>>> Mostly remove the special handling of the low 1MB done by the PVH dom0 builder,
>>> leaving just the data copy between RAM regions.  Otherwise rely on the IOMMU
>>> arch init code to create any identity mappings for reserved regions in that
>>> range (like it already does for reserved regions elsewhere).
>>>
>>> Note there's a small difference in behavior, as holes in the low 1MB will no
>>> longer be identity mapped to the p2m.
>>
>> I certainly like the simplification, but I'm concerned by this: The BDA
>> is not normally reserved, yet may want accessing by Dom0 (to see the real
>> machine contents). We do access that first page of memory ourselves, so
>> I expect OSes may do so as well (even if the specific aspect I'm thinking
>> of - the warm/cold reboot field - is under Xen's control).
> 
> The BDA on the systems I've checked falls into a RAM area on the
> memory map, but if you think it can be problematic I could arrange for
> arch_iommu_hwdom_init() to also identity map holes in the low 1MB.

Hmm, this again is a case where I'd wish CPU and IOMMU mappings could
be different. I don't see reasons to try I/O to such holes, but I can
see reasons for CPU accesses (of more or less probing kind).

> Keep in mind this is only for PVH, it won't affect PV.

Of course.

Jan
Roger Pau Monné Oct. 16, 2023, 2:51 p.m. UTC | #5
On Mon, Oct 16, 2023 at 04:07:22PM +0200, Jan Beulich wrote:
> On 16.10.2023 15:51, Roger Pau Monné wrote:
> > On Mon, Oct 16, 2023 at 03:32:54PM +0200, Jan Beulich wrote:
> >> On 13.10.2023 10:56, Roger Pau Monne wrote:
> >>> The mapping of memory regions below the 1MB mark was all done by the PVH dom0
> >>> builder code, causing the region to be avoided by the arch specific IOMMU
> >>> hardware domain initialization code.  That lead to the IOMMU being enabled
> >>> without reserved regions in the low 1MB identity mapped in the p2m for PVH
> >>> hardware domains.  Firmware which happens to be missing RMRR/IVMD ranges
> >>> describing E820 reserved regions in the low 1MB would transiently trigger IOMMU
> >>> faults until the p2m is populated by the PVH dom0 builder:
> >>>
> >>> AMD-Vi: IO_PAGE_FAULT: 0000:00:13.1 d0 addr 00000000000eb380 flags 0x20 RW
> >>> AMD-Vi: IO_PAGE_FAULT: 0000:00:13.1 d0 addr 00000000000eb340 flags 0
> >>> AMD-Vi: IO_PAGE_FAULT: 0000:00:13.2 d0 addr 00000000000ea1c0 flags 0
> >>> AMD-Vi: IO_PAGE_FAULT: 0000:00:14.5 d0 addr 00000000000eb480 flags 0x20 RW
> >>> AMD-Vi: IO_PAGE_FAULT: 0000:00:12.0 d0 addr 00000000000eb080 flags 0x20 RW
> >>> AMD-Vi: IO_PAGE_FAULT: 0000:00:14.5 d0 addr 00000000000eb400 flags 0
> >>> AMD-Vi: IO_PAGE_FAULT: 0000:00:12.0 d0 addr 00000000000eb040 flags 0
> >>>
> >>> Those errors have been observed on the osstest pinot{0,1} boxes (AMD Fam15h
> >>> Opteron(tm) Processor 3350 HE).
> >>>
> >>> Mostly remove the special handling of the low 1MB done by the PVH dom0 builder,
> >>> leaving just the data copy between RAM regions.  Otherwise rely on the IOMMU
> >>> arch init code to create any identity mappings for reserved regions in that
> >>> range (like it already does for reserved regions elsewhere).
> >>>
> >>> Note there's a small difference in behavior, as holes in the low 1MB will no
> >>> longer be identity mapped to the p2m.
> >>
> >> I certainly like the simplification, but I'm concerned by this: The BDA
> >> is not normally reserved, yet may want accessing by Dom0 (to see the real
> >> machine contents). We do access that first page of memory ourselves, so
> >> I expect OSes may do so as well (even if the specific aspect I'm thinking
> >> of - the warm/cold reboot field - is under Xen's control).
> > 
> > The BDA on the systems I've checked falls into a RAM area on the
> > memory map, but if you think it can be problematic I could arrange for
> > arch_iommu_hwdom_init() to also identity map holes in the low 1MB.
> 
> Hmm, this again is a case where I'd wish CPU and IOMMU mappings could
> be different. I don't see reasons to try I/O to such holes, but I can
> see reasons for CPU accesses (of more or less probing kind).

Hm, while I agree devices have likely no reason to access holes (there
or elsewhere) I don't see much benefit of having this differentiation,
it's easier to just map everything for accesses from both device and
CPU rather than us having to decide (and maybe get wrong) whether
ranges should only be accessed by the CPU.

> > Keep in mind this is only for PVH, it won't affect PV.
> 
> Of course.

Would you be willing to Ack it?

Thanks, Roger.
Jan Beulich Oct. 16, 2023, 2:55 p.m. UTC | #6
On 16.10.2023 16:51, Roger Pau Monné wrote:
> On Mon, Oct 16, 2023 at 04:07:22PM +0200, Jan Beulich wrote:
>> On 16.10.2023 15:51, Roger Pau Monné wrote:
>>> On Mon, Oct 16, 2023 at 03:32:54PM +0200, Jan Beulich wrote:
>>>> On 13.10.2023 10:56, Roger Pau Monne wrote:
>>>>> The mapping of memory regions below the 1MB mark was all done by the PVH dom0
>>>>> builder code, causing the region to be avoided by the arch specific IOMMU
>>>>> hardware domain initialization code.  That lead to the IOMMU being enabled
>>>>> without reserved regions in the low 1MB identity mapped in the p2m for PVH
>>>>> hardware domains.  Firmware which happens to be missing RMRR/IVMD ranges
>>>>> describing E820 reserved regions in the low 1MB would transiently trigger IOMMU
>>>>> faults until the p2m is populated by the PVH dom0 builder:
>>>>>
>>>>> AMD-Vi: IO_PAGE_FAULT: 0000:00:13.1 d0 addr 00000000000eb380 flags 0x20 RW
>>>>> AMD-Vi: IO_PAGE_FAULT: 0000:00:13.1 d0 addr 00000000000eb340 flags 0
>>>>> AMD-Vi: IO_PAGE_FAULT: 0000:00:13.2 d0 addr 00000000000ea1c0 flags 0
>>>>> AMD-Vi: IO_PAGE_FAULT: 0000:00:14.5 d0 addr 00000000000eb480 flags 0x20 RW
>>>>> AMD-Vi: IO_PAGE_FAULT: 0000:00:12.0 d0 addr 00000000000eb080 flags 0x20 RW
>>>>> AMD-Vi: IO_PAGE_FAULT: 0000:00:14.5 d0 addr 00000000000eb400 flags 0
>>>>> AMD-Vi: IO_PAGE_FAULT: 0000:00:12.0 d0 addr 00000000000eb040 flags 0
>>>>>
>>>>> Those errors have been observed on the osstest pinot{0,1} boxes (AMD Fam15h
>>>>> Opteron(tm) Processor 3350 HE).
>>>>>
>>>>> Mostly remove the special handling of the low 1MB done by the PVH dom0 builder,
>>>>> leaving just the data copy between RAM regions.  Otherwise rely on the IOMMU
>>>>> arch init code to create any identity mappings for reserved regions in that
>>>>> range (like it already does for reserved regions elsewhere).
>>>>>
>>>>> Note there's a small difference in behavior, as holes in the low 1MB will no
>>>>> longer be identity mapped to the p2m.
>>>>
>>>> I certainly like the simplification, but I'm concerned by this: The BDA
>>>> is not normally reserved, yet may want accessing by Dom0 (to see the real
>>>> machine contents). We do access that first page of memory ourselves, so
>>>> I expect OSes may do so as well (even if the specific aspect I'm thinking
>>>> of - the warm/cold reboot field - is under Xen's control).
>>>
>>> The BDA on the systems I've checked falls into a RAM area on the
>>> memory map, but if you think it can be problematic I could arrange for
>>> arch_iommu_hwdom_init() to also identity map holes in the low 1MB.
>>
>> Hmm, this again is a case where I'd wish CPU and IOMMU mappings could
>> be different. I don't see reasons to try I/O to such holes, but I can
>> see reasons for CPU accesses (of more or less probing kind).
> 
> Hm, while I agree devices have likely no reason to access holes (there
> or elsewhere) I don't see much benefit of having this differentiation,
> it's easier to just map everything for accesses from both device and
> CPU rather than us having to decide (and maybe get wrong) whether
> ranges should only be accessed by the CPU.

I understand that, and I also follow Andrew's arguments towards not
making such a distinction. The consequence though is that we need
to map more than possibly necessary, and never too little.

>>> Keep in mind this is only for PVH, it won't affect PV.
>>
>> Of course.
> 
> Would you be willing to Ack it?

If "it" is the present version, then me doing so would be a stretch.
How averse are you to re-adding the hole mappings?

Jan
Roger Pau Monné Oct. 17, 2023, 8:27 a.m. UTC | #7
On Mon, Oct 16, 2023 at 04:55:30PM +0200, Jan Beulich wrote:
> On 16.10.2023 16:51, Roger Pau Monné wrote:
> > On Mon, Oct 16, 2023 at 04:07:22PM +0200, Jan Beulich wrote:
> >> On 16.10.2023 15:51, Roger Pau Monné wrote:
> >>> On Mon, Oct 16, 2023 at 03:32:54PM +0200, Jan Beulich wrote:
> >>>> On 13.10.2023 10:56, Roger Pau Monne wrote:
> >>>>> The mapping of memory regions below the 1MB mark was all done by the PVH dom0
> >>>>> builder code, causing the region to be avoided by the arch specific IOMMU
> >>>>> hardware domain initialization code.  That lead to the IOMMU being enabled
> >>>>> without reserved regions in the low 1MB identity mapped in the p2m for PVH
> >>>>> hardware domains.  Firmware which happens to be missing RMRR/IVMD ranges
> >>>>> describing E820 reserved regions in the low 1MB would transiently trigger IOMMU
> >>>>> faults until the p2m is populated by the PVH dom0 builder:
> >>>>>
> >>>>> AMD-Vi: IO_PAGE_FAULT: 0000:00:13.1 d0 addr 00000000000eb380 flags 0x20 RW
> >>>>> AMD-Vi: IO_PAGE_FAULT: 0000:00:13.1 d0 addr 00000000000eb340 flags 0
> >>>>> AMD-Vi: IO_PAGE_FAULT: 0000:00:13.2 d0 addr 00000000000ea1c0 flags 0
> >>>>> AMD-Vi: IO_PAGE_FAULT: 0000:00:14.5 d0 addr 00000000000eb480 flags 0x20 RW
> >>>>> AMD-Vi: IO_PAGE_FAULT: 0000:00:12.0 d0 addr 00000000000eb080 flags 0x20 RW
> >>>>> AMD-Vi: IO_PAGE_FAULT: 0000:00:14.5 d0 addr 00000000000eb400 flags 0
> >>>>> AMD-Vi: IO_PAGE_FAULT: 0000:00:12.0 d0 addr 00000000000eb040 flags 0
> >>>>>
> >>>>> Those errors have been observed on the osstest pinot{0,1} boxes (AMD Fam15h
> >>>>> Opteron(tm) Processor 3350 HE).
> >>>>>
> >>>>> Mostly remove the special handling of the low 1MB done by the PVH dom0 builder,
> >>>>> leaving just the data copy between RAM regions.  Otherwise rely on the IOMMU
> >>>>> arch init code to create any identity mappings for reserved regions in that
> >>>>> range (like it already does for reserved regions elsewhere).
> >>>>>
> >>>>> Note there's a small difference in behavior, as holes in the low 1MB will no
> >>>>> longer be identity mapped to the p2m.
> >>>>
> >>>> I certainly like the simplification, but I'm concerned by this: The BDA
> >>>> is not normally reserved, yet may want accessing by Dom0 (to see the real
> >>>> machine contents). We do access that first page of memory ourselves, so
> >>>> I expect OSes may do so as well (even if the specific aspect I'm thinking
> >>>> of - the warm/cold reboot field - is under Xen's control).
> >>>
> >>> The BDA on the systems I've checked falls into a RAM area on the
> >>> memory map, but if you think it can be problematic I could arrange for
> >>> arch_iommu_hwdom_init() to also identity map holes in the low 1MB.
> >>
> >> Hmm, this again is a case where I'd wish CPU and IOMMU mappings could
> >> be different. I don't see reasons to try I/O to such holes, but I can
> >> see reasons for CPU accesses (of more or less probing kind).
> > 
> > Hm, while I agree devices have likely no reason to access holes (there
> > or elsewhere) I don't see much benefit of having this differentiation,
> > it's easier to just map everything for accesses from both device and
> > CPU rather than us having to decide (and maybe get wrong) whether
> > ranges should only be accessed by the CPU.
> 
> I understand that, and I also follow Andrew's arguments towards not
> making such a distinction. The consequence though is that we need
> to map more than possibly necessary, and never too little.
> 
> >>> Keep in mind this is only for PVH, it won't affect PV.
> >>
> >> Of course.
> > 
> > Would you be willing to Ack it?
> 
> If "it" is the present version, then me doing so would be stretch.
> How averse are you to re-adding the hole mappings?

Given the point we are at regarding the release, I guess it's safer to
leave the mapping of the holes in the low 1MB as-is, and consider
removing it for 4.19?  That would give us a full release cycle to
check whether it causes issues on systems.

I will send the updated patch.

Thanks, Roger.
diff mbox series

Patch

diff --git a/xen/arch/x86/hvm/dom0_build.c b/xen/arch/x86/hvm/dom0_build.c
index bc0e290db612..979db7d1ec4d 100644
--- a/xen/arch/x86/hvm/dom0_build.c
+++ b/xen/arch/x86/hvm/dom0_build.c
@@ -449,28 +449,6 @@  static int __init pvh_populate_p2m(struct domain *d)
         }
     }
 
-    /* Non-RAM regions of space below 1MB get identity mapped. */
-    for ( i = rc = 0; i < MB1_PAGES; ++i )
-    {
-        p2m_type_t p2mt;
-        mfn_t mfn = get_gfn_query(d, i, &p2mt);
-
-        if ( mfn_eq(mfn, INVALID_MFN) )
-            rc = set_mmio_p2m_entry(d, _gfn(i), _mfn(i), PAGE_ORDER_4K);
-        else
-            /*
-             * If the p2m entry is already set it must belong to a RMRR and
-             * already be identity mapped, or be a RAM region.
-             */
-            ASSERT(p2mt == p2m_ram_rw || mfn_eq(mfn, _mfn(i)));
-        put_gfn(d, i);
-        if ( rc )
-        {
-            printk("Failed to identity map PFN %x: %d\n", i, rc);
-            return rc;
-        }
-    }
-
     if ( cpu_has_vmx && paging_mode_hap(d) && !vmx_unrestricted_guest(v) )
     {
         /*
diff --git a/xen/drivers/passthrough/x86/iommu.c b/xen/drivers/passthrough/x86/iommu.c
index c85549ccad6e..857dccb6a465 100644
--- a/xen/drivers/passthrough/x86/iommu.c
+++ b/xen/drivers/passthrough/x86/iommu.c
@@ -400,13 +400,7 @@  void __hwdom_init arch_iommu_hwdom_init(struct domain *d)
     max_pfn = (GB(4) >> PAGE_SHIFT) - 1;
     top = max(max_pdx, pfn_to_pdx(max_pfn) + 1);
 
-    /*
-     * First Mb will get mapped in one go by pvh_populate_p2m(). Avoid
-     * setting up potentially conflicting mappings here.
-     */
-    start = paging_mode_translate(d) ? PFN_DOWN(MB(1)) : 0;
-
-    for ( i = pfn_to_pdx(start), count = 0; i < top; )
+    for ( i = 0, start = 0, count = 0; i < top; )
     {
         unsigned long pfn = pdx_to_pfn(i);
         unsigned int perms = hwdom_iommu_map(d, pfn, max_pfn);