diff mbox series

[2/2] x86/msi: Allow writes to registers on the same page as MSI-X table

Message ID 20221114192100.1539267-2-marmarek@invisiblethingslab.com (mailing list archive)
State New, archived
Headers show
Series [1/2] x86/msi: passthrough all MSI-X vector ctrl writes to device model | expand

Commit Message

Marek Marczykowski-Górecki Nov. 14, 2022, 7:21 p.m. UTC
Some devices (notably the Intel Wifi 6 AX210 card) keep auxiliary registers
on the same page as the MSI-X table. The device model (especially one in a
stubdomain) cannot really handle those, as direct writes to that page are
refused (the page is on the mmio_ro_ranges list). Instead, add an internal
ioreq server that handles those writes.

This may also be used to read the Pending Bit Array, if it lives on the same
page, so that QEMU does not need /dev/mem access at all (especially helpful
with lockdown enabled in dom0). If the PBA lives on another page, it can be
(and will be) mapped to the guest directly.
If the PBA lives on the same page, forbid writes. Technically, writes outside
of the PBA could be allowed, but at this moment the precise location of the
PBA isn't saved.

Signed-off-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com>
---
 xen/arch/x86/hvm/vmsi.c        | 135 +++++++++++++++++++++++++++++++++
 xen/arch/x86/include/asm/msi.h |   1 +
 xen/arch/x86/msi.c             |  21 +++++
 3 files changed, 157 insertions(+)

Comments

Jan Beulich Nov. 17, 2022, 4:34 p.m. UTC | #1
On 14.11.2022 20:21, Marek Marczykowski-Górecki wrote:
> Some devices (notably Intel Wifi 6 AX210 card) keep auxiliary registers
> on the same page as MSI-X table. Device model (especially one in
> stubdomain) cannot really handle those, as direct writes to that page is
> refused (page is on mmio_ro_ranges list). Instead, add internal ioreq
> server that handle those writes.
> 
> This may be also used to read Pending Bit Array, if it lives on the same

"may" sounds as if this would be future work, yet ...

> page, making QEMU not needing /dev/mem access at all (especially helpful
> with lockdown enabled in dom0). If PBA lives on another page, it can be
> (and will be) mapped to the guest directly.
> If PBA lives on the same page, forbid writes.

... here you say you actually handle the case (because otherwise you
wouldn't need to distinguish writes from reads). It is only ...

> Technically, writes outside
> of PBA could be allowed, but at this moment the precise location of PBA
> isn't saved.

... this part which right now you don't handle. I have to admit that I'm
not convinced we should take such a partial implementation, especially
if there's nothing recorded in the log (making it harder to tell whether
something not working is because of this implementation restriction or
some other issue).

> --- a/xen/arch/x86/hvm/vmsi.c
> +++ b/xen/arch/x86/hvm/vmsi.c
> @@ -428,6 +428,133 @@ static const struct hvm_io_ops msixtbl_mmio_ops = {
>      .write = _msixtbl_write,
>  };
>  
> +static void __iomem *msixtbl_page_handler_get_hwaddr(
> +        const struct vcpu *v,
> +        uint64_t address,
> +        bool write)

These want to be indented just like ...

> +{
> +    struct domain *d = v->domain;
> +    struct pci_dev *pdev = NULL;
> +    struct msixtbl_entry *entry;
> +    void __iomem *ret = NULL;
> +    uint64_t table_end_addr;

... function scope local variables.

Also: Pointer-to-const for the first three local variables? And maybe
omit "ret", which is effectively used just once (as you could use
"return" at the point where you assign to it). Also you don't further
use v afaics, so maybe have the callers pass in const struct domain *
right away?

> +    rcu_read_lock(&msixtbl_rcu_lock);
> +    /*
> +     * Check if it's on the same page as the end of the MSI-X table, but
> +     * outside of the table itself.
> +     */
> +    list_for_each_entry( entry, &d->arch.hvm.msixtbl_list, list )
> +        if ( PFN_DOWN(address) == PFN_DOWN(entry->gtable + entry->table_len) &&
> +             address >= entry->gtable + entry->table_len )
> +        {
> +            pdev = entry->pdev;
> +            break;
> +        }
> +    rcu_read_unlock(&msixtbl_rcu_lock);
> +
> +    if ( !pdev )
> +        return NULL;
> +
> +    ASSERT( pdev->msix );

Style: ASSERT is not a (pseudo-)keyword and hence should not have
blanks immediately inside the parentheses. (More instances further
down.)

> +    table_end_addr = (pdev->msix->table.first << PAGE_SHIFT) +
> +        pdev->msix->nr_entries * PCI_MSIX_ENTRY_SIZE;
> +    ASSERT( PFN_DOWN(table_end_addr) == pdev->msix->table.last );

What are you trying to catch here? I ask because the local variable
exists just for this checking afaics.

> +    /* If PBA lives on the same page too, forbid writes. */
> +    if ( write && pdev->msix->table.last == pdev->msix->pba.first )
> +        return NULL;
> +
> +    if ( pdev->msix->last_table_page )
> +        ret = pdev->msix->last_table_page + (address & (PAGE_SIZE - 1));
> +    else
> +        gdprintk(XENLOG_WARNING,
> +                 "MSI-X last_table_page not initialized for %04x:%02x:%02x.%u\n",
> +                 pdev->seg,
> +                 pdev->bus,
> +                 PCI_SLOT(pdev->devfn),
> +                 PCI_FUNC(pdev->devfn));
> +

Please use %pp.

> +static bool cf_check msixtbl_page_accept(
> +        const struct hvm_io_handler *handler, const ioreq_t *r)
> +{
> +    unsigned long addr = r->addr;

Any particular reason for having this local variable, which is used ...

> +    ASSERT( r->type == IOREQ_TYPE_COPY );
> +
> +    return msixtbl_page_handler_get_hwaddr(
> +            current, addr, r->dir == IOREQ_WRITE);
> +}

... exactly once?

> +static int cf_check msixtbl_page_read(
> +        const struct hvm_io_handler *handler,
> +        uint64_t address, uint32_t len, uint64_t *pval)
> +{
> +    void __iomem *hwaddr = msixtbl_page_handler_get_hwaddr(
> +            current, address, false);
> +
> +    if ( !hwaddr )
> +        return X86EMUL_UNHANDLEABLE;
> +
> +    switch ( len ) {

Style: Brace on its own line please and ...

> +        case 1:
> +            *pval = readb(hwaddr);
> +            break;
> +        case 2:
> +            *pval = readw(hwaddr);
> +            break;
> +        case 4:
> +            *pval = readl(hwaddr);
> +            break;
> +        case 8:
> +            *pval = readq(hwaddr);
> +            break;
> +        default:
> +            return X86EMUL_UNHANDLEABLE;

... the body un-indented by a level.

As to operation I'm unconvinced that carrying out misaligned accesses
here is generally safe. If we find devices really needing such, we
may need to think about ways to deal with them without putting at
risk everyone else. At the very least you need to make sure you don't
access beyond the end of the page.

> --- a/xen/arch/x86/msi.c
> +++ b/xen/arch/x86/msi.c
> @@ -961,6 +961,21 @@ static int msix_capability_init(struct pci_dev *dev,
>                  domain_crash(d);
>              /* XXX How to deal with existing mappings? */
>          }
> +
> +        /*
> +         * If the MSI-X table doesn't span full page(s), map the last page for
> +         * passthrough accesses.
> +         */
> +        if ( (msix->nr_entries * PCI_MSIX_ENTRY_SIZE) & (PAGE_SIZE - 1) )
> +        {
> +            uint64_t entry_paddr = table_paddr + msix->nr_entries * PCI_MSIX_ENTRY_SIZE;
> +            int idx = msix_get_fixmap(msix, table_paddr, entry_paddr);
> +
> +            if ( idx >= 0 )
> +                msix->last_table_page = fix_to_virt(idx);
> +            else
> +                gprintk(XENLOG_ERR, "Failed to map last MSI-X table page: %d\n", idx);
> +        }

Could we avoid the extra work if there's only less than one page's
worth of entries for a device? But then again maybe not worth any
extra code, as the same mapping will be re-used anyway due to the
refcounting that's being used.

Makes me think of another aspect though: Don't we also need to
handle stuff living on the same page as the start of the table, if
that doesn't start at a page boundary?

> @@ -1090,6 +1105,12 @@ static void _pci_cleanup_msix(struct arch_msix *msix)
>              WARN();
>          msix->table.first = 0;
>          msix->table.last = 0;
> +        if ( msix->last_table_page )
> +        {
> +            msix_put_fixmap(msix,
> +                            virt_to_fix((unsigned long)msix->last_table_page));
> +            msix->last_table_page = 0;

To set a pointer please use NULL.

Overall it looks like you're dealing with the issue for HVM only.
You will want to express this in the title, perhaps by using x86/hvm:
as the prefix. But then the question of course is whether this couldn't
be dealt with in/from mmio_ro_emulated_write(), which handles both HVM
and PV. Which in turn raises the question: Do you need to handle reads
in the new code in the first place?

Jan
Marek Marczykowski-Górecki Nov. 17, 2022, 5:31 p.m. UTC | #2
On Thu, Nov 17, 2022 at 05:34:36PM +0100, Jan Beulich wrote:
> On 14.11.2022 20:21, Marek Marczykowski-Górecki wrote:
> > Some devices (notably Intel Wifi 6 AX210 card) keep auxiliary registers
> > on the same page as MSI-X table. Device model (especially one in
> > stubdomain) cannot really handle those, as direct writes to that page is
> > refused (page is on mmio_ro_ranges list). Instead, add internal ioreq
> > server that handle those writes.
> > 
> > This may be also used to read Pending Bit Array, if it lives on the same
> 
> "may" sounds as if this would be future work, yet ...

I meant that it applies only if it shares the page, but indeed it
should be "will".

> > page, making QEMU not needing /dev/mem access at all (especially helpful
> > with lockdown enabled in dom0). If PBA lives on another page, it can be
> > (and will be) mapped to the guest directly.
> > If PBA lives on the same page, forbid writes.
> 
> ... here you say you actually handle the case (because otherwise you
> wouldn't need to distinguish writes from reads). It is only ...
> 
> > Technically, writes outside
> > of PBA could be allowed, but at this moment the precise location of PBA
> > isn't saved.
> 
> ... this part which right now you don't handle. I have to admit that I'm
> not convinced we should take such a partial implementation, especially
> if there's nothing recorded in the log (making it harder to tell whether
> something not working is because of this implementation restriction or
> some other issue).

For this, I need to save the exact offset of the PBA. I can extend the patch
for that case too.

> > --- a/xen/arch/x86/hvm/vmsi.c
> > +++ b/xen/arch/x86/hvm/vmsi.c
> > @@ -428,6 +428,133 @@ static const struct hvm_io_ops msixtbl_mmio_ops = {
> >      .write = _msixtbl_write,
> >  };
> >  
> > +static void __iomem *msixtbl_page_handler_get_hwaddr(
> > +        const struct vcpu *v,
> > +        uint64_t address,
> > +        bool write)
> 
> These want to be indented just like ...
> 
> > +{
> > +    struct domain *d = v->domain;
> > +    struct pci_dev *pdev = NULL;
> > +    struct msixtbl_entry *entry;
> > +    void __iomem *ret = NULL;
> > +    uint64_t table_end_addr;
> 
> ... function scope local variables.
> 
> Also: Pointer-to-const for the first three local variables? And maybe
> omit "ret", which is effectively used just once (as you could use
> "return" at the point where you assign to it). Also you don't further
> use v afaics, so maybe have the callers pass in const struct domain *
> right away?

Makes sense indeed.

> > +    rcu_read_lock(&msixtbl_rcu_lock);
> > +    /*
> > +     * Check if it's on the same page as the end of the MSI-X table, but
> > +     * outside of the table itself.
> > +     */
> > +    list_for_each_entry( entry, &d->arch.hvm.msixtbl_list, list )
> > +        if ( PFN_DOWN(address) == PFN_DOWN(entry->gtable + entry->table_len) &&
> > +             address >= entry->gtable + entry->table_len )
> > +        {
> > +            pdev = entry->pdev;
> > +            break;
> > +        }
> > +    rcu_read_unlock(&msixtbl_rcu_lock);
> > +
> > +    if ( !pdev )
> > +        return NULL;
> > +
> > +    ASSERT( pdev->msix );
> 
> Style: ASSERT is not a (pseudo-)keyword and hence should not have
> blanks immediately inside the parentheses. (More instances further
> down.)

Ok.

> > +    table_end_addr = (pdev->msix->table.first << PAGE_SHIFT) +
> > +        pdev->msix->nr_entries * PCI_MSIX_ENTRY_SIZE;
> > +    ASSERT( PFN_DOWN(table_end_addr) == pdev->msix->table.last );
> 
> What are you trying to catch here? I ask because the local variable
> exists just for this checking afaics.

This is a double check that the pdev->msix->table.last value is correct,
because I have two ways of calculating the address. Now that I've
written this sentence, I see the way table_end_addr is calculated is
wrong, as it ignores the table not starting at a page boundary. I'll
remove both lines above.

> > +    /* If PBA lives on the same page too, forbid writes. */
> > +    if ( write && pdev->msix->table.last == pdev->msix->pba.first )
> > +        return NULL;
> > +
> > +    if ( pdev->msix->last_table_page )
> > +        ret = pdev->msix->last_table_page + (address & (PAGE_SIZE - 1));
> > +    else
> > +        gdprintk(XENLOG_WARNING,
> > +                 "MSI-X last_table_page not initialized for %04x:%02x:%02x.%u\n",
> > +                 pdev->seg,
> > +                 pdev->bus,
> > +                 PCI_SLOT(pdev->devfn),
> > +                 PCI_FUNC(pdev->devfn));
> > +
> 
> Please use %pp.

Ok.

> > +static bool cf_check msixtbl_page_accept(
> > +        const struct hvm_io_handler *handler, const ioreq_t *r)
> > +{
> > +    unsigned long addr = r->addr;
> 
> Any particular reason for having this local variable, which is used ...
> 
> > +    ASSERT( r->type == IOREQ_TYPE_COPY );
> > +
> > +    return msixtbl_page_handler_get_hwaddr(
> > +            current, addr, r->dir == IOREQ_WRITE);
> > +}
> 
> ... exactly once?

Ok, will reduce.

> > +static int cf_check msixtbl_page_read(
> > +        const struct hvm_io_handler *handler,
> > +        uint64_t address, uint32_t len, uint64_t *pval)
> > +{
> > +    void __iomem *hwaddr = msixtbl_page_handler_get_hwaddr(
> > +            current, address, false);
> > +
> > +    if ( !hwaddr )
> > +        return X86EMUL_UNHANDLEABLE;
> > +
> > +    switch ( len ) {
> 
> Style: Brace on its own line please and ...
> 
> > +        case 1:
> > +            *pval = readb(hwaddr);
> > +            break;
> > +        case 2:
> > +            *pval = readw(hwaddr);
> > +            break;
> > +        case 4:
> > +            *pval = readl(hwaddr);
> > +            break;
> > +        case 8:
> > +            *pval = readq(hwaddr);
> > +            break;
> > +        default:
> > +            return X86EMUL_UNHANDLEABLE;
> 
> ... the body un-indented by a level.

Ok.

> As to operation I'm unconvinced that carrying out misaligned accesses
> here is generally safe. If we find devices really needing such, we
> may need to think about ways to deal with them without putting at
> risk everyone else. At the very least you need to make sure you don't
> access beyond the end of the page.

Right.

> > --- a/xen/arch/x86/msi.c
> > +++ b/xen/arch/x86/msi.c
> > @@ -961,6 +961,21 @@ static int msix_capability_init(struct pci_dev *dev,
> >                  domain_crash(d);
> >              /* XXX How to deal with existing mappings? */
> >          }
> > +
> > +        /*
> > +         * If the MSI-X table doesn't span full page(s), map the last page for
> > +         * passthrough accesses.
> > +         */
> > +        if ( (msix->nr_entries * PCI_MSIX_ENTRY_SIZE) & (PAGE_SIZE - 1) )
> > +        {
> > +            uint64_t entry_paddr = table_paddr + msix->nr_entries * PCI_MSIX_ENTRY_SIZE;
> > +            int idx = msix_get_fixmap(msix, table_paddr, entry_paddr);
> > +
> > +            if ( idx >= 0 )
> > +                msix->last_table_page = fix_to_virt(idx);
> > +            else
> > +                gprintk(XENLOG_ERR, "Failed to map last MSI-X table page: %d\n", idx);
> > +        }
> 
> Could we avoid the extra work if there's only less than one page's
> worth of entries for a device? But then again maybe not worth any
> extra code, as the same mapping will be re-used anyway due to the
> refcounting that's being used.

I was considering that, but decided against exactly because of
msix_get_fixmap() reusing existing mappings.

> Makes me think of another aspect though: Don't we also need to
> handle stuff living on the same page as the start of the table, if
> that doesn't start at a page boundary?

I have considered that, but decided against it, given that every single
device I tried has the MSI-X table at the page boundary. But if you prefer,
I can add such handling too (it will require adding another variable to the
arch_msix structure - to store the fixmap location).

> > @@ -1090,6 +1105,12 @@ static void _pci_cleanup_msix(struct arch_msix *msix)
> >              WARN();
> >          msix->table.first = 0;
> >          msix->table.last = 0;
> > +        if ( msix->last_table_page )
> > +        {
> > +            msix_put_fixmap(msix,
> > +                            virt_to_fix((unsigned long)msix->last_table_page));
> > +            msix->last_table_page = 0;
> 
> To set a pointer please use NULL.

Ok.

> Overall it looks like you're dealing with the issue for HVM only.
> You will want to express this in the title, perhaps by using x86/hvm:
> as the prefix. But then the question of course is whether this couldn't
> be dealt with in/from mmio_ro_emulated_write(), which handles both HVM
> and PV. 

The issue is correlating the BAR mapping location with the guest's view.
Writable BAR areas are mapped (by qemu) via xc_domain_memory_mapping(), but
that fails for read-only pages (and indeed, qemu doesn't attempt to do
that for the pages with the MSI-X table). Lacking that, I need to use
msixtbl_entry->gtable, which is an HVM-only thing.

In fact there is another corner case I don't handle here: the guest
accessing those registers when MSI-X is disabled. In that case, there is
no related msixtbl_entry, so I can't correlate the access, but the
page(s) are still read-only, so direct mapping would fail. In practice,
such an access will trap into qemu, which will complain "Should not
read/write BAR through QEMU". I have seen this happening several times
when developing the series (due to bugs in my patches), but I haven't
found any case where it would happen with the final patch version.
In fact, I have considered handling this whole thing via qemu (as it
knows better where BARs live from the guest PoV), but a stubdomain still
doesn't have write access to those pages, so that would need to be trapped
(for the second time) by Xen anyway.

For the PV case, I think this extra translation wouldn't be necessary, as
BARs are mapped at their actual location, right? But then, that makes it a
rather different implementation (a separate feature), rather than just
having a common one for PV and HVM.

> Which in turn raises the question: Do you need to handle reads
> in the new code in the first place?

The page not being mapped is also the reason why I do need to handle
reads too.
Jan Beulich Nov. 18, 2022, 7:20 a.m. UTC | #3
On 17.11.2022 18:31, Marek Marczykowski-Górecki wrote:
> On Thu, Nov 17, 2022 at 05:34:36PM +0100, Jan Beulich wrote:
>> On 14.11.2022 20:21, Marek Marczykowski-Górecki wrote:
>>> --- a/xen/arch/x86/msi.c
>>> +++ b/xen/arch/x86/msi.c
>>> @@ -961,6 +961,21 @@ static int msix_capability_init(struct pci_dev *dev,
>>>                  domain_crash(d);
>>>              /* XXX How to deal with existing mappings? */
>>>          }
>>> +
>>> +        /*
>>> +         * If the MSI-X table doesn't span full page(s), map the last page for
>>> +         * passthrough accesses.
>>> +         */
>>> +        if ( (msix->nr_entries * PCI_MSIX_ENTRY_SIZE) & (PAGE_SIZE - 1) )
>>> +        {
>>> +            uint64_t entry_paddr = table_paddr + msix->nr_entries * PCI_MSIX_ENTRY_SIZE;
>>> +            int idx = msix_get_fixmap(msix, table_paddr, entry_paddr);
>>> +
>>> +            if ( idx >= 0 )
>>> +                msix->last_table_page = fix_to_virt(idx);
>>> +            else
>>> +                gprintk(XENLOG_ERR, "Failed to map last MSI-X table page: %d\n", idx);
>>> +        }
>>
>> Could we avoid the extra work if there's only less than one page's
>> worth of entries for a device? But then again maybe not worth any
>> extra code, as the same mapping will be re-used anyway due to the
>> refcounting that's being used.
> 
> I was considering that, but decided against exactly because of
> msix_get_fixmap() reusing existing mappings.
> 
>> Makes me think of another aspect though: Don't we also need to
>> handle stuff living on the same page as the start of the table, if
>> that doesn't start at a page boundary?
> 
> I have considered that, but decided against given every single device I
> tried have MSI-X table at the page boundary. But if you prefer, I can
> add such handling too (will require adding another variable to the
> arch_msix structure - to store the fixmap location).

To limit growth of the struct, please at least consider storing the fixmap
indexes instead of full pointers.

>>> @@ -1090,6 +1105,12 @@ static void _pci_cleanup_msix(struct arch_msix *msix)
>>>              WARN();
>>>          msix->table.first = 0;
>>>          msix->table.last = 0;
>>> +        if ( msix->last_table_page )
>>> +        {
>>> +            msix_put_fixmap(msix,
>>> +                            virt_to_fix((unsigned long)msix->last_table_page));
>>> +            msix->last_table_page = 0;
>>
>> To set a pointer please use NULL.
> 
> Ok.
> 
>> Overall it looks like you're dealing with the issue for HVM only.
>> You will want to express this in the title, perhaps by using x86/hvm:
>> as the prefix. But then the question of course is whether this couldn't
>> be dealt with in/from mmio_ro_emulated_write(), which handles both HVM
>> and PV. 
> 
> The issue is correlating BAR mapping location with guest's view.
> Writable BAR areas are mapped (by qemu) via xc_domain_memory_mapping(), but
> that fails for read-only pages (and indeed, qemu doesn't attempt to do
> that for the pages with the MSI-X table). Lacking that, I need to use
> msixtbl_entry->gtable, which is HVM-only thing.
> 
> In fact there is another corner case I don't handle here: guest
> accessing those registers when MSI-X is disabled. In that case, there is
> no related msixtbl_entry, so I can't correlate the access, but the
> page(s) is still read-only, so direct mapping would fail. In practice,
> such access will trap into qemu, which will complain "Should not
> read/write BAR through QEMU". I have seen this happening several times
> when developing the series (due to bugs in my patches), but I haven't
> found any case where it would happen with the final patch version.
> In fact, I have considered handling this whole thing via qemu (as it
> knows better where BAR live from the guest PoV), but stubdomain still
> don't have write access to that pages, so that would need to be trapped
> (for the second time) by Xen anyway.
> 
> For the PV case, I think this extra translation wouldn't be necessary as
> BAR are mapped at their actual location, right?

I think so, yes.

> But then, it makes it
> rather different implementation (separate feature), than just having a
> common one for PV and HVM.

It would be different, yes, and if - as you explain above - there are
technical reasons why it cannot be shared, then so be it. Mentioning
this in the description may be worthwhile, or else the same question
may be asked again (even by me, in case I forget part of the discussion
by the time I look at a particular future version).

>> Which in turn raises the question: Do you need to handle reads
>> in the new code in the first place?
> 
> The page not being mapped is also the reason why I do need to handle
> reads too.

Just for my own clarity: You mean "not mapped to qemu" here?

Jan
Marek Marczykowski-Górecki Nov. 18, 2022, 12:19 p.m. UTC | #4
On Fri, Nov 18, 2022 at 08:20:14AM +0100, Jan Beulich wrote:
> On 17.11.2022 18:31, Marek Marczykowski-Górecki wrote:
> > On Thu, Nov 17, 2022 at 05:34:36PM +0100, Jan Beulich wrote:
> >> On 14.11.2022 20:21, Marek Marczykowski-Górecki wrote:
> >>> --- a/xen/arch/x86/msi.c
> >>> +++ b/xen/arch/x86/msi.c
> >>> @@ -961,6 +961,21 @@ static int msix_capability_init(struct pci_dev *dev,
> >>>                  domain_crash(d);
> >>>              /* XXX How to deal with existing mappings? */
> >>>          }
> >>> +
> >>> +        /*
> >>> +         * If the MSI-X table doesn't span full page(s), map the last page for
> >>> +         * passthrough accesses.
> >>> +         */
> >>> +        if ( (msix->nr_entries * PCI_MSIX_ENTRY_SIZE) & (PAGE_SIZE - 1) )
> >>> +        {
> >>> +            uint64_t entry_paddr = table_paddr + msix->nr_entries * PCI_MSIX_ENTRY_SIZE;
> >>> +            int idx = msix_get_fixmap(msix, table_paddr, entry_paddr);
> >>> +
> >>> +            if ( idx >= 0 )
> >>> +                msix->last_table_page = fix_to_virt(idx);
> >>> +            else
> >>> +                gprintk(XENLOG_ERR, "Failed to map last MSI-X table page: %d\n", idx);
> >>> +        }
> >>
> >> Could we avoid the extra work if there's only less than one page's
> >> worth of entries for a device? But then again maybe not worth any
> >> extra code, as the same mapping will be re-used anyway due to the
> >> refcounting that's being used.
> > 
> > I was considering that, but decided against exactly because of
> > msix_get_fixmap() reusing existing mappings.
> > 
> >> Makes me think of another aspect though: Don't we also need to
> >> handle stuff living on the same page as the start of the table, if
> >> that doesn't start at a page boundary?
> > 
> > I have considered that, but decided against given every single device I
> > tried have MSI-X table at the page boundary. But if you prefer, I can
> > add such handling too (will require adding another variable to the
> > arch_msix structure - to store the fixmap location).
> 
> To limit growth of the struct, please at least consider storing the fixmap
> indexes instead of full pointers.

Ok.

> >>> @@ -1090,6 +1105,12 @@ static void _pci_cleanup_msix(struct arch_msix *msix)
> >>>              WARN();
> >>>          msix->table.first = 0;
> >>>          msix->table.last = 0;
> >>> +        if ( msix->last_table_page )
> >>> +        {
> >>> +            msix_put_fixmap(msix,
> >>> +                            virt_to_fix((unsigned long)msix->last_table_page));
> >>> +            msix->last_table_page = 0;
> >>
> >> To set a pointer please use NULL.
> > 
> > Ok.
> > 
> >> Overall it looks like you're dealing with the issue for HVM only.
> >> You will want to express this in the title, perhaps by using x86/hvm:
> >> as the prefix. But then the question of course is whether this couldn't
> >> be dealt with in/from mmio_ro_emulated_write(), which handles both HVM
> >> and PV. 
> > 
> > The issue is correlating BAR mapping location with guest's view.
> > Writable BAR areas are mapped (by qemu) via xc_domain_memory_mapping(), but
> > that fails for read-only pages (and indeed, qemu doesn't attempt to do
> > that for the pages with the MSI-X table). Lacking that, I need to use
> > msixtbl_entry->gtable, which is HVM-only thing.
> > 
> > In fact there is another corner case I don't handle here: guest
> > accessing those registers when MSI-X is disabled. In that case, there is
> > no related msixtbl_entry, so I can't correlate the access, but the
> > page(s) is still read-only, so direct mapping would fail. In practice,
> > such access will trap into qemu, which will complain "Should not
> > read/write BAR through QEMU". I have seen this happening several times
> > when developing the series (due to bugs in my patches), but I haven't
> > found any case where it would happen with the final patch version.
> > In fact, I have considered handling this whole thing via qemu (as it
> > knows better where BAR live from the guest PoV), but stubdomain still
> > don't have write access to that pages, so that would need to be trapped
> > (for the second time) by Xen anyway.
> > 
> > For the PV case, I think this extra translation wouldn't be necessary as
> > BAR are mapped at their actual location, right?
> 
> I think so, yes.
> 
> > But then, it makes it
> > rather different implementation (separate feature), than just having a
> > common one for PV and HVM.
> 
> It would be different, yes, and if - as you explain above - there are
> technical reasons why it cannot be shared, then so be it. Mentioning
> this in the description may be worthwhile, or else the same question
> may be asked again (even by me, in case I forget part of the discussion
> by the time I look at a particular future version).

Ok, I'll extend the commit message.

> >> Which in turn raises the question: Do you need to handle reads
> >> in the new code in the first place?
> > 
> > The page not being mapped is also the reason why I do need to handle
> > reads too.
> 
> Just for my own clarity: You mean "not mapped to qemu" here?

No, to the HVM domain (in the p2m). Xen (outside of the MSI-X specific code
for HVM) doesn't know where those reads should come from.
Jan Beulich Nov. 18, 2022, 12:33 p.m. UTC | #5
On 18.11.2022 13:19, Marek Marczykowski-Górecki wrote:
> On Fri, Nov 18, 2022 at 08:20:14AM +0100, Jan Beulich wrote:
>> On 17.11.2022 18:31, Marek Marczykowski-Górecki wrote:
>>> On Thu, Nov 17, 2022 at 05:34:36PM +0100, Jan Beulich wrote:
>>>> Which in turn raises the question: Do you need to handle reads
>>>> in the new code in the first place?
>>>
>>> The page not being mapped is also the reason why I do need to handle
>>> reads too.
>>
>> Just for my own clarity: You mean "not mapped to qemu" here?
> 
> No, to the HVM domain (in p2m). Xen (outside of MSI-X specific code for
> HVM) doesn't know where those reads should be from.

Hmm, I was expecting them to be mapped r/o to the guest, but perhaps I'm
misremembering. Clearly both ept_p2m_type_to_flags() and
p2m_type_to_flags() take mmio_ro_ranges into consideration, which is
what I was basing my understanding on (without having looked at other
places in detail).

Jan
Marek Marczykowski-Górecki Nov. 18, 2022, 1 p.m. UTC | #6
On Fri, Nov 18, 2022 at 01:33:39PM +0100, Jan Beulich wrote:
> On 18.11.2022 13:19, Marek Marczykowski-Górecki wrote:
> > On Fri, Nov 18, 2022 at 08:20:14AM +0100, Jan Beulich wrote:
> >> On 17.11.2022 18:31, Marek Marczykowski-Górecki wrote:
> >>> On Thu, Nov 17, 2022 at 05:34:36PM +0100, Jan Beulich wrote:
> >>>> Which in turn raises the question: Do you need to handle reads
> >>>> in the new code in the first place?
> >>>
> >>> The page not being mapped is also the reason why I do need to handle
> >>> reads too.
> >>
> >> Just for my own clarity: You mean "not mapped to qemu" here?
> > 
> > No, to the HVM domain (in p2m). Xen (outside of MSI-X specific code for
> > HVM) doesn't know where those reads should be from.
> 
> Hmm, I was expecting them to be mapped r/o to the guest, but perhaps I'm
> misremembering. Clearly both ept_p2m_type_to_flags() and
> p2m_type_to_flags() take mmio_ro_ranges into consideration, which is
> what I was basing my understanding on (without having looked at other
> places in detail).

Qemu doesn't map the page (using xc_domain_memory_mapping()) where the
MSI-X table lives. I tried to modify this part, but it resulted in an EPT
violation on a write attempt (and my emulation code wasn't called at all).
Jan Beulich Nov. 18, 2022, 1:07 p.m. UTC | #7
On 18.11.2022 14:00, Marek Marczykowski-Górecki wrote:
> On Fri, Nov 18, 2022 at 01:33:39PM +0100, Jan Beulich wrote:
>> On 18.11.2022 13:19, Marek Marczykowski-Górecki wrote:
>>> On Fri, Nov 18, 2022 at 08:20:14AM +0100, Jan Beulich wrote:
>>>> On 17.11.2022 18:31, Marek Marczykowski-Górecki wrote:
>>>>> On Thu, Nov 17, 2022 at 05:34:36PM +0100, Jan Beulich wrote:
>>>>>> Which in turn raises the question: Do you need to handle reads
>>>>>> in the new code in the first place?
>>>>>
>>>>> The page not being mapped is also the reason why I do need to handle
>>>>> reads too.
>>>>
>>>> Just for my own clarity: You mean "not mapped to qemu" here?
>>>
>>> No, to the HVM domain (in p2m). Xen (outside of MSI-X specific code for
>>> HVM) doesn't know where those reads should be from.
>>
>> Hmm, I was expecting them to be mapped r/o to the guest, but perhaps I'm
>> misremembering. Clearly both ept_p2m_type_to_flags() and
>> p2m_type_to_flags() take mmio_ro_ranges into consideration, which is
>> what I was basing my understanding on (without having looked at other
>> places in detail).
> 
> Qemu doesn't map the page (using xc_domain_memory_mapping()) where MSI-X
> table lives. I tried to modify this part, but it resulted in EPT
> violation on write attempt (and my emulation code wasn't called at all).

Well, yes - that leads to the other path I pointed you at, which is
used for both HVM and PV (and which, as per what you say, then looks
to be relevant for PVH only right now).

Jan
diff mbox series

Patch

diff --git a/xen/arch/x86/hvm/vmsi.c b/xen/arch/x86/hvm/vmsi.c
index ba4cf46dfe91..57cfcf70741e 100644
--- a/xen/arch/x86/hvm/vmsi.c
+++ b/xen/arch/x86/hvm/vmsi.c
@@ -428,6 +428,133 @@  static const struct hvm_io_ops msixtbl_mmio_ops = {
     .write = _msixtbl_write,
 };
 
+/*
+ * Translate a guest access that lands on the page holding the end of a
+ * passed-through device's MSI-X table - but past the table itself - into
+ * a pointer within Xen's fixmap mapping of that page.
+ *
+ * Returns NULL when @address is not on such a page (or falls inside the
+ * table), when @write is set but the PBA shares the page (writes there
+ * are forbidden), or when the backing mapping was never established
+ * (last_table_page unset; a warning is logged in that case).
+ */
+static void __iomem *msixtbl_page_handler_get_hwaddr(
+        const struct vcpu *v,
+        uint64_t address,
+        bool write)
+{
+    struct domain *d = v->domain;
+    struct pci_dev *pdev = NULL;
+    struct msixtbl_entry *entry;
+    void __iomem *ret = NULL;
+    uint64_t table_end_addr;
+
+    rcu_read_lock(&msixtbl_rcu_lock);
+    /*
+     * Check if it's on the same page as the end of the MSI-X table, but
+     * outside of the table itself.
+     */
+    list_for_each_entry( entry, &d->arch.hvm.msixtbl_list, list )
+        if ( PFN_DOWN(address) == PFN_DOWN(entry->gtable + entry->table_len) &&
+             address >= entry->gtable + entry->table_len )
+        {
+            pdev = entry->pdev;
+            break;
+        }
+    rcu_read_unlock(&msixtbl_rcu_lock);
+
+    /*
+     * NOTE(review): pdev is dereferenced below, after the RCU read lock
+     * has been dropped - confirm the device cannot be removed concurrently.
+     */
+    if ( !pdev )
+        return NULL;
+
+    ASSERT( pdev->msix );
+
+    /*
+     * End of the table in host physical address space (table.first is a
+     * host frame number); only used for the consistency check below.
+     */
+    table_end_addr = (pdev->msix->table.first << PAGE_SHIFT) +
+        pdev->msix->nr_entries * PCI_MSIX_ENTRY_SIZE;
+    ASSERT( PFN_DOWN(table_end_addr) == pdev->msix->table.last );
+
+    /* If PBA lives on the same page too, forbid writes. */
+    if ( write && pdev->msix->table.last == pdev->msix->pba.first )
+        return NULL;
+
+    /*
+     * last_table_page is set up by msix_capability_init() only when the
+     * table does not end on a page boundary.  Reusing the guest-address
+     * page offset against the host mapping assumes guest and host place
+     * the table at the same offset within the page - TODO confirm.
+     */
+    if ( pdev->msix->last_table_page )
+        ret = pdev->msix->last_table_page + (address & (PAGE_SIZE - 1));
+    else
+        gdprintk(XENLOG_WARNING,
+                 "MSI-X last_table_page not initialized for %04x:%02x:%02x.%u\n",
+                 pdev->seg,
+                 pdev->bus,
+                 PCI_SLOT(pdev->devfn),
+                 PCI_FUNC(pdev->devfn));
+
+    return ret;
+}
+
+/*
+ * ioreq accept hook: claim MMIO accesses that fall on the tail of the
+ * page holding the end of an MSI-X table and (for writes) are permitted
+ * by msixtbl_page_handler_get_hwaddr().
+ */
+static bool cf_check msixtbl_page_accept(
+        const struct hvm_io_handler *handler, const ioreq_t *r)
+{
+    unsigned long addr = r->addr;
+
+    ASSERT( r->type == IOREQ_TYPE_COPY );
+
+    /* A non-NULL translation means this handler takes the access. */
+    return msixtbl_page_handler_get_hwaddr(
+            current, addr, r->dir == IOREQ_WRITE);
+}
+
+/*
+ * Read handler: forward a 1/2/4/8-byte guest read to the underlying
+ * device page via its fixmap mapping; other sizes are unhandleable.
+ *
+ * NOTE(review): the access is performed at the in-page offset of
+ * @address with no alignment or page-crossing check - confirm the
+ * emulation layer never presents an access straddling the page end.
+ */
+static int cf_check msixtbl_page_read(
+        const struct hvm_io_handler *handler,
+        uint64_t address, uint32_t len, uint64_t *pval)
+{
+    void __iomem *hwaddr = msixtbl_page_handler_get_hwaddr(
+            current, address, false);
+
+    /* Not our page (or mapping unavailable): let others handle it. */
+    if ( !hwaddr )
+        return X86EMUL_UNHANDLEABLE;
+
+    /* Width-matched MMIO read, so the device sees a single access. */
+    switch ( len ) {
+        case 1:
+            *pval = readb(hwaddr);
+            break;
+        case 2:
+            *pval = readw(hwaddr);
+            break;
+        case 4:
+            *pval = readl(hwaddr);
+            break;
+        case 8:
+            *pval = readq(hwaddr);
+            break;
+        default:
+            return X86EMUL_UNHANDLEABLE;
+    }
+    return X86EMUL_OKAY;
+}
+
+/*
+ * Write handler: forward a 1/2/4/8-byte guest write to the underlying
+ * device page.  PBA-sharing pages are refused by the helper (NULL), and
+ * unsupported sizes are unhandleable.
+ *
+ * NOTE(review): like the read path, no alignment / page-crossing check
+ * is applied to @address + @len - confirm this cannot occur.
+ */
+static int cf_check msixtbl_page_write(
+        const struct hvm_io_handler *handler,
+        uint64_t address, uint32_t len, uint64_t val)
+{
+    void __iomem *hwaddr = msixtbl_page_handler_get_hwaddr(
+            current, address, true);
+
+    /* Not our page, write forbidden, or mapping unavailable. */
+    if ( !hwaddr )
+        return X86EMUL_UNHANDLEABLE;
+
+    /* Width-matched MMIO write, so the device sees a single access. */
+    switch ( len ) {
+        case 1:
+            writeb(val, hwaddr);
+            break;
+        case 2:
+            writew(val, hwaddr);
+            break;
+        case 4:
+            writel(val, hwaddr);
+            break;
+        case 8:
+            writeq(val, hwaddr);
+            break;
+        default:
+            return X86EMUL_UNHANDLEABLE;
+    }
+    return X86EMUL_OKAY;
+
+}
+
+/*
+ * Internal ioreq handler for passthrough accesses to non-MSI-X
+ * registers sharing the final MSI-X table page (registered from
+ * msixtbl_init()).
+ */
+static const struct hvm_io_ops msixtbl_mmio_page_ops = {
+    .accept = msixtbl_page_accept,
+    .read = msixtbl_page_read,
+    .write = msixtbl_page_write,
+};
+
 static void add_msixtbl_entry(struct domain *d,
                               struct pci_dev *pdev,
                               uint64_t gtable,
@@ -583,6 +710,14 @@  void msixtbl_init(struct domain *d)
         handler->type = IOREQ_TYPE_COPY;
         handler->ops = &msixtbl_mmio_ops;
     }
+
+    /* passthrough access to other registers on the same page */
+    handler = hvm_next_io_handler(d);
+    if ( handler )
+    {
+        handler->type = IOREQ_TYPE_COPY;
+        handler->ops = &msixtbl_mmio_page_ops;
+    }
 }
 
 void msixtbl_pt_cleanup(struct domain *d)
diff --git a/xen/arch/x86/include/asm/msi.h b/xen/arch/x86/include/asm/msi.h
index fe670895eed2..d4287140f04c 100644
--- a/xen/arch/x86/include/asm/msi.h
+++ b/xen/arch/x86/include/asm/msi.h
@@ -236,6 +236,7 @@  struct arch_msix {
     } table, pba;
     int table_refcnt[MAX_MSIX_TABLE_PAGES];
     int table_idx[MAX_MSIX_TABLE_PAGES];
+    void __iomem *last_table_page;
     spinlock_t table_lock;
     bool host_maskall, guest_maskall;
     domid_t warned;
diff --git a/xen/arch/x86/msi.c b/xen/arch/x86/msi.c
index d0bf63df1def..e7fe41f424d8 100644
--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -961,6 +961,21 @@  static int msix_capability_init(struct pci_dev *dev,
                 domain_crash(d);
             /* XXX How to deal with existing mappings? */
         }
+
+        /*
+         * If the MSI-X table doesn't span full page(s), map the last page for
+         * passthrough accesses.
+         */
+        if ( (msix->nr_entries * PCI_MSIX_ENTRY_SIZE) & (PAGE_SIZE - 1) )
+        {
+            uint64_t entry_paddr = table_paddr + msix->nr_entries * PCI_MSIX_ENTRY_SIZE;
+            int idx = msix_get_fixmap(msix, table_paddr, entry_paddr);
+
+            if ( idx >= 0 )
+                msix->last_table_page = fix_to_virt(idx);
+            else
+                gprintk(XENLOG_ERR, "Failed to map last MSI-X table page: %d\n", idx);
+        }
     }
     WARN_ON(msix->table.first != (table_paddr >> PAGE_SHIFT));
     ++msix->used_entries;
@@ -1090,6 +1105,12 @@  static void _pci_cleanup_msix(struct arch_msix *msix)
             WARN();
         msix->table.first = 0;
         msix->table.last = 0;
+        if ( msix->last_table_page )
+        {
+            msix_put_fixmap(msix,
+                            virt_to_fix((unsigned long)msix->last_table_page));
+            msix->last_table_page = 0;
+        }
 
         if ( rangeset_remove_range(mmio_ro_ranges, msix->pba.first,
                                    msix->pba.last) )