diff mbox

[v7,2/2] VT-d: Fix vt-d Device-TLB flush timeout issue

Message ID 1458198767-61293-3-git-send-email-quan.xu@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Quan Xu March 17, 2016, 7:12 a.m. UTC
If Device-TLB flush timed out, we would hide the target ATS
device and crash the domain owning this ATS device. If impacted
domain is hardware domain, just throw out a warning.

The hidden device should be disallowed to be further assigned
to any domain.

Signed-off-by: Quan Xu <quan.xu@intel.com>
---
 xen/drivers/passthrough/pci.c         |  6 ++--
 xen/drivers/passthrough/vtd/extern.h  |  2 ++
 xen/drivers/passthrough/vtd/qinval.c  | 65 ++++++++++++++++++++++++++++++++---
 xen/drivers/passthrough/vtd/x86/ats.c | 12 +++++++
 xen/include/xen/pci.h                 |  1 +
 5 files changed, 78 insertions(+), 8 deletions(-)

Comments

Tian, Kevin March 17, 2016, 8:17 a.m. UTC | #1
> From: Xu, Quan
> Sent: Thursday, March 17, 2016 3:13 PM
> diff --git a/xen/drivers/passthrough/vtd/qinval.c b/xen/drivers/passthrough/vtd/qinval.c
> index 37a15fb..2a5c638 100644
> --- a/xen/drivers/passthrough/vtd/qinval.c
> +++ b/xen/drivers/passthrough/vtd/qinval.c
> @@ -233,6 +233,57 @@ int qinval_device_iotlb(struct iommu *iommu,
>      return 0;
>  }
> 
> +static void dev_invalidate_iotlb_timeout(struct iommu *iommu, u16 did,
> +                                         u16 seg, u8 bus, u8 devfn)
> +{
> +    struct domain *d = NULL;
> +    struct pci_dev *pdev;
> +
> +    if ( test_bit(did, iommu->domid_bitmap) )
> +        d = rcu_lock_domain_by_id(iommu->domid_map[did]);
> +
> +    if ( d == NULL )
> +        return;
> +
> +    pcidevs_lock();
> +    for_each_pdev(d, pdev)

we need a 'safe' version here since you're deleting nodes
when walking list. for_each_pdev today is based on 
list_for_each_entry. Or if it's sure that only one pdev
can match, we can break out of the loop to do removal.

> +    {
> +        if ( ( pdev->seg == seg ) &&
> +             ( pdev->bus == bus ) &&
> +             ( pdev->devfn == devfn ) )
> +        {
> +            ASSERT ( pdev->domain );
> +            list_del(&pdev->domain_list);
> +            pdev->domain = NULL;
> +            pci_hide_existing_device(pdev);
> +            break;
> +        }
> +    }
> +
> +    pcidevs_unlock();
> +
> +    if ( !is_hardware_domain(d) )
> +        domain_crash(d);
> +
> +    rcu_unlock_domain(d);
> +}
> +
> +int dev_invalidate_iotlb_sync(struct iommu *iommu, u16 did,
> +                              u16 seg, u8 bus, u8 devfn)
> +{
> +    struct qi_ctrl *qi_ctrl = iommu_qi_ctrl(iommu);
> +    int rc = 0;
> +
> +    if ( qi_ctrl->qinval_maddr )
> +    {
> +        rc = queue_invalidate_wait(iommu, 0, 1, 1);
> +        if ( rc == -ETIMEDOUT )
> +            dev_invalidate_iotlb_timeout(iommu, did, seg, bus, devfn);
> +    }
> +
> +    return rc;
> +}
> +

Is this function a temporary one which will be removed later once we
can handle timeout for all types of flushes (at that time suppose this
logic will be reflected in invalidate_sync directly)?

>  static void queue_invalidate_iec(struct iommu *iommu, u8 granu, u8 im, u16 iidx)
>  {
>      unsigned long flags;
> @@ -342,8 +393,6 @@ static int flush_iotlb_qi(
> 
>      if ( qi_ctrl->qinval_maddr != 0 )
>      {
> -        int rc;
> -
>          /* use queued invalidation */
>          if (cap_write_drain(iommu->cap))
>              dw = 1;
> @@ -353,11 +402,17 @@ static int flush_iotlb_qi(
>          queue_invalidate_iotlb(iommu,
>                                 type >> DMA_TLB_FLUSH_GRANU_OFFSET, dr,
>                                 dw, did, size_order, 0, addr);
> +
> +        /*
> +         * Before Device-TLB invalidation we need to synchronize
> +         * invalidation completions with hardware.
> +         */
> +        ret = invalidate_sync(iommu);
> +        if ( ret )
> +             return ret;
> +
>          if ( flush_dev_iotlb )
>              ret = dev_invalidate_iotlb(iommu, did, addr, size_order, type);
> -        rc = invalidate_sync(iommu);
> -        if ( !ret )
> -            ret = rc;

Current change looks not consistent. For IOMMU iotlb flush, we have
invalidate_sync out of invalidate operation, however below...

>      }
>      return ret;
>  }
> diff --git a/xen/drivers/passthrough/vtd/x86/ats.c
> b/xen/drivers/passthrough/vtd/x86/ats.c
> index 334b9c1..c87ffe3 100644
> --- a/xen/drivers/passthrough/vtd/x86/ats.c
> +++ b/xen/drivers/passthrough/vtd/x86/ats.c
> @@ -162,6 +162,18 @@ int dev_invalidate_iotlb(struct iommu *iommu, u16 did,
>              return -EOPNOTSUPP;
>          }
> 
> +        /*
> +         * Synchronize with hardware for Device-TLB invalidate
> +         * descriptor.
> +         */
> +        rc = dev_invalidate_iotlb_sync(iommu, did, pdev->seg,
> +                                       pdev->bus, pdev->devfn);
> +        if ( rc )
> +            printk(XENLOG_ERR
> +                   "Flush error %d on device %04x:%02x:%02x.%u.\n",
> +                   ret, pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
> +                   PCI_FUNC(pdev->devfn));
> +
>          if ( !ret )

for device iotlb flush, you moved the invalidate_sync inside the
invalidate operation.

If this is only temporary as I guessed earlier, is it clearer to change
like below:

> @@ -353,11 +402,17 @@ static int flush_iotlb_qi(
          queue_invalidate_iotlb(iommu,
                                 type >> DMA_TLB_FLUSH_GRANU_OFFSET, dr,
                                 dw, did, size_order, 0, addr);
 
         /*
          * Before Device-TLB invalidation we need to synchronize
          * invalidation completions with hardware. 
          * TODO: timeout error handling to be added later
          */
         ret = invalidate_sync(iommu);
         if ( ret )
              return ret;
 
          if ( flush_dev_iotlb )
              ret = dev_invalidate_iotlb(iommu, did, addr, size_order, type);

         rc = invalidate_sync(iommu);
         if ( rc == -ETIMEDOUT )
            dev_invalidate_iotlb_timeout(iommu, did, seg, bus, devfn);
         if ( !ret )
             ret = rc;

This way later when we have invalidate_sync handling timeout error
for all types of flushes, above two lines of timeout handling can be
removed.

Thanks
Kevin
Jan Beulich March 17, 2016, 9:43 a.m. UTC | #2
>>> On 17.03.16 at 09:17, <kevin.tian@intel.com> wrote:
>>  From: Xu, Quan
>> Sent: Thursday, March 17, 2016 3:13 PM
>> --- a/xen/drivers/passthrough/vtd/qinval.c
>> +++ b/xen/drivers/passthrough/vtd/qinval.c
>> @@ -233,6 +233,57 @@ int qinval_device_iotlb(struct iommu *iommu,
>>      return 0;
>>  }
>> 
>> +static void dev_invalidate_iotlb_timeout(struct iommu *iommu, u16 did,
>> +                                         u16 seg, u8 bus, u8 devfn)
>> +{
>> +    struct domain *d = NULL;
>> +    struct pci_dev *pdev;
>> +
>> +    if ( test_bit(did, iommu->domid_bitmap) )
>> +        d = rcu_lock_domain_by_id(iommu->domid_map[did]);
>> +
>> +    if ( d == NULL )
>> +        return;
>> +
>> +    pcidevs_lock();
>> +    for_each_pdev(d, pdev)
> 
> we need a 'safe' version here since you're deleting nodes
> when walking list. for_each_pdev today is based on 
> list_for_each_entry. Or if it's sure that only one pdev
> can match, we can break out of the loop to do removal.

But breaking out of the loop is what is already being done ...

>> +    {
>> +        if ( ( pdev->seg == seg ) &&
>> +             ( pdev->bus == bus ) &&
>> +             ( pdev->devfn == devfn ) )
>> +        {
>> +            ASSERT ( pdev->domain );
>> +            list_del(&pdev->domain_list);
>> +            pdev->domain = NULL;
>> +            pci_hide_existing_device(pdev);
>> +            break;

... here.

>> +        }
>> +    }
>> +
>> +    pcidevs_unlock();

No need for using "safe" list traversal afaict.

Jan
Tian, Kevin March 17, 2016, 11:13 a.m. UTC | #3
> From: Jan Beulich [mailto:JBeulich@suse.com]
> Sent: Thursday, March 17, 2016 5:43 PM
> 
> >>> On 17.03.16 at 09:17, <kevin.tian@intel.com> wrote:
> >>  From: Xu, Quan
> >> Sent: Thursday, March 17, 2016 3:13 PM
> >> --- a/xen/drivers/passthrough/vtd/qinval.c
> >> +++ b/xen/drivers/passthrough/vtd/qinval.c
> >> @@ -233,6 +233,57 @@ int qinval_device_iotlb(struct iommu *iommu,
> >>      return 0;
> >>  }
> >>
> >> +static void dev_invalidate_iotlb_timeout(struct iommu *iommu, u16 did,
> >> +                                         u16 seg, u8 bus, u8 devfn)
> >> +{
> >> +    struct domain *d = NULL;
> >> +    struct pci_dev *pdev;
> >> +
> >> +    if ( test_bit(did, iommu->domid_bitmap) )
> >> +        d = rcu_lock_domain_by_id(iommu->domid_map[did]);
> >> +
> >> +    if ( d == NULL )
> >> +        return;
> >> +
> >> +    pcidevs_lock();
> >> +    for_each_pdev(d, pdev)
> >
> > we need a 'safe' version here since you're deleting nodes
> > when walking list. for_each_pdev today is based on
> > list_for_each_entry. Or if it's sure that only one pdev
> > can match, we can break out of the loop to do removal.
> 
> But breaking out of the loop is what is already being done ...
> 
> >> +    {
> >> +        if ( ( pdev->seg == seg ) &&
> >> +             ( pdev->bus == bus ) &&
> >> +             ( pdev->devfn == devfn ) )
> >> +        {
> >> +            ASSERT ( pdev->domain );
> >> +            list_del(&pdev->domain_list);
> >> +            pdev->domain = NULL;
> >> +            pci_hide_existing_device(pdev);
> >> +            break;
> 
> ... here.
> 

however you see list_del happens before breaking out, right?

Thanks
Kevin
Quan Xu March 17, 2016, 11:30 a.m. UTC | #4
On March 17, 2016 7:14pm, Tian, Kevin <kevin.tian@intel.com> wrote:
> > From: Jan Beulich [mailto:JBeulich@suse.com]
> > Sent: Thursday, March 17, 2016 5:43 PM
> >
> > >>> On 17.03.16 at 09:17, <kevin.tian@intel.com> wrote:
> > >>  From: Xu, Quan
> > >> Sent: Thursday, March 17, 2016 3:13 PM
> > >> --- a/xen/drivers/passthrough/vtd/qinval.c
> > >> +++ b/xen/drivers/passthrough/vtd/qinval.c
> > >> @@ -233,6 +233,57 @@ int qinval_device_iotlb(struct iommu *iommu,
> > >>      return 0;
> > >>  }
> > >>
> > >> +static void dev_invalidate_iotlb_timeout(struct iommu *iommu, u16 did,
> > >> +                                         u16 seg, u8 bus, u8
> > >> +devfn) {
> > >> +    struct domain *d = NULL;
> > >> +    struct pci_dev *pdev;
> > >> +
> > >> +    if ( test_bit(did, iommu->domid_bitmap) )
> > >> +        d = rcu_lock_domain_by_id(iommu->domid_map[did]);
> > >> +
> > >> +    if ( d == NULL )
> > >> +        return;
> > >> +
> > >> +    pcidevs_lock();
> > >> +    for_each_pdev(d, pdev)
> > >
> > > we need a 'safe' version here since you're deleting nodes when
> > > walking list. for_each_pdev today is based on list_for_each_entry.
> > > Or if it's sure that only one pdev can match, we can break out of
> > > the loop to do removal.
> >
> > But breaking out of the loop is what is already being done ...
> >
> > >> +    {
> > >> +        if ( ( pdev->seg == seg ) &&
> > >> +             ( pdev->bus == bus ) &&
> > >> +             ( pdev->devfn == devfn ) )
> > >> +        {
> > >> +            ASSERT ( pdev->domain );
> > >> +            list_del(&pdev->domain_list);
> > >> +            pdev->domain = NULL;
> > >> +            pci_hide_existing_device(pdev);
> > >> +            break;
> >
> > ... here.
> >
> 
> however you see list_del happens before breaking out, right?
> 
For these version of macros:
  a). list_for_each_entry - iterate over list of given type
  b). list_for_each_entry_safe - iterate over list of given type safe against removal of list entry

__iiuc__, I think the key point is:
  -in general, we'd better remove entry under list_for_each_entry_safe. Yes, it is correct to use list_for_each_entry_safe for this point too.
  - If we delete the iterate from list_for_each_entry, it may point to undefined state, leading to xen panic.
   but the pdev->domain_list is not a point ( it is a "struct list_head domain_list" variable), so it is safe to
   delete it under list_for_each_entry in this case. 

   Jan, is it similar to yours?

Quan
Tian, Kevin March 17, 2016, 11:32 a.m. UTC | #5
> From: Tian, Kevin

> Sent: Thursday, March 17, 2016 7:14 PM

> 

> > From: Jan Beulich [mailto:JBeulich@suse.com]

> > Sent: Thursday, March 17, 2016 5:43 PM

> >

> > >>> On 17.03.16 at 09:17, <kevin.tian@intel.com> wrote:

> > >>  From: Xu, Quan

> > >> Sent: Thursday, March 17, 2016 3:13 PM

> > >> --- a/xen/drivers/passthrough/vtd/qinval.c

> > >> +++ b/xen/drivers/passthrough/vtd/qinval.c

> > >> @@ -233,6 +233,57 @@ int qinval_device_iotlb(struct iommu *iommu,

> > >>      return 0;

> > >>  }

> > >>

> > >> +static void dev_invalidate_iotlb_timeout(struct iommu *iommu, u16 did,

> > >> +                                         u16 seg, u8 bus, u8 devfn)

> > >> +{

> > >> +    struct domain *d = NULL;

> > >> +    struct pci_dev *pdev;

> > >> +

> > >> +    if ( test_bit(did, iommu->domid_bitmap) )

> > >> +        d = rcu_lock_domain_by_id(iommu->domid_map[did]);

> > >> +

> > >> +    if ( d == NULL )

> > >> +        return;

> > >> +

> > >> +    pcidevs_lock();

> > >> +    for_each_pdev(d, pdev)

> > >

> > > we need a 'safe' version here since you're deleting nodes

> > > when walking list. for_each_pdev today is based on

> > > list_for_each_entry. Or if it's sure that only one pdev

> > > can match, we can break out of the loop to do removal.

> >

> > But breaking out of the loop is what is already being done ...

> >

> > >> +    {

> > >> +        if ( ( pdev->seg == seg ) &&

> > >> +             ( pdev->bus == bus ) &&

> > >> +             ( pdev->devfn == devfn ) )

> > >> +        {

> > >> +            ASSERT ( pdev->domain );

> > >> +            list_del(&pdev->domain_list);

> > >> +            pdev->domain = NULL;

> > >> +            pci_hide_existing_device(pdev);

> > >> +            break;

> >

> > ... here.

> >

> 

> however you see list_del happens before breaking out, right?

> 


Sorry you're right and please forget my earlier reply. :-)
Jan Beulich March 17, 2016, 11:33 a.m. UTC | #6
>>> On 17.03.16 at 12:13, <kevin.tian@intel.com> wrote:
>>  From: Jan Beulich [mailto:JBeulich@suse.com]
>> Sent: Thursday, March 17, 2016 5:43 PM
>> 
>> >>> On 17.03.16 at 09:17, <kevin.tian@intel.com> wrote:
>> >>  From: Xu, Quan
>> >> Sent: Thursday, March 17, 2016 3:13 PM
>> >> --- a/xen/drivers/passthrough/vtd/qinval.c
>> >> +++ b/xen/drivers/passthrough/vtd/qinval.c
>> >> @@ -233,6 +233,57 @@ int qinval_device_iotlb(struct iommu *iommu,
>> >>      return 0;
>> >>  }
>> >>
>> >> +static void dev_invalidate_iotlb_timeout(struct iommu *iommu, u16 did,
>> >> +                                         u16 seg, u8 bus, u8 devfn)
>> >> +{
>> >> +    struct domain *d = NULL;
>> >> +    struct pci_dev *pdev;
>> >> +
>> >> +    if ( test_bit(did, iommu->domid_bitmap) )
>> >> +        d = rcu_lock_domain_by_id(iommu->domid_map[did]);
>> >> +
>> >> +    if ( d == NULL )
>> >> +        return;
>> >> +
>> >> +    pcidevs_lock();
>> >> +    for_each_pdev(d, pdev)
>> >
>> > we need a 'safe' version here since you're deleting nodes
>> > when walking list. for_each_pdev today is based on
>> > list_for_each_entry. Or if it's sure that only one pdev
>> > can match, we can break out of the loop to do removal.
>> 
>> But breaking out of the loop is what is already being done ...
>> 
>> >> +    {
>> >> +        if ( ( pdev->seg == seg ) &&
>> >> +             ( pdev->bus == bus ) &&
>> >> +             ( pdev->devfn == devfn ) )
>> >> +        {
>> >> +            ASSERT ( pdev->domain );
>> >> +            list_del(&pdev->domain_list);
>> >> +            pdev->domain = NULL;
>> >> +            pci_hide_existing_device(pdev);
>> >> +            break;
>> 
>> ... here.
>> 
> 
> however you see list_del happens before breaking out, right?

I'm sorry Kevin, but ???

Jan
Jan Beulich March 18, 2016, 11:18 a.m. UTC | #7
>>> On 17.03.16 at 08:12, <quan.xu@intel.com> wrote:
> --- a/xen/drivers/passthrough/vtd/qinval.c
> +++ b/xen/drivers/passthrough/vtd/qinval.c
> @@ -233,6 +233,57 @@ int qinval_device_iotlb(struct iommu *iommu,
>      return 0;
>  }
>  
> +static void dev_invalidate_iotlb_timeout(struct iommu *iommu, u16 did,
> +                                         u16 seg, u8 bus, u8 devfn)
> +{
> +    struct domain *d = NULL;
> +    struct pci_dev *pdev;
> +
> +    if ( test_bit(did, iommu->domid_bitmap) )
> +        d = rcu_lock_domain_by_id(iommu->domid_map[did]);
> +
> +    if ( d == NULL )
> +        return;
> +
> +    pcidevs_lock();
> +    for_each_pdev(d, pdev)

Blank line please between these two, for symmetry with ...

> +    {
> +        if ( ( pdev->seg == seg ) &&
> +             ( pdev->bus == bus ) &&
> +             ( pdev->devfn == devfn ) )
> +        {
> +            ASSERT ( pdev->domain );
> +            list_del(&pdev->domain_list);
> +            pdev->domain = NULL;
> +            pci_hide_existing_device(pdev);
> +            break;
> +        }
> +    }
> +
> +    pcidevs_unlock();

... this.

I assume you're aware that there's pending feedback from Kevin
which you didn't reply to so far. Of course there's no need for a
reply if you mean to address/incorporate in v8 everything he said.

Jan
Quan Xu March 18, 2016, 11:31 a.m. UTC | #8
On March 18, 2016 7:19pm, <JBeulich@suse.com> wrote:
> >>> On 17.03.16 at 08:12, <quan.xu@intel.com> wrote:
> > --- a/xen/drivers/passthrough/vtd/qinval.c
> > +++ b/xen/drivers/passthrough/vtd/qinval.c
> > @@ -233,6 +233,57 @@ int qinval_device_iotlb(struct iommu *iommu,
> >      return 0;
> >  }
> >
> > +static void dev_invalidate_iotlb_timeout(struct iommu *iommu, u16 did,
> > +                                         u16 seg, u8 bus, u8 devfn)
> {
> > +    struct domain *d = NULL;
> > +    struct pci_dev *pdev;
> > +
> > +    if ( test_bit(did, iommu->domid_bitmap) )
> > +        d = rcu_lock_domain_by_id(iommu->domid_map[did]);
> > +
> > +    if ( d == NULL )
> > +        return;
> > +
> > +    pcidevs_lock();
> > +    for_each_pdev(d, pdev)
> 
> Blank line please between these two, for symmetry with ...
> 
for_each_pdev( d, pdev )  ??

> > +    {
> > +        if ( ( pdev->seg == seg ) &&
> > +             ( pdev->bus == bus ) &&
> > +             ( pdev->devfn == devfn ) )
> > +        {
> > +            ASSERT ( pdev->domain );
> > +            list_del(&pdev->domain_list);
> > +            pdev->domain = NULL;
> > +            pci_hide_existing_device(pdev);
> > +            break;
> > +        }
> > +    }
> > +
> > +    pcidevs_unlock();
> 
> ... this.
> 
> I assume you're aware that there's pending feedback from Kevin which you
> didn't reply to so far.

Yes,

> Of course there's no need for a reply if you mean to
> address/incorporate in v8 everything he said.
> 
I will reply and explain much more.
It is dinner time, and I will be back soon.

Quan
Quan Xu March 18, 2016, 12:21 p.m. UTC | #9
On March 17, 2016 4:17pm, Tian, Kevin <kevin.tian@intel.com> wrote:
> > From: Xu, Quan
> > Sent: Thursday, March 17, 2016 3:13 PM diff --git
> > a/xen/drivers/passthrough/vtd/qinval.c
> > b/xen/drivers/passthrough/vtd/qinval.c
> > index 37a15fb..2a5c638 100644
> > --- a/xen/drivers/passthrough/vtd/qinval.c
> > +++ b/xen/drivers/passthrough/vtd/qinval.c
> > +    {
> > +        if ( ( pdev->seg == seg ) &&
> > +             ( pdev->bus == bus ) &&
> > +             ( pdev->devfn == devfn ) )
> > +        {
> > +            ASSERT ( pdev->domain );
> > +            list_del(&pdev->domain_list);
> > +            pdev->domain = NULL;
> > +            pci_hide_existing_device(pdev);
> > +            break;
> > +        }
> > +    }
> > +
> > +    pcidevs_unlock();
> > +
> > +    if ( !is_hardware_domain(d) )
> > +        domain_crash(d);
> > +
> > +    rcu_unlock_domain(d);
> > +}
> > +
> > +int dev_invalidate_iotlb_sync(struct iommu *iommu, u16 did,
> > +                              u16 seg, u8 bus, u8 devfn) {
> > +    struct qi_ctrl *qi_ctrl = iommu_qi_ctrl(iommu);
> > +    int rc = 0;
> > +
> > +    if ( qi_ctrl->qinval_maddr )
> > +    {
> > +        rc = queue_invalidate_wait(iommu, 0, 1, 1);
> > +        if ( rc == -ETIMEDOUT )
> > +            dev_invalidate_iotlb_timeout(iommu, did, seg, bus, devfn);
> > +    }
> > +
> > +    return rc;
> > +}
> > +
> 
> Is this function a temporary one which will be removed later once we can
> handle timeout for all types of flushes (at that time suppose this logic will be
> reflected in invalidate_sync directly)?
> 
No, it's not a temporary one.
dev_invalidate_iotlb_sync -- for Device-TLB invalidation sync, as we need SBDF to indicate which device flush timed out.
invalidate_sync -- for VT-d iotlb/iec/context invalidation sync.


> >  static void queue_invalidate_iec(struct iommu *iommu, u8 granu, u8
> > im, u16 iidx)  {
> >      unsigned long flags;
> > @@ -342,8 +393,6 @@ static int flush_iotlb_qi(
> >
> >      if ( qi_ctrl->qinval_maddr != 0 )
> >      {
> > -        int rc;
> > -
> >          /* use queued invalidation */
> >          if (cap_write_drain(iommu->cap))
> >              dw = 1;
> > @@ -353,11 +402,17 @@ static int flush_iotlb_qi(
> >          queue_invalidate_iotlb(iommu,
> >                                 type >>
> DMA_TLB_FLUSH_GRANU_OFFSET, dr,
> >                                 dw, did, size_order, 0, addr);
> > +
> > +        /*
> > +         * Before Device-TLB invalidation we need to synchronize
> > +         * invalidation completions with hardware.
> > +         */
> > +        ret = invalidate_sync(iommu);
> > +        if ( ret )
> > +             return ret;
> > +
> >          if ( flush_dev_iotlb )
> >              ret = dev_invalidate_iotlb(iommu, did, addr, size_order, type);
> > -        rc = invalidate_sync(iommu);
> > -        if ( !ret )
> > -            ret = rc;
> 
> Current change looks not consistent. For IOMMU iotlb flush, we have
> invalidate_sync out of invalidate operation, however below...
> 

Now, does it still look not consistent?

> >      }
> >      return ret;
> >  }
> > diff --git a/xen/drivers/passthrough/vtd/x86/ats.c
> > b/xen/drivers/passthrough/vtd/x86/ats.c
> > index 334b9c1..c87ffe3 100644
> > --- a/xen/drivers/passthrough/vtd/x86/ats.c
> > +++ b/xen/drivers/passthrough/vtd/x86/ats.c
> > @@ -162,6 +162,18 @@ int dev_invalidate_iotlb(struct iommu *iommu, u16
> did,
> >              return -EOPNOTSUPP;
> >          }
> >
> > +        /*
> > +         * Synchronize with hardware for Device-TLB invalidate
> > +         * descriptor.
> > +         */
> > +        rc = dev_invalidate_iotlb_sync(iommu, did, pdev->seg,
> > +                                       pdev->bus, pdev->devfn);
> > +        if ( rc )
> > +            printk(XENLOG_ERR
> > +                   "Flush error %d on device %04x:%02x:%02x.%u.\n",
> > +                   ret, pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
> > +                   PCI_FUNC(pdev->devfn));
> > +
> >          if ( !ret )
> 
> for device iotlb flush, you moved the invalidate_sync inside the invalidate
> operation.
> 
> If this is only temporary as I guessed earlier, is it clearer to change like below:
> 
> > @@ -353,11 +402,17 @@ static int flush_iotlb_qi(
>           queue_invalidate_iotlb(iommu,
>                                  type >>
> DMA_TLB_FLUSH_GRANU_OFFSET, dr,
>                                  dw, did, size_order, 0, addr);
> 
>          /*
>           * Before Device-TLB invalidation we need to synchronize
>           * invalidation completions with hardware.
>           * TODO: timeout error handling to be added later
>           */

I'd like this TODO, and I would add it in v8.

Quan

>          ret = invalidate_sync(iommu);
>          if ( ret )
>               return ret;
> 
>           if ( flush_dev_iotlb )
>               ret = dev_invalidate_iotlb(iommu, did, addr, size_order, type);
> 
>          rc = invalidate_sync(iommu);
>          if ( rc == -ETIMEDOUT )
>             dev_invalidate_iotlb_timeout(iommu, did, seg, bus, devfn);
>          if ( !ret )
>              ret = rc;
> 
> This way later when we have invalidate_sync handling timeout error for all
> types of flushes, above two lines of timeout handling can be removed.
> 
> Thanks
> Kevin
Jan Beulich March 18, 2016, 1:35 p.m. UTC | #10
>>> On 18.03.16 at 12:31, <quan.xu@intel.com> wrote:
> On March 18, 2016 7:19pm, <JBeulich@suse.com> wrote:
>> >>> On 17.03.16 at 08:12, <quan.xu@intel.com> wrote:
>> > --- a/xen/drivers/passthrough/vtd/qinval.c
>> > +++ b/xen/drivers/passthrough/vtd/qinval.c
>> > @@ -233,6 +233,57 @@ int qinval_device_iotlb(struct iommu *iommu,
>> >      return 0;
>> >  }
>> >
>> > +static void dev_invalidate_iotlb_timeout(struct iommu *iommu, u16 did,
>> > +                                         u16 seg, u8 bus, u8 devfn)
>> {
>> > +    struct domain *d = NULL;
>> > +    struct pci_dev *pdev;
>> > +
>> > +    if ( test_bit(did, iommu->domid_bitmap) )
>> > +        d = rcu_lock_domain_by_id(iommu->domid_map[did]);
>> > +
>> > +    if ( d == NULL )
>> > +        return;
>> > +
>> > +    pcidevs_lock();
>> > +    for_each_pdev(d, pdev)
>> 
>> Blank line please between these two, for symmetry with ...
>> 
> for_each_pdev( d, pdev )  ??

That's blanks you added, not a blank line.

Jan
Tian, Kevin March 21, 2016, 3:27 a.m. UTC | #11
> From: Xu, Quan
> Sent: Friday, March 18, 2016 8:22 PM
> 
> > > +int dev_invalidate_iotlb_sync(struct iommu *iommu, u16 did,
> > > +                              u16 seg, u8 bus, u8 devfn) {
> > > +    struct qi_ctrl *qi_ctrl = iommu_qi_ctrl(iommu);
> > > +    int rc = 0;
> > > +
> > > +    if ( qi_ctrl->qinval_maddr )
> > > +    {
> > > +        rc = queue_invalidate_wait(iommu, 0, 1, 1);
> > > +        if ( rc == -ETIMEDOUT )
> > > +            dev_invalidate_iotlb_timeout(iommu, did, seg, bus, devfn);
> > > +    }
> > > +
> > > +    return rc;
> > > +}
> > > +
> >
> > Is this function a temporary one which will be removed later once we can
> > handle timeout for all types of flushes (at that time suppose this logic will be
> > reflected in invalidate_sync directly)?
> >
> No, it's not a temporary one.
> dev_invalidate_iotlb_sync -- for Device-TLB invalidation sync, as we need SBDF to indicate
> which device flush timed out.
> invalidate_sync -- for VT-d iotlb/iec/context invalidation sync.

Thanks. I recalled it. Once you defined some INVALID seg/bus/devfn to
reuse same interface, and then the suggestion is to go with different
interfaces.:-)

> 
> 
> > >  static void queue_invalidate_iec(struct iommu *iommu, u8 granu, u8
> > > im, u16 iidx)  {
> > >      unsigned long flags;
> > > @@ -342,8 +393,6 @@ static int flush_iotlb_qi(
> > >
> > >      if ( qi_ctrl->qinval_maddr != 0 )
> > >      {
> > > -        int rc;
> > > -
> > >          /* use queued invalidation */
> > >          if (cap_write_drain(iommu->cap))
> > >              dw = 1;
> > > @@ -353,11 +402,17 @@ static int flush_iotlb_qi(
> > >          queue_invalidate_iotlb(iommu,
> > >                                 type >>
> > DMA_TLB_FLUSH_GRANU_OFFSET, dr,
> > >                                 dw, did, size_order, 0, addr);
> > > +
> > > +        /*
> > > +         * Before Device-TLB invalidation we need to synchronize
> > > +         * invalidation completions with hardware.
> > > +         */
> > > +        ret = invalidate_sync(iommu);
> > > +        if ( ret )
> > > +             return ret;
> > > +
> > >          if ( flush_dev_iotlb )
> > >              ret = dev_invalidate_iotlb(iommu, did, addr, size_order, type);
> > > -        rc = invalidate_sync(iommu);
> > > -        if ( !ret )
> > > -            ret = rc;
> >
> > Current change looks not consistent. For IOMMU iotlb flush, we have
> > invalidate_sync out of invalidate operation, however below...
> >
> 
> Now, does it still look not consistent?
> 

Yes, still inconsistent. As I said, you put invalidation sync within 
dev_invalidate_iotlb, while for all other IOMMU invalidations the
sync is put after. Below would be consistent then:

        if ( flush_dev_iotlb )
            ret = dev_invalidate_iotlb(iommu, did, addr, size_order, type);
        rc = dev_invalidate_iotlb_sync(...);
        if ( !ret )
            ret = rc;

Thanks
Kevin
Quan Xu March 23, 2016, 2:12 a.m. UTC | #12
(( __ sorry, I was out of office on Mon./Tues. __))

On March 21, 2016 11:27am, Tian, Kevin <kevin.tian@intel.com> wrote:
> > From: Xu, Quan
> > Sent: Friday, March 18, 2016 8:22 PM
> > > >  static void queue_invalidate_iec(struct iommu *iommu, u8 granu,
> > > > u8 im, u16 iidx)  {
> > > >      unsigned long flags;
> > > > @@ -342,8 +393,6 @@ static int flush_iotlb_qi(
> > > >
> > > >      if ( qi_ctrl->qinval_maddr != 0 )
> > > >      {
> > > > -        int rc;
> > > > -
> > > >          /* use queued invalidation */
> > > >          if (cap_write_drain(iommu->cap))
> > > >              dw = 1;
> > > > @@ -353,11 +402,17 @@ static int flush_iotlb_qi(
> > > >          queue_invalidate_iotlb(iommu,
> > > >                                 type >>
> > > DMA_TLB_FLUSH_GRANU_OFFSET, dr,
> > > >                                 dw, did, size_order, 0, addr);
> > > > +
> > > > +        /*
> > > > +         * Before Device-TLB invalidation we need to synchronize
> > > > +         * invalidation completions with hardware.
> > > > +         */
> > > > +        ret = invalidate_sync(iommu);
> > > > +        if ( ret )
> > > > +             return ret;
> > > > +
> > > >          if ( flush_dev_iotlb )
> > > >              ret = dev_invalidate_iotlb(iommu, did, addr, size_order,
> type);
> > > > -        rc = invalidate_sync(iommu);
> > > > -        if ( !ret )
> > > > -            ret = rc;
> > >
> > > Current change looks not consistent. For IOMMU iotlb flush, we have
> > > invalidate_sync out of invalidate operation, however below...
> > >
> >
> > Now, does it still look not consistent?
> >
> 
> Yes, still inconsistent. As I said, you put invalidation sync within
> dev_invalidate_iotlb, while for all other IOMMU invalidations the sync is put
> after. Below would be consistent then:
> 
>         if ( flush_dev_iotlb )
>             ret = dev_invalidate_iotlb(iommu, did, addr, size_order, type);
>         rc = dev_invalidate_iotlb_sync(...);
>         if ( !ret )
>             ret = rc;
> 

Make sense, I will fix it in next v8.
Now this patch set looks clear, I'd better summarize and send out v8. Of course, I would continue to
explain and fix the prereq patch set.


Quan
Quan Xu March 23, 2016, 3:29 a.m. UTC | #13
On March 21, 2016 11:27am, Tian, Kevin <kevin.tian@intel.com> wrote:
> > From: Xu, Quan
> > Sent: Friday, March 18, 2016 8:22 PM

> > > >  static void queue_invalidate_iec(struct iommu *iommu, u8 granu,
> > > > u8 im, u16 iidx)  {
> > > >      unsigned long flags;
> > > > @@ -342,8 +393,6 @@ static int flush_iotlb_qi(
> > > >
> > > >      if ( qi_ctrl->qinval_maddr != 0 )
> > > >      {
> > > > -        int rc;
> > > > -
> > > >          /* use queued invalidation */
> > > >          if (cap_write_drain(iommu->cap))
> > > >              dw = 1;
> > > > @@ -353,11 +402,17 @@ static int flush_iotlb_qi(
> > > >          queue_invalidate_iotlb(iommu,
> > > >                                 type >>
> > > DMA_TLB_FLUSH_GRANU_OFFSET, dr,
> > > >                                 dw, did, size_order, 0, addr);
> > > > +
> > > > +        /*
> > > > +         * Before Device-TLB invalidation we need to synchronize
> > > > +         * invalidation completions with hardware.
> > > > +         */
> > > > +        ret = invalidate_sync(iommu);
> > > > +        if ( ret )
> > > > +             return ret;
> > > > +
> > > >          if ( flush_dev_iotlb )
> > > >              ret = dev_invalidate_iotlb(iommu, did, addr, size_order,
> type);
> > > > -        rc = invalidate_sync(iommu);
> > > > -        if ( !ret )
> > > > -            ret = rc;
> > >
> > > Current change looks not consistent. For IOMMU iotlb flush, we have
> > > invalidate_sync out of invalidate operation, however below...
> > >
> >
> > Now, does it still look not consistent?
> >
> 
> Yes, still inconsistent. As I said, you put invalidation sync within
> dev_invalidate_iotlb, while for all other IOMMU invalidations the sync is put
> after. Below would be consistent then:
> 
>         if ( flush_dev_iotlb )
>             ret = dev_invalidate_iotlb(iommu, did, addr, size_order, type);
>         rc = dev_invalidate_iotlb_sync(...);
>         if ( !ret )
>             ret = rc;
> 
  Kevin,
  now I doubt that I should put invalidation sync within dev_invalidate_iotlb, which was also your suggestion.
As the dev_invalidate_iotlb() is invalidation for all of domain's ATS devices. If in this consistent way, we couldn't
Find which ATS device flush timed out, then we need to hide all of domain's ATS devices.
Do you recall it?
  Also I think it is reluctant to put invalidate_sync within queue_invalidate_iotlb() for consistent issue.
Quan
Tian, Kevin March 23, 2016, 5:36 a.m. UTC | #14
> From: Xu, Quan
> Sent: Wednesday, March 23, 2016 11:30 AM
> 
> >
> > Yes, still inconsistent. As I said, you put invalidation sync within
> > dev_invalidate_iotlb, while for all other IOMMU invalidations the sync is put
> > after. Below would be consistent then:
> >
> >         if ( flush_dev_iotlb )
> >             ret = dev_invalidate_iotlb(iommu, did, addr, size_order, type);
> >         rc = dev_invalidate_iotlb_sync(...);
> >         if ( !ret )
> >             ret = rc;
> >
>   Kevin,
>   now I doubt that I should put invalidation sync within dev_invalidate_iotlb, which was also
> your suggestion.
> As the dev_invalidate_iotlb() is invalidation for all of domain's ATS devices. If in this
> consistent way, we couldn't
> Find which ATS device flush timed out, then we need to hide all of domain's ATS devices.
> Do you recall it?
>   Also I think it is reluctant to put invalidate_sync within queue_invalidate_iotlb() for
> consistent issue.
> Quan

Yes I recall this story.

What about doing this? Let's wrap a _sync version for all flush interfaces, like below:

static int queue_invalidate_context_sync(...)
{
	queue_invalidate_context(...);
	return invalidate_sync(...);
}

Then invoke _sync version at all callers, e.g.:
static int flush_context_qi(...)
{
	...
	if ( qi_ctrl->qinval_maddr != 0 )
		ret = queue_invalidate_context_sync(...);
}

similarly we'll have dev_invalidate_iotlb_sync for device IOTLB flush.

It simplifies caller logic and make code more readable. :-)

Thanks
Kevin
Quan Xu March 23, 2016, 5:39 a.m. UTC | #15
On March 23, 2016 1:37pm, <kevin.tian@intel.com> wrote:
> > From: Xu, Quan
> > Sent: Wednesday, March 23, 2016 11:30 AM
> >
> > >
> > > Yes, still inconsistent. As I said, you put invalidation sync within
> > > dev_invalidate_iotlb, while for all other IOMMU invalidations the
> > > sync is put after. Below would be consistent then:
> > >
> > >         if ( flush_dev_iotlb )
> > >             ret = dev_invalidate_iotlb(iommu, did, addr, size_order,
> type);
> > >         rc = dev_invalidate_iotlb_sync(...);
> > >         if ( !ret )
> > >             ret = rc;
> > >
> >   Kevin,
> >   now I doubt that I should put invalidation sync within
> > dev_invalidate_iotlb, which was also your suggestion.
> > As the dev_invalidate_iotlb() is invalidation for all of domain's ATS
> > devices. If in this consistent way, we couldn't Find which ATS device
> > flush timed out, then we need to hide all of domain's ATS devices.
> > Do you recall it?
> >   Also I think it is reluctant to put invalidate_sync within
> > queue_invalidate_iotlb() for consistent issue.
> > Quan
> 
> Yes I recall this story.
> 
> What about doing this? Let's wrap a _sync version for all flush interfaces, like
> below:
> 
> static int queue_invalidate_context_sync(...)
> {
> 	queue_invalidate_context(...);
> 	return invalidate_sync(...);
> }
> 
> Then invoke _sync version at all callers, e.g.:
> static int flush_context_qi(...)
> {
> 	...
> 	if ( qi_ctrl->qinval_maddr != 0 )
> 		ret = queue_invalidate_context_sync(...);
> }
> 
> similarly we'll have dev_invalidate_iotlb_sync for device IOTLB flush.
> 
> It simplifies caller logic and make code more readable. :-)
> 

Agreed, I would follow it and send out v8 ASAP, then I could focus on prereq patch set. :)


Quan
diff mbox

Patch

diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
index 9f1716a..9a214c6 100644
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -420,7 +420,7 @@  static void free_pdev(struct pci_seg *pseg, struct pci_dev *pdev)
     xfree(pdev);
 }
 
-static void _pci_hide_device(struct pci_dev *pdev)
+void pci_hide_existing_device(struct pci_dev *pdev)
 {
     if ( pdev->domain )
         return;
@@ -437,7 +437,7 @@  int __init pci_hide_device(int bus, int devfn)
     pdev = alloc_pdev(get_pseg(0), bus, devfn);
     if ( pdev )
     {
-        _pci_hide_device(pdev);
+        pci_hide_existing_device(pdev);
         rc = 0;
     }
     pcidevs_unlock();
@@ -467,7 +467,7 @@  int __init pci_ro_device(int seg, int bus, int devfn)
     }
 
     __set_bit(PCI_BDF2(bus, devfn), pseg->ro_map);
-    _pci_hide_device(pdev);
+    pci_hide_existing_device(pdev);
 
     return 0;
 }
diff --git a/xen/drivers/passthrough/vtd/extern.h b/xen/drivers/passthrough/vtd/extern.h
index d4d37c3..5b8673e 100644
--- a/xen/drivers/passthrough/vtd/extern.h
+++ b/xen/drivers/passthrough/vtd/extern.h
@@ -58,6 +58,8 @@  int ats_device(const struct pci_dev *, const struct acpi_drhd_unit *);
 
 int dev_invalidate_iotlb(struct iommu *iommu, u16 did,
                          u64 addr, unsigned int size_order, u64 type);
+int dev_invalidate_iotlb_sync(struct iommu *iommu, u16 did,
+                              u16 seg, u8 bus, u8 devfn);
 
 int qinval_device_iotlb(struct iommu *iommu,
                         u32 max_invs_pend, u16 sid, u16 size, u64 addr);
diff --git a/xen/drivers/passthrough/vtd/qinval.c b/xen/drivers/passthrough/vtd/qinval.c
index 37a15fb..2a5c638 100644
--- a/xen/drivers/passthrough/vtd/qinval.c
+++ b/xen/drivers/passthrough/vtd/qinval.c
@@ -233,6 +233,57 @@  int qinval_device_iotlb(struct iommu *iommu,
     return 0;
 }
 
+static void dev_invalidate_iotlb_timeout(struct iommu *iommu, u16 did,
+                                         u16 seg, u8 bus, u8 devfn)
+{
+    struct domain *d = NULL;
+    struct pci_dev *pdev;
+
+    if ( test_bit(did, iommu->domid_bitmap) )
+        d = rcu_lock_domain_by_id(iommu->domid_map[did]);
+
+    if ( d == NULL )
+        return;
+
+    pcidevs_lock();
+    for_each_pdev(d, pdev)
+    {
+        if ( ( pdev->seg == seg ) &&
+             ( pdev->bus == bus ) &&
+             ( pdev->devfn == devfn ) )
+        {
+            ASSERT ( pdev->domain );
+            list_del(&pdev->domain_list);
+            pdev->domain = NULL;
+            pci_hide_existing_device(pdev);
+            break;
+        }
+    }
+
+    pcidevs_unlock();
+
+    if ( !is_hardware_domain(d) )
+        domain_crash(d);
+
+    rcu_unlock_domain(d);
+}
+
+int dev_invalidate_iotlb_sync(struct iommu *iommu, u16 did,
+                              u16 seg, u8 bus, u8 devfn)
+{
+    struct qi_ctrl *qi_ctrl = iommu_qi_ctrl(iommu);
+    int rc = 0;
+
+    if ( qi_ctrl->qinval_maddr )
+    {
+        rc = queue_invalidate_wait(iommu, 0, 1, 1);
+        if ( rc == -ETIMEDOUT )
+            dev_invalidate_iotlb_timeout(iommu, did, seg, bus, devfn);
+    }
+
+    return rc;
+}
+
 static void queue_invalidate_iec(struct iommu *iommu, u8 granu, u8 im, u16 iidx)
 {
     unsigned long flags;
@@ -342,8 +393,6 @@  static int flush_iotlb_qi(
 
     if ( qi_ctrl->qinval_maddr != 0 )
     {
-        int rc;
-
         /* use queued invalidation */
         if (cap_write_drain(iommu->cap))
             dw = 1;
@@ -353,11 +402,17 @@  static int flush_iotlb_qi(
         queue_invalidate_iotlb(iommu,
                                type >> DMA_TLB_FLUSH_GRANU_OFFSET, dr,
                                dw, did, size_order, 0, addr);
+
+        /*
+         * Before Device-TLB invalidation we need to synchronize
+         * invalidation completions with hardware.
+         */
+        ret = invalidate_sync(iommu);
+        if ( ret )
+             return ret;
+
         if ( flush_dev_iotlb )
             ret = dev_invalidate_iotlb(iommu, did, addr, size_order, type);
-        rc = invalidate_sync(iommu);
-        if ( !ret )
-            ret = rc;
     }
     return ret;
 }
diff --git a/xen/drivers/passthrough/vtd/x86/ats.c b/xen/drivers/passthrough/vtd/x86/ats.c
index 334b9c1..c87ffe3 100644
--- a/xen/drivers/passthrough/vtd/x86/ats.c
+++ b/xen/drivers/passthrough/vtd/x86/ats.c
@@ -162,6 +162,18 @@  int dev_invalidate_iotlb(struct iommu *iommu, u16 did,
             return -EOPNOTSUPP;
         }
 
+        /*
+         * Synchronize with hardware for Device-TLB invalidate
+         * descriptor.
+         */
+        rc = dev_invalidate_iotlb_sync(iommu, did, pdev->seg,
+                                       pdev->bus, pdev->devfn);
+        if ( rc )
+            printk(XENLOG_ERR
+                   "Flush error %d on device %04x:%02x:%02x.%u.\n",
+                   ret, pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+                   PCI_FUNC(pdev->devfn));
+
         if ( !ret )
             ret = rc;
     }
diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
index 6ed29dd..bb9f791 100644
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -116,6 +116,7 @@  const unsigned long *pci_get_ro_map(u16 seg);
 int pci_add_device(u16 seg, u8 bus, u8 devfn,
                    const struct pci_dev_info *, nodeid_t node);
 int pci_remove_device(u16 seg, u8 bus, u8 devfn);
+void pci_hide_existing_device(struct pci_dev *pdev);
 int pci_ro_device(int seg, int bus, int devfn);
 int pci_hide_device(int bus, int devfn);
 struct pci_dev *pci_get_pdev(int seg, int bus, int devfn);