
[v9,2/3] VT-d: wrap a _sync version for all VT-d flush interfaces

Message ID 1459522059-102365-3-git-send-email-quan.xu@intel.com (mailing list archive)
State New, archived

Commit Message

Quan Xu April 1, 2016, 2:47 p.m. UTC
dev_invalidate_iotlb() scans the ats_devices list to flush ATS devices,
and invalidate_sync() is called after dev_invalidate_iotlb() to
synchronize with hardware on the flush status. If we assign multiple
ATS devices to a domain, the flush status covers all of these ATS
devices at once; when the flush timeout expires, we cannot find out
which one is the buggy ATS device.

An invalidate_sync() variant (we need to pass down the device's
SBDF to hide the ATS device) is therefore placed inside
dev_invalidate_iotlb() to synchronize on the flush status one device
at a time. If the flush timeout expires, we can find the buggy ATS
device and hide it. However, for the other VT-d flush interfaces,
invalidate_sync() is still called afterwards, which is inconsistent.

So we wrap a _sync version for all VT-d flush interfaces. This
simplifies caller logic and makes the code more readable as well.

Signed-off-by: Quan Xu <quan.xu@intel.com>
---
 xen/drivers/passthrough/vtd/extern.h  |  2 ++
 xen/drivers/passthrough/vtd/qinval.c  | 60 +++++++++++++++++++++++++----------
 xen/drivers/passthrough/vtd/x86/ats.c | 12 +++----
 3 files changed, 50 insertions(+), 24 deletions(-)
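
For reference, the caller-side simplification the last paragraph describes
looks like this (a minimal sketch using names from the patch below; it is
not additional code from the series):

    /* Before: each caller pairs the queueing helper with the
     * synchronization primitive itself. */
    queue_invalidate_context(iommu, did, sid, fm,
                             type >> DMA_CCMD_INVL_GRANU_OFFSET);
    ret = invalidate_sync(iommu);

    /* After: a single _sync wrapper queues the descriptor and then
     * synchronizes with hardware. */
    ret = queue_invalidate_context_sync(iommu, did, sid, fm,
                                        type >> DMA_CCMD_INVL_GRANU_OFFSET);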

Comments

Jan Beulich April 5, 2016, 9:35 a.m. UTC | #1
>>> On 01.04.16 at 16:47, <quan.xu@intel.com> wrote:
> dev_invalidate_iotlb() scans the ats_devices list to flush ATS devices,
> and invalidate_sync() is called after dev_invalidate_iotlb() to
> synchronize with hardware on the flush status. If we assign multiple
> ATS devices to a domain, the flush status covers all of these ATS
> devices at once; when the flush timeout expires, we cannot find out
> which one is the buggy ATS device.

Is that true? Or is that just a limitation of our implementation?

> An invalidate_sync() variant (we need to pass down the device's
> SBDF to hide the ATS device) is therefore placed inside
> dev_invalidate_iotlb() to synchronize on the flush status one device
> at a time.

I don't think this is stating the current state of things. So ...

> If the flush timeout expires,
> we can find the buggy ATS device and hide it. However, for the other
> VT-d flush interfaces, invalidate_sync() is still called afterwards,
> which is inconsistent.

... taken together, what is inconsistent here needs to be
described better, as well as what it is you do to eliminate the
inconsistency. Note that you should not refer back to (or imply
knowledge of) the previous discussion on the earlier version.
If any of that discussion is useful here, you need to summarize
it instead.

> --- a/xen/drivers/passthrough/vtd/extern.h
> +++ b/xen/drivers/passthrough/vtd/extern.h
> @@ -61,6 +61,8 @@ int dev_invalidate_iotlb(struct iommu *iommu, u16 did,
>  
>  int qinval_device_iotlb(struct iommu *iommu,
>                          u32 max_invs_pend, u16 sid, u16 size, u64 addr);
> +int qinval_device_iotlb_sync(struct iommu *iommu, u32 max_invs_pend,
> +                             u16 sid, u16 size, u64 addr);

So do both functions then need to be externally accessible?
That would seem contrary to the last paragraph of the patch
description.

> --- a/xen/drivers/passthrough/vtd/qinval.c
> +++ b/xen/drivers/passthrough/vtd/qinval.c
> @@ -33,6 +33,10 @@ integer_param("vtd_qi_timeout", vtd_qi_timeout);
>  
>  #define IOMMU_QI_TIMEOUT (vtd_qi_timeout * MILLISECS(1))
>  
> +static int __must_check queue_invalidate_wait(struct iommu *iommu,
> +    u8 iflag, u8 sw, u8 fn);

Indentation.

> @@ -102,6 +106,15 @@ static void queue_invalidate_context(struct iommu *iommu,
>      unmap_vtd_domain_page(qinval_entries);
>  }
>  
> +static int queue_invalidate_context_sync(struct iommu *iommu,

__must_check?

> +    u16 did, u16 source_id, u8 function_mask, u8 granu)

Indentation again.

> +{
> +    queue_invalidate_context(iommu, did, source_id,
> +                             function_mask, granu);
> +
> +    return invalidate_sync(iommu);
> +}

Further down you replace the only call to
queue_invalidate_context() - why keep both functions instead of
just making the existing one do the sync? (That would then likely
also apply to qinval_device_iotlb() and others below.)
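
A sketch of that alternative (illustrative only; the body placeholder
stands in for the existing descriptor-queueing code):

    /* Fold the synchronization into the existing helper, so no separate
     * _sync wrapper is needed and callers cannot forget the sync. */
    static int __must_check queue_invalidate_context(struct iommu *iommu,
        u16 did, u16 source_id, u8 function_mask, u8 granu)
    {
        /* ... build and queue the context-cache descriptor as before ... */

        return invalidate_sync(iommu);
    }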

> @@ -338,23 +365,24 @@ static int flush_iotlb_qi(
>  
>      if ( qi_ctrl->qinval_maddr != 0 )
>      {
> -        int rc;
> -
>          /* use queued invalidation */
>          if (cap_write_drain(iommu->cap))
>              dw = 1;
>          if (cap_read_drain(iommu->cap))
>              dr = 1;
>          /* Need to conside the ih bit later */
> -        queue_invalidate_iotlb(iommu,
> -                               type >> DMA_TLB_FLUSH_GRANU_OFFSET, dr,
> -                               dw, did, size_order, 0, addr);
> +        ret = queue_invalidate_iotlb_sync(iommu,
> +                  type >> DMA_TLB_FLUSH_GRANU_OFFSET, dr, dw, did,
> +                  size_order, 0, addr);
> +
> +        /* TODO: Timeout error handling to be added later */

As of today queue_invalidate_wait() panics, so this comment is
not very helpful, as there is no timeout that could possibly be
detected here.

> +        if ( ret )
> +            return ret;
> +
>          if ( flush_dev_iotlb )
>              ret = dev_invalidate_iotlb(iommu, did, addr, size_order, type);
> -        rc = invalidate_sync(iommu);
> -        if ( !ret )
> -            ret = rc;
>      }

I think leaving the existing logic as is would be better - best effort
invalidation even when an error has occurred.
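
In sketch form, keeping the best-effort behavior with the new wrapper
could look like this (illustrative; the early return is dropped and the
first error is preserved):

    ret = queue_invalidate_iotlb_sync(iommu,
              type >> DMA_TLB_FLUSH_GRANU_OFFSET, dr, dw, did,
              size_order, 0, addr);
    if ( flush_dev_iotlb )
    {
        int rc = dev_invalidate_iotlb(iommu, did, addr, size_order, type);

        /* Best effort: still attempt the device-TLB flush after an IOTLB
         * sync failure, and report the first error to the caller. */
        if ( !ret )
            ret = rc;
    }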

> --- a/xen/drivers/passthrough/vtd/x86/ats.c
> +++ b/xen/drivers/passthrough/vtd/x86/ats.c
> @@ -118,7 +118,6 @@ int dev_invalidate_iotlb(struct iommu *iommu, u16 did,
>      {
>          u16 sid = PCI_BDF2(pdev->bus, pdev->devfn);
>          bool_t sbit;
> -        int rc = 0;
>  
>          /* Only invalidate devices that belong to this IOMMU */
>          if ( pdev->iommu != iommu )
> @@ -134,8 +133,8 @@ int dev_invalidate_iotlb(struct iommu *iommu, u16 did,
>              /* invalidate all translations: sbit=1,bit_63=0,bit[62:12]=1 */
>              sbit = 1;
>              addr = (~0UL << PAGE_SHIFT_4K) & 0x7FFFFFFFFFFFFFFF;
> -            rc = qinval_device_iotlb(iommu, pdev->ats_queue_depth,
> -                                     sid, sbit, addr);
> +            ret = qinval_device_iotlb_sync(iommu, pdev->ats_queue_depth,
> +                                           sid, sbit, addr);
>              break;
>          case DMA_TLB_PSI_FLUSH:
>              if ( !device_in_domain(iommu, pdev, did) )
> @@ -154,16 +153,13 @@ int dev_invalidate_iotlb(struct iommu *iommu, u16 did,
>                  addr |= (((u64)1 << (size_order - 1)) - 1) << PAGE_SHIFT_4K;
>              }
>  
> -            rc = qinval_device_iotlb(iommu, pdev->ats_queue_depth,
> -                                     sid, sbit, addr);
> +            ret = qinval_device_iotlb_sync(iommu, pdev->ats_queue_depth,
> +                                           sid, sbit, addr);
>              break;
>          default:
>              dprintk(XENLOG_WARNING VTDPREFIX, "invalid vt-d flush type\n");
>              return -EOPNOTSUPP;
>          }
> -
> -        if ( !ret )
> -            ret = rc;
>      }

The removal of "rc" (which btw is unrelated to the purpose of your
patch) means that if an earlier iteration encountering an error is
followed by later successful ones, no error would get reported to
the caller. Hence this part of the change needs to be undone.
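
The error-accumulation pattern being asked for, in sketch form
(illustrative; the loop and the ats_devices list name are assumed from
the surrounding ats.c context, and the call arguments are abbreviated):

    int ret = 0;

    list_for_each_entry( pdev, &ats_devices, list )
    {
        int rc = qinval_device_iotlb_sync(iommu, pdev->ats_queue_depth,
                                          sid, sbit, addr);

        /* Keep the first error; a later success must not clear it. */
        if ( !ret )
            ret = rc;
    }

    return ret;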

Jan
Quan Xu April 7, 2016, 7:44 a.m. UTC | #2
On April 05, 2016 5:35pm, Jan Beulich <JBeulich@suse.com> wrote:
> >>> On 01.04.16 at 16:47, <quan.xu@intel.com> wrote:
> > dev_invalidate_iotlb() scans the ats_devices list to flush ATS
> > devices, and invalidate_sync() is called after dev_invalidate_iotlb()
> > to synchronize with hardware on the flush status. If we assign multiple
> > ATS devices to a domain, the flush status covers all of these ATS
> > devices at once; when the flush timeout expires, we cannot find out
> > which one is the buggy ATS device.
>
> Is that true? Or is that just a limitation of our implementation?

IMO, both.
I hope the vt-d maintainers help me double-check it.

> > An invalidate_sync() variant (we need to pass down the device's
> > SBDF to hide the ATS device) is therefore placed inside
> > dev_invalidate_iotlb() to synchronize on the flush status one device
> > at a time.
>
> I don't think this is stating the current state of things. So ...
>
> > If the flush timeout expires,
> > we can find the buggy ATS device and hide it. However, for the other
> > VT-d flush interfaces, invalidate_sync() is still called afterwards,
> > which is inconsistent.
>
> ... taken together, what is inconsistent here needs to be described
> better, as well as what it is you do to eliminate the inconsistency.
> Note that you should not refer back to (or imply knowledge of) the
> previous discussion on the earlier version. If any of that discussion
> is useful here, you need to summarize it instead.

I will summarize it and send out an updated version later.

> > --- a/xen/drivers/passthrough/vtd/extern.h
> > +++ b/xen/drivers/passthrough/vtd/extern.h
> > @@ -61,6 +61,8 @@ int dev_invalidate_iotlb(struct iommu *iommu, u16 did,
> >
> >  int qinval_device_iotlb(struct iommu *iommu,
> >                          u32 max_invs_pend, u16 sid, u16 size, u64 addr);
> > +int qinval_device_iotlb_sync(struct iommu *iommu, u32 max_invs_pend,
> > +                             u16 sid, u16 size, u64 addr);
>
> So do both functions then need to be externally accessible?
> That would seem contrary to the last paragraph of the patch description.

I was aware of this. I'd better make qinval_device_iotlb() static in v10.

[...]

> > +static int queue_invalidate_context_sync(struct iommu *iommu,
>
> __must_check?

Agreed.

[...]

> > +{
> > +    queue_invalidate_context(iommu, did, source_id,
> > +                             function_mask, granu);
> > +
> > +    return invalidate_sync(iommu);
> > +}
>
> Further down you replace the only call to
> queue_invalidate_context() - why keep both functions instead of just
> making the existing one do the sync? (That would then likely also
> apply to qinval_device_iotlb() and others below.)

It is optional. I think:
1. In the long term, we may need no _sync version.
2. At least, the current wrap looks good to me. E.g.
queue_invalidate_context() is for the context-cache Invalidate
Descriptor, and invalidate_sync() is for the Invalidation Wait
Descriptor. It is much clearer.

> > @@ -338,23 +365,24 @@ static int flush_iotlb_qi(
> >
> >      if ( qi_ctrl->qinval_maddr != 0 )
> >      {
> > -        int rc;
> > -
> >          /* use queued invalidation */
> >          if (cap_write_drain(iommu->cap))
> >              dw = 1;
> >          if (cap_read_drain(iommu->cap))
> >              dr = 1;
> >          /* Need to conside the ih bit later */
> > -        queue_invalidate_iotlb(iommu,
> > -                               type >> DMA_TLB_FLUSH_GRANU_OFFSET, dr,
> > -                               dw, did, size_order, 0, addr);
> > +        ret = queue_invalidate_iotlb_sync(iommu,
> > +                  type >> DMA_TLB_FLUSH_GRANU_OFFSET, dr, dw, did,
> > +                  size_order, 0, addr);
> > +
> > +        /* TODO: Timeout error handling to be added later */
>
> As of today queue_invalidate_wait() panics, so this comment is not
> very helpful, as there is no timeout that could possibly be detected
> here.

Okay, I will drop it.

> > +        if ( ret )
> > +            return ret;
> > +
> >          if ( flush_dev_iotlb )
> >              ret = dev_invalidate_iotlb(iommu, did, addr, size_order, type);
> > -        rc = invalidate_sync(iommu);
> > -        if ( !ret )
> > -            ret = rc;
> >      }
>
> I think leaving the existing logic as is would be better - best effort
> invalidation even when an error has occurred.

I have an open question:
As the VT-d spec (Queued Invalidation Ordering Considerations) says:
     1. If the Fence (FN) flag is 1 in an inv_wait_dsc, hardware must
execute descriptors following the inv_wait_dsc only after the wait
command is completed.
     2. When a Device-TLB invalidation timeout is detected, hardware
must not complete any pending inv_wait_dsc commands.
In the current code, the Fence (FN) flag is always 1.
If a Device-TLB invalidation timeout is detected, this additional
inv_wait_dsc is not completed.
IIUC, the descriptors newly arriving in that queue _might_ not be
executed any more, waiting for this additional inv_wait_dsc which is
not completed.
Is that true?

> > --- a/xen/drivers/passthrough/vtd/x86/ats.c
> > +++ b/xen/drivers/passthrough/vtd/x86/ats.c
> > @@ -118,7 +118,6 @@ int dev_invalidate_iotlb(struct iommu *iommu, u16 did,
> >      {
> >          u16 sid = PCI_BDF2(pdev->bus, pdev->devfn);
> >          bool_t sbit;
> > -        int rc = 0;
> >
> >          /* Only invalidate devices that belong to this IOMMU */
> >          if ( pdev->iommu != iommu )
> > @@ -134,8 +133,8 @@ int dev_invalidate_iotlb(struct iommu *iommu, u16 did,
> >              /* invalidate all translations: sbit=1,bit_63=0,bit[62:12]=1 */
> >              sbit = 1;
> >              addr = (~0UL << PAGE_SHIFT_4K) & 0x7FFFFFFFFFFFFFFF;
> > -            rc = qinval_device_iotlb(iommu, pdev->ats_queue_depth,
> > -                                     sid, sbit, addr);
> > +            ret = qinval_device_iotlb_sync(iommu, pdev->ats_queue_depth,
> > +                                           sid, sbit, addr);
> >              break;
> >          case DMA_TLB_PSI_FLUSH:
> >              if ( !device_in_domain(iommu, pdev, did) )
> > @@ -154,16 +153,13 @@ int dev_invalidate_iotlb(struct iommu *iommu, u16 did,
> >                  addr |= (((u64)1 << (size_order - 1)) - 1) << PAGE_SHIFT_4K;
> >              }
> >
> > -            rc = qinval_device_iotlb(iommu, pdev->ats_queue_depth,
> > -                                     sid, sbit, addr);
> > +            ret = qinval_device_iotlb_sync(iommu, pdev->ats_queue_depth,
> > +                                           sid, sbit, addr);
> >              break;
> >          default:
> >              dprintk(XENLOG_WARNING VTDPREFIX, "invalid vt-d flush type\n");
> >              return -EOPNOTSUPP;
> >          }
> > -
> > -        if ( !ret )
> > -            ret = rc;
> >      }
>
> The removal of "rc" (which btw is unrelated to the purpose of your
> patch) means that if an earlier iteration encounters an error and is
> followed by later successful ones, no error would get reported to the
> caller. Hence this part of the change needs to be undone.

Agreed.
I tried to drop rc, as ret was always zero in the previous code.


Quan
Jan Beulich April 7, 2016, 3:28 p.m. UTC | #3
>>> On 07.04.16 at 09:44, <quan.xu@intel.com> wrote:
> On April 05, 2016 5:35pm, Jan Beulich <JBeulich@suse.com> wrote:
>> >>> On 01.04.16 at 16:47, <quan.xu@intel.com> wrote:
>> > +{
>> > +    queue_invalidate_context(iommu, did, source_id,
>> > +                             function_mask, granu);
>> > +
>> > +    return invalidate_sync(iommu);
>> > +}
>> 
>> Further down you replace the only call to
>> queue_invalidate_context() - why keep both functions instead of just
>> making the existing one do the sync? (That would then likely also
>> apply to qinval_device_iotlb() and others below.)
>> 
> 
> It is optional. I think:
> 1. In the long term, we may need no _sync version.
> 2. At least, the current wrap looks good to me. E.g.
> queue_invalidate_context() is for the context-cache Invalidate
> Descriptor, and invalidate_sync() is for the Invalidation Wait
> Descriptor. It is much clearer.

I don't really agree, but will leave it to the VT-d maintainers to judge.

>> > +        if ( ret )
>> > +            return ret;
>> > +
>> >          if ( flush_dev_iotlb )
>> >              ret = dev_invalidate_iotlb(iommu, did, addr, size_order, type);
>> > -        rc = invalidate_sync(iommu);
>> > -        if ( !ret )
>> > -            ret = rc;
>> >      }
>> 
>> I think leaving the existing logic as is would be better - best effort
>> invalidation even when an error has occurred.
>> 
> 
> I have an open question:
> As the VT-d spec (Queued Invalidation Ordering Considerations) says:
>      1. If the Fence (FN) flag is 1 in an inv_wait_dsc, hardware must
> execute descriptors following the inv_wait_dsc only after the wait
> command is completed.
>      2. When a Device-TLB invalidation timeout is detected, hardware
> must not complete any pending inv_wait_dsc commands.
> In the current code, the Fence (FN) flag is always 1.
> If a Device-TLB invalidation timeout is detected, this additional
> inv_wait_dsc is not completed.
> IIUC, the descriptors newly arriving in that queue _might_ not be
> executed any more, waiting for this additional inv_wait_dsc which is
> not completed.
> Is that true?

That's not a question to me, is it?

Jan
Quan Xu April 8, 2016, 2:20 a.m. UTC | #4
On April 07, 2016 11:29pm, Jan Beulich <JBeulich@suse.com> wrote:
> >>> On 07.04.16 at 09:44, <quan.xu@intel.com> wrote:
> > On April 05, 2016 5:35pm, Jan Beulich <JBeulich@suse.com> wrote:
> >> >>> On 01.04.16 at 16:47, <quan.xu@intel.com> wrote:
> >> > +{
> >> > +    queue_invalidate_context(iommu, did, source_id,
> >> > +                             function_mask, granu);
> >> > +
> >> > +    return invalidate_sync(iommu);
> >> > +}
> >>
> >> Further down you replace the only call to
> >> queue_invalidate_context() - why keep both functions instead of just
> >> making the existing one do the sync? (That would then likely also
> >> apply to qinval_device_iotlb() and others below.)
> >
> > It is optional. I think:
> > 1. In the long term, we may need no _sync version.
> > 2. At least, the current wrap looks good to me. E.g.
> > queue_invalidate_context() is for the context-cache Invalidate
> > Descriptor, and invalidate_sync() is for the Invalidation Wait
> > Descriptor. It is much clearer.
>
> I don't really agree, but will leave it to the VT-d maintainers to judge.

+to Kevin and Feng, I am open for it.

> >> > +        if ( ret )
> >> > +            return ret;
> >> > +
> >> >          if ( flush_dev_iotlb )
> >> >              ret = dev_invalidate_iotlb(iommu, did, addr, size_order, type);
> >> > -        rc = invalidate_sync(iommu);
> >> > -        if ( !ret )
> >> > -            ret = rc;
> >> >      }
> >>
> >> I think leaving the existing logic as is would be better - best effort
> >> invalidation even when an error has occurred.
> >
> > I have an open question:
> > As the VT-d spec (Queued Invalidation Ordering Considerations) says:
> >      1. If the Fence (FN) flag is 1 in an inv_wait_dsc, hardware must
> > execute descriptors following the inv_wait_dsc only after the wait
> > command is completed.
> >      2. When a Device-TLB invalidation timeout is detected, hardware
> > must not complete any pending inv_wait_dsc commands.
> > In the current code, the Fence (FN) flag is always 1.
> > If a Device-TLB invalidation timeout is detected, this additional
> > inv_wait_dsc is not completed.
> > IIUC, the descriptors newly arriving in that queue _might_ not be
> > executed any more, waiting for this additional inv_wait_dsc which is
> > not completed.
> > Is that true?
>
> That's not a question to me, is it?

To the community, but the vt-d maintainers are the ones who can explain
it to me.

Quan
Tian, Kevin April 11, 2016, 6:52 a.m. UTC | #5
> From: Xu, Quan
> Sent: Thursday, April 07, 2016 3:45 PM
>
> On April 05, 2016 5:35pm, Jan Beulich <JBeulich@suse.com> wrote:
> > >>> On 01.04.16 at 16:47, <quan.xu@intel.com> wrote:
> > > dev_invalidate_iotlb() scans the ats_devices list to flush ATS
> > > devices, and invalidate_sync() is called after dev_invalidate_iotlb()
> > > to synchronize with hardware on the flush status. If we assign multiple
> > > ATS devices to a domain, the flush status covers all of these ATS
> > > devices at once; when the flush timeout expires, we cannot find out
> > > which one is the buggy ATS device.
> >
> > Is that true? Or is that just a limitation of our implementation?
>
> IMO, both.
> I hope the vt-d maintainers help me double-check it.

Just a limitation of our implementation. Now dev_invalidate_iotlb()
puts all possible IOTLB flush requests on the queue, and then
invalidate_sync() pushes a wait descriptor w/ timeout to detect errors.
The VT-d spec says one or more descriptors may be fetched together by
hardware, so when a timeout is triggered, we cannot tell which flush
request actually has a problem by reading the queue head register. If
we changed the implementation to one invalidation sync per request,
then we could tell. I discussed with Quan not to go to that
complexity, though.

Thanks
Kevin
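
In sketch form, the two orderings Kevin contrasts (illustrative pseudo-C;
flush_one() and sync() are hypothetical stand-ins for the descriptor
queueing helper and the invalidation-wait synchronization):

    /* Batched (current code): one wait descriptor covers all requests.
     * On timeout, the queue head cannot identify the offending device. */
    list_for_each_entry( pdev, &ats_devices, list )
        flush_one(pdev);
    ret = sync();                 /* timeout: which device? unknown */

    /* Per-request: one wait descriptor per flush.
     * On timeout, the device just flushed is the buggy one. */
    list_for_each_entry( pdev, &ats_devices, list )
    {
        flush_one(pdev);
        ret = sync();             /* timeout: pdev can be hidden */
        if ( ret )
            break;
    }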
Tian, Kevin April 11, 2016, 7:25 a.m. UTC | #6
> From: Xu, Quan
> Sent: Friday, April 08, 2016 10:21 AM
>
> On April 07, 2016 11:29pm, Jan Beulich <JBeulich@suse.com> wrote:
> > >>> On 07.04.16 at 09:44, <quan.xu@intel.com> wrote:
> > > On April 05, 2016 5:35pm, Jan Beulich <JBeulich@suse.com> wrote:
> > >> >>> On 01.04.16 at 16:47, <quan.xu@intel.com> wrote:
> > >> > +{
> > >> > +    queue_invalidate_context(iommu, did, source_id,
> > >> > +                             function_mask, granu);
> > >> > +
> > >> > +    return invalidate_sync(iommu);
> > >> > +}
> > >>
> > >> Further down you replace the only call to
> > >> queue_invalidate_context() - why keep both functions instead of
> > >> just making the existing one do the sync? (That would then likely
> > >> also apply to qinval_device_iotlb() and others below.)
> > >
> > > It is optional. I think:
> > > 1. In the long term, we may need no _sync version.
> > > 2. At least, the current wrap looks good to me. E.g.
> > > queue_invalidate_context() is for the context-cache Invalidate
> > > Descriptor, and invalidate_sync() is for the Invalidation Wait
> > > Descriptor. It is much clearer.
> >
> > I don't really agree, but will leave it to the VT-d maintainers to judge.
>
> +to Kevin and Feng, I am open for it.

Let's just change the existing one to _sync.

> > >> > +        if ( ret )
> > >> > +            return ret;
> > >> > +
> > >> >          if ( flush_dev_iotlb )
> > >> >              ret = dev_invalidate_iotlb(iommu, did, addr, size_order, type);
> > >> > -        rc = invalidate_sync(iommu);
> > >> > -        if ( !ret )
> > >> > -            ret = rc;
> > >> >      }
> > >>
> > >> I think leaving the existing logic as is would be better - best
> > >> effort invalidation even when an error has occurred.
> > >
> > > I have an open question:
> > > As the VT-d spec (Queued Invalidation Ordering Considerations) says:
> > >      1. If the Fence (FN) flag is 1 in an inv_wait_dsc, hardware must
> > > execute descriptors following the inv_wait_dsc only after the wait
> > > command is completed.
> > >      2. When a Device-TLB invalidation timeout is detected, hardware
> > > must not complete any pending inv_wait_dsc commands.
> > > In the current code, the Fence (FN) flag is always 1.
> > > If a Device-TLB invalidation timeout is detected, this additional
> > > inv_wait_dsc is not completed.
> > > IIUC, the descriptors newly arriving in that queue _might_ not be
> > > executed any more, waiting for this additional inv_wait_dsc which is
> > > not completed.
> > > Is that true?
> >
> > That's not a question to me, is it?
>
> To the community, but the vt-d maintainers are the ones who can explain
> it to me.

'not completed' here means 'abort', so your timeout check will be hit,
since the status is never 'done' then.

But I'm not sure how your question is related to Jan's comment, which
looks reasonable to me. :-)

Thanks
Kevin
Quan Xu April 11, 2016, 8:01 a.m. UTC | #7
On April 11, 2016 2:53pm, Tian, Kevin wrote:
> > From: Xu, Quan
> > Sent: Thursday, April 07, 2016 3:45 PM
> >
> > On April 05, 2016 5:35pm, Jan Beulich <JBeulich@suse.com> wrote:
> > > >>> On 01.04.16 at 16:47, <quan.xu@intel.com> wrote:
> > > > dev_invalidate_iotlb() scans the ats_devices list to flush ATS
> > > > devices, and invalidate_sync() is called after
> > > > dev_invalidate_iotlb() to synchronize with hardware on the flush
> > > > status. If we assign multiple ATS devices to a domain, the flush
> > > > status covers all of these ATS devices at once; when the flush
> > > > timeout expires, we cannot find out which one is the buggy ATS device.
> > >
> > > Is that true? Or is that just a limitation of our implementation?
> >
> > IMO, both.
> > I hope the vt-d maintainers help me double-check it.
>
> Just a limitation of our implementation. Now dev_invalidate_iotlb()
> puts all possible IOTLB flush requests on the queue, and then
> invalidate_sync() pushes a wait descriptor w/ timeout to detect errors.
> The VT-d spec says one or more descriptors may be fetched together by
> hardware, so when a timeout is triggered, we cannot tell which flush
> request actually has a problem by reading the queue head register. If
> we changed the implementation to one invalidation sync per request,
> then we could tell. I discussed with Quan not to go to that
> complexity, though.

Thanks for your correction!
I will enhance the commit log and send it out later.

Quan
Quan Xu April 11, 2016, 9:07 a.m. UTC | #8
On April 11, 2016 3:25pm, Tian, Kevin <kevin.tian@intel.com> wrote:
> > From: Xu, Quan
> > Sent: Friday, April 08, 2016 10:21 AM
> >
> > On April 07, 2016 11:29pm, Jan Beulich <JBeulich@suse.com> wrote:
> > > >>> On 07.04.16 at 09:44, <quan.xu@intel.com> wrote:
> > > > On April 05, 2016 5:35pm, Jan Beulich <JBeulich@suse.com> wrote:
> > > >> >>> On 01.04.16 at 16:47, <quan.xu@intel.com> wrote:
> > > >> > +{
> > > >> > +    queue_invalidate_context(iommu, did, source_id,
> > > >> > +                             function_mask, granu);
> > > >> > +
> > > >> > +    return invalidate_sync(iommu);
> > > >> > +}
> > > >>
> > > >> Further down you replace the only call to
> > > >> queue_invalidate_context() - why keep both functions instead of
> > > >> just making the existing one do the sync? (That would then likely
> > > >> also apply to qinval_device_iotlb() and others below.)
> > > >
> > > > It is optional. I think:
> > > > 1. In the long term, we may need no _sync version.
> > > > 2. At least, the current wrap looks good to me. E.g.
> > > > queue_invalidate_context() is for the context-cache Invalidate
> > > > Descriptor, and invalidate_sync() is for the Invalidation Wait
> > > > Descriptor. It is much clearer.
> > >
> > > I don't really agree, but will leave it to the VT-d maintainers to judge.
> >
> > +to Kevin and Feng, I am open for it.
>
> Let's just change the existing one to _sync.

Agreed.

> > > >> > +        if ( ret )
> > > >> > +            return ret;
> > > >> > +
> > > >> >          if ( flush_dev_iotlb )
> > > >> >              ret = dev_invalidate_iotlb(iommu, did, addr, size_order, type);
> > > >> > -        rc = invalidate_sync(iommu);
> > > >> > -        if ( !ret )
> > > >> > -            ret = rc;
> > > >> >      }
> > > >>
> > > >> I think leaving the existing logic as is would be better - best
> > > >> effort invalidation even when an error has occurred.
> > > >
> > > > I have an open question:
> > > > As the VT-d spec (Queued Invalidation Ordering Considerations) says:
> > > >      1. If the Fence (FN) flag is 1 in an inv_wait_dsc, hardware
> > > > must execute descriptors following the inv_wait_dsc only after the
> > > > wait command is completed.
> > > >      2. When a Device-TLB invalidation timeout is detected, hardware
> > > > must not complete any pending inv_wait_dsc commands.
> > > > In the current code, the Fence (FN) flag is always 1.
> > > > If a Device-TLB invalidation timeout is detected, this additional
> > > > inv_wait_dsc is not completed.
> > > > IIUC, the descriptors newly arriving in that queue _might_ not be
> > > > executed any more, waiting for this additional inv_wait_dsc which
> > > > is not completed.
> > > > Is that true?
> > >
> > > That's not a question to me, is it?
> >
> > To the community, but the vt-d maintainers are the ones who can
> > explain it to me.
>
> 'not completed' here means 'abort', so your timeout check will be hit,
> since the status is never 'done' then.
>
> But I'm not sure how your question is related to Jan's comment, which
> looks reasonable to me. :-)

I will take Jan's suggestion.
As for my open question - at least as a reminder to myself - I should
consider both software behavior and hardware behavior. Let's drop it.

Quan

Patch

diff --git a/xen/drivers/passthrough/vtd/extern.h b/xen/drivers/passthrough/vtd/extern.h
index d4d37c3..6d3187d 100644
--- a/xen/drivers/passthrough/vtd/extern.h
+++ b/xen/drivers/passthrough/vtd/extern.h
@@ -61,6 +61,8 @@  int dev_invalidate_iotlb(struct iommu *iommu, u16 did,
 
 int qinval_device_iotlb(struct iommu *iommu,
                         u32 max_invs_pend, u16 sid, u16 size, u64 addr);
+int qinval_device_iotlb_sync(struct iommu *iommu, u32 max_invs_pend,
+                             u16 sid, u16 size, u64 addr);
 
 unsigned int get_cache_line_size(void);
 void cacheline_flush(char *);
diff --git a/xen/drivers/passthrough/vtd/qinval.c b/xen/drivers/passthrough/vtd/qinval.c
index 52ba2c2..d12661b 100644
--- a/xen/drivers/passthrough/vtd/qinval.c
+++ b/xen/drivers/passthrough/vtd/qinval.c
@@ -33,6 +33,10 @@  integer_param("vtd_qi_timeout", vtd_qi_timeout);
 
 #define IOMMU_QI_TIMEOUT (vtd_qi_timeout * MILLISECS(1))
 
+static int __must_check queue_invalidate_wait(struct iommu *iommu,
+    u8 iflag, u8 sw, u8 fn);
+static int invalidate_sync(struct iommu *iommu);
+
 static void print_qi_regs(struct iommu *iommu)
 {
     u64 val;
@@ -102,6 +106,15 @@  static void queue_invalidate_context(struct iommu *iommu,
     unmap_vtd_domain_page(qinval_entries);
 }
 
+static int queue_invalidate_context_sync(struct iommu *iommu,
+    u16 did, u16 source_id, u8 function_mask, u8 granu)
+{
+    queue_invalidate_context(iommu, did, source_id,
+                             function_mask, granu);
+
+    return invalidate_sync(iommu);
+}
+
 static void queue_invalidate_iotlb(struct iommu *iommu,
     u8 granu, u8 dr, u8 dw, u16 did, u8 am, u8 ih, u64 addr)
 {
@@ -135,6 +148,14 @@  static void queue_invalidate_iotlb(struct iommu *iommu,
     spin_unlock_irqrestore(&iommu->register_lock, flags);
 }
 
+static int queue_invalidate_iotlb_sync(struct iommu *iommu,
+    u8 granu, u8 dr, u8 dw, u16 did, u8 am, u8 ih, u64 addr)
+{
+    queue_invalidate_iotlb(iommu, granu, dr, dw, did, am, ih, addr);
+
+    return invalidate_sync(iommu);
+}
+
 static int __must_check queue_invalidate_wait(struct iommu *iommu,
     u8 iflag, u8 sw, u8 fn)
 {
@@ -229,6 +250,14 @@  int qinval_device_iotlb(struct iommu *iommu,
     return 0;
 }
 
+int qinval_device_iotlb_sync(struct iommu *iommu,
+    u32 max_invs_pend, u16 sid, u16 size, u64 addr)
+{
+    qinval_device_iotlb(iommu, max_invs_pend, sid, size, addr);
+
+    return invalidate_sync(iommu);
+}
+
 static void queue_invalidate_iec(struct iommu *iommu, u8 granu, u8 im, u16 iidx)
 {
     unsigned long flags;
@@ -256,7 +285,7 @@  static void queue_invalidate_iec(struct iommu *iommu, u8 granu, u8 im, u16 iidx)
     spin_unlock_irqrestore(&iommu->register_lock, flags);
 }
 
-static int __iommu_flush_iec(struct iommu *iommu, u8 granu, u8 im, u16 iidx)
+static int queue_invalidate_iec_sync(struct iommu *iommu, u8 granu, u8 im, u16 iidx)
 {
     int ret;
 
@@ -273,12 +302,12 @@  static int __iommu_flush_iec(struct iommu *iommu, u8 granu, u8 im, u16 iidx)
 
 int iommu_flush_iec_global(struct iommu *iommu)
 {
-    return __iommu_flush_iec(iommu, IEC_GLOBAL_INVL, 0, 0);
+    return queue_invalidate_iec_sync(iommu, IEC_GLOBAL_INVL, 0, 0);
 }
 
 int iommu_flush_iec_index(struct iommu *iommu, u8 im, u16 iidx)
 {
-   return __iommu_flush_iec(iommu, IEC_INDEX_INVL, im, iidx);
+   return queue_invalidate_iec_sync(iommu, IEC_INDEX_INVL, im, iidx);
 }
 
 static int flush_context_qi(
@@ -304,11 +333,9 @@  static int flush_context_qi(
     }
 
     if ( qi_ctrl->qinval_maddr != 0 )
-    {
-        queue_invalidate_context(iommu, did, sid, fm,
-                                 type >> DMA_CCMD_INVL_GRANU_OFFSET);
-        ret = invalidate_sync(iommu);
-    }
+        ret = queue_invalidate_context_sync(iommu, did, sid, fm,
+                  type >> DMA_CCMD_INVL_GRANU_OFFSET);
+
     return ret;
 }
 
@@ -338,23 +365,24 @@  static int flush_iotlb_qi(
 
     if ( qi_ctrl->qinval_maddr != 0 )
     {
-        int rc;
-
         /* use queued invalidation */
         if (cap_write_drain(iommu->cap))
             dw = 1;
         if (cap_read_drain(iommu->cap))
             dr = 1;
         /* Need to conside the ih bit later */
-        queue_invalidate_iotlb(iommu,
-                               type >> DMA_TLB_FLUSH_GRANU_OFFSET, dr,
-                               dw, did, size_order, 0, addr);
+        ret = queue_invalidate_iotlb_sync(iommu,
+                  type >> DMA_TLB_FLUSH_GRANU_OFFSET, dr, dw, did,
+                  size_order, 0, addr);
+
+        /* TODO: Timeout error handling to be added later */
+        if ( ret )
+            return ret;
+
         if ( flush_dev_iotlb )
             ret = dev_invalidate_iotlb(iommu, did, addr, size_order, type);
-        rc = invalidate_sync(iommu);
-        if ( !ret )
-            ret = rc;
     }
+
     return ret;
 }
 
diff --git a/xen/drivers/passthrough/vtd/x86/ats.c b/xen/drivers/passthrough/vtd/x86/ats.c
index 334b9c1..7b1c07b 100644
--- a/xen/drivers/passthrough/vtd/x86/ats.c
+++ b/xen/drivers/passthrough/vtd/x86/ats.c
@@ -118,7 +118,6 @@  int dev_invalidate_iotlb(struct iommu *iommu, u16 did,
     {
         u16 sid = PCI_BDF2(pdev->bus, pdev->devfn);
         bool_t sbit;
-        int rc = 0;
 
         /* Only invalidate devices that belong to this IOMMU */
         if ( pdev->iommu != iommu )
@@ -134,8 +133,8 @@  int dev_invalidate_iotlb(struct iommu *iommu, u16 did,
             /* invalidate all translations: sbit=1,bit_63=0,bit[62:12]=1 */
             sbit = 1;
             addr = (~0UL << PAGE_SHIFT_4K) & 0x7FFFFFFFFFFFFFFF;
-            rc = qinval_device_iotlb(iommu, pdev->ats_queue_depth,
-                                     sid, sbit, addr);
+            ret = qinval_device_iotlb_sync(iommu, pdev->ats_queue_depth,
+                                           sid, sbit, addr);
             break;
         case DMA_TLB_PSI_FLUSH:
             if ( !device_in_domain(iommu, pdev, did) )
@@ -154,16 +153,13 @@  int dev_invalidate_iotlb(struct iommu *iommu, u16 did,
                 addr |= (((u64)1 << (size_order - 1)) - 1) << PAGE_SHIFT_4K;
             }
 
-            rc = qinval_device_iotlb(iommu, pdev->ats_queue_depth,
-                                     sid, sbit, addr);
+            ret = qinval_device_iotlb_sync(iommu, pdev->ats_queue_depth,
+                                           sid, sbit, addr);
             break;
         default:
             dprintk(XENLOG_WARNING VTDPREFIX, "invalid vt-d flush type\n");
             return -EOPNOTSUPP;
         }
-
-        if ( !ret )
-            ret = rc;
     }
 
     return ret;