diff mbox

[v3,03/10] IOMMU/MMU: enhance the call trees of IOMMU unmapping and mapping

Message ID 945CA011AD5F084CBEA3E851C0AB28894B8A9316@SHSMSX101.ccr.corp.intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Quan Xu May 5, 2016, 6:53 a.m. UTC
On May 04, 2016 9:49 PM, George Dunlap <George.Dunlap@eu.citrix.com> wrote:
> On Fri, Apr 29, 2016 at 10:25 AM, Quan Xu <quan.xu@intel.com> wrote:
> > When IOMMU mapping is failed, we issue a best effort rollback,
> > stopping IOMMU mapping, unmapping the previous IOMMU maps and
> then
> > reporting the error up to the call trees. When rollback is not
> > feasible (in early initialization phase or trade-off of complexity)
> > for the hardware domain, we do things on a best effort basis, only throwing
> out an error message.
> >
> > IOMMU unmapping should perhaps continue despite an error, in an
> > attempt to do best effort cleanup.
> >
> > Signed-off-by: Quan Xu <quan.xu@intel.com>
> >
> > CC: Keir Fraser <keir@xen.org>
> > CC: Jan Beulich <jbeulich@suse.com>
> > CC: Andrew Cooper <andrew.cooper3@citrix.com>
> > CC: Jun Nakajima <jun.nakajima@intel.com>
> > CC: Kevin Tian <kevin.tian@intel.com>
> > CC: George Dunlap <george.dunlap@eu.citrix.com>
> > ---
> >  xen/arch/x86/mm.c               | 13 ++++++++-----
> >  xen/arch/x86/mm/p2m-ept.c       | 27 +++++++++++++++++++++++++--
> >  xen/arch/x86/mm/p2m-pt.c        | 24 ++++++++++++++++++++----
> >  xen/arch/x86/mm/p2m.c           | 11 +++++++++--
> >  xen/drivers/passthrough/iommu.c | 14 +++++++++++++-
> >  5 files changed, 75 insertions(+), 14 deletions(-)
> >
> > diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c index
> > a42097f..427097d 100644
> > --- a/xen/arch/x86/mm.c
> > +++ b/xen/arch/x86/mm.c
> > @@ -2467,7 +2467,7 @@ static int __get_page_type(struct page_info
> *page, unsigned long type,
> >                             int preemptible)  {
> >      unsigned long nx, x, y = page->u.inuse.type_info;
> > -    int rc = 0;
> > +    int rc = 0, ret = 0;
> >
> >      ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
> >
> > @@ -2578,11 +2578,11 @@ static int __get_page_type(struct page_info
> *page, unsigned long type,
> >          if ( d && is_pv_domain(d) && unlikely(need_iommu(d)) )
> >          {
> >              if ( (x & PGT_type_mask) == PGT_writable_page )
> > -                iommu_unmap_page(d, mfn_to_gmfn(d, page_to_mfn(page)));
> > +                ret = iommu_unmap_page(d, mfn_to_gmfn(d,
> page_to_mfn(page)));
> >              else if ( type == PGT_writable_page )
> > -                iommu_map_page(d, mfn_to_gmfn(d, page_to_mfn(page)),
> > -                               page_to_mfn(page),
> > -                               IOMMUF_readable|IOMMUF_writable);
> > +                ret = iommu_map_page(d, mfn_to_gmfn(d, page_to_mfn(page)),
> > +                                     page_to_mfn(page),
> > +                                     IOMMUF_readable|IOMMUF_writable);
> >          }
> >      }
> >
> > @@ -2599,6 +2599,9 @@ static int __get_page_type(struct page_info
> *page, unsigned long type,
> >      if ( (x & PGT_partial) && !(nx & PGT_partial) )
> >          put_page(page);
> >
> > +    if ( !rc )
> > +        rc = ret;
> > +
> >      return rc;
> >  }
> >
> > diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c
> > index 1ed5b47..df87944 100644
> > --- a/xen/arch/x86/mm/p2m-ept.c
> > +++ b/xen/arch/x86/mm/p2m-ept.c
> > @@ -821,6 +821,8 @@ out:
> >      if ( needs_sync )
> >          ept_sync_domain(p2m);
> >
> > +    ret = 0;
> > +
> >      /* For host p2m, may need to change VT-d page table.*/
> >      if ( rc == 0 && p2m_is_hostp2m(p2m) && need_iommu(d) &&
> >           need_modify_vtd_table )
> > @@ -831,11 +833,29 @@ out:
> >          {
> >              if ( iommu_flags )
> >                  for ( i = 0; i < (1 << order); i++ )
> > -                    iommu_map_page(d, gfn + i, mfn_x(mfn) + i, iommu_flags);
> > +                {
> > +                    rc = iommu_map_page(d, gfn + i, mfn_x(mfn) + i, iommu_flags);
> > +
> > +                    if ( !ret && unlikely(rc) )
> > +                    {
> > +                        while ( i-- )
> > +                            iommu_unmap_page(d, gfn + i);
> > +
> > +                        ret = rc;
> > +                        break;
> > +                    }
> > +                }
> >              else
> >                  for ( i = 0; i < (1 << order); i++ )
> > -                    iommu_unmap_page(d, gfn + i);
> > +                {
> > +                    rc = iommu_unmap_page(d, gfn + i);
> > +
> > +                    if ( !ret && unlikely(rc) )
> > +                        ret = rc;
> > +                }
> >          }
> > +
> > +        rc = 0;
> >      }
> 
> So the reason for doing this thing with setting ret=0, then using rc,
> then setting rc to zero, is to make sure that the change is propagated
> to the altp2m if necessary?
> 
> I'm not a fan of this sort of "implied signalling".  The check at the
> altp2m conditional is meant to say, "everything went fine, go ahead
> and propagate the change".  But with your changes, that's not what we
> want anymore -- we want, "If the change actually made it into the
> hostp2m, propagate it to the altp2m."
> 
> As such, I think it would be a lot clearer to just make a new boolean
> variable, maybe "entry_written", which we initialize to false and then
> set to true when the entry is actually written; and then change the
> condition re the altp2m to "if ( entry_written && p2m_is_hostp2m(..)
> )".
> 
> Then I think you could make the ret / rc use mirror what you do
> elsewhere, where ret is used to store the return value of the function
> call, and it's propagated to rc only if rc is not already set.
> 

George,

Thanks for your detailed explanation. This seems to be another matter of preference.
Of course, I'd follow your suggestion in the p2m field, while I need to hear the other maintainers' opinions as well
(especially when it comes to Kevin/Jan, as they do spend a lot of time reviewing for me).

A little bit of difference here, IMO: an int 'entry_written' is much better, as we also need to propagate the 'entry_written' value up to callers,
i.e. the errors returned from atomic_write_ept_entry() are '-EINVAL', '-ESRCH' and '-EBUSY'. Then, the following is my modification (feel free to correct me):







> > @@ -680,11 +680,27 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
> unsigned long gfn, mfn_t mfn,
> >          }
> >          else if ( iommu_pte_flags )
> >              for ( i = 0; i < (1UL << page_order); i++ )
> > -                iommu_map_page(p2m->domain, gfn + i, mfn_x(mfn) + i,
> > -                               iommu_pte_flags);
> > +            {
> > +                ret = iommu_map_page(p2m->domain, gfn + i, mfn_x(mfn) + i,
> > +                                    iommu_pte_flags);
> > +
> > +                if ( !rc && unlikely(ret) )
> > +                {
> > +                    while ( i-- )
> > +                        iommu_unmap_page(p2m->domain, gfn + i);
> > +
> > +                    rc = ret;
> > +                    break;
> > +                }
> 
> Hmm, is this conditional here right?  What the code actually says to
> do is to always map pages, but to only unmap them on an error if there
> have been no other errors in the function so far.
> 
> As it happens, at the moment rc cannot be non-zero here since any time
> it's non-zero the code jumps down to the 'out' label, skipping this.
> But if that ever changed, this would fail to unmap when it probably
> should.
> 
> It seems like this be:
> 
> if ( unlikely(ret) )
> {
>   while ( i-- )
>     iommu_unmap_page(p2m->domain, gfn + i);
>   if ( !rc )
>     rc = ret;
>   break;
> }
> 

Looks good. Thanks.

> Or if you want to assume that rc will never be zero, then put an
> ASSERT() just before the for loop here.
> 
> Everything else looks good to me.  Thanks again for your work on this.
> 

Quan

Comments

George Dunlap May 5, 2016, 11:02 a.m. UTC | #1
On 05/05/16 07:53, Xu, Quan wrote:
> On May 04, 2016 9:49 PM, George Dunlap <George.Dunlap@eu.citrix.com> wrote:
>> On Fri, Apr 29, 2016 at 10:25 AM, Quan Xu <quan.xu@intel.com> wrote:
>>> When IOMMU mapping is failed, we issue a best effort rollback,
>>> stopping IOMMU mapping, unmapping the previous IOMMU maps and
>> then
>>> reporting the error up to the call trees. When rollback is not
>>> feasible (in early initialization phase or trade-off of complexity)
>>> for the hardware domain, we do things on a best effort basis, only throwing
>> out an error message.
>>>
>>> IOMMU unmapping should perhaps continue despite an error, in an
>>> attempt to do best effort cleanup.
>>>
>>> Signed-off-by: Quan Xu <quan.xu@intel.com>
>>>
>>> CC: Keir Fraser <keir@xen.org>
>>> CC: Jan Beulich <jbeulich@suse.com>
>>> CC: Andrew Cooper <andrew.cooper3@citrix.com>
>>> CC: Jun Nakajima <jun.nakajima@intel.com>
>>> CC: Kevin Tian <kevin.tian@intel.com>
>>> CC: George Dunlap <george.dunlap@eu.citrix.com>
>>> ---
>>>  xen/arch/x86/mm.c               | 13 ++++++++-----
>>>  xen/arch/x86/mm/p2m-ept.c       | 27 +++++++++++++++++++++++++--
>>>  xen/arch/x86/mm/p2m-pt.c        | 24 ++++++++++++++++++++----
>>>  xen/arch/x86/mm/p2m.c           | 11 +++++++++--
>>>  xen/drivers/passthrough/iommu.c | 14 +++++++++++++-
>>>  5 files changed, 75 insertions(+), 14 deletions(-)
>>>
>>> diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c index
>>> a42097f..427097d 100644
>>> --- a/xen/arch/x86/mm.c
>>> +++ b/xen/arch/x86/mm.c
>>> @@ -2467,7 +2467,7 @@ static int __get_page_type(struct page_info
>> *page, unsigned long type,
>>>                             int preemptible)  {
>>>      unsigned long nx, x, y = page->u.inuse.type_info;
>>> -    int rc = 0;
>>> +    int rc = 0, ret = 0;
>>>
>>>      ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
>>>
>>> @@ -2578,11 +2578,11 @@ static int __get_page_type(struct page_info
>> *page, unsigned long type,
>>>          if ( d && is_pv_domain(d) && unlikely(need_iommu(d)) )
>>>          {
>>>              if ( (x & PGT_type_mask) == PGT_writable_page )
>>> -                iommu_unmap_page(d, mfn_to_gmfn(d, page_to_mfn(page)));
>>> +                ret = iommu_unmap_page(d, mfn_to_gmfn(d,
>> page_to_mfn(page)));
>>>              else if ( type == PGT_writable_page )
>>> -                iommu_map_page(d, mfn_to_gmfn(d, page_to_mfn(page)),
>>> -                               page_to_mfn(page),
>>> -                               IOMMUF_readable|IOMMUF_writable);
>>> +                ret = iommu_map_page(d, mfn_to_gmfn(d, page_to_mfn(page)),
>>> +                                     page_to_mfn(page),
>>> +                                     IOMMUF_readable|IOMMUF_writable);
>>>          }
>>>      }
>>>
>>> @@ -2599,6 +2599,9 @@ static int __get_page_type(struct page_info
>> *page, unsigned long type,
>>>      if ( (x & PGT_partial) && !(nx & PGT_partial) )
>>>          put_page(page);
>>>
>>> +    if ( !rc )
>>> +        rc = ret;
>>> +
>>>      return rc;
>>>  }
>>>
>>> diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c
>>> index 1ed5b47..df87944 100644
>>> --- a/xen/arch/x86/mm/p2m-ept.c
>>> +++ b/xen/arch/x86/mm/p2m-ept.c
>>> @@ -821,6 +821,8 @@ out:
>>>      if ( needs_sync )
>>>          ept_sync_domain(p2m);
>>>
>>> +    ret = 0;
>>> +
>>>      /* For host p2m, may need to change VT-d page table.*/
>>>      if ( rc == 0 && p2m_is_hostp2m(p2m) && need_iommu(d) &&
>>>           need_modify_vtd_table )
>>> @@ -831,11 +833,29 @@ out:
>>>          {
>>>              if ( iommu_flags )
>>>                  for ( i = 0; i < (1 << order); i++ )
>>> -                    iommu_map_page(d, gfn + i, mfn_x(mfn) + i, iommu_flags);
>>> +                {
>>> +                    rc = iommu_map_page(d, gfn + i, mfn_x(mfn) + i, iommu_flags);
>>> +
>>> +                    if ( !ret && unlikely(rc) )
>>> +                    {
>>> +                        while ( i-- )
>>> +                            iommu_unmap_page(d, gfn + i);
>>> +
>>> +                        ret = rc;
>>> +                        break;
>>> +                    }
>>> +                }
>>>              else
>>>                  for ( i = 0; i < (1 << order); i++ )
>>> -                    iommu_unmap_page(d, gfn + i);
>>> +                {
>>> +                    rc = iommu_unmap_page(d, gfn + i);
>>> +
>>> +                    if ( !ret && unlikely(rc) )
>>> +                        ret = rc;
>>> +                }
>>>          }
>>> +
>>> +        rc = 0;
>>>      }
>>
>> So the reason for doing this thing with setting ret=0, then using rc,
>> then setting rc to zero, is to make sure that the change is propagated
>> to the altp2m if necessary?
>>
>> I'm not a fan of this sort of "implied signalling".  The check at the
>> altp2m conditional is meant to say, "everything went fine, go ahead
>> and propagate the change".  But with your changes, that's not what we
>> want anymore -- we want, "If the change actually made it into the
>> hostp2m, propagate it to the altp2m."
>>
>> As such, I think it would be a lot clearer to just make a new boolean
>> variable, maybe "entry_written", which we initialize to false and then
>> set to true when the entry is actually written; and then change the
>> condition re the altp2m to "if ( entry_written && p2m_is_hostp2m(..)
>> )".
>>
>> Then I think you could make the ret / rc use mirror what you do
>> elsewhere, where ret is used to store the return value of the function
>> call, and it's propagated to rc only if rc is not already set.
>>
> 
> George,
> 
> Thanks for your detailed explanation. This seems an another matter of preference.
> Of course, I'd follow your suggestion in p2m  field, while I need to hear the other maintainers' options as well
> (especially when it comes from Kevin/Jan, as they do spend a lot of time for me).
> 
> A little bit of difference here, IMO, an int 'entry_written' is much better, as we also need  to propagate the 'entry_written' up to callers,
> i.e. the errors returned from atomic_write_ept_entry() are '-EINVAL', '-ESRCH' and '-EBUSY'. Then, the follow is my modification (feel free to correct me):

But it doesn't look like you're actually taking advantage of the fact
that entry_written is non-boolean -- other than immediately copying the
value into rc, all the places you're only checking if it's zero or not
(where "zero" in this case actually means "the entry was written").

The code you had before was, as far as I can tell, functionally correct.
 The point of introducing the extra variable is to decrease cognitive
load on people coming along to modify the code later.  To do that, you
want to maximize the things you do that are within expectation, and
minimize the things you do outside of that.

The name "entry_written" sounds like a boolean; developers will expect
"0 / false" to mean "not written" and "1/true" to mean "entry written".
 Furthermore, rc looks like a return value; if coders see "rc =
atomic_write_ept_entry()" they immediately have a framework for what's
going on; whereas if they see "entry_written == [...]; if
(entry_written) { ... rc= entry_written; }", it takes some thinking to
figure out.  Not much, but every little bit of unnecessary thinking adds up.

My suggestion remains to:
1. Declare entry_written as a bool, initialized to false
2. In both atomic_write_ept_entry() calls, use rc to collect the return
value
3. In the second one (the one which may fail), add an else {
entry_written = true; }
4. In the conditional deciding whether to update the iommu, use
"entry_written", I think.  At this point rc == 0 and entry_written =
true will be identical, but if we ever insert something in between, we
want the iommu update to be attempted any time the entry is successfully
written.
5. Use "if ( entry_written && ...)" when deciding whether to propagate
the change to the altp2m.

Thanks,
 -George
Quan Xu May 6, 2016, 2:40 a.m. UTC | #2
On May 05, 2016 7:03 PM, George Dunlap <george.dunlap@citrix.com> wrote:
> On 05/05/16 07:53, Xu, Quan wrote:

> > On May 04, 2016 9:49 PM, George Dunlap <George.Dunlap@eu.citrix.com>

> wrote:

> >> On Fri, Apr 29, 2016 at 10:25 AM, Quan Xu <quan.xu@intel.com> wrote:

> >>> When IOMMU mapping is failed, we issue a best effort rollback,

> >>> stopping IOMMU mapping, unmapping the previous IOMMU maps and

> >> then

> >>> reporting the error up to the call trees. When rollback is not

> >>> feasible (in early initialization phase or trade-off of complexity)

> >>> for the hardware domain, we do things on a best effort basis, only

> >>> throwing

> >> out an error message.

> >>>

> >>> IOMMU unmapping should perhaps continue despite an error, in an

> >>> attempt to do best effort cleanup.

> >>>

> >>> Signed-off-by: Quan Xu <quan.xu@intel.com>

> >>>

> >>> CC: Keir Fraser <keir@xen.org>

> >>> CC: Jan Beulich <jbeulich@suse.com>

> >>> CC: Andrew Cooper <andrew.cooper3@citrix.com>

> >>> CC: Jun Nakajima <jun.nakajima@intel.com>

> >>> CC: Kevin Tian <kevin.tian@intel.com>

> >>> CC: George Dunlap <george.dunlap@eu.citrix.com>

> >>> ---

> >>>  xen/arch/x86/mm.c               | 13 ++++++++-----

> >>>  xen/arch/x86/mm/p2m-ept.c       | 27 +++++++++++++++++++++++++--

> >>>  xen/arch/x86/mm/p2m-pt.c        | 24 ++++++++++++++++++++----

> >>>  xen/arch/x86/mm/p2m.c           | 11 +++++++++--

> >>>  xen/drivers/passthrough/iommu.c | 14 +++++++++++++-

> >>>  5 files changed, 75 insertions(+), 14 deletions(-)

> >>>

> >>> diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c index

> >>> a42097f..427097d 100644

> >>> --- a/xen/arch/x86/mm.c

> >>> +++ b/xen/arch/x86/mm.c

> >>> @@ -2467,7 +2467,7 @@ static int __get_page_type(struct page_info

> >> *page, unsigned long type,

> >>>                             int preemptible)  {

> >>>      unsigned long nx, x, y = page->u.inuse.type_info;

> >>> -    int rc = 0;

> >>> +    int rc = 0, ret = 0;

> >>>

> >>>      ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));

> >>>

> >>> @@ -2578,11 +2578,11 @@ static int __get_page_type(struct page_info

> >> *page, unsigned long type,

> >>>          if ( d && is_pv_domain(d) && unlikely(need_iommu(d)) )

> >>>          {

> >>>              if ( (x & PGT_type_mask) == PGT_writable_page )

> >>> -                iommu_unmap_page(d, mfn_to_gmfn(d, page_to_mfn(page)));

> >>> +                ret = iommu_unmap_page(d, mfn_to_gmfn(d,

> >> page_to_mfn(page)));

> >>>              else if ( type == PGT_writable_page )

> >>> -                iommu_map_page(d, mfn_to_gmfn(d, page_to_mfn(page)),

> >>> -                               page_to_mfn(page),

> >>> -                               IOMMUF_readable|IOMMUF_writable);

> >>> +                ret = iommu_map_page(d, mfn_to_gmfn(d,

> page_to_mfn(page)),

> >>> +                                     page_to_mfn(page),

> >>> +

> >>> + IOMMUF_readable|IOMMUF_writable);

> >>>          }

> >>>      }

> >>>

> >>> @@ -2599,6 +2599,9 @@ static int __get_page_type(struct page_info

> >> *page, unsigned long type,

> >>>      if ( (x & PGT_partial) && !(nx & PGT_partial) )

> >>>          put_page(page);

> >>>

> >>> +    if ( !rc )

> >>> +        rc = ret;

> >>> +

> >>>      return rc;

> >>>  }

> >>>

> >>> diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c

> >>> index 1ed5b47..df87944 100644

> >>> --- a/xen/arch/x86/mm/p2m-ept.c

> >>> +++ b/xen/arch/x86/mm/p2m-ept.c

> >>> @@ -821,6 +821,8 @@ out:

> >>>      if ( needs_sync )

> >>>          ept_sync_domain(p2m);

> >>>

> >>> +    ret = 0;

> >>> +

> >>>      /* For host p2m, may need to change VT-d page table.*/

> >>>      if ( rc == 0 && p2m_is_hostp2m(p2m) && need_iommu(d) &&

> >>>           need_modify_vtd_table )

> >>> @@ -831,11 +833,29 @@ out:

> >>>          {

> >>>              if ( iommu_flags )

> >>>                  for ( i = 0; i < (1 << order); i++ )

> >>> -                    iommu_map_page(d, gfn + i, mfn_x(mfn) + i, iommu_flags);

> >>> +                {

> >>> +                    rc = iommu_map_page(d, gfn + i, mfn_x(mfn) + i,

> >>> + iommu_flags);

> >>> +

> >>> +                    if ( !ret && unlikely(rc) )

> >>> +                    {

> >>> +                        while ( i-- )

> >>> +                            iommu_unmap_page(d, gfn + i);

> >>> +

> >>> +                        ret = rc;

> >>> +                        break;

> >>> +                    }

> >>> +                }

> >>>              else

> >>>                  for ( i = 0; i < (1 << order); i++ )

> >>> -                    iommu_unmap_page(d, gfn + i);

> >>> +                {

> >>> +                    rc = iommu_unmap_page(d, gfn + i);

> >>> +

> >>> +                    if ( !ret && unlikely(rc) )

> >>> +                        ret = rc;

> >>> +                }

> >>>          }

> >>> +

> >>> +        rc = 0;

> >>>      }

> >>

> >> So the reason for doing this thing with setting ret=0, then using rc,

> >> then setting rc to zero, is to make sure that the change is

> >> propagated to the altp2m if necessary?

> >>

> >> I'm not a fan of this sort of "implied signalling".  The check at the

> >> altp2m conditional is meant to say, "everything went fine, go ahead

> >> and propagate the change".  But with your changes, that's not what we

> >> want anymore -- we want, "If the change actually made it into the

> >> hostp2m, propagate it to the altp2m."

> >>

> >> As such, I think it would be a lot clearer to just make a new boolean

> >> variable, maybe "entry_written", which we initialize to false and

> >> then set to true when the entry is actually written; and then change

> >> the condition re the altp2m to "if ( entry_written &&

> >> p2m_is_hostp2m(..) )".

> >>

> >> Then I think you could make the ret / rc use mirror what you do

> >> elsewhere, where ret is used to store the return value of the

> >> function call, and it's propagated to rc only if rc is not already set.

> >>

> >

> > George,

> >

> > Thanks for your detailed explanation. This seems an another matter of

> preference.

> > Of course, I'd follow your suggestion in p2m  field, while I need to

> > hear the other maintainers' options as well (especially when it comes from

> Kevin/Jan, as they do spend a lot of time for me).

> >

> > A little bit of difference here, IMO, an int 'entry_written' is much

> > better, as we also need  to propagate the 'entry_written' up to callers, i.e. the

> errors returned from atomic_write_ept_entry() are '-EINVAL', '-ESRCH' and '-

> EBUSY'. Then, the follow is my modification (feel free to correct me):

> 

> But it doesn't look like you're actually taking advantage of the fact that

> entry_written is non-boolean -- other than immediately copying the value into

> rc, all the places you're only checking if it's zero or not (where "zero" in this

> case actually means "the entry was written").

> 

> The code you had before was, as far as I can tell, functionally correct.

>  The point of introducing the extra variable is to decrease cognitive load on

> people coming along to modify the code later.  To do that, you want to

> maximize the things you do that are within expectation, and minimize the

> things you do outside of that.

> 

> The name "entry_written" sounds like a boolean; developers will expect

> "0 / false" to mean "not written" and "1/true" to mean "entry written".

>  Furthermore, rc looks like a return value; if coders see "rc =

> atomic_write_ept_entry()" they immediately have a framework for what's

> going on; whereas if they see "entry_written == [...]; if

> (entry_written) { ... rc= entry_written; }", it takes some thinking to figure out.

> Not much, but every little bit of unnecessary thinking adds up.

> 

> My suggestion remains to:

> 1. Declare entry_written as a bool, initialized to false 2. In both

> atomic_write_ept_entry() calls, use rc to collect the return value 3. In the

> second one (the one which may fail), add an else { entry_written = true; } 4. In

> the conditional deciding whether to update the iommu, use "entry_written", I

> think.  At this point rc == 0 and entry_written = true will be identical, but if we

> ever insert something in between, we want the iommu update to be

> attempted any time the entry is successfully written.

> 5. Use "if ( entry_written && ...)" when deciding whether to propagate the

> change to the altp2m.

> 


George,
Thanks. It is very clear now, and I will follow it in the next v4.

Quan
diff mbox

Patch

--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -666,7 +666,7 @@  ept_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
     ept_entry_t *table, *ept_entry = NULL;
     unsigned long gfn_remainder = gfn;
     unsigned int i, target = order / EPT_TABLE_ORDER;
-    int ret, rc = 0;
+    int ret, rc = 0, entry_written = 0;
     bool_t direct_mmio = (p2mt == p2m_mmio_direct);
     uint8_t ipat = 0;
     bool_t need_modify_vtd_table = 1;
@@ -763,8 +763,8 @@  ept_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,

         /* now install the newly split ept sub-tree */
         /* NB: please make sure domian is paused and no in-fly VT-d DMA. */
-        rc = atomic_write_ept_entry(ept_entry, split_ept_entry, i);
-        ASSERT(rc == 0);
+        entry_written = atomic_write_ept_entry(ept_entry, split_ept_entry, i);
+        ASSERT(entry_written == 0);

         /* then move to the level we want to make real changes */
         for ( ; i > target; i-- )
@@ -809,9 +809,12 @@  ept_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
         new_entry.suppress_ve = is_epte_valid(&old_entry) ?
                                     old_entry.suppress_ve : 1;

-    rc = atomic_write_ept_entry(ept_entry, new_entry, target);
-    if ( unlikely(rc) )
+    entry_written = atomic_write_ept_entry(ept_entry, new_entry, target);
+    if ( unlikely(entry_written) )
+    {
         old_entry.epte = 0;
+        rc = entry_written;
+    }
     else if ( p2mt != p2m_invalid &&
               (gfn + (1UL << order) - 1 > p2m->max_mapped_pfn) )
         /* Track the highest gfn for which we have ever had a valid mapping */
@@ -822,7 +825,7 @@  out:
         ept_sync_domain(p2m);

     /* For host p2m, may need to change VT-d page table.*/
-    if ( rc == 0 && p2m_is_hostp2m(p2m) && need_iommu(d) &&
+    if ( entry_written == 0 && p2m_is_hostp2m(p2m) && need_iommu(d) &&
          need_modify_vtd_table )
     {
         if ( iommu_hap_pt_share )
@@ -831,10 +834,28 @@  out:
         {
             if ( iommu_flags )
                 for ( i = 0; i < (1 << order); i++ )
-                    iommu_map_page(d, gfn + i, mfn_x(mfn) + i, iommu_flags);
+                {
+                    ret = iommu_map_page(d, gfn + i, mfn_x(mfn) + i, iommu_flags);
+
+                    if ( unlikely(ret) )
+                    {
+                        while ( i-- )
+                            iommu_unmap_page(p2m->domain, gfn + i);
+
+                        if ( !rc )
+                            rc = ret;
+
+                        break;
+                    }
+                }
             else
                 for ( i = 0; i < (1 << order); i++ )
-                    iommu_unmap_page(d, gfn + i);
+                {
+                    ret = iommu_unmap_page(d, gfn + i);
+
+                    if ( !rc && unlikely(ret) )
+                        rc = ret;
+                }
         }
     }