diff mbox

IB/ipoib: Skip napi_schedule if ib_poll_cq fails

Message ID 1468402436-25053-1-git-send-email-yuval.shaia@oracle.com (mailing list archive)
State Superseded
Headers show

Commit Message

Yuval Shaia July 13, 2016, 9:33 a.m. UTC
To avoid entering an endless loop when the device can't poll a CQE from the CQ,
the driver should not reschedule if the error is not -EAGAIN.

Signed-off-by: Yuval Shaia <yuval.shaia@oracle.com>
---
 drivers/infiniband/ulp/ipoib/ipoib.h       |    5 +++++
 drivers/infiniband/ulp/ipoib/ipoib_ib.c    |   10 ++++++++++
 drivers/infiniband/ulp/ipoib/ipoib_verbs.c |    1 +
 3 files changed, 16 insertions(+), 0 deletions(-)

Comments

kernel test robot July 13, 2016, 10:15 a.m. UTC | #1
Hi,

[auto build test WARNING on rdma/master]
[also build test WARNING on v4.7-rc7 next-20160712]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Yuval-Shaia/IB-ipoib-Skip-napi_schedule-if-ib_poll_cq-fails/20160713-173838
base:   https://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdma.git master
config: ia64-defconfig (attached as .config)
compiler: ia64-linux-gcc (GCC) 4.9.0
reproduce:
        wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=ia64 

Note: it may well be a FALSE warning. FWIW you are at least aware of it now.
http://gcc.gnu.org/wiki/Better_Uninitialized_Warnings

All warnings (new ones prefixed by >>):

   drivers/infiniband/ulp/ipoib/ipoib_ib.c: In function 'ipoib_poll':
>> drivers/infiniband/ulp/ipoib/ipoib_ib.c:478:5: warning: 'n' may be used uninitialized in this function [-Wmaybe-uninitialized]
     if (unlikely(n < 0) && (n != -EAGAIN)) {
        ^

vim +/n +478 drivers/infiniband/ulp/ipoib/ipoib_ib.c

   462				struct ib_wc *wc = priv->ibwc + i;
   463	
   464				if (wc->wr_id & IPOIB_OP_RECV) {
   465					++done;
   466					if (wc->wr_id & IPOIB_OP_CM)
   467						ipoib_cm_handle_rx_wc(dev, wc);
   468					else
   469						ipoib_ib_handle_rx_wc(dev, wc);
   470				} else
   471					ipoib_cm_handle_tx_wc(priv->dev, wc);
   472			}
   473	
   474			if (n != t)
   475				break;
   476		}
   477	
 > 478		if (unlikely(n < 0) && (n != -EAGAIN)) {
   479			if (priv->recv_conseq_cq_errs++ >= IPOIB_MAX_CONSEQ_CQ_ERR) {
   480				ipoib_crit(priv,
   481					   "Too many poll_cq errors, last error: %d\n",
   482					   n);
   483				return done;
   484			}
   485		} else
   486			priv->recv_conseq_cq_errs = 0;

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
Jason Gunthorpe July 13, 2016, 5:47 p.m. UTC | #2
On Wed, Jul 13, 2016 at 02:33:56AM -0700, Yuval Shaia wrote:
> To avoid entering into endless loop when device can't poll CQE from CQ
> driver should not reschedule if error is not -EAGAIN.

?? what causes ib_poll_cq to return an error?

You need to describe the motivation here.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Yuval Shaia July 13, 2016, 7:12 p.m. UTC | #3
On Wed, Jul 13, 2016 at 11:47:42AM -0600, Jason Gunthorpe wrote:
> On Wed, Jul 13, 2016 at 02:33:56AM -0700, Yuval Shaia wrote:
> > To avoid entering into endless loop when device can't poll CQE from CQ
> > driver should not reschedule if error is not -EAGAIN.
> 
> ?? what causes ib_poll_cq to return an error?
> 
> You need to describe the motivation here.

EAGAIN is fine - the HW driver returns this to indicate a temporary error and
the caller should retry.
However, other errors (such as EINVAL) may refer to some fatal error where
HW driver is unable to recover from.

Two examples:
- Mellanox folks may comment for example if the case where
  __mlx4_qp_lookup() returns NULL in function mlx4_ib_poll_one() means
  fatal or not.
- At least by reading the code of c4iw_poll_cq_one() it is clear that it may
  return a fatal error.

We must leave some exit point to HW driver to indicate a fatal and
unrecoverable state.

> 
> Jason
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe July 13, 2016, 7:25 p.m. UTC | #4
On Wed, Jul 13, 2016 at 10:12:25PM +0300, Yuval Shaia wrote:
> On Wed, Jul 13, 2016 at 11:47:42AM -0600, Jason Gunthorpe wrote:
> > On Wed, Jul 13, 2016 at 02:33:56AM -0700, Yuval Shaia wrote:
> > > To avoid entering into endless loop when device can't poll CQE from CQ
> > > driver should not reschedule if error is not -EAGAIN.
> > 
> > ?? what causes ib_poll_cq to return an error?
> > 
> > You need to describe the motivation here.
> 
> EAGAIN is fine - HW driver returns this to indicates temporary error and
> caller should retry again.
> However, other errors (such as EINVAL) may refer to some fatal error where
> HW driver is unable to recover from.

So you've never seen this?

I question the sanity of a poll_cq implementation that can return a
hard error...

> Two examples:
> - Mellanox folks may comment for example if the case where
>   __mlx4_qp_lookup() returns NULL in function mlx4_ib_poll_one() means
>   fatal or not.
> - At least by reading the of c4iw_poll_cq_one() it is clear that it may
>   return fatal error.

If EAGAIN should be ignored, then all other errors indicate the CQ is
dead and needs to be reconstructed. So the approach in this patch to
add a 'recv_conseq_cq_errs' is nonsense. You need to trigger some kind
of restart of the QP instead.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Yuval Shaia July 13, 2016, 7:50 p.m. UTC | #5
On Wed, Jul 13, 2016 at 01:25:04PM -0600, Jason Gunthorpe wrote:
> On Wed, Jul 13, 2016 at 10:12:25PM +0300, Yuval Shaia wrote:
> > On Wed, Jul 13, 2016 at 11:47:42AM -0600, Jason Gunthorpe wrote:
> > > On Wed, Jul 13, 2016 at 02:33:56AM -0700, Yuval Shaia wrote:
> > > > To avoid entering into endless loop when device can't poll CQE from CQ
> > > > driver should not reschedule if error is not -EAGAIN.
> > > 
> > > ?? what causes ib_poll_cq to return an error?
> > > 
> > > You need to describe the motivation here.
> > 
> > EAGAIN is fine - HW driver returns this to indicates temporary error and
> > caller should retry again.
> > However, other errors (such as EINVAL) may refer to some fatal error where
> > HW driver is unable to recover from.
> 
> So you've never seen this?

Waiting for real use case might take some time.

> 
> I question the sanity of a poll_cq implementation that can return a
> hard error...
> 
> > Two examples:
> > - Mellanox folks may comment for example if the case where
> >   __mlx4_qp_lookup() returns NULL in function mlx4_ib_poll_one() means
> >   fatal or not.
> > - At least by reading the of c4iw_poll_cq_one() it is clear that it may
> >   return fatal error.
> 
> If EAGAIN should be ignored, then all other errors indicate the CQ is
> dead and needs to be reconstructed. So the approach in this patch to
> add a 'recv_conseq_cq_errs' is nonsense. You need to trigger some kind
> of restart of the QP instead.

The idea behind the consecutive-error counter is not a recovery mechanism; it is
just some way to "retry" for a while before declaring game over. I do not have a
strong opinion on that; actually my first try was without it, i.e. 'kill' on
the first error.

The patch does not offer any recovery mechanism; it simply prints a fatal error to
the console and exits NAPI. This fatal error will suggest that the admin reload the
driver or something like that.
This takes us to "recovery-mechanism" :)
I'm not sure that restarting the QP will help as the error is while reading
the CQ and restarting the CQ is more or less like restarting the driver.

> 
> Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Santosh Shilimkar July 13, 2016, 8:46 p.m. UTC | #6
On 7/13/2016 12:50 PM, Yuval Shaia wrote:
> On Wed, Jul 13, 2016 at 01:25:04PM -0600, Jason Gunthorpe wrote:
>> On Wed, Jul 13, 2016 at 10:12:25PM +0300, Yuval Shaia wrote:
>>> On Wed, Jul 13, 2016 at 11:47:42AM -0600, Jason Gunthorpe wrote:
>>>> On Wed, Jul 13, 2016 at 02:33:56AM -0700, Yuval Shaia wrote:
>>>>> To avoid entering into endless loop when device can't poll CQE from CQ
>>>>> driver should not reschedule if error is not -EAGAIN.
>>>>
>>>> ?? what causes ib_poll_cq to return an error?
>>>>
>>>> You need to describe the motivation here.
>>>
>>> EAGAIN is fine - HW driver returns this to indicates temporary error and
>>> caller should retry again.
>>> However, other errors (such as EINVAL) may refer to some fatal error where
>>> HW driver is unable to recover from.
>>
>> So you've never seen this?
>
> Waiting for real use case might take some time.
>
>>
>> I question the sanity of a poll_cq implementation that can return a
>> hard error...
>>
>>> Two examples:
>>> - Mellanox folks may comment for example if the case where
>>>   __mlx4_qp_lookup() returns NULL in function mlx4_ib_poll_one() means
>>>   fatal or not.
>>> - At least by reading the of c4iw_poll_cq_one() it is clear that it may
>>>   return fatal error.
>>
>> If EAGAIN should be ignored, then all other errors indicate the CQ is
>> dead and needs to be reconstructed. So the approach in this patch to
>> add a 'recv_conseq_cq_errs' is nonsense. You need to trigger some kind
>> of restart of the QP instead.
>
> The idea behind consec counter is not a recovery mechanism it is just some
> why to "retry" for a while just before declaring game over. I do not have
> strong opinion on that, actually my first try was w/o it, i.e. 'kill' on
> the first error.
>
> Patch does not offer any recovery mechanism it simply print fatal error to
> console and exit NAPI. This fatal error will suggest admin to reload the
> driver or something like that.
> This takes us to "recovery-mechanism" :)
> I'm not sure that restarting the QP will help as the error is while reading
> the CQ and restarting the CQ is more or less like restarting the driver.
>
Probably Jason meant destroying the problematic CQ and creating a new one.
This is what Haakon suggested as well, but it will lead to a leak
and also a possible issue with outstanding WCs getting lost without
being flushed on that CQ.


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe July 13, 2016, 8:53 p.m. UTC | #7
On Wed, Jul 13, 2016 at 01:46:07PM -0700, Santosh Shilimkar wrote:
> >Patch does not offer any recovery mechanism it simply print fatal error to
> >console and exit NAPI. This fatal error will suggest admin to reload the
> >driver or something like that.
> >This takes us to "recovery-mechanism" :)
> >I'm not sure that restarting the QP will help as the error is while reading
> >the CQ and restarting the CQ is more or less like restarting the driver.
> >
> Probably Jason mean destroy the problematic CQ and create a new one. This is
> what Haakon suggested as well but it will lead to the leak
> and also possible issue with outstanding WC's getting lost without
> being flushed on that CQ.

Again, this seems crazy.

What failure mode does a CQ have that does not require a full driver
restart after?

Drivers that hard fail a CQ poll should declare themselves dead and
require a full restart to recover. This is the same infrastructure
that was added to the mlx drivers to handle other forms of hard
errors.

This was the same direction we went in for the _destroy functions.

poll_cq should only return -EAGAIN or success (and EAGAIN seems
fairly strange, how is that different from returning 0 wcs?)

If there is some actual reason preventing a driver from implementing
that kind of API lets hear it. Otherwise I recommend you focus patches
on fixing any broken drivers and documenting this requirement.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Yuval Shaia July 14, 2016, 5:50 a.m. UTC | #8
On Wed, Jul 13, 2016 at 01:46:07PM -0700, Santosh Shilimkar wrote:
> 
> On 7/13/2016 12:50 PM, Yuval Shaia wrote:
> >On Wed, Jul 13, 2016 at 01:25:04PM -0600, Jason Gunthorpe wrote:
> >>On Wed, Jul 13, 2016 at 10:12:25PM +0300, Yuval Shaia wrote:
> >>>On Wed, Jul 13, 2016 at 11:47:42AM -0600, Jason Gunthorpe wrote:
> >>>>On Wed, Jul 13, 2016 at 02:33:56AM -0700, Yuval Shaia wrote:
> >>>>>To avoid entering into endless loop when device can't poll CQE from CQ
> >>>>>driver should not reschedule if error is not -EAGAIN.
> >>>>
> >>>>?? what causes ib_poll_cq to return an error?
> >>>>
> >>>>You need to describe the motivation here.
> >>>
> >>>EAGAIN is fine - HW driver returns this to indicates temporary error and
> >>>caller should retry again.
> >>>However, other errors (such as EINVAL) may refer to some fatal error where
> >>>HW driver is unable to recover from.
> >>
> >>So you've never seen this?
> >
> >Waiting for real use case might take some time.
> >
> >>
> >>I question the sanity of a poll_cq implementation that can return a
> >>hard error...
> >>
> >>>Two examples:
> >>>- Mellanox folks may comment for example if the case where
> >>>  __mlx4_qp_lookup() returns NULL in function mlx4_ib_poll_one() means
> >>>  fatal or not.
> >>>- At least by reading the of c4iw_poll_cq_one() it is clear that it may
> >>>  return fatal error.
> >>
> >>If EAGAIN should be ignored, then all other errors indicate the CQ is
> >>dead and needs to be reconstructed. So the approach in this patch to
> >>add a 'recv_conseq_cq_errs' is nonsense. You need to trigger some kind
> >>of restart of the QP instead.
> >
> >The idea behind consec counter is not a recovery mechanism it is just some
> >why to "retry" for a while just before declaring game over. I do not have
> >strong opinion on that, actually my first try was w/o it, i.e. 'kill' on
> >the first error.
> >
> >Patch does not offer any recovery mechanism it simply print fatal error to
> >console and exit NAPI. This fatal error will suggest admin to reload the
> >driver or something like that.
> >This takes us to "recovery-mechanism" :)
> >I'm not sure that restarting the QP will help as the error is while reading
> >the CQ and restarting the CQ is more or less like restarting the driver.
> >
> Probably Jason mean destroy the problematic CQ and create a new one.
> This is what Haakon suggested as well but it will lead to the leak
> and also possible issue with outstanding WC's getting lost without
> being flushed on that CQ.

It is not only the leak.
This CQ serves many QPs (== many connections). Destroying it seems to me as
catastrophic as reloading the driver.

> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Santosh Shilimkar July 14, 2016, 4:50 p.m. UTC | #9
On 7/13/2016 10:50 PM, Yuval Shaia wrote:
> On Wed, Jul 13, 2016 at 01:46:07PM -0700, Santosh Shilimkar wrote:
>>
>> On 7/13/2016 12:50 PM, Yuval Shaia wrote:
>>> On Wed, Jul 13, 2016 at 01:25:04PM -0600, Jason Gunthorpe wrote:
>>>> On Wed, Jul 13, 2016 at 10:12:25PM +0300, Yuval Shaia wrote:
>>>>> On Wed, Jul 13, 2016 at 11:47:42AM -0600, Jason Gunthorpe wrote:
>>>>>> On Wed, Jul 13, 2016 at 02:33:56AM -0700, Yuval Shaia wrote:
>>>>>>> To avoid entering into endless loop when device can't poll CQE from CQ
>>>>>>> driver should not reschedule if error is not -EAGAIN.
>>>>>>
>>>>>> ?? what causes ib_poll_cq to return an error?
>>>>>>
>>>>>> You need to describe the motivation here.
>>>>>
>>>>> EAGAIN is fine - HW driver returns this to indicates temporary error and
>>>>> caller should retry again.
>>>>> However, other errors (such as EINVAL) may refer to some fatal error where
>>>>> HW driver is unable to recover from.
>>>>
>>>> So you've never seen this?
>>>
>>> Waiting for real use case might take some time.
>>>
>>>>
>>>> I question the sanity of a poll_cq implementation that can return a
>>>> hard error...
>>>>
>>>>> Two examples:
>>>>> - Mellanox folks may comment for example if the case where
>>>>>  __mlx4_qp_lookup() returns NULL in function mlx4_ib_poll_one() means
>>>>>  fatal or not.
>>>>> - At least by reading the of c4iw_poll_cq_one() it is clear that it may
>>>>>  return fatal error.
>>>>
>>>> If EAGAIN should be ignored, then all other errors indicate the CQ is
>>>> dead and needs to be reconstructed. So the approach in this patch to
>>>> add a 'recv_conseq_cq_errs' is nonsense. You need to trigger some kind
>>>> of restart of the QP instead.
>>>
>>> The idea behind consec counter is not a recovery mechanism it is just some
>>> why to "retry" for a while just before declaring game over. I do not have
>>> strong opinion on that, actually my first try was w/o it, i.e. 'kill' on
>>> the first error.
>>>
>>> Patch does not offer any recovery mechanism it simply print fatal error to
>>> console and exit NAPI. This fatal error will suggest admin to reload the
>>> driver or something like that.
>>> This takes us to "recovery-mechanism" :)
>>> I'm not sure that restarting the QP will help as the error is while reading
>>> the CQ and restarting the CQ is more or less like restarting the driver.
>>>
>> Probably Jason mean destroy the problematic CQ and create a new one.
>> This is what Haakon suggested as well but it will lead to the leak
>> and also possible issue with outstanding WC's getting lost without
>> being flushed on that CQ.
>
> It is not only leak.
> This CQ serve many QP (== many connections). Destroying it seems to me
> catastrophic as reloading the driver.
>
Fair enough.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Haakon Bugge July 14, 2016, 5:12 p.m. UTC | #10
> On 13 Jul 2016, at 22:53, Jason Gunthorpe <jgunthorpe@obsidianresearch.com> wrote:
> 
> On Wed, Jul 13, 2016 at 01:46:07PM -0700, Santosh Shilimkar wrote:
>>> Patch does not offer any recovery mechanism it simply print fatal error to
>>> console and exit NAPI. This fatal error will suggest admin to reload the
>>> driver or something like that.
>>> This takes us to "recovery-mechanism" :)
>>> I'm not sure that restarting the QP will help as the error is while reading
>>> the CQ and restarting the CQ is more or less like restarting the driver.
>>> 
>> Probably Jason mean destroy the problematic CQ and create a new one. This is
>> what Haakon suggested as well but it will lead to the leak
>> and also possible issue with outstanding WC's getting lost without
>> being flushed on that CQ.
> 
> Again, this seems crazy.
> 
> What failure mode does a CQ have that does not require a full driver
> restart after?

The OFED API clearly indicates that ib_poll_cq() might fail and return an error status.

This API also conforms to the IBTA specification, section 11.4.2.1 Poll for Completion:

“If an immediate error, associated with executing the Poll CQ verb itself, is returned, the CQ and QP state shall not be affected.”

> Drivers that hard fail a CQ poll should declare themselves dead and
> require a full restart to recover. This is the same infrastructure
> that was added to the mlx drivers to handle other forms of hard
> errors.

Yes, but this is at the discretion of the driver. Provider drivers are free to have a BUG_ON() instead of a return -EINVAL. But that is not the case. And, if there is an intermittent error return (e.g., EAGAIN), I take it that we can agree that the driver shall not declare itself dead.

Now, for drivers that can fault isolate a CQ error to the CQ and affected QPs, as supported by the API, should be allowed to offer that possibility in my opinion. Errors do happen, although this kind of errors happen seldom. There are different failover models that can be used to overcome such an error, whether it is intermittent or hard.

> This was the same direction we went in for the _destroy functions.
> 
> poll_cq should only return -EAGAIN or success (and EAGAIN seems
> fairly strange, how is that different from returning 0 wcs?)

Well, now we are past the creation time of the OFED API and the IBTA standard, and you propose to change them. That is OK by itself, but it is orthogonal to this commit.

I see -EINVAL returned from poll_cq() from ehca and mlx4 for example.

> If there is some actual reason preventing a driver from implementing
> that kind of API lets hear it. Otherwise I recommend you focus patches
> on fixing any broken drivers and documenting this requirement.

Well, handling legitimate and possible error returns from HCA drivers is honourable enough in my opinion.



Håkon


> 
> Jason
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe July 14, 2016, 5:34 p.m. UTC | #11
On Thu, Jul 14, 2016 at 07:12:48PM +0200, Håkon Bugge wrote:
> >> Probably Jason mean destroy the problematic CQ and create a new one. This is
> >> what Haakon suggested as well but it will lead to the leak
> >> and also possible issue with outstanding WC's getting lost without
> >> being flushed on that CQ.
> > 
> > Again, this seems crazy.
> > 
> > What failure mode does a CQ have that does not require a full driver
> > restart after?
> 
> The OFED API clearly indicates that ib_poll_cq() might fail and
> return an error status.
> 
> This API is also conforms with the IBTA specification, section
> 11.4.2.1 Poll for Completion:

Irrelevant. This is the kAPI, it is not defined by anyone except the
kernel community.

We have made mistakes in its design in the past (e.g. see the _destroy
functions that used to return errors) and when those mistakes are
discovered they need to be fixed robustly across the whole kernel.

The approach in our subsystem for handling serious 'should never
happen' errors is to trigger a full device restart & recovery.

IMHO all CQ related errors at poll time fall into that category.

If you have a concrete counter example please share it.

> > Drivers that hard fail a CQ poll should declare themselves dead and
> > require a full restart to recover. This is the same infrastructure
> > that was added to the mlx drivers to handle other forms of hard
> > errors.
> 
> Yes, but this is at the discretion of the driver. Provider drivers
> are free to have a BUG_ON() instead of a return -EINVAL. But that is
> not the case. And, if there is an intermittent error return (e.g.,
> EAGAIN), I take it that we can agree that the driver shall not
> declare itself dead.

Giving the driver this discretion was the design error. The ULPs are
not prepared to handle this case, and any driver that returns an error
here is likely going to trigger kernel bugs elsewhere.

> Now, for drivers that can fault isolate a CQ error to the CQ and
> affected QPs, as supported by the API, should be allowed to offer
> that possibility in my opinion. Errors do happen, although this kind
> of errors happen seldom. There are different failover models that
> can be used to overcome such an error, whether it is intermittent or
> hard.

Again, that is not the recovery model we have in our subsystem.

Is this actually a real case? What scenario could possibly result in a
CQ localized polling error?

If a device really supports some kind of fine grained error detection
then it needs to use established mechanisms -> move the impacted QPs
to an error state and deliver a CQE. It is up to the driver to ensure
that this can happen without cq poll failure, not every ULP.

> I see -EINVAL returned from poll_cq() from ehca and mlx4 for example.

I'm not disputing there is a problem here, only that this patch
represents an acceptable fix.

For instance you can argue we should handle CQ isolated errors. Fine,
but this patch doesn't do that. It leaves IPoIB in a broken state if
such an error occurs. It doesn't represent an audit of all poll users
to check if they handle errors properly (I doubt they do, since this
patch doesn't even properly fix ipoib)

Thus, in my view, the shortest path to overall kernel correctness is
to follow our subsystem standard, and trigger a device restart in the
driver and ban failure of CQ poll in the kAPI.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 4f7d9b4..1946149 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -116,6 +116,8 @@  enum {
 	IPOIB_NON_CHILD		  = 0,
 	IPOIB_LEGACY_CHILD	  = 1,
 	IPOIB_RTNL_CHILD	  = 2,
+
+	IPOIB_MAX_CONSEQ_CQ_ERR	  = 10,
 };
 
 #define	IPOIB_OP_RECV   (1ul << 31)
@@ -347,6 +349,7 @@  struct ipoib_dev_priv {
 	u16		  pkey_index;
 	struct ib_pd	 *pd;
 	struct ib_cq	 *recv_cq;
+	int		  recv_conseq_cq_errs;
 	struct ib_cq	 *send_cq;
 	struct ib_qp	 *qp;
 	u32		  qkey;
@@ -772,6 +775,8 @@  static inline void ipoib_unregister_debugfs(void) { }
 	printk(level "%s: " format, ((struct ipoib_dev_priv *) priv)->dev->name , ## arg)
 #define ipoib_warn(priv, format, arg...)		\
 	ipoib_printk(KERN_WARNING, priv, format , ## arg)
+#define ipoib_crit(priv, format, arg...)		\
+	ipoib_printk(KERN_CRIT, priv, format , ## arg)
 
 extern int ipoib_sendq_size;
 extern int ipoib_recvq_size;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index dc6d241..171a425 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -475,6 +475,16 @@  poll_more:
 			break;
 	}
 
+	if (unlikely(n < 0) && (n != -EAGAIN)) {
+		if (priv->recv_conseq_cq_errs++ >= IPOIB_MAX_CONSEQ_CQ_ERR) {
+			ipoib_crit(priv,
+				   "Too many poll_cq errors, last error: %d\n",
+				   n);
+			return done;
+		}
+	} else
+		priv->recv_conseq_cq_errs = 0;
+
 	if (done < budget) {
 		napi_complete(napi);
 		if (unlikely(ib_req_notify_cq(priv->recv_cq,
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
index 1e7cbba..6ca8bdd 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -181,6 +181,7 @@  int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
 		printk(KERN_WARNING "%s: failed to create receive CQ\n", ca->name);
 		goto out_cm_dev_cleanup;
 	}
+	priv->recv_conseq_cq_errs = 0;
 
 	cq_attr.cqe = ipoib_sendq_size;
 	priv->send_cq = ib_create_cq(priv->ca, ipoib_send_comp_handler, NULL,