diff mbox series

[for-next,2/3] RDMA/efa: Count mmap failures

Message ID 20200420062213.44577-3-galpress@amazon.com (mailing list archive)
State Mainlined
Commit eca5757f804f046dfaab4e9d3ea39af1f2523990
Delegated to: Leon Romanovsky
Headers show
Series EFA statistics minor updates | expand

Commit Message

Gal Pressman April 20, 2020, 6:22 a.m. UTC
Add a new stat that counts mmap failures, which might help when
debugging different issues.

Reviewed-by: Firas JahJah <firasj@amazon.com>
Reviewed-by: Yossi Leybovich <sleybo@amazon.com>
Signed-off-by: Gal Pressman <galpress@amazon.com>
---
 drivers/infiniband/hw/efa/efa.h       | 3 ++-
 drivers/infiniband/hw/efa/efa_verbs.c | 9 +++++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

Comments

Jason Gunthorpe April 24, 2020, 2:59 p.m. UTC | #1
On Mon, Apr 20, 2020 at 09:22:12AM +0300, Gal Pressman wrote:
> Add a new stat that counts mmap failures, which might help when
> debugging different issues.
> 
> Reviewed-by: Firas JahJah <firasj@amazon.com>
> Reviewed-by: Yossi Leybovich <sleybo@amazon.com>
> Signed-off-by: Gal Pressman <galpress@amazon.com>
>  drivers/infiniband/hw/efa/efa.h       | 3 ++-
>  drivers/infiniband/hw/efa/efa_verbs.c | 9 +++++++--
>  2 files changed, 9 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/infiniband/hw/efa/efa.h b/drivers/infiniband/hw/efa/efa.h
> index aa7396a1588a..77c9ff798117 100644
> +++ b/drivers/infiniband/hw/efa/efa.h
> @@ -1,6 +1,6 @@
>  /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
>  /*
> - * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
> + * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved.
>   */
>  
>  #ifndef _EFA_H_
> @@ -40,6 +40,7 @@ struct efa_sw_stats {
>  	atomic64_t reg_mr_err;
>  	atomic64_t alloc_ucontext_err;
>  	atomic64_t create_ah_err;
> +	atomic64_t mmap_err;
>  };
>  
>  /* Don't use anything other than atomic64 */
> diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c
> index b555845d6c14..75eef1ec2474 100644
> +++ b/drivers/infiniband/hw/efa/efa_verbs.c
> @@ -44,7 +44,8 @@ struct efa_user_mmap_entry {
>  	op(EFA_CREATE_CQ_ERR, "create_cq_err") \
>  	op(EFA_REG_MR_ERR, "reg_mr_err") \
>  	op(EFA_ALLOC_UCONTEXT_ERR, "alloc_ucontext_err") \
> -	op(EFA_CREATE_AH_ERR, "create_ah_err")
> +	op(EFA_CREATE_AH_ERR, "create_ah_err") \
> +	op(EFA_MMAP_ERR, "mmap_err")
>  
>  #define EFA_STATS_ENUM(ename, name) ename,
>  #define EFA_STATS_STR(ename, name) [ename] = name,
> @@ -1569,6 +1570,7 @@ static int __efa_mmap(struct efa_dev *dev, struct efa_ucontext *ucontext,
>  		ibdev_dbg(&dev->ibdev,
>  			  "pgoff[%#lx] does not have valid entry\n",
>  			  vma->vm_pgoff);
> +		atomic64_inc(&dev->stats.sw_stats.mmap_err);
>  		return -EINVAL;
>  	}
>  	entry = to_emmap(rdma_entry);
> @@ -1604,12 +1606,14 @@ static int __efa_mmap(struct efa_dev *dev, struct efa_ucontext *ucontext,
>  		err = -EINVAL;
>  	}
>  
> -	if (err)
> +	if (err) {
>  		ibdev_dbg(
>  			&dev->ibdev,
>  			"Couldn't mmap address[%#llx] length[%#zx] mmap_flag[%d] err[%d]\n",
>  			entry->address, rdma_entry->npages * PAGE_SIZE,
>  			entry->mmap_flag, err);
> +		atomic64_inc(&dev->stats.sw_stats.mmap_err);

Really? Isn't this something that is only possible with a buggy
rdma-core provider? Why count it?

Jason
Gal Pressman April 24, 2020, 3:25 p.m. UTC | #2
On 24/04/2020 17:59, Jason Gunthorpe wrote:
> On Mon, Apr 20, 2020 at 09:22:12AM +0300, Gal Pressman wrote:
>> Add a new stat that counts mmap failures, which might help when
>> debugging different issues.
>>
>> Reviewed-by: Firas JahJah <firasj@amazon.com>
>> Reviewed-by: Yossi Leybovich <sleybo@amazon.com>
>> Signed-off-by: Gal Pressman <galpress@amazon.com>
>>  drivers/infiniband/hw/efa/efa.h       | 3 ++-
>>  drivers/infiniband/hw/efa/efa_verbs.c | 9 +++++++--
>>  2 files changed, 9 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/infiniband/hw/efa/efa.h b/drivers/infiniband/hw/efa/efa.h
>> index aa7396a1588a..77c9ff798117 100644
>> +++ b/drivers/infiniband/hw/efa/efa.h
>> @@ -1,6 +1,6 @@
>>  /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
>>  /*
>> - * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
>> + * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved.
>>   */
>>  
>>  #ifndef _EFA_H_
>> @@ -40,6 +40,7 @@ struct efa_sw_stats {
>>  	atomic64_t reg_mr_err;
>>  	atomic64_t alloc_ucontext_err;
>>  	atomic64_t create_ah_err;
>> +	atomic64_t mmap_err;
>>  };
>>  
>>  /* Don't use anything other than atomic64 */
>> diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c
>> index b555845d6c14..75eef1ec2474 100644
>> +++ b/drivers/infiniband/hw/efa/efa_verbs.c
>> @@ -44,7 +44,8 @@ struct efa_user_mmap_entry {
>>  	op(EFA_CREATE_CQ_ERR, "create_cq_err") \
>>  	op(EFA_REG_MR_ERR, "reg_mr_err") \
>>  	op(EFA_ALLOC_UCONTEXT_ERR, "alloc_ucontext_err") \
>> -	op(EFA_CREATE_AH_ERR, "create_ah_err")
>> +	op(EFA_CREATE_AH_ERR, "create_ah_err") \
>> +	op(EFA_MMAP_ERR, "mmap_err")
>>  
>>  #define EFA_STATS_ENUM(ename, name) ename,
>>  #define EFA_STATS_STR(ename, name) [ename] = name,
>> @@ -1569,6 +1570,7 @@ static int __efa_mmap(struct efa_dev *dev, struct efa_ucontext *ucontext,
>>  		ibdev_dbg(&dev->ibdev,
>>  			  "pgoff[%#lx] does not have valid entry\n",
>>  			  vma->vm_pgoff);
>> +		atomic64_inc(&dev->stats.sw_stats.mmap_err);
>>  		return -EINVAL;
>>  	}
>>  	entry = to_emmap(rdma_entry);
>> @@ -1604,12 +1606,14 @@ static int __efa_mmap(struct efa_dev *dev, struct efa_ucontext *ucontext,
>>  		err = -EINVAL;
>>  	}
>>  
>> -	if (err)
>> +	if (err) {
>>  		ibdev_dbg(
>>  			&dev->ibdev,
>>  			"Couldn't mmap address[%#llx] length[%#zx] mmap_flag[%d] err[%d]\n",
>>  			entry->address, rdma_entry->npages * PAGE_SIZE,
>>  			entry->mmap_flag, err);
>> +		atomic64_inc(&dev->stats.sw_stats.mmap_err);
> 
> Really? Isn't this something that is only possible with a buggy
> rdma-core provider? Why count it?

Though unlikely, it could happen, otherwise this error flow wouldn't exist in
the first place.

If for some reason a customer app steps on a bug we're not aware of, this
counter could serve as a red flag.
Jason Gunthorpe April 24, 2020, 6:26 p.m. UTC | #3
On Fri, Apr 24, 2020 at 06:25:54PM +0300, Gal Pressman wrote:
> On 24/04/2020 17:59, Jason Gunthorpe wrote:
> > On Mon, Apr 20, 2020 at 09:22:12AM +0300, Gal Pressman wrote:
> >> Add a new stat that counts mmap failures, which might help when
> >> debugging different issues.
> >>
> >> Reviewed-by: Firas JahJah <firasj@amazon.com>
> >> Reviewed-by: Yossi Leybovich <sleybo@amazon.com>
> >> Signed-off-by: Gal Pressman <galpress@amazon.com>
> >>  drivers/infiniband/hw/efa/efa.h       | 3 ++-
> >>  drivers/infiniband/hw/efa/efa_verbs.c | 9 +++++++--
> >>  2 files changed, 9 insertions(+), 3 deletions(-)
> >>
> >> diff --git a/drivers/infiniband/hw/efa/efa.h b/drivers/infiniband/hw/efa/efa.h
> >> index aa7396a1588a..77c9ff798117 100644
> >> +++ b/drivers/infiniband/hw/efa/efa.h
> >> @@ -1,6 +1,6 @@
> >>  /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
> >>  /*
> >> - * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
> >> + * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved.
> >>   */
> >>  
> >>  #ifndef _EFA_H_
> >> @@ -40,6 +40,7 @@ struct efa_sw_stats {
> >>  	atomic64_t reg_mr_err;
> >>  	atomic64_t alloc_ucontext_err;
> >>  	atomic64_t create_ah_err;
> >> +	atomic64_t mmap_err;
> >>  };
> >>  
> >>  /* Don't use anything other than atomic64 */
> >> diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c
> >> index b555845d6c14..75eef1ec2474 100644
> >> +++ b/drivers/infiniband/hw/efa/efa_verbs.c
> >> @@ -44,7 +44,8 @@ struct efa_user_mmap_entry {
> >>  	op(EFA_CREATE_CQ_ERR, "create_cq_err") \
> >>  	op(EFA_REG_MR_ERR, "reg_mr_err") \
> >>  	op(EFA_ALLOC_UCONTEXT_ERR, "alloc_ucontext_err") \
> >> -	op(EFA_CREATE_AH_ERR, "create_ah_err")
> >> +	op(EFA_CREATE_AH_ERR, "create_ah_err") \
> >> +	op(EFA_MMAP_ERR, "mmap_err")
> >>  
> >>  #define EFA_STATS_ENUM(ename, name) ename,
> >>  #define EFA_STATS_STR(ename, name) [ename] = name,
> >> @@ -1569,6 +1570,7 @@ static int __efa_mmap(struct efa_dev *dev, struct efa_ucontext *ucontext,
> >>  		ibdev_dbg(&dev->ibdev,
> >>  			  "pgoff[%#lx] does not have valid entry\n",
> >>  			  vma->vm_pgoff);
> >> +		atomic64_inc(&dev->stats.sw_stats.mmap_err);
> >>  		return -EINVAL;
> >>  	}
> >>  	entry = to_emmap(rdma_entry);
> >> @@ -1604,12 +1606,14 @@ static int __efa_mmap(struct efa_dev *dev, struct efa_ucontext *ucontext,
> >>  		err = -EINVAL;
> >>  	}
> >>  
> >> -	if (err)
> >> +	if (err) {
> >>  		ibdev_dbg(
> >>  			&dev->ibdev,
> >>  			"Couldn't mmap address[%#llx] length[%#zx] mmap_flag[%d] err[%d]\n",
> >>  			entry->address, rdma_entry->npages * PAGE_SIZE,
> >>  			entry->mmap_flag, err);
> >> +		atomic64_inc(&dev->stats.sw_stats.mmap_err);
> > 
> > Really? Isn't this something that is only possible with a buggy
> > rdma-core provider? Why count it?
> 
> Though unlikely, it could happen, otherwise this error flow wouldn't exist in
> the first place.
> 
> If for some reason a customer app steps on a bug we're not aware of, this
> counter could serve as a red flag.

But there are lots of cases where a buggy provider can cause error
exits, why choose this one to count against all the others?

Jason
Gal Pressman April 26, 2020, 6:42 a.m. UTC | #4
On 24/04/2020 21:26, Jason Gunthorpe wrote:
> On Fri, Apr 24, 2020 at 06:25:54PM +0300, Gal Pressman wrote:
>> On 24/04/2020 17:59, Jason Gunthorpe wrote:
>>> On Mon, Apr 20, 2020 at 09:22:12AM +0300, Gal Pressman wrote:
>>>> Add a new stat that counts mmap failures, which might help when
>>>> debugging different issues.
>>>>
>>>> Reviewed-by: Firas JahJah <firasj@amazon.com>
>>>> Reviewed-by: Yossi Leybovich <sleybo@amazon.com>
>>>> Signed-off-by: Gal Pressman <galpress@amazon.com>
>>>>  drivers/infiniband/hw/efa/efa.h       | 3 ++-
>>>>  drivers/infiniband/hw/efa/efa_verbs.c | 9 +++++++--
>>>>  2 files changed, 9 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/drivers/infiniband/hw/efa/efa.h b/drivers/infiniband/hw/efa/efa.h
>>>> index aa7396a1588a..77c9ff798117 100644
>>>> +++ b/drivers/infiniband/hw/efa/efa.h
>>>> @@ -1,6 +1,6 @@
>>>>  /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
>>>>  /*
>>>> - * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
>>>> + * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved.
>>>>   */
>>>>  
>>>>  #ifndef _EFA_H_
>>>> @@ -40,6 +40,7 @@ struct efa_sw_stats {
>>>>  	atomic64_t reg_mr_err;
>>>>  	atomic64_t alloc_ucontext_err;
>>>>  	atomic64_t create_ah_err;
>>>> +	atomic64_t mmap_err;
>>>>  };
>>>>  
>>>>  /* Don't use anything other than atomic64 */
>>>> diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c
>>>> index b555845d6c14..75eef1ec2474 100644
>>>> +++ b/drivers/infiniband/hw/efa/efa_verbs.c
>>>> @@ -44,7 +44,8 @@ struct efa_user_mmap_entry {
>>>>  	op(EFA_CREATE_CQ_ERR, "create_cq_err") \
>>>>  	op(EFA_REG_MR_ERR, "reg_mr_err") \
>>>>  	op(EFA_ALLOC_UCONTEXT_ERR, "alloc_ucontext_err") \
>>>> -	op(EFA_CREATE_AH_ERR, "create_ah_err")
>>>> +	op(EFA_CREATE_AH_ERR, "create_ah_err") \
>>>> +	op(EFA_MMAP_ERR, "mmap_err")
>>>>  
>>>>  #define EFA_STATS_ENUM(ename, name) ename,
>>>>  #define EFA_STATS_STR(ename, name) [ename] = name,
>>>> @@ -1569,6 +1570,7 @@ static int __efa_mmap(struct efa_dev *dev, struct efa_ucontext *ucontext,
>>>>  		ibdev_dbg(&dev->ibdev,
>>>>  			  "pgoff[%#lx] does not have valid entry\n",
>>>>  			  vma->vm_pgoff);
>>>> +		atomic64_inc(&dev->stats.sw_stats.mmap_err);
>>>>  		return -EINVAL;
>>>>  	}
>>>>  	entry = to_emmap(rdma_entry);
>>>> @@ -1604,12 +1606,14 @@ static int __efa_mmap(struct efa_dev *dev, struct efa_ucontext *ucontext,
>>>>  		err = -EINVAL;
>>>>  	}
>>>>  
>>>> -	if (err)
>>>> +	if (err) {
>>>>  		ibdev_dbg(
>>>>  			&dev->ibdev,
>>>>  			"Couldn't mmap address[%#llx] length[%#zx] mmap_flag[%d] err[%d]\n",
>>>>  			entry->address, rdma_entry->npages * PAGE_SIZE,
>>>>  			entry->mmap_flag, err);
>>>> +		atomic64_inc(&dev->stats.sw_stats.mmap_err);
>>>
>>> Really? Isn't this something that is only possible with a buggy
>>> rdma-core provider? Why count it?
>>
>> Though unlikely, it could happen, otherwise this error flow wouldn't exist in
>> the first place.
>>
>> If for some reason a customer app steps on a bug we're not aware of, this
>> counter could serve as a red flag.
> 
> But there are lots of cases where a buggy provider can cause error
> exits, why choose this one to count against all the others?

It's not one against all others, most if not all of our userspace facing API
error flows have a similar counter.
And TBH, I think that the mmap flow is quite convoluted with the cookie response
from the crate verb, so it deserves a counter IMO.
Jason Gunthorpe April 26, 2020, 1:30 p.m. UTC | #5
On Sun, Apr 26, 2020 at 09:42:27AM +0300, Gal Pressman wrote:
> On 24/04/2020 21:26, Jason Gunthorpe wrote:
> > On Fri, Apr 24, 2020 at 06:25:54PM +0300, Gal Pressman wrote:
> >> On 24/04/2020 17:59, Jason Gunthorpe wrote:
> >>> On Mon, Apr 20, 2020 at 09:22:12AM +0300, Gal Pressman wrote:
> >>>> Add a new stat that counts mmap failures, which might help when
> >>>> debugging different issues.
> >>>>
> >>>> Reviewed-by: Firas JahJah <firasj@amazon.com>
> >>>> Reviewed-by: Yossi Leybovich <sleybo@amazon.com>
> >>>> Signed-off-by: Gal Pressman <galpress@amazon.com>
> >>>>  drivers/infiniband/hw/efa/efa.h       | 3 ++-
> >>>>  drivers/infiniband/hw/efa/efa_verbs.c | 9 +++++++--
> >>>>  2 files changed, 9 insertions(+), 3 deletions(-)
> >>>>
> >>>> diff --git a/drivers/infiniband/hw/efa/efa.h b/drivers/infiniband/hw/efa/efa.h
> >>>> index aa7396a1588a..77c9ff798117 100644
> >>>> +++ b/drivers/infiniband/hw/efa/efa.h
> >>>> @@ -1,6 +1,6 @@
> >>>>  /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
> >>>>  /*
> >>>> - * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
> >>>> + * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved.
> >>>>   */
> >>>>  
> >>>>  #ifndef _EFA_H_
> >>>> @@ -40,6 +40,7 @@ struct efa_sw_stats {
> >>>>  	atomic64_t reg_mr_err;
> >>>>  	atomic64_t alloc_ucontext_err;
> >>>>  	atomic64_t create_ah_err;
> >>>> +	atomic64_t mmap_err;
> >>>>  };
> >>>>  
> >>>>  /* Don't use anything other than atomic64 */
> >>>> diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c
> >>>> index b555845d6c14..75eef1ec2474 100644
> >>>> +++ b/drivers/infiniband/hw/efa/efa_verbs.c
> >>>> @@ -44,7 +44,8 @@ struct efa_user_mmap_entry {
> >>>>  	op(EFA_CREATE_CQ_ERR, "create_cq_err") \
> >>>>  	op(EFA_REG_MR_ERR, "reg_mr_err") \
> >>>>  	op(EFA_ALLOC_UCONTEXT_ERR, "alloc_ucontext_err") \
> >>>> -	op(EFA_CREATE_AH_ERR, "create_ah_err")
> >>>> +	op(EFA_CREATE_AH_ERR, "create_ah_err") \
> >>>> +	op(EFA_MMAP_ERR, "mmap_err")
> >>>>  
> >>>>  #define EFA_STATS_ENUM(ename, name) ename,
> >>>>  #define EFA_STATS_STR(ename, name) [ename] = name,
> >>>> @@ -1569,6 +1570,7 @@ static int __efa_mmap(struct efa_dev *dev, struct efa_ucontext *ucontext,
> >>>>  		ibdev_dbg(&dev->ibdev,
> >>>>  			  "pgoff[%#lx] does not have valid entry\n",
> >>>>  			  vma->vm_pgoff);
> >>>> +		atomic64_inc(&dev->stats.sw_stats.mmap_err);
> >>>>  		return -EINVAL;
> >>>>  	}
> >>>>  	entry = to_emmap(rdma_entry);
> >>>> @@ -1604,12 +1606,14 @@ static int __efa_mmap(struct efa_dev *dev, struct efa_ucontext *ucontext,
> >>>>  		err = -EINVAL;
> >>>>  	}
> >>>>  
> >>>> -	if (err)
> >>>> +	if (err) {
> >>>>  		ibdev_dbg(
> >>>>  			&dev->ibdev,
> >>>>  			"Couldn't mmap address[%#llx] length[%#zx] mmap_flag[%d] err[%d]\n",
> >>>>  			entry->address, rdma_entry->npages * PAGE_SIZE,
> >>>>  			entry->mmap_flag, err);
> >>>> +		atomic64_inc(&dev->stats.sw_stats.mmap_err);
> >>>
> >>> Really? Isn't this something that is only possible with a buggy
> >>> rdma-core provider? Why count it?
> >>
> >> Though unlikely, it could happen, otherwise this error flow wouldn't exist in
> >> the first place.
> >>
> >> If for some reason a customer app steps on a bug we're not aware of, this
> >> counter could serve as a red flag.
> > 
> > But there are lots of cases where a buggy provider can cause error
> > exits, why choose this one to count against all the others?
> 
> It's not one against all others, most if not all of our userspace facing API
> error flows have a similar counter.

Hurm, seems a bit strange, but OK

> And TBH, I think that the mmap flow is quite convoluted with the cookie response
> from the crate verb, so it deserves a counter IMO.

How so? Userspace takes the u64 from the command and pass it to mmap,
what is convoluted?

Jason
Gal Pressman April 26, 2020, 2:17 p.m. UTC | #6
On 26/04/2020 16:30, Jason Gunthorpe wrote:
> On Sun, Apr 26, 2020 at 09:42:27AM +0300, Gal Pressman wrote:
>> On 24/04/2020 21:26, Jason Gunthorpe wrote:
>>> On Fri, Apr 24, 2020 at 06:25:54PM +0300, Gal Pressman wrote:
>>>> On 24/04/2020 17:59, Jason Gunthorpe wrote:
>>>>> On Mon, Apr 20, 2020 at 09:22:12AM +0300, Gal Pressman wrote:
>>>>>> Add a new stat that counts mmap failures, which might help when
>>>>>> debugging different issues.
>>>>>>
>>>>>> Reviewed-by: Firas JahJah <firasj@amazon.com>
>>>>>> Reviewed-by: Yossi Leybovich <sleybo@amazon.com>
>>>>>> Signed-off-by: Gal Pressman <galpress@amazon.com>
>>>>>>  drivers/infiniband/hw/efa/efa.h       | 3 ++-
>>>>>>  drivers/infiniband/hw/efa/efa_verbs.c | 9 +++++++--
>>>>>>  2 files changed, 9 insertions(+), 3 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/infiniband/hw/efa/efa.h b/drivers/infiniband/hw/efa/efa.h
>>>>>> index aa7396a1588a..77c9ff798117 100644
>>>>>> +++ b/drivers/infiniband/hw/efa/efa.h
>>>>>> @@ -1,6 +1,6 @@
>>>>>>  /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
>>>>>>  /*
>>>>>> - * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
>>>>>> + * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved.
>>>>>>   */
>>>>>>  
>>>>>>  #ifndef _EFA_H_
>>>>>> @@ -40,6 +40,7 @@ struct efa_sw_stats {
>>>>>>  	atomic64_t reg_mr_err;
>>>>>>  	atomic64_t alloc_ucontext_err;
>>>>>>  	atomic64_t create_ah_err;
>>>>>> +	atomic64_t mmap_err;
>>>>>>  };
>>>>>>  
>>>>>>  /* Don't use anything other than atomic64 */
>>>>>> diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c
>>>>>> index b555845d6c14..75eef1ec2474 100644
>>>>>> +++ b/drivers/infiniband/hw/efa/efa_verbs.c
>>>>>> @@ -44,7 +44,8 @@ struct efa_user_mmap_entry {
>>>>>>  	op(EFA_CREATE_CQ_ERR, "create_cq_err") \
>>>>>>  	op(EFA_REG_MR_ERR, "reg_mr_err") \
>>>>>>  	op(EFA_ALLOC_UCONTEXT_ERR, "alloc_ucontext_err") \
>>>>>> -	op(EFA_CREATE_AH_ERR, "create_ah_err")
>>>>>> +	op(EFA_CREATE_AH_ERR, "create_ah_err") \
>>>>>> +	op(EFA_MMAP_ERR, "mmap_err")
>>>>>>  
>>>>>>  #define EFA_STATS_ENUM(ename, name) ename,
>>>>>>  #define EFA_STATS_STR(ename, name) [ename] = name,
>>>>>> @@ -1569,6 +1570,7 @@ static int __efa_mmap(struct efa_dev *dev, struct efa_ucontext *ucontext,
>>>>>>  		ibdev_dbg(&dev->ibdev,
>>>>>>  			  "pgoff[%#lx] does not have valid entry\n",
>>>>>>  			  vma->vm_pgoff);
>>>>>> +		atomic64_inc(&dev->stats.sw_stats.mmap_err);
>>>>>>  		return -EINVAL;
>>>>>>  	}
>>>>>>  	entry = to_emmap(rdma_entry);
>>>>>> @@ -1604,12 +1606,14 @@ static int __efa_mmap(struct efa_dev *dev, struct efa_ucontext *ucontext,
>>>>>>  		err = -EINVAL;
>>>>>>  	}
>>>>>>  
>>>>>> -	if (err)
>>>>>> +	if (err) {
>>>>>>  		ibdev_dbg(
>>>>>>  			&dev->ibdev,
>>>>>>  			"Couldn't mmap address[%#llx] length[%#zx] mmap_flag[%d] err[%d]\n",
>>>>>>  			entry->address, rdma_entry->npages * PAGE_SIZE,
>>>>>>  			entry->mmap_flag, err);
>>>>>> +		atomic64_inc(&dev->stats.sw_stats.mmap_err);
>>>>>
>>>>> Really? Isn't this something that is only possible with a buggy
>>>>> rdma-core provider? Why count it?
>>>>
>>>> Though unlikely, it could happen, otherwise this error flow wouldn't exist in
>>>> the first place.
>>>>
>>>> If for some reason a customer app steps on a bug we're not aware of, this
>>>> counter could serve as a red flag.
>>>
>>> But there are lots of cases where a buggy provider can cause error
>>> exits, why choose this one to count against all the others?
>>
>> It's not one against all others, most if not all of our userspace facing API
>> error flows have a similar counter.
> 
> Hurm, seems a bit strange, but OK
> 
>> And TBH, I think that the mmap flow is quite convoluted with the cookie response
>> from the crate verb, so it deserves a counter IMO.
> 
> How so? Userspace takes the u64 from the command and pass it to mmap,
> what is convoluted?

It's the only flow that involves two phases/userspace calls and not a single
command-response call, so it's a bit more error prone. Convoluted was a bit
harsh :).
diff mbox series

Patch

diff --git a/drivers/infiniband/hw/efa/efa.h b/drivers/infiniband/hw/efa/efa.h
index aa7396a1588a..77c9ff798117 100644
--- a/drivers/infiniband/hw/efa/efa.h
+++ b/drivers/infiniband/hw/efa/efa.h
@@ -1,6 +1,6 @@ 
 /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
 /*
- * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved.
  */
 
 #ifndef _EFA_H_
@@ -40,6 +40,7 @@  struct efa_sw_stats {
 	atomic64_t reg_mr_err;
 	atomic64_t alloc_ucontext_err;
 	atomic64_t create_ah_err;
+	atomic64_t mmap_err;
 };
 
 /* Don't use anything other than atomic64 */
diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c
index b555845d6c14..75eef1ec2474 100644
--- a/drivers/infiniband/hw/efa/efa_verbs.c
+++ b/drivers/infiniband/hw/efa/efa_verbs.c
@@ -44,7 +44,8 @@  struct efa_user_mmap_entry {
 	op(EFA_CREATE_CQ_ERR, "create_cq_err") \
 	op(EFA_REG_MR_ERR, "reg_mr_err") \
 	op(EFA_ALLOC_UCONTEXT_ERR, "alloc_ucontext_err") \
-	op(EFA_CREATE_AH_ERR, "create_ah_err")
+	op(EFA_CREATE_AH_ERR, "create_ah_err") \
+	op(EFA_MMAP_ERR, "mmap_err")
 
 #define EFA_STATS_ENUM(ename, name) ename,
 #define EFA_STATS_STR(ename, name) [ename] = name,
@@ -1569,6 +1570,7 @@  static int __efa_mmap(struct efa_dev *dev, struct efa_ucontext *ucontext,
 		ibdev_dbg(&dev->ibdev,
 			  "pgoff[%#lx] does not have valid entry\n",
 			  vma->vm_pgoff);
+		atomic64_inc(&dev->stats.sw_stats.mmap_err);
 		return -EINVAL;
 	}
 	entry = to_emmap(rdma_entry);
@@ -1604,12 +1606,14 @@  static int __efa_mmap(struct efa_dev *dev, struct efa_ucontext *ucontext,
 		err = -EINVAL;
 	}
 
-	if (err)
+	if (err) {
 		ibdev_dbg(
 			&dev->ibdev,
 			"Couldn't mmap address[%#llx] length[%#zx] mmap_flag[%d] err[%d]\n",
 			entry->address, rdma_entry->npages * PAGE_SIZE,
 			entry->mmap_flag, err);
+		atomic64_inc(&dev->stats.sw_stats.mmap_err);
+	}
 
 	rdma_user_mmap_entry_put(rdma_entry);
 	return err;
@@ -1758,6 +1762,7 @@  int efa_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats,
 	stats->value[EFA_REG_MR_ERR] = atomic64_read(&s->sw_stats.reg_mr_err);
 	stats->value[EFA_ALLOC_UCONTEXT_ERR] = atomic64_read(&s->sw_stats.alloc_ucontext_err);
 	stats->value[EFA_CREATE_AH_ERR] = atomic64_read(&s->sw_stats.create_ah_err);
+	stats->value[EFA_MMAP_ERR] = atomic64_read(&s->sw_stats.mmap_err);
 
 	return ARRAY_SIZE(efa_stats_names);
 }