diff mbox series

[rdma-core] ibacm: Fix improper refcnt

Message ID 20181012171301.2437913-1-Haakon.Bugge@oracle.com (mailing list archive)
State Not Applicable
Headers show
Series [rdma-core] ibacm: Fix improper refcnt | expand

Commit Message

Haakon Bugge Oct. 12, 2018, 5:13 p.m. UTC
In the default provider, acmp, the port's sa_dest.refcnt is
initialized to one in acmp_init_dest(). When a port associated with
the device is added, the refcnt is set once again to one in
acmp_port_up().

For a system with VFs enabled, the ports for both the PF and the VFs
are added. Here an extract from the log file on a system with a single
VF enabled:

    acmp_port_up: mlx4_0 1
    acmp_port_up: mlx4_0 1 is up
    acmp_port_up: mlx4_0 1
    acmp_port_up: mlx4_0 1 is up

Now, when/if this ports goes down, acmp_port_down() will be called. It
decrements the refcnt and waits endlessly until it becomes zero. For
the first call to acmp_port_down(), it becomes zero, but the
subsequent call will loop forever in said busy-waiting loop, due to
the refcnt becoming minus one.

Fixed by changing the atomic_set() to atomic_inc() in acmp_port_up()
and test for one instead of the while loop in acmp_port_down().

Also added printing of instance number to ease debugging.

With the fix, the extract from the log file yields:

    acmp_port_up: mlx4_0 1
    acmp_port_up: mlx4_0 1 1 is up
    acmp_port_up: mlx4_0 1
    acmp_port_up: mlx4_0 1 2 is up
    acmp_port_down: mlx4_0 1
    acmp_port_down: mlx4_0 1 2 is down
    acmp_port_down: mlx4_0 1
    acmp_port_down: mlx4_0 1 1 is down

Signed-off-by: Håkon Bugge <haakon.bugge@oracle.com>
---
 ibacm/prov/acmp/src/acmp.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

Comments

Haakon Bugge Oct. 16, 2018, 10:58 a.m. UTC | #1
On 12 Oct 2018, at 19:13, Håkon Bugge <haakon.bugge@oracle.com> wrote:
> 
> In the default provider, acmp, the port's sa_dest.refcnt is
> initialized to one in acmp_init_dest(). When a port associated with
> the device is added, the refcnt is set once again to one in
> acmp_port_up().
> 
> For a system with VFs enabled, the ports for both the PF and the VFs
> are added. Here an extract from the log file on a system with a single
> VF enabled:
> 
>    acmp_port_up: mlx4_0 1
>    acmp_port_up: mlx4_0 1 is up
>    acmp_port_up: mlx4_0 1
>    acmp_port_up: mlx4_0 1 is up
> 
> Now, when/if this ports goes down, acmp_port_down() will be called. It
> decrements the refcnt and waits endlessly until it becomes zero. For
> the first call to acmp_port_down(), it becomes zero, but the
> subsequent call will loop forever in said busy-waiting loop, due to
> the refcnt becoming minus one.
> 
> Fixed by changing the atomic_set() to atomic_inc() in acmp_port_up()
> and test for one instead of the while loop in acmp_port_down().
> 
> Also added printing of instance number to ease debugging.
> 
> With the fix, the extract from the log file yields:
> 
>    acmp_port_up: mlx4_0 1
>    acmp_port_up: mlx4_0 1 1 is up
>    acmp_port_up: mlx4_0 1
>    acmp_port_up: mlx4_0 1 2 is up
>    acmp_port_down: mlx4_0 1
>    acmp_port_down: mlx4_0 1 2 is down
>    acmp_port_down: mlx4_0 1
>    acmp_port_down: mlx4_0 1 1 is down
> 
> Signed-off-by: Håkon Bugge <haakon.bugge@oracle.com>
> ---
> ibacm/prov/acmp/src/acmp.c | 21 ++++++++++++---------
> 1 file changed, 12 insertions(+), 9 deletions(-)
> 
> diff --git a/ibacm/prov/acmp/src/acmp.c b/ibacm/prov/acmp/src/acmp.c
> index 0324ae2f..76c52fcb 100644
> --- a/ibacm/prov/acmp/src/acmp.c
> +++ b/ibacm/prov/acmp/src/acmp.c
> @@ -2618,6 +2618,7 @@ static void acmp_port_up(struct acmp_port *port)
> 	__be16 pkey_be;
> 	__be16 sm_lid;
> 	int i, ret;
> +	int instance;
> 
> 	acm_log(1, "%s %d\n", port->dev->verbs->device->name, port->port_num);
> 	ret = ibv_query_port(port->dev->verbs, port->port_num, &attr);
> @@ -2643,7 +2644,7 @@ static void acmp_port_up(struct acmp_port *port)
> 	acmp_set_dest_addr(&port->sa_dest, ACM_ADDRESS_LID,
> 			   (uint8_t *) &sm_lid, sizeof(sm_lid));
> 
> -	atomic_set(&port->sa_dest.refcnt, 1);
> +	instance = atomic_inc(&port->sa_dest.refcnt) - 1;
> 	port->sa_dest.state = ACMP_READY;
> 	for (i = 0; i < attr.pkey_tbl_len; i++) {
> 		ret = ibv_query_pkey(port->dev->verbs, port->port_num, i, &pkey_be);
> @@ -2663,11 +2664,13 @@ static void acmp_port_up(struct acmp_port *port)
> 	}
> 
> 	port->state = IBV_PORT_ACTIVE;
> -	acm_log(1, "%s %d is up\n", port->dev->verbs->device->name, port->port_num);
> +	acm_log(1, "%s %d %d is up\n", port->dev->verbs->device->name, port->port_num, instance);
> }
> 
> static void acmp_port_down(struct acmp_port *port)
> {
> +	int instance;
> +
> 	acm_log(1, "%s %d\n", port->dev->verbs->device->name, port->port_num);
> 	pthread_mutex_lock(&port->lock);
> 	port->state = IBV_PORT_DOWN;
> @@ -2678,13 +2681,13 @@ static void acmp_port_down(struct acmp_port *port)
> 	 * event instead of a sleep loop, but it's not worth it given how
> 	 * infrequently we should be processing a port down event in practice.
> 	 */
> -	atomic_dec(&port->sa_dest.refcnt);
> -	while (atomic_get(&port->sa_dest.refcnt))
> -		sleep(0);
> -	pthread_mutex_lock(&port->sa_dest.lock);
> -	port->sa_dest.state = ACMP_INIT;
> -	pthread_mutex_unlock(&port->sa_dest.lock);
> -	acm_log(1, "%s %d is down\n", port->dev->verbs->device->name, port->port_num);
> +	instance = atomic_dec(&port->sa_dest.refcnt);
> +	if (instance == 1) {
> +		pthread_mutex_lock(&port->sa_dest.lock);
> +		port->sa_dest.state = ACMP_INIT;
> +		pthread_mutex_unlock(&port->sa_dest.lock);
> +	}
> +	acm_log(1, "%s %d %d is down\n", port->dev->verbs->device->name, port->port_num, instance);
> }
> 
> static int acmp_open_port(const struct acm_port *cport, void *dev_context,
> -- 
> 2.14.3
> 

Just made a github pull request out of this.


Thxs, Håkon
Haakon Bugge Oct. 22, 2018, 11:19 a.m. UTC | #2
> On 16 Oct 2018, at 12:58, Håkon Bugge <haakon.bugge@oracle.com> wrote:
> 
> On 12 Oct 2018, at 19:13, Håkon Bugge <haakon.bugge@oracle.com> wrote:
>> 
>> In the default provider, acmp, the port's sa_dest.refcnt is
>> initialized to one in acmp_init_dest(). When a port associated with
>> the device is added, the refcnt is set once again to one in
>> acmp_port_up().
>> 
>> For a system with VFs enabled, the ports for both the PF and the VFs
>> are added. Here an extract from the log file on a system with a single
>> VF enabled:
>> 
>>   acmp_port_up: mlx4_0 1
>>   acmp_port_up: mlx4_0 1 is up
>>   acmp_port_up: mlx4_0 1
>>   acmp_port_up: mlx4_0 1 is up
>> 
>> Now, when/if this ports goes down, acmp_port_down() will be called. It
>> decrements the refcnt and waits endlessly until it becomes zero. For
>> the first call to acmp_port_down(), it becomes zero, but the
>> subsequent call will loop forever in said busy-waiting loop, due to
>> the refcnt becoming minus one.
>> 
>> Fixed by changing the atomic_set() to atomic_inc() in acmp_port_up()
>> and test for one instead of the while loop in acmp_port_down().
>> 
>> Also added printing of instance number to ease debugging.
>> 
>> With the fix, the extract from the log file yields:
>> 
>>   acmp_port_up: mlx4_0 1
>>   acmp_port_up: mlx4_0 1 1 is up
>>   acmp_port_up: mlx4_0 1
>>   acmp_port_up: mlx4_0 1 2 is up
>>   acmp_port_down: mlx4_0 1
>>   acmp_port_down: mlx4_0 1 2 is down
>>   acmp_port_down: mlx4_0 1
>>   acmp_port_down: mlx4_0 1 1 is down
>> 
>> Signed-off-by: Håkon Bugge <haakon.bugge@oracle.com>
>> ---
>> ibacm/prov/acmp/src/acmp.c | 21 ++++++++++++---------
>> 1 file changed, 12 insertions(+), 9 deletions(-)
>> 
>> diff --git a/ibacm/prov/acmp/src/acmp.c b/ibacm/prov/acmp/src/acmp.c
>> index 0324ae2f..76c52fcb 100644
>> --- a/ibacm/prov/acmp/src/acmp.c
>> +++ b/ibacm/prov/acmp/src/acmp.c
>> @@ -2618,6 +2618,7 @@ static void acmp_port_up(struct acmp_port *port)
>> 	__be16 pkey_be;
>> 	__be16 sm_lid;
>> 	int i, ret;
>> +	int instance;
>> 
>> 	acm_log(1, "%s %d\n", port->dev->verbs->device->name, port->port_num);
>> 	ret = ibv_query_port(port->dev->verbs, port->port_num, &attr);
>> @@ -2643,7 +2644,7 @@ static void acmp_port_up(struct acmp_port *port)
>> 	acmp_set_dest_addr(&port->sa_dest, ACM_ADDRESS_LID,
>> 			   (uint8_t *) &sm_lid, sizeof(sm_lid));
>> 
>> -	atomic_set(&port->sa_dest.refcnt, 1);
>> +	instance = atomic_inc(&port->sa_dest.refcnt) - 1;
>> 	port->sa_dest.state = ACMP_READY;
>> 	for (i = 0; i < attr.pkey_tbl_len; i++) {
>> 		ret = ibv_query_pkey(port->dev->verbs, port->port_num, i, &pkey_be);
>> @@ -2663,11 +2664,13 @@ static void acmp_port_up(struct acmp_port *port)
>> 	}
>> 
>> 	port->state = IBV_PORT_ACTIVE;
>> -	acm_log(1, "%s %d is up\n", port->dev->verbs->device->name, port->port_num);
>> +	acm_log(1, "%s %d %d is up\n", port->dev->verbs->device->name, port->port_num, instance);
>> }
>> 
>> static void acmp_port_down(struct acmp_port *port)
>> {
>> +	int instance;
>> +
>> 	acm_log(1, "%s %d\n", port->dev->verbs->device->name, port->port_num);
>> 	pthread_mutex_lock(&port->lock);
>> 	port->state = IBV_PORT_DOWN;
>> @@ -2678,13 +2681,13 @@ static void acmp_port_down(struct acmp_port *port)
>> 	 * event instead of a sleep loop, but it's not worth it given how
>> 	 * infrequently we should be processing a port down event in practice.
>> 	 */
>> -	atomic_dec(&port->sa_dest.refcnt);
>> -	while (atomic_get(&port->sa_dest.refcnt))
>> -		sleep(0);
>> -	pthread_mutex_lock(&port->sa_dest.lock);
>> -	port->sa_dest.state = ACMP_INIT;
>> -	pthread_mutex_unlock(&port->sa_dest.lock);
>> -	acm_log(1, "%s %d is down\n", port->dev->verbs->device->name, port->port_num);
>> +	instance = atomic_dec(&port->sa_dest.refcnt);
>> +	if (instance == 1) {
>> +		pthread_mutex_lock(&port->sa_dest.lock);
>> +		port->sa_dest.state = ACMP_INIT;
>> +		pthread_mutex_unlock(&port->sa_dest.lock);
>> +	}
>> +	acm_log(1, "%s %d %d is down\n", port->dev->verbs->device->name, port->port_num, instance);
>> }
>> 
>> static int acmp_open_port(const struct acm_port *cport, void *dev_context,
>> -- 
>> 2.14.3
>> 
> 
> Just made a github pull request out of this.

Anything more required from my side on:

https://github.com/linux-rdma/rdma-core/pull/400
?

Thxs, Håkon
Jason Gunthorpe Oct. 22, 2018, 8:02 p.m. UTC | #3
On Mon, Oct 22, 2018 at 01:19:03PM +0200, Håkon Bugge wrote:
> 
> 
> > On 16 Oct 2018, at 12:58, Håkon Bugge <haakon.bugge@oracle.com> wrote:
> > 
> > On 12 Oct 2018, at 19:13, Håkon Bugge <haakon.bugge@oracle.com> wrote:
> >> 
> >> In the default provider, acmp, the port's sa_dest.refcnt is
> >> initialized to one in acmp_init_dest(). When a port associated with
> >> the device is added, the refcnt is set once again to one in
> >> acmp_port_up().
> >> 
> >> For a system with VFs enabled, the ports for both the PF and the VFs
> >> are added. Here an extract from the log file on a system with a single
> >> VF enabled:
> >> 
> >>   acmp_port_up: mlx4_0 1
> >>   acmp_port_up: mlx4_0 1 is up
> >>   acmp_port_up: mlx4_0 1
> >>   acmp_port_up: mlx4_0 1 is up
> >> 
> >> Now, when/if this ports goes down, acmp_port_down() will be called. It
> >> decrements the refcnt and waits endlessly until it becomes zero. For
> >> the first call to acmp_port_down(), it becomes zero, but the
> >> subsequent call will loop forever in said busy-waiting loop, due to
> >> the refcnt becoming minus one.
> >> 
> >> Fixed by changing the atomic_set() to atomic_inc() in acmp_port_up()
> >> and test for one instead of the while loop in acmp_port_down().
> >> 
> >> Also added printing of instance number to ease debugging.
> >> 
> >> With the fix, the extract from the log file yields:
> >> 
> >>   acmp_port_up: mlx4_0 1
> >>   acmp_port_up: mlx4_0 1 1 is up
> >>   acmp_port_up: mlx4_0 1
> >>   acmp_port_up: mlx4_0 1 2 is up
> >>   acmp_port_down: mlx4_0 1
> >>   acmp_port_down: mlx4_0 1 2 is down
> >>   acmp_port_down: mlx4_0 1
> >>   acmp_port_down: mlx4_0 1 1 is down
> >> 
> >> Signed-off-by: Håkon Bugge <haakon.bugge@oracle.com>
> >> ibacm/prov/acmp/src/acmp.c | 21 ++++++++++++---------
> >> 1 file changed, 12 insertions(+), 9 deletions(-)
> >> 
> >> diff --git a/ibacm/prov/acmp/src/acmp.c b/ibacm/prov/acmp/src/acmp.c
> >> index 0324ae2f..76c52fcb 100644
> >> +++ b/ibacm/prov/acmp/src/acmp.c
> >> @@ -2618,6 +2618,7 @@ static void acmp_port_up(struct acmp_port *port)
> >> 	__be16 pkey_be;
> >> 	__be16 sm_lid;
> >> 	int i, ret;
> >> +	int instance;
> >> 
> >> 	acm_log(1, "%s %d\n", port->dev->verbs->device->name, port->port_num);
> >> 	ret = ibv_query_port(port->dev->verbs, port->port_num, &attr);
> >> @@ -2643,7 +2644,7 @@ static void acmp_port_up(struct acmp_port *port)
> >> 	acmp_set_dest_addr(&port->sa_dest, ACM_ADDRESS_LID,
> >> 			   (uint8_t *) &sm_lid, sizeof(sm_lid));
> >> 
> >> -	atomic_set(&port->sa_dest.refcnt, 1);
> >> +	instance = atomic_inc(&port->sa_dest.refcnt) - 1;
> >> 	port->sa_dest.state = ACMP_READY;
> >> 	for (i = 0; i < attr.pkey_tbl_len; i++) {
> >> 		ret = ibv_query_pkey(port->dev->verbs, port->port_num, i, &pkey_be);
> >> @@ -2663,11 +2664,13 @@ static void acmp_port_up(struct acmp_port *port)
> >> 	}
> >> 
> >> 	port->state = IBV_PORT_ACTIVE;
> >> -	acm_log(1, "%s %d is up\n", port->dev->verbs->device->name, port->port_num);
> >> +	acm_log(1, "%s %d %d is up\n", port->dev->verbs->device->name, port->port_num, instance);
> >> }
> >> 
> >> static void acmp_port_down(struct acmp_port *port)
> >> {
> >> +	int instance;
> >> +
> >> 	acm_log(1, "%s %d\n", port->dev->verbs->device->name, port->port_num);
> >> 	pthread_mutex_lock(&port->lock);
> >> 	port->state = IBV_PORT_DOWN;
> >> @@ -2678,13 +2681,13 @@ static void acmp_port_down(struct acmp_port *port)
> >> 	 * event instead of a sleep loop, but it's not worth it given how
> >> 	 * infrequently we should be processing a port down event in practice.
> >> 	 */
> >> -	atomic_dec(&port->sa_dest.refcnt);
> >> -	while (atomic_get(&port->sa_dest.refcnt))
> >> -		sleep(0);
> >> -	pthread_mutex_lock(&port->sa_dest.lock);
> >> -	port->sa_dest.state = ACMP_INIT;
> >> -	pthread_mutex_unlock(&port->sa_dest.lock);
> >> -	acm_log(1, "%s %d is down\n", port->dev->verbs->device->name, port->port_num);
> >> +	instance = atomic_dec(&port->sa_dest.refcnt);
> >> +	if (instance == 1) {
> >> +		pthread_mutex_lock(&port->sa_dest.lock);
> >> +		port->sa_dest.state = ACMP_INIT;
> >> +		pthread_mutex_unlock(&port->sa_dest.lock);
> >> +	}
> >> +	acm_log(1, "%s %d %d is down\n", port->dev->verbs->device->name, port->port_num, instance);
> >> }
> >> 
> >> static int acmp_open_port(const struct acm_port *cport, void *dev_context,
> > 
> > Just made a github pull request out of this.
> 
> Anything more required from my side on:
> 
> https://github.com/linux-rdma/rdma-core/pull/400
> ?

Someone who cares about ACM should review it..

Jason
Haakon Bugge Oct. 31, 2018, 11:37 a.m. UTC | #4
Hi,

Somewhat confused. We have in ibacm/include/acm_mad.h:

struct ib_sa_mad {
	[]
        __be16 status;
	[]
};

In ibacm/prov/acmp/src/acmp.c, acmp_process_join_resp():

        if (mad->status) {
                acm_log(0, "ERROR - join response status 0x%x\n", mad->status);
                goto out;
        }

So, I thought, simple enough, the above is an endian conversion bug, and the code should read:

        if (be16toh(mad->status)) {
                acm_log(0, "ERROR - join response status 0x%x\n", be16toh(mad->status));
                goto out;
        }

On a real system, I saw:

acmp_process_join_resp: ERROR - join response status 0x2

So, since the conversion is missing, the status value is actually 0x200.

Looking at IBTA, MAD Common Status Field Bit Values, status 0x200 is (bit #0) not busy, (bit #1) no redirection, (bits #2-4) no invalid fields, and a Class Specific status of 2, which from Table SA MAD Class-Specific Status Encodings is ERR_REQ_INVALID.

Is this the correct interpretation? If so, should the status print divide the Common and Class-Specific in two? Or are we only interested in the Class-Specific error here?


Thxs, Håkon
Leon Romanovsky Oct. 31, 2018, 11:49 a.m. UTC | #5
On Tue, Oct 23, 2018 at 01:57:50PM +0200, Håkon Bugge wrote:
>
>
> > On 22 Oct 2018, at 22:02, Jason Gunthorpe <jgg@ziepe.ca> wrote:
> >
> > On Mon, Oct 22, 2018 at 01:19:03PM +0200, Håkon Bugge wrote:
> >>
> >>
> >>> On 16 Oct 2018, at 12:58, Håkon Bugge <haakon.bugge@oracle.com> wrote:
> >>>
> >>> On 12 Oct 2018, at 19:13, Håkon Bugge <haakon.bugge@oracle.com> wrote:
> >>>>
> >>>> In the default provider, acmp, the port's sa_dest.refcnt is
> >>>> initialized to one in acmp_init_dest(). When a port associated with
> >>>> the device is added, the refcnt is set once again to one in
> >>>> acmp_port_up().
> >>>>
> >>>> For a system with VFs enabled, the ports for both the PF and the VFs
> >>>> are added. Here an extract from the log file on a system with a single
> >>>> VF enabled:
> >>>>
> >>>>  acmp_port_up: mlx4_0 1
> >>>>  acmp_port_up: mlx4_0 1 is up
> >>>>  acmp_port_up: mlx4_0 1
> >>>>  acmp_port_up: mlx4_0 1 is up
> >>>>
> >>>> Now, when/if this ports goes down, acmp_port_down() will be called. It
> >>>> decrements the refcnt and waits endlessly until it becomes zero. For
> >>>> the first call to acmp_port_down(), it becomes zero, but the
> >>>> subsequent call will loop forever in said busy-waiting loop, due to
> >>>> the refcnt becoming minus one.
> >>>>
> >>>> Fixed by changing the atomic_set() to atomic_inc() in acmp_port_up()
> >>>> and test for one instead of the while loop in acmp_port_down().
> >>>>
> >>>> Also added printing of instance number to ease debugging.
> >>>>
> >>>> With the fix, the extract from the log file yields:
> >>>>
> >>>>  acmp_port_up: mlx4_0 1
> >>>>  acmp_port_up: mlx4_0 1 1 is up
> >>>>  acmp_port_up: mlx4_0 1
> >>>>  acmp_port_up: mlx4_0 1 2 is up
> >>>>  acmp_port_down: mlx4_0 1
> >>>>  acmp_port_down: mlx4_0 1 2 is down
> >>>>  acmp_port_down: mlx4_0 1
> >>>>  acmp_port_down: mlx4_0 1 1 is down
> >>>>
> >>>> Signed-off-by: Håkon Bugge <haakon.bugge@oracle.com>
> >>>> ibacm/prov/acmp/src/acmp.c | 21 ++++++++++++---------
> >>>> 1 file changed, 12 insertions(+), 9 deletions(-)
> >>>>
> >>>> diff --git a/ibacm/prov/acmp/src/acmp.c b/ibacm/prov/acmp/src/acmp.c
> >>>> index 0324ae2f..76c52fcb 100644
> >>>> +++ b/ibacm/prov/acmp/src/acmp.c
> >>>> @@ -2618,6 +2618,7 @@ static void acmp_port_up(struct acmp_port *port)
> >>>> 	__be16 pkey_be;
> >>>> 	__be16 sm_lid;
> >>>> 	int i, ret;
> >>>> +	int instance;
> >>>>
> >>>> 	acm_log(1, "%s %d\n", port->dev->verbs->device->name, port->port_num);
> >>>> 	ret = ibv_query_port(port->dev->verbs, port->port_num, &attr);
> >>>> @@ -2643,7 +2644,7 @@ static void acmp_port_up(struct acmp_port *port)
> >>>> 	acmp_set_dest_addr(&port->sa_dest, ACM_ADDRESS_LID,
> >>>> 			   (uint8_t *) &sm_lid, sizeof(sm_lid));
> >>>>
> >>>> -	atomic_set(&port->sa_dest.refcnt, 1);
> >>>> +	instance = atomic_inc(&port->sa_dest.refcnt) - 1;
> >>>> 	port->sa_dest.state = ACMP_READY;
> >>>> 	for (i = 0; i < attr.pkey_tbl_len; i++) {
> >>>> 		ret = ibv_query_pkey(port->dev->verbs, port->port_num, i, &pkey_be);
> >>>> @@ -2663,11 +2664,13 @@ static void acmp_port_up(struct acmp_port *port)
> >>>> 	}
> >>>>
> >>>> 	port->state = IBV_PORT_ACTIVE;
> >>>> -	acm_log(1, "%s %d is up\n", port->dev->verbs->device->name, port->port_num);
> >>>> +	acm_log(1, "%s %d %d is up\n", port->dev->verbs->device->name, port->port_num, instance);
> >>>> }
> >>>>
> >>>> static void acmp_port_down(struct acmp_port *port)
> >>>> {
> >>>> +	int instance;
> >>>> +
> >>>> 	acm_log(1, "%s %d\n", port->dev->verbs->device->name, port->port_num);
> >>>> 	pthread_mutex_lock(&port->lock);
> >>>> 	port->state = IBV_PORT_DOWN;
> >>>> @@ -2678,13 +2681,13 @@ static void acmp_port_down(struct acmp_port *port)
> >>>> 	 * event instead of a sleep loop, but it's not worth it given how
> >>>> 	 * infrequently we should be processing a port down event in practice.
> >>>> 	 */
> >>>> -	atomic_dec(&port->sa_dest.refcnt);
> >>>> -	while (atomic_get(&port->sa_dest.refcnt))
> >>>> -		sleep(0);
> >>>> -	pthread_mutex_lock(&port->sa_dest.lock);
> >>>> -	port->sa_dest.state = ACMP_INIT;
> >>>> -	pthread_mutex_unlock(&port->sa_dest.lock);
> >>>> -	acm_log(1, "%s %d is down\n", port->dev->verbs->device->name, port->port_num);
> >>>> +	instance = atomic_dec(&port->sa_dest.refcnt);
> >>>> +	if (instance == 1) {
> >>>> +		pthread_mutex_lock(&port->sa_dest.lock);
> >>>> +		port->sa_dest.state = ACMP_INIT;
> >>>> +		pthread_mutex_unlock(&port->sa_dest.lock);
> >>>> +	}
> >>>> +	acm_log(1, "%s %d %d is down\n", port->dev->verbs->device->name, port->port_num, instance);
> >>>> }
> >>>>
> >>>> static int acmp_open_port(const struct acm_port *cport, void *dev_context,
> >>>
> >>> Just made a github pull request out of this.
> >>
> >> Anything more required from my side on:
> >>
> >> https://github.com/linux-rdma/rdma-core/pull/400 <https://github.com/linux-rdma/rdma-core/pull/400>
> >> ?
> >
> > Someone who cares about ACM should review it..
>
> Would an orcl employee (from a different continent than me) suffice?
>

Håkon

It is better than nothing, like we have now.

Thanks

>
> Thxs, Håkon
>
> >
> > Jason
>
Hal Rosenstock Oct. 31, 2018, 12:22 p.m. UTC | #6
On 10/31/2018 7:37 AM, Håkon Bugge wrote:
> Hi,
> 
> Somewhat confused. We have in ibacm/include/acm_mad.h:
> 
> struct ib_sa_mad {
> 	[]
>         __be16 status;
> 	[]
> };
> 
> In ibacm/prov/acmp/src/acmp.c, acmp_process_join_resp():
> 
>         if (mad->status) {
>                 acm_log(0, "ERROR - join response status 0x%x\n", mad->status);
>                 goto out;
>         }
> 
> So, I thought, simple enough, the above is an endian conversion bug, and the code should read:
> 
>         if (be16toh(mad->status)) {
>                 acm_log(0, "ERROR - join response status 0x%x\n", be16toh(mad->status));
>                 goto out;
>         }
> 
> On a real system, I saw:
> 
> acmp_process_join_resp: ERROR - join response status 0x2
> 
> So, since the conversion is missing, the status value is actually 0x200.
> 
> Looking at IBTA, MAD Common Status Field Bit Values, status 0x200 is (bit #0) not busy, (bit #1) no redirection, (bits #2-4) no invalid fields, and a Class Specific status of 2, which from Table SA MAD Class-Specific Status Encodings is ERR_REQ_INVALID.
> 
> Is this the correct interpretation? 

Yes.

> If so, should the status print divide the Common and Class-Specific in two? Or are we only interested in the Class-Specific error here?

I think it should handle both common and SA class specific errors here.

-- Hal

> 
> Thxs, Håkon
> 
> 
>
diff mbox series

Patch

diff --git a/ibacm/prov/acmp/src/acmp.c b/ibacm/prov/acmp/src/acmp.c
index 0324ae2f..76c52fcb 100644
--- a/ibacm/prov/acmp/src/acmp.c
+++ b/ibacm/prov/acmp/src/acmp.c
@@ -2618,6 +2618,7 @@  static void acmp_port_up(struct acmp_port *port)
 	__be16 pkey_be;
 	__be16 sm_lid;
 	int i, ret;
+	int instance;
 
 	acm_log(1, "%s %d\n", port->dev->verbs->device->name, port->port_num);
 	ret = ibv_query_port(port->dev->verbs, port->port_num, &attr);
@@ -2643,7 +2644,7 @@  static void acmp_port_up(struct acmp_port *port)
 	acmp_set_dest_addr(&port->sa_dest, ACM_ADDRESS_LID,
 			   (uint8_t *) &sm_lid, sizeof(sm_lid));
 
-	atomic_set(&port->sa_dest.refcnt, 1);
+	instance = atomic_inc(&port->sa_dest.refcnt) - 1;
 	port->sa_dest.state = ACMP_READY;
 	for (i = 0; i < attr.pkey_tbl_len; i++) {
 		ret = ibv_query_pkey(port->dev->verbs, port->port_num, i, &pkey_be);
@@ -2663,11 +2664,13 @@  static void acmp_port_up(struct acmp_port *port)
 	}
 
 	port->state = IBV_PORT_ACTIVE;
-	acm_log(1, "%s %d is up\n", port->dev->verbs->device->name, port->port_num);
+	acm_log(1, "%s %d %d is up\n", port->dev->verbs->device->name, port->port_num, instance);
 }
 
 static void acmp_port_down(struct acmp_port *port)
 {
+	int instance;
+
 	acm_log(1, "%s %d\n", port->dev->verbs->device->name, port->port_num);
 	pthread_mutex_lock(&port->lock);
 	port->state = IBV_PORT_DOWN;
@@ -2678,13 +2681,13 @@  static void acmp_port_down(struct acmp_port *port)
 	 * event instead of a sleep loop, but it's not worth it given how
 	 * infrequently we should be processing a port down event in practice.
 	 */
-	atomic_dec(&port->sa_dest.refcnt);
-	while (atomic_get(&port->sa_dest.refcnt))
-		sleep(0);
-	pthread_mutex_lock(&port->sa_dest.lock);
-	port->sa_dest.state = ACMP_INIT;
-	pthread_mutex_unlock(&port->sa_dest.lock);
-	acm_log(1, "%s %d is down\n", port->dev->verbs->device->name, port->port_num);
+	instance = atomic_dec(&port->sa_dest.refcnt);
+	if (instance == 1) {
+		pthread_mutex_lock(&port->sa_dest.lock);
+		port->sa_dest.state = ACMP_INIT;
+		pthread_mutex_unlock(&port->sa_dest.lock);
+	}
+	acm_log(1, "%s %d %d is down\n", port->dev->verbs->device->name, port->port_num, instance);
 }
 
 static int acmp_open_port(const struct acm_port *cport, void *dev_context,