
[rdma-next,2/4] RDMA/cma: Multiple path records support with netlink channel

Message ID 2fa2b6c93c4c16c8915bac3cfc4f27be1d60519d.1662631201.git.leonro@nvidia.com (mailing list archive)
State Accepted
Series Support multiple path records

Commit Message

Leon Romanovsky Sept. 8, 2022, 10:09 a.m. UTC
From: Mark Zhang <markzhang@nvidia.com>

Support receiving inbound and outbound IB path records (along with the GMP
PathRecord) from a user-space service through the RDMA netlink channel.
The LIDs in these 3 PRs are used as follows:
1. GMP PR: used as the standard local/remote LIDs;
2. DLID of outbound PR: used as the "dlid" field for outbound traffic;
3. DLID of inbound PR: used as the "dlid" field for outbound traffic on
   the responder side.

This is aimed at supporting adaptive routing. With the current IB routing
solution, when a packet goes out it is assigned a fixed DLID per
target, meaning a fixed router will be used.
The LIDs in the inbound/outbound path records can be used to identify a
group of routers that allow communication with another subnet's entity.
With them, packets on an inter-subnet connection may travel through any
router in the set to reach the target.

As confirmed with Jason, when sending a netlink request the kernel uses
LS_RESOLVE_PATH_USE_ALL so that the service knows the kernel supports
multiple PRs.
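
As a rough illustration of that contract (a sketch only, not code from any
existing service; the header path and the fill_packed_path_rec() helper are
assumptions, and the netlink attribute packaging via LS_NLA_TYPE_PATH_RECORD
is omitted), the service's reply could carry three ib_path_rec_data entries
tagged like this:

#include <string.h>
#include <rdma/ib_user_sa.h>	/* struct ib_path_rec_data, IB_PATH_* flags (assumed header) */

static void fill_reply_records(struct ib_path_rec_data recs[3])
{
	memset(recs, 0, 3 * sizeof(*recs));

	/* GMP record: standard local/remote LIDs */
	recs[0].flags = IB_PATH_PRIMARY | IB_PATH_GMP;
	/* fill_packed_path_rec(recs[0].path_rec, ...);  hypothetical helper */

	/* Outbound record: DLID used for the requester's outbound traffic */
	recs[1].flags = IB_PATH_PRIMARY | IB_PATH_OUTBOUND;

	/* Inbound record: DLID used for the responder's outbound traffic */
	recs[2].flags = IB_PATH_PRIMARY | IB_PATH_INBOUND;
}

Each record keeps IB_PATH_PRIMARY set because, for LS_RESOLVE_PATH_USE_ALL,
the kernel accepts a record only when (flags & IB_PATH_PRIMARY) matches.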

Signed-off-by: Mark Zhang <markzhang@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/core/cma.c             |  70 ++++++-
 drivers/infiniband/core/sa_query.c        | 235 +++++++++++++++-------
 drivers/infiniband/ulp/ipoib/ipoib_main.c |   2 +-
 drivers/infiniband/ulp/srp/ib_srp.c       |   2 +-
 include/rdma/ib_sa.h                      |   3 +-
 include/rdma/rdma_cm.h                    |   6 +
 6 files changed, 231 insertions(+), 87 deletions(-)

Comments

Jason Gunthorpe Sept. 22, 2022, 1:58 p.m. UTC | #1
On Thu, Sep 08, 2022 at 01:09:01PM +0300, Leon Romanovsky wrote:

> +static void route_set_path_rec_inbound(struct cma_work *work,
> +				       struct sa_path_rec *path_rec)
> +{
> +	struct rdma_route *route = &work->id->id.route;
> +
> +	if (!route->path_rec_inbound) {
> +		route->path_rec_inbound =
> +			kzalloc(sizeof(*route->path_rec_inbound), GFP_KERNEL);
> +		if (!route->path_rec_inbound)
> +			return;
> +	}
> +
> +	*route->path_rec_inbound = *path_rec;
> +}

We are just ignoring these memory allocation failures??

>  static void cma_query_handler(int status, struct sa_path_rec *path_rec,
> -			      void *context)
> +			      int num_prs, void *context)

This param should be "unsigned int num_prs"

>  {
>  	struct cma_work *work = context;
>  	struct rdma_route *route;
> +	int i;
>  
>  	route = &work->id->id.route;
>  
> -	if (!status) {
> -		route->num_pri_alt_paths = 1;
> -		*route->path_rec = *path_rec;
> -	} else {
> -		work->old_state = RDMA_CM_ROUTE_QUERY;
> -		work->new_state = RDMA_CM_ADDR_RESOLVED;
> -		work->event.event = RDMA_CM_EVENT_ROUTE_ERROR;
> -		work->event.status = status;
> -		pr_debug_ratelimited("RDMA CM: ROUTE_ERROR: failed to query path. status %d\n",
> -				     status);
> +	if (status)
> +		goto fail;
> +
> +	for (i = 0; i < num_prs; i++) {
> +		if (!path_rec[i].flags || (path_rec[i].flags & IB_PATH_GMP))
> +			*route->path_rec = path_rec[i];
> +		else if (path_rec[i].flags & IB_PATH_INBOUND)
> +			route_set_path_rec_inbound(work, &path_rec[i]);
> +		else if (path_rec[i].flags & IB_PATH_OUTBOUND)
> +			route_set_path_rec_outbound(work, &path_rec[i]);
> +	}
> +	if (!route->path_rec) {

Why is this needed? The for loop above will have already oops'd.

> +/**
> + * ib_sa_pr_callback_multiple() - Parse path records then do callback.
> + *
> + * In a multiple-PR case the PRs are saved in "query->resp_pr_data"
> + * (instead of"mad->data") and with "ib_path_rec_data" structure format,
> + * so that rec->flags can be set to indicate the type of PR.
> + * This is valid only in IB fabric.
> + */
> +static void ib_sa_pr_callback_multiple(struct ib_sa_path_query *query,
> +				       int status, int num_prs,
> +				       struct ib_path_rec_data *rec_data)
> +{
> +	struct sa_path_rec *rec;
> +	int i;
> +
> +	rec = kvcalloc(num_prs, sizeof(*rec), GFP_KERNEL);
> +	if (!rec) {
> +		query->callback(-ENOMEM, NULL, 0, query->context);
> +		return;
> +	}

This all seems really wild, why are we allocating memory so many times
on this path? Have ib_nl_process_good_resolve_rsp unpack the mad
instead of storing the raw format

It would also be good to make resp_pr_data always valid so all these
special paths don't need to exist.

> diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h
> index 81916039ee24..cdc7cafab572 100644
> --- a/include/rdma/rdma_cm.h
> +++ b/include/rdma/rdma_cm.h
> @@ -49,9 +49,15 @@ struct rdma_addr {
>  	struct rdma_dev_addr dev_addr;
>  };
>  
> +#define RDMA_PRIMARY_PATH_MAX_REC_NUM 3

This is a strange place for this define, it should be in sa_query.c?

Jason
Mark Zhang Sept. 23, 2022, 1:40 a.m. UTC | #2
On 9/22/2022 9:58 PM, Jason Gunthorpe wrote:
> On Thu, Sep 08, 2022 at 01:09:01PM +0300, Leon Romanovsky wrote:
> 
>> +static void route_set_path_rec_inbound(struct cma_work *work,
>> +				       struct sa_path_rec *path_rec)
>> +{
>> +	struct rdma_route *route = &work->id->id.route;
>> +
>> +	if (!route->path_rec_inbound) {
>> +		route->path_rec_inbound =
>> +			kzalloc(sizeof(*route->path_rec_inbound), GFP_KERNEL);
>> +		if (!route->path_rec_inbound)
>> +			return;
>> +	}
>> +
>> +	*route->path_rec_inbound = *path_rec;
>> +}
> 
> We are just ignoring these memory allocation failures??
> 
Inside "if" statement if kzalloc fails here then we don't set 
route->path_rec_inbound or outbound;

>>   static void cma_query_handler(int status, struct sa_path_rec *path_rec,
>> -			      void *context)
>> +			      int num_prs, void *context)
> 
> This param should be "unsigned int num_prs"

Ack

> 
>>   {
>>   	struct cma_work *work = context;
>>   	struct rdma_route *route;
>> +	int i;
>>   
>>   	route = &work->id->id.route;
>>   
>> -	if (!status) {
>> -		route->num_pri_alt_paths = 1;
>> -		*route->path_rec = *path_rec;
>> -	} else {
>> -		work->old_state = RDMA_CM_ROUTE_QUERY;
>> -		work->new_state = RDMA_CM_ADDR_RESOLVED;
>> -		work->event.event = RDMA_CM_EVENT_ROUTE_ERROR;
>> -		work->event.status = status;
>> -		pr_debug_ratelimited("RDMA CM: ROUTE_ERROR: failed to query path. status %d\n",
>> -				     status);
>> +	if (status)
>> +		goto fail;
>> +
>> +	for (i = 0; i < num_prs; i++) {
>> +		if (!path_rec[i].flags || (path_rec[i].flags & IB_PATH_GMP))
>> +			*route->path_rec = path_rec[i];
>> +		else if (path_rec[i].flags & IB_PATH_INBOUND)
>> +			route_set_path_rec_inbound(work, &path_rec[i]);
>> +		else if (path_rec[i].flags & IB_PATH_OUTBOUND)
>> +			route_set_path_rec_outbound(work, &path_rec[i]);
>> +	}
>> +	if (!route->path_rec) {
> 
> Why is this needed? The for loop above will have already oops'd.

Right, this "if" is no needed. We don't need to check if route->path_rec 
is valid in this function because it is allocated in cma_resolve_ib_route()
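
For reference, a paraphrased sketch of that allocation in
cma_resolve_ib_route() (helper names and error labels as recalled from
drivers/infiniband/core/cma.c, not a verbatim copy):

static int cma_resolve_ib_route(struct rdma_id_private *id_priv,
				unsigned long timeout_ms)
{
	struct rdma_route *route = &id_priv->id.route;
	struct cma_work *work;
	int ret;

	work = kzalloc(sizeof(*work), GFP_KERNEL);
	if (!work)
		return -ENOMEM;

	cma_init_resolve_route_work(work, id_priv);

	/* The primary path record is allocated before the SA query is issued... */
	if (!route->path_rec)
		route->path_rec = kmalloc(sizeof(*route->path_rec), GFP_KERNEL);
	if (!route->path_rec) {
		ret = -ENOMEM;
		goto err1;
	}

	/* ...so cma_query_handler() can rely on it being non-NULL. */
	ret = cma_query_ib_route(id_priv, timeout_ms, work);
	if (ret)
		goto err2;

	return 0;
err2:
	kfree(route->path_rec);
	route->path_rec = NULL;
err1:
	kfree(work);
	return ret;
}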

> 
>> +/**
>> + * ib_sa_pr_callback_multiple() - Parse path records then do callback.
>> + *
>> + * In a multiple-PR case the PRs are saved in "query->resp_pr_data"
>> + * (instead of"mad->data") and with "ib_path_rec_data" structure format,
>> + * so that rec->flags can be set to indicate the type of PR.
>> + * This is valid only in IB fabric.
>> + */
>> +static void ib_sa_pr_callback_multiple(struct ib_sa_path_query *query,
>> +				       int status, int num_prs,
>> +				       struct ib_path_rec_data *rec_data)
>> +{
>> +	struct sa_path_rec *rec;
>> +	int i;
>> +
>> +	rec = kvcalloc(num_prs, sizeof(*rec), GFP_KERNEL);
>> +	if (!rec) {
>> +		query->callback(-ENOMEM, NULL, 0, query->context);
>> +		return;
>> +	}
> 
> This all seems really wild, why are we allocating memory so many times
> on this path? Have ib_nl_process_good_resolve_rsp unpack the mad
> instead of storing the raw format
> 
> It would also be good to make resp_pr_data always valid so all these
> special paths don't need to exist.

The ib_sa_pr_callback_single() uses a stack variable "rec" but
ib_sa_pr_callback_multiple() uses malloc because there are multiple PRs.

ib_sa_path_rec_callback() is also used by ib_post_send_mad(), which always
has a single PR saved in mad->data, so always setting resp_pr_data in the
netlink case is not enough.

> 
>> diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h
>> index 81916039ee24..cdc7cafab572 100644
>> --- a/include/rdma/rdma_cm.h
>> +++ b/include/rdma/rdma_cm.h
>> @@ -49,9 +49,15 @@ struct rdma_addr {
>>   	struct rdma_dev_addr dev_addr;
>>   };
>>   
>> +#define RDMA_PRIMARY_PATH_MAX_REC_NUM 3
> 
> This is a strange place for this define, it should be in sa_query.c?

That's because path_rec, path_rec_inbound and path_rec_outbound are
defined here, but yes, it is only used in sa_query.c, so maybe it is better
to move it there.

Thanks Jason.
Jason Gunthorpe Sept. 23, 2022, 1:13 p.m. UTC | #3
On Fri, Sep 23, 2022 at 09:40:22AM +0800, Mark Zhang wrote:
> On 9/22/2022 9:58 PM, Jason Gunthorpe wrote:
> > On Thu, Sep 08, 2022 at 01:09:01PM +0300, Leon Romanovsky wrote:
> > 
> > > +static void route_set_path_rec_inbound(struct cma_work *work,
> > > +				       struct sa_path_rec *path_rec)
> > > +{
> > > +	struct rdma_route *route = &work->id->id.route;
> > > +
> > > +	if (!route->path_rec_inbound) {
> > > +		route->path_rec_inbound =
> > > +			kzalloc(sizeof(*route->path_rec_inbound), GFP_KERNEL);
> > > +		if (!route->path_rec_inbound)
> > > +			return;
> > > +	}
> > > +
> > > +	*route->path_rec_inbound = *path_rec;
> > > +}
> > 
> > We are just ignoring these memory allocation failures??
> > 
> Inside "if" statement if kzalloc fails here then we don't set
> route->path_rec_inbound or outbound;

But why don't we propagate an ENOMEM failure code?
> > > +static void ib_sa_pr_callback_multiple(struct ib_sa_path_query *query,
> > > +				       int status, int num_prs,
> > > +				       struct ib_path_rec_data *rec_data)
> > > +{
> > > +	struct sa_path_rec *rec;
> > > +	int i;
> > > +
> > > +	rec = kvcalloc(num_prs, sizeof(*rec), GFP_KERNEL);
> > > +	if (!rec) {
> > > +		query->callback(-ENOMEM, NULL, 0, query->context);
> > > +		return;
> > > +	}
> > 
> > This all seems really wild, why are we allocating memory so many times
> > on this path? Have ib_nl_process_good_resolve_rsp unpack the mad
> > instead of storing the raw format
> > 
> > It would also be good to make resp_pr_data always valid so all these
> > special paths don't need to exist.
> 
> The ib_sa_pr_callback_single() uses stack variable "rec" but
> ib_sa_pr_callback_multiple() uses malloc because there are multiple PRs.
> 
> ib_sa_path_rec_callback is also used by ib_post_send_mad(), which always
> have single PR and saved in mad->data, so always set resp_pr_data in netlink
> case is not enough.

We should always be able to point resp_pr_data to some kind of
storage, even if it is stack storage.

Jason
Mark Zhang Sept. 23, 2022, 2:24 p.m. UTC | #4
On 9/23/2022 9:13 PM, Jason Gunthorpe wrote:
> On Fri, Sep 23, 2022 at 09:40:22AM +0800, Mark Zhang wrote:
>> On 9/22/2022 9:58 PM, Jason Gunthorpe wrote:
>>> On Thu, Sep 08, 2022 at 01:09:01PM +0300, Leon Romanovsky wrote:
>>>
>>>> +static void route_set_path_rec_inbound(struct cma_work *work,
>>>> +				       struct sa_path_rec *path_rec)
>>>> +{
>>>> +	struct rdma_route *route = &work->id->id.route;
>>>> +
>>>> +	if (!route->path_rec_inbound) {
>>>> +		route->path_rec_inbound =
>>>> +			kzalloc(sizeof(*route->path_rec_inbound), GFP_KERNEL);
>>>> +		if (!route->path_rec_inbound)
>>>> +			return;
>>>> +	}
>>>> +
>>>> +	*route->path_rec_inbound = *path_rec;
>>>> +}
>>>
>>> We are just ignoring these memory allocation failures??
>>>
>> Inside "if" statement if kzalloc fails here then we don't set
>> route->path_rec_inbound or outbound;
> 
> But why don't we propogate a ENOMEM failure code?

Because inbound/outbound PRs are optional, even if they are provided
they can still be ignored if cma is not able to set them (e.g. a memory
allocation failure in this case).

>>>> +static void ib_sa_pr_callback_multiple(struct ib_sa_path_query *query,
>>>> +				       int status, int num_prs,
>>>> +				       struct ib_path_rec_data *rec_data)
>>>> +{
>>>> +	struct sa_path_rec *rec;
>>>> +	int i;
>>>> +
>>>> +	rec = kvcalloc(num_prs, sizeof(*rec), GFP_KERNEL);
>>>> +	if (!rec) {
>>>> +		query->callback(-ENOMEM, NULL, 0, query->context);
>>>> +		return;
>>>> +	}
>>>
>>> This all seems really wild, why are we allocating memory so many times
>>> on this path? Have ib_nl_process_good_resolve_rsp unpack the mad
>>> instead of storing the raw format
>>>
>>> It would also be good to make resp_pr_data always valid so all these
>>> special paths don't need to exist.
>>
>> The ib_sa_pr_callback_single() uses stack variable "rec" but
>> ib_sa_pr_callback_multiple() uses malloc because there are multiple PRs.
>>
>> ib_sa_path_rec_callback is also used by ib_post_send_mad(), which always
>> have single PR and saved in mad->data, so always set resp_pr_data in netlink
>> case is not enough.
> 
> We should always be able to point resp_pr_data to some kind of
> storage, even if it is stack storage.

The idea is:
- Single PR: PR in mad->data; used by both netlink and
   ib_post_send_mad();
- Multiple PRs: PRs in resp_pr_data, in "ib_path_rec_data" structure
   format; currently used by netlink only.

So if we want to always use resp_pr_data, then in the single-PR case we need
to copy mad->data to resp_pr_data in both netlink and
ib_post_send_mad(), and turn it into the "ib_path_rec_data" structure
format. This adds complexity for the single-PR case, which should be most
situations?

Using malloc instead of the stack for resp_pr_data and the multiple-PR
unpack is because sizeof(sa_path_rec) = 72 bytes; for now we support 3 PRs
and there might be more in the future.
Mark Zhang Sept. 23, 2022, 2:50 p.m. UTC | #5
On 9/23/2022 10:24 PM, Mark Zhang wrote:
> On 9/23/2022 9:13 PM, Jason Gunthorpe wrote:
>> On Fri, Sep 23, 2022 at 09:40:22AM +0800, Mark Zhang wrote:
>>> On 9/22/2022 9:58 PM, Jason Gunthorpe wrote:
>>>> On Thu, Sep 08, 2022 at 01:09:01PM +0300, Leon Romanovsky wrote:
>>>>
>>>>> +static void route_set_path_rec_inbound(struct cma_work *work,
>>>>> +                                 struct sa_path_rec *path_rec)
>>>>> +{
>>>>> +  struct rdma_route *route = &work->id->id.route;
>>>>> +
>>>>> +  if (!route->path_rec_inbound) {
>>>>> +          route->path_rec_inbound =
>>>>> +                  kzalloc(sizeof(*route->path_rec_inbound), GFP_KERNEL);
>>>>> +          if (!route->path_rec_inbound)
>>>>> +                  return;
>>>>> +  }
>>>>> +
>>>>> +  *route->path_rec_inbound = *path_rec;
>>>>> +}
>>>>
>>>> We are just ignoring these memory allocation failures??
>>>>
>>> Inside "if" statement if kzalloc fails here then we don't set
>>> route->path_rec_inbound or outbound;
>>
>> But why don't we propogate a ENOMEM failure code?
> 
> Because inbound/outbound PRs are optional, so even they are provided
> they can still be ignored if cma is not able to set them (e.g. memory
> allocation failure in this case).
> 
>>>>> +static void ib_sa_pr_callback_multiple(struct ib_sa_path_query *query,
>>>>> +                                 int status, int num_prs,
>>>>> +                                 struct ib_path_rec_data *rec_data)
>>>>> +{
>>>>> +  struct sa_path_rec *rec;
>>>>> +  int i;
>>>>> +
>>>>> +  rec = kvcalloc(num_prs, sizeof(*rec), GFP_KERNEL);
>>>>> +  if (!rec) {
>>>>> +          query->callback(-ENOMEM, NULL, 0, query->context);
>>>>> +          return;
>>>>> +  }
>>>>
>>>> This all seems really wild, why are we allocating memory so many times
>>>> on this path? Have ib_nl_process_good_resolve_rsp unpack the mad
>>>> instead of storing the raw format
>>>>
>>>> It would also be good to make resp_pr_data always valid so all these
>>>> special paths don't need to exist.
>>>
>>> The ib_sa_pr_callback_single() uses stack variable "rec" but
>>> ib_sa_pr_callback_multiple() uses malloc because there are multiple PRs.
>>>
>>> ib_sa_path_rec_callback is also used by ib_post_send_mad(), which always
>>> have single PR and saved in mad->data, so always set resp_pr_data in 
>>> netlink
>>> case is not enough.
>>
>> We should always be able to point resp_pr_data to some kind of
>> storage, even if it is stack storage.
> 
> The idea is:
> - Single PR: PR in mad->data; Used by both netlink and
>    ib_post_send_mad();
> - Multiple PRs: PRs in resp_pr_data, with "ib_path_rec_data" structure
>    format; Currently used by netlink only.
> So if we want to always use resp_pr_data then in single-PR case we need
> to copy mad->data to resp_pr_data in both netlink and
> ib_post_send_mad(), and turn it into "ib_path_rec_data" structure
> format. This adds complexity for single-PR, which should be most of
> situations?

Sorry, what I mean is that resp_pr_data is used by netlink while mad->data
is used by post_send_mad(), so if we want to always use resp_pr_data then
in post_send_mad() we need to do the malloc, copy and transformation.
Besides, if the malloc fails then we still need to use mad->data, unless we
always use the stack no matter how many PRs there are.

> Use malloc instead of stack for resp_pr_data and multiple-PR unpack is
> because sizeof(sa_path_rec)=72B, now we supports 3 and there might be
> more in future..
>
Jason Gunthorpe Sept. 23, 2022, 6:19 p.m. UTC | #6
On Fri, Sep 23, 2022 at 10:24:35PM +0800, Mark Zhang wrote:
> On 9/23/2022 9:13 PM, Jason Gunthorpe wrote:
> > On Fri, Sep 23, 2022 at 09:40:22AM +0800, Mark Zhang wrote:
> > > On 9/22/2022 9:58 PM, Jason Gunthorpe wrote:
> > > > On Thu, Sep 08, 2022 at 01:09:01PM +0300, Leon Romanovsky wrote:
> > > > 
> > > > > +static void route_set_path_rec_inbound(struct cma_work *work,
> > > > > +				       struct sa_path_rec *path_rec)
> > > > > +{
> > > > > +	struct rdma_route *route = &work->id->id.route;
> > > > > +
> > > > > +	if (!route->path_rec_inbound) {
> > > > > +		route->path_rec_inbound =
> > > > > +			kzalloc(sizeof(*route->path_rec_inbound), GFP_KERNEL);
> > > > > +		if (!route->path_rec_inbound)
> > > > > +			return;
> > > > > +	}
> > > > > +
> > > > > +	*route->path_rec_inbound = *path_rec;
> > > > > +}
> > > > 
> > > > We are just ignoring these memory allocation failures??
> > > > 
> > > Inside "if" statement if kzalloc fails here then we don't set
> > > route->path_rec_inbound or outbound;
> > 
> > But why don't we propogate a ENOMEM failure code?
> 
> Because inbound/outbound PRs are optional, so even they are provided they
> can still be ignored if cma is not able to set them (e.g. memory allocation
> failure in this case).

This isn't how we do things: the netlink operation had a failure, and the
failure should propagate. If we've run out of memory the CM connection
is going to blow up very quickly anyhow.
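
A minimal sketch of that propagation, assuming the helpers are changed to
return an error and cma_query_handler() fails the route resolution on it
(illustrative only, not necessarily what was merged later):

static int route_set_path_rec_inbound(struct cma_work *work,
				      struct sa_path_rec *path_rec)
{
	struct rdma_route *route = &work->id->id.route;

	if (!route->path_rec_inbound) {
		route->path_rec_inbound =
			kzalloc(sizeof(*route->path_rec_inbound), GFP_KERNEL);
		if (!route->path_rec_inbound)
			return -ENOMEM;	/* propagate instead of ignoring */
	}

	*route->path_rec_inbound = *path_rec;
	return 0;
}

/* ...same for route_set_path_rec_outbound(); then in cma_query_handler(): */
	for (i = 0; i < num_prs; i++) {
		if (!path_rec[i].flags || (path_rec[i].flags & IB_PATH_GMP))
			*route->path_rec = path_rec[i];
		else if (path_rec[i].flags & IB_PATH_INBOUND)
			status = route_set_path_rec_inbound(work, &path_rec[i]);
		else if (path_rec[i].flags & IB_PATH_OUTBOUND)
			status = route_set_path_rec_outbound(work, &path_rec[i]);
		if (status)
			goto fail;
	}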

> > We should always be able to point resp_pr_data to some kind of
> > storage, even if it is stack storage.
> 
> The idea is:
> - Single PR: PR in mad->data; Used by both netlink and
>   ib_post_send_mad();
> - Multiple PRs: PRs in resp_pr_data, with "ib_path_rec_data" structure
>   format; Currently used by netlink only.
> 
> So if we want to always use resp_pr_data then in single-PR case we need to
> copy mad->data to resp_pr_data in both netlink and ib_post_send_mad(), and
> turn it into "ib_path_rec_data" structure format. This adds complexity for
> single-PR, which should be most of situations?

You don't copy it, you unpack it. We already have to unpack it, just
to a (large!) stack var - unpack it to resp_pr_data instead.

Jason
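
To make the suggestion concrete, a rough sketch of
ib_nl_process_good_resolve_rsp() unpacking each accepted record straight into
query->resp_pr_data as struct sa_path_rec (an interpretation of the
suggestion, not the code that eventually landed; it assumes resp_pr_data
points at storage for RDMA_PRIMARY_PATH_MAX_REC_NUM entries, and reuses the
locals from the attribute walk in the patch below):

	/* Unpack in the netlink handler instead of storing the raw records,
	 * so ib_sa_pr_callback_multiple() needs no second allocation. */
	struct sa_path_rec *prs = query->resp_pr_data;

	nla_for_each_attr(curr, head, len, rem) {
		if (curr->nla_type != LS_NLA_TYPE_PATH_RECORD)
			continue;

		srec = nla_data(curr);
		if ((srec->flags & mask) != mask)
			continue;

		ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table),
			  srec->path_rec, &prs[num_prs]);
		prs[num_prs].rec_type = SA_PATH_REC_TYPE_IB;
		sa_path_set_dmac_zero(&prs[num_prs]);
		prs[num_prs].flags = srec->flags;

		status = 0;
		if (++num_prs >= RDMA_PRIMARY_PATH_MAX_REC_NUM)
			break;
	}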

Patch

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 91e72a76d95e..a3efc462305d 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -2026,6 +2026,8 @@  static void _destroy_id(struct rdma_id_private *id_priv,
 		cma_id_put(id_priv->id.context);
 
 	kfree(id_priv->id.route.path_rec);
+	kfree(id_priv->id.route.path_rec_inbound);
+	kfree(id_priv->id.route.path_rec_outbound);
 
 	put_net(id_priv->id.route.addr.dev_addr.net);
 	kfree(id_priv);
@@ -2817,26 +2819,72 @@  int rdma_set_min_rnr_timer(struct rdma_cm_id *id, u8 min_rnr_timer)
 }
 EXPORT_SYMBOL(rdma_set_min_rnr_timer);
 
+static void route_set_path_rec_inbound(struct cma_work *work,
+				       struct sa_path_rec *path_rec)
+{
+	struct rdma_route *route = &work->id->id.route;
+
+	if (!route->path_rec_inbound) {
+		route->path_rec_inbound =
+			kzalloc(sizeof(*route->path_rec_inbound), GFP_KERNEL);
+		if (!route->path_rec_inbound)
+			return;
+	}
+
+	*route->path_rec_inbound = *path_rec;
+}
+
+static void route_set_path_rec_outbound(struct cma_work *work,
+					struct sa_path_rec *path_rec)
+{
+	struct rdma_route *route = &work->id->id.route;
+
+	if (!route->path_rec_outbound) {
+		route->path_rec_outbound =
+			kzalloc(sizeof(*route->path_rec_outbound), GFP_KERNEL);
+		if (!route->path_rec_outbound)
+			return;
+	}
+
+	*route->path_rec_outbound = *path_rec;
+}
+
 static void cma_query_handler(int status, struct sa_path_rec *path_rec,
-			      void *context)
+			      int num_prs, void *context)
 {
 	struct cma_work *work = context;
 	struct rdma_route *route;
+	int i;
 
 	route = &work->id->id.route;
 
-	if (!status) {
-		route->num_pri_alt_paths = 1;
-		*route->path_rec = *path_rec;
-	} else {
-		work->old_state = RDMA_CM_ROUTE_QUERY;
-		work->new_state = RDMA_CM_ADDR_RESOLVED;
-		work->event.event = RDMA_CM_EVENT_ROUTE_ERROR;
-		work->event.status = status;
-		pr_debug_ratelimited("RDMA CM: ROUTE_ERROR: failed to query path. status %d\n",
-				     status);
+	if (status)
+		goto fail;
+
+	for (i = 0; i < num_prs; i++) {
+		if (!path_rec[i].flags || (path_rec[i].flags & IB_PATH_GMP))
+			*route->path_rec = path_rec[i];
+		else if (path_rec[i].flags & IB_PATH_INBOUND)
+			route_set_path_rec_inbound(work, &path_rec[i]);
+		else if (path_rec[i].flags & IB_PATH_OUTBOUND)
+			route_set_path_rec_outbound(work, &path_rec[i]);
+	}
+	if (!route->path_rec) {
+		status = -EINVAL;
+		goto fail;
 	}
 
+	route->num_pri_alt_paths = 1;
+	queue_work(cma_wq, &work->work);
+	return;
+
+fail:
+	work->old_state = RDMA_CM_ROUTE_QUERY;
+	work->new_state = RDMA_CM_ADDR_RESOLVED;
+	work->event.event = RDMA_CM_EVENT_ROUTE_ERROR;
+	work->event.status = status;
+	pr_debug_ratelimited("RDMA CM: ROUTE_ERROR: failed to query path. status %d\n",
+			     status);
 	queue_work(cma_wq, &work->work);
 }
 
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index 003e504feca2..0de83d9a4985 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -50,6 +50,7 @@ 
 #include <rdma/ib_marshall.h>
 #include <rdma/ib_addr.h>
 #include <rdma/opa_addr.h>
+#include <rdma/rdma_cm.h>
 #include "sa.h"
 #include "core_priv.h"
 
@@ -104,7 +105,8 @@  struct ib_sa_device {
 };
 
 struct ib_sa_query {
-	void (*callback)(struct ib_sa_query *, int, struct ib_sa_mad *);
+	void (*callback)(struct ib_sa_query *sa_query, int status,
+			 int num_prs, struct ib_sa_mad *mad);
 	void (*release)(struct ib_sa_query *);
 	struct ib_sa_client    *client;
 	struct ib_sa_port      *port;
@@ -116,6 +118,12 @@  struct ib_sa_query {
 	u32			seq; /* Local svc request sequence number */
 	unsigned long		timeout; /* Local svc timeout */
 	u8			path_use; /* How will the pathrecord be used */
+
+	/* A separate buffer to save pathrecords of a response, as in cases
+	 * like IB/netlink, multiple pathrecords are supported, so that
+	 * mad->data is not large enough to hold them
+	 */
+	void			*resp_pr_data;
 };
 
 #define IB_SA_ENABLE_LOCAL_SERVICE	0x00000001
@@ -123,7 +131,8 @@  struct ib_sa_query {
 #define IB_SA_QUERY_OPA			0x00000004
 
 struct ib_sa_path_query {
-	void (*callback)(int, struct sa_path_rec *, void *);
+	void (*callback)(int status, struct sa_path_rec *rec,
+			 int num_paths, void *context);
 	void *context;
 	struct ib_sa_query sa_query;
 	struct sa_path_rec *conv_pr;
@@ -712,7 +721,7 @@  static void ib_nl_set_path_rec_attrs(struct sk_buff *skb,
 
 	if ((comp_mask & IB_SA_PATH_REC_REVERSIBLE) &&
 	    sa_rec->reversible != 0)
-		query->path_use = LS_RESOLVE_PATH_USE_GMP;
+		query->path_use = LS_RESOLVE_PATH_USE_ALL;
 	else
 		query->path_use = LS_RESOLVE_PATH_USE_UNIDIRECTIONAL;
 	header->path_use = query->path_use;
@@ -865,50 +874,81 @@  static void send_handler(struct ib_mad_agent *agent,
 static void ib_nl_process_good_resolve_rsp(struct ib_sa_query *query,
 					   const struct nlmsghdr *nlh)
 {
+	struct ib_path_rec_data *srec, *drec;
+	struct ib_sa_path_query *path_query;
 	struct ib_mad_send_wc mad_send_wc;
-	struct ib_sa_mad *mad = NULL;
 	const struct nlattr *head, *curr;
-	struct ib_path_rec_data  *rec;
-	int len, rem;
+	struct ib_sa_mad *mad = NULL;
+	int len, rem, num_prs = 0;
 	u32 mask = 0;
 	int status = -EIO;
 
-	if (query->callback) {
-		head = (const struct nlattr *) nlmsg_data(nlh);
-		len = nlmsg_len(nlh);
-		switch (query->path_use) {
-		case LS_RESOLVE_PATH_USE_UNIDIRECTIONAL:
-			mask = IB_PATH_PRIMARY | IB_PATH_OUTBOUND;
-			break;
+	if (!query->callback)
+		goto out;
 
-		case LS_RESOLVE_PATH_USE_ALL:
-		case LS_RESOLVE_PATH_USE_GMP:
-		default:
-			mask = IB_PATH_PRIMARY | IB_PATH_GMP |
-				IB_PATH_BIDIRECTIONAL;
-			break;
+	path_query = container_of(query, struct ib_sa_path_query, sa_query);
+	mad = query->mad_buf->mad;
+	if (!path_query->conv_pr &&
+	    (be16_to_cpu(mad->mad_hdr.attr_id) == IB_SA_ATTR_PATH_REC)) {
+		/* Need a larger buffer for possible multiple PRs */
+		query->resp_pr_data = kvcalloc(RDMA_PRIMARY_PATH_MAX_REC_NUM,
+					       sizeof(*drec), GFP_KERNEL);
+		if (!query->resp_pr_data) {
+			query->callback(query, -ENOMEM, 0, NULL);
+			return;
 		}
-		nla_for_each_attr(curr, head, len, rem) {
-			if (curr->nla_type == LS_NLA_TYPE_PATH_RECORD) {
-				rec = nla_data(curr);
-				/*
-				 * Get the first one. In the future, we may
-				 * need to get up to 6 pathrecords.
-				 */
-				if ((rec->flags & mask) == mask) {
-					mad = query->mad_buf->mad;
-					mad->mad_hdr.method |=
-						IB_MGMT_METHOD_RESP;
-					memcpy(mad->data, rec->path_rec,
-					       sizeof(rec->path_rec));
-					status = 0;
-					break;
-				}
-			}
+	}
+
+	head = (const struct nlattr *) nlmsg_data(nlh);
+	len = nlmsg_len(nlh);
+	switch (query->path_use) {
+	case LS_RESOLVE_PATH_USE_UNIDIRECTIONAL:
+		mask = IB_PATH_PRIMARY | IB_PATH_OUTBOUND;
+		break;
+
+	case LS_RESOLVE_PATH_USE_ALL:
+		mask = IB_PATH_PRIMARY;
+		break;
+
+	case LS_RESOLVE_PATH_USE_GMP:
+	default:
+		mask = IB_PATH_PRIMARY | IB_PATH_GMP |
+			IB_PATH_BIDIRECTIONAL;
+		break;
+	}
+
+	drec = (struct ib_path_rec_data *)query->resp_pr_data;
+	nla_for_each_attr(curr, head, len, rem) {
+		if (curr->nla_type != LS_NLA_TYPE_PATH_RECORD)
+			continue;
+
+		srec = nla_data(curr);
+		if ((srec->flags & mask) != mask)
+			continue;
+
+		status = 0;
+		if (!drec) {
+			memcpy(mad->data, srec->path_rec,
+			       sizeof(srec->path_rec));
+			num_prs = 1;
+			break;
 		}
-		query->callback(query, status, mad);
+
+		memcpy(drec, srec, sizeof(*drec));
+		drec++;
+		num_prs++;
+		if (num_prs >= RDMA_PRIMARY_PATH_MAX_REC_NUM)
+			break;
 	}
 
+	if (!status)
+		mad->mad_hdr.method |= IB_MGMT_METHOD_RESP;
+
+	query->callback(query, status, num_prs, mad);
+	kvfree(query->resp_pr_data);
+	query->resp_pr_data = NULL;
+
+out:
 	mad_send_wc.send_buf = query->mad_buf;
 	mad_send_wc.status = IB_WC_SUCCESS;
 	send_handler(query->mad_buf->mad_agent, &mad_send_wc);
@@ -1411,41 +1451,90 @@  static int opa_pr_query_possible(struct ib_sa_client *client,
 		return PR_IB_SUPPORTED;
 }
 
+static void ib_sa_pr_callback_single(struct ib_sa_path_query *query,
+				     int status, struct ib_sa_mad *mad)
+{
+	struct sa_path_rec rec = {};
+
+	ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table),
+		  mad->data, &rec);
+	rec.rec_type = SA_PATH_REC_TYPE_IB;
+	sa_path_set_dmac_zero(&rec);
+
+	if (query->conv_pr) {
+		struct sa_path_rec opa;
+
+		memset(&opa, 0, sizeof(struct sa_path_rec));
+		sa_convert_path_ib_to_opa(&opa, &rec);
+		query->callback(status, &opa, 1, query->context);
+	} else {
+		query->callback(status, &rec, 1, query->context);
+	}
+}
+
+/**
+ * ib_sa_pr_callback_multiple() - Parse path records then do callback.
+ *
+ * In a multiple-PR case the PRs are saved in "query->resp_pr_data"
+ * (instead of "mad->data") and with "ib_path_rec_data" structure format,
+ * so that rec->flags can be set to indicate the type of PR.
+ * This is valid only in IB fabric.
+ */
+static void ib_sa_pr_callback_multiple(struct ib_sa_path_query *query,
+				       int status, int num_prs,
+				       struct ib_path_rec_data *rec_data)
+{
+	struct sa_path_rec *rec;
+	int i;
+
+	rec = kvcalloc(num_prs, sizeof(*rec), GFP_KERNEL);
+	if (!rec) {
+		query->callback(-ENOMEM, NULL, 0, query->context);
+		return;
+	}
+
+	for (i = 0; i < num_prs; i++) {
+		ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table),
+			  rec_data[i].path_rec, rec + i);
+		rec[i].rec_type = SA_PATH_REC_TYPE_IB;
+		sa_path_set_dmac_zero(rec + i);
+		rec[i].flags = rec_data[i].flags;
+	}
+
+	query->callback(status, rec, num_prs, query->context);
+	kvfree(rec);
+}
+
 static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query,
-				    int status,
+				    int status, int num_prs,
 				    struct ib_sa_mad *mad)
 {
 	struct ib_sa_path_query *query =
 		container_of(sa_query, struct ib_sa_path_query, sa_query);
+	struct sa_path_rec rec;
 
-	if (mad) {
-		struct sa_path_rec rec;
-
-		if (sa_query->flags & IB_SA_QUERY_OPA) {
-			ib_unpack(opa_path_rec_table,
-				  ARRAY_SIZE(opa_path_rec_table),
-				  mad->data, &rec);
-			rec.rec_type = SA_PATH_REC_TYPE_OPA;
-			query->callback(status, &rec, query->context);
-		} else {
-			ib_unpack(path_rec_table,
-				  ARRAY_SIZE(path_rec_table),
-				  mad->data, &rec);
-			rec.rec_type = SA_PATH_REC_TYPE_IB;
-			sa_path_set_dmac_zero(&rec);
-
-			if (query->conv_pr) {
-				struct sa_path_rec opa;
+	if (!mad || !num_prs) {
+		query->callback(status, NULL, 0, query->context);
+		return;
+	}
 
-				memset(&opa, 0, sizeof(struct sa_path_rec));
-				sa_convert_path_ib_to_opa(&opa, &rec);
-				query->callback(status, &opa, query->context);
-			} else {
-				query->callback(status, &rec, query->context);
-			}
+	if (sa_query->flags & IB_SA_QUERY_OPA) {
+		if (num_prs != 1) {
+			query->callback(-EINVAL, NULL, 0, query->context);
+			return;
 		}
-	} else
-		query->callback(status, NULL, query->context);
+
+		ib_unpack(opa_path_rec_table, ARRAY_SIZE(opa_path_rec_table),
+			  mad->data, &rec);
+		rec.rec_type = SA_PATH_REC_TYPE_OPA;
+		query->callback(status, &rec, num_prs, query->context);
+	} else {
+		if (!sa_query->resp_pr_data)
+			ib_sa_pr_callback_single(query, status, mad);
+		else
+			ib_sa_pr_callback_multiple(query, status, num_prs,
+						   sa_query->resp_pr_data);
+	}
 }
 
 static void ib_sa_path_rec_release(struct ib_sa_query *sa_query)
@@ -1489,7 +1578,7 @@  int ib_sa_path_rec_get(struct ib_sa_client *client,
 		       unsigned long timeout_ms, gfp_t gfp_mask,
 		       void (*callback)(int status,
 					struct sa_path_rec *resp,
-					void *context),
+					int num_paths, void *context),
 		       void *context,
 		       struct ib_sa_query **sa_query)
 {
@@ -1588,7 +1677,7 @@  int ib_sa_path_rec_get(struct ib_sa_client *client,
 EXPORT_SYMBOL(ib_sa_path_rec_get);
 
 static void ib_sa_mcmember_rec_callback(struct ib_sa_query *sa_query,
-					int status,
+					int status, int num_prs,
 					struct ib_sa_mad *mad)
 {
 	struct ib_sa_mcmember_query *query =
@@ -1680,7 +1769,7 @@  int ib_sa_mcmember_rec_query(struct ib_sa_client *client,
 
 /* Support GuidInfoRecord */
 static void ib_sa_guidinfo_rec_callback(struct ib_sa_query *sa_query,
-					int status,
+					int status, int num_paths,
 					struct ib_sa_mad *mad)
 {
 	struct ib_sa_guidinfo_query *query =
@@ -1790,7 +1879,7 @@  static void ib_classportinfo_cb(void *context)
 }
 
 static void ib_sa_classport_info_rec_callback(struct ib_sa_query *sa_query,
-					      int status,
+					      int status, int num_prs,
 					      struct ib_sa_mad *mad)
 {
 	unsigned long flags;
@@ -1966,13 +2055,13 @@  static void send_handler(struct ib_mad_agent *agent,
 			/* No callback -- already got recv */
 			break;
 		case IB_WC_RESP_TIMEOUT_ERR:
-			query->callback(query, -ETIMEDOUT, NULL);
+			query->callback(query, -ETIMEDOUT, 0, NULL);
 			break;
 		case IB_WC_WR_FLUSH_ERR:
-			query->callback(query, -EINTR, NULL);
+			query->callback(query, -EINTR, 0, NULL);
 			break;
 		default:
-			query->callback(query, -EIO, NULL);
+			query->callback(query, -EIO, 0, NULL);
 			break;
 		}
 
@@ -2000,10 +2089,10 @@  static void recv_handler(struct ib_mad_agent *mad_agent,
 		if (mad_recv_wc->wc->status == IB_WC_SUCCESS)
 			query->callback(query,
 					mad_recv_wc->recv_buf.mad->mad_hdr.status ?
-					-EINVAL : 0,
+					-EINVAL : 0, 1,
 					(struct ib_sa_mad *) mad_recv_wc->recv_buf.mad);
 		else
-			query->callback(query, -EIO, NULL);
+			query->callback(query, -EIO, 0, NULL);
 	}
 
 	ib_free_recv_mad(mad_recv_wc);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index a4904371e2db..ac25fc80fb33 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -742,7 +742,7 @@  void ipoib_flush_paths(struct net_device *dev)
 
 static void path_rec_completion(int status,
 				struct sa_path_rec *pathrec,
-				void *path_ptr)
+				int num_prs, void *path_ptr)
 {
 	struct ipoib_path *path = path_ptr;
 	struct net_device *dev = path->dev;
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 1e777b2043d6..e5ec5444b662 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -699,7 +699,7 @@  static void srp_free_ch_ib(struct srp_target_port *target,
 
 static void srp_path_rec_completion(int status,
 				    struct sa_path_rec *pathrec,
-				    void *ch_ptr)
+				    int num_paths, void *ch_ptr)
 {
 	struct srp_rdma_ch *ch = ch_ptr;
 	struct srp_target_port *target = ch->target;
diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h
index 3634d4cc7a56..e930bec33b31 100644
--- a/include/rdma/ib_sa.h
+++ b/include/rdma/ib_sa.h
@@ -186,6 +186,7 @@  struct sa_path_rec {
 		struct sa_path_rec_opa opa;
 	};
 	enum sa_path_rec_type rec_type;
+	u32 flags;
 };
 
 static inline enum ib_gid_type
@@ -413,7 +414,7 @@  int ib_sa_path_rec_get(struct ib_sa_client *client, struct ib_device *device,
 		       ib_sa_comp_mask comp_mask, unsigned long timeout_ms,
 		       gfp_t gfp_mask,
 		       void (*callback)(int status, struct sa_path_rec *resp,
-					void *context),
+					int num_prs, void *context),
 		       void *context, struct ib_sa_query **query);
 
 struct ib_sa_multicast {
diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h
index 81916039ee24..cdc7cafab572 100644
--- a/include/rdma/rdma_cm.h
+++ b/include/rdma/rdma_cm.h
@@ -49,9 +49,15 @@  struct rdma_addr {
 	struct rdma_dev_addr dev_addr;
 };
 
+#define RDMA_PRIMARY_PATH_MAX_REC_NUM 3
 struct rdma_route {
 	struct rdma_addr addr;
 	struct sa_path_rec *path_rec;
+
+	/* Optional path records of primary path */
+	struct sa_path_rec *path_rec_inbound;
+	struct sa_path_rec *path_rec_outbound;
+
 	/*
 	 * 0 - No primary nor alternate path is available
 	 * 1 - Only primary path is available