[14/14] IB/mad: Add final OPA MAD processing

Message ID 1433615915-24591-15-git-send-email-ira.weiny@intel.com (mailing list archive)
State Superseded

Commit Message

Ira Weiny June 6, 2015, 6:38 p.m. UTC
From: Ira Weiny <ira.weiny@intel.com>

For devices which support OPA MADs

   1) Use previously defined SMP support functions.

   2) Pass correct base version to ib_create_send_mad when processing OPA MADs.

   3) Process out_mad_pkey_index returned by agents for a response.  This is
      necessary because OPA SMP packets must carry a valid pkey.

   4) Carry the correct segment size (OPA vs IBTA) of RMPP messages within
      ib_mad_recv_wc.  (Points 2 and 4 are illustrated in the sketch after
      the diffstat below.)

   5) Handle variable length OPA MADs by:

        * Adjusting the 'fake' WC for locally routed SMP's to represent the
          proper incoming byte_len
        * out_mad_size is used from the local HCA agents
                1) when sending agent responses on the wire
                2) when passing responses through the local_completions
                   function

      NOTE: wc.byte_len includes the GRH length and therefore is different
            from the in_mad_size specified to the local HCA agents.
            out_mad_size should _not_ include the GRH length as it is added

Signed-off-by: Ira Weiny <ira.weiny@intel.com>

---
Changes from V1:
	Use out_mad_pkey_index rather than the wc.pkey_index from the agents
		This is a much cleaner interface than returning data in the
		in_wc which can now remain const.
	Correct all opa variables to be bool type
	Adjust for ib_mad_private being a flex array

 drivers/infiniband/core/agent.c    |   7 +-
 drivers/infiniband/core/agent.h    |   2 +-
 drivers/infiniband/core/mad.c      | 220 ++++++++++++++++++++++++++++++++-----
 drivers/infiniband/core/mad_priv.h |   1 +
 drivers/infiniband/core/mad_rmpp.c |  20 +++-
 drivers/infiniband/core/user_mad.c |  19 ++--
 include/rdma/ib_mad.h              |   8 ++
 7 files changed, 234 insertions(+), 43 deletions(-)
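
Points 2 and 4 of the changelog come down to a dispatch on the base version carried in the received MAD header.  A minimal stand-alone sketch of that selection logic, using simplified stand-in types (the constant values mirror include/rdma/ib_mad.h and the OPA definitions added earlier in this series):

/* Stand-alone sketch; struct mad_hdr is a stand-in for the kernel's
 * ib_mad_hdr, and the constants mirror the kernel headers. */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define IB_MGMT_BASE_VERSION	1
#define OPA_MGMT_BASE_VERSION	0x80
#define IB_MGMT_MAD_SIZE	256	/* sizeof(struct ib_mad) */
#define OPA_MGMT_MAD_SIZE	2048	/* sizeof(struct opa_mad) */

struct mad_hdr {
	uint8_t base_version;
};

/* RMPP reassembly uses 2K segments for OPA MADs but 256-byte segments
 * for IBTA MADs (point 4); the choice keys off the base version in the
 * received header, and only on ports that advertise OPA MAD support. */
static size_t mad_seg_size(const struct mad_hdr *hdr, bool port_is_opa)
{
	if (port_is_opa && hdr->base_version == OPA_MGMT_BASE_VERSION)
		return OPA_MGMT_MAD_SIZE;
	return IB_MGMT_MAD_SIZE;
}

int main(void)
{
	struct mad_hdr opa_hdr = { .base_version = OPA_MGMT_BASE_VERSION };
	struct mad_hdr ib_hdr  = { .base_version = IB_MGMT_BASE_VERSION };

	/* The same base version must also be handed to
	 * ib_create_send_mad() when building a response (point 2). */
	printf("OPA segment: %zu, IB segment: %zu\n",
	       mad_seg_size(&opa_hdr, true), mad_seg_size(&ib_hdr, true));
	return 0;
}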

Comments

Liran Liss June 10, 2015, 6:30 a.m. UTC | #1
> From: Ira Weiny <ira.weiny@intel.com>

Hi Ira,

OPA cannot impersonate IB; OPA node and link types have to be designated as such.
In terms of MAD processing flows, both explicit (as in the handle_opa_smi() call below) and implicit code paths (which share IB flows - there are several cases) must make this distinction.

> +static enum smi_action
> +handle_opa_smi(struct ib_mad_port_private *port_priv,
> +	       struct ib_mad_qp_info *qp_info,
> +	       struct ib_wc *wc,
> +	       int port_num,
> +	       struct ib_mad_private *recv,
> +	       struct ib_mad_private *response)
> +{
...
> +	} else if (port_priv->device->node_type == RDMA_NODE_IB_SWITCH)  <----

--Liran
Ira Weiny June 10, 2015, 5:54 p.m. UTC | #2
On Wed, Jun 10, 2015 at 06:30:58AM +0000, Liran Liss wrote:
> > From: Ira Weiny <ira.weiny@intel.com>
> 
> Hi Ira,
> 
> OPA cannot impersonate IB; OPA node and link types have to be designated as such.

This was discussed at length and we agreed that the kernel would have explicit
capabilities communicated between the drivers and the core layers rather than
using link layer to determine what core support was needed.

For Node Type, OPA is its own "namespace" and as such we use the same values
for "CA" and "Switch".  The code you reference below is explicitly executed
only on OPA devices so I don't see why this is in conflict with IB.

> In terms of MAD processing flows, both explicit (as in the handle_opa_smi() call below) and implicit code paths (which share IB flows - there are several cases) must make this distinction.
> 

I agree, and all OPA differences are limited to devices/ports which explicitly
indicate they are OPA ports.

For example:

        opa = rdma_cap_opa_mad(qp_info->port_priv->device,
                               qp_info->port_priv->port_num);

...

        if (opa && ((struct ib_mad_hdr *)(recv->mad))->base_version == OPA_MGMT_BASE_VERSION) {
                recv->header.recv_wc.mad_len = wc->byte_len - sizeof(struct ib_grh);
                recv->header.recv_wc.mad_seg_size = sizeof(struct opa_mad);
        } else {
                recv->header.recv_wc.mad_len = sizeof(struct ib_mad);
                recv->header.recv_wc.mad_seg_size = sizeof(struct ib_mad);
        }


If I missed a place where this is not the case, please let me know, but I made
this change many months back and I'm pretty sure I caught them all.

Thanks,
Ira


> > +static enum smi_action
> > +handle_opa_smi(struct ib_mad_port_private *port_priv,
> > +	       struct ib_mad_qp_info *qp_info,
> > +	       struct ib_wc *wc,
> > +	       int port_num,
> > +	       struct ib_mad_private *recv,
> > +	       struct ib_mad_private *response)
> > +{
> ...
> > +	} else if (port_priv->device->node_type == RDMA_NODE_IB_SWITCH)  <----
> 
> --Liran
Doug Ledford June 10, 2015, 6:37 p.m. UTC | #3
On Wed, 2015-06-10 at 06:30 +0000, Liran Liss wrote:
> > From: Ira Weiny <ira.weiny@intel.com>
> 
> Hi Ira,
> 
> OPA cannot impersonate IB; OPA node and link types have to be designated as such.
> In terms of MAD processing flows, both explicit (as in the handle_opa_smi() call below) and implicit code paths (which share IB flows - there are several cases) must make this distinction.

As far as in the kernel is concerned, the individual capability bits are
much more important.  I would actually like to do away with the
node_type variable from struct ib_device eventually.  As for user space,
where we have to maintain ABI, node_type can be IB_CA (after all, the
OPA devices are just like RoCE devices in that they implement IB VERBS
as their user visible transport, and only addressing/management is
different from link layer IB devices), link layer needs to be OPA.

> > +static enum smi_action
> > +handle_opa_smi(struct ib_mad_port_private *port_priv,
> > +	       struct ib_mad_qp_info *qp_info,
> > +	       struct ib_wc *wc,
> > +	       int port_num,
> > +	       struct ib_mad_private *recv,
> > +	       struct ib_mad_private *response)
> > +{
> ...
> > +	} else if (port_priv->device->node_type == RDMA_NODE_IB_SWITCH)  <----
> 
> --Liran
Jason Gunthorpe June 10, 2015, 6:56 p.m. UTC | #4
On Wed, Jun 10, 2015 at 02:37:26PM -0400, Doug Ledford wrote:
> On Wed, 2015-06-10 at 06:30 +0000, Liran Liss wrote:
> > > From: Ira Weiny <ira.weiny@intel.com>
> > 
> > Hi Ira,
> > 
> > OPA cannot impersonate IB; OPA node and link types have to be
> > designated as such.  In terms of MAD processing flows, both
> > explicit (as in the handle_opa_smi() call below) and implicit code
> > paths (which share IB flows - there are several cases) must make
> > this distinction.
> 
> As far as in the kernel is concerned, the individual capability bits
> are much more important.  I would actually like to do away with the
> node_type variable from struct ib_device eventually.  As for user
> space,

All SMI code has different behavior if it is running on a switch or
HCA, so testing for 'switchyness' is very appropriate here.
cap_is_switch_smi would be a nice refinement to let us drop nodetype.

I don't have a problem with sharing the IBA constant names for MAD
structures (like RDMA_NODE_IB_SWITCH) between IB and OPA code. They
already share the structure layouts/etc.

Jason
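
A sketch of what such a helper could look like, following the per-port immutable-data pattern of the rdma_cap_* helpers.  The helper name and bit value below are hypothetical, not committed code (mainline later met this need with rdma_cap_ib_switch()):

/* Hypothetical sketch; the cap bit and helper name are invented here.
 * Stand-in structs model the per-port immutable data so the snippet
 * compiles on its own. */
#include <stdbool.h>
#include <stdint.h>

#define RDMA_CORE_CAP_IB_SWITCH_SMI	(1u << 20)	/* invented bit */

struct port_immutable_stub {
	uint32_t core_cap_flags;
};

struct ib_device_stub {
	struct port_immutable_stub port_immutable[4];
};

/* A driver would set the bit once in its get_port_immutable() hook;
 * SMI code could then test the port instead of comparing
 * device->node_type against RDMA_NODE_IB_SWITCH. */
static bool cap_is_switch_smi(const struct ib_device_stub *device,
			      uint8_t port_num)
{
	return device->port_immutable[port_num].core_cap_flags &
	       RDMA_CORE_CAP_IB_SWITCH_SMI;
}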
Doug Ledford June 10, 2015, 7:59 p.m. UTC | #5
On Wed, 2015-06-10 at 12:56 -0600, Jason Gunthorpe wrote:
> On Wed, Jun 10, 2015 at 02:37:26PM -0400, Doug Ledford wrote:
> > On Wed, 2015-06-10 at 06:30 +0000, Liran Liss wrote:
> > > > From: Ira Weiny <ira.weiny@intel.com>
> > > 
> > > Hi Ira,
> > > 
> > > OPA cannot impersonate IB; OPA node and link types have to be
> > > designated as such.  In terms of MAD processing flows, both
> > > explicit (as in the handle_opa_smi() call below) and implicit code
> > > paths (which share IB flows - there are several cases) must make
> > > this distinction.
> > 
> > As far as in the kernel is concerned, the individual capability bits
> > are much more important.  I would actually like to do away with the
> > node_type variable from struct ib_device eventually.  As for user
> > space,
> 
> All SMI code has different behavior if it is running on a switch or
> HCA, so testing for 'switchyness' is very appropriate here.

Sure...

> cap_is_switch_smi would be a nice refinement to let us drop nodetype.

Exactly, we need a bit added to the immutable data bits, and a new cap_
helper, and then nodetype is ready to be retired.  Add a bit, drop a
u8 ;-)

> I don't have a problem with sharing the IBA constant names for MAD
> structures (like RDMA_NODE_IB_SWITCH) between IB and OPA code. They
> already share the structure layouts/etc.
> 
> Jason
Liran Liss June 11, 2015, 6:27 p.m. UTC | #6
> From: Doug Ledford [mailto:dledford@redhat.com]
> 
> > > > OPA cannot impersonate IB; OPA node and link types have to be
> > > > designated as such.  In terms of MAD processing flows, both
> > > > explicit (as in the handle_opa_smi() call below) and implicit code
> > > > paths (which share IB flows - there are several cases) must make
> > > > this distinction.
> > >
> > > As far as in the kernel is concerned, the individual capability bits
> > > are much more important.  I would actually like to do away with the
> > > node_type variable from struct ib_device eventually.  As for user
> > > space,

We agreed on the concept of capability bits for the sake of simplifying code sharing.
That is OK.

But the node_type stands for more than just an abstract RDMA device:
In IB, it designates an instance of an industry-standard, well-defined, device type: its possible link types, transport, semantics, management, everything.
It *should* be exposed to user-space so apps that know and care what they are running on could continue to work.

The place for abstraction is in the rdmacm/CMA, which serves applications that just
want some RDMA functionality regardless of the underlying technology.

> > All SMI code has different behavior if it is running on a switch or
> > HCA, so testing for 'switchyness' is very appropriate here.
> 
> Sure...
> 
> > cap_is_switch_smi would be a nice refinement to let us drop nodetype.
> 
> Exactly, we need a bit added to the immutable data bits, and a new cap_
> helper, and then nodetype is ready to be retired.  Add a bit, drop a
> u8 ;-)

This is indeed a viable solution.

> > I don't have a problem with sharing the IBA constant names for MAD
> > structures (like RDMA_NODE_IB_SWITCH) between IB and OPA code. They
> > already share the structure layouts/etc.

The node type is reflected to user-space, which, as I mentioned above, is important.
Abusing this enumeration is misleading, even in the kernel.
Jason's proposal for a 'cap_is_switch_smi' is more readable, and directly in line with
the explicit capability approach that we discussed.
Hefty, Sean June 11, 2015, 9 p.m. UTC | #7
> > cap_is_switch_smi would be a nice refinement to let us drop nodetype.
> 
> Exactly, we need a bit added to the immutable data bits, and a new cap_
> helper, and then nodetype is ready to be retired.  Add a bit, drop a
> u8 ;-)

I agree that the node type enum isn't particularly useful and should be retired.  In fact, I don't see where RDMA_NODE_IB_SWITCH is used by any upstream device.  So I don't think there's any obligation to keep it.  But even if we do, I'm not sure this is the correct approach.  I don't know this for a fact, but it seems more likely that someone would embed Linux on an IB switch than they would plug an IB switch into a Linux based system.  The code is designed around the latter.  Making this a system wide setting might simplify the code and optimize the code paths.

- Sean
Hal Rosenstock June 11, 2015, 11:24 p.m. UTC | #8
On 6/11/2015 5:00 PM, Hefty, Sean wrote:
>>> cap_is_switch_smi would be a nice refinement to let us drop nodetype.
>>
>> Exactly, we need a bit added to the immutable data bits, and a new cap_
>> helper, and then nodetype is ready to be retired.  Add a bit, drop a
>> u8 ;-)
> 
> I agree that the node type enum isn't particularly useful and should be retired.  

Are you referring to kernel space or user space or both ?

> In fact, I don't see where RDMA_NODE_IB_SWITCH is used by any upstream device.  

While not upstream, there are at least 2 vendors with one or more switch
device drivers using the upstream stack.

> So I don't think there's any obligation to keep it.  

In kernel space, we can get rid of it but it's exposed by verbs and
currently relied upon in user space in a number of places.

There's one kernel place that needs more than just cap_is_switch_smi().

> But even if we do, I'm not sure this is the correct approach.  I don't know this for a fact, 
> but it seems more likely that someone would embed Linux on an IB switch than they would plug an IB switch 
> into a Linux based system.  The code is designed around the latter.  Making this a system wide setting might simplify the code and optimize the code paths.

I think we need to discuss how user space would be addressed.

-- Hal

> - Sean
Hefty, Sean June 11, 2015, 11:52 p.m. UTC | #9
> > I agree that the node type enum isn't particularly useful and should be
> > retired.
> 
> Are you referring to kernel space or user space or both ?

Short term, kernel space.  User space needs to keep something around for backwards compatibility.

But the in tree code will never expose this value up.

> > But even if we do, I'm not sure this is the correct approach.  I don't
> > know this for a fact, but it seems more likely that someone would embed
> > Linux on an IB switch than they would plug an IB switch into a Linux
> > based system.  The code is designed around the latter.  Making this a
> > system wide setting might simplify the code and optimize the code paths.
> 
> I think we need to discuss how user space would be addressed.

This is an issue with out of tree drivers.  We're having to guess what things might be doing.  Are all devices being exposed up as a 'switch', or is there ever a case where there's a 'switch' device and an HCA device being reported together, or (highly unlikely) a switch device and an RNIC?

If the real use case is to embed Linux on a switch, then we could look at making that a system wide setting, rather than per device.  This could clean up the kernel without impacting the uABI.
Hal Rosenstock June 12, 2015, 12:22 a.m. UTC | #10
On 6/11/2015 7:52 PM, Hefty, Sean wrote:
>>> I agree that the node type enum isn't particularly useful and should be
>> retired.
>>
>> Are you referring to kernel space or user space or both ?
> 
> Short term, kernel space.  User space needs to keep something around for backwards compatibility.
> 
> But the in tree code will never expose this value up.
> 
>>> But even if we do, I'm not sure this is the correct approach.  I don't
>> know this for a fact,
>>> but it seems more likely that someone would embed Linux on an IB switch
>> than they would plug an IB switch
>>> into a Linux based system.  The code is designed around the latter.
>> Making this a system wide setting might simplify the code and optimize the
>> code paths.
>>
>> I think we need to discuss how user space would be addressed.
> 
> This is an issue with out of tree drivers.  We're having to guess what things might be doing.  
> Are all devices being exposed up as a 'switch', or is there ever a case where there's a 'switch' device 
> and an HCA device being reported together, or (highly unlikely) a switch device and an RNIC?

Gateways are comprised of switch + HCA devices. There are other more
complex cases of multiple devices.

> If the real use case is to embed Linux on a switch, then we could look at making that a system wide setting, 
> rather than per device.  This could clean up the kernel without impacting the uABI.

I think that system wide is too limiting and it needs to be on a per
device basis.
Doug Ledford June 12, 2015, 2:23 p.m. UTC | #11
On 06/11/2015 02:27 PM, Liran Liss wrote:
>> From: Doug Ledford [mailto:dledford@redhat.com]
> 
>>>>> OPA cannot impersonate IB; OPA node and link types have to be
>>>>> designated as such.  In terms of MAD processing flows, both
>>>>> explicit (as in the handle_opa_smi() call below) and implicit code
>>>>> paths (which share IB flows - there are several cases) must make
>>>>> this distinction.
>>>>
>>>> As far as in the kernel is concerned, the individual capability bits
>>>> are much more important.  I would actually like to do away with the
>>>> node_type variable from struct ib_device eventually.  As for user
>>>> space,
> 
> We agreed on the concept of capability bits for the sake of simplifying code sharing.
> That is OK.
> 
> But the node_type stands for more than just an abstract RDMA device:
> > In IB, it designates an instance of an industry-standard, well-defined, device type: its possible link types, transport, semantics, management, everything.
> It *should* be exposed to user-space so apps that know and care what they are running on could continue to work.

I'm sorry, but your argument here is not very convincing at all.  And
it's somewhat hypocritical.  When RoCE was first introduced, the *exact*
same argument could be used to argue for why RoCE should require a new
node_type.  Except then, because RoCE was your own, you argued for, and
got, an expansion of the IB node_type definition that now included a
relevant link_layer attribute that apps never needed to care about
before.  However, now you are a victim of your own success.  You set the
standard then that if the new device can properly emulate an IB Verbs/IB
Link Layer device in terms of A) supported primitives (iWARP and usNIC
both fail here, and hence why they have their own node_types) and B)
queue pair creation process modulo link layer specific addressing
attributes, then that device qualifies to use the IB_CA node_type and
merely needs only a link_layer attribute to differentiate it.

The new OPA stuff appears to be following *exactly* the same development
model/path that RoCE did.  When RoCE was introduced, all the apps that
really cared about low level addressing on the link layer had to be
modified to encompass the new link type.  This is simply link_layer
number three for apps to care about.

> The place for abstraction is in the rdmacm/CMA, which serves applications that just
> want some RDMA functionality regardless of the underlying technology.
> 
>>>
>>> All SMI code has different behavior if it is running on a switch or
>>> HCA, so testing for 'switchyness' is very appropriate here.
>>
>> Sure...
>>
>>> cap_is_switch_smi would be a nice refinement to let us drop nodetype.
>>
>> Exactly, we need a bit added to the immutable data bits, and a new cap_
>> helper, and then nodetype is ready to be retired.  Add a bit, drop a
>> u8 ;-)
>>
> 
> This is indeed a viable solution.
> 
>>> I don't have a problem with sharing the IBA constant names for MAD
>>> structures (like RDMA_NODE_IB_SWITCH) between IB and OPA code. They
>>> already share the structure layouts/etc.
>>>
> 
> The node type is reflected to user-space, which, as I mentioned above, is important.
> Abusing this enumeration is misleading, even in the kernel.
> Jason's proposal for a 'cap_is_switch_smi' is more readable, and directly in line with
> the explicit capability approach that we discussed.
Liran Liss June 14, 2015, 7:16 p.m. UTC | #12
> From: Doug Ledford [mailto:dledford@redhat.com]
> 
> > But the node_type stands for more than just an abstract RDMA device:
> > In IB, it designates an instance of an industry-standard, well-defined,
> > device type: its possible link types, transport, semantics, management,
> > everything.
> > It *should* be exposed to user-space so apps that know and care what
> > they are running on could continue to work.
> 
> I'm sorry, but your argument here is not very convincing at all.  And
> it's somewhat hypocritical.  When RoCE was first introduced, the *exact*
> same argument could be used to argue for why RoCE should require a new
> node_type.  Except then, because RoCE was your own, you argued for, and
> got, an expansion of the IB node_type definition that now included a
> relevant link_layer attribute that apps never needed to care about
> before.  However, now you are a victim of your own success.  You set the
> standard then that if the new device can properly emulate an IB Verbs/IB
> Link Layer device in terms of A) supported primitives (iWARP and usNIC
> both fail here, and hence why they have their own node_types) and B)
> queue pair creation process modulo link layer specific addressing
> attributes, then that device qualifies to use the IB_CA node_type and
> merely needs only a link_layer attribute to differentiate it.

No. RoCE is an open standard from the IBTA with the exact same RDMA protocol semantics as InfiniBand and a clear set of compliancy rules without which an implementation can't claim to be such. A RoCE device *is* an IB CA with an Ethernet link.
In contrast, OPA is a proprietary protocol. We don't know what primitives are supported, and whether the semantics of supported primitives are the same as in InfiniBand.

> The new OPA stuff appears to be following *exactly* the same development
> model/path that RoCE did.  When RoCE was introduced, all the apps that
> really cared about low level addressing on the link layer had to be
> modified to encompass the new link type.  This is simply link_layer
> number three for apps to care about.

You are missing my point. API transparency is not a synonym for full semantic equivalence.  The Node Type doesn't indicate level of adherence to an API. Node Type indicates compliancy to a specification (e.g. wire protocol, remote order of execution, error semantics, architectural limitations, etc). The IBTA CA and Switch Node Types belong to devices that are compliant to the corresponding specifications from the InfiniBand Trade Association.  And that doesn't prevent applications to choose to be coded to run over nodes of different Node Type as it happens today with IB/RoCE and iWARP.

This has nothing to do with addressing.
Doug Ledford June 15, 2015, 5:39 a.m. UTC | #13
On 06/14/2015 03:16 PM, Liran Liss wrote:
>> From: Doug Ledford [mailto:dledford@redhat.com]
> 
>>> But the node_type stands for more than just an abstract RDMA device:
>>> In IB, it designates an instance of an industry-standard, well-defined,
>> device type: its possible link types, transport, semantics, management,
>> everything.
>>> It *should* be exposed to user-space so apps that know and care what
>> they are running on could continue to work.
>>
>> I'm sorry, but your argument here is not very convincing at all.  And
>> it's somewhat hypocritical.  When RoCE was first introduced, the *exact*
>> same argument could be used to argue for why RoCE should require a new
>> node_type.  Except then, because RoCE was your own, you argued for, and
>> got, an expansion of the IB node_type definition that now included a
>> relevant link_layer attribute that apps never needed to care about
>> before.  However, now you are a victim of your own success.  You set the
>> standard then that if the new device can properly emulate an IB Verbs/IB
>> Link Layer device in terms of A) supported primitives (iWARP and usNIC
>> both fail here, and hence why they have their own node_types) and B)
>> queue pair creation process modulo link layer specific addressing
>> attributes, then that device qualifies to use the IB_CA node_type and
>> merely needs only a link_layer attribute to differentiate it.
>>
> 
> No. RoCE is an open standard from the IBTA with the exact same RDMA protocol semantics as InfiniBand and a clear set of compliancy rules without which an implementation can't claim to be such. A RoCE device *is* an IB CA with an Ethernet link.
> In contrast, OPA is a proprietary protocol. We don't know what primitives are supported, and whether the semantics of supported primitives are the same as in InfiniBand.

Intel has stated on this list that they intend for RDMA apps to run on
OPA transparently.  That pretty much implies the list of primitives and
everything else that they must support.  However, time will tell if they
succeeded or not.

>> The new OPA stuff appears to be following *exactly* the same development
>> model/path that RoCE did.  When RoCE was introduced, all the apps that
>> really cared about low level addressing on the link layer had to be
>> modified to encompass the new link type.  This is simply link_layer
>> number three for apps to care about.
>>
> 
> You are missing my point. API transparency is not a synonym for full semantic equivalence.  The Node Type doesn’t indicate level of adherence to an API. Node Type indicates compliancy to a  specification (e.g. wire protocol, remote order of execution, error semantics, architectural limitations, etc). The IBTA CA and Switch Node Types belong to devices that are compliant to the corresponding specifications from the InfiniBand Trade Association.  And that doesn’t prevent applications to choose to be coded to run over nodes of different Node Type as it happens today with IB/RoCE and iWARP.
> 
> This has nothing to do with addressing.

And whether you like it or not, Intel is intentionally creating a
device/fabric with the specific intention of mimicking the IB_CA device
type (with stated exceptions for MAD packets and addresses).  They
obviously won't have certification as an IB_CA, but that's not their
aim.  Their aim is to be a functional drop in replacement that apps
don't need to know about except for the stated exceptions.

And I'm not missing your point.  Your point is inappropriate.  You're
trying to conflate certification with a functional API.  The IB_CA node
type is not an official certification of anything, and the linux kernel
is not an official certifying body for anything.  If you want
certification, you go to the OFA and the UNH-IOL testing program.
There, you have the rights to the certification branding logo and you
have the right to deny access to that logo to anyone that doesn't meet
the branding requirements.

You're right that apps can be coded to other CA types, like RNICs and
USNICs.  However, those are all very different from an IB_CA due to
limited queue pair types or limited primitives.  If OPA had that same
limitation then I would agree it needs a different node type.

So this will be my litmus test.  Currently, an app that supports all of
the RDMA types looks like this:

if (node_type == RNIC)
	do iwarpy stuff
else if (node_type == USNIC)
	do USNIC stuff
else if (node_type == IB_CA)
	do IB verbs stuff
	if (link_layer == Ethernet)
		do RoCE addressing/management
	else
		do IB addressing/management



If, in the end, apps that are modified to support OPA end up looking
like this:

if (node_type == RNIC)
	do iwarpy stuff
else if (node_type == USNIC)
	do USNIC stuff
else if (node_type == IB_CA || node_type == OPA_CA)
	do IB verbs stuff
	if (node_type == OPA_CA)
		do OPA addressing/management
	else if (link_layer == Ethernet)
		do RoCE addressing/management
	else
		do IB addressing/management

where you can plainly see that the exact same goal can be accomplished
whether you have an OPA node_type or an IB_CA node_type + OPA
link_layer, then I will be fine with either a new node_type or a new
link_layer.  They will be functionally equivalent as far as I'm concerned.
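
For reference, the first form of this litmus test rendered against the user-space verbs API.  IBV_LINK_LAYER_OPA is a hypothetical constant standing in for the proposed link_layer variant; everything else is the existing libibverbs interface:

#include <infiniband/verbs.h>
#include <stdint.h>

#define IBV_LINK_LAYER_OPA 3	/* hypothetical, not in libibverbs */

static void setup_transport(struct ibv_device *dev,
			    struct ibv_context *ctx, uint8_t port)
{
	struct ibv_port_attr attr;

	if (ibv_query_port(ctx, port, &attr))
		return;

	switch (dev->node_type) {
	case IBV_NODE_RNIC:
		/* do iwarpy stuff */
		break;
	case IBV_NODE_USNIC:
		/* do USNIC stuff */
		break;
	case IBV_NODE_CA:
		/* do IB verbs stuff */
		if (attr.link_layer == IBV_LINK_LAYER_OPA)
			/* do OPA addressing/management */;
		else if (attr.link_layer == IBV_LINK_LAYER_ETHERNET)
			/* do RoCE addressing/management */;
		else
			/* do IB addressing/management */;
		break;
	default:
		break;
	}
}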
Liran Liss June 16, 2015, 9:05 p.m. UTC | #14
> From: Doug Ledford [mailto:dledford@redhat.com]
> 
> > No. RoCE is an open standard from the IBTA with the exact same RDMA
> > protocol semantics as InfiniBand and a clear set of compliancy rules
> > without which an implementation can't claim to be such. A RoCE device
> > *is* an IB CA with an Ethernet link.
> > In contrast, OPA is a proprietary protocol. We don't know what primitives
> > are supported, and whether the semantics of supported primitives are the
> > same as in InfiniBand.
> 
> Intel has stated on this list that they intend for RDMA apps to run on
> OPA transparently.  That pretty much implies the list of primitives and
> everything else that they must support.  However, time will tell if they
> succeeded or not.

I am sorry, but that's not good enough.
When I see an IB device, I know exactly what to expect. I can't say anything regarding an OPA device.

It might be that today the semantics are "close enough".
But in the future, both feature sets and semantics may diverge considerably.
What are you going to do then?

In addition, today, the host admin knows that 2 IB CA nodes will always interoperate. If you share the node type with OPA, everything breaks down. There is no way of knowing which devices work with which.

> >> The new OPA stuff appears to be following *exactly* the same development
> >> model/path that RoCE did.  When RoCE was introduced, all the apps that
> >> really cared about low level addressing on the link layer had to be
> >> modified to encompass the new link type.  This is simply link_layer
> >> number three for apps to care about.
> >
> > You are missing my point. API transparency is not a synonym for full
> > semantic equivalence.  The Node Type doesn't indicate level of adherence to
> > an API. Node Type indicates compliancy to a specification (e.g. wire protocol,
> > remote order of execution, error semantics, architectural limitations, etc).
> > The IBTA CA and Switch Node Types belong to devices that are compliant to
> > the corresponding specifications from the InfiniBand Trade Association.  And
> > that doesn't prevent applications to choose to be coded to run over nodes of
> > different Node Type as it happens today with IB/RoCE and iWARP.
> >
> > This has nothing to do with addressing.
> 
> And whether you like it or not, Intel is intentionally creating a
> device/fabric with the specific intention of mimicking the IB_CA device
> type (with stated exceptions for MAD packets and addresses).  They
> obviously won't have certification as an IB_CA, but that's not their
> aim.  Their aim is to be a functional drop in replacement that apps
> don't need to know about except for the stated exceptions.

Intentions are nice, but there is no way to define these "stated exceptions" apart from a specification.

> And I'm not missing your point.  Your point is inappropriate.  You're
> trying to conflate certification with a functional API.  The IB_CA node
> type is not an official certification of anything, and the linux kernel
> is not an official certifying body for anything.  If you want
> certification, you go to the OFA and the UNH-IOL testing program.
> There, you have the rights to the certification branding logo and you
> have the right to deny access to that logo to anyone that doesn't meet
> the branding requirements.

Who said anything about certification?
I am talking about present and future semantic compliance to what an IB CA stands for, and interoperability guarantees.

ib_verbs defines an *extensive* direct HW access API, which is constantly evolving.
You cannot describe the intricate object relations and semantics through an API.
In addition, you can't abstract anything or fix stuff in SW.
The only way to *truly* know what to expect when performing Verbs calls is to check the node type.

ib_verbs was never only an API. It started as the Linux implementation of the IBTA standard, with guaranteed semantics and wire protocol.
Later, the interface was reused to support additional RDMA devices. However, you could *always* check the node type if you wanted to, thereby retaining the standard guarantees. Win-win situation...

This is a very strong property; we should not give up on it.

> You're right that apps can be coded to other CA types, like RNICs and
> USNICs.  However, those are all very different from an IB_CA due to
> limited queue pair types or limited primitives.  If OPA had that same
> limitation then I would agree it needs a different node type.

How do you know that it doesn't?
Have you seen the OPA specification?

> So this will be my litmus test.  Currently, an app that supports all of
> the RDMA types looks like this:
> 
> if (node_type == RNIC)
> 	do iwarpy stuff
> else if (node_type == USNIC)
> 	do USNIC stuff
> else if (node_type == IB_CA)
> 	do IB verbs stuff
> 	if (link_layer == Ethernet)
> 		do RoCE addressing/management
> 	else
> 		do IB addressing/management
> 
> If, in the end, apps that are modified to support OPA end up looking
> like this:
> 
> if (node_type == RNIC)
> 	do iwarpy stuff
> else if (node_type == USNIC)
> 	do USNIC stuff
> else if (node_type == IB_CA || node_type == OPA_CA)
> 	do IB verbs stuff
> 	if (node_type == OPA_CA)
> 		do OPA addressing/management
> 	else if (link_layer == Ethernet)
> 		do RoCE addressing/management
> 	else
> 		do IB addressing/management
> 
> where you can plainly see that the exact same goal can be accomplished
> whether you have an OPA node_type or an IB_CA node_type + OPA
> link_layer, then I will be fine with either a new node_type or a new
> link_layer.  They will be functionally equivalent as far as I'm concerned.

It is true that for some applications, your abstraction might work transparently.
But for other applications, your "do IB verbs stuff" (and not just the addressing/management) will either break today or break tomorrow.

This is bad both for IB and for OPA.

Why on earth are we putting ourselves into a position which could easily be avoided in the first place?

The solution is simple:
- As an API, Verbs will support IB/RoCE, iWARP, USNIC, and OPA
- The node type and link type refer to specific technologies
-- Most applications indeed don't care and don't check either of these properties anyway
-- Those that do, do it for a good reason; don't break them
- Management helpers will do a good job to keep the code maintainable and efficient even if OPA and IB have different node types

Win-win situation...
--Liran
Hefty, Sean June 16, 2015, 10:12 p.m. UTC | #15
> You're right that apps can be coded to other CA types, like RNICs and
> USNICs.  However, those are all very different from an IB_CA due to
> limited queue pair types or limited primitives.  If OPA had that same
> limitation then I would agree it needs a different node type.
> 
> So this will be my litmus test.  Currently, an app that supports all of
> the RDMA types looks like this:
> 
> if (node_type == RNIC)
> 	do iwarpy stuff
> else if (node_type == USNIC)
> 	do USNIC stuff
> else if (node_type == IB_CA)
> 	do IB verbs stuff
> 	if (link_layer == Ethernet)
> 		do RoCE addressing/management
> 	else
> 		do IB addressing/management

The node type values were originally defined to align with the IB management NodeInfo structure.  AFAICT, there was no intent to associate those values with specific functionality or addressing or verbs support or anything else, really, outside of what IB management needed.

iWarp added a new node type, so that the IB management code could ignore those devices.  RoCE basically broke this association by forcing additional checks in the local management code to also check against the link layer.  The recent mad capability bits are a superior solution, making the node type obsolete. 

At this point, the node type essentially indicates if we start counting ports at a numeric value 0 or 1.  The NodeType that an OPA channel adapter will report in a NodeInfo structure will be 1, the same value as if it were an IB channel adapter.

In the end, this argument matters not one iota.  The kernel code barely relies on the node type, and a user space verbs provider can report whatever value it wants.

- Sean
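
The port-numbering point is concrete: a switch enumerates ports from 0 (port 0 being the management port) while CAs start at 1.  A minimal sketch, mirroring the rdma_start_port() helper in include/rdma/ib_verbs.h at the time:

#include <stdint.h>

enum { RDMA_NODE_IB_CA = 1, RDMA_NODE_IB_SWITCH = 2 };	/* as in ib_verbs.h */

struct ib_device_stub {
	uint8_t node_type;	/* stand-in for struct ib_device */
};

/* The first valid port index is essentially the only behavior the core
 * still derives from node_type. */
static uint8_t rdma_start_port(const struct ib_device_stub *device)
{
	return device->node_type == RDMA_NODE_IB_SWITCH ? 0 : 1;
}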
Ira Weiny June 17, 2015, 2:03 p.m. UTC | #16
> ib_verbs defines an *extensive* direct HW access API, which is constantly
> evolving.

This is the problem with verbs...

> You cannot describe the intricate object relations and semantics through an
> API.
> In addition, you can't abstract anything or fix stuff in SW.
> The only way to *truly* know what to expect when performing Verbs calls is to
> check the node type.

How can you say this?

mthca, mlx4, mlx5, and qib all have different sets of functionality... all with the same node type.  OPA has the same set as qib...  same node type.

> ib_verbs was never only an API. It started as the Linux implementation of the
> IBTA standard, with guaranteed semantics and wire protocol.
> Later, the interface was reused to support additional RDMA devices. However,
> you could *always* check the node type if you wanted to, thereby retaining the
> standard guarantees. Win-win situation...

Not true at all.  For example, qib does not support XRC and yet has the same node type as mlx4 (5)...

> This is a very strong property; we should not give up on it.

On the contrary, the property is weak and implies functionality or lack of functionality rather than being explicit.  This was done because getting changes to kernel ABIs was hard and we took a shortcut with node type which we should not have.  OPA attempts to stop this madness and supports the functionality of verbs _As_ _Defined_ rather than creating yet another set of things which applications need to check against.

> > You're right that apps can be coded to other CA types, like RNICs and
> > USNICs.  However, those are all very different from an IB_CA due to
> > limited queue pair types or limited primitives.  If OPA had that same
> > limitation then I would agree it needs a different node type.
> 
> How do you know that it doesn't?

Up until now you have had to take my word for it.  Now that the driver has been posted it should be clear what verbs we support (same as qib).

Ira
Liran Liss June 18, 2015, 8:12 p.m. UTC | #17
> From: Weiny, Ira [mailto:ira.weiny@intel.com]
> 
> > ib_verbs defines an *extensive* direct HW access API, which is constantly
> > evolving.
> 
> This is the problem with verbs...

Huh?
It is its strength, if you don't break backward compatibility...

> > You cannot describe the intricate object relations and semantics through an
> > API.
> > In addition, you can't abstract anything or fix stuff in SW.
> > The only way to *truly* know what to expect when performing Verbs calls is to
> > check the node type.
> 
> How can you say this?
> 
> mthca, mlx4, mlx5, and qib all have different sets of functionality... all with
> the same node type.  OPA has the same set as qib...  same node type.

Only that qib is IB, which is fully interoperable with mlx*

> > ib_verbs was never only an API. It started as the Linux implementation of the
> > IBTA standard, with guaranteed semantics and wire protocol.
> > Later, the interface was reused to support additional RDMA devices. However,
> > you could *always* check the node type if you wanted to, thereby retaining
> > the standard guarantees. Win-win situation...
> 
> Not true at all.  For example, qib does not support XRC and yet has the same
> node type as mlx4 (5)...

The node type is for guaranteeing semantics and interop for the features that you do implement...

> > This is a very strong property; we should not give up on it.
> 
> On the contrary, the property is weak and implies functionality or lack of
> functionality rather than being explicit.  This was done because getting
> changes to kernel ABIs was hard and we took a shortcut with node type
> which we should not have.  OPA attempts to stop this madness and supports
> the functionality of verbs _As_ _Defined_ rather than creating yet another
> set of things which applications need to check against.

I totally agree that we should improve the expressiveness and accuracy of our capabilities;
you don't need OPA for this. Unfortunately, it is not always the case.

Also, there are behaviors that are not defined by the API, but still rely on the node type.
Management applications, for example.

--Liran
Doug Ledford June 18, 2015, 9 p.m. UTC | #18
> On Jun 16, 2015, at 5:05 PM, Liran Liss <liranl@mellanox.com> wrote:
> 
>> From: Doug Ledford [mailto:dledford@redhat.com]
> 
>>> No. RoCE is an open standard from the IBTA with the exact same RDMA
>> protocol semantics as InfiniBand and a clear set of compliancy rules without
>> which an implementation can't claim to be such. A RoCE device *is* an IB CA
>> with an Ethernet link.
>>> In contrast, OPA is a proprietary protocol. We don't know what primitives
>> are supported, and whether the semantics of supported primitives are the
>> same as in InfiniBand.
>> 
>> Intel has stated on this list that they intend for RDMA apps to run on
>> OPA transparently.  That pretty much implies the list of primitives and
>> everything else that they must support.  However, time will tell if they
>> succeeded or not.
>> 
> 
> I am sorry, but that's not good enough.
> When I see an IB device, I know exactly what to expect. I can't say anything regarding an OPA device.
> 
> It might be that today the semantics are "close enough".
> But in the future, both feature sets and semantics may diverge considerably.
> What are you going to do then?
> 
> In addition, today, the host admin knows that 2 IB CA nodes will always interoperate. If you share the node type with OPA, everything breaks down. There is no way of knowing which devices work with which.

You’ve not done yourself any favors with this argument.  You’ve actually stretched yourself into the land of hyperbole and FUD in order to make this.  Do you not see that “2 IB CA nodes will always interoperate” is not true as soon as you consider differing link layer types?  For example, an mlx4_en device will not interoperate with a qib device, yet they are both IB_CA node types.  Conflating allowing an OPA device to be node type IB_CA and link layer OPA to everything breaking down is pure and utter rubbish.  And with that, we are done with this discussion.  I’ve detailed what my litmus test will be, and I’m sticking with exactly that.

In the case of iWARP and usNIC, there are significant differences from an IB_CA that render a program responsible for possibly altering its intended transfer mechanism significantly (for instance usNIC is UD only, iWARP can’t do atomics or immediate data, so any transfer engine design that uses either of those is out of the question).  On the other hand, everything that uses IB_CA supports the various primitives and only vary in their addressing/management.  If OPA stays true to that (and it certainly does so far by supporting the same verbs as qib), then IB_CA/link_layer OPA is perfectly acceptable and in fact preferred due to the fact that it will produce the minimum amount of change in user space applications before they can support the OPA devices.
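
The differences named above are discoverable at runtime from the standard device attributes, without consulting node_type at all; a minimal user-space sketch using the existing libibverbs query:

#include <infiniband/verbs.h>
#include <stdbool.h>

/* Probe the primitives a transfer engine cares about directly instead
 * of inferring them from node_type; e.g. iWARP devices report
 * IBV_ATOMIC_NONE, so an atomics-based design must be avoided there. */
static bool device_supports_atomics(struct ibv_context *ctx)
{
	struct ibv_device_attr attr;

	if (ibv_query_device(ctx, &attr))
		return false;
	return attr.atomic_cap != IBV_ATOMIC_NONE;
}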

>> So this will be my litmus test.  Currently, an app that supports all of
>> the RDMA types looks like this:
>> 
>> if (node_type == RNIC)
>> 	do iwarpy stuff
>> else if (node_type == USNIC)
>> 	do USNIC stuff
>> else if (node_type == IB_CA)
>> 	do IB verbs stuff
>> 	if (link_layer == Ethernet)
>> 		do RoCE addressing/management
>> 	else
>> 		do IB addressing/management
>> 
>> 
>> 
>> If, in the end, apps that are modified to support OPA end up looking
>> like this:
>> 
>> if (node_type == RNIC)
>> 	do iwarpy stuff
>> else if (node_type == USNIC)
>> 	do USNIC stuff
>> else if (node_type == IB_CA || node_type == OPA_CA)
>> 	do IB verbs stuff
>> 	if (node_type == OPA_CA)
>> 		do OPA addressing/management
>> 	else if (link_layer == Ethernet)
>> 		do RoCE addressing/management
>> 	else
>> 		do IB addressing/management
>> 
>> where you can plainly see that the exact same goal can be accomplished
>> whether you have an OPA node_type or an IB_CA node_type + OPA
>> link_layer, then I will be fine with either a new node_type or a new
>> link_layer.  They will be functionally equivalent as far as I'm concerned.
>> 
> 
> It is true that for some applications, your abstraction might work transparently.
> But for other applications, your "do IB verbs stuff" (and not just the addressing/management) will either break today or break tomorrow.

FUD.  Come to me when you have a concrete issue and not hand-wavy scare mongering.

> This is bad both for IB and for OPA.

No, it’s not.

> Why on earth are we putting ourselves into a position which could easily be avoided in the first place?
> 
> The solution is simple:
> - As an API, Verbs will support IB/ROCE, iWARP, USNIC, and OPA

There is *zero* functional difference between node_type == OPA or node_type == IB_CA and link_layer == OPA. An application has *exactly* what they need to do everything you have mentioned.  It changes the test the application makes, but not what the application does.

> - The node type and link type refer to specific technologies

Yes, and Intel has made it clear that they are copying IB Verbs as a technology.  It is common that the one being copied be pissed off by the copying, but it is also common that they can’t do a damn thing about it.

> -- Most applications indeed don't care and don't check either of these properties anyway
> -- Those that do, do it for a good reason; don't break them
> - Management helpers will do a good job to keep the code maintainable and efficient even if OPA and IB have different node types
> 
> Win-win situation...
> --Liran

--
Doug Ledford <dledford@redhat.com>
	GPG Key ID: 0E572FDD
Hal Rosenstock June 19, 2015, 11:53 a.m. UTC | #19
On 6/18/2015 5:00 PM, Doug Ledford wrote:
> There is *zero* functional difference between node_type == OPA or node_type == IB_CA and link_layer == OPA. 
> An application has *exactly* what they need

We have neither of these things in the kernel today. Also, if I interpreted what was written by first Ira and more recently Sean, even if either of these things were done, the user space provider library for OPA might/could just change these back to the IB types.

-- Hal

Patch

diff --git a/drivers/infiniband/core/agent.c b/drivers/infiniband/core/agent.c
index 6c420736ce93..c7dcfe4ca5f1 100644
--- a/drivers/infiniband/core/agent.c
+++ b/drivers/infiniband/core/agent.c
@@ -80,7 +80,7 @@  ib_get_agent_port(const struct ib_device *device, int port_num)
 
 void agent_send_response(const struct ib_mad_hdr *mad_hdr, const struct ib_grh *grh,
 			 const struct ib_wc *wc, const struct ib_device *device,
-			 int port_num, int qpn, size_t resp_mad_len)
+			 int port_num, int qpn, size_t resp_mad_len, bool opa)
 {
 	struct ib_agent_port_private *port_priv;
 	struct ib_mad_agent *agent;
@@ -106,11 +106,14 @@  void agent_send_response(const struct ib_mad_hdr *mad_hdr, const struct ib_grh *
 		return;
 	}
 
+	if (opa && mad_hdr->base_version != OPA_MGMT_BASE_VERSION)
+		resp_mad_len = IB_MGMT_MAD_SIZE;
+
 	send_buf = ib_create_send_mad(agent, wc->src_qp, wc->pkey_index, 0,
 				      IB_MGMT_MAD_HDR,
 				      resp_mad_len - IB_MGMT_MAD_HDR,
 				      GFP_KERNEL,
-				      IB_MGMT_BASE_VERSION);
+				      mad_hdr->base_version);
 	if (IS_ERR(send_buf)) {
 		dev_err(&device->dev, "ib_create_send_mad error\n");
 		goto err1;
diff --git a/drivers/infiniband/core/agent.h b/drivers/infiniband/core/agent.h
index 234c8aa380e0..65f92bedae44 100644
--- a/drivers/infiniband/core/agent.h
+++ b/drivers/infiniband/core/agent.h
@@ -46,6 +46,6 @@  extern int ib_agent_port_close(struct ib_device *device, int port_num);
 
 extern void agent_send_response(const struct ib_mad_hdr *mad_hdr, const struct ib_grh *grh,
 				const struct ib_wc *wc, const struct ib_device *device,
-				int port_num, int qpn, size_t resp_mad_len);
+				int port_num, int qpn, size_t resp_mad_len, bool opa);
 
 #endif	/* __AGENT_H_ */
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index 81acde8b5e90..17c90519c03e 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -3,6 +3,7 @@ 
  * Copyright (c) 2005 Intel Corporation.  All rights reserved.
  * Copyright (c) 2005 Mellanox Technologies Ltd.  All rights reserved.
  * Copyright (c) 2009 HNR Consulting. All rights reserved.
+ * Copyright (c) 2014 Intel Corporation.  All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -44,6 +45,7 @@ 
 #include "mad_priv.h"
 #include "mad_rmpp.h"
 #include "smi.h"
+#include "opa_smi.h"
 #include "agent.h"
 
 MODULE_LICENSE("Dual BSD/GPL");
@@ -751,6 +753,7 @@  static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
 {
 	int ret = 0;
 	struct ib_smp *smp = mad_send_wr->send_buf.mad;
+	struct opa_smp *opa_smp = (struct opa_smp *)smp;
 	unsigned long flags;
 	struct ib_mad_local_private *local;
 	struct ib_mad_private *mad_priv;
@@ -762,6 +765,9 @@  static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
 	struct ib_send_wr *send_wr = &mad_send_wr->send_wr;
 	size_t mad_size = port_mad_size(mad_agent_priv->qp_info->port_priv);
 	u16 out_mad_pkey_index = 0;
+	u16 drslid;
+	bool opa = rdma_cap_opa_mad(mad_agent_priv->qp_info->port_priv->device,
+				    mad_agent_priv->qp_info->port_priv->port_num);
 
 	if (device->node_type == RDMA_NODE_IB_SWITCH &&
 	    smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
@@ -775,19 +781,48 @@  static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
 	 * If we are at the start of the LID routed part, don't update the
 	 * hop_ptr or hop_cnt.  See section 14.2.2, Vol 1 IB spec.
 	 */
-	if ((ib_get_smp_direction(smp) ? smp->dr_dlid : smp->dr_slid) ==
-	     IB_LID_PERMISSIVE &&
-	     smi_handle_dr_smp_send(smp, device->node_type, port_num) ==
-	     IB_SMI_DISCARD) {
-		ret = -EINVAL;
-		dev_err(&device->dev, "Invalid directed route\n");
-		goto out;
-	}
+	if (opa && smp->class_version == OPA_SMP_CLASS_VERSION) {
+		u32 opa_drslid;
+
+		if ((opa_get_smp_direction(opa_smp)
+		     ? opa_smp->route.dr.dr_dlid : opa_smp->route.dr.dr_slid) ==
+		     OPA_LID_PERMISSIVE &&
+		     opa_smi_handle_dr_smp_send(opa_smp, device->node_type,
+						port_num) == IB_SMI_DISCARD) {
+			ret = -EINVAL;
+			dev_err(&device->dev, "OPA Invalid directed route\n");
+			goto out;
+		}
+		opa_drslid = be32_to_cpu(opa_smp->route.dr.dr_slid);
+		if (opa_drslid != OPA_LID_PERMISSIVE &&
+		    opa_drslid & 0xffff0000) {
+			ret = -EINVAL;
+			dev_err(&device->dev, "OPA Invalid dr_slid 0x%x\n",
+			       opa_drslid);
+			goto out;
+		}
+		drslid = (u16)(opa_drslid & 0x0000ffff);
 
-	/* Check to post send on QP or process locally */
-	if (smi_check_local_smp(smp, device) == IB_SMI_DISCARD &&
-	    smi_check_local_returning_smp(smp, device) == IB_SMI_DISCARD)
-		goto out;
+		/* Check to post send on QP or process locally */
+		if (opa_smi_check_local_smp(opa_smp, device) == IB_SMI_DISCARD &&
+		    opa_smi_check_local_returning_smp(opa_smp, device) == IB_SMI_DISCARD)
+			goto out;
+	} else {
+		if ((ib_get_smp_direction(smp) ? smp->dr_dlid : smp->dr_slid) ==
+		     IB_LID_PERMISSIVE &&
+		     smi_handle_dr_smp_send(smp, device->node_type, port_num) ==
+		     IB_SMI_DISCARD) {
+			ret = -EINVAL;
+			dev_err(&device->dev, "Invalid directed route\n");
+			goto out;
+		}
+		drslid = be16_to_cpu(smp->dr_slid);
+
+		/* Check to post send on QP or process locally */
+		if (smi_check_local_smp(smp, device) == IB_SMI_DISCARD &&
+		    smi_check_local_returning_smp(smp, device) == IB_SMI_DISCARD)
+			goto out;
+	}
 
 	local = kmalloc(sizeof *local, GFP_ATOMIC);
 	if (!local) {
@@ -806,10 +841,16 @@  static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
 	}
 
 	build_smp_wc(mad_agent_priv->agent.qp,
-		     send_wr->wr_id, be16_to_cpu(smp->dr_slid),
+		     send_wr->wr_id, drslid,
 		     send_wr->wr.ud.pkey_index,
 		     send_wr->wr.ud.port_num, &mad_wc);
 
+	if (opa && smp->base_version == OPA_MGMT_BASE_VERSION) {
+		mad_wc.byte_len = mad_send_wr->send_buf.hdr_len
+					+ mad_send_wr->send_buf.data_len
+					+ sizeof(struct ib_grh);
+	}
+
 	/* No GRH for DR SMP */
 	ret = device->process_mad(device, 0, port_num, &mad_wc, NULL,
 				  (const struct ib_mad_hdr *)smp, mad_size,
@@ -861,6 +902,10 @@  static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
 	}
 
 	local->mad_send_wr = mad_send_wr;
+	if (opa) {
+		local->mad_send_wr->send_wr.wr.ud.pkey_index = out_mad_pkey_index;
+		local->return_wc_byte_len = mad_size;
+	}
 	/* Reference MAD agent until send side of local completion handled */
 	atomic_inc(&mad_agent_priv->refcount);
 	/* Queue local completion to local list */
@@ -1754,14 +1799,18 @@  out:
 	return mad_agent;
 }
 
-static int validate_mad(const struct ib_mad_hdr *mad_hdr, u32 qp_num)
+static int validate_mad(const struct ib_mad_hdr *mad_hdr,
+			const struct ib_mad_qp_info *qp_info,
+			bool opa)
 {
 	int valid = 0;
+	u32 qp_num = qp_info->qp->qp_num;
 
 	/* Make sure MAD base version is understood */
-	if (mad_hdr->base_version != IB_MGMT_BASE_VERSION) {
-		pr_err("MAD received with unsupported base version %d\n",
-			mad_hdr->base_version);
+	if (mad_hdr->base_version != IB_MGMT_BASE_VERSION &&
+	    (!opa || mad_hdr->base_version != OPA_MGMT_BASE_VERSION)) {
+		pr_err("MAD received with unsupported base version %d %s\n",
+		       mad_hdr->base_version, opa ? "(opa)" : "");
 		goto out;
 	}
 
@@ -2011,7 +2060,8 @@  static enum smi_action handle_ib_smi(const struct ib_mad_port_private *port_priv
 				    port_priv->device,
 				    smi_get_fwd_port(smp),
 				    qp_info->qp->qp_num,
-				    response->mad_size);
+				    response->mad_size,
+				    false);
 
 		return IB_SMI_DISCARD;
 	}
@@ -2019,7 +2069,8 @@  static enum smi_action handle_ib_smi(const struct ib_mad_port_private *port_priv
 }
 
 static bool generate_unmatched_resp(const struct ib_mad_private *recv,
-				    struct ib_mad_private *response)
+				    struct ib_mad_private *response,
+				    size_t *resp_len, bool opa)
 {
 	const struct ib_mad_hdr *recv_hdr = (const struct ib_mad_hdr *)recv->mad;
 	struct ib_mad_hdr *resp_hdr = (struct ib_mad_hdr *)response->mad;
@@ -2035,11 +2086,96 @@  static bool generate_unmatched_resp(const struct ib_mad_private *recv,
 		if (recv_hdr->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
 			resp_hdr->status |= IB_SMP_DIRECTION;
 
+		if (opa && recv_hdr->base_version == OPA_MGMT_BASE_VERSION) {
+			if (recv_hdr->mgmt_class ==
+			    IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+			    recv_hdr->mgmt_class ==
+			    IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+				*resp_len = opa_get_smp_header_size(
+							(struct opa_smp *)recv->mad);
+			else
+				*resp_len = sizeof(struct ib_mad_hdr);
+		}
+
 		return true;
 	} else {
 		return false;
 	}
 }
+
+static enum smi_action
+handle_opa_smi(struct ib_mad_port_private *port_priv,
+	       struct ib_mad_qp_info *qp_info,
+	       struct ib_wc *wc,
+	       int port_num,
+	       struct ib_mad_private *recv,
+	       struct ib_mad_private *response)
+{
+	enum smi_forward_action retsmi;
+	struct opa_smp *smp = (struct opa_smp *)recv->mad;
+
+	if (opa_smi_handle_dr_smp_recv(smp,
+				   port_priv->device->node_type,
+				   port_num,
+				   port_priv->device->phys_port_cnt) ==
+				   IB_SMI_DISCARD)
+		return IB_SMI_DISCARD;
+
+	retsmi = opa_smi_check_forward_dr_smp(smp);
+	if (retsmi == IB_SMI_LOCAL)
+		return IB_SMI_HANDLE;
+
+	if (retsmi == IB_SMI_SEND) { /* don't forward */
+		if (opa_smi_handle_dr_smp_send(smp,
+					   port_priv->device->node_type,
+					   port_num) == IB_SMI_DISCARD)
+			return IB_SMI_DISCARD;
+
+		if (opa_smi_check_local_smp(smp, port_priv->device) ==
+		    IB_SMI_DISCARD)
+			return IB_SMI_DISCARD;
+
+	} else if (port_priv->device->node_type == RDMA_NODE_IB_SWITCH) {
+		/* forward case for switches */
+		memcpy(response, recv, mad_priv_size(response));
+		response->header.recv_wc.wc = &response->header.wc;
+		response->header.recv_wc.recv_buf.opa_mad =
+				(struct opa_mad *)response->mad;
+		response->header.recv_wc.recv_buf.grh = &response->grh;
+
+		agent_send_response((const struct ib_mad_hdr *)response->mad,
+				    &response->grh, wc,
+				    port_priv->device,
+				    opa_smi_get_fwd_port(smp),
+				    qp_info->qp->qp_num,
+				    recv->header.wc.byte_len,
+				    true);
+
+		return IB_SMI_DISCARD;
+	}
+
+	return IB_SMI_HANDLE;
+}
+
+static enum smi_action
+handle_smi(struct ib_mad_port_private *port_priv,
+	   struct ib_mad_qp_info *qp_info,
+	   struct ib_wc *wc,
+	   int port_num,
+	   struct ib_mad_private *recv,
+	   struct ib_mad_private *response,
+	   bool opa)
+{
+	struct ib_mad_hdr *mad_hdr = (struct ib_mad_hdr *)recv->mad;
+
+	if (opa && mad_hdr->base_version == OPA_MGMT_BASE_VERSION &&
+	    mad_hdr->class_version == OPA_SMI_CLASS_VERSION)
+		return handle_opa_smi(port_priv, qp_info, wc, port_num, recv,
+				      response);
+
+	return handle_ib_smi(port_priv, qp_info, wc, port_num, recv, response);
+}
+
 static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv,
 				     struct ib_wc *wc)
 {
@@ -2052,11 +2188,15 @@  static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv,
 	int ret = IB_MAD_RESULT_SUCCESS;
 	size_t mad_size;
 	u16 resp_mad_pkey_index = 0;
+	bool opa;
 
 	mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id;
 	qp_info = mad_list->mad_queue->qp_info;
 	dequeue_mad(mad_list);
 
+	opa = rdma_cap_opa_mad(qp_info->port_priv->device,
+			       qp_info->port_priv->port_num);
+
 	mad_priv_hdr = container_of(mad_list, struct ib_mad_private_header,
 				    mad_list);
 	recv = container_of(mad_priv_hdr, struct ib_mad_private, header);
@@ -2068,7 +2208,15 @@  static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv,
 	/* Setup MAD receive work completion from "normal" work completion */
 	recv->header.wc = *wc;
 	recv->header.recv_wc.wc = &recv->header.wc;
-	recv->header.recv_wc.mad_len = sizeof(struct ib_mad);
+
+	if (opa && ((struct ib_mad_hdr *)(recv->mad))->base_version == OPA_MGMT_BASE_VERSION) {
+		recv->header.recv_wc.mad_len = wc->byte_len - sizeof(struct ib_grh);
+		recv->header.recv_wc.mad_seg_size = sizeof(struct opa_mad);
+	} else {
+		recv->header.recv_wc.mad_len = sizeof(struct ib_mad);
+		recv->header.recv_wc.mad_seg_size = sizeof(struct ib_mad);
+	}
+
 	recv->header.recv_wc.recv_buf.mad = (struct ib_mad *)recv->mad;
 	recv->header.recv_wc.recv_buf.grh = &recv->grh;
 
@@ -2076,7 +2224,7 @@  static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv,
 		snoop_recv(qp_info, &recv->header.recv_wc, IB_MAD_SNOOP_RECVS);
 
 	/* Validate MAD */
-	if (!validate_mad((const struct ib_mad_hdr *)recv->mad, qp_info->qp->qp_num))
+	if (!validate_mad((const struct ib_mad_hdr *)recv->mad, qp_info, opa))
 		goto out;
 
 	mad_size = recv->mad_size;
@@ -2094,8 +2242,8 @@  static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv,
 
 	if (((struct ib_mad_hdr *)recv->mad)->mgmt_class ==
 	    IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
-		if (handle_ib_smi(port_priv, qp_info, wc, port_num, recv,
-				  response)
+		if (handle_smi(port_priv, qp_info, wc, port_num, recv,
+			       response, opa)
 		    == IB_SMI_DISCARD)
 			goto out;
 	}
@@ -2118,7 +2266,7 @@  static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv,
 						    port_priv->device,
 						    port_num,
 						    qp_info->qp->qp_num,
-						    response->mad_size);
+						    mad_size, opa);
 				goto out;
 			}
 		}
@@ -2133,10 +2281,10 @@  static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv,
 		 */
 		recv = NULL;
 	} else if ((ret & IB_MAD_RESULT_SUCCESS) &&
-		   generate_unmatched_resp(recv, response)) {
+		   generate_unmatched_resp(recv, response, &mad_size, opa)) {
 		agent_send_response((const struct ib_mad_hdr *)response->mad, &recv->grh, wc,
 				    port_priv->device, port_num,
-				    qp_info->qp->qp_num, response->mad_size);
+				    qp_info->qp->qp_num, mad_size, opa);
 	}
 
 out:
@@ -2537,10 +2685,14 @@  static void local_completions(struct work_struct *work)
 	int free_mad;
 	struct ib_wc wc;
 	struct ib_mad_send_wc mad_send_wc;
+	bool opa;
 
 	mad_agent_priv =
 		container_of(work, struct ib_mad_agent_private, local_work);
 
+	opa = rdma_cap_opa_mad(mad_agent_priv->qp_info->port_priv->device,
+			       mad_agent_priv->qp_info->port_priv->port_num);
+
 	spin_lock_irqsave(&mad_agent_priv->lock, flags);
 	while (!list_empty(&mad_agent_priv->local_list)) {
 		local = list_entry(mad_agent_priv->local_list.next,
@@ -2550,6 +2702,7 @@  static void local_completions(struct work_struct *work)
 		spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
 		free_mad = 0;
 		if (local->mad_priv) {
+			u8 base_version;
 			recv_mad_agent = local->recv_mad_agent;
 			if (!recv_mad_agent) {
 				dev_err(&mad_agent_priv->agent.device->dev,
@@ -2565,11 +2718,20 @@  static void local_completions(struct work_struct *work)
 			build_smp_wc(recv_mad_agent->agent.qp,
 				     (unsigned long) local->mad_send_wr,
 				     be16_to_cpu(IB_LID_PERMISSIVE),
-				     0, recv_mad_agent->agent.port_num, &wc);
+				     local->mad_send_wr->send_wr.wr.ud.pkey_index,
+				     recv_mad_agent->agent.port_num, &wc);
 
 			local->mad_priv->header.recv_wc.wc = &wc;
-			local->mad_priv->header.recv_wc.mad_len =
-						sizeof(struct ib_mad);
+
+			base_version = ((struct ib_mad_hdr *)(local->mad_priv->mad))->base_version;
+			if (opa && base_version == OPA_MGMT_BASE_VERSION) {
+				local->mad_priv->header.recv_wc.mad_len = local->return_wc_byte_len;
+				local->mad_priv->header.recv_wc.mad_seg_size = sizeof(struct opa_mad);
+			} else {
+				local->mad_priv->header.recv_wc.mad_len = sizeof(struct ib_mad);
+				local->mad_priv->header.recv_wc.mad_seg_size = sizeof(struct ib_mad);
+			}
+
 			INIT_LIST_HEAD(&local->mad_priv->header.recv_wc.rmpp_list);
 			list_add(&local->mad_priv->header.recv_wc.recv_buf.list,
 				 &local->mad_priv->header.recv_wc.rmpp_list);
diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h
index 4423f68e2a77..5be89f98928f 100644
--- a/drivers/infiniband/core/mad_priv.h
+++ b/drivers/infiniband/core/mad_priv.h
@@ -148,6 +148,7 @@  struct ib_mad_local_private {
 	struct ib_mad_private *mad_priv;
 	struct ib_mad_agent_private *recv_mad_agent;
 	struct ib_mad_send_wr_private *mad_send_wr;
+	size_t return_wc_byte_len;
 };
 
 struct ib_mad_mgmt_method_table {
diff --git a/drivers/infiniband/core/mad_rmpp.c b/drivers/infiniband/core/mad_rmpp.c
index f4e4fe609e95..382941b46e43 100644
--- a/drivers/infiniband/core/mad_rmpp.c
+++ b/drivers/infiniband/core/mad_rmpp.c
@@ -1,6 +1,7 @@ 
 /*
  * Copyright (c) 2005 Intel Inc. All rights reserved.
  * Copyright (c) 2005-2006 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2014 Intel Corporation.  All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -67,6 +68,7 @@  struct mad_rmpp_recv {
 	u8 mgmt_class;
 	u8 class_version;
 	u8 method;
+	u8 base_version;
 };
 
 static inline void deref_rmpp_recv(struct mad_rmpp_recv *rmpp_recv)
@@ -318,6 +320,7 @@  create_rmpp_recv(struct ib_mad_agent_private *agent,
 	rmpp_recv->mgmt_class = mad_hdr->mgmt_class;
 	rmpp_recv->class_version = mad_hdr->class_version;
 	rmpp_recv->method  = mad_hdr->method;
+	rmpp_recv->base_version  = mad_hdr->base_version;
 	return rmpp_recv;
 
 error:	kfree(rmpp_recv);
@@ -433,14 +436,23 @@  static inline int get_mad_len(struct mad_rmpp_recv *rmpp_recv)
 {
 	struct ib_rmpp_mad *rmpp_mad;
 	int hdr_size, data_size, pad;
+	bool opa = rdma_cap_opa_mad(rmpp_recv->agent->qp_info->port_priv->device,
+				    rmpp_recv->agent->qp_info->port_priv->port_num);
 
 	rmpp_mad = (struct ib_rmpp_mad *)rmpp_recv->cur_seg_buf->mad;
 
 	hdr_size = ib_get_mad_data_offset(rmpp_mad->mad_hdr.mgmt_class);
-	data_size = sizeof(struct ib_rmpp_mad) - hdr_size;
-	pad = IB_MGMT_RMPP_DATA - be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin);
-	if (pad > IB_MGMT_RMPP_DATA || pad < 0)
-		pad = 0;
+	if (opa && rmpp_recv->base_version == OPA_MGMT_BASE_VERSION) {
+		data_size = sizeof(struct opa_rmpp_mad) - hdr_size;
+		pad = OPA_MGMT_RMPP_DATA - be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin);
+		if (pad > OPA_MGMT_RMPP_DATA || pad < 0)
+			pad = 0;
+	} else {
+		data_size = sizeof(struct ib_rmpp_mad) - hdr_size;
+		pad = IB_MGMT_RMPP_DATA - be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin);
+		if (pad > IB_MGMT_RMPP_DATA || pad < 0)
+			pad = 0;
+	}
 
 	return hdr_size + rmpp_recv->seg_num * data_size - pad;
 }
diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c
index d4286712405d..35567fffaa4e 100644
--- a/drivers/infiniband/core/user_mad.c
+++ b/drivers/infiniband/core/user_mad.c
@@ -262,20 +262,23 @@  static ssize_t copy_recv_mad(struct ib_umad_file *file, char __user *buf,
 {
 	struct ib_mad_recv_buf *recv_buf;
 	int left, seg_payload, offset, max_seg_payload;
+	size_t seg_size;
 
-	/* We need enough room to copy the first (or only) MAD segment. */
 	recv_buf = &packet->recv_wc->recv_buf;
-	if ((packet->length <= sizeof (*recv_buf->mad) &&
+	seg_size = packet->recv_wc->mad_seg_size;
+
+	/* We need enough room to copy the first (or only) MAD segment. */
+	if ((packet->length <= seg_size &&
 	     count < hdr_size(file) + packet->length) ||
-	    (packet->length > sizeof (*recv_buf->mad) &&
-	     count < hdr_size(file) + sizeof (*recv_buf->mad)))
+	    (packet->length > seg_size &&
+	     count < hdr_size(file) + seg_size))
 		return -EINVAL;
 
 	if (copy_to_user(buf, &packet->mad, hdr_size(file)))
 		return -EFAULT;
 
 	buf += hdr_size(file);
-	seg_payload = min_t(int, packet->length, sizeof (*recv_buf->mad));
+	seg_payload = min_t(int, packet->length, seg_size);
 	if (copy_to_user(buf, recv_buf->mad, seg_payload))
 		return -EFAULT;
 
@@ -292,7 +295,7 @@  static ssize_t copy_recv_mad(struct ib_umad_file *file, char __user *buf,
 			return -ENOSPC;
 		}
 		offset = ib_get_mad_data_offset(recv_buf->mad->mad_hdr.mgmt_class);
-		max_seg_payload = sizeof (struct ib_mad) - offset;
+		max_seg_payload = seg_size - offset;
 
 		for (left = packet->length - seg_payload, buf += seg_payload;
 		     left; left -= seg_payload, buf += seg_payload) {
@@ -450,6 +453,7 @@  static ssize_t ib_umad_write(struct file *filp, const char __user *buf,
 	struct ib_rmpp_mad *rmpp_mad;
 	__be64 *tid;
 	int ret, data_len, hdr_len, copy_offset, rmpp_active;
+	u8 base_version;
 
 	if (count < hdr_size(file) + IB_MGMT_RMPP_HDR)
 		return -EINVAL;
@@ -516,12 +520,13 @@  static ssize_t ib_umad_write(struct file *filp, const char __user *buf,
 		rmpp_active = 0;
 	}
 
+	base_version = ((struct ib_mad_hdr *)&packet->mad.data)->base_version;
 	data_len = count - hdr_size(file) - hdr_len;
 	packet->msg = ib_create_send_mad(agent,
 					 be32_to_cpu(packet->mad.hdr.qpn),
 					 packet->mad.hdr.pkey_index, rmpp_active,
 					 hdr_len, data_len, GFP_KERNEL,
-					 IB_MGMT_BASE_VERSION);
+					 base_version);
 	if (IS_ERR(packet->msg)) {
 		ret = PTR_ERR(packet->msg);
 		goto err_ah;
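The reworked room check in copy_recv_mad() above reduces to "the user buffer
must hold the umad header plus the first (or only) segment".  A compact
equivalent, as a standalone sketch ('hdr' stands in for hdr_size(file); names
are hypothetical):

	#include <stdbool.h>
	#include <stddef.h>

	/* seg_size is 256 for IB MADs, 2048 for OPA MADs. */
	static bool umad_buf_fits(size_t count, size_t hdr, size_t pkt_len,
				  size_t seg_size)
	{
		size_t first_seg = pkt_len < seg_size ? pkt_len : seg_size;

		return count >= hdr + first_seg;
	}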
diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h
index eaa8c5dc472b..c8422d5a5a91 100644
--- a/include/rdma/ib_mad.h
+++ b/include/rdma/ib_mad.h
@@ -199,6 +199,12 @@  struct ib_rmpp_mad {
 	u8			data[IB_MGMT_RMPP_DATA];
 };
 
+struct opa_rmpp_mad {
+	struct ib_mad_hdr	mad_hdr;
+	struct ib_rmpp_hdr	rmpp_hdr;
+	u8			data[OPA_MGMT_RMPP_DATA];
+};
+
 struct ib_sa_mad {
 	struct ib_mad_hdr	mad_hdr;
 	struct ib_rmpp_hdr	rmpp_hdr;
@@ -429,6 +435,7 @@  struct ib_mad_recv_buf {
  * @recv_buf: Specifies the location of the received data buffer(s).
  * @rmpp_list: Specifies a list of RMPP reassembled received MAD buffers.
  * @mad_len: The length of the received MAD, without duplicated headers.
+ * @mad_seg_size: The size of individual MAD segments, in bytes.
  *
  * For received response, the wr_id contains a pointer to the ib_mad_send_buf
  *   for the corresponding send request.
@@ -438,6 +445,7 @@  struct ib_mad_recv_wc {
 	struct ib_mad_recv_buf	recv_buf;
 	struct list_head	rmpp_list;
 	int			mad_len;
+	size_t			mad_seg_size;
 };
 
 /**
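Consumers that walk rmpp_list and previously hard-coded sizeof(struct ib_mad)
can size their per-segment handling from the new field instead.  A hypothetical
receive-handler sketch for an RMPP-reassembled MAD (process_segment() is a
stand-in, not a real API):

	#include <rdma/ib_mad.h>

	static void process_segment(struct ib_mad *mad, size_t seg_size)
	{
		/* stand-in for real per-segment work */
	}

	/* Sized by mad_seg_size rather than a hard-coded 256-byte IB MAD,
	 * so OPA's 2048-byte segments are handled by the same loop. */
	static void my_recv_handler(struct ib_mad_agent *agent,
				    struct ib_mad_recv_wc *mad_recv_wc)
	{
		struct ib_mad_recv_buf *seg;

		list_for_each_entry(seg, &mad_recv_wc->rmpp_list, list)
			process_segment(seg->mad, mad_recv_wc->mad_seg_size);

		ib_free_recv_mad(mad_recv_wc);
	}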