diff mbox

[v4,for-next,04/12] IB/ipoib: Return IPoIB devices matching connection parameters

Message ID 1431841868-28063-5-git-send-email-haggaie@mellanox.com (mailing list archive)
State Changes Requested
Headers show

Commit Message

Haggai Eran May 17, 2015, 5:51 a.m. UTC
From: Guy Shapiro <guysh@mellanox.com>

Implement the get_net_device_by_port_pkey_ip callback that returns network
device to ib_core according to connection parameters. Check the ipoib
device and iterate over all child devices to look for a match.

For each ipoib device we iterate through all upper devices when searching
for a matching IP, in order to support bonding.

Signed-off-by: Guy Shapiro <guysh@mellanox.com>
Signed-off-by: Haggai Eran <haggaie@mellanox.com>
Signed-off-by: Yotam Kenneth <yotamke@mellanox.com>
Signed-off-by: Shachar Raindel <raindel@mellanox.com>
---
 drivers/infiniband/ulp/ipoib/ipoib_main.c | 139 +++++++++++++++++++++++++++++-
 1 file changed, 138 insertions(+), 1 deletion(-)

Comments

Jason Gunthorpe May 19, 2015, 6:28 p.m. UTC | #1
On Sun, May 17, 2015 at 08:51:00AM +0300, Haggai Eran wrote:

> +#if IS_ENABLED(CONFIG_IPV6)
> +	struct sockaddr_in6 *addr_in6 = (struct sockaddr_in6 *)addr;
> +#endif
> +	__be32 ret_addr;
> +
> +	switch (addr->sa_family) {
> +	case AF_INET:
> +		in_dev = in_dev_get(dev);
> +		if (!in_dev)
> +			return false;
> +
> +		ret_addr = inet_confirm_addr(net, in_dev, 0,
> +					     addr_in->sin_addr.s_addr,
> +					     RT_SCOPE_HOST);
> +		in_dev_put(in_dev);
> +		if (ret_addr)
> +			return true;
> +
> +		break;
> +#if IS_ENABLED(CONFIG_IPV6)
> +	case AF_INET6:
> +		if (ipv6_chk_addr(net, &addr_in6->sin6_addr, dev, 1))
> +			return true;
> +
> +		break;
> +#endif

Can you use

  if (IS_ENABLED(CONFIG_IPV6))

At the call site instead of the #if guards?

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe May 19, 2015, 11:55 p.m. UTC | #2
On Sun, May 17, 2015 at 08:51:00AM +0300, Haggai Eran wrote:
> From: Guy Shapiro <guysh@mellanox.com>
> 
> Implement the get_net_device_by_port_pkey_ip callback that returns network
> device to ib_core according to connection parameters. Check the ipoib
> device and iterate over all child devices to look for a match.

Can you give a run down on how to actually set this up? Like what
shell command do you execute?

What are the child devices in the netnamespace?

> For each ipoib device we iterate through all upper devices when searching
> for a matching IP, in order to support bonding.

Checking an IP address in a packet against a device without consulting
the routing table is a big red flag for me. Can you elaborate?

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Haggai Eran May 20, 2015, 3:17 p.m. UTC | #3
On 19/05/2015 21:28, Jason Gunthorpe wrote:
> On Sun, May 17, 2015 at 08:51:00AM +0300, Haggai Eran wrote:
> 
>> +#if IS_ENABLED(CONFIG_IPV6)
>> +	struct sockaddr_in6 *addr_in6 = (struct sockaddr_in6 *)addr;
>> +#endif
>> +	__be32 ret_addr;
>> +
>> +	switch (addr->sa_family) {
>> +	case AF_INET:
>> +		in_dev = in_dev_get(dev);
>> +		if (!in_dev)
>> +			return false;
>> +
>> +		ret_addr = inet_confirm_addr(net, in_dev, 0,
>> +					     addr_in->sin_addr.s_addr,
>> +					     RT_SCOPE_HOST);
>> +		in_dev_put(in_dev);
>> +		if (ret_addr)
>> +			return true;
>> +
>> +		break;
>> +#if IS_ENABLED(CONFIG_IPV6)
>> +	case AF_INET6:
>> +		if (ipv6_chk_addr(net, &addr_in6->sin6_addr, dev, 1))
>> +			return true;
>> +
>> +		break;
>> +#endif
> 
> Can you use
> 
>   if (IS_ENABLED(CONFIG_IPV6))
> 
> At the call site instead of the #if guards?

Sure, I'll do that in the next revision of the patch-set.

Haggai

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Haggai Eran May 21, 2015, 5:33 a.m. UTC | #4
On 20/05/2015 02:55, Jason Gunthorpe wrote:
> On Sun, May 17, 2015 at 08:51:00AM +0300, Haggai Eran wrote:
>> > From: Guy Shapiro <guysh@mellanox.com>
>> > 
>> > Implement the get_net_device_by_port_pkey_ip callback that returns network
>> > device to ib_core according to connection parameters. Check the ipoib
>> > device and iterate over all child devices to look for a match.
> Can you give a run down on how to actually set this up? Like what
> shell command do you execute?

Sure.

There are two methods to create new child interface for IPoIB.
For a specific P_Key, write the desired P_Key to the create_child sysfs
file:
# echo 0x8000 > /sys/class/net/ib0/create_child
This creates a new interface ib0.8000 operating with P_Key 0x8000.

To create a new child interface on the default P_Key, its possible to
use iproute:
# ip link add link ib0 name ib0.1 type ipoib

In order to create a new network namespace:
# ip netns add ns1

Then, you can assign the new netdev to the namespace:
# ip link set ib0.1 netns ns1

You can then set an IP address in the network namespace, and try some
RDMA CM applications:
# ip netns exec ns1 ip addr add dev ib0.1 192.168.0.1/24
# ip netns exec ns1 rdma_server

Regards,
Haggai
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Haggai Eran May 21, 2015, 5:48 a.m. UTC | #5
On 20/05/2015 02:55, Jason Gunthorpe wrote:
> On Sun, May 17, 2015 at 08:51:00AM +0300, Haggai Eran wrote:
>> For each ipoib device we iterate through all upper devices when searching
>> for a matching IP, in order to support bonding.
> 
> Checking an IP address in a packet against a device without consulting
> the routing table is a big red flag for me. Can you elaborate?
We want to match the incoming CM request to a specific net_dev, and use
the network namespace of that net_dev to find the right rdma_cm_id.
Because there can be multiple network namespaces, there would also be
multiple routing tables, so I'm not sure how we could use them. The
solution we used is to walk all the upper net_devs of all the IPoIB
child devices of a given IB device and port, and find one that matches
the request's IB device, port number, P_Key and IP address.

Regards,
Haggai
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Or Gerlitz May 21, 2015, 5:48 a.m. UTC | #6
On Thu, May 21, 2015 at 8:33 AM, Haggai Eran <haggaie@mellanox.com> wrote:
> On 20/05/2015 02:55, Jason Gunthorpe wrote:
>> On Sun, May 17, 2015 at 08:51:00AM +0300, Haggai Eran wrote:
>>> > From: Guy Shapiro <guysh@mellanox.com>
>>> >
>>> > Implement the get_net_device_by_port_pkey_ip callback that returns network
>>> > device to ib_core according to connection parameters. Check the ipoib
>>> > device and iterate over all child devices to look for a match.
>> Can you give a run down on how to actually set this up? Like what
>> shell command do you execute?
>
> Sure.
>
> There are two methods to create new child interface for IPoIB.
> For a specific P_Key, write the desired P_Key to the create_child sysfs
> file:
> # echo 0x8000 > /sys/class/net/ib0/create_child
> This creates a new interface ib0.8000 operating with P_Key 0x8000.

0x8000 is practically zero (bits 15-0), right? not sure this is a valid pkey.


> To create a new child interface on the default P_Key, its possible to
> use iproute:
> # ip link add link ib0 name ib0.1 type ipoib

you can use non default pkeys as well here.

$ ip link add link ib0 name ib0.1 type ipoib pkey 0x8001
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Haggai Eran May 21, 2015, 6:33 a.m. UTC | #7
On 21/05/2015 08:48, Or Gerlitz wrote:
> On Thu, May 21, 2015 at 8:33 AM, Haggai Eran <haggaie@mellanox.com> wrote:
>> On 20/05/2015 02:55, Jason Gunthorpe wrote:
>>> On Sun, May 17, 2015 at 08:51:00AM +0300, Haggai Eran wrote:
>>>>> From: Guy Shapiro <guysh@mellanox.com>
>>>>>
>>>>> Implement the get_net_device_by_port_pkey_ip callback that returns network
>>>>> device to ib_core according to connection parameters. Check the ipoib
>>>>> device and iterate over all child devices to look for a match.
>>> Can you give a run down on how to actually set this up? Like what
>>> shell command do you execute?
>>
>> Sure.
>>
>> There are two methods to create new child interface for IPoIB.
>> For a specific P_Key, write the desired P_Key to the create_child sysfs
>> file:
>> # echo 0x8000 > /sys/class/net/ib0/create_child
>> This creates a new interface ib0.8000 operating with P_Key 0x8000.
> 
> 0x8000 is practically zero (bits 15-0), right? not sure this is a valid pkey.
Right, bad example :)

> 
> 
>> To create a new child interface on the default P_Key, its possible to
>> use iproute:
>> # ip link add link ib0 name ib0.1 type ipoib
> 
> you can use non default pkeys as well here.
> 
> $ ip link add link ib0 name ib0.1 type ipoib pkey 0x8001
> 

Right. I think when we started development of the namespaces patches
these child interfaces (rtnetlink with pkey) didn't work for us, but I
checked now on an updated kernel and they do.

Haggai
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Or Gerlitz May 21, 2015, 10:31 a.m. UTC | #8
On Thu, May 21, 2015 at 9:33 AM, Haggai Eran <haggaie@mellanox.com> wrote:

>> you can use non default pkeys as well here.
>> $ ip link add link ib0 name ib0.1 type ipoib pkey 0x8001

> Right. I think when we started development of the namespaces patches
> these child interfaces (rtnetlink with pkey) didn't work for us, but I
> checked now on an updated kernel and they do.

The last change to drivers/infiniband/ulp/ipoib/ipoib_netlink.c took
place on 3.12, well before you started... could be that you provided
wrong command line and such, I haven't seen any bug report nor fix
from you group to that area of the kernel.

Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe May 21, 2015, 5:43 p.m. UTC | #9
On Thu, May 21, 2015 at 08:33:53AM +0300, Haggai Eran wrote:

> To create a new child interface on the default P_Key, its possible to
> use iproute:
> # ip link add link ib0 name ib0.1 type ipoib

Uh..

A key invariant of the IP stack is that is it possible to uniquely
identify the ingress device.

So the above scheme is fine for IPoIB, because it uses the interface
unique QPN to uniquely ID the netdevice: (Device,Port,Pkey,QPN)
is the unique ID tuple. The world is happy.

But RDMA CM doesn't provide the QPN. So when RDMA CM searches the
netdevs for an address it cannot *uniquely* map to a IPoIB interface.

This is bad, and *completely wrong*, but today, nobody is going to
really notice or care. The cases where it does something you don't
want are not very significant.

But with containers.. Think this through for a minute: 'In some cases
the RDMA CM selecs the wrong child' - that goes from being a minor
annoyance to a violation of containment! Worse the criteria for
'selects the wrong child' can be triggered by the contained users. Eg
the contained user adds a IP to their child that duplicates another
container. Now we've lost control.

The very idea of ib_get_net_dev_by_port_pkey_ip is broken.

So, I don't know what to say here.. Ideas?

1) Forbid creating more than one pkey per ipoib interface?
2) Somehow extend the RDMA CM to send the IPoIB qpn too?
3) ??

Right now the only case that comes to mind is duplicating IPs, that is
already going to cause an ARP collision, so maybe having the RDMA CM
randomly select an IP is not the end of the world... But with
containers and security, who knows? I'm not confident I've
exhaustively thought of all possibilities here.

----------

Anyhow.. looking again through this series and the existing code, the
flow is wrong, and really needs to be changed before this starts to
make sense to anyone, and is no doubt part of how we got here..

When a REQ arrives RDMA CM needs to run down these steps (this is identical
to what ip_input.c does)

 1) Locate the netdev associated with the ingress of the packet,
    in a sane world this is done by only checking the
    unique (Device,Port,Pkey,QPN) tuple.
    If we keep our brokeness, we'd do this based on
    (Device,Port,Pkey,IP) - if there are IP collisions then randomly
    select a netdev (similar to how ARP collision is handled).
 2) Then we do the ip_route_input_noref step, this will set skb_dst to
    the netdev that will handle the packet, or tell us to drop it.
    This is not always the same as the netdev that accepts the
    packet!!!

    NOTE: This route step is missing today, it does critical things
    like check that the node is actually listening on the dest IP!

 3) Now we can use skb_dst to iterate over the set of all RDMA CM listens:
     1) Bound to the skb_dst netdev
     2) Unbound in the same namespace as skb_dst netdev
    The first to match the dst IP + port is the listen that will accept the
    connection, now we go into the cma_new_conn_id path, and we don't
    need rdma_translate_ip because we already have the handling netdev.

The backwards operation of the current code is part of why this is all
so strange looking, and I think is strongly connected to the private
data compare issues Sean is talking about. It is very much the wrong
flow to look for the RDMA CM listen first, and then try to work
backwards to the netdev.

The above 3 steps hard require that the ib_cm and rdma_cm
maintain different listening lists, because we need the 2nd search in
#3. So this gets the ib_cm completely out of looking at the private
data. [And now we can think carefully about the best way to refcount
the listens in RDMA CM]

Once the above is cleaned up dropping in namespaces should just
happen naturally.

So.. I'm going to suggest you make a cleanup series to fix this:
 - Introduce the ib_get_net_dev_by_port_pkey_ip and document the
   breakage it represents
 - Rework rdma_cm to do steps 1,2,3 above, using
   ib_get_net_dev_by_port_pkey_ip to drive #1, your patch set is
   already doing about 50% of this change.
 - Fix the collateral damage

As I said to Matan, clean up series like this should not introduce
major functional changes, so stack the few remaining net namespace
things in another series after it.

** The same comments apply to RoCE too, but for RoCE step #1 works
   properly based on the (Device,Port,VLAN) unique tuple
   Ditto for iWarp **

I think this also moves to address Sean's concern about generality, at
least for listen. All three protocols will run down the same common
code to locate the netdev and find the correct RDMA CM listen. The
only difference is the 'ib_get_net_dev_by_port_pkey_ip' call varies.

.. I also wouldn't mind seeing the giant cma.c split, if it makes
   sense to have a cma_listen.c, for instance, wouldn't that be nice?

Sorry Haggi, this is a big change to your patchset :(

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Haggai Eran May 28, 2015, 11:51 a.m. UTC | #10
On 21/05/2015 20:43, Jason Gunthorpe wrote:
> On Thu, May 21, 2015 at 08:33:53AM +0300, Haggai Eran wrote:
> 
>> To create a new child interface on the default P_Key, its possible to
>> use iproute:
>> # ip link add link ib0 name ib0.1 type ipoib
> 
> Uh..
> 
> A key invariant of the IP stack is that is it possible to uniquely
> identify the ingress device.
> 
> So the above scheme is fine for IPoIB, because it uses the interface
> unique QPN to uniquely ID the netdevice: (Device,Port,Pkey,QPN)
> is the unique ID tuple. The world is happy.
> 
> But RDMA CM doesn't provide the QPN. So when RDMA CM searches the
> netdevs for an address it cannot *uniquely* map to a IPoIB interface.
This is technically true, but if someone configures their system that
way, they will also have ARP conflicts in addition. I don't see why we
should support such a configuration.

> This is bad, and *completely wrong*, but today, nobody is going to
> really notice or care. The cases where it does something you don't
> want are not very significant.
> 
> But with containers.. Think this through for a minute: 'In some cases
> the RDMA CM selecs the wrong child' - that goes from being a minor
> annoyance to a violation of containment! Worse the criteria for
> 'selects the wrong child' can be triggered by the contained users. Eg
> the contained user adds a IP to their child that duplicates another
> container. Now we've lost control.
No, this is exactly what would happen in the Ethernet world. If you
create a conflicting configuration between two containers on the same
Ethernet segment, then one of them could get the traffic that was
intended for the other.
I don't see this as a violation of containment, because these containers
are assigned net_devs that communicate on the same segment, so they
behave just as two different hosts would, with or without conflicts.

> 
> The very idea of ib_get_net_dev_by_port_pkey_ip is broken.
> 
> So, I don't know what to say here.. Ideas?
> 
> 1) Forbid creating more than one pkey per ipoib interface?
You probably mean more than one IP on the same pkey. The pkey is
actually part of the request, so its not an issue.

> 2) Somehow extend the RDMA CM to send the IPoIB qpn too?
> 3) ??
We can do something crazy in the future like moving all CM requests to
run over UDP as in RoCEv2. But both adding the QPN or moving to UDP
require a wire protocol change and won't be compatible with today's systems.

> 
> Right now the only case that comes to mind is duplicating IPs, that is
> already going to cause an ARP collision, so maybe having the RDMA CM
> randomly select an IP is not the end of the world... But with
> containers and security, who knows? I'm not confident I've
> exhaustively thought of all possibilities here.
> 
> ----------
> 
> Anyhow.. looking again through this series and the existing code, the
> flow is wrong, and really needs to be changed before this starts to
> make sense to anyone, and is no doubt part of how we got here..
> 
> When a REQ arrives RDMA CM needs to run down these steps (this is identical
> to what ip_input.c does)
> 
>  1) Locate the netdev associated with the ingress of the packet,
>     in a sane world this is done by only checking the
>     unique (Device,Port,Pkey,QPN) tuple.
>     If we keep our brokeness, we'd do this based on
>     (Device,Port,Pkey,IP) - if there are IP collisions then randomly
>     select a netdev (similar to how ARP collision is handled).
That's what ib_get_net_dev_by_port_pkey_ip intends to do.

>  2) Then we do the ip_route_input_noref step, this will set skb_dst to
>     the netdev that will handle the packet, or tell us to drop it.
>     This is not always the same as the netdev that accepts the
>     packet!!!
> 
>     NOTE: This route step is missing today, it does critical things
>     like check that the node is actually listening on the dest IP!

Isn't this a little over-engineered? If all you want is to make sure the
net dev is up, can't we use something like netif_running()?

Also, this sounds like a major change in behavior even for applications
that do not use containers. I think today RDMA CM will accept
connections even if the ipoib interface is down.

> 
>  3) Now we can use skb_dst to iterate over the set of all RDMA CM listens:
>      1) Bound to the skb_dst netdev
>      2) Unbound in the same namespace as skb_dst netdev
>     The first to match the dst IP + port is the listen that will accept the
>     connection, now we go into the cma_new_conn_id path, and we don't
>     need rdma_translate_ip because we already have the handling netdev.
You shouldn't be able to bind one listener to a netdev in a namespace
and also have a different listener listening for any netdev on that same
namespace. (That is what cma_check_port verifies, right?) So when
looking for a listener in a namespace there should be only one match.

It is true we no longer need the rdma_translate_ip call.

> 
> The backwards operation of the current code is part of why this is all
> so strange looking, and I think is strongly connected to the private
> data compare issues Sean is talking about. It is very much the wrong
> flow to look for the RDMA CM listen first, and then try to work
> backwards to the netdev.
That's not what the code does. It first finds the netdev and decides on
the namespace based on that. It then finds the RDMA CM listener in that
namespace.

> 
> The above 3 steps hard require that the ib_cm and rdma_cm
> maintain different listening lists, because we need the 2nd search in
> #3. So this gets the ib_cm completely out of looking at the private
> data. [And now we can think carefully about the best way to refcount
> the listens in RDMA CM]

The current patch-set already makes sure that rdma_cm doesn't rely on
ib_cm's private_data matching.

> 
> Once the above is cleaned up dropping in namespaces should just
> happen naturally.
> 
> So.. I'm going to suggest you make a cleanup series to fix this:
>  - Introduce the ib_get_net_dev_by_port_pkey_ip and document the
>    breakage it represents
>  - Rework rdma_cm to do steps 1,2,3 above, using
>    ib_get_net_dev_by_port_pkey_ip to drive #1, your patch set is
>    already doing about 50% of this change.
>  - Fix the collateral damage
> 
> As I said to Matan, clean up series like this should not introduce
> major functional changes, so stack the few remaining net namespace
> things in another series after it.

I think I can refactor the series this way, but I don't think we need
step 2. It seems like an overly complicated way of checking whether a
netdev is up or not. It doesn't seem to provide any new information over
what ib_get_net_dev_by_port_pkey_ip does.

As for fixing the collateral damage, I assume you mean cleaning the
interfaces of ib_cm now that private data matching is not used? I don't
see why this should be part of the series.

> 
> ** The same comments apply to RoCE too, but for RoCE step #1 works
>    properly based on the (Device,Port,VLAN) unique tuple

For RoCE, you could still have multiple macvlan interfaces using the
same VLAN, with different IP address. So the unique tuple is
(Device,Port,VLAN,IP). With Matan adding a netdev to each GID entry in
the RoCE GID table, I think it would be simpler to find the netdev in RoCE.

>    Ditto for iWarp **
Yes, if iWarp CM can provide the netdev on which a request arrives, I
think it would be simple to use it with this design.

> 
> I think this also moves to address Sean's concern about generality, at
> least for listen. All three protocols will run down the same common
> code to locate the netdev and find the correct RDMA CM listen. The
> only difference is the 'ib_get_net_dev_by_port_pkey_ip' call varies.

Right.

> 
> .. I also wouldn't mind seeing the giant cma.c split, if it makes
>    sense to have a cma_listen.c, for instance, wouldn't that be nice?

It would be nice, but this patchset is becoming large already, and I
don't want to add unnecessary noise.

Regards,
Haggai
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe May 28, 2015, 3:45 p.m. UTC | #11
On Thu, May 28, 2015 at 02:51:51PM +0300, Haggai Eran wrote:

> > But RDMA CM doesn't provide the QPN. So when RDMA CM searches the
> > netdevs for an address it cannot *uniquely* map to a IPoIB interface.

> This is technically true, but if someone configures their system that
> way, they will also have ARP conflicts in addition. I don't see why we
> should support such a configuration.

Based on the dicussion in the other thread about this, the feeling is
we should not support same-GUID, same-PKey child interfaces at all. It
breaks too much stuff (DHCP,NetworkManager,IPv6,RDMA-CM)

> No, this is exactly what would happen in the Ethernet world. If you
> create a conflicting configuration between two containers on the same
> Ethernet segment, then one of them could get the traffic that was
> intended for the other.

It is not exactly the same. In Ethernet there is an ARP collision at
L3, but the traffic is properly addressed at L2 and unambiguously
directed into only one container. There are ways to deal with ARP
collisions, but those are only effective if the full LLADDR is
consistently used for routing to containers.

With RDMA-CM it is an L3 collsion, so our anti-ARP collision stuff
doesn't help.

Like I said, I don't from a security perspective what to make of this,
but it isn't exactly the same of ethernet.

> >  1) Locate the netdev associated with the ingress of the packet,
> >     in a sane world this is done by only checking the
> >     unique (Device,Port,Pkey,QPN) tuple.
> >     If we keep our brokeness, we'd do this based on
> >     (Device,Port,Pkey,IP) - if there are IP collisions then randomly
> >     select a netdev (similar to how ARP collision is handled).
> That's what ib_get_net_dev_by_port_pkey_ip intends to do.

Right, almost there.

ib_get_net_dev_by_port_pkey_ip needs to work in a very specific
way: If there is only one netdev with the (Device,Port,GUID,Pkey)
match then that is the answer.
(guid comes from the CM_REQ, if we add alias GUID support to IPoIB as
 Or suggested then it is needed)

The IP search should *only* be done if there are two children with
identical (Pkey,GUID), and as above, perhaps we should de-support that.

> >  2) Then we do the ip_route_input_noref step, this will set skb_dst to
> >     the netdev that will handle the packet, or tell us to drop it.
> >     This is not always the same as the netdev that accepts the
> >     packet!!!
> > 
> >     NOTE: This route step is missing today, it does critical things
> >     like check that the node is actually listening on the dest IP!
> 
> Isn't this a little over-engineered? If all you want is to make sure the
> net dev is up, can't we use something like netif_running()?

The routing check is not to see if the netdev is up, it is doing all
sorts of subtle userspace visible things. Like checking there is no
blackhole route configured for the packet, checking that the IP is
present in the system, netdevs are up, etc etc.

We don't get to pick and choose what netdev behaviors we implement
when doing this kind of stuff. Copy the netstack, don't make stuff up.

Understand the two layer separation, first with pick a netdev without
looking at L3 info, then we feed the netdev and L3 info into routing
to complete the process.

> Also, this sounds like a major change in behavior even for applications
> that do not use containers. I think today RDMA CM will accept
> connections even if the ipoib interface is down.

Yes, it is a change in behavior, things move closer to alignment with
how netdev works. We did a similar change to the output side years ago
as well. I guess the input side was missed.

Unless I'm missing something this isn't 'major', these are corner case
conditions/bugs that nobody sane should rely on.

> >  3) Now we can use skb_dst to iterate over the set of all RDMA CM listens:
> >      1) Bound to the skb_dst netdev
> >      2) Unbound in the same namespace as skb_dst netdev
> >     The first to match the dst IP + port is the listen that will accept the
> >     connection, now we go into the cma_new_conn_id path, and we don't
> >     need rdma_translate_ip because we already have the handling netdev.
> You shouldn't be able to bind one listener to a netdev in a namespace
> and also have a different listener listening for any netdev on that same
> namespace. (That is what cma_check_port verifies, right?) So when
> looking for a listener in a namespace there should be only one match.

You know, I don't remember off hand the exact semantics of sockets,
whatever sockets does :)

> > The backwards operation of the current code is part of why this is all
> > so strange looking, and I think is strongly connected to the private
> > data compare issues Sean is talking about. It is very much the wrong
> > flow to look for the RDMA CM listen first, and then try to work
> > backwards to the netdev.

> That's not what the code does. It first finds the netdev and decides on
> the namespace based on that. It then finds the RDMA CM listener in that
> namespace.

I was talking about the current code, but even with the patch set, the
behavior is still backwards:

 1) Find the netdev
 2) Get the namespace, throw away the netdev
 3) Find a CM_ID without a netdev
 4) Find a netdev from the CM_ID

Instead:
 1) Find the netdev
 2) Find a CM_ID compatible with that netdev

> I think I can refactor the series this way, but I don't think we need
> step 2. It seems like an overly complicated way of checking whether a
> netdev is up or not. It doesn't seem to provide any new information over
> what ib_get_net_dev_by_port_pkey_ip does.

If it is hard to do then leave a FIXME comment and go on, if it is
easy, it is the right step.

It does provide more information because
ib_get_net_dev_by_port_pkey_ip should not check the IP.

> For RoCE, you could still have multiple macvlan interfaces using the
> same VLAN, with different IP address. So the unique tuple is
> (Device,Port,VLAN,IP). With Matan adding a netdev to each GID entry in
> the RoCE GID table, I think it would be simpler to find the netdev in RoCE.

That isn't how I understand macvlan, the tuple will be
(Device,Port,MAC,VLAN)

You should never search for an IP, that is not how netdev works.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 3421e42870c3..75def39a4271 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -48,6 +48,9 @@ 
 
 #include <linux/jhash.h>
 #include <net/arp.h>
+#include <net/addrconf.h>
+#include <linux/inetdevice.h>
+#include <rdma/ib_cache.h>
 
 #define DRV_VERSION "1.0.0"
 
@@ -91,11 +94,15 @@  struct ib_sa_client ipoib_sa_client;
 static void ipoib_add_one(struct ib_device *device);
 static void ipoib_remove_one(struct ib_device *device);
 static void ipoib_neigh_reclaim(struct rcu_head *rp);
+static struct net_device *ipoib_get_net_device_by_port_pkey_ip(
+		struct ib_device *dev, u8 port, u16 pkey,
+		struct sockaddr *addr);
 
 static struct ib_client ipoib_client = {
 	.name   = "ipoib",
 	.add    = ipoib_add_one,
-	.remove = ipoib_remove_one
+	.remove = ipoib_remove_one,
+	.get_net_device_by_port_pkey_ip = ipoib_get_net_device_by_port_pkey_ip,
 };
 
 int ipoib_open(struct net_device *dev)
@@ -222,6 +229,136 @@  static int ipoib_change_mtu(struct net_device *dev, int new_mtu)
 	return 0;
 }
 
+/* Called with an RCU read lock taken */
+static bool ipoib_is_dev_match_addr(struct sockaddr *addr,
+				    struct net_device *dev)
+{
+	struct net *net = dev_net(dev);
+	struct in_device *in_dev;
+	struct sockaddr_in *addr_in = (struct sockaddr_in *)addr;
+#if IS_ENABLED(CONFIG_IPV6)
+	struct sockaddr_in6 *addr_in6 = (struct sockaddr_in6 *)addr;
+#endif
+	__be32 ret_addr;
+
+	switch (addr->sa_family) {
+	case AF_INET:
+		in_dev = in_dev_get(dev);
+		if (!in_dev)
+			return false;
+
+		ret_addr = inet_confirm_addr(net, in_dev, 0,
+					     addr_in->sin_addr.s_addr,
+					     RT_SCOPE_HOST);
+		in_dev_put(in_dev);
+		if (ret_addr)
+			return true;
+
+		break;
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6:
+		if (ipv6_chk_addr(net, &addr_in6->sin6_addr, dev, 1))
+			return true;
+
+		break;
+#endif
+	}
+	return false;
+}
+
+/**
+ * Find a net_device matching the given address, which is an upper device of
+ * the given net_device.
+ * @addr: IP address to look for.
+ * @dev: base IPoIB net_device
+ *
+ * If found, returns the net_device with a reference held. Otherwise return
+ * NULL.
+ */
+static struct net_device *ipoib_get_net_dev_match_addr(struct sockaddr *addr,
+						       struct net_device *dev)
+{
+	struct net_device *upper,
+			  *result = NULL;
+	struct list_head *iter;
+
+	rcu_read_lock();
+	if (ipoib_is_dev_match_addr(addr, dev)) {
+		dev_hold(dev);
+		result = dev;
+		goto out;
+	}
+
+	netdev_for_each_all_upper_dev_rcu(dev, upper, iter) {
+		if (ipoib_is_dev_match_addr(addr, upper)) {
+			dev_hold(upper);
+			result = upper;
+			break;
+		}
+	}
+out:
+	rcu_read_unlock();
+	return result;
+}
+
+/* returns an IPoIB netdev on top a given ipoib device matching a pkey_index
+ * and address, if one exists. */
+static struct net_device *ipoib_match_pkey_addr(struct ipoib_dev_priv *priv,
+						u16 pkey_index,
+						struct sockaddr *addr)
+{
+	struct ipoib_dev_priv *child_priv;
+	struct net_device *net_dev = NULL;
+
+	if (priv->pkey_index == pkey_index) {
+		net_dev = ipoib_get_net_dev_match_addr(addr, priv->dev);
+		if (net_dev)
+			return net_dev;
+	}
+
+	/* Check child interfaces */
+	down_read(&priv->vlan_rwsem);
+	list_for_each_entry(child_priv, &priv->child_intfs, list) {
+		net_dev = ipoib_match_pkey_addr(child_priv, pkey_index, addr);
+		if (net_dev)
+			break;
+	}
+	up_read(&priv->vlan_rwsem);
+
+	return net_dev;
+}
+
+static struct net_device *ipoib_get_net_device_by_port_pkey_ip(
+		struct ib_device *dev, u8 port, u16 pkey, struct sockaddr *addr)
+{
+	struct ipoib_dev_priv *priv;
+	struct list_head *dev_list;
+	struct net_device *net_dev;
+	u16 pkey_index;
+	int ret;
+
+	ret = ib_find_cached_pkey(dev, port, pkey, &pkey_index);
+	if (ret)
+		return NULL;
+
+	if (!rdma_protocol_ib(dev, port))
+		return NULL;
+
+	dev_list = ib_get_client_data(dev, &ipoib_client);
+	if (!dev_list)
+		return NULL;
+
+	list_for_each_entry(priv, dev_list, list) {
+		if (priv->port != port)
+			continue;
+
+		net_dev = ipoib_match_pkey_addr(priv, pkey_index, addr);
+		if (net_dev)
+			return net_dev;
+	}
+	return NULL;
+}
+
 int ipoib_set_mode(struct net_device *dev, const char *buf)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);