diff mbox series

[net-next,v4,09/18] net/smc: introduce SMC-D loopback device

Message ID 1695568613-125057-10-git-send-email-guwen@linux.alibaba.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series net/smc: implement virtual ISM extension and loopback-ism | expand

Checks

Context Check Description
netdev/series_format fail Series longer than 15 patches (and no cover letter)
netdev/tree_selection success Clearly marked for net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1342 this patch: 1342
netdev/cc_maintainers success CCed 12 of 12 maintainers
netdev/build_clang success Errors and warnings before: 1363 this patch: 1363
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1365 this patch: 1365
netdev/checkpatch warning WARNING: added, moved or deleted file(s), does MAINTAINERS need updating?
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Wen Gu Sept. 24, 2023, 3:16 p.m. UTC
This patch introduces a kind of loopback device for SMC-D. The device
is created when SMC module is loaded and destroyed when the SMC module
is unloaded. The loopback device is a kernel device used only by the
SMC module and is not restricted by net namespace, so it can be used
for local inter-process or inter-container communication.

Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
---
 net/smc/Kconfig        |  13 ++++
 net/smc/Makefile       |   2 +-
 net/smc/af_smc.c       |  12 +++-
 net/smc/smc_loopback.c | 165 +++++++++++++++++++++++++++++++++++++++++++++++++
 net/smc/smc_loopback.h |  33 ++++++++++
 5 files changed, 223 insertions(+), 2 deletions(-)
 create mode 100644 net/smc/smc_loopback.c
 create mode 100644 net/smc/smc_loopback.h

Comments

Alexandra Winter Sept. 25, 2023, 11:50 a.m. UTC | #1
On 24.09.23 17:16, Wen Gu wrote:
> This patch introduces a kind of loopback device for SMC-D. The device
> is created when SMC module is loaded and destroyed when the SMC module
> is unloaded. The loopback device is a kernel device used only by the
> SMC module and is not restricted by net namespace, so it can be used
> for local inter-process or inter-container communication.
> 
> Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
> ---
>  net/smc/Kconfig        |  13 ++++
>  net/smc/Makefile       |   2 +-
>  net/smc/af_smc.c       |  12 +++-
>  net/smc/smc_loopback.c | 165 +++++++++++++++++++++++++++++++++++++++++++++++++
>  net/smc/smc_loopback.h |  33 ++++++++++
>  5 files changed, 223 insertions(+), 2 deletions(-)
>  create mode 100644 net/smc/smc_loopback.c
>  create mode 100644 net/smc/smc_loopback.h


Hello Wen Gu,

thank you for adding the Kconfig, so the distributions can decide when to offer this feature.

I propose you add some kind of runtime switch as well. Not every user who loads the SMC module
may want to exploit smcd-loopback. Especially in native environements without containers.

If no RoCE interfaces or no ISM interfaces exist, the respective handling is skipped in SMC.
If loopback is always created unconditionally, there is no way to opt-out.
Alexandra Winter Sept. 25, 2023, 1:29 p.m. UTC | #2
On 25.09.23 13:50, Alexandra Winter wrote:
> 
> 
> On 24.09.23 17:16, Wen Gu wrote:
>> This patch introduces a kind of loopback device for SMC-D. The device
>> is created when SMC module is loaded and destroyed when the SMC module
>> is unloaded. The loopback device is a kernel device used only by the
>> SMC module and is not restricted by net namespace, so it can be used
>> for local inter-process or inter-container communication.
>>
>> Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
>> ---
>>  net/smc/Kconfig        |  13 ++++
>>  net/smc/Makefile       |   2 +-
>>  net/smc/af_smc.c       |  12 +++-
>>  net/smc/smc_loopback.c | 165 +++++++++++++++++++++++++++++++++++++++++++++++++
>>  net/smc/smc_loopback.h |  33 ++++++++++
>>  5 files changed, 223 insertions(+), 2 deletions(-)
>>  create mode 100644 net/smc/smc_loopback.c
>>  create mode 100644 net/smc/smc_loopback.h
> 
> 
> Hello Wen Gu,
> 
> thank you for adding the Kconfig, so the distributions can decide when to offer this feature.
> 
> I propose you add some kind of runtime switch as well. Not every user who loads the SMC module
> may want to exploit smcd-loopback. Especially in native environements without containers.
> 
> If no RoCE interfaces or no ISM interfaces exist, the respective handling is skipped in SMC.
> If loopback is always created unconditionally, there is no way to opt-out.
> 

Another thing came to my mind:

When loopback is created and registered when the SMC module is loaded, it will implicitly always have highest priority, right?
That should be stated somewhere.
Also, if you create a runtime switch this will change, so then you need to decide about priority of loopback vs ISM device (and other future smcd-devices).
Wen Gu Sept. 25, 2023, 1:57 p.m. UTC | #3
On 2023/9/25 19:50, Alexandra Winter wrote:
> 
> 
> On 24.09.23 17:16, Wen Gu wrote:
>> This patch introduces a kind of loopback device for SMC-D. The device
>> is created when SMC module is loaded and destroyed when the SMC module
>> is unloaded. The loopback device is a kernel device used only by the
>> SMC module and is not restricted by net namespace, so it can be used
>> for local inter-process or inter-container communication.
>>
>> Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
>> ---
>>   net/smc/Kconfig        |  13 ++++
>>   net/smc/Makefile       |   2 +-
>>   net/smc/af_smc.c       |  12 +++-
>>   net/smc/smc_loopback.c | 165 +++++++++++++++++++++++++++++++++++++++++++++++++
>>   net/smc/smc_loopback.h |  33 ++++++++++
>>   5 files changed, 223 insertions(+), 2 deletions(-)
>>   create mode 100644 net/smc/smc_loopback.c
>>   create mode 100644 net/smc/smc_loopback.h
> 
> 
> Hello Wen Gu,
> 
> thank you for adding the Kconfig, so the distributions can decide when to offer this feature.
> 
> I propose you add some kind of runtime switch as well. Not every user who loads the SMC module
> may want to exploit smcd-loopback. Especially in native environements without containers.
> 
> If no RoCE interfaces or no ISM interfaces exist, the respective handling is skipped in SMC.
> If loopback is always created unconditionally, there is no way to opt-out.

Yes, I need to think about this. Make a runtime switch to enable/disable the loopback-ism just
like ip link up/down. An rough idea is to add an smc-tools command, like 'smcd device disable/enable loopback'.

Thank you very much.

Regards,
Wen Gu
Wen Gu Sept. 25, 2023, 2:20 p.m. UTC | #4
On 2023/9/25 21:29, Alexandra Winter wrote:
> 
> 
> On 25.09.23 13:50, Alexandra Winter wrote:
>>
>>
>> On 24.09.23 17:16, Wen Gu wrote:
>>> This patch introduces a kind of loopback device for SMC-D. The device
>>> is created when SMC module is loaded and destroyed when the SMC module
>>> is unloaded. The loopback device is a kernel device used only by the
>>> SMC module and is not restricted by net namespace, so it can be used
>>> for local inter-process or inter-container communication.
>>>
>>> Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
>>> ---
>>>   net/smc/Kconfig        |  13 ++++
>>>   net/smc/Makefile       |   2 +-
>>>   net/smc/af_smc.c       |  12 +++-
>>>   net/smc/smc_loopback.c | 165 +++++++++++++++++++++++++++++++++++++++++++++++++
>>>   net/smc/smc_loopback.h |  33 ++++++++++
>>>   5 files changed, 223 insertions(+), 2 deletions(-)
>>>   create mode 100644 net/smc/smc_loopback.c
>>>   create mode 100644 net/smc/smc_loopback.h
>>
>>
>> Hello Wen Gu,
>>
>> thank you for adding the Kconfig, so the distributions can decide when to offer this feature.
>>
>> I propose you add some kind of runtime switch as well. Not every user who loads the SMC module
>> may want to exploit smcd-loopback. Especially in native environements without containers.
>>
>> If no RoCE interfaces or no ISM interfaces exist, the respective handling is skipped in SMC.
>> If loopback is always created unconditionally, there is no way to opt-out.
>>
> 
> Another thing came to my mind:
> 
> When loopback is created and registered when the SMC module is loaded, it will implicitly always have highest priority, right?
> That should be stated somewhere.
> Also, if you create a runtime switch this will change, so then you need to decide about priority of loopback vs ISM device (and other future smcd-devices).

Yes. I think the question may become 'How users to define the priority of existing the smcd devices'. In the past,
all the ISMv2 has nearly same performance so priority is not very important. But now there are other virtual ISM 
devices, they perform differently.

My rough idea is defining a fixed priority, such as whenever loopback-ism is enabled, it is always the first in the
slots. If fixed priority is not appropriate, low-priority devices can be prioritized by disabling high-priority devices.

So it seems that the runtime switch of the loopback-ism is even more necessary.

Thanks,
Wen Gu
Dust Li Sept. 25, 2023, 3:18 p.m. UTC | #5
On Mon, Sep 25, 2023 at 01:50:22PM +0200, Alexandra Winter wrote:
>
>
>On 24.09.23 17:16, Wen Gu wrote:
>> This patch introduces a kind of loopback device for SMC-D. The device
>> is created when SMC module is loaded and destroyed when the SMC module
>> is unloaded. The loopback device is a kernel device used only by the
>> SMC module and is not restricted by net namespace, so it can be used
>> for local inter-process or inter-container communication.
>> 
>> Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
>> ---
>>  net/smc/Kconfig        |  13 ++++
>>  net/smc/Makefile       |   2 +-
>>  net/smc/af_smc.c       |  12 +++-
>>  net/smc/smc_loopback.c | 165 +++++++++++++++++++++++++++++++++++++++++++++++++
>>  net/smc/smc_loopback.h |  33 ++++++++++
>>  5 files changed, 223 insertions(+), 2 deletions(-)
>>  create mode 100644 net/smc/smc_loopback.c
>>  create mode 100644 net/smc/smc_loopback.h
>
>
>Hello Wen Gu,
>
>thank you for adding the Kconfig, so the distributions can decide when to offer this feature.
>
>I propose you add some kind of runtime switch as well. Not every user who loads the SMC module
>may want to exploit smcd-loopback. Especially in native environements without containers.
>
>If no RoCE interfaces or no ISM interfaces exist, the respective handling is skipped in SMC.
>If loopback is always created unconditionally, there is no way to opt-out.

Hi Sandy,

After talking to Wen Gu offline, I think the real issue here might be
we don't have an abstract layer in SMC, something like net/core/dev.c

Without this, we cannot do:

1. Enable/disable those devices dynamically
   Currently, If we want to disable a SMC-R device to communicate with
   others, we need to refer to 'ip link set dev xxx down' to disable the
   netdevice, then Infiniband subsystem will notify SMC that the state of
   the IB device has changed. We cannot explicitly choose not to use some
   specific IB/RoCE devices without disable totally.
   If the loopback device need to support enable/disable itself, I
   think it might be better to enable this feature for all SMC devices.

2. Do statistics per device
   Now, we have to relay on IB/RoCE devices' hardware statistics to see
   how many packets/bytes we have sent through this device.

Both the above issues get worse when the IB/RoCE device is shared by SMC
and userspace RDMA applications. If SMC-R and userspace RDMA applications
run at the same time, we can't enable the device to run userspace RDMA
applications while block it from running SMC. For statistics, we cannot
tell how many packets/bytes were sent by SMC and how many were sent by
userspace RDMA applications.

So I think those are better to support in the SMC layer.

Best regards!
Dust
Alexandra Winter Sept. 26, 2023, 7:24 a.m. UTC | #6
On 25.09.23 17:18, Dust Li wrote:
>> Hello Wen Gu,
>>
>> thank you for adding the Kconfig, so the distributions can decide when to offer this feature.
>>
>> I propose you add some kind of runtime switch as well. Not every user who loads the SMC module
>> may want to exploit smcd-loopback. Especially in native environements without containers.
>>
>> If no RoCE interfaces or no ISM interfaces exist, the respective handling is skipped in SMC.
>> If loopback is always created unconditionally, there is no way to opt-out.
> Hi Sandy,
> 
> After talking to Wen Gu offline, I think the real issue here might be
> we don't have an abstract layer in SMC, something like net/core/dev.c
> 
> Without this, we cannot do:
> 
> 1. Enable/disable those devices dynamically
>    Currently, If we want to disable a SMC-R device to communicate with
>    others, we need to refer to 'ip link set dev xxx down' to disable the
>    netdevice, then Infiniband subsystem will notify SMC that the state of
>    the IB device has changed. We cannot explicitly choose not to use some
>    specific IB/RoCE devices without disable totally.
>    If the loopback device need to support enable/disable itself, I
>    think it might be better to enable this feature for all SMC devices.
> 
> 2. Do statistics per device
>    Now, we have to relay on IB/RoCE devices' hardware statistics to see
>    how many packets/bytes we have sent through this device.
> 
> Both the above issues get worse when the IB/RoCE device is shared by SMC
> and userspace RDMA applications. If SMC-R and userspace RDMA applications
> run at the same time, we can't enable the device to run userspace RDMA
> applications while block it from running SMC. For statistics, we cannot
> tell how many packets/bytes were sent by SMC and how many were sent by
> userspace RDMA applications.
> 
> So I think those are better to support in the SMC layer.
> 
> Best regards!
> Dust

Thank you very much for your considerations. I also think a generic handling 
of these requirements in the smc layer would be best. Especially, if we want 
to add virtio-ism support soon. There we will face the same issues again.
Let's hear what others think about this.
Jan Karcher Sept. 28, 2023, 3:16 a.m. UTC | #7
On 26/09/2023 09:24, Alexandra Winter wrote:
> 
> 
> On 25.09.23 17:18, Dust Li wrote:
>>> Hello Wen Gu,
>>>
>>> thank you for adding the Kconfig, so the distributions can decide when to offer this feature.
>>>
>>> I propose you add some kind of runtime switch as well. Not every user who loads the SMC module
>>> may want to exploit smcd-loopback. Especially in native environements without containers.
>>>
>>> If no RoCE interfaces or no ISM interfaces exist, the respective handling is skipped in SMC.
>>> If loopback is always created unconditionally, there is no way to opt-out.
>> Hi Sandy,
>>
>> After talking to Wen Gu offline, I think the real issue here might be
>> we don't have an abstract layer in SMC, something like net/core/dev.c
>>
>> Without this, we cannot do:
>>
>> 1. Enable/disable those devices dynamically
>>     Currently, If we want to disable a SMC-R device to communicate with
>>     others, we need to refer to 'ip link set dev xxx down' to disable the
>>     netdevice, then Infiniband subsystem will notify SMC that the state of
>>     the IB device has changed. We cannot explicitly choose not to use some
>>     specific IB/RoCE devices without disable totally.
>>     If the loopback device need to support enable/disable itself, I
>>     think it might be better to enable this feature for all SMC devices.
>>
>> 2. Do statistics per device
>>     Now, we have to relay on IB/RoCE devices' hardware statistics to see
>>     how many packets/bytes we have sent through this device.
>>
>> Both the above issues get worse when the IB/RoCE device is shared by SMC
>> and userspace RDMA applications. If SMC-R and userspace RDMA applications
>> run at the same time, we can't enable the device to run userspace RDMA
>> applications while block it from running SMC. For statistics, we cannot
>> tell how many packets/bytes were sent by SMC and how many were sent by
>> userspace RDMA applications.
>>
>> So I think those are better to support in the SMC layer.
>>
>> Best regards!
>> Dust
> 
> Thank you very much for your considerations. I also think a generic handling
> of these requirements in the smc layer would be best. Especially, if we want
> to add virtio-ism support soon. There we will face the same issues again.
> Let's hear what others think about this.
> 
> 

Thanks you Sandy for bringing it up and Dust Li & Wen Gu for your thoughts.
I agree that such a runtime switch is needed and also that this generic 
handling would be good in the smc layer.
Wen Gu Sept. 28, 2023, 6:35 p.m. UTC | #8
On 2023/9/28 11:16, Jan Karcher wrote:
> 
> 
> On 26/09/2023 09:24, Alexandra Winter wrote:
>>
>>
>> On 25.09.23 17:18, Dust Li wrote:
>>>> Hello Wen Gu,
>>>>
>>>> thank you for adding the Kconfig, so the distributions can decide when to offer this feature.
>>>>
>>>> I propose you add some kind of runtime switch as well. Not every user who loads the SMC module
>>>> may want to exploit smcd-loopback. Especially in native environements without containers.
>>>>
>>>> If no RoCE interfaces or no ISM interfaces exist, the respective handling is skipped in SMC.
>>>> If loopback is always created unconditionally, there is no way to opt-out.
>>> Hi Sandy,
>>>
>>> After talking to Wen Gu offline, I think the real issue here might be
>>> we don't have an abstract layer in SMC, something like net/core/dev.c
>>>
>>> Without this, we cannot do:
>>>
>>> 1. Enable/disable those devices dynamically
>>>     Currently, If we want to disable a SMC-R device to communicate with
>>>     others, we need to refer to 'ip link set dev xxx down' to disable the
>>>     netdevice, then Infiniband subsystem will notify SMC that the state of
>>>     the IB device has changed. We cannot explicitly choose not to use some
>>>     specific IB/RoCE devices without disable totally.
>>>     If the loopback device need to support enable/disable itself, I
>>>     think it might be better to enable this feature for all SMC devices.
>>>
>>> 2. Do statistics per device
>>>     Now, we have to relay on IB/RoCE devices' hardware statistics to see
>>>     how many packets/bytes we have sent through this device.
>>>
>>> Both the above issues get worse when the IB/RoCE device is shared by SMC
>>> and userspace RDMA applications. If SMC-R and userspace RDMA applications
>>> run at the same time, we can't enable the device to run userspace RDMA
>>> applications while block it from running SMC. For statistics, we cannot
>>> tell how many packets/bytes were sent by SMC and how many were sent by
>>> userspace RDMA applications.
>>>
>>> So I think those are better to support in the SMC layer.
>>>
>>> Best regards!
>>> Dust
>>
>> Thank you very much for your considerations. I also think a generic handling
>> of these requirements in the smc layer would be best. Especially, if we want
>> to add virtio-ism support soon. There we will face the same issues again.
>> Let's hear what others think about this.
>>
>>
> 
> Thanks you Sandy for bringing it up and Dust Li & Wen Gu for your thoughts.
> I agree that such a runtime switch is needed and also that this generic handling would be good in the smc layer.

Right. runtime switch is necessary. I'm trying some ways to see which one is more suitable.


As for implementing a abstract layer that capable of handling 1) enable/disable SMC usage of
RDMA/ISM devices. 2) count packets/bytes of RDMA/ISM devices that generated/consumed by SMC,
I believe it would be helpful, and IMHO its architecture may be:

----------------------------------------------
                   SMC protocol
     (af_smc.c / smc_core.c / smc_clc.c ...)
----------------------------------------------
           Abstract layer of SMC device
       (define SMC device common operations)
----------------------------------------------
   RDMA device |        (virt) ISM device
   (smc_ib.c)  |   (smc_ism.c / smc_loopback.c)
----------------------------------------------

But I also believe this may require a lot of works and may be a long-term job.

If only for the virtual ISM device, e.g.loopback-ism, I am considering adding it to the Linux
device tree (/sys/devices/virtual/) to make it more 'device-like', and controlling its
enable/disable and get the statistics through some files, such as
echo 1 > /sys/devices/virtual/loopback-ism/alive
or
cat /sys/devices/virtual/loopback-ism/statistics/{rx|tx}_{bytes|packets}
(similar to what tcp lo have in /sys/devices/virtual/net/lo)

What are your thoughts on it? Thanks.


--
A little off-topic, it's currently China's National Day holiday, which lasts for about a week,
so we are now on vacation. As a result, my responses might be a bit slower, but I will still
make time to check/reply the mail and prepare for my new version. Thank you all very much!

Regards,
Wen Gu
Alexandra Winter Sept. 29, 2023, 2:08 p.m. UTC | #9
On 28.09.23 20:35, Wen Gu wrote:
> 
> 
> On 2023/9/28 11:16, Jan Karcher wrote:
>>
>>
>> On 26/09/2023 09:24, Alexandra Winter wrote:
>>>
>>>
>>> On 25.09.23 17:18, Dust Li wrote:
>>>>> Hello Wen Gu,
>>>>>
>>>>> thank you for adding the Kconfig, so the distributions can decide when to offer this feature.
>>>>>
>>>>> I propose you add some kind of runtime switch as well. Not every user who loads the SMC module
>>>>> may want to exploit smcd-loopback. Especially in native environements without containers.
>>>>>
>>>>> If no RoCE interfaces or no ISM interfaces exist, the respective handling is skipped in SMC.
>>>>> If loopback is always created unconditionally, there is no way to opt-out.
>>>> Hi Sandy,
>>>>
>>>> After talking to Wen Gu offline, I think the real issue here might be
>>>> we don't have an abstract layer in SMC, something like net/core/dev.c
>>>>
>>>> Without this, we cannot do:
>>>>
>>>> 1. Enable/disable those devices dynamically
>>>>     Currently, If we want to disable a SMC-R device to communicate with
>>>>     others, we need to refer to 'ip link set dev xxx down' to disable the
>>>>     netdevice, then Infiniband subsystem will notify SMC that the state of
>>>>     the IB device has changed. We cannot explicitly choose not to use some
>>>>     specific IB/RoCE devices without disable totally.
>>>>     If the loopback device need to support enable/disable itself, I
>>>>     think it might be better to enable this feature for all SMC devices.
>>>>
>>>> 2. Do statistics per device
>>>>     Now, we have to relay on IB/RoCE devices' hardware statistics to see
>>>>     how many packets/bytes we have sent through this device.
>>>>
>>>> Both the above issues get worse when the IB/RoCE device is shared by SMC
>>>> and userspace RDMA applications. If SMC-R and userspace RDMA applications
>>>> run at the same time, we can't enable the device to run userspace RDMA
>>>> applications while block it from running SMC. For statistics, we cannot
>>>> tell how many packets/bytes were sent by SMC and how many were sent by
>>>> userspace RDMA applications.
>>>>
>>>> So I think those are better to support in the SMC layer.
>>>>
>>>> Best regards!
>>>> Dust
>>>
>>> Thank you very much for your considerations. I also think a generic handling
>>> of these requirements in the smc layer would be best. Especially, if we want
>>> to add virtio-ism support soon. There we will face the same issues again.
>>> Let's hear what others think about this.
>>>
>>>
>>
>> Thanks you Sandy for bringing it up and Dust Li & Wen Gu for your thoughts.
>> I agree that such a runtime switch is needed and also that this generic handling would be good in the smc layer.
> 
> Right. runtime switch is necessary. I'm trying some ways to see which one is more suitable.
> 
> 
> As for implementing a abstract layer that capable of handling 1) enable/disable SMC usage of
> RDMA/ISM devices. 2) count packets/bytes of RDMA/ISM devices that generated/consumed by SMC,
> I believe it would be helpful, and IMHO its architecture may be:
> 
> ----------------------------------------------
>                   SMC protocol
>     (af_smc.c / smc_core.c / smc_clc.c ...)
> ----------------------------------------------
>           Abstract layer of SMC device
>       (define SMC device common operations)
> ----------------------------------------------
>   RDMA device |        (virt) ISM device
>   (smc_ib.c)  |   (smc_ism.c / smc_loopback.c)
> ----------------------------------------------
> 
> But I also believe this may require a lot of works and may be a long-term job.
> 

I like that concept a lot. If we can agree on a direction, we can define
meaningful pieces and approach it piece by piece.


> If only for the virtual ISM device, e.g.loopback-ism, I am considering adding it to the Linux
> device tree (/sys/devices/virtual/) to make it more 'device-like', and controlling its
> enable/disable and get the statistics through some files, such as
> echo 1 > /sys/devices/virtual/loopback-ism/alive
> or
> cat /sys/devices/virtual/loopback-ism/statistics/{rx|tx}_{bytes|packets}
> (similar to what tcp lo have in /sys/devices/virtual/net/lo)
> 
> What are your thoughts on it? Thanks.
> 

Makes sense to me, but I don't have too much experience in that area.
I have never seen an attribute called 'alive' before. 
I think attributes like 'power', 'enable' or 'online' are used for other device types.

> 
> -- 
> A little off-topic, it's currently China's National Day holiday, which lasts for about a week,
> so we are now on vacation. As a result, my responses might be a bit slower, but I will still
> make time to check/reply the mail and prepare for my new version. Thank you all very much!
> 
> Regards,
> Wen Gu

Next week is Germany's national holiday, so many of us are out as well.
Wen Gu Oct. 4, 2023, 9:05 a.m. UTC | #10
On 2023/9/29 22:08, Alexandra Winter wrote:
> 
> 
> On 28.09.23 20:35, Wen Gu wrote:
>>
>>
>> On 2023/9/28 11:16, Jan Karcher wrote:
>>>
>>>
>>> On 26/09/2023 09:24, Alexandra Winter wrote:
>>>>
>>>>
>>>> On 25.09.23 17:18, Dust Li wrote:
>>>>>> Hello Wen Gu,
>>>>>>
>>>>>> thank you for adding the Kconfig, so the distributions can decide when to offer this feature.
>>>>>>
>>>>>> I propose you add some kind of runtime switch as well. Not every user who loads the SMC module
>>>>>> may want to exploit smcd-loopback. Especially in native environements without containers.
>>>>>>
>>>>>> If no RoCE interfaces or no ISM interfaces exist, the respective handling is skipped in SMC.
>>>>>> If loopback is always created unconditionally, there is no way to opt-out.
>>>>> Hi Sandy,
>>>>>
>>>>> After talking to Wen Gu offline, I think the real issue here might be
>>>>> we don't have an abstract layer in SMC, something like net/core/dev.c
>>>>>
>>>>> Without this, we cannot do:
>>>>>
>>>>> 1. Enable/disable those devices dynamically
>>>>>      Currently, If we want to disable a SMC-R device to communicate with
>>>>>      others, we need to refer to 'ip link set dev xxx down' to disable the
>>>>>      netdevice, then Infiniband subsystem will notify SMC that the state of
>>>>>      the IB device has changed. We cannot explicitly choose not to use some
>>>>>      specific IB/RoCE devices without disable totally.
>>>>>      If the loopback device need to support enable/disable itself, I
>>>>>      think it might be better to enable this feature for all SMC devices.
>>>>>
>>>>> 2. Do statistics per device
>>>>>      Now, we have to relay on IB/RoCE devices' hardware statistics to see
>>>>>      how many packets/bytes we have sent through this device.
>>>>>
>>>>> Both the above issues get worse when the IB/RoCE device is shared by SMC
>>>>> and userspace RDMA applications. If SMC-R and userspace RDMA applications
>>>>> run at the same time, we can't enable the device to run userspace RDMA
>>>>> applications while block it from running SMC. For statistics, we cannot
>>>>> tell how many packets/bytes were sent by SMC and how many were sent by
>>>>> userspace RDMA applications.
>>>>>
>>>>> So I think those are better to support in the SMC layer.
>>>>>
>>>>> Best regards!
>>>>> Dust
>>>>
>>>> Thank you very much for your considerations. I also think a generic handling
>>>> of these requirements in the smc layer would be best. Especially, if we want
>>>> to add virtio-ism support soon. There we will face the same issues again.
>>>> Let's hear what others think about this.
>>>>
>>>>
>>>
>>> Thanks you Sandy for bringing it up and Dust Li & Wen Gu for your thoughts.
>>> I agree that such a runtime switch is needed and also that this generic handling would be good in the smc layer.
>>
>> Right. runtime switch is necessary. I'm trying some ways to see which one is more suitable.
>>
>>
>> As for implementing a abstract layer that capable of handling 1) enable/disable SMC usage of
>> RDMA/ISM devices. 2) count packets/bytes of RDMA/ISM devices that generated/consumed by SMC,
>> I believe it would be helpful, and IMHO its architecture may be:
>>
>> ----------------------------------------------
>>                    SMC protocol
>>      (af_smc.c / smc_core.c / smc_clc.c ...)
>> ----------------------------------------------
>>            Abstract layer of SMC device
>>        (define SMC device common operations)
>> ----------------------------------------------
>>    RDMA device |        (virt) ISM device
>>    (smc_ib.c)  |   (smc_ism.c / smc_loopback.c)
>> ----------------------------------------------
>>
>> But I also believe this may require a lot of works and may be a long-term job.
>>
> 
> I like that concept a lot. If we can agree on a direction, we can define
> meaningful pieces and approach it piece by piece.
> 

Yes. It can be added to our interlock's backup list.

> 
>> If only for the virtual ISM device, e.g.loopback-ism, I am considering adding it to the Linux
>> device tree (/sys/devices/virtual/) to make it more 'device-like', and controlling its
>> enable/disable and get the statistics through some files, such as
>> echo 1 > /sys/devices/virtual/loopback-ism/alive
>> or
>> cat /sys/devices/virtual/loopback-ism/statistics/{rx|tx}_{bytes|packets}
>> (similar to what tcp lo have in /sys/devices/virtual/net/lo)
>>
>> What are your thoughts on it? Thanks.
>>
> 
> Makes sense to me, but I don't have too much experience in that area.
> I have never seen an attribute called 'alive' before.
> I think attributes like 'power', 'enable' or 'online' are used for other device types.
> 

Thanks. I will refer to existing devices for reference.

>>
>> -- 
>> A little off-topic, it's currently China's National Day holiday, which lasts for about a week,
>> so we are now on vacation. As a result, my responses might be a bit slower, but I will still
>> make time to check/reply the mail and prepare for my new version. Thank you all very much!
>>
>> Regards,
>> Wen Gu
> 
> Next week is Germany's national holiday, so many of us are out as well.

Have a nice holiday! :)
diff mbox series

Patch

diff --git a/net/smc/Kconfig b/net/smc/Kconfig
index 1ab3c5a..3e9e03e 100644
--- a/net/smc/Kconfig
+++ b/net/smc/Kconfig
@@ -19,3 +19,16 @@  config SMC_DIAG
 	  smcss.
 
 	  if unsure, say Y.
+
+config SMC_LO
+	bool "SMC_LO: virtual loopback-ism for SMC"
+	depends on SMC
+	default n
+	help
+	  SMC_LO provides a kind of virtual ISM device called loopback-ism
+	  for SMC-D to upgrade AF_INET TCP connections whose ends share the
+	  same kernel.
+	  loopback-ism is a software-implemented simulation device that does
+	  not depend on a specific architecture or hardware.
+
+	  if unsure, say N.
diff --git a/net/smc/Makefile b/net/smc/Makefile
index 875efcd..a8c3711 100644
--- a/net/smc/Makefile
+++ b/net/smc/Makefile
@@ -4,5 +4,5 @@  obj-$(CONFIG_SMC)	+= smc.o
 obj-$(CONFIG_SMC_DIAG)	+= smc_diag.o
 smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o
 smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o
-smc-y += smc_tracepoint.o
+smc-y += smc_tracepoint.o smc_loopback.o
 smc-$(CONFIG_SYSCTL) += smc_sysctl.o
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index c5b7716..6435659 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -53,6 +53,7 @@ 
 #include "smc_stats.h"
 #include "smc_tracepoint.h"
 #include "smc_sysctl.h"
+#include "smc_loopback.h"
 
 static DEFINE_MUTEX(smc_server_lgr_pending);	/* serialize link group
 						 * creation on server
@@ -3565,15 +3566,23 @@  static int __init smc_init(void)
 		goto out_sock;
 	}
 
+	rc = smc_loopback_init();
+	if (rc) {
+		pr_err("%s: smc_loopback_init fails with %d\n", __func__, rc);
+		goto out_ib;
+	}
+
 	rc = tcp_register_ulp(&smc_ulp_ops);
 	if (rc) {
 		pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc);
-		goto out_ib;
+		goto out_lo;
 	}
 
 	static_branch_enable(&tcp_have_smc);
 	return 0;
 
+out_lo:
+	smc_loopback_exit();
 out_ib:
 	smc_ib_unregister_client();
 out_sock:
@@ -3611,6 +3620,7 @@  static void __exit smc_exit(void)
 	tcp_unregister_ulp(&smc_ulp_ops);
 	sock_unregister(PF_SMC);
 	smc_core_exit();
+	smc_loopback_exit();
 	smc_ib_unregister_client();
 	smc_ism_exit();
 	destroy_workqueue(smc_close_wq);
diff --git a/net/smc/smc_loopback.c b/net/smc/smc_loopback.c
new file mode 100644
index 0000000..4631236
--- /dev/null
+++ b/net/smc/smc_loopback.c
@@ -0,0 +1,165 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *  Shared Memory Communications Direct over loopback device.
+ *
+ *  Provide a SMC-D loopback dummy device.
+ *
+ *  Copyright (c) 2022, Alibaba Inc.
+ *
+ *  Author: Wen Gu <guwen@linux.alibaba.com>
+ *          Tony Lu <tonylu@linux.alibaba.com>
+ *
+ */
+
+#include <linux/device.h>
+#include <linux/types.h>
+#include <net/smc.h>
+
+#include "smc_ism.h"
+#include "smc_loopback.h"
+
+#if IS_ENABLED(CONFIG_SMC_LO)
+static const char smc_lo_dev_name[] = "smc_lo";
+static struct smc_lo_dev *lo_dev;
+
+static const struct smcd_ops lo_ops = {
+	.query_remote_gid	= NULL,
+	.register_dmb		= NULL,
+	.unregister_dmb		= NULL,
+	.add_vlan_id		= NULL,
+	.del_vlan_id		= NULL,
+	.set_vlan_required	= NULL,
+	.reset_vlan_required	= NULL,
+	.signal_event		= NULL,
+	.move_data		= NULL,
+	.supports_v2		= NULL,
+	.get_system_eid		= NULL,
+	.get_local_gid		= NULL,
+	.get_chid		= NULL,
+	.get_dev		= NULL,
+};
+
+static struct smcd_dev *smcd_lo_alloc_dev(const struct smcd_ops *ops,
+					  int max_dmbs)
+{
+	struct smcd_dev *smcd;
+
+	smcd = kzalloc(sizeof(*smcd), GFP_KERNEL);
+	if (!smcd)
+		return NULL;
+
+	smcd->conn = kcalloc(max_dmbs, sizeof(struct smc_connection *),
+			     GFP_KERNEL);
+	if (!smcd->conn)
+		goto out_smcd;
+
+	smcd->ops = ops;
+
+	spin_lock_init(&smcd->lock);
+	spin_lock_init(&smcd->lgr_lock);
+	INIT_LIST_HEAD(&smcd->vlan);
+	INIT_LIST_HEAD(&smcd->lgr_list);
+	init_waitqueue_head(&smcd->lgrs_deleted);
+	return smcd;
+
+out_smcd:
+	kfree(smcd);
+	return NULL;
+}
+
+static int smcd_lo_register_dev(struct smc_lo_dev *ldev)
+{
+	struct smcd_dev *smcd;
+
+	smcd = smcd_lo_alloc_dev(&lo_ops, SMC_LODEV_MAX_DMBS);
+	if (!smcd)
+		return -ENOMEM;
+
+	ldev->smcd = smcd;
+	smcd->priv = ldev;
+
+	/* TODO:
+	 * register smc_lo to smcd_dev list.
+	 */
+	return 0;
+}
+
+static void smcd_lo_unregister_dev(struct smc_lo_dev *ldev)
+{
+	/* TODO:
+	 * unregister smc_lo from smcd_dev list.
+	 */
+}
+
+static void smc_lo_dev_release(struct device *dev)
+{
+	struct smc_lo_dev *ldev =
+		container_of(dev, struct smc_lo_dev, dev);
+	struct smcd_dev *smcd = ldev->smcd;
+
+	kfree(smcd->conn);
+	kfree(smcd);
+	kfree(ldev);
+}
+
+static int smc_lo_dev_init(struct smc_lo_dev *ldev)
+{
+	return smcd_lo_register_dev(ldev);
+}
+
+static int smc_lo_dev_probe(void)
+{
+	struct smc_lo_dev *ldev;
+	int ret;
+
+	ldev = kzalloc(sizeof(*ldev), GFP_KERNEL);
+	if (!ldev)
+		return -ENOMEM;
+
+	ldev->dev.parent = NULL;
+	ldev->dev.release = smc_lo_dev_release;
+	device_initialize(&ldev->dev);
+	dev_set_name(&ldev->dev, smc_lo_dev_name);
+
+	ret = smc_lo_dev_init(ldev);
+	if (ret)
+		goto free_dev;
+
+	lo_dev = ldev; /* global loopback device */
+	return 0;
+
+free_dev:
+	kfree(ldev);
+	return ret;
+}
+
+static void smc_lo_dev_exit(struct smc_lo_dev *ldev)
+{
+	smcd_lo_unregister_dev(ldev);
+}
+
+static void smc_lo_dev_remove(void)
+{
+	if (!lo_dev)
+		return;
+
+	smc_lo_dev_exit(lo_dev);
+	put_device(&lo_dev->dev); /* device_initialize in smc_lo_dev_probe */
+}
+#endif
+
+int smc_loopback_init(void)
+{
+#if IS_ENABLED(CONFIG_SMC_LO)
+	return smc_lo_dev_probe();
+#else
+	return 0;
+#endif
+}
+
+void smc_loopback_exit(void)
+{
+#if IS_ENABLED(CONFIG_SMC_LO)
+	smc_lo_dev_remove();
+#endif
+}
diff --git a/net/smc/smc_loopback.h b/net/smc/smc_loopback.h
new file mode 100644
index 0000000..39aa1dc
--- /dev/null
+++ b/net/smc/smc_loopback.h
@@ -0,0 +1,33 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ *  Shared Memory Communications Direct over loopback device.
+ *
+ *  Provide a SMC-D loopback dummy device.
+ *
+ *  Copyright (c) 2022, Alibaba Inc.
+ *
+ *  Author: Wen Gu <guwen@linux.alibaba.com>
+ *          Tony Lu <tonylu@linux.alibaba.com>
+ *
+ */
+
+#ifndef _SMC_LOOPBACK_H
+#define _SMC_LOOPBACK_H
+
+#include <linux/device.h>
+#include <linux/err.h>
+#include <net/smc.h>
+
+#if IS_ENABLED(CONFIG_SMC_LO)
+#define SMC_LODEV_MAX_DMBS 5000
+
+struct smc_lo_dev {
+	struct smcd_dev *smcd;
+	struct device dev;
+};
+#endif
+
+int smc_loopback_init(void);
+void smc_loopback_exit(void);
+
+#endif /* _SMC_LOOPBACK_H */