diff mbox series

net: tuntap: add ioctl() TUNGETQUEUEINDX to fetch queue index

Message ID 20240731111940.8383-1-ayaka@soulik.info (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series net: tuntap: add ioctl() TUNGETQUEUEINDX to fetch queue index | expand

Checks

Context Check Description
netdev/series_format warning Single patches do not need cover letters; Target tree name not specified in the subject
netdev/tree_selection success Guessed tree name to be net-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 42 this patch: 42
netdev/build_tools success Errors and warnings before: 10 this patch: 10
netdev/cc_maintainers success CCed 6 of 6 maintainers
netdev/build_clang success Errors and warnings before: 43 this patch: 43
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 48 this patch: 48
netdev/checkpatch fail ERROR: "(foo*)" should be "(foo *)"
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
netdev/contest success net-next-2024-07-31--21-00 (tests: 706)

Commit Message

ayaka July 31, 2024, 11:19 a.m. UTC
We need the queue index in qdisc mapping rules. There is currently no
way to fetch it.

Signed-off-by: Randy Li <ayaka@soulik.info>
---
 drivers/net/tap.c           | 9 +++++++++
 drivers/net/tun.c           | 4 ++++
 include/uapi/linux/if_tun.h | 1 +
 3 files changed, 14 insertions(+)
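
[For context, a minimal userspace sketch, illustrative only and not part
of the submitted patch, of how the new ioctl could be used. The tun
setup flags and device name are assumptions; error handling is omitted.]

/* Open one queue of a multi-queue tun device, then ask the kernel which
 * queue index it was assigned via the proposed TUNGETQUEUEINDEX. */
#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <net/if.h>
#include <linux/if_tun.h>

static int open_tun_queue(const char *name, unsigned int *index)
{
        struct ifreq ifr;
        int fd = open("/dev/net/tun", O_RDWR);

        memset(&ifr, 0, sizeof(ifr));
        ifr.ifr_flags = IFF_TUN | IFF_NO_PI | IFF_MULTI_QUEUE;
        strncpy(ifr.ifr_name, name, IFNAMSIZ - 1);
        ioctl(fd, TUNSETIFF, &ifr);

        ioctl(fd, TUNGETQUEUEINDEX, index);  /* ioctl added by this patch */
        return fd;
}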

Comments

Willem de Bruijn July 31, 2024, 2:12 p.m. UTC | #1
Randy Li wrote:
> We need the queue index in qdisc mapping rule. There is no way to
> fetch that.

In which command exactly?

> Signed-off-by: Randy Li <ayaka@soulik.info>
> ---
>  drivers/net/tap.c           | 9 +++++++++
>  drivers/net/tun.c           | 4 ++++
>  include/uapi/linux/if_tun.h | 1 +
>  3 files changed, 14 insertions(+)
> 
> diff --git a/drivers/net/tap.c b/drivers/net/tap.c
> index 77574f7a3bd4..6099f27a0a1f 100644
> --- a/drivers/net/tap.c
> +++ b/drivers/net/tap.c
> @@ -1120,6 +1120,15 @@ static long tap_ioctl(struct file *file, unsigned int cmd,
>  		rtnl_unlock();
>  		return ret;
>  
> +	case TUNGETQUEUEINDEX:
> +		rtnl_lock();
> +		if (!q->enabled)
> +			ret = -EINVAL;
> +

Below will just overwrite the above ret

> +		ret = put_user(q->queue_index, up);
> +		rtnl_unlock();
> +		return ret;
> +
>  	case SIOCGIFHWADDR:
>  		rtnl_lock();
>  		tap = tap_get_tap_dev(q);
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index 1d06c560c5e6..5473a0fca2e1 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -3115,6 +3115,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>  		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
>  			return -EPERM;
>  		return open_related_ns(&net->ns, get_net_ns);
> +	} else if (cmd == TUNGETQUEUEINDEX) {
> +		if (tfile->detached)
> +			return -EINVAL;
> +		return put_user(tfile->queue_index, (unsigned int __user*)argp);

Unless you're certain that these fields can be read without RTNL, move
below rtnl_lock() statement.

>  	}
>  
>  	rtnl_lock();
> diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
> index 287cdc81c939..2668ca3b06a5 100644
> --- a/include/uapi/linux/if_tun.h
> +++ b/include/uapi/linux/if_tun.h
> @@ -61,6 +61,7 @@
>  #define TUNSETFILTEREBPF _IOR('T', 225, int)
>  #define TUNSETCARRIER _IOW('T', 226, int)
>  #define TUNGETDEVNETNS _IO('T', 227)
> +#define TUNGETQUEUEINDEX _IOR('T', 228, unsigned int)
>  
>  /* TUNSETIFF ifr flags */
>  #define IFF_TUN		0x0001
> -- 
> 2.45.2
>
ayaka July 31, 2024, 4:45 p.m. UTC | #2
On 2024/7/31 22:12, Willem de Bruijn wrote:
> Randy Li wrote:
>> We need the queue index in qdisc mapping rule. There is no way to
>> fetch that.
> In which command exactly?

That is for sch_multiq; here is an example:

tc qdisc add dev tun0 root handle 1: multiq

tc filter add dev tun0 parent 1: protocol ip prio 1 u32 match ip dst 172.16.10.1 action skbedit queue_mapping 0
tc filter add dev tun0 parent 1: protocol ip prio 1 u32 match ip dst 172.16.10.20 action skbedit queue_mapping 1
tc filter add dev tun0 parent 1: protocol ip prio 1 u32 match ip dst 172.16.10.10 action skbedit queue_mapping 2


The purpose here is to take advantage of multiple threads. On the
server side (the gateway of the tunnel's subnet), each peer usually
uses a different encryption/decryption key pair, so it is better to
handle each peer in its own thread. Otherwise the application would
need to implement a dispatcher itself.


I am a newbie to tc(8); I verified the commands above with a
multi-threaded tun demo. But I don't know how to drop the unwanted
ingress traffic here, so queue 0 may be a little broken.

>
>> Signed-off-by: Randy Li <ayaka@soulik.info>
>> ---
>>   drivers/net/tap.c           | 9 +++++++++
>>   drivers/net/tun.c           | 4 ++++
>>   include/uapi/linux/if_tun.h | 1 +
>>   3 files changed, 14 insertions(+)
>>
>> diff --git a/drivers/net/tap.c b/drivers/net/tap.c
>> index 77574f7a3bd4..6099f27a0a1f 100644
>> --- a/drivers/net/tap.c
>> +++ b/drivers/net/tap.c
>> @@ -1120,6 +1120,15 @@ static long tap_ioctl(struct file *file, unsigned int cmd,
>>   		rtnl_unlock();
>>   		return ret;
>>   
>> +	case TUNGETQUEUEINDEX:
>> +		rtnl_lock();
>> +		if (!q->enabled)
>> +			ret = -EINVAL;
>> +
> Below will just overwrite the above ret
Sorry, I didn't verify the tap type.
>
>> +		ret = put_user(q->queue_index, up);
>> +		rtnl_unlock();
>> +		return ret;
>> +
>>   	case SIOCGIFHWADDR:
>>   		rtnl_lock();
>>   		tap = tap_get_tap_dev(q);
>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>> index 1d06c560c5e6..5473a0fca2e1 100644
>> --- a/drivers/net/tun.c
>> +++ b/drivers/net/tun.c
>> @@ -3115,6 +3115,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>>   		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
>>   			return -EPERM;
>>   		return open_related_ns(&net->ns, get_net_ns);
>> +	} else if (cmd == TUNGETQUEUEINDEX) {
>> +		if (tfile->detached)
>> +			return -EINVAL;
>> +		return put_user(tfile->queue_index, (unsigned int __user*)argp);
> Unless you're certain that these fields can be read without RTNL, move
> below rtnl_lock() statement.
Would fix in v2.
>>   	}
>>   
>>   	rtnl_lock();
>> diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
>> index 287cdc81c939..2668ca3b06a5 100644
>> --- a/include/uapi/linux/if_tun.h
>> +++ b/include/uapi/linux/if_tun.h
>> @@ -61,6 +61,7 @@
>>   #define TUNSETFILTEREBPF _IOR('T', 225, int)
>>   #define TUNSETCARRIER _IOW('T', 226, int)
>>   #define TUNGETDEVNETNS _IO('T', 227)
>> +#define TUNGETQUEUEINDEX _IOR('T', 228, unsigned int)
>>   
>>   /* TUNSETIFF ifr flags */
>>   #define IFF_TUN		0x0001
>> -- 
>> 2.45.2
>>
>
Willem de Bruijn July 31, 2024, 9:57 p.m. UTC | #3
nits:

- INDX->INDEX. It's correct in the code
- prefix networking patches with the target tree: PATCH net-next

Randy Li wrote:
> 
> On 2024/7/31 22:12, Willem de Bruijn wrote:
> > Randy Li wrote:
> >> We need the queue index in qdisc mapping rule. There is no way to
> >> fetch that.
> > In which command exactly?
> 
> That is for sch_multiq, here is an example
> 
> tc qdisc add dev  tun0 root handle 1: multiq
> 
> tc filter add dev tun0 parent 1: protocol ip prio 1 u32 match ip dst 
> 172.16.10.1 action skbedit queue_mapping 0
> tc filter add dev tun0 parent 1: protocol ip prio 1 u32 match ip dst 
> 172.16.10.20 action skbedit queue_mapping 1
> 
> tc filter add dev tun0 parent 1: protocol ip prio 1 u32 match ip dst 
> 172.16.10.10 action skbedit queue_mapping 2

If using an IFF_MULTI_QUEUE tun device, packets are automatically
load balanced across the multiple queues, in tun_select_queue.

If you want more explicit queue selection than by rxhash, tun
supports TUNSETSTEERINGEBPF.
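
(For illustration, a rough sketch of attaching such a program;
assumptions: prog_fd refers to an already-loaded
BPF_PROG_TYPE_SOCKET_FILTER program, whose return value tun uses to
pick the queue, modulo the number of queues.)

#include <sys/ioctl.h>
#include <linux/if_tun.h>

/* Attach an already-loaded steering program to a tun queue fd. */
static int attach_steering_prog(int tun_fd, int prog_fd)
{
        return ioctl(tun_fd, TUNSETSTEERINGEBPF, &prog_fd);
}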

> 
> The purpose here is taking advantage of the multiple threads. For the 
> the server side(gateway of the tunnel's subnet), usually a different 
> peer would invoked a different encryption/decryption key pair, it would 
> be better to handle each in its own thread. Or the application would 
> need to implement a dispatcher here.

A thread in which context? Or do you mean queue?

> 
> I am newbie to the tc(8), I verified the command above with a tun type 
> multiple threads demo. But I don't know how to drop the unwanted ingress 
> filter here, the queue 0 may be a little broken.

Not opposed to exposing the queue index if there is a need. Not sure
yet that there is.

Also, since for an IFF_MULTI_QUEUE the queue_id is just assigned
iteratively, it can also be inferred without an explicit call.
ayaka Aug. 1, 2024, 9:15 a.m. UTC | #4
On 2024/8/1 05:57, Willem de Bruijn wrote:
> nits:
>
> - INDX->INDEX. It's correct in the code
> - prefix networking patches with the target tree: PATCH net-next
I see.
>
> Randy Li wrote:
>> On 2024/7/31 22:12, Willem de Bruijn wrote:
>>> Randy Li wrote:
>>>> We need the queue index in qdisc mapping rule. There is no way to
>>>> fetch that.
>>> In which command exactly?
>> That is for sch_multiq, here is an example
>>
>> tc qdisc add dev  tun0 root handle 1: multiq
>>
>> tc filter add dev tun0 parent 1: protocol ip prio 1 u32 match ip dst
>> 172.16.10.1 action skbedit queue_mapping 0
>> tc filter add dev tun0 parent 1: protocol ip prio 1 u32 match ip dst
>> 172.16.10.20 action skbedit queue_mapping 1
>>
>> tc filter add dev tun0 parent 1: protocol ip prio 1 u32 match ip dst
>> 172.16.10.10 action skbedit queue_mapping 2
> If using an IFF_MULTI_QUEUE tun device, packets are automatically
> load balanced across the multiple queues, in tun_select_queue.
>
> If you want more explicit queue selection than by rxhash, tun
> supports TUNSETSTEERINGEBPF.

I know about this eBPF mechanism, but I am a newbie to eBPF as well and
haven't figured out how to configure the eBPF program dynamically.

Besides, I think I would still need to know which queue is the target
inside the eBPF program.

>> The purpose here is taking advantage of the multiple threads. For the
>> the server side(gateway of the tunnel's subnet), usually a different
>> peer would invoked a different encryption/decryption key pair, it would
>> be better to handle each in its own thread. Or the application would
>> need to implement a dispatcher here.
> A thread in which context? Or do you mean queue?
The threads are in userspace. Each thread is responsible for one queue.
>
>> I am newbie to the tc(8), I verified the command above with a tun type
>> multiple threads demo. But I don't know how to drop the unwanted ingress
>> filter here, the queue 0 may be a little broken.
> Not opposed to exposing the queue index if there is a need. Not sure
> yet that there is.
>
> Also, since for an IFF_MULTI_QUEUE the queue_id is just assigned
> iteratively, it can also be inferred without an explicit call.

I don't think queue creation is serialized, unless the application uses
an explicit lock itself.

That does become a problem when a queue is disabled: the kernel swaps
the disabled queue's index with the last queue's, so the queue index
has to be fetched again and the qdisc flow rules have to be updated.

Could I submit a ***new*** patch that leaves a hole in the queue index
space instead? It would also apply when re-enabling the queue.

> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index 1d06c560c5e6..5473a0fca2e1 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -3115,6 +3115,10 @@ static long __tun_chr_ioctl(struct file *file, 
> unsigned int cmd,
>           if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
>               return -EPERM;
>           return open_related_ns(&net->ns, get_net_ns);
> +    } else if (cmd == TUNGETQUEUEINDEX) {
> +        if (tfile->detached)
> +            return -EINVAL;
> +        return put_user(tfile->queue_index, (unsigned int __user*)argp);
> Unless you're certain that these fields can be read without RTNL, move
> below rtnl_lock() statement.
> Would fix in v2. 
I was trying not to hold the global lock for a long period; that is why
I didn't send v2 yesterday.

When I wrote this, I saw ioctl() TUNSETQUEUE->tun_attach() above. Is
the rtnl_lock() scope the lightest-weight lock available here?
Willem de Bruijn Aug. 1, 2024, 1:04 p.m. UTC | #5
Randy Li wrote:
> 
> On 2024/8/1 05:57, Willem de Bruijn wrote:
> > nits:
> >
> > - INDX->INDEX. It's correct in the code
> > - prefix networking patches with the target tree: PATCH net-next
> I see.
> >
> > Randy Li wrote:
> >> On 2024/7/31 22:12, Willem de Bruijn wrote:
> >>> Randy Li wrote:
> >>>> We need the queue index in qdisc mapping rule. There is no way to
> >>>> fetch that.
> >>> In which command exactly?
> >> That is for sch_multiq, here is an example
> >>
> >> tc qdisc add dev  tun0 root handle 1: multiq
> >>
> >> tc filter add dev tun0 parent 1: protocol ip prio 1 u32 match ip dst
> >> 172.16.10.1 action skbedit queue_mapping 0
> >> tc filter add dev tun0 parent 1: protocol ip prio 1 u32 match ip dst
> >> 172.16.10.20 action skbedit queue_mapping 1
> >>
> >> tc filter add dev tun0 parent 1: protocol ip prio 1 u32 match ip dst
> >> 172.16.10.10 action skbedit queue_mapping 2
> > If using an IFF_MULTI_QUEUE tun device, packets are automatically
> > load balanced across the multiple queues, in tun_select_queue.
> >
> > If you want more explicit queue selection than by rxhash, tun
> > supports TUNSETSTEERINGEBPF.
> 
> I know this eBPF thing. But I am newbie to eBPF as well I didn't figure 
> out how to config eBPF dynamically.

Lack of experience with an existing interface is insufficient reason
to introduce another interface, of course.

> Besides, I think I still need to know which queue is the target in eBPF.

See SKF_AD_QUEUE for classic BPF and __sk_buff queue_mapping for eBPF.
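
(A rough sketch only; section names follow libbpf conventions, and
whether queue_mapping is populated depends on where the program is
attached.)

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* Return the queue index recorded in the skb's queue_mapping field. */
SEC("socket")
int read_queue(struct __sk_buff *skb)
{
        return skb->queue_mapping;
}

char _license[] SEC("license") = "GPL";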

> >> The purpose here is taking advantage of the multiple threads. For the
> >> the server side(gateway of the tunnel's subnet), usually a different
> >> peer would invoked a different encryption/decryption key pair, it would
> >> be better to handle each in its own thread. Or the application would
> >> need to implement a dispatcher here.
> > A thread in which context? Or do you mean queue?
> The thread in the userspace. Each thread responds for a queue.
> >
> >> I am newbie to the tc(8), I verified the command above with a tun type
> >> multiple threads demo. But I don't know how to drop the unwanted ingress
> >> filter here, the queue 0 may be a little broken.
> > Not opposed to exposing the queue index if there is a need. Not sure
> > yet that there is.
> >
> > Also, since for an IFF_MULTI_QUEUE the queue_id is just assigned
> > iteratively, it can also be inferred without an explicit call.
> 
> I don't think there would be sequence lock in creating multiple queue. 
> Unless application uses an explicitly lock itself.
> 
> While that did makes a problem when a queue would be disabled. It would 
> swap the last queue index with that queue, leading to fetch the queue 
> index calling again, also it would request an update for the qdisc flow 
> rule.
> 
> Could I submit a ***new*** PATCH which would peak a hole, also it 
> applies for re-enabling the queue.
> 
> > diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> > index 1d06c560c5e6..5473a0fca2e1 100644
> > --- a/drivers/net/tun.c
> > +++ b/drivers/net/tun.c
> > @@ -3115,6 +3115,10 @@ static long __tun_chr_ioctl(struct file *file, 
> > unsigned int cmd,
> >           if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
> >               return -EPERM;
> >           return open_related_ns(&net->ns, get_net_ns);
> > +    } else if (cmd == TUNGETQUEUEINDEX) {
> > +        if (tfile->detached)
> > +            return -EINVAL;
> > +        return put_user(tfile->queue_index, (unsigned int __user*)argp);
> > Unless you're certain that these fields can be read without RTNL, move
> > below rtnl_lock() statement.
> > Would fix in v2. 
> I was trying to not hold the global lock or long period, that is why I 
> didn't made v2 yesterday.
> 
> When I wrote this,  I saw ioctl() TUNSETQUEUE->tun_attach() above. Is 
> the rtnl_lock() scope the lighting lock here?
>
ayaka Aug. 1, 2024, 1:36 p.m. UTC | #6
On 2024/8/1 21:04, Willem de Bruijn wrote:
> Randy Li wrote:
>> On 2024/8/1 05:57, Willem de Bruijn wrote:
>>> nits:
>>>
>>> - INDX->INDEX. It's correct in the code
>>> - prefix networking patches with the target tree: PATCH net-next
>> I see.
>>> Randy Li wrote:
>>>> On 2024/7/31 22:12, Willem de Bruijn wrote:
>>>>> Randy Li wrote:
>>>>>> We need the queue index in qdisc mapping rule. There is no way to
>>>>>> fetch that.
>>>>> In which command exactly?
>>>> That is for sch_multiq, here is an example
>>>>
>>>> tc qdisc add dev  tun0 root handle 1: multiq
>>>>
>>>> tc filter add dev tun0 parent 1: protocol ip prio 1 u32 match ip dst
>>>> 172.16.10.1 action skbedit queue_mapping 0
>>>> tc filter add dev tun0 parent 1: protocol ip prio 1 u32 match ip dst
>>>> 172.16.10.20 action skbedit queue_mapping 1
>>>>
>>>> tc filter add dev tun0 parent 1: protocol ip prio 1 u32 match ip dst
>>>> 172.16.10.10 action skbedit queue_mapping 2
>>> If using an IFF_MULTI_QUEUE tun device, packets are automatically
>>> load balanced across the multiple queues, in tun_select_queue.
>>>
>>> If you want more explicit queue selection than by rxhash, tun
>>> supports TUNSETSTEERINGEBPF.
>> I know this eBPF thing. But I am newbie to eBPF as well I didn't figure
>> out how to config eBPF dynamically.
> Lack of experience with an existing interface is insufficient reason
> to introduce another interface, of course.

tc(8) is an old interface, but it doesn't have sufficient info here to
complete its work.

I think eBPF doesn't work on all platforms? JIT doesn't sound like a
good solution for embedded platforms.

Another problem is that some VPS providers don't offer a kernel new
enough to support eBPF; it is far easier to just patch an old kernel
with this.

Anyway, I will look into it, but I will still send out v2 of this
patch. I will figure out whether eBPF can solve all the problems here.

>> Besides, I think I still need to know which queue is the target in eBPF.
> See SKF_AD_QUEUE for classic BPF and __sk_buff queue_mapping for eBPF.
I will look into it. I hope I won't need the patch that keeps the queue
index unchanged.
>>>> The purpose here is taking advantage of the multiple threads. For the
>>>> the server side(gateway of the tunnel's subnet), usually a different
>>>> peer would invoked a different encryption/decryption key pair, it would
>>>> be better to handle each in its own thread. Or the application would
>>>> need to implement a dispatcher here.
>>> A thread in which context? Or do you mean queue?
>> The thread in the userspace. Each thread responds for a queue.
>>>> I am newbie to the tc(8), I verified the command above with a tun type
>>>> multiple threads demo. But I don't know how to drop the unwanted ingress
>>>> filter here, the queue 0 may be a little broken.
>>> Not opposed to exposing the queue index if there is a need. Not sure
>>> yet that there is.
>>>
>>> Also, since for an IFF_MULTI_QUEUE the queue_id is just assigned
>>> iteratively, it can also be inferred without an explicit call.
>> I don't think there would be sequence lock in creating multiple queue.
>> Unless application uses an explicitly lock itself.
>>
>> While that did makes a problem when a queue would be disabled. It would
>> swap the last queue index with that queue, leading to fetch the queue
>> index calling again, also it would request an update for the qdisc flow
>> rule.
>>
>> Could I submit a ***new*** PATCH which would peak a hole, also it
>> applies for re-enabling the queue.
>>
>>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>>> index 1d06c560c5e6..5473a0fca2e1 100644
>>> --- a/drivers/net/tun.c
>>> +++ b/drivers/net/tun.c
>>> @@ -3115,6 +3115,10 @@ static long __tun_chr_ioctl(struct file *file,
>>> unsigned int cmd,
>>>            if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
>>>                return -EPERM;
>>>            return open_related_ns(&net->ns, get_net_ns);
>>> +    } else if (cmd == TUNGETQUEUEINDEX) {
>>> +        if (tfile->detached)
>>> +            return -EINVAL;
>>> +        return put_user(tfile->queue_index, (unsigned int __user*)argp);
>>> Unless you're certain that these fields can be read without RTNL, move
>>> below rtnl_lock() statement.
>>> Would fix in v2.
>> I was trying to not hold the global lock or long period, that is why I
>> didn't made v2 yesterday.
>>
>> When I wrote this,  I saw ioctl() TUNSETQUEUE->tun_attach() above. Is
>> the rtnl_lock() scope the lighting lock here?
>>
>
Willem de Bruijn Aug. 1, 2024, 2:17 p.m. UTC | #7
Randy Li wrote:
> 
> On 2024/8/1 21:04, Willem de Bruijn wrote:
> > Randy Li wrote:
> >> On 2024/8/1 05:57, Willem de Bruijn wrote:
> >>> nits:
> >>>
> >>> - INDX->INDEX. It's correct in the code
> >>> - prefix networking patches with the target tree: PATCH net-next
> >> I see.
> >>> Randy Li wrote:
> >>>> On 2024/7/31 22:12, Willem de Bruijn wrote:
> >>>>> Randy Li wrote:
> >>>>>> We need the queue index in qdisc mapping rule. There is no way to
> >>>>>> fetch that.
> >>>>> In which command exactly?
> >>>> That is for sch_multiq, here is an example
> >>>>
> >>>> tc qdisc add dev  tun0 root handle 1: multiq
> >>>>
> >>>> tc filter add dev tun0 parent 1: protocol ip prio 1 u32 match ip dst
> >>>> 172.16.10.1 action skbedit queue_mapping 0
> >>>> tc filter add dev tun0 parent 1: protocol ip prio 1 u32 match ip dst
> >>>> 172.16.10.20 action skbedit queue_mapping 1
> >>>>
> >>>> tc filter add dev tun0 parent 1: protocol ip prio 1 u32 match ip dst
> >>>> 172.16.10.10 action skbedit queue_mapping 2
> >>> If using an IFF_MULTI_QUEUE tun device, packets are automatically
> >>> load balanced across the multiple queues, in tun_select_queue.
> >>>
> >>> If you want more explicit queue selection than by rxhash, tun
> >>> supports TUNSETSTEERINGEBPF.
> >> I know this eBPF thing. But I am newbie to eBPF as well I didn't figure
> >> out how to config eBPF dynamically.
> > Lack of experience with an existing interface is insufficient reason
> > to introduce another interface, of course.
> 
> tc(8) was old interfaces but doesn't have the sufficient info here to 
> complete its work.

tc is maintained.

> I think eBPF didn't work in all the platforms? JIT doesn't sound like a 
> good solution for embeded platform.
> 
> Some VPS providers doesn't offer new enough kernel supporting eBPF is 
> another problem here, it is far more easy that just patching an old 
> kernel with this.

We don't add duplicative features because they are easier to
cherry-pick to old kernels.

> Anyway, I would learn into it while I would still send out the v2 of 
> this patch. I would figure out whether eBPF could solve all the problem 
> here.

Most importantly, why do you need a fixed mapping of IP address to
queue? Can you explain why relying on the standard rx_hash based
mapping is not sufficient for your workload?
ayaka Aug. 1, 2024, 7:52 p.m. UTC | #8
On 2024/8/1 22:17, Willem de Bruijn wrote:
> Randy Li wrote:
>> On 2024/8/1 21:04, Willem de Bruijn wrote:
>>> Randy Li wrote:
>>>> On 2024/8/1 05:57, Willem de Bruijn wrote:
>>>>> nits:
>>>>>
>>>>> - INDX->INDEX. It's correct in the code
>>>>> - prefix networking patches with the target tree: PATCH net-next
>>>> I see.
>>>>> Randy Li wrote:
>>>>>> On 2024/7/31 22:12, Willem de Bruijn wrote:
>>>>>>> Randy Li wrote:
>>>>>>>> We need the queue index in qdisc mapping rule. There is no way to
>>>>>>>> fetch that.
>>>>>>> In which command exactly?
>>>>>> That is for sch_multiq, here is an example
>>>>>>
>>>>>> tc qdisc add dev  tun0 root handle 1: multiq
>>>>>>
>>>>>> tc filter add dev tun0 parent 1: protocol ip prio 1 u32 match ip dst
>>>>>> 172.16.10.1 action skbedit queue_mapping 0
>>>>>> tc filter add dev tun0 parent 1: protocol ip prio 1 u32 match ip dst
>>>>>> 172.16.10.20 action skbedit queue_mapping 1
>>>>>>
>>>>>> tc filter add dev tun0 parent 1: protocol ip prio 1 u32 match ip dst
>>>>>> 172.16.10.10 action skbedit queue_mapping 2
>>>>> If using an IFF_MULTI_QUEUE tun device, packets are automatically
>>>>> load balanced across the multiple queues, in tun_select_queue.
>>>>>
>>>>> If you want more explicit queue selection than by rxhash, tun
>>>>> supports TUNSETSTEERINGEBPF.
>>>> I know this eBPF thing. But I am newbie to eBPF as well I didn't figure
>>>> out how to config eBPF dynamically.
>>> Lack of experience with an existing interface is insufficient reason
>>> to introduce another interface, of course.
>> tc(8) was old interfaces but doesn't have the sufficient info here to
>> complete its work.
> tc is maintained.
>
>> I think eBPF didn't work in all the platforms? JIT doesn't sound like a
>> good solution for embeded platform.
>>
>> Some VPS providers doesn't offer new enough kernel supporting eBPF is
>> another problem here, it is far more easy that just patching an old
>> kernel with this.
> We don't add duplicative features because they are easier to
> cherry-pick to old kernels.
I was trying to say that a tc(8) or netlink solution sounds more
suitable for general deployment.
>> Anyway, I would learn into it while I would still send out the v2 of
>> this patch. I would figure out whether eBPF could solve all the problem
>> here.
> Most importantly, why do you need a fixed mapping of IP address to
> queue? Can you explain why relying on the standard rx_hash based
> mapping is not sufficient for your workload?

Server
  |
  |---- tun subnet (e.g. 172.16.10.0/24)
          |---- peer A (172.16.10.1)
          |---- peer B (172.16.10.3)
          |---- peer C (172.16.10.20)

I am not even sure rx_hash could work here; the server acts as a router
or gateway, and I don't know how to filter connections from the
external interface based on rx_hash. Besides, a VPN application doesn't
operate on the socket() itself.

I think this question is really about why I do the filtering in the
kernel rather than in userspace.

It is much easier to do the dispatch work in the kernel: I only need to
watch the established peers with the help of epoll(), and the kernel
can drop all unwanted packets. If I did the filter/dispatcher work in
userspace, it would need to copy the packet data to userspace first,
even just to decide its fate by reading a few bytes at a known offset.
I think we can avoid that cost.
Willem de Bruijn Aug. 2, 2024, 3:10 p.m. UTC | #9
Randy Li wrote:
> 
> On 2024/8/1 22:17, Willem de Bruijn wrote:
> > Randy Li wrote:
> >> On 2024/8/1 21:04, Willem de Bruijn wrote:
> >>> Randy Li wrote:
> >>>> On 2024/8/1 05:57, Willem de Bruijn wrote:
> >>>>> nits:
> >>>>>
> >>>>> - INDX->INDEX. It's correct in the code
> >>>>> - prefix networking patches with the target tree: PATCH net-next
> >>>> I see.
> >>>>> Randy Li wrote:
> >>>>>> On 2024/7/31 22:12, Willem de Bruijn wrote:
> >>>>>>> Randy Li wrote:
> >>>>>>>> We need the queue index in qdisc mapping rule. There is no way to
> >>>>>>>> fetch that.
> >>>>>>> In which command exactly?
> >>>>>> That is for sch_multiq, here is an example
> >>>>>>
> >>>>>> tc qdisc add dev  tun0 root handle 1: multiq
> >>>>>>
> >>>>>> tc filter add dev tun0 parent 1: protocol ip prio 1 u32 match ip dst
> >>>>>> 172.16.10.1 action skbedit queue_mapping 0
> >>>>>> tc filter add dev tun0 parent 1: protocol ip prio 1 u32 match ip dst
> >>>>>> 172.16.10.20 action skbedit queue_mapping 1
> >>>>>>
> >>>>>> tc filter add dev tun0 parent 1: protocol ip prio 1 u32 match ip dst
> >>>>>> 172.16.10.10 action skbedit queue_mapping 2
> >>>>> If using an IFF_MULTI_QUEUE tun device, packets are automatically
> >>>>> load balanced across the multiple queues, in tun_select_queue.
> >>>>>
> >>>>> If you want more explicit queue selection than by rxhash, tun
> >>>>> supports TUNSETSTEERINGEBPF.
> >>>> I know this eBPF thing. But I am newbie to eBPF as well I didn't figure
> >>>> out how to config eBPF dynamically.
> >>> Lack of experience with an existing interface is insufficient reason
> >>> to introduce another interface, of course.
> >> tc(8) was old interfaces but doesn't have the sufficient info here to
> >> complete its work.
> > tc is maintained.
> >
> >> I think eBPF didn't work in all the platforms? JIT doesn't sound like a
> >> good solution for embeded platform.
> >>
> >> Some VPS providers doesn't offer new enough kernel supporting eBPF is
> >> another problem here, it is far more easy that just patching an old
> >> kernel with this.
> > We don't add duplicative features because they are easier to
> > cherry-pick to old kernels.
> I was trying to say the tc(8) or netlink solution sound more suitable 
> for general deploying.
> >> Anyway, I would learn into it while I would still send out the v2 of
> >> this patch. I would figure out whether eBPF could solve all the problem
> >> here.
> > Most importantly, why do you need a fixed mapping of IP address to
> > queue? Can you explain why relying on the standard rx_hash based
> > mapping is not sufficient for your workload?
> 
> Server
> 
>    |
> 
>    |------ tun subnet (e.x. 172.16.10.0/24) ------- peer A (172.16.10.1)
> 
> |------ peer B (172.16.10.3)
> 
> |------  peer C (172.16.10.20)
> 
> I am not even sure the rx_hash could work here, the server here acts as 
> a router or gateway, I don't know how to filter the connection from the 
> external interface based on rx_hash. Besides, VPN application didn't 
> operate on the socket() itself.
> 
> I think this question is about why I do the filter in the kernel not the 
> userspace?
> 
> It would be much more easy to the dispatch work in kernel, I only need 
> to watch the established peer with the help of epoll(). Kernel could 
> drop all the unwanted packets. Besides, if I do the filter/dispatcher 
> work in the userspace, it would need to copy the packet's data to the 
> userspace first, even decide its fate by reading a few bytes from its 
> beginning offset. I think we can avoid such a cost.

A custom mapping function is exactly the purpose of TUNSETSTEERINGEBPF.

Please take a look at that. It's a lot more elegant than going through
userspace and then inserting individual tc skbedit filters.
ayaka Aug. 7, 2024, 6:54 p.m. UTC | #10
Hello Willem

On 2024/8/2 23:10, Willem de Bruijn wrote:
> Randy Li wrote:
>> On 2024/8/1 22:17, Willem de Bruijn wrote:
>>> Randy Li wrote:
>>>> On 2024/8/1 21:04, Willem de Bruijn wrote:
>>>>> Randy Li wrote:
>>>>>> On 2024/8/1 05:57, Willem de Bruijn wrote:
>>>>>>> nits:
>>>>>>>
>>>>>>> - INDX->INDEX. It's correct in the code
>>>>>>> - prefix networking patches with the target tree: PATCH net-next
>>>>>> I see.
>>>>>>> Randy Li wrote:
>>>>>>>> On 2024/7/31 22:12, Willem de Bruijn wrote:
>>>>>>>>> Randy Li wrote:
>>>>>>>>>> We need the queue index in qdisc mapping rule. There is no way to
>>>>>>>>>> fetch that.
>>>>>>>>> In which command exactly?
>>>>>>>> That is for sch_multiq, here is an example
>>>>>>>>
>>>>>>>> tc qdisc add dev  tun0 root handle 1: multiq
>>>>>>>>
>>>>>>>> tc filter add dev tun0 parent 1: protocol ip prio 1 u32 match ip dst
>>>>>>>> 172.16.10.1 action skbedit queue_mapping 0
>>>>>>>> tc filter add dev tun0 parent 1: protocol ip prio 1 u32 match ip dst
>>>>>>>> 172.16.10.20 action skbedit queue_mapping 1
>>>>>>>>
>>>>>>>> tc filter add dev tun0 parent 1: protocol ip prio 1 u32 match ip dst
>>>>>>>> 172.16.10.10 action skbedit queue_mapping 2
>>>>>>> If using an IFF_MULTI_QUEUE tun device, packets are automatically
>>>>>>> load balanced across the multiple queues, in tun_select_queue.
>>>>>>>
>>>>>>> If you want more explicit queue selection than by rxhash, tun
>>>>>>> supports TUNSETSTEERINGEBPF.
>>>>>> I know this eBPF thing. But I am newbie to eBPF as well I didn't figure
>>>>>> out how to config eBPF dynamically.
>>>>> Lack of experience with an existing interface is insufficient reason
>>>>> to introduce another interface, of course.
>>>> tc(8) was old interfaces but doesn't have the sufficient info here to
>>>> complete its work.
>>> tc is maintained.
>>>
>>>> I think eBPF didn't work in all the platforms? JIT doesn't sound like a
>>>> good solution for embeded platform.
>>>>
>>>> Some VPS providers doesn't offer new enough kernel supporting eBPF is
>>>> another problem here, it is far more easy that just patching an old
>>>> kernel with this.
>>> We don't add duplicative features because they are easier to
>>> cherry-pick to old kernels.
>> I was trying to say the tc(8) or netlink solution sound more suitable
>> for general deploying.
>>>> Anyway, I would learn into it while I would still send out the v2 of
>>>> this patch. I would figure out whether eBPF could solve all the problem
>>>> here.
>>> Most importantly, why do you need a fixed mapping of IP address to
>>> queue? Can you explain why relying on the standard rx_hash based
>>> mapping is not sufficient for your workload?
>> Server
>>
>>     |
>>
>>     |------ tun subnet (e.x. 172.16.10.0/24) ------- peer A (172.16.10.1)
>>
>> |------ peer B (172.16.10.3)
>>
>> |------  peer C (172.16.10.20)
>>
>> I am not even sure the rx_hash could work here, the server here acts as
>> a router or gateway, I don't know how to filter the connection from the
>> external interface based on rx_hash. Besides, VPN application didn't
>> operate on the socket() itself.
>>
>> I think this question is about why I do the filter in the kernel not the
>> userspace?
>>
>> It would be much more easy to the dispatch work in kernel, I only need
>> to watch the established peer with the help of epoll(). Kernel could
>> drop all the unwanted packets. Besides, if I do the filter/dispatcher
>> work in the userspace, it would need to copy the packet's data to the
>> userspace first, even decide its fate by reading a few bytes from its
>> beginning offset. I think we can avoid such a cost.
> A custom mapping function is exactly the purpose of TUNSETSTEERINGEBPF.
>
> Please take a look at that. It's a lot more elegant than going through
> userspace and then inserting individual tc skbedit filters.

I checked how this socket filter works; I think we still need this
series of patches.

If I am right, this eBPF program doesn't work like a regular socket
filter: its return value here is the target queue index, not the number
of bytes we want to keep from the sk_buff parameter's buffer.

Besides, according to
https://ebpf-docs.dylanreimerink.nl/linux/program-type/BPF_PROG_TYPE_SOCKET_FILTER/

I think the eBPF program here can modify neither the queue_mapping
field nor the hash field.

> See SKF_AD_QUEUE for classic BPF and __sk_buff queue_mapping for eBPF.

Is it a map of type BPF_MAP_TYPE_QUEUE?

Besides, I think the eBPF program set via TUNSETSTEERINGEBPF would NOT
take queue_mapping into account.

If I want to drop packets for unwanted destinations, I think
TUNSETFILTEREBPF is what I need?

That would mean looking up the same mapping table twice; is there a
better way, for the sake of the CPU cache?
Willem de Bruijn Aug. 8, 2024, 2:10 a.m. UTC | #11
> >> I think this question is about why I do the filter in the kernel not the
> >> userspace?
> >>
> >> It would be much more easy to the dispatch work in kernel, I only need
> >> to watch the established peer with the help of epoll(). Kernel could
> >> drop all the unwanted packets. Besides, if I do the filter/dispatcher
> >> work in the userspace, it would need to copy the packet's data to the
> >> userspace first, even decide its fate by reading a few bytes from its
> >> beginning offset. I think we can avoid such a cost.
> > A custom mapping function is exactly the purpose of TUNSETSTEERINGEBPF.
> >
> > Please take a look at that. It's a lot more elegant than going through
> > userspace and then inserting individual tc skbedit filters.
>
> I checked how this socket filter works, I think we still need this
> serial of patch.
>
> If I was right, this eBPF doesn't work like a regular socket filter. The
> eBPF's return value here means the target queue index not the size of
> the data that we want to keep from the sk_buf parameter's buf.

TUNSETSTEERINGEBPF is a queue selection mechanism for multi-queue tun devices.

It replaces the need to set skb->queue_mapping.

See also

"
commit 96f84061620c6325a2ca9a9a05b410e6461d03c3
Author: Jason Wang <jasowang@redhat.com>
Date:   Mon Dec 4 17:31:23 2017 +0800

    tun: add eBPF based queue selection method
"

> Besides, according to
> https://ebpf-docs.dylanreimerink.nl/linux/program-type/BPF_PROG_TYPE_SOCKET_FILTER/
>
> I think the eBPF here can modify neither queue_mapping field nor hash
> field here.
>
> > See SKF_AD_QUEUE for classic BPF and __sk_buff queue_mapping for eBPF.
>
> Is it a map type BPF_MAP_TYPE_QUEUE?
>
> Besides, I think the eBPF in TUNSETSTEERINGEBPF would NOT take
> queue_mapping.

It obviates the need.

It sounds like you want to both filter and steer to a specific queue. Why?

In that case, a tc egress tc_bpf program may be able to do both.
Again, by writing to __sk_buff queue_mapping. Instead of u32 +
skbedit.

See also

"
commit 74e31ca850c1cddeca03503171dd145b6ce293b6
Author: Jesper Dangaard Brouer <brouer@redhat.com>
Date:   Tue Feb 19 19:53:02 2019 +0100

    bpf: add skb->queue_mapping write access from tc clsact
"

But I suppose you could prefer u32 + skbedit.
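
(A rough sketch of the clsact alternative described above; it assumes a
layer-3 tun device so the packet starts at the IP header, and the
address and queue number mirror the earlier tc example, so they are
assumptions.)

#include <linux/bpf.h>
#include <linux/ip.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

/* tc egress (clsact) program: steer one destination IP to queue 0 by
 * writing __sk_buff->queue_mapping, instead of u32 + skbedit. */
SEC("tc")
int steer(struct __sk_buff *skb)
{
        void *data = (void *)(long)skb->data;
        void *data_end = (void *)(long)skb->data_end;
        struct iphdr *iph = data;

        if ((void *)(iph + 1) > data_end)
                return TC_ACT_OK;

        if (iph->daddr == bpf_htonl(0xac100a01))    /* 172.16.10.1 */
                skb->queue_mapping = 0;

        return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";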

Either way, the pertinent point is that you want to map some flow
match to a specific queue id.

This is straightforward if all queues are opened and none are closed.
But it is not if queues can get detached and attached dynamically.
Which I guess you encounter in practice?

I'm actually not sure how the current `tfile->queue_index =
tun->numqueues;` works in that case. As __tun_detach will decrement
`--tun->numqueues;`. So multiple tfiles could end up with the same
queue_index. Unless dynamic detach + attach is not possible. But it
seems it is. Jason, if you're following, do you know this?

> If I want to drop packets for unwanted destination, I think
> TUNSETFILTEREBPF is what I need?
>
> That would lead to lookup the same mapping table twice, is there a
> better way for the CPU cache?

I agree that two programs should not be needed.
Jason Wang Aug. 8, 2024, 2:49 a.m. UTC | #12
On Thu, Aug 8, 2024 at 10:11 AM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> > >> I think this question is about why I do the filter in the kernel not the
> > >> userspace?
> > >>
> > >> It would be much more easy to the dispatch work in kernel, I only need
> > >> to watch the established peer with the help of epoll(). Kernel could
> > >> drop all the unwanted packets. Besides, if I do the filter/dispatcher
> > >> work in the userspace, it would need to copy the packet's data to the
> > >> userspace first, even decide its fate by reading a few bytes from its
> > >> beginning offset. I think we can avoid such a cost.
> > > A custom mapping function is exactly the purpose of TUNSETSTEERINGEBPF.
> > >
> > > Please take a look at that. It's a lot more elegant than going through
> > > userspace and then inserting individual tc skbedit filters.
> >
> > I checked how this socket filter works, I think we still need this
> > serial of patch.
> >
> > If I was right, this eBPF doesn't work like a regular socket filter. The
> > eBPF's return value here means the target queue index not the size of
> > the data that we want to keep from the sk_buf parameter's buf.
>
> TUNSETSTEERINGEBPF is a queue selection mechanism for multi-queue tun devices.
>
> It replaces the need to set skb->queue_mapping.
>
> See also
>
> "
> commit 96f84061620c6325a2ca9a9a05b410e6461d03c3
> Author: Jason Wang <jasowang@redhat.com>
> Date:   Mon Dec 4 17:31:23 2017 +0800
>
>     tun: add eBPF based queue selection method
> "

Exactly.

>
> > Besides, according to
> > https://ebpf-docs.dylanreimerink.nl/linux/program-type/BPF_PROG_TYPE_SOCKET_FILTER/
> >
> > I think the eBPF here can modify neither queue_mapping field nor hash
> > field here.
> >
> > > See SKF_AD_QUEUE for classic BPF and __sk_buff queue_mapping for eBPF.
> >
> > Is it a map type BPF_MAP_TYPE_QUEUE?
> >
> > Besides, I think the eBPF in TUNSETSTEERINGEBPF would NOT take
> > queue_mapping.
>
> It obviates the need.
>
> It sounds like you want to both filter and steer to a specific queue. Why?
>
> In that case, a tc egress tc_bpf program may be able to do both.
> Again, by writing to __sk_buff queue_mapping. Instead of u32 +
> skbedit.
>
> See also
>
> "
> commit 74e31ca850c1cddeca03503171dd145b6ce293b6
> Author: Jesper Dangaard Brouer <brouer@redhat.com>
> Date:   Tue Feb 19 19:53:02 2019 +0100
>
>     bpf: add skb->queue_mapping write access from tc clsact
> "
>
> But I suppose you could prefer u32 + skbedit.
>
> Either way, the pertinent point is that you want to map some flow
> match to a specific queue id.
>
> This is straightforward if all queues are opened and none are closed.
> But it is not if queues can get detached and attached dynamically.
> Which I guess you encounter in practice?
>
> I'm actually not sure how the current `tfile->queue_index =
> tun->numqueues;` works in that case. As __tun_detach will do decrement
> `--tun->numqueues;`. So multiple tfiles could end up with the same
> queue_index. Unless dynamic detach + attach is not possible.

It is expected to work, otherwise there should be a bug.

> But it
> seems it is. Jason, if you're following, do you know this?

__tun_detach() will move the last tfile in the tfiles[] array to the
current tfile->queue_index, and modify its queue_index:

        rcu_assign_pointer(tun->tfiles[index],
                                   tun->tfiles[tun->numqueues - 1]);
        ntfile = rtnl_dereference(tun->tfiles[index]);
        ntfile->queue_index = index;
        rcu_assign_pointer(tun->tfiles[tun->numqueues - 1],
                                   NULL);

        --tun->numqueues;

tun_attach() will move the detached tfile to the end of the tfiles[]
array and enable it:


        tfile->queue_index = tun->numqueues;
        ....
        rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
        tun->numqueues++;

>
> > If I want to drop packets for unwanted destination, I think
> > TUNSETFILTEREBPF is what I need?
> >
> > That would lead to lookup the same mapping table twice, is there a
> > better way for the CPU cache?
>
> I agree that two programs should not be needed.
>

Thanks
Willem de Bruijn Aug. 8, 2024, 3:11 a.m. UTC | #13
> > In that case, a tc egress tc_bpf program may be able to do both.
> > Again, by writing to __sk_buff queue_mapping. Instead of u32 +
> > skbedit.
> >
> > See also
> >
> > "
> > commit 74e31ca850c1cddeca03503171dd145b6ce293b6
> > Author: Jesper Dangaard Brouer <brouer@redhat.com>
> > Date:   Tue Feb 19 19:53:02 2019 +0100
> >
> >     bpf: add skb->queue_mapping write access from tc clsact
> > "
> >
> > But I suppose you could prefer u32 + skbedit.
> >
> > Either way, the pertinent point is that you want to map some flow
> > match to a specific queue id.
> >
> > This is straightforward if all queues are opened and none are closed.
> > But it is not if queues can get detached and attached dynamically.
> > Which I guess you encounter in practice?
> >
> > I'm actually not sure how the current `tfile->queue_index =
> > tun->numqueues;` works in that case. As __tun_detach will do decrement
> > `--tun->numqueues;`. So multiple tfiles could end up with the same
> > queue_index. Unless dynamic detach + attach is not possible.
>
> It is expected to work, otherwise there should be a bug.
>
> > But it
> > seems it is. Jason, if you're following, do you know this?
>
> __tun_detach() will move the last tfile in the tfiles[] array to the
> current tfile->queue_index, and modify its queue_index:
>
>         rcu_assign_pointer(tun->tfiles[index],
>                                    tun->tfiles[tun->numqueues - 1]);
>         ntfile = rtnl_dereference(tun->tfiles[index]);
>         ntfile->queue_index = index;
>         rcu_assign_pointer(tun->tfiles[tun->numqueues - 1],
>                                    NULL);
>
>         --tun->numqueues;
>
> tun_attach() will move the detached tfile to the end of the tfiles[]
> array and enable it:
>
>
>         tfile->queue_index = tun->numqueues;
>         ....
>         rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
>         tun->numqueues++;
>

Ah right. Thanks. I had forgotten about that.

So I guess an application that owns all the queues could keep track of
the queue-id to FD mapping. But it is not trivial, nor defined ABI
behavior.

Querying the queue_id as in the proposed patch might not solve the
challenge, though. Since an FD's queue-id may change simply because
another queue was detached. So this would have to be queried on each
detach.

I suppose one underlying question is how important is the mapping of
flows to specific queue-id's? Is it a problem if the destination queue
for a flow changes mid-stream?
Jason Wang Aug. 8, 2024, 3:36 a.m. UTC | #14
On Thu, Aug 8, 2024 at 11:12 AM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> > > In that case, a tc egress tc_bpf program may be able to do both.
> > > Again, by writing to __sk_buff queue_mapping. Instead of u32 +
> > > skbedit.
> > >
> > > See also
> > >
> > > "
> > > commit 74e31ca850c1cddeca03503171dd145b6ce293b6
> > > Author: Jesper Dangaard Brouer <brouer@redhat.com>
> > > Date:   Tue Feb 19 19:53:02 2019 +0100
> > >
> > >     bpf: add skb->queue_mapping write access from tc clsact
> > > "
> > >
> > > But I suppose you could prefer u32 + skbedit.
> > >
> > > Either way, the pertinent point is that you want to map some flow
> > > match to a specific queue id.
> > >
> > > This is straightforward if all queues are opened and none are closed.
> > > But it is not if queues can get detached and attached dynamically.
> > > Which I guess you encounter in practice?
> > >
> > > I'm actually not sure how the current `tfile->queue_index =
> > > tun->numqueues;` works in that case. As __tun_detach will do decrement
> > > `--tun->numqueues;`. So multiple tfiles could end up with the same
> > > queue_index. Unless dynamic detach + attach is not possible.
> >
> > It is expected to work, otherwise there should be a bug.
> >
> > > But it
> > > seems it is. Jason, if you're following, do you know this?
> >
> > __tun_detach() will move the last tfile in the tfiles[] array to the
> > current tfile->queue_index, and modify its queue_index:
> >
> >         rcu_assign_pointer(tun->tfiles[index],
> >                                    tun->tfiles[tun->numqueues - 1]);
> >         ntfile = rtnl_dereference(tun->tfiles[index]);
> >         ntfile->queue_index = index;
> >         rcu_assign_pointer(tun->tfiles[tun->numqueues - 1],
> >                                    NULL);
> >
> >         --tun->numqueues;
> >
> > tun_attach() will move the detached tfile to the end of the tfiles[]
> > array and enable it:
> >
> >
> >         tfile->queue_index = tun->numqueues;
> >         ....
> >         rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
> >         tun->numqueues++;
> >
>
> Ah right. Thanks. I had forgotten about that.
>
> So I guess an application that owns all the queues could keep track of
> the queue-id to FD mapping. But it is not trivial, nor defined ABI
> behavior.

Yes, and this is because tun allows an arbitrary tfile to be detached
while the networking core assumes the queues are contiguous. If an
application wants a stable queue id it needs to "understand" the kernel
behaviour somehow (unfortunately), see below.

>
> Querying the queue_id as in the proposed patch might not solve the
> challenge, though. Since an FD's queue-id may change simply because
> another queue was detached. So this would have to be queried on each
> detach.

Let's say we have 4 tunfd.

tunfd 0 -> queue 0
...
tunfd 3 -> queue 3

To have a stable queue id it needs to detach from the last tunfd and
attach in the reverse order.

E.g. if it wants to detach 2 tfiles, it needs to:

detach tunfd3, then detach tunfd2

And if it wants to attach the 2 tfiles back, it needs to:

attach tunfd2, then attach tunfd3

This is how QEMU works in the case of multiqueue tuntap and RSS. This
way QEMU doesn't need to deduce whether the mapping between queue_index
and tfile has changed. But it means that if the kernel behavior changes
in the future, it may break a lot of userspace.
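
(For illustration, a sketch of that ordering from userspace using
TUNSETQUEUE with IFF_DETACH_QUEUE/IFF_ATTACH_QUEUE; the tunfd variables
are placeholders.)

#include <string.h>
#include <sys/ioctl.h>
#include <net/if.h>
#include <linux/if_tun.h>

/* Detach or re-attach one queue fd of a multi-queue tun device. */
static int set_queue(int fd, int attach)
{
        struct ifreq ifr;

        memset(&ifr, 0, sizeof(ifr));
        ifr.ifr_flags = attach ? IFF_ATTACH_QUEUE : IFF_DETACH_QUEUE;
        return ioctl(fd, TUNSETQUEUE, &ifr);
}

/* Detach the last two queues:  set_queue(tunfd3, 0); set_queue(tunfd2, 0);
 * Re-attach in reverse order:  set_queue(tunfd2, 1); set_queue(tunfd3, 1); */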

>
> I suppose one underlying question is how important is the mapping of
> flows to specific queue-id's?

For example, to implement steering offload to the kernel, e.g. RSS,
userspace needs to know the mapping between queue id and tfile.

> Is it a problem if the destination queue
> for a flow changes mid-stream?
>

I think this is a good question. It's about whether or not we can make
the current kernel behaviour part of the ABI.

If yes, there's no need for the explicit get-index ioctl, assuming the
tunfd is created by the application itself, as the queue_index can be
deduced from the application's own behaviour. But that might not cover
everything: if the tuntap fds were passed through a unix domain socket,
the application that receives those file descriptors might need to know
the queue_index mappings.

If no, allowing such a query might be useful. It would give the user a
chance to learn the mapping, as you pointed out. But this patch needs
some tweaks, as the queue_index is meaningless when the tfile is
detached.

Thanks
ayaka Aug. 8, 2024, 4:16 a.m. UTC | #15

> On Aug 8, 2024, at 11:13 AM, Willem de Bruijn <willemdebruijn.kernel@gmail.com> wrote:
> 
> 
>> 
>>> In that case, a tc egress tc_bpf program may be able to do both.
>>> Again, by writing to __sk_buff queue_mapping. Instead of u32 +
>>> skbedit.
>>> 
>>> See also
>>> 
>>> "
>>> commit 74e31ca850c1cddeca03503171dd145b6ce293b6
>>> Author: Jesper Dangaard Brouer <brouer@redhat.com>
>>> Date:   Tue Feb 19 19:53:02 2019 +0100
>>> 
>>>    bpf: add skb->queue_mapping write access from tc clsact
>>> "
>>> 
>>> But I suppose you could prefer u32 + skbedit.
>>> 
>>> Either way, the pertinent point is that you want to map some flow
>>> match to a specific queue id.
>>> 
>>> This is straightforward if all queues are opened and none are closed.
>>> But it is not if queues can get detached and attached dynamically.
>>> Which I guess you encounter in practice?
>>> 
>>> I'm actually not sure how the current `tfile->queue_index =
>>> tun->numqueues;` works in that case. As __tun_detach will do decrement
>>> `--tun->numqueues;`. So multiple tfiles could end up with the same
>>> queue_index. Unless dynamic detach + attach is not possible.
>> 
>> It is expected to work, otherwise there should be a bug.
>> 
>>> But it
>>> seems it is. Jason, if you're following, do you know this?
>> 
>> __tun_detach() will move the last tfile in the tfiles[] array to the
>> current tfile->queue_index, and modify its queue_index:
>> 
>>        rcu_assign_pointer(tun->tfiles[index],
>>                                   tun->tfiles[tun->numqueues - 1]);
>>        ntfile = rtnl_dereference(tun->tfiles[index]);
>>        ntfile->queue_index = index;
>>        rcu_assign_pointer(tun->tfiles[tun->numqueues - 1],
>>                                   NULL);
>> 
>>        --tun->numqueues;
>> 
>> tun_attach() will move the detached tfile to the end of the tfiles[]
>> array and enable it:
>> 
>> 
>>        tfile->queue_index = tun->numqueues;
>>        ....
>>        rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
>>        tun->numqueues++;
>> 
> 
> Ah right. Thanks. I had forgotten about that.
> 
> So I guess an application that owns all the queues could keep track of
> the queue-id to FD mapping. But it is not trivial, nor defined ABI
> behavior.
> 
> Querying the queue_id as in the proposed patch might not solve the
> challenge, though. Since an FD's queue-id may change simply because
Yes, when I asked about the eBPF approach, I thought I didn't need the queue id in the eBPF program. It turns out that was a misunderstanding.
Do we all agree that no matter which filter or steering method we use here, we need a method to query the queue index assigned to an fd?
> another queue was detached. So this would have to be queried on each
> detach.
> 
Thank you, Jason. That is why I mentioned I may need to submit another patch to bind the queue index to a flow.

I think this is a good chance to discuss it.
I think, by design, the number of queues is fixed on hardware devices? The same goes for remote-processor type wireless devices (I think those are the modem devices).
Hashing every packet could consume a lot of CPU time, and it is not necessary to track every packet.
Could I add another property to struct tun_file and have the steering program return the wanted value? Then it is the application's job to keep this new property unique.
> I suppose one underlying question is how important is the mapping of
> flows to specific queue-id's? Is it a problem if the destination queue
> for a flow changes mid-stream?
Yes, it matters; it is why I want to use this feature. Of all the open source VPNs I know, none enables this multiqueue feature or creates more than one queue for it.
And virtual machines use tap most of the time (they want to emulate a real NIC).
So basically this multiple queue feature is kind of useless for VPN usage.
If the filter can't work atomically here, it would lead to unwanted packets being transmitted to the wrong thread.
Willem de Bruijn Aug. 8, 2024, 6:48 p.m. UTC | #16
> > So I guess an application that owns all the queues could keep track of
> > the queue-id to FD mapping. But it is not trivial, nor defined ABI
> > behavior.
> >
> > Querying the queue_id as in the proposed patch might not solve the
> > challenge, though. Since an FD's queue-id may change simply because
> Yes, when I asked about those eBPF thing, I thought I don’t need the queue id in those ebpf. It turns out a misunderstanding.
> Do we all agree that no matter which filter or steering method we used here, we need a method to query queue index assigned with a fd?

That depends how you intend to use it. And in particular how to work
around the issue of IDs not being stable. Without solving that, it
seems like an impractical and even dangerous -because easy to misuse-
interface.

> > another queue was detached. So this would have to be queried on each
> > detach.
> >
> Thank you Jason. That is why I mentioned I may need to submit another patch to bind the queue index with a flow.
>
> I think here is a good chance to discuss about this.
> I think from the design, the number of queue was a fixed number in those hardware devices? Also for those remote processor type wireless device(I think those are the modem devices).
> The way invoked with hash in every packet could consume lots of CPU times. And it is not necessary to track every packet.

rxhash based steering is common. There needs to be a strong(er) reason
to implement an alternative.

> Could I add another property in struct tun_file and steering program return wanted value. Then it is application’s work to keep this new property unique.

I don't entirely follow this suggestion?

> > I suppose one underlying question is how important is the mapping of
> > flows to specific queue-id's? Is it a problem if the destination queue
> > for a flow changes mid-stream?
> Yes, it matters. Or why I want to use this feature. From all the open source VPN I know, neither enabled this multiqueu feature nor create more than one queue for it.
> And virtual machine would use the tap at the most time(they want to emulate a real nic).
> So basically this multiple queue feature was kind of useless for the VPN usage.
> If the filter can’t work atomically here, which would lead to unwanted packets transmitted to the wrong thread.

What exactly is the issue if a flow migrates from one queue to
another? There may be some OOO arrival. But these configuration
changes are rare events.
ayaka Aug. 9, 2024, 4:45 a.m. UTC | #17

> On Aug 9, 2024, at 2:49 AM, Willem de Bruijn <willemdebruijn.kernel@gmail.com> wrote:
> 
> 
>> 
>>> So I guess an application that owns all the queues could keep track of
>>> the queue-id to FD mapping. But it is not trivial, nor defined ABI
>>> behavior.
>>> 
>>> Querying the queue_id as in the proposed patch might not solve the
>>> challenge, though. Since an FD's queue-id may change simply because
>> Yes, when I asked about those eBPF thing, I thought I don’t need the queue id in those ebpf. It turns out a misunderstanding.
>> Do we all agree that no matter which filter or steering method we used here, we need a method to query queue index assigned with a fd?
> 
> That depends how you intend to use it. And in particular how to work
> around the issue of IDs not being stable. Without solving that, it
> seems like an impractical and even dangerous -because easy to misuse-
> interface.
> 
First of all, I need to figure out when the steering action happens.
When I use a multiq qdisc with skbedit, does it happen after net_device_ops->ndo_select_queue()?
If it does, that would still generate unused rxhash/txhash computation and flow tracking, which sounds like a lot of overhead.
Is it the same path for the tc-bpf solution?

I will come back to my concern about the IDs not being stable when answering your last question below.
>>> another queue was detached. So this would have to be queried on each
>>> detach.
>>> 
>> Thank you Jason. That is why I mentioned I may need to submit another patch to bind the queue index with a flow.
>> 
>> I think here is a good chance to discuss about this.
>> I think from the design, the number of queue was a fixed number in those hardware devices? Also for those remote processor type wireless device(I think those are the modem devices).
>> The way invoked with hash in every packet could consume lots of CPU times. And it is not necessary to track every packet.
> 
> rxhash based steering is common. There needs to be a strong(er) reason
> to implement an alternative.
> 
I have a few questions about this hash-based steering, which does not involve invoking any further filter:
1. If a flow happens before it is written to the tun, how do we filter it?
2. Does such a hash operation happen for every packet passing through?
3. Is rxhash based on the flow tracking record in the tun driver?
That CPU overhead may cancel out the benefit of multiple queues and in-kernel filters.
Also, the flow tracking table is limited to 4096 or 1024 entries; for an IPv4 /24 subnet, if everyone opened 16 websites, would we run out of entries before some of them expire?

I am trying to find a modern way to implement a VPN on Linux, given how many features have been introduced to it. So far, I have not found a proper way to gain any advantage here over other platforms.
>> Could I add another property in struct tun_file and steering program return wanted value. Then it is application’s work to keep this new property unique.
> 
> I don't entirely follow this suggestion?
> 
>>> I suppose one underlying question is how important is the mapping of
>>> flows to specific queue-id's? Is it a problem if the destination queue
>>> for a flow changes mid-stream?
>> Yes, it matters. Or why I want to use this feature. From all the open source VPN I know, neither enabled this multiqueu feature nor create more than one queue for it.
>> And virtual machine would use the tap at the most time(they want to emulate a real nic).
>> So basically this multiple queue feature was kind of useless for the VPN usage.
>> If the filter can’t work atomically here, which would lead to unwanted packets transmitted to the wrong thread.
> 
> What exactly is the issue if a flow migrates from one queue to
> another? There may be some OOO arrival. But these configuration
> changes are rare events.
I don't know what OOO means here.
If a flow migrates from its intended queue to another, that defeats the purpose of using multiple queues here.
A queue represents a VPN peer here, so a migration would leak one peer's data to another.
Also, that data could be just garbage fragments, costing bandwidth to send to a peer that cannot handle them.
Willem de Bruijn Aug. 9, 2024, 2:55 p.m. UTC | #18
ayaka wrote:
> 
> Sent from my iPad

Try to avoid ^^^
 
> > On Aug 9, 2024, at 2:49 AM, Willem de Bruijn <willemdebruijn.kernel@gmail.com> wrote:
> > 
> > 
> >> 
> >>> So I guess an application that owns all the queues could keep track of
> >>> the queue-id to FD mapping. But it is not trivial, nor defined ABI
> >>> behavior.
> >>> 
> >>> Querying the queue_id as in the proposed patch might not solve the
> >>> challenge, though. Since an FD's queue-id may change simply because
> >> Yes, when I asked about those eBPF thing, I thought I don’t need the queue id in those ebpf. It turns out a misunderstanding.
> >> Do we all agree that no matter which filter or steering method we used here, we need a method to query queue index assigned with a fd?
> > 
> > That depends how you intend to use it. And in particular how to work
> > around the issue of IDs not being stable. Without solving that, it
> > seems like an impractical and even dangerous -because easy to misuse-
> > interface.
> > 
> First of all, I need to figure out when the steering action happens.
> When I use multiq qdisc with skbedit, does it happens after the net_device_ops->ndo_select_queue() ?
> If it did, that will still generate unused rxhash and txhash and flow tracking. It sounds a big overhead.
> Is it the same path for tc-bpf solution ?

TC egress is called early in __dev_queue_xmit, the main entry point for
transmission, in sch_handle_egress.

A few lines below netdev_core_pick_tx selects the txq by setting
skb->queue_mapping. Either through netdev_pick_tx or through a device
specific callback ndo_select_queue if it exists.

For tun, tun_select_queue implements that callback. If
TUNSETSTEERINGEBPF is configured, then the BPF program is called. Else
it uses its own rx_hash based approach in tun_automq_select_queue.

There is a special case in between. If TC egress ran skbedit, then
this sets current->net_xmit.skip_txqueue. Which will read the txq
from the skb->queue_mapping set by skbedit, and skip netdev_pick_tx.

That seems more roundabout than I had expected. I thought the code
would just check whether skb->queue_mapping is set and if so skip
netdev_pick_tx.
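
Put differently, the selection order boils down to something like the
following self-contained model (simplified stand-ins, not the kernel
code):

#include <stdbool.h>
#include <stdio.h>

struct model_skb {
	unsigned int queue_mapping;	/* what skbedit would set */
	bool skip_txqueue;		/* mirrors net_xmit.skip_txqueue */
};

/* stand-in for netdev_pick_tx(): XPS / generic hash based */
static unsigned int generic_pick_tx(unsigned int numqueues)
{
	return 0;
}

/* stand-in for ndo_select_queue, e.g. tun_select_queue():
 * steering BPF program if set, else the rx-hash based automq */
static unsigned int driver_select_queue(unsigned int numqueues)
{
	return 0;
}

static unsigned int pick_txq(const struct model_skb *skb,
			     unsigned int numqueues, bool has_ndo)
{
	if (skb->skip_txqueue)		/* skbedit already chose a txq */
		return skb->queue_mapping;
	if (has_ndo)
		return driver_select_queue(numqueues);
	return generic_pick_tx(numqueues);
}

int main(void)
{
	struct model_skb skb = { .queue_mapping = 3, .skip_txqueue = true };

	printf("txq = %u\n", pick_txq(&skb, 8, true));
	return 0;
}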

I wonder if this now means that setting queue_mapping with any other
TC action than skbedit now gets ignored. Importantly, cls_bpf or
act_bpf.
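
As an aside, a steering program of the kind TUNSETSTEERINGEBPF accepts
can be quite small. A sketch, assuming the tun semantics of a
socket-filter type program whose return value is reduced modulo the
number of queues; the key used here is only a placeholder:

/* tun_steer.bpf.c - sketch; build: clang -O2 -target bpf -c tun_steer.bpf.c */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("socket")
int tun_steer(struct __sk_buff *skb)
{
	/* Return the desired queue index; tun applies "% numqueues".
	 * skb->protocol is a trivial placeholder for a real flow key
	 * (e.g. a hash of the parsed 4-tuple). */
	return skb->protocol & 0x1;
}

char _license[] SEC("license") = "GPL";

If I read tun_set_ebpf() correctly, it is attached by loading the
program and passing a pointer to its fd:
ioctl(tun_fd, TUNSETSTEERINGEBPF, &prog_fd).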

> I would reply with my concern about violating IDs in your last question.
> >>> another queue was detached. So this would have to be queried on each
> >>> detach.
> >>> 
> >> Thank you Jason. That is why I mentioned I may need to submit another patch to bind the queue index with a flow.
> >> 
> >> I think here is a good chance to discuss about this.
> >> I think from the design, the number of queue was a fixed number in those hardware devices? Also for those remote processor type wireless device(I think those are the modem devices).
> >> The way invoked with hash in every packet could consume lots of CPU times. And it is not necessary to track every packet.
> > 
> > rxhash based steering is common. There needs to be a strong(er) reason
> > to implement an alternative.
> > 
> I have a few questions about this hash steering, which didn’t request any future filter invoked:
> 1. If a flow happens before wrote to the tun, how to filter it?

What do you mean?

> 2. Does such a hash operation happen to every packet passing through?

For packets with a local socket, the computation is cached in the
socket.

For these tunnel packets, see tun_automq_select_queue. Specifically,
the call to __skb_get_hash_symmetric.

I'm actually not entirely sure why tun has this, rather than defer
to netdev_pick_tx, which calls skb_tx_hash.
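
Roughly, the automq logic reads like this userspace model (a toy
direct-mapped table standing in for the per-bucket flow lists; not the
kernel code):

#include <stdint.h>
#include <stdio.h>

#define FLOW_TABLE_SIZE 1024	/* toy parameter */

struct flow_entry {
	uint32_t rxhash;	/* 0 means "empty" in this toy model */
	uint16_t queue_index;
};

static struct flow_entry flow_table[FLOW_TABLE_SIZE];

static uint16_t automq_select(uint32_t symmetric_hash, uint16_t numqueues)
{
	struct flow_entry *e = &flow_table[symmetric_hash % FLOW_TABLE_SIZE];

	if (e->rxhash == symmetric_hash)
		return e->queue_index;	/* flow seen before: keep its queue */

	/* fallback: scale the hash onto [0, numqueues) */
	return (uint16_t)(((uint64_t)symmetric_hash * numqueues) >> 32);
}

int main(void)
{
	flow_table[0x1234 % FLOW_TABLE_SIZE] =
		(struct flow_entry){ .rxhash = 0x1234, .queue_index = 5 };

	printf("%u %u\n", (unsigned)automq_select(0x1234, 8),
	       (unsigned)automq_select(0xdeadbeefu, 8));
	return 0;
}

The stored entries are filled on the tun receive path, which is what
keeps replies on the queue a flow came from.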

> 3. Is rxhash based on the flow tracking record in the tun driver?
> Those CPU overhead may demolish the benefit of the multiple queues and filters in the kernel solution.

Keyword is "may". Avoid premature optimization in favor of data.

> Also the flow tracking has a limited to 4096 or 1024, for a IPv4 /24 subnet, if everyone opened 16 websites, are we run out of memory before some entries expired?
> 
> I want to  seek there is a modern way to implement VPN in Linux after so many features has been introduced to Linux. So far, I don’t find a proper way to make any advantage here than other platforms.
> >> Could I add another property in struct tun_file and steering program return wanted value. Then it is application’s work to keep this new property unique.
> > 
> > I don't entirely follow this suggestion?
> > 
> >>> I suppose one underlying question is how important is the mapping of
> >>> flows to specific queue-id's? Is it a problem if the destination queue
> >>> for a flow changes mid-stream?
> >> Yes, it matters. Or why I want to use this feature. From all the open source VPN I know, neither enabled this multiqueu feature nor create more than one queue for it.
> >> And virtual machine would use the tap at the most time(they want to emulate a real nic).
> >> So basically this multiple queue feature was kind of useless for the VPN usage.
> >> If the filter can’t work atomically here, which would lead to unwanted packets transmitted to the wrong thread.
> > 
> > What exactly is the issue if a flow migrates from one queue to
> > another? There may be some OOO arrival. But these configuration
> > changes are rare events.
> I don’t know what the OOO means here.

Out of order.

> If a flow would migrate from its supposed queue to another, that was against the pretension to use the multiple queues here.
> A queue presents a VPN node here. It means it would leak one’s data to the other.
> Also those data could be just garbage fragments costs bandwidth sending to a peer that can’t handle it.

MultiQ is normally just a scalability optimization. It does not matter
for correctness, bar the possibility of brief packet reordering when a
flow switches queues.

I now get that what you are trying to do is set up a 1:1 relationship
between VPN connections and multi queue tun FDs. What would be reading
these FDs? If a single process, then it can definitely handle flow
migration.
Jason Wang Aug. 12, 2024, 6:05 a.m. UTC | #19
On Fri, Aug 9, 2024 at 10:55 PM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> ayaka wrote:
> >
> > Sent from my iPad
>
> Try to avoid ^^^
>

[...]

> > 2. Does such a hash operation happen to every packet passing through?
>
> For packets with a local socket, the computation is cached in the
> socket.
>
> For these tunnel packets, see tun_automq_select_queue. Specifically,
> the call to __skb_get_hash_symmetric.
>
> I'm actually not entirely sure why tun has this, rather than defer
> to netdev_pick_tx, which call skb_tx_hash.

Not sure I get the question, but it needs to use a consistent hash to
match the flows stored before.

>
> > 3. Is rxhash based on the flow tracking record in the tun driver?
> > Those CPU overhead may demolish the benefit of the multiple queues and filters in the kernel solution.
>
> Keyword is "may". Avoid premature optimization in favor of data.
>
> > Also the flow tracking has a limited to 4096 or 1024, for a IPv4 /24 subnet, if everyone opened 16 websites, are we run out of memory before some entries expired?
> >
> > I want to  seek there is a modern way to implement VPN in Linux after so many features has been introduced to Linux. So far, I don’t find a proper way to make any advantage here than other platforms.

I think I need to understand how we could define "modern" here.

Btw, I vaguely remember there are some new vpn projects that try to
use vhost-net to accelerate.

E.g https://gitlab.com/openconnect/openconnect

Thanks
Willem de Bruijn Aug. 12, 2024, 5:10 p.m. UTC | #20
Jason Wang wrote:
> On Fri, Aug 9, 2024 at 10:55 PM Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
> >
> > ayaka wrote:
> > >
> > > Sent from my iPad
> >
> > Try to avoid ^^^
> >
> 
> [...]
> 
> > > 2. Does such a hash operation happen to every packet passing through?
> >
> > For packets with a local socket, the computation is cached in the
> > socket.
> >
> > For these tunnel packets, see tun_automq_select_queue. Specifically,
> > the call to __skb_get_hash_symmetric.
> >
> > I'm actually not entirely sure why tun has this, rather than defer
> > to netdev_pick_tx, which call skb_tx_hash.
> 
> Not sure I get the question, but it needs to use a consistent hash to
> match the flows stored before.

This is a bit tangential to Randy's original thread, but I would like
to understand this part a bit better, if you don't mind.

Tun automq calls __skb_get_hash_symmetric instead of the
non-symmetrical skb_get_hash of netdev_pick_tx. That makes sense.

Also, netdev_pick_tx tries other things first, like XPS.

Why does automq have to be stateful, keeping a table, rather than
always computing reciprocal_scale(symmetric_hash, numqueues)
directly, as it does when the flow is not found?

Just curious, thanks.
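
(For concreteness, the stateless fallback referred to here reduces to a
reciprocal_scale()-style computation; a userspace restatement, only to
make the formula explicit:)

#include <stdint.h>
#include <stdio.h>

/* map a 32-bit hash uniformly onto [0, numqueues) without a divide */
static uint32_t pick_queue(uint32_t hash, uint32_t numqueues)
{
	return (uint32_t)(((uint64_t)hash * numqueues) >> 32);
}

int main(void)
{
	printf("%u\n", pick_queue(0xdeadbeefu, 8));
	return 0;
}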

> >
> > > 3. Is rxhash based on the flow tracking record in the tun driver?
> > > Those CPU overhead may demolish the benefit of the multiple queues and filters in the kernel solution.
> >
> > Keyword is "may". Avoid premature optimization in favor of data.
> >
> > > Also the flow tracking has a limited to 4096 or 1024, for a IPv4 /24 subnet, if everyone opened 16 websites, are we run out of memory before some entries expired?
> > >
> > > I want to  seek there is a modern way to implement VPN in Linux after so many features has been introduced to Linux. So far, I don’t find a proper way to make any advantage here than other platforms.
> 
> I think I need to understand how we could define "modern" here.
> 
> Btw, I vaguely remember there are some new vpn projects that try to
> use vhost-net to accelerate.
> 
> E.g https://gitlab.com/openconnect/openconnect
> 
> Thanks
>
Jason Wang Aug. 13, 2024, 3:53 a.m. UTC | #21
On Tue, Aug 13, 2024 at 1:11 AM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> Jason Wang wrote:
> > On Fri, Aug 9, 2024 at 10:55 PM Willem de Bruijn
> > <willemdebruijn.kernel@gmail.com> wrote:
> > >
> > > ayaka wrote:
> > > >
> > > > Sent from my iPad
> > >
> > > Try to avoid ^^^
> > >
> >
> > [...]
> >
> > > > 2. Does such a hash operation happen to every packet passing through?
> > >
> > > For packets with a local socket, the computation is cached in the
> > > socket.
> > >
> > > For these tunnel packets, see tun_automq_select_queue. Specifically,
> > > the call to __skb_get_hash_symmetric.
> > >
> > > I'm actually not entirely sure why tun has this, rather than defer
> > > to netdev_pick_tx, which call skb_tx_hash.
> >
> > Not sure I get the question, but it needs to use a consistent hash to
> > match the flows stored before.
>
> This is a bit tangential to Randy's original thread, but I would like
> to understand this part a bit better, if you don't mind.

Comments are more than welcome. The code was written more than 10
years ago; there are surely things that can be improved.

>
> Tun automq calls __skb_get_hash_symmetric instead of the
> non-symmetrical skb_get_hash of netdev_pick_tx. That makes sense.
>
> Also, netdev_pick_tx tries other things first, like XPS.

Right, using XPS may conflict with the behaviour users expect (e.g.
the automatic steering has been documented in the virtio spec, though
it is somewhat best effort).

>
> Why does automq have to be stateful, keeping a table. Rather than
> always computing symmetrical_hash % reciprocal_scale(txq, numqueues)
> directly, as is does when the flow is not found?
>
> Just curious, thanks.

You are right, I think we can avoid the hash calculation and depend on
the fallback in netdev_pick_tx().

Have put this in my backlog.

Thanks

>
> > >
> > > > 3. Is rxhash based on the flow tracking record in the tun driver?
> > > > Those CPU overhead may demolish the benefit of the multiple queues and filters in the kernel solution.
> > >
> > > Keyword is "may". Avoid premature optimization in favor of data.
> > >
> > > > Also the flow tracking has a limited to 4096 or 1024, for a IPv4 /24 subnet, if everyone opened 16 websites, are we run out of memory before some entries expired?
> > > >
> > > > I want to  seek there is a modern way to implement VPN in Linux after so many features has been introduced to Linux. So far, I don’t find a proper way to make any advantage here than other platforms.
> >
> > I think I need to understand how we could define "modern" here.
> >
> > Btw, I vaguely remember there are some new vpn projects that try to
> > use vhost-net to accelerate.
> >
> > E.g https://gitlab.com/openconnect/openconnect
> >
> > Thanks
> >
>
>
Willem de Bruijn Aug. 13, 2024, 1:22 p.m. UTC | #22
Jason Wang wrote:
> On Tue, Aug 13, 2024 at 1:11 AM Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
> >
> > Jason Wang wrote:
> > > On Fri, Aug 9, 2024 at 10:55 PM Willem de Bruijn
> > > <willemdebruijn.kernel@gmail.com> wrote:
> > > >
> > > > ayaka wrote:
> > > > >
> > > > > Sent from my iPad
> > > >
> > > > Try to avoid ^^^
> > > >
> > >
> > > [...]
> > >
> > > > > 2. Does such a hash operation happen to every packet passing through?
> > > >
> > > > For packets with a local socket, the computation is cached in the
> > > > socket.
> > > >
> > > > For these tunnel packets, see tun_automq_select_queue. Specifically,
> > > > the call to __skb_get_hash_symmetric.
> > > >
> > > > I'm actually not entirely sure why tun has this, rather than defer
> > > > to netdev_pick_tx, which call skb_tx_hash.
> > >
> > > Not sure I get the question, but it needs to use a consistent hash to
> > > match the flows stored before.
> >
> > This is a bit tangential to Randy's original thread, but I would like
> > to understand this part a bit better, if you don't mind.
> 
> Comments are more than welcomed. The code is written more than 10
> years, it should have something that can be improved.
> 
> >
> > Tun automq calls __skb_get_hash_symmetric instead of the
> > non-symmetrical skb_get_hash of netdev_pick_tx. That makes sense.
> >
> > Also, netdev_pick_tx tries other things first, like XPS.
> 
> Right, using XPS may conflict with the user expected behaviour (e.g
> the automatic steering has been documented in the virtio spec, though
> it's best effort somehow).
> 
> >
> > Why does automq have to be stateful, keeping a table. Rather than
> > always computing symmetrical_hash % reciprocal_scale(txq, numqueues)
> > directly, as is does when the flow is not found?
> >
> > Just curious, thanks.
> 
> You are right, I think we can avoid the hash calculation and depend on
> the fallback in netdev_pick_tx().
> 
> Have put this in my backlog.

Great. Thanks for taking a look at that, Jason.

It may very well be that standard netdev_pick_tx does not conform to
the virtio spec behavior. But if it does, nice simplification.

If we have to create a variant that takes a bool skip_xps, that's
totally worth it.
diff mbox series

Patch

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 77574f7a3bd4..6099f27a0a1f 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -1120,6 +1120,15 @@  static long tap_ioctl(struct file *file, unsigned int cmd,
 		rtnl_unlock();
 		return ret;
 
+	case TUNGETQUEUEINDEX:
+		rtnl_lock();
+		if (!q->enabled)
+			ret = -EINVAL;
+
+		ret = put_user(q->queue_index, up);
+		rtnl_unlock();
+		return ret;
+
 	case SIOCGIFHWADDR:
 		rtnl_lock();
 		tap = tap_get_tap_dev(q);
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 1d06c560c5e6..5473a0fca2e1 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -3115,6 +3115,10 @@  static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 			return -EPERM;
 		return open_related_ns(&net->ns, get_net_ns);
+	} else if (cmd == TUNGETQUEUEINDEX) {
+		if (tfile->detached)
+			return -EINVAL;
+		return put_user(tfile->queue_index, (unsigned int __user*)argp);
 	}
 
 	rtnl_lock();
diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
index 287cdc81c939..2668ca3b06a5 100644
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -61,6 +61,7 @@ 
 #define TUNSETFILTEREBPF _IOR('T', 225, int)
 #define TUNSETCARRIER _IOW('T', 226, int)
 #define TUNGETDEVNETNS _IO('T', 227)
+#define TUNGETQUEUEINDEX _IOR('T', 228, unsigned int)
 
 /* TUNSETIFF ifr flags */
 #define IFF_TUN		0x0001
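
For what it's worth, the tap.c hunk above sets ret to -EINVAL when the
queue is not enabled but then unconditionally overwrites it with the
put_user() return value. A sketch of how that case could look instead
(keeping the same locking; untested):

	case TUNGETQUEUEINDEX:
		rtnl_lock();
		if (!q->enabled)
			ret = -EINVAL;
		else
			ret = put_user(q->queue_index, up);
		rtnl_unlock();
		return ret;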