Message ID | 20240915-rss-v3-7-c630015db082@daynix.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | tun: Introduce virtio-net hashing feature | expand |
Akihiko Odaki wrote: > RSS is a receive steering algorithm that can be negotiated to use with > virtio_net. Conventionally the hash calculation was done by the VMM. > However, computing the hash after the queue was chosen defeats the > purpose of RSS. > > Another approach is to use eBPF steering program. This approach has > another downside: it cannot report the calculated hash due to the > restrictive nature of eBPF steering program. > > Introduce the code to perform RSS to the kernel in order to overcome > thse challenges. An alternative solution is to extend the eBPF steering > program so that it will be able to report to the userspace, but I didn't > opt for it because extending the current mechanism of eBPF steering > program as is because it relies on legacy context rewriting, and > introducing kfunc-based eBPF will result in non-UAPI dependency while > the other relevant virtualization APIs such as KVM and vhost_net are > UAPIs. > > Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com> > --- > drivers/net/tun.c | 119 +++++++++++++++++++++++++++++++++++++++----- > include/uapi/linux/if_tun.h | 27 ++++++++++ > 2 files changed, 133 insertions(+), 13 deletions(-) > > diff --git a/drivers/net/tun.c b/drivers/net/tun.c > index b8fcd71becac..5a429b391144 100644 > --- a/drivers/net/tun.c > +++ b/drivers/net/tun.c > @@ -175,6 +175,9 @@ struct tun_prog { > > struct tun_vnet_hash_container { > struct tun_vnet_hash common; > + struct tun_vnet_hash_rss rss; > + __be32 rss_key[VIRTIO_NET_RSS_MAX_KEY_SIZE]; > + u16 rss_indirection_table[]; > }; > > /* Since the socket were moved to tun_file, to preserve the behavior of persist > @@ -227,7 +230,7 @@ struct veth { > }; > > static const struct tun_vnet_hash tun_vnet_hash_cap = { > - .flags = TUN_VNET_HASH_REPORT, > + .flags = TUN_VNET_HASH_REPORT | TUN_VNET_HASH_RSS, > .types = VIRTIO_NET_SUPPORTED_HASH_TYPES > }; > > @@ -591,6 +594,36 @@ static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb) > return ret % numqueues; > } > > +static u16 tun_vnet_rss_select_queue(struct tun_struct *tun, > + struct sk_buff *skb, > + const struct tun_vnet_hash_container *vnet_hash) > +{ > + struct tun_vnet_hash_ext *ext; > + struct virtio_net_hash hash; > + u32 numqueues = READ_ONCE(tun->numqueues); > + u16 txq, index; > + > + if (!numqueues) > + return 0; > + > + if (!virtio_net_hash_rss(skb, vnet_hash->common.types, vnet_hash->rss_key, > + &hash)) > + return vnet_hash->rss.unclassified_queue % numqueues; > + > + if (vnet_hash->common.flags & TUN_VNET_HASH_REPORT) { > + ext = skb_ext_add(skb, SKB_EXT_TUN_VNET_HASH); > + if (ext) { > + ext->value = hash.value; > + ext->report = hash.report; > + } > + } > + > + index = hash.value & vnet_hash->rss.indirection_table_mask; > + txq = READ_ONCE(vnet_hash->rss_indirection_table[index]); > + > + return txq % numqueues; > +} > + > static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, > struct net_device *sb_dev) > { > @@ -603,7 +636,10 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, > } else { > struct tun_vnet_hash_container *vnet_hash = rcu_dereference(tun->vnet_hash); > > - ret = tun_automq_select_queue(tun, skb, vnet_hash); > + if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS)) > + ret = tun_vnet_rss_select_queue(tun, skb, vnet_hash); > + else > + ret = tun_automq_select_queue(tun, skb, vnet_hash); > } > rcu_read_unlock(); > > @@ -3085,13 +3121,9 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr) > } > > static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p, > - void __user *data) > + int fd) > { > struct bpf_prog *prog; > - int fd; > - > - if (copy_from_user(&fd, data, sizeof(fd))) > - return -EFAULT; > > if (fd == -1) { > prog = NULL; > @@ -3157,6 +3189,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, > int ifindex; > int sndbuf; > int vnet_hdr_sz; > + int fd; > int le; > int ret; > bool do_notify = false; > @@ -3460,11 +3493,27 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, > break; > > case TUNSETSTEERINGEBPF: > - ret = tun_set_ebpf(tun, &tun->steering_prog, argp); > + if (get_user(fd, (int __user *)argp)) { > + ret = -EFAULT; > + break; > + } > + > + vnet_hash = rtnl_dereference(tun->vnet_hash); > + if (fd != -1 && vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS)) { > + ret = -EBUSY; > + break; > + } > + > + ret = tun_set_ebpf(tun, &tun->steering_prog, fd); > break; > > case TUNSETFILTEREBPF: > - ret = tun_set_ebpf(tun, &tun->filter_prog, argp); > + if (get_user(fd, (int __user *)argp)) { > + ret = -EFAULT; > + break; > + } > + > + ret = tun_set_ebpf(tun, &tun->filter_prog, fd); > break; > > case TUNSETCARRIER: > @@ -3496,10 +3545,54 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, > break; > } > > - vnet_hash = kmalloc(sizeof(vnet_hash->common), GFP_KERNEL); > - if (!vnet_hash) { > - ret = -ENOMEM; > - break; > + if (vnet_hash_common.flags & TUN_VNET_HASH_RSS) { > + struct tun_vnet_hash_rss rss; > + size_t indirection_table_size; > + size_t key_size; > + size_t size; > + > + if (tun->steering_prog) { > + ret = -EBUSY; > + break; > + } > + > + if (copy_from_user(&rss, argp, sizeof(rss))) { > + ret = -EFAULT; > + break; > + } > + argp = (struct tun_vnet_hash_rss __user *)argp + 1; > + > + indirection_table_size = ((size_t)rss.indirection_table_mask + 1) * 2; Why make uapi a mask rather than a length? Also is there a upper length bounds sanity check for this input from userspace? > + key_size = virtio_net_hash_key_length(vnet_hash_common.types); > + size = sizeof(*vnet_hash) + indirection_table_size + key_size; key_size is included in sizeof(*vnet_hash), always VIRTIO_NET_RSS_MAX_KEY_SIZE. > + > + vnet_hash = kmalloc(size, GFP_KERNEL); > + if (!vnet_hash) { > + ret = -ENOMEM; > + break; > + } > + > + if (copy_from_user(vnet_hash->rss_indirection_table, > + argp, indirection_table_size)) { > + kfree(vnet_hash); > + ret = -EFAULT; > + break; > + } > + argp = (u16 __user *)argp + rss.indirection_table_mask + 1; > + > + if (copy_from_user(vnet_hash->rss_key, argp, key_size)) { > + kfree(vnet_hash); > + ret = -EFAULT; > + break; > + } > + > + vnet_hash->rss = rss; > + } else { > + vnet_hash = kmalloc(sizeof(vnet_hash->common), GFP_KERNEL); > + if (!vnet_hash) { > + ret = -ENOMEM; > + break; > + } > } > > vnet_hash->common = vnet_hash_common; > diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h > index 1561e8ce0a0a..1c130409db5d 100644 > --- a/include/uapi/linux/if_tun.h > +++ b/include/uapi/linux/if_tun.h > @@ -75,6 +75,14 @@ > * > * The argument is a pointer to &struct tun_vnet_hash. > * > + * The argument is a pointer to the compound of the following in order if > + * %TUN_VNET_HASH_RSS is set: > + * > + * 1. &struct tun_vnet_hash > + * 2. &struct tun_vnet_hash_rss > + * 3. Indirection table > + * 4. Key > + * > * %TUNSETVNETHDRSZ ioctl must be called with a number greater than or equal to > * the size of &struct virtio_net_hdr_v1_hash before calling this ioctl with > * %TUN_VNET_HASH_REPORT. > @@ -144,6 +152,13 @@ struct tun_filter { > */ > #define TUN_VNET_HASH_REPORT 0x0001 > > +/** > + * define TUN_VNET_HASH_RSS - Request virtio_net RSS > + * > + * This is mutually exclusive with eBPF steering program. > + */ > +#define TUN_VNET_HASH_RSS 0x0002 > + > /** > * struct tun_vnet_hash - virtio_net hashing configuration > * @flags: > @@ -159,4 +174,16 @@ struct tun_vnet_hash { > __u32 types; > }; > > +/** > + * struct tun_vnet_hash_rss - virtio_net RSS configuration > + * @indirection_table_mask: > + * Bitmask to be applied to the indirection table index > + * @unclassified_queue: > + * The index of the queue to place unclassified packets in > + */ > +struct tun_vnet_hash_rss { > + __u16 indirection_table_mask; > + __u16 unclassified_queue; > +}; > + > #endif /* _UAPI__IF_TUN_H */ > > -- > 2.46.0 >
On 2024/09/18 15:28, Willem de Bruijn wrote: > Akihiko Odaki wrote: >> RSS is a receive steering algorithm that can be negotiated to use with >> virtio_net. Conventionally the hash calculation was done by the VMM. >> However, computing the hash after the queue was chosen defeats the >> purpose of RSS. >> >> Another approach is to use eBPF steering program. This approach has >> another downside: it cannot report the calculated hash due to the >> restrictive nature of eBPF steering program. >> >> Introduce the code to perform RSS to the kernel in order to overcome >> thse challenges. An alternative solution is to extend the eBPF steering >> program so that it will be able to report to the userspace, but I didn't >> opt for it because extending the current mechanism of eBPF steering >> program as is because it relies on legacy context rewriting, and >> introducing kfunc-based eBPF will result in non-UAPI dependency while >> the other relevant virtualization APIs such as KVM and vhost_net are >> UAPIs. >> >> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com> >> --- >> drivers/net/tun.c | 119 +++++++++++++++++++++++++++++++++++++++----- >> include/uapi/linux/if_tun.h | 27 ++++++++++ >> 2 files changed, 133 insertions(+), 13 deletions(-) >> >> diff --git a/drivers/net/tun.c b/drivers/net/tun.c >> index b8fcd71becac..5a429b391144 100644 >> --- a/drivers/net/tun.c >> +++ b/drivers/net/tun.c >> @@ -175,6 +175,9 @@ struct tun_prog { >> >> struct tun_vnet_hash_container { >> struct tun_vnet_hash common; >> + struct tun_vnet_hash_rss rss; >> + __be32 rss_key[VIRTIO_NET_RSS_MAX_KEY_SIZE]; >> + u16 rss_indirection_table[]; >> }; >> >> /* Since the socket were moved to tun_file, to preserve the behavior of persist >> @@ -227,7 +230,7 @@ struct veth { >> }; >> >> static const struct tun_vnet_hash tun_vnet_hash_cap = { >> - .flags = TUN_VNET_HASH_REPORT, >> + .flags = TUN_VNET_HASH_REPORT | TUN_VNET_HASH_RSS, >> .types = VIRTIO_NET_SUPPORTED_HASH_TYPES >> }; >> >> @@ -591,6 +594,36 @@ static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb) >> return ret % numqueues; >> } >> >> +static u16 tun_vnet_rss_select_queue(struct tun_struct *tun, >> + struct sk_buff *skb, >> + const struct tun_vnet_hash_container *vnet_hash) >> +{ >> + struct tun_vnet_hash_ext *ext; >> + struct virtio_net_hash hash; >> + u32 numqueues = READ_ONCE(tun->numqueues); >> + u16 txq, index; >> + >> + if (!numqueues) >> + return 0; >> + >> + if (!virtio_net_hash_rss(skb, vnet_hash->common.types, vnet_hash->rss_key, >> + &hash)) >> + return vnet_hash->rss.unclassified_queue % numqueues; >> + >> + if (vnet_hash->common.flags & TUN_VNET_HASH_REPORT) { >> + ext = skb_ext_add(skb, SKB_EXT_TUN_VNET_HASH); >> + if (ext) { >> + ext->value = hash.value; >> + ext->report = hash.report; >> + } >> + } >> + >> + index = hash.value & vnet_hash->rss.indirection_table_mask; >> + txq = READ_ONCE(vnet_hash->rss_indirection_table[index]); >> + >> + return txq % numqueues; >> +} >> + >> static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, >> struct net_device *sb_dev) >> { >> @@ -603,7 +636,10 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, >> } else { >> struct tun_vnet_hash_container *vnet_hash = rcu_dereference(tun->vnet_hash); >> >> - ret = tun_automq_select_queue(tun, skb, vnet_hash); >> + if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS)) >> + ret = tun_vnet_rss_select_queue(tun, skb, vnet_hash); >> + else >> + ret = tun_automq_select_queue(tun, skb, vnet_hash); >> } >> rcu_read_unlock(); >> >> @@ -3085,13 +3121,9 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr) >> } >> >> static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p, >> - void __user *data) >> + int fd) >> { >> struct bpf_prog *prog; >> - int fd; >> - >> - if (copy_from_user(&fd, data, sizeof(fd))) >> - return -EFAULT; >> >> if (fd == -1) { >> prog = NULL; >> @@ -3157,6 +3189,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >> int ifindex; >> int sndbuf; >> int vnet_hdr_sz; >> + int fd; >> int le; >> int ret; >> bool do_notify = false; >> @@ -3460,11 +3493,27 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >> break; >> >> case TUNSETSTEERINGEBPF: >> - ret = tun_set_ebpf(tun, &tun->steering_prog, argp); >> + if (get_user(fd, (int __user *)argp)) { >> + ret = -EFAULT; >> + break; >> + } >> + >> + vnet_hash = rtnl_dereference(tun->vnet_hash); >> + if (fd != -1 && vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS)) { >> + ret = -EBUSY; >> + break; >> + } >> + >> + ret = tun_set_ebpf(tun, &tun->steering_prog, fd); >> break; >> >> case TUNSETFILTEREBPF: >> - ret = tun_set_ebpf(tun, &tun->filter_prog, argp); >> + if (get_user(fd, (int __user *)argp)) { >> + ret = -EFAULT; >> + break; >> + } >> + >> + ret = tun_set_ebpf(tun, &tun->filter_prog, fd); >> break; >> >> case TUNSETCARRIER: >> @@ -3496,10 +3545,54 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >> break; >> } >> >> - vnet_hash = kmalloc(sizeof(vnet_hash->common), GFP_KERNEL); >> - if (!vnet_hash) { >> - ret = -ENOMEM; >> - break; >> + if (vnet_hash_common.flags & TUN_VNET_HASH_RSS) { >> + struct tun_vnet_hash_rss rss; >> + size_t indirection_table_size; >> + size_t key_size; >> + size_t size; >> + >> + if (tun->steering_prog) { >> + ret = -EBUSY; >> + break; >> + } >> + >> + if (copy_from_user(&rss, argp, sizeof(rss))) { >> + ret = -EFAULT; >> + break; >> + } >> + argp = (struct tun_vnet_hash_rss __user *)argp + 1; >> + >> + indirection_table_size = ((size_t)rss.indirection_table_mask + 1) * 2; > > Why make uapi a mask rather than a length? It follows the virtio specification. It is actually used as a mask in tun_vnet_rss_select_queue(). > > Also is there a upper length bounds sanity check for this input from > userspace? No, but the maximum size is limited to 128 bytes because the indirection_table_mask is 16-bit and it indexes an array of 16-bit integers. > >> + key_size = virtio_net_hash_key_length(vnet_hash_common.types); >> + size = sizeof(*vnet_hash) + indirection_table_size + key_size; > > key_size is included in sizeof(*vnet_hash), always > VIRTIO_NET_RSS_MAX_KEY_SIZE. I will fix this by replacing it with: struct_size(vnet_hash, rss_indirection_table, (size_t)rss.indirection_table_mask + 1) Regards, Akihiko Odaki >> + >> + vnet_hash = kmalloc(size, GFP_KERNEL); >> + if (!vnet_hash) { >> + ret = -ENOMEM; >> + break; >> + } >> + >> + if (copy_from_user(vnet_hash->rss_indirection_table, >> + argp, indirection_table_size)) { >> + kfree(vnet_hash); >> + ret = -EFAULT; >> + break; >> + } >> + argp = (u16 __user *)argp + rss.indirection_table_mask + 1; >> + >> + if (copy_from_user(vnet_hash->rss_key, argp, key_size)) { >> + kfree(vnet_hash); >> + ret = -EFAULT; >> + break; >> + } >> + >> + vnet_hash->rss = rss; >> + } else { >> + vnet_hash = kmalloc(sizeof(vnet_hash->common), GFP_KERNEL); >> + if (!vnet_hash) { >> + ret = -ENOMEM; >> + break; >> + } >> } >> >> vnet_hash->common = vnet_hash_common; >> diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h >> index 1561e8ce0a0a..1c130409db5d 100644 >> --- a/include/uapi/linux/if_tun.h >> +++ b/include/uapi/linux/if_tun.h >> @@ -75,6 +75,14 @@ >> * >> * The argument is a pointer to &struct tun_vnet_hash. >> * >> + * The argument is a pointer to the compound of the following in order if >> + * %TUN_VNET_HASH_RSS is set: >> + * >> + * 1. &struct tun_vnet_hash >> + * 2. &struct tun_vnet_hash_rss >> + * 3. Indirection table >> + * 4. Key >> + * >> * %TUNSETVNETHDRSZ ioctl must be called with a number greater than or equal to >> * the size of &struct virtio_net_hdr_v1_hash before calling this ioctl with >> * %TUN_VNET_HASH_REPORT. >> @@ -144,6 +152,13 @@ struct tun_filter { >> */ >> #define TUN_VNET_HASH_REPORT 0x0001 >> >> +/** >> + * define TUN_VNET_HASH_RSS - Request virtio_net RSS >> + * >> + * This is mutually exclusive with eBPF steering program. >> + */ >> +#define TUN_VNET_HASH_RSS 0x0002 >> + >> /** >> * struct tun_vnet_hash - virtio_net hashing configuration >> * @flags: >> @@ -159,4 +174,16 @@ struct tun_vnet_hash { >> __u32 types; >> }; >> >> +/** >> + * struct tun_vnet_hash_rss - virtio_net RSS configuration >> + * @indirection_table_mask: >> + * Bitmask to be applied to the indirection table index >> + * @unclassified_queue: >> + * The index of the queue to place unclassified packets in >> + */ >> +struct tun_vnet_hash_rss { >> + __u16 indirection_table_mask; >> + __u16 unclassified_queue; >> +}; >> + >> #endif /* _UAPI__IF_TUN_H */ >> >> -- >> 2.46.0 >> > >
On 2024/09/24 10:56, Akihiko Odaki wrote: > On 2024/09/18 15:28, Willem de Bruijn wrote: >> Akihiko Odaki wrote: >>> RSS is a receive steering algorithm that can be negotiated to use with >>> virtio_net. Conventionally the hash calculation was done by the VMM. >>> However, computing the hash after the queue was chosen defeats the >>> purpose of RSS. >>> >>> Another approach is to use eBPF steering program. This approach has >>> another downside: it cannot report the calculated hash due to the >>> restrictive nature of eBPF steering program. >>> >>> Introduce the code to perform RSS to the kernel in order to overcome >>> thse challenges. An alternative solution is to extend the eBPF steering >>> program so that it will be able to report to the userspace, but I didn't >>> opt for it because extending the current mechanism of eBPF steering >>> program as is because it relies on legacy context rewriting, and >>> introducing kfunc-based eBPF will result in non-UAPI dependency while >>> the other relevant virtualization APIs such as KVM and vhost_net are >>> UAPIs. >>> >>> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com> >>> --- >>> drivers/net/tun.c | 119 >>> +++++++++++++++++++++++++++++++++++++++----- >>> include/uapi/linux/if_tun.h | 27 ++++++++++ >>> 2 files changed, 133 insertions(+), 13 deletions(-) >>> >>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c >>> index b8fcd71becac..5a429b391144 100644 >>> --- a/drivers/net/tun.c >>> +++ b/drivers/net/tun.c >>> @@ -175,6 +175,9 @@ struct tun_prog { >>> struct tun_vnet_hash_container { >>> struct tun_vnet_hash common; >>> + struct tun_vnet_hash_rss rss; >>> + __be32 rss_key[VIRTIO_NET_RSS_MAX_KEY_SIZE]; >>> + u16 rss_indirection_table[]; >>> }; >>> /* Since the socket were moved to tun_file, to preserve the >>> behavior of persist >>> @@ -227,7 +230,7 @@ struct veth { >>> }; >>> static const struct tun_vnet_hash tun_vnet_hash_cap = { >>> - .flags = TUN_VNET_HASH_REPORT, >>> + .flags = TUN_VNET_HASH_REPORT | TUN_VNET_HASH_RSS, >>> .types = VIRTIO_NET_SUPPORTED_HASH_TYPES >>> }; >>> @@ -591,6 +594,36 @@ static u16 tun_ebpf_select_queue(struct >>> tun_struct *tun, struct sk_buff *skb) >>> return ret % numqueues; >>> } >>> +static u16 tun_vnet_rss_select_queue(struct tun_struct *tun, >>> + struct sk_buff *skb, >>> + const struct tun_vnet_hash_container *vnet_hash) >>> +{ >>> + struct tun_vnet_hash_ext *ext; >>> + struct virtio_net_hash hash; >>> + u32 numqueues = READ_ONCE(tun->numqueues); >>> + u16 txq, index; >>> + >>> + if (!numqueues) >>> + return 0; >>> + >>> + if (!virtio_net_hash_rss(skb, vnet_hash->common.types, >>> vnet_hash->rss_key, >>> + &hash)) >>> + return vnet_hash->rss.unclassified_queue % numqueues; >>> + >>> + if (vnet_hash->common.flags & TUN_VNET_HASH_REPORT) { >>> + ext = skb_ext_add(skb, SKB_EXT_TUN_VNET_HASH); >>> + if (ext) { >>> + ext->value = hash.value; >>> + ext->report = hash.report; >>> + } >>> + } >>> + >>> + index = hash.value & vnet_hash->rss.indirection_table_mask; >>> + txq = READ_ONCE(vnet_hash->rss_indirection_table[index]); >>> + >>> + return txq % numqueues; >>> +} >>> + >>> static u16 tun_select_queue(struct net_device *dev, struct sk_buff >>> *skb, >>> struct net_device *sb_dev) >>> { >>> @@ -603,7 +636,10 @@ static u16 tun_select_queue(struct net_device >>> *dev, struct sk_buff *skb, >>> } else { >>> struct tun_vnet_hash_container *vnet_hash = >>> rcu_dereference(tun->vnet_hash); >>> - ret = tun_automq_select_queue(tun, skb, vnet_hash); >>> + if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS)) >>> + ret = tun_vnet_rss_select_queue(tun, skb, vnet_hash); >>> + else >>> + ret = tun_automq_select_queue(tun, skb, vnet_hash); >>> } >>> rcu_read_unlock(); >>> @@ -3085,13 +3121,9 @@ static int tun_set_queue(struct file *file, >>> struct ifreq *ifr) >>> } >>> static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog >>> __rcu **prog_p, >>> - void __user *data) >>> + int fd) >>> { >>> struct bpf_prog *prog; >>> - int fd; >>> - >>> - if (copy_from_user(&fd, data, sizeof(fd))) >>> - return -EFAULT; >>> if (fd == -1) { >>> prog = NULL; >>> @@ -3157,6 +3189,7 @@ static long __tun_chr_ioctl(struct file *file, >>> unsigned int cmd, >>> int ifindex; >>> int sndbuf; >>> int vnet_hdr_sz; >>> + int fd; >>> int le; >>> int ret; >>> bool do_notify = false; >>> @@ -3460,11 +3493,27 @@ static long __tun_chr_ioctl(struct file >>> *file, unsigned int cmd, >>> break; >>> case TUNSETSTEERINGEBPF: >>> - ret = tun_set_ebpf(tun, &tun->steering_prog, argp); >>> + if (get_user(fd, (int __user *)argp)) { >>> + ret = -EFAULT; >>> + break; >>> + } >>> + >>> + vnet_hash = rtnl_dereference(tun->vnet_hash); >>> + if (fd != -1 && vnet_hash && (vnet_hash->common.flags & >>> TUN_VNET_HASH_RSS)) { >>> + ret = -EBUSY; >>> + break; >>> + } >>> + >>> + ret = tun_set_ebpf(tun, &tun->steering_prog, fd); >>> break; >>> case TUNSETFILTEREBPF: >>> - ret = tun_set_ebpf(tun, &tun->filter_prog, argp); >>> + if (get_user(fd, (int __user *)argp)) { >>> + ret = -EFAULT; >>> + break; >>> + } >>> + >>> + ret = tun_set_ebpf(tun, &tun->filter_prog, fd); >>> break; >>> case TUNSETCARRIER: >>> @@ -3496,10 +3545,54 @@ static long __tun_chr_ioctl(struct file >>> *file, unsigned int cmd, >>> break; >>> } >>> - vnet_hash = kmalloc(sizeof(vnet_hash->common), GFP_KERNEL); >>> - if (!vnet_hash) { >>> - ret = -ENOMEM; >>> - break; >>> + if (vnet_hash_common.flags & TUN_VNET_HASH_RSS) { >>> + struct tun_vnet_hash_rss rss; >>> + size_t indirection_table_size; >>> + size_t key_size; >>> + size_t size; >>> + >>> + if (tun->steering_prog) { >>> + ret = -EBUSY; >>> + break; >>> + } >>> + >>> + if (copy_from_user(&rss, argp, sizeof(rss))) { >>> + ret = -EFAULT; >>> + break; >>> + } >>> + argp = (struct tun_vnet_hash_rss __user *)argp + 1; >>> + >>> + indirection_table_size = >>> ((size_t)rss.indirection_table_mask + 1) * 2; >> >> Why make uapi a mask rather than a length? > > It follows the virtio specification. It is actually used as a mask in > tun_vnet_rss_select_queue(). > >> >> Also is there a upper length bounds sanity check for this input from >> userspace? > > No, but the maximum size is limited to 128 bytes because the > indirection_table_mask is 16-bit and it indexes an array of 16-bit > integers. Not 128 bytes but 128 KiB. > >> >>> + key_size = >>> virtio_net_hash_key_length(vnet_hash_common.types); >>> + size = sizeof(*vnet_hash) + indirection_table_size + >>> key_size; >> >> key_size is included in sizeof(*vnet_hash), always >> VIRTIO_NET_RSS_MAX_KEY_SIZE. > > I will fix this by replacing it with: > struct_size(vnet_hash, rss_indirection_table, > (size_t)rss.indirection_table_mask + 1) > > Regards, > Akihiko Odaki > >>> + >>> + vnet_hash = kmalloc(size, GFP_KERNEL); >>> + if (!vnet_hash) { >>> + ret = -ENOMEM; >>> + break; >>> + } >>> + >>> + if (copy_from_user(vnet_hash->rss_indirection_table, >>> + argp, indirection_table_size)) { >>> + kfree(vnet_hash); >>> + ret = -EFAULT; >>> + break; >>> + } >>> + argp = (u16 __user *)argp + rss.indirection_table_mask + 1; >>> + >>> + if (copy_from_user(vnet_hash->rss_key, argp, key_size)) { >>> + kfree(vnet_hash); >>> + ret = -EFAULT; >>> + break; >>> + } >>> + >>> + vnet_hash->rss = rss; >>> + } else { >>> + vnet_hash = kmalloc(sizeof(vnet_hash->common), GFP_KERNEL); >>> + if (!vnet_hash) { >>> + ret = -ENOMEM; >>> + break; >>> + } >>> } >>> vnet_hash->common = vnet_hash_common; >>> diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h >>> index 1561e8ce0a0a..1c130409db5d 100644 >>> --- a/include/uapi/linux/if_tun.h >>> +++ b/include/uapi/linux/if_tun.h >>> @@ -75,6 +75,14 @@ >>> * >>> * The argument is a pointer to &struct tun_vnet_hash. >>> * >>> + * The argument is a pointer to the compound of the following in >>> order if >>> + * %TUN_VNET_HASH_RSS is set: >>> + * >>> + * 1. &struct tun_vnet_hash >>> + * 2. &struct tun_vnet_hash_rss >>> + * 3. Indirection table >>> + * 4. Key >>> + * >>> * %TUNSETVNETHDRSZ ioctl must be called with a number greater than >>> or equal to >>> * the size of &struct virtio_net_hdr_v1_hash before calling this >>> ioctl with >>> * %TUN_VNET_HASH_REPORT. >>> @@ -144,6 +152,13 @@ struct tun_filter { >>> */ >>> #define TUN_VNET_HASH_REPORT 0x0001 >>> +/** >>> + * define TUN_VNET_HASH_RSS - Request virtio_net RSS >>> + * >>> + * This is mutually exclusive with eBPF steering program. >>> + */ >>> +#define TUN_VNET_HASH_RSS 0x0002 >>> + >>> /** >>> * struct tun_vnet_hash - virtio_net hashing configuration >>> * @flags: >>> @@ -159,4 +174,16 @@ struct tun_vnet_hash { >>> __u32 types; >>> }; >>> +/** >>> + * struct tun_vnet_hash_rss - virtio_net RSS configuration >>> + * @indirection_table_mask: >>> + * Bitmask to be applied to the indirection table index >>> + * @unclassified_queue: >>> + * The index of the queue to place unclassified packets in >>> + */ >>> +struct tun_vnet_hash_rss { >>> + __u16 indirection_table_mask; >>> + __u16 unclassified_queue; >>> +}; >>> + >>> #endif /* _UAPI__IF_TUN_H */ >>> >>> -- >>> 2.46.0 >>> >> >>
diff --git a/drivers/net/tun.c b/drivers/net/tun.c index b8fcd71becac..5a429b391144 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -175,6 +175,9 @@ struct tun_prog { struct tun_vnet_hash_container { struct tun_vnet_hash common; + struct tun_vnet_hash_rss rss; + __be32 rss_key[VIRTIO_NET_RSS_MAX_KEY_SIZE]; + u16 rss_indirection_table[]; }; /* Since the socket were moved to tun_file, to preserve the behavior of persist @@ -227,7 +230,7 @@ struct veth { }; static const struct tun_vnet_hash tun_vnet_hash_cap = { - .flags = TUN_VNET_HASH_REPORT, + .flags = TUN_VNET_HASH_REPORT | TUN_VNET_HASH_RSS, .types = VIRTIO_NET_SUPPORTED_HASH_TYPES }; @@ -591,6 +594,36 @@ static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb) return ret % numqueues; } +static u16 tun_vnet_rss_select_queue(struct tun_struct *tun, + struct sk_buff *skb, + const struct tun_vnet_hash_container *vnet_hash) +{ + struct tun_vnet_hash_ext *ext; + struct virtio_net_hash hash; + u32 numqueues = READ_ONCE(tun->numqueues); + u16 txq, index; + + if (!numqueues) + return 0; + + if (!virtio_net_hash_rss(skb, vnet_hash->common.types, vnet_hash->rss_key, + &hash)) + return vnet_hash->rss.unclassified_queue % numqueues; + + if (vnet_hash->common.flags & TUN_VNET_HASH_REPORT) { + ext = skb_ext_add(skb, SKB_EXT_TUN_VNET_HASH); + if (ext) { + ext->value = hash.value; + ext->report = hash.report; + } + } + + index = hash.value & vnet_hash->rss.indirection_table_mask; + txq = READ_ONCE(vnet_hash->rss_indirection_table[index]); + + return txq % numqueues; +} + static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { @@ -603,7 +636,10 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, } else { struct tun_vnet_hash_container *vnet_hash = rcu_dereference(tun->vnet_hash); - ret = tun_automq_select_queue(tun, skb, vnet_hash); + if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS)) + ret = tun_vnet_rss_select_queue(tun, skb, vnet_hash); + else + ret = tun_automq_select_queue(tun, skb, vnet_hash); } rcu_read_unlock(); @@ -3085,13 +3121,9 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr) } static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p, - void __user *data) + int fd) { struct bpf_prog *prog; - int fd; - - if (copy_from_user(&fd, data, sizeof(fd))) - return -EFAULT; if (fd == -1) { prog = NULL; @@ -3157,6 +3189,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, int ifindex; int sndbuf; int vnet_hdr_sz; + int fd; int le; int ret; bool do_notify = false; @@ -3460,11 +3493,27 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, break; case TUNSETSTEERINGEBPF: - ret = tun_set_ebpf(tun, &tun->steering_prog, argp); + if (get_user(fd, (int __user *)argp)) { + ret = -EFAULT; + break; + } + + vnet_hash = rtnl_dereference(tun->vnet_hash); + if (fd != -1 && vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS)) { + ret = -EBUSY; + break; + } + + ret = tun_set_ebpf(tun, &tun->steering_prog, fd); break; case TUNSETFILTEREBPF: - ret = tun_set_ebpf(tun, &tun->filter_prog, argp); + if (get_user(fd, (int __user *)argp)) { + ret = -EFAULT; + break; + } + + ret = tun_set_ebpf(tun, &tun->filter_prog, fd); break; case TUNSETCARRIER: @@ -3496,10 +3545,54 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, break; } - vnet_hash = kmalloc(sizeof(vnet_hash->common), GFP_KERNEL); - if (!vnet_hash) { - ret = -ENOMEM; - break; + if (vnet_hash_common.flags & TUN_VNET_HASH_RSS) { + struct tun_vnet_hash_rss rss; + size_t indirection_table_size; + size_t key_size; + size_t size; + + if (tun->steering_prog) { + ret = -EBUSY; + break; + } + + if (copy_from_user(&rss, argp, sizeof(rss))) { + ret = -EFAULT; + break; + } + argp = (struct tun_vnet_hash_rss __user *)argp + 1; + + indirection_table_size = ((size_t)rss.indirection_table_mask + 1) * 2; + key_size = virtio_net_hash_key_length(vnet_hash_common.types); + size = sizeof(*vnet_hash) + indirection_table_size + key_size; + + vnet_hash = kmalloc(size, GFP_KERNEL); + if (!vnet_hash) { + ret = -ENOMEM; + break; + } + + if (copy_from_user(vnet_hash->rss_indirection_table, + argp, indirection_table_size)) { + kfree(vnet_hash); + ret = -EFAULT; + break; + } + argp = (u16 __user *)argp + rss.indirection_table_mask + 1; + + if (copy_from_user(vnet_hash->rss_key, argp, key_size)) { + kfree(vnet_hash); + ret = -EFAULT; + break; + } + + vnet_hash->rss = rss; + } else { + vnet_hash = kmalloc(sizeof(vnet_hash->common), GFP_KERNEL); + if (!vnet_hash) { + ret = -ENOMEM; + break; + } } vnet_hash->common = vnet_hash_common; diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index 1561e8ce0a0a..1c130409db5d 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -75,6 +75,14 @@ * * The argument is a pointer to &struct tun_vnet_hash. * + * The argument is a pointer to the compound of the following in order if + * %TUN_VNET_HASH_RSS is set: + * + * 1. &struct tun_vnet_hash + * 2. &struct tun_vnet_hash_rss + * 3. Indirection table + * 4. Key + * * %TUNSETVNETHDRSZ ioctl must be called with a number greater than or equal to * the size of &struct virtio_net_hdr_v1_hash before calling this ioctl with * %TUN_VNET_HASH_REPORT. @@ -144,6 +152,13 @@ struct tun_filter { */ #define TUN_VNET_HASH_REPORT 0x0001 +/** + * define TUN_VNET_HASH_RSS - Request virtio_net RSS + * + * This is mutually exclusive with eBPF steering program. + */ +#define TUN_VNET_HASH_RSS 0x0002 + /** * struct tun_vnet_hash - virtio_net hashing configuration * @flags: @@ -159,4 +174,16 @@ struct tun_vnet_hash { __u32 types; }; +/** + * struct tun_vnet_hash_rss - virtio_net RSS configuration + * @indirection_table_mask: + * Bitmask to be applied to the indirection table index + * @unclassified_queue: + * The index of the queue to place unclassified packets in + */ +struct tun_vnet_hash_rss { + __u16 indirection_table_mask; + __u16 unclassified_queue; +}; + #endif /* _UAPI__IF_TUN_H */
RSS is a receive steering algorithm that can be negotiated to use with virtio_net. Conventionally the hash calculation was done by the VMM. However, computing the hash after the queue was chosen defeats the purpose of RSS. Another approach is to use eBPF steering program. This approach has another downside: it cannot report the calculated hash due to the restrictive nature of eBPF steering program. Introduce the code to perform RSS to the kernel in order to overcome thse challenges. An alternative solution is to extend the eBPF steering program so that it will be able to report to the userspace, but I didn't opt for it because extending the current mechanism of eBPF steering program as is because it relies on legacy context rewriting, and introducing kfunc-based eBPF will result in non-UAPI dependency while the other relevant virtualization APIs such as KVM and vhost_net are UAPIs. Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com> --- drivers/net/tun.c | 119 +++++++++++++++++++++++++++++++++++++++----- include/uapi/linux/if_tun.h | 27 ++++++++++ 2 files changed, 133 insertions(+), 13 deletions(-)