Message ID | 20250307-rss-v9-3-df76624025eb@daynix.com (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | tun: Introduce virtio-net hashing feature | expand |
Akihiko Odaki wrote: > Hash reporting > ============== > > Allow the guest to reuse the hash value to make receive steering > consistent between the host and guest, and to save hash computation. > > RSS > === > > RSS is a receive steering algorithm that can be negotiated to use with > virtio_net. Conventionally the hash calculation was done by the VMM. > However, computing the hash after the queue was chosen defeats the > purpose of RSS. > > Another approach is to use eBPF steering program. This approach has > another downside: it cannot report the calculated hash due to the > restrictive nature of eBPF steering program. > > Introduce the code to perform RSS to the kernel in order to overcome > these challenges. An alternative solution is to extend the eBPF steering > program so that it will be able to report to the userspace, but I didn't > opt for it because extending the current mechanism of eBPF steering > program as it stands is impractical, since it relies on legacy context rewriting, and > introducing kfunc-based eBPF will result in non-UAPI dependency while > the other relevant virtualization APIs such as KVM and vhost_net are > UAPIs. > > Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com> > Tested-by: Lei Yang <leiyang@redhat.com> > --- > Documentation/networking/tuntap.rst | 7 ++ > drivers/net/Kconfig | 1 + > drivers/net/tap.c | 68 ++++++++++++++- > drivers/net/tun.c | 98 +++++++++++++++++----- > drivers/net/tun_vnet.h | 159 ++++++++++++++++++++++++++++++++++-- > include/linux/if_tap.h | 2 + > include/linux/skbuff.h | 3 + > include/uapi/linux/if_tun.h | 75 +++++++++++++++++ > net/core/skbuff.c | 4 + > 9 files changed, 386 insertions(+), 31 deletions(-) This is arguably still doing too much in a single patch. Can you split tap from tun? Move ioctl control operations out to their own patch? 
> > diff --git a/Documentation/networking/tuntap.rst b/Documentation/networking/tuntap.rst > index 4d7087f727be5e37dfbf5066a9e9c872cc98898d..86b4ae8caa8ad062c1e558920be42ce0d4217465 100644 > --- a/Documentation/networking/tuntap.rst > +++ b/Documentation/networking/tuntap.rst > @@ -206,6 +206,13 @@ enable is true we enable it, otherwise we disable it:: > return ioctl(fd, TUNSETQUEUE, (void *)&ifr); > } > > +3.4 Reference > +------------- > + > +``linux/if_tun.h`` defines the interface described below: > + > +.. kernel-doc:: include/uapi/linux/if_tun.h > + > Universal TUN/TAP device driver Frequently Asked Question > ========================================================= > > diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig > index 1fd5acdc73c6af0e1a861867039c3624fc618e25..aecfd244dd83585fea2c5b815dcd787c58166c28 100644 > --- a/drivers/net/Kconfig > +++ b/drivers/net/Kconfig > @@ -395,6 +395,7 @@ config TUN > tristate "Universal TUN/TAP device driver support" > depends on INET > select CRC32 > + select SKB_EXTENSIONS > help > TUN/TAP provides packet reception and transmission for user space > programs. 
It can be viewed as a simple Point-to-Point or Ethernet > diff --git a/drivers/net/tap.c b/drivers/net/tap.c > index d4ece538f1b23789ca60caa6232690e4d0a4d14a..9428b63ec27e7f92e78a78afcb5e24383862c00d 100644 > --- a/drivers/net/tap.c > +++ b/drivers/net/tap.c > @@ -49,6 +49,10 @@ struct major_info { > struct list_head next; > }; > > +struct tap_skb_cb { > + struct virtio_net_hash hash; > +}; > + > #define GOODCOPY_LEN 128 > > static const struct proto_ops tap_socket_ops; > @@ -179,6 +183,22 @@ static void tap_put_queue(struct tap_queue *q) > sock_put(&q->sk); > } > > +static struct tap_skb_cb *tap_skb_cb(const struct sk_buff *skb) > +{ > + BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct tap_skb_cb)); > + return (struct tap_skb_cb *)skb->cb; > +} > + > +static struct virtio_net_hash *tap_add_hash(struct sk_buff *skb) > +{ > + return &tap_skb_cb(skb)->hash; > +} > + > +static const struct virtio_net_hash *tap_find_hash(const struct sk_buff *skb) > +{ > + return &tap_skb_cb(skb)->hash; > +} > + These two helpers do the same thing. > /* > * Select a queue based on the rxq of the device on which this packet > * arrived. If the incoming device is not mq, calculate a flow hash > @@ -189,6 +209,7 @@ static void tap_put_queue(struct tap_queue *q) > static struct tap_queue *tap_get_queue(struct tap_dev *tap, > struct sk_buff *skb) > { > + struct flow_keys_basic keys_basic; > struct tap_queue *queue = NULL; > /* Access to taps array is protected by rcu, but access to numvtaps > * isn't. Below we use it to lookup a queue, but treat it as a hint > @@ -196,17 +217,47 @@ static struct tap_queue *tap_get_queue(struct tap_dev *tap, > * racing against queue removal. 
> */ > int numvtaps = READ_ONCE(tap->numvtaps); > + struct tun_vnet_hash_container *vnet_hash = rcu_dereference(tap->vnet_hash); > __u32 rxq; > > + *tap_skb_cb(skb) = (struct tap_skb_cb) { > + .hash = { .report = VIRTIO_NET_HASH_REPORT_NONE } > + }; > + > if (!numvtaps) > goto out; > > if (numvtaps == 1) > goto single; > > + if (vnet_hash) { > + if ((vnet_hash->common.flags & TUN_VNET_HASH_RSS)) { > + rxq = tun_vnet_rss_select_queue(numvtaps, vnet_hash, skb, tap_add_hash); > + queue = rcu_dereference(tap->taps[rxq]); > + goto out; so tun_vnet_hash_report does not work in this case? > + } > + > + if (!skb->l4_hash && !skb->sw_hash) { > + struct flow_keys keys; > + > + skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); > + rxq = flow_hash_from_keys(&keys); > + keys_basic = (struct flow_keys_basic) { > + .control = keys.control, > + .basic = keys.basic > + }; > + } else { > + skb_flow_dissect_flow_keys_basic(NULL, skb, &keys_basic, NULL, 0, 0, 0, > + FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); > + rxq = skb->hash; > + } > + } else { > + rxq = skb_get_hash(skb); > + } > + > /* Check if we can use flow to select a queue */ > - rxq = skb_get_hash(skb); > if (rxq) { > + tun_vnet_hash_report(vnet_hash, skb, &keys_basic, rxq, tap_add_hash); > queue = rcu_dereference(tap->taps[rxq % numvtaps]); > goto out; > } > @@ -711,11 +762,12 @@ static ssize_t tap_put_user(struct tap_queue *q, > int total; > > if (q->flags & IFF_VNET_HDR) { > - struct virtio_net_hdr vnet_hdr; > + struct virtio_net_hdr_v1_hash vnet_hdr; > > vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz); > > - ret = tun_vnet_hdr_from_skb(q->flags, NULL, skb, &vnet_hdr); > + ret = tun_vnet_hdr_from_skb(vnet_hdr_len, q->flags, NULL, skb, > + tap_find_hash, &vnet_hdr); > if (ret) > return ret; > > @@ -992,6 +1044,16 @@ static long tap_ioctl(struct file *file, unsigned int cmd, > rtnl_unlock(); > return ret; > > + case TUNGETVNETHASHCAP: > + return tun_vnet_ioctl_gethashcap(argp); > + > + case TUNSETVNETHASH: > 
+ rtnl_lock(); > + tap = rtnl_dereference(q->tap); > + ret = tap ? tun_vnet_ioctl_sethash(&tap->vnet_hash, true, argp) : -EBADFD; > + rtnl_unlock(); > + return ret; > + > case SIOCGIFHWADDR: > rtnl_lock(); > tap = tap_get_tap_dev(q); > diff --git a/drivers/net/tun.c b/drivers/net/tun.c > index d8f4d3e996a7a81d1f8b04635054081671a14f07..520013df416e93d3a50b46be9b53ae9ab410eab4 100644 > --- a/drivers/net/tun.c > +++ b/drivers/net/tun.c > @@ -209,6 +209,7 @@ struct tun_struct { > struct bpf_prog __rcu *xdp_prog; > struct tun_prog __rcu *steering_prog; > struct tun_prog __rcu *filter_prog; > + struct tun_vnet_hash_container __rcu *vnet_hash; > struct ethtool_link_ksettings link_ksettings; > /* init args */ > struct file *file; > @@ -451,20 +452,37 @@ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash) > e->rps_rxhash = hash; > } > > +static struct virtio_net_hash *tun_add_hash(struct sk_buff *skb) > +{ > + return skb_ext_add(skb, SKB_EXT_TUN_VNET_HASH); > +} > + > +static const struct virtio_net_hash *tun_find_hash(const struct sk_buff *skb) > +{ > + return skb_ext_find(skb, SKB_EXT_TUN_VNET_HASH); > +} > + > /* We try to identify a flow through its rxhash. The reason that > * we do not check rxq no. is because some cards(e.g 82599), chooses > * the rxq based on the txq where the last packet of the flow comes. As > * the userspace application move between processors, we may get a > * different rxq no. here. 
> */ > -static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb) > +static u16 tun_automq_select_queue(struct tun_struct *tun, > + const struct tun_vnet_hash_container *vnet_hash, > + struct sk_buff *skb) > { > + struct flow_keys keys; > + struct flow_keys_basic keys_basic; > struct tun_flow_entry *e; > u32 txq, numqueues; > > numqueues = READ_ONCE(tun->numqueues); > > - txq = __skb_get_hash_symmetric(skb); > + memset(&keys, 0, sizeof(keys)); > + skb_flow_dissect(skb, &flow_keys_dissector_symmetric, &keys, 0); > + > + txq = flow_hash_from_keys(&keys); > e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq); > if (e) { > tun_flow_save_rps_rxhash(e, txq); > @@ -473,6 +491,13 @@ static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb) > txq = reciprocal_scale(txq, numqueues); > } > > + keys_basic = (struct flow_keys_basic) { > + .control = keys.control, > + .basic = keys.basic > + }; > + tun_vnet_hash_report(vnet_hash, skb, &keys_basic, skb->l4_hash ? 
skb->hash : txq, > + tun_add_hash); > + > return txq; > } > > @@ -500,10 +525,17 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, > u16 ret; > > rcu_read_lock(); > - if (rcu_dereference(tun->steering_prog)) > + if (rcu_dereference(tun->steering_prog)) { > ret = tun_ebpf_select_queue(tun, skb); > - else > - ret = tun_automq_select_queue(tun, skb); > + } else { > + struct tun_vnet_hash_container *vnet_hash = rcu_dereference(tun->vnet_hash); > + > + if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS)) > + ret = tun_vnet_rss_select_queue(READ_ONCE(tun->numqueues), vnet_hash, > + skb, tun_add_hash); > + else > + ret = tun_automq_select_queue(tun, vnet_hash, skb); > + } > rcu_read_unlock(); > > return ret; > @@ -1987,7 +2019,7 @@ static ssize_t tun_put_user_xdp(struct tun_struct *tun, > ssize_t ret; > > if (tun->flags & IFF_VNET_HDR) { > - struct virtio_net_hdr gso = { 0 }; > + struct virtio_net_hdr_v1_hash gso = { 0 }; > > vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); > ret = tun_vnet_hdr_put(vnet_hdr_sz, iter, &gso); > @@ -2040,9 +2072,10 @@ static ssize_t tun_put_user(struct tun_struct *tun, > } > > if (vnet_hdr_sz) { > - struct virtio_net_hdr gso; > + struct virtio_net_hdr_v1_hash gso; > > - ret = tun_vnet_hdr_from_skb(tun->flags, tun->dev, skb, &gso); > + ret = tun_vnet_hdr_from_skb(vnet_hdr_sz, tun->flags, tun->dev, > + skb, tun_find_hash, &gso); > if (ret) > return ret; > > @@ -2223,6 +2256,7 @@ static void tun_free_netdev(struct net_device *dev) > security_tun_dev_free_security(tun->security); > __tun_set_ebpf(tun, &tun->steering_prog, NULL); > __tun_set_ebpf(tun, &tun->filter_prog, NULL); > + kfree_rcu_mightsleep(rcu_access_pointer(tun->vnet_hash)); > } > > static void tun_setup(struct net_device *dev) > @@ -2921,13 +2955,9 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr) > } > > static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p, > - void __user *data) > + int fd) > { > struct 
bpf_prog *prog; > - int fd; > - > - if (copy_from_user(&fd, data, sizeof(fd))) > - return -EFAULT; > > if (fd == -1) { > prog = NULL; > @@ -2993,7 +3023,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, > int ifindex; > int sndbuf; > int ret; > + int fd; > bool do_notify = false; > + struct tun_vnet_hash_container *vnet_hash; > > if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || > (_IOC_TYPE(cmd) == SOCK_IOC_TYPE && cmd != SIOCGSKNS)) { > @@ -3020,7 +3052,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, > rtnl_lock(); > > tun = tun_get(tfile); > - if (cmd == TUNSETIFF) { > + switch (cmd) { > + case TUNSETIFF: > ret = -EEXIST; > if (tun) > goto unlock; > @@ -3035,8 +3068,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, > if (copy_to_user(argp, &ifr, ifreq_len)) > ret = -EFAULT; > goto unlock; > - } > - if (cmd == TUNSETIFINDEX) { > + > + case TUNSETIFINDEX: > ret = -EPERM; > if (tun) > goto unlock; > @@ -3050,6 +3083,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, > ret = 0; > tfile->ifindex = ifindex; > goto unlock; > + > + case TUNGETVNETHASHCAP: > + ret = tun_vnet_ioctl_gethashcap(argp); > + goto unlock; > } > > ret = -EBADFD; > @@ -3230,11 +3267,27 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, > break; > > case TUNSETSTEERINGEBPF: > - ret = tun_set_ebpf(tun, &tun->steering_prog, argp); > + if (get_user(fd, (int __user *)argp)) { > + ret = -EFAULT; > + break; > + } > + > + vnet_hash = rtnl_dereference(tun->vnet_hash); > + if (fd != -1 && vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS)) { > + ret = -EBUSY; > + break; > + } > + > + ret = tun_set_ebpf(tun, &tun->steering_prog, fd); > break; > > case TUNSETFILTEREBPF: > - ret = tun_set_ebpf(tun, &tun->filter_prog, argp); > + if (get_user(fd, (int __user *)argp)) { > + ret = -EFAULT; > + break; > + } > + > + ret = tun_set_ebpf(tun, &tun->filter_prog, fd); > break; > > case TUNSETCARRIER: > @@ -3252,8 
+3305,15 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, > ret = open_related_ns(&net->ns, get_net_ns); > break; > > + case TUNSETVNETHASH: > + ret = tun_vnet_ioctl_sethash(&tun->vnet_hash, > + !rtnl_dereference(tun->steering_prog), > + argp); > + break; > + > default: > - ret = tun_vnet_ioctl(&tun->vnet_hdr_sz, &tun->flags, cmd, argp); > + ret = tun_vnet_ioctl(&tun->vnet_hdr_sz, &tun->flags, > + cmd, argp); no need to touch this > break; > } > > diff --git a/drivers/net/tun_vnet.h b/drivers/net/tun_vnet.h > index 58b9ac7a5fc4084c789fe94fe36b5f8631bf1fa4..8e7d51fb0b4742cef56e7c5ad778b156cc654bed 100644 > --- a/drivers/net/tun_vnet.h > +++ b/drivers/net/tun_vnet.h > @@ -6,6 +6,16 @@ > #define TUN_VNET_LE 0x80000000 > #define TUN_VNET_BE 0x40000000 > > +typedef struct virtio_net_hash *(*tun_vnet_hash_add)(struct sk_buff *); > +typedef const struct virtio_net_hash *(*tun_vnet_hash_find)(const struct sk_buff *); > + > +struct tun_vnet_hash_container { > + struct tun_vnet_hash common; > + struct tun_vnet_hash_rss rss; > + u32 rss_key[VIRTIO_NET_RSS_MAX_KEY_SIZE]; > + u16 rss_indirection_table[]; > +}; > + > static inline bool tun_vnet_legacy_is_little_endian(unsigned int flags) > { > bool be = IS_ENABLED(CONFIG_TUN_VNET_CROSS_LE) && > @@ -107,6 +117,123 @@ static inline long tun_vnet_ioctl(int *vnet_hdr_sz, unsigned int *flags, > } > } > > +static inline long tun_vnet_ioctl_gethashcap(void __user *argp) > +{ > + static const struct tun_vnet_hash cap = { > + .flags = TUN_VNET_HASH_REPORT | TUN_VNET_HASH_RSS, > + .types = VIRTIO_NET_SUPPORTED_HASH_TYPES > + }; > + > + return copy_to_user(argp, &cap, sizeof(cap)) ? 
-EFAULT : 0; > +} > + > +static inline long tun_vnet_ioctl_sethash(struct tun_vnet_hash_container __rcu **hashp, > + bool can_rss, void __user *argp) > +{ > + struct tun_vnet_hash hash_buf; > + struct tun_vnet_hash_container *hash; > + > + if (copy_from_user(&hash_buf, argp, sizeof(hash_buf))) > + return -EFAULT; > + argp = (struct tun_vnet_hash __user *)argp + 1; > + > + if (hash_buf.flags & TUN_VNET_HASH_RSS) { > + struct tun_vnet_hash_rss rss; > + size_t indirection_table_size; > + size_t key_size; > + size_t size; > + > + if (!can_rss) > + return -EBUSY; > + > + if (copy_from_user(&rss, argp, sizeof(rss))) > + return -EFAULT; > + argp = (struct tun_vnet_hash_rss __user *)argp + 1; > + > + indirection_table_size = ((size_t)rss.indirection_table_mask + 1) * 2; > + key_size = virtio_net_hash_key_length(hash_buf.types); > + size = struct_size(hash, rss_indirection_table, > + (size_t)rss.indirection_table_mask + 1); > + > + hash = kmalloc(size, GFP_KERNEL); > + if (!hash) > + return -ENOMEM; > + > + if (copy_from_user(hash->rss_indirection_table, > + argp, indirection_table_size)) { > + kfree(hash); > + return -EFAULT; > + } > + argp = (u16 __user *)argp + rss.indirection_table_mask + 1; > + > + if (copy_from_user(hash->rss_key, argp, key_size)) { > + kfree(hash); > + return -EFAULT; > + } > + > + virtio_net_toeplitz_convert_key(hash->rss_key, key_size); > + hash->rss = rss; > + } else { > + hash = kmalloc(sizeof(hash->common), GFP_KERNEL); > + if (!hash) > + return -ENOMEM; > + } > + > + hash->common = hash_buf; > + kfree_rcu_mightsleep(rcu_replace_pointer_rtnl(*hashp, hash)); > + return 0; > +} > + > +static void tun_vnet_hash_report(const struct tun_vnet_hash_container *hash, > + struct sk_buff *skb, > + const struct flow_keys_basic *keys, > + u32 value, > + tun_vnet_hash_add vnet_hash_add) > +{ > + struct virtio_net_hash *report; > + > + if (!hash || !(hash->common.flags & TUN_VNET_HASH_REPORT)) > + return; > + > + report = vnet_hash_add(skb); > + if (!report) > 
+ return; > + > + *report = (struct virtio_net_hash) { > + .report = virtio_net_hash_report(hash->common.types, keys), > + .value = value > + }; > +} > + > +static u16 tun_vnet_rss_select_queue(u32 numqueues, > + const struct tun_vnet_hash_container *hash, > + struct sk_buff *skb, > + tun_vnet_hash_add vnet_hash_add) > +{ > + struct virtio_net_hash *report; > + struct virtio_net_hash ret; > + u16 txq, index; > + > + if (!numqueues) > + return 0; > + > + virtio_net_hash_rss(skb, hash->common.types, hash->rss_key, &ret); > + > + if (!ret.report) > + return hash->rss.unclassified_queue % numqueues; > + > + if (hash->common.flags & TUN_VNET_HASH_REPORT) { > + report = vnet_hash_add(skb); > + if (report) > + *report = ret; > + } > + > + index = ret.value & hash->rss.indirection_table_mask; > + txq = READ_ONCE(hash->rss_indirection_table[index]); > + > + return txq % numqueues; > +} > + > static inline int tun_vnet_hdr_get(int sz, unsigned int flags, > struct iov_iter *from, > struct virtio_net_hdr *hdr) > @@ -135,15 +262,17 @@ static inline int tun_vnet_hdr_get(int sz, unsigned int flags, > } > > static inline int tun_vnet_hdr_put(int sz, struct iov_iter *iter, > - const struct virtio_net_hdr *hdr) > + const struct virtio_net_hdr_v1_hash *hdr) > { > + int content_sz = MIN(sizeof(*hdr), sz); > + > if (unlikely(iov_iter_count(iter) < sz)) > return -EINVAL; > > - if (unlikely(copy_to_iter(hdr, sizeof(*hdr), iter) != sizeof(*hdr))) > + if (unlikely(copy_to_iter(hdr, content_sz, iter) != content_sz)) > return -EFAULT; > > - if (iov_iter_zero(sz - sizeof(*hdr), iter) != sz - sizeof(*hdr)) > + if (iov_iter_zero(sz - content_sz, iter) != sz - content_sz) > return -EFAULT; > > return 0; > @@ -155,26 +284,38 @@ static inline int tun_vnet_hdr_to_skb(unsigned int flags, struct sk_buff *skb, > return virtio_net_hdr_to_skb(skb, hdr, tun_vnet_is_little_endian(flags)); > } > > -static inline int tun_vnet_hdr_from_skb(unsigned int flags, > +static inline int tun_vnet_hdr_from_skb(int 
sz, unsigned int flags, > const struct net_device *dev, > const struct sk_buff *skb, > - struct virtio_net_hdr *hdr) > + tun_vnet_hash_find vnet_hash_find, > + struct virtio_net_hdr_v1_hash *hdr) > { > int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0; > + const struct virtio_net_hash *report = sz < sizeof(struct virtio_net_hdr_v1_hash) ? > + NULL : vnet_hash_find(skb); > + > + *hdr = (struct virtio_net_hdr_v1_hash) { > + .hash_report = VIRTIO_NET_HASH_REPORT_NONE > + }; > + > + if (report) { > + hdr->hash_value = cpu_to_le32(report->value); > + hdr->hash_report = cpu_to_le16(report->report); > + } > > - if (virtio_net_hdr_from_skb(skb, hdr, > + if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)hdr, > tun_vnet_is_little_endian(flags), true, > vlan_hlen)) { > struct skb_shared_info *sinfo = skb_shinfo(skb); > > if (net_ratelimit()) { > netdev_err(dev, "unexpected GSO type: 0x%x, gso_size %d, hdr_len %d\n", > - sinfo->gso_type, tun_vnet16_to_cpu(flags, hdr->gso_size), > - tun_vnet16_to_cpu(flags, hdr->hdr_len)); > + sinfo->gso_type, tun_vnet16_to_cpu(flags, hdr->hdr.gso_size), > + tun_vnet16_to_cpu(flags, hdr->hdr.hdr_len)); > print_hex_dump(KERN_ERR, "tun: ", > DUMP_PREFIX_NONE, > 16, 1, skb->head, > - min(tun_vnet16_to_cpu(flags, hdr->hdr_len), 64), true); > + min(tun_vnet16_to_cpu(flags, hdr->hdr.hdr_len), 64), true); > } > WARN_ON_ONCE(1); > return -EINVAL; > diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h > index 553552fa635c3e1e53d1a63c203d32e4c4fd5a4f..7334c46a3f101675a0d4e5a036987cfe18842f9f 100644 > --- a/include/linux/if_tap.h > +++ b/include/linux/if_tap.h > @@ -31,6 +31,7 @@ static inline struct ptr_ring *tap_get_ptr_ring(struct file *f) > #define MAX_TAP_QUEUES 256 > > struct tap_queue; > +struct tun_vnet_hash_container; > > struct tap_dev { > struct net_device *dev; > @@ -43,6 +44,7 @@ struct tap_dev { > int numqueues; > netdev_features_t tap_features; > int minor; > + struct tun_vnet_hash_container __rcu *vnet_hash; > > 
void (*update_features)(struct tap_dev *tap, netdev_features_t features); > void (*count_tx_dropped)(struct tap_dev *tap); > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h > index bb2b751d274acff931281a72e8b4b0c699b4e8af..cdd793f1c360ad5f63fcc4cbf67d845f5e2ccf6f 100644 > --- a/include/linux/skbuff.h > +++ b/include/linux/skbuff.h > @@ -4842,6 +4842,9 @@ enum skb_ext_id { > #endif > #if IS_ENABLED(CONFIG_MCTP_FLOWS) > SKB_EXT_MCTP, > +#endif > +#if IS_ENABLED(CONFIG_TUN) > + SKB_EXT_TUN_VNET_HASH, > #endif > SKB_EXT_NUM, /* must be last */ > }; > diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h > index 287cdc81c9390c289a30545aa7ed23d81c3329d3..4887f97500a870c7ef3c96a5837b2d0a5a225040 100644 > --- a/include/uapi/linux/if_tun.h > +++ b/include/uapi/linux/if_tun.h > @@ -62,6 +62,42 @@ > #define TUNSETCARRIER _IOW('T', 226, int) > #define TUNGETDEVNETNS _IO('T', 227) > > +/** > + * define TUNGETVNETHASHCAP - ioctl to get virtio_net hashing capability. > + * > + * The argument is a pointer to &struct tun_vnet_hash which will store the > + * maximal virtio_net hashing configuration. > + */ > +#define TUNGETVNETHASHCAP _IOR('T', 228, struct tun_vnet_hash) > + > +/** > + * define TUNSETVNETHASH - ioctl to configure virtio_net hashing > + * > + * The argument is a pointer to &struct tun_vnet_hash. > + * > + * The argument is a pointer to the compound of the following in order if > + * %TUN_VNET_HASH_RSS is set: > + * > + * 1. &struct tun_vnet_hash > + * 2. &struct tun_vnet_hash_rss > + * 3. Indirection table > + * 4. Key > + * > + * The %TUN_VNET_HASH_REPORT flag set with this ioctl will be effective only > + * after calling the %TUNSETVNETHDRSZ ioctl with a number greater than or equal > + * to the size of &struct virtio_net_hdr_v1_hash. > + * > + * The members added to the legacy header by %TUN_VNET_HASH_REPORT flag will > + * always be little-endian. > + * > + * This ioctl results in %EBADFD if the underlying device is deleted. 
It affects > + * all queues attached to the same device. > + * > + * This ioctl currently has no effect on XDP packets and packets with > + * queue_mapping set by TC. > + */ > +#define TUNSETVNETHASH _IOW('T', 229, struct tun_vnet_hash) > + > /* TUNSETIFF ifr flags */ > #define IFF_TUN 0x0001 > #define IFF_TAP 0x0002 > @@ -115,4 +151,43 @@ struct tun_filter { > __u8 addr[][ETH_ALEN]; > }; > > +/** > + * define TUN_VNET_HASH_REPORT - Request virtio_net hash reporting for vhost > + */ > +#define TUN_VNET_HASH_REPORT 0x0001 > + > +/** > + * define TUN_VNET_HASH_RSS - Request virtio_net RSS > + * > + * This is mutually exclusive with eBPF steering program. > + */ > +#define TUN_VNET_HASH_RSS 0x0002 > + > +/** > + * struct tun_vnet_hash - virtio_net hashing configuration > + * @flags: > + * Bitmask consists of %TUN_VNET_HASH_REPORT and %TUN_VNET_HASH_RSS > + * @pad: > + * Should be filled with zero before passing to %TUNSETVNETHASH > + * @types: > + * Bitmask of allowed hash types > + */ > +struct tun_vnet_hash { > + __u16 flags; > + __u8 pad[2]; > + __u32 types; > +}; > + > +/** > + * struct tun_vnet_hash_rss - virtio_net RSS configuration > + * @indirection_table_mask: > + * Bitmask to be applied to the indirection table index > + * @unclassified_queue: > + * The index of the queue to place unclassified packets in > + */ > +struct tun_vnet_hash_rss { > + __u16 indirection_table_mask; > + __u16 unclassified_queue; > +}; > + > #endif /* _UAPI__IF_TUN_H */ > diff --git a/net/core/skbuff.c b/net/core/skbuff.c > index 7b03b64fdcb276f68ce881d1d8da8e4c6b897efc..aa2a091b649f0c9d6e0196f34f345ba78b5498fb 100644 > --- a/net/core/skbuff.c > +++ b/net/core/skbuff.c > @@ -64,6 +64,7 @@ > #include <linux/mpls.h> > #include <linux/kcov.h> > #include <linux/iov_iter.h> > +#include <linux/virtio_net.h> > > #include <net/protocol.h> > #include <net/dst.h> > @@ -4969,6 +4970,9 @@ static const u8 skb_ext_type_len[] = { > #if IS_ENABLED(CONFIG_MCTP_FLOWS) > [SKB_EXT_MCTP] = 
SKB_EXT_CHUNKSIZEOF(struct mctp_flow), > #endif > +#if IS_ENABLED(CONFIG_TUN) > + [SKB_EXT_TUN_VNET_HASH] = SKB_EXT_CHUNKSIZEOF(struct virtio_net_hash), > +#endif > }; > > static __always_inline unsigned int skb_ext_total_length(void) > > -- > 2.48.1 >
On Fri, Mar 7, 2025 at 7:01 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > > Hash reporting > ============== > > Allow the guest to reuse the hash value to make receive steering > consistent between the host and guest, and to save hash computation. > > RSS > === > > RSS is a receive steering algorithm that can be negotiated to use with > virtio_net. Conventionally the hash calculation was done by the VMM. > However, computing the hash after the queue was chosen defeats the > purpose of RSS. > > Another approach is to use eBPF steering program. This approach has > another downside: it cannot report the calculated hash due to the > restrictive nature of eBPF steering program. > > Introduce the code to perform RSS to the kernel in order to overcome > these challenges. An alternative solution is to extend the eBPF steering > program so that it will be able to report to the userspace, but I didn't > opt for it because extending the current mechanism of eBPF steering > program as it stands is impractical, since it relies on legacy context rewriting, and > introducing kfunc-based eBPF will result in non-UAPI dependency while > the other relevant virtualization APIs such as KVM and vhost_net are > UAPIs. 
> > Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com> > Tested-by: Lei Yang <leiyang@redhat.com> > --- > Documentation/networking/tuntap.rst | 7 ++ > drivers/net/Kconfig | 1 + > drivers/net/tap.c | 68 ++++++++++++++- > drivers/net/tun.c | 98 +++++++++++++++++----- > drivers/net/tun_vnet.h | 159 ++++++++++++++++++++++++++++++++++-- > include/linux/if_tap.h | 2 + > include/linux/skbuff.h | 3 + > include/uapi/linux/if_tun.h | 75 +++++++++++++++++ > net/core/skbuff.c | 4 + > 9 files changed, 386 insertions(+), 31 deletions(-) > > diff --git a/Documentation/networking/tuntap.rst b/Documentation/networking/tuntap.rst > index 4d7087f727be5e37dfbf5066a9e9c872cc98898d..86b4ae8caa8ad062c1e558920be42ce0d4217465 100644 > --- a/Documentation/networking/tuntap.rst > +++ b/Documentation/networking/tuntap.rst > @@ -206,6 +206,13 @@ enable is true we enable it, otherwise we disable it:: > return ioctl(fd, TUNSETQUEUE, (void *)&ifr); > } > > +3.4 Reference > +------------- > + > +``linux/if_tun.h`` defines the interface described below: > + > +.. kernel-doc:: include/uapi/linux/if_tun.h > + > Universal TUN/TAP device driver Frequently Asked Question > ========================================================= > > diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig > index 1fd5acdc73c6af0e1a861867039c3624fc618e25..aecfd244dd83585fea2c5b815dcd787c58166c28 100644 > --- a/drivers/net/Kconfig > +++ b/drivers/net/Kconfig > @@ -395,6 +395,7 @@ config TUN > tristate "Universal TUN/TAP device driver support" > depends on INET > select CRC32 > + select SKB_EXTENSIONS > help > TUN/TAP provides packet reception and transmission for user space > programs. 
It can be viewed as a simple Point-to-Point or Ethernet > diff --git a/drivers/net/tap.c b/drivers/net/tap.c > index d4ece538f1b23789ca60caa6232690e4d0a4d14a..9428b63ec27e7f92e78a78afcb5e24383862c00d 100644 > --- a/drivers/net/tap.c > +++ b/drivers/net/tap.c > @@ -49,6 +49,10 @@ struct major_info { > struct list_head next; > }; > > +struct tap_skb_cb { > + struct virtio_net_hash hash; > +}; > + > #define GOODCOPY_LEN 128 > > static const struct proto_ops tap_socket_ops; > @@ -179,6 +183,22 @@ static void tap_put_queue(struct tap_queue *q) > sock_put(&q->sk); > } > > +static struct tap_skb_cb *tap_skb_cb(const struct sk_buff *skb) > +{ > + BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct tap_skb_cb)); > + return (struct tap_skb_cb *)skb->cb; > +} > + > +static struct virtio_net_hash *tap_add_hash(struct sk_buff *skb) > +{ > + return &tap_skb_cb(skb)->hash; > +} > + > +static const struct virtio_net_hash *tap_find_hash(const struct sk_buff *skb) > +{ > + return &tap_skb_cb(skb)->hash; > +} > + > /* > * Select a queue based on the rxq of the device on which this packet > * arrived. If the incoming device is not mq, calculate a flow hash > @@ -189,6 +209,7 @@ static void tap_put_queue(struct tap_queue *q) > static struct tap_queue *tap_get_queue(struct tap_dev *tap, > struct sk_buff *skb) > { > + struct flow_keys_basic keys_basic; > struct tap_queue *queue = NULL; > /* Access to taps array is protected by rcu, but access to numvtaps > * isn't. Below we use it to lookup a queue, but treat it as a hint > @@ -196,17 +217,47 @@ static struct tap_queue *tap_get_queue(struct tap_dev *tap, > * racing against queue removal. 
> */ > int numvtaps = READ_ONCE(tap->numvtaps); > + struct tun_vnet_hash_container *vnet_hash = rcu_dereference(tap->vnet_hash); > __u32 rxq; > > + *tap_skb_cb(skb) = (struct tap_skb_cb) { > + .hash = { .report = VIRTIO_NET_HASH_REPORT_NONE } > + }; > + > if (!numvtaps) > goto out; > > if (numvtaps == 1) > goto single; > > + if (vnet_hash) { > + if ((vnet_hash->common.flags & TUN_VNET_HASH_RSS)) { > + rxq = tun_vnet_rss_select_queue(numvtaps, vnet_hash, skb, tap_add_hash); > + queue = rcu_dereference(tap->taps[rxq]); > + goto out; > + } > + > + if (!skb->l4_hash && !skb->sw_hash) { > + struct flow_keys keys; > + > + skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); > + rxq = flow_hash_from_keys(&keys); > + keys_basic = (struct flow_keys_basic) { > + .control = keys.control, > + .basic = keys.basic > + }; > + } else { > + skb_flow_dissect_flow_keys_basic(NULL, skb, &keys_basic, NULL, 0, 0, 0, > + FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); > + rxq = skb->hash; > + } > + } else { > + rxq = skb_get_hash(skb); > + } > + > /* Check if we can use flow to select a queue */ > - rxq = skb_get_hash(skb); > if (rxq) { > + tun_vnet_hash_report(vnet_hash, skb, &keys_basic, rxq, tap_add_hash); > queue = rcu_dereference(tap->taps[rxq % numvtaps]); > goto out; > } > @@ -711,11 +762,12 @@ static ssize_t tap_put_user(struct tap_queue *q, > int total; > > if (q->flags & IFF_VNET_HDR) { > - struct virtio_net_hdr vnet_hdr; > + struct virtio_net_hdr_v1_hash vnet_hdr; > > vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz); > > - ret = tun_vnet_hdr_from_skb(q->flags, NULL, skb, &vnet_hdr); > + ret = tun_vnet_hdr_from_skb(vnet_hdr_len, q->flags, NULL, skb, > + tap_find_hash, &vnet_hdr); > if (ret) > return ret; > > @@ -992,6 +1044,16 @@ static long tap_ioctl(struct file *file, unsigned int cmd, > rtnl_unlock(); > return ret; > > + case TUNGETVNETHASHCAP: > + return tun_vnet_ioctl_gethashcap(argp); > + > + case TUNSETVNETHASH: > + rtnl_lock(); > + tap = rtnl_dereference(q->tap); 
> + ret = tap ? tun_vnet_ioctl_sethash(&tap->vnet_hash, true, argp) : -EBADFD; > + rtnl_unlock(); > + return ret; > + > case SIOCGIFHWADDR: > rtnl_lock(); > tap = tap_get_tap_dev(q); > diff --git a/drivers/net/tun.c b/drivers/net/tun.c > index d8f4d3e996a7a81d1f8b04635054081671a14f07..520013df416e93d3a50b46be9b53ae9ab410eab4 100644 > --- a/drivers/net/tun.c > +++ b/drivers/net/tun.c > @@ -209,6 +209,7 @@ struct tun_struct { > struct bpf_prog __rcu *xdp_prog; > struct tun_prog __rcu *steering_prog; > struct tun_prog __rcu *filter_prog; > + struct tun_vnet_hash_container __rcu *vnet_hash; > struct ethtool_link_ksettings link_ksettings; > /* init args */ > struct file *file; > @@ -451,20 +452,37 @@ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash) > e->rps_rxhash = hash; > } > > +static struct virtio_net_hash *tun_add_hash(struct sk_buff *skb) > +{ > + return skb_ext_add(skb, SKB_EXT_TUN_VNET_HASH); > +} > + > +static const struct virtio_net_hash *tun_find_hash(const struct sk_buff *skb) > +{ > + return skb_ext_find(skb, SKB_EXT_TUN_VNET_HASH); > +} > + > /* We try to identify a flow through its rxhash. The reason that > * we do not check rxq no. is because some cards(e.g 82599), chooses > * the rxq based on the txq where the last packet of the flow comes. As > * the userspace application move between processors, we may get a > * different rxq no. here. 
> */ > -static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb) > +static u16 tun_automq_select_queue(struct tun_struct *tun, > + const struct tun_vnet_hash_container *vnet_hash, > + struct sk_buff *skb) > { > + struct flow_keys keys; > + struct flow_keys_basic keys_basic; > struct tun_flow_entry *e; > u32 txq, numqueues; > > numqueues = READ_ONCE(tun->numqueues); > > - txq = __skb_get_hash_symmetric(skb); > + memset(&keys, 0, sizeof(keys)); > + skb_flow_dissect(skb, &flow_keys_dissector_symmetric, &keys, 0); > + > + txq = flow_hash_from_keys(&keys); > e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq); > if (e) { > tun_flow_save_rps_rxhash(e, txq); > @@ -473,6 +491,13 @@ static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb) > txq = reciprocal_scale(txq, numqueues); > } > > + keys_basic = (struct flow_keys_basic) { > + .control = keys.control, > + .basic = keys.basic > + }; > + tun_vnet_hash_report(vnet_hash, skb, &keys_basic, skb->l4_hash ? 
skb->hash : txq, > + tun_add_hash); > + > return txq; > } > > @@ -500,10 +525,17 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, > u16 ret; > > rcu_read_lock(); > - if (rcu_dereference(tun->steering_prog)) > + if (rcu_dereference(tun->steering_prog)) { > ret = tun_ebpf_select_queue(tun, skb); > - else > - ret = tun_automq_select_queue(tun, skb); > + } else { > + struct tun_vnet_hash_container *vnet_hash = rcu_dereference(tun->vnet_hash); > + > + if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS)) > + ret = tun_vnet_rss_select_queue(READ_ONCE(tun->numqueues), vnet_hash, > + skb, tun_add_hash); > + else > + ret = tun_automq_select_queue(tun, vnet_hash, skb); > + } > rcu_read_unlock(); > > return ret; > @@ -1987,7 +2019,7 @@ static ssize_t tun_put_user_xdp(struct tun_struct *tun, > ssize_t ret; > > if (tun->flags & IFF_VNET_HDR) { > - struct virtio_net_hdr gso = { 0 }; > + struct virtio_net_hdr_v1_hash gso = { 0 }; > > vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); > ret = tun_vnet_hdr_put(vnet_hdr_sz, iter, &gso); > @@ -2040,9 +2072,10 @@ static ssize_t tun_put_user(struct tun_struct *tun, > } > > if (vnet_hdr_sz) { > - struct virtio_net_hdr gso; > + struct virtio_net_hdr_v1_hash gso; > > - ret = tun_vnet_hdr_from_skb(tun->flags, tun->dev, skb, &gso); > + ret = tun_vnet_hdr_from_skb(vnet_hdr_sz, tun->flags, tun->dev, > + skb, tun_find_hash, &gso); > if (ret) > return ret; > > @@ -2223,6 +2256,7 @@ static void tun_free_netdev(struct net_device *dev) > security_tun_dev_free_security(tun->security); > __tun_set_ebpf(tun, &tun->steering_prog, NULL); > __tun_set_ebpf(tun, &tun->filter_prog, NULL); > + kfree_rcu_mightsleep(rcu_access_pointer(tun->vnet_hash)); > } > > static void tun_setup(struct net_device *dev) > @@ -2921,13 +2955,9 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr) > } > > static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p, > - void __user *data) > + int fd) > { > struct 
bpf_prog *prog; > - int fd; > - > - if (copy_from_user(&fd, data, sizeof(fd))) > - return -EFAULT; > > if (fd == -1) { > prog = NULL; > @@ -2993,7 +3023,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, > int ifindex; > int sndbuf; > int ret; > + int fd; > bool do_notify = false; > + struct tun_vnet_hash_container *vnet_hash; > > if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || > (_IOC_TYPE(cmd) == SOCK_IOC_TYPE && cmd != SIOCGSKNS)) { > @@ -3020,7 +3052,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, > rtnl_lock(); > > tun = tun_get(tfile); > - if (cmd == TUNSETIFF) { > + switch (cmd) { > + case TUNSETIFF: > ret = -EEXIST; > if (tun) > goto unlock; > @@ -3035,8 +3068,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, > if (copy_to_user(argp, &ifr, ifreq_len)) > ret = -EFAULT; > goto unlock; > - } > - if (cmd == TUNSETIFINDEX) { > + > + case TUNSETIFINDEX: > ret = -EPERM; > if (tun) > goto unlock; > @@ -3050,6 +3083,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, > ret = 0; > tfile->ifindex = ifindex; > goto unlock; > + > + case TUNGETVNETHASHCAP: > + ret = tun_vnet_ioctl_gethashcap(argp); > + goto unlock; > } > > ret = -EBADFD; > @@ -3230,11 +3267,27 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, > break; > > case TUNSETSTEERINGEBPF: > - ret = tun_set_ebpf(tun, &tun->steering_prog, argp); > + if (get_user(fd, (int __user *)argp)) { > + ret = -EFAULT; > + break; > + } > + > + vnet_hash = rtnl_dereference(tun->vnet_hash); > + if (fd != -1 && vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS)) { > + ret = -EBUSY; > + break; > + } > + > + ret = tun_set_ebpf(tun, &tun->steering_prog, fd); > break; > > case TUNSETFILTEREBPF: > - ret = tun_set_ebpf(tun, &tun->filter_prog, argp); > + if (get_user(fd, (int __user *)argp)) { > + ret = -EFAULT; > + break; > + } > + > + ret = tun_set_ebpf(tun, &tun->filter_prog, fd); > break; > > case TUNSETCARRIER: > @@ -3252,8 
+3305,15 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, > ret = open_related_ns(&net->ns, get_net_ns); > break; > > + case TUNSETVNETHASH: > + ret = tun_vnet_ioctl_sethash(&tun->vnet_hash, > + !rtnl_dereference(tun->steering_prog), > + argp); > + break; > + > default: > - ret = tun_vnet_ioctl(&tun->vnet_hdr_sz, &tun->flags, cmd, argp); > + ret = tun_vnet_ioctl(&tun->vnet_hdr_sz, &tun->flags, > + cmd, argp); > break; > } > > diff --git a/drivers/net/tun_vnet.h b/drivers/net/tun_vnet.h > index 58b9ac7a5fc4084c789fe94fe36b5f8631bf1fa4..8e7d51fb0b4742cef56e7c5ad778b156cc654bed 100644 > --- a/drivers/net/tun_vnet.h > +++ b/drivers/net/tun_vnet.h > @@ -6,6 +6,16 @@ > #define TUN_VNET_LE 0x80000000 > #define TUN_VNET_BE 0x40000000 > > +typedef struct virtio_net_hash *(*tun_vnet_hash_add)(struct sk_buff *); > +typedef const struct virtio_net_hash *(*tun_vnet_hash_find)(const struct sk_buff *); > + > +struct tun_vnet_hash_container { > + struct tun_vnet_hash common; I'd rename this as hash. > + struct tun_vnet_hash_rss rss; > + u32 rss_key[VIRTIO_NET_RSS_MAX_KEY_SIZE]; > + u16 rss_indirection_table[]; > +}; Besides the separate ioctl, I'd split this structure into rss and hash parts as well. > + > static inline bool tun_vnet_legacy_is_little_endian(unsigned int flags) > { > bool be = IS_ENABLED(CONFIG_TUN_VNET_CROSS_LE) && > @@ -107,6 +117,123 @@ static inline long tun_vnet_ioctl(int *vnet_hdr_sz, unsigned int *flags, > } > } > > +static inline long tun_vnet_ioctl_gethashcap(void __user *argp) > +{ > + static const struct tun_vnet_hash cap = { > + .flags = TUN_VNET_HASH_REPORT | TUN_VNET_HASH_RSS, > + .types = VIRTIO_NET_SUPPORTED_HASH_TYPES > + }; > + > + return copy_to_user(argp, &cap, sizeof(cap)) ? -EFAULT : 0; Let's have a consistent name for this and the uapi to be consistent with TUNSETIFF/TUNGETIFF. Probably TUNSETVNETHASH and tun_vnet_ioctl_gethash().
> +} > + > +static inline long tun_vnet_ioctl_sethash(struct tun_vnet_hash_container __rcu **hashp, > + bool can_rss, void __user *argp) So again, can_rss seems to be tricky. Looking at its caller, it tries to make eBPF and RSS mutually exclusive. I still don't understand why we need this. Allowing an eBPF program to override some of the paths seems to be common practice. What's more, we didn't try to (or even can't) make automq and eBPF mutually exclusive. So I still don't see what we gain from this, and it complicates the code and may lead to ambiguous uAPI/behaviour. > +{ > + struct tun_vnet_hash hash_buf; > + struct tun_vnet_hash_container *hash; > + > + if (copy_from_user(&hash_buf, argp, sizeof(hash_buf))) > + return -EFAULT; > + argp = (struct tun_vnet_hash __user *)argp + 1; > + > + if (hash_buf.flags & TUN_VNET_HASH_RSS) { > + struct tun_vnet_hash_rss rss; > + size_t indirection_table_size; > + size_t key_size; > + size_t size; > + > + if (!can_rss) > + return -EBUSY; > + > + if (copy_from_user(&rss, argp, sizeof(rss))) > + return -EFAULT; > + argp = (struct tun_vnet_hash_rss __user *)argp + 1; > + > + indirection_table_size = ((size_t)rss.indirection_table_mask + 1) * 2; > + key_size = virtio_net_hash_key_length(hash_buf.types); > + size = struct_size(hash, rss_indirection_table, > + (size_t)rss.indirection_table_mask + 1); > + > + hash = kmalloc(size, GFP_KERNEL); > + if (!hash) > + return -ENOMEM; > + > + if (copy_from_user(hash->rss_indirection_table, > + argp, indirection_table_size)) { > + kfree(hash); > + return -EFAULT; > + } > + argp = (u16 __user *)argp + rss.indirection_table_mask + 1; > + > + if (copy_from_user(hash->rss_key, argp, key_size)) { > + kfree(hash); > + return -EFAULT; > + } > + > + virtio_net_toeplitz_convert_key(hash->rss_key, key_size); > + hash->rss = rss; > + } else { > + hash = kmalloc(sizeof(hash->common), GFP_KERNEL); > + if (!hash) > + return -ENOMEM; Do we need to validate the hash here (at least against the types we
supported?) > + } > + > + hash->common = hash_buf; > + kfree_rcu_mightsleep(rcu_replace_pointer_rtnl(*hashp, hash)); I still don't understand the trick here. E.g. we use very simple primitives in synchronizing ebpf program through RCU in __tun_set_ebpf(). > + return 0; > +} > + > +static void tun_vnet_hash_report(const struct tun_vnet_hash_container *hash, > + struct sk_buff *skb, > + const struct flow_keys_basic *keys, > + u32 value, > + tun_vnet_hash_add vnet_hash_add) > +{ > + struct virtio_net_hash *report; > + > + if (!hash || !(hash->common.flags & TUN_VNET_HASH_REPORT)) > + return; > + > + report = vnet_hash_add(skb); > + if (!report) > + return; > + > + *report = (struct virtio_net_hash) { > + .report = virtio_net_hash_report(hash->common.types, keys), > + .value = value > + }; What's the advantage of using Designated Initializers here? Simple assignment can save two lines of code. > +} > + > +static u16 tun_vnet_rss_select_queue(u32 numqueues, > + const struct tun_vnet_hash_container *hash, > + struct sk_buff *skb, > + tun_vnet_hash_add vnet_hash_add) > +{ > + struct virtio_net_hash *report; > + struct virtio_net_hash ret; > + u16 txq, index; > + > + if (!numqueues) > + return 0; > + > + virtio_net_hash_rss(skb, hash->common.types, hash->rss_key, &ret); > + > + if (!ret.report) > + return hash->rss.unclassified_queue % numqueues; > + > + if (hash->common.flags & TUN_VNET_HASH_REPORT) { > + report = vnet_hash_add(skb); > + if (report) > + *report = ret; > + } Is there a chance that we can reach here without TUN_VNET_HASH_REPORT? If yes, it should be a bug. > + > + index = ret.value & hash->rss.indirection_table_mask; > + txq = READ_ONCE(hash->rss_indirection_table[index]); vnet_hash is accessed via rcu_dereference(), so I don't see any reason we need READ_ONCE here. Is this paired with something? If yes, let's add a comment here. If rss_indirection_table needs it, why doesn't indirection_table_mask need it?
> + > + return txq % numqueues; > +} > + > static inline int tun_vnet_hdr_get(int sz, unsigned int flags, > struct iov_iter *from, > struct virtio_net_hdr *hdr) > @@ -135,15 +262,17 @@ static inline int tun_vnet_hdr_get(int sz, unsigned int flags, > } > > static inline int tun_vnet_hdr_put(int sz, struct iov_iter *iter, > - const struct virtio_net_hdr *hdr) > + const struct virtio_net_hdr_v1_hash *hdr) > { To be more robust, we can tweak the function to accept a vnet_hdr_len parameter then we can avoid touching this every time when we need to extend vnet hdr in the future? > + int content_sz = MIN(sizeof(*hdr), sz); > + > if (unlikely(iov_iter_count(iter) < sz)) > return -EINVAL; > > - if (unlikely(copy_to_iter(hdr, sizeof(*hdr), iter) != sizeof(*hdr))) > + if (unlikely(copy_to_iter(hdr, content_sz, iter) != content_sz)) > return -EFAULT; > > - if (iov_iter_zero(sz - sizeof(*hdr), iter) != sz - sizeof(*hdr)) > + if (iov_iter_zero(sz - content_sz, iter) != sz - content_sz) > return -EFAULT; > > return 0; > @@ -155,26 +284,38 @@ static inline int tun_vnet_hdr_to_skb(unsigned int flags, struct sk_buff *skb, > return virtio_net_hdr_to_skb(skb, hdr, tun_vnet_is_little_endian(flags)); > } > > -static inline int tun_vnet_hdr_from_skb(unsigned int flags, > +static inline int tun_vnet_hdr_from_skb(int sz, unsigned int flags, > const struct net_device *dev, > const struct sk_buff *skb, > - struct virtio_net_hdr *hdr) > + tun_vnet_hash_find vnet_hash_find, > + struct virtio_net_hdr_v1_hash *hdr) > { > int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0; > + const struct virtio_net_hash *report = sz < sizeof(struct virtio_net_hdr_v1_hash) ? 
> + NULL : vnet_hash_find(skb); > + > + *hdr = (struct virtio_net_hdr_v1_hash) { > + .hash_report = VIRTIO_NET_HASH_REPORT_NONE > + }; > + > + if (report) { > + hdr->hash_value = cpu_to_le32(report->value); > + hdr->hash_report = cpu_to_le16(report->report); > + } > > - if (virtio_net_hdr_from_skb(skb, hdr, > + if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)hdr, > tun_vnet_is_little_endian(flags), true, > vlan_hlen)) { > struct skb_shared_info *sinfo = skb_shinfo(skb); > > if (net_ratelimit()) { > netdev_err(dev, "unexpected GSO type: 0x%x, gso_size %d, hdr_len %d\n", > - sinfo->gso_type, tun_vnet16_to_cpu(flags, hdr->gso_size), > - tun_vnet16_to_cpu(flags, hdr->hdr_len)); > + sinfo->gso_type, tun_vnet16_to_cpu(flags, hdr->hdr.gso_size), > + tun_vnet16_to_cpu(flags, hdr->hdr.hdr_len)); > print_hex_dump(KERN_ERR, "tun: ", > DUMP_PREFIX_NONE, > 16, 1, skb->head, > - min(tun_vnet16_to_cpu(flags, hdr->hdr_len), 64), true); > + min(tun_vnet16_to_cpu(flags, hdr->hdr.hdr_len), 64), true); > } > WARN_ON_ONCE(1); > return -EINVAL; > diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h > index 553552fa635c3e1e53d1a63c203d32e4c4fd5a4f..7334c46a3f101675a0d4e5a036987cfe18842f9f 100644 > --- a/include/linux/if_tap.h > +++ b/include/linux/if_tap.h > @@ -31,6 +31,7 @@ static inline struct ptr_ring *tap_get_ptr_ring(struct file *f) > #define MAX_TAP_QUEUES 256 > > struct tap_queue; > +struct tun_vnet_hash_container; > > struct tap_dev { > struct net_device *dev; > @@ -43,6 +44,7 @@ struct tap_dev { > int numqueues; > netdev_features_t tap_features; > int minor; > + struct tun_vnet_hash_container __rcu *vnet_hash; > > void (*update_features)(struct tap_dev *tap, netdev_features_t features); > void (*count_tx_dropped)(struct tap_dev *tap); > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h > index bb2b751d274acff931281a72e8b4b0c699b4e8af..cdd793f1c360ad5f63fcc4cbf67d845f5e2ccf6f 100644 > --- a/include/linux/skbuff.h > +++ b/include/linux/skbuff.h > @@ 
-4842,6 +4842,9 @@ enum skb_ext_id { > #endif > #if IS_ENABLED(CONFIG_MCTP_FLOWS) > SKB_EXT_MCTP, > +#endif > +#if IS_ENABLED(CONFIG_TUN) > + SKB_EXT_TUN_VNET_HASH, > #endif > SKB_EXT_NUM, /* must be last */ > }; > diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h > index 287cdc81c9390c289a30545aa7ed23d81c3329d3..4887f97500a870c7ef3c96a5837b2d0a5a225040 100644 > --- a/include/uapi/linux/if_tun.h > +++ b/include/uapi/linux/if_tun.h > @@ -62,6 +62,42 @@ > #define TUNSETCARRIER _IOW('T', 226, int) > #define TUNGETDEVNETNS _IO('T', 227) > > +/** > + * define TUNGETVNETHASHCAP - ioctl to get virtio_net hashing capability. > + * > + * The argument is a pointer to &struct tun_vnet_hash which will store the > + * maximal virtio_net hashing configuration. > + */ > +#define TUNGETVNETHASHCAP _IOR('T', 228, struct tun_vnet_hash) > + > +/** > + * define TUNSETVNETHASH - ioctl to configure virtio_net hashing > + * > + * The argument is a pointer to &struct tun_vnet_hash. > + * > + * The argument is a pointer to the compound of the following in order if > + * %TUN_VNET_HASH_RSS is set: > + * > + * 1. &struct tun_vnet_hash > + * 2. &struct tun_vnet_hash_rss > + * 3. Indirection table > + * 4. Key > + * > + * The %TUN_VNET_HASH_REPORT flag set with this ioctl will be effective only > + * after calling the %TUNSETVNETHDRSZ ioctl with a number greater than or equal > + * to the size of &struct virtio_net_hdr_v1_hash. So you already have a dependency check for the vnet hdr len. I'd still suggest splitting this into rss and hash as they are separate features. Then we can use separate data structures for them instead of a container struct. > + * > + * The members added to the legacy header by %TUN_VNET_HASH_REPORT flag will > + * always be little-endian. > + * > + * This ioctl results in %EBADFD if the underlying device is deleted. It affects > + * all queues attached to the same device.
> + * > + * This ioctl currently has no effect on XDP packets and packets with > + * queue_mapping set by TC. > + */ > +#define TUNSETVNETHASH _IOW('T', 229, struct tun_vnet_hash) > + > /* TUNSETIFF ifr flags */ > #define IFF_TUN 0x0001 > #define IFF_TAP 0x0002 > @@ -115,4 +151,43 @@ struct tun_filter { > __u8 addr[][ETH_ALEN]; > }; > > +/** > + * define TUN_VNET_HASH_REPORT - Request virtio_net hash reporting for vhost > + */ > +#define TUN_VNET_HASH_REPORT 0x0001 > + > +/** > + * define TUN_VNET_HASH_RSS - Request virtio_net RSS > + * > + * This is mutually exclusive with eBPF steering program. > + */ > +#define TUN_VNET_HASH_RSS 0x0002 > + > +/** > + * struct tun_vnet_hash - virtio_net hashing configuration > + * @flags: > + * Bitmask consists of %TUN_VNET_HASH_REPORT and %TUN_VNET_HASH_RSS > + * @pad: > + * Should be filled with zero before passing to %TUNSETVNETHASH > + * @types: > + * Bitmask of allowed hash types > + */ > +struct tun_vnet_hash { > + __u16 flags; > + __u8 pad[2]; > + __u32 types; > +}; Padding in the middle of the structure is not elegant. Any reason for this? And hash->types seems never used. 
> + > +/** > + * struct tun_vnet_hash_rss - virtio_net RSS configuration > + * @indirection_table_mask: > + * Bitmask to be applied to the indirection table index > + * @unclassified_queue: > + * The index of the queue to place unclassified packets in > + */ > +struct tun_vnet_hash_rss { > + __u16 indirection_table_mask; > + __u16 unclassified_queue; > +}; > + > #endif /* _UAPI__IF_TUN_H */ > diff --git a/net/core/skbuff.c b/net/core/skbuff.c > index 7b03b64fdcb276f68ce881d1d8da8e4c6b897efc..aa2a091b649f0c9d6e0196f34f345ba78b5498fb 100644 > --- a/net/core/skbuff.c > +++ b/net/core/skbuff.c > @@ -64,6 +64,7 @@ > #include <linux/mpls.h> > #include <linux/kcov.h> > #include <linux/iov_iter.h> > +#include <linux/virtio_net.h> > > #include <net/protocol.h> > #include <net/dst.h> > @@ -4969,6 +4970,9 @@ static const u8 skb_ext_type_len[] = { > #if IS_ENABLED(CONFIG_MCTP_FLOWS) > [SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow), > #endif > +#if IS_ENABLED(CONFIG_TUN) > + [SKB_EXT_TUN_VNET_HASH] = SKB_EXT_CHUNKSIZEOF(struct virtio_net_hash), > +#endif > }; > > static __always_inline unsigned int skb_ext_total_length(void) > > -- > 2.48.1 > Thanks
On Mon, Mar 10, 2025 at 11:55 AM Jason Wang <jasowang@redhat.com> wrote: > > On Fri, Mar 7, 2025 at 7:01 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > > > > Hash reporting > > ============== > > > > Allow the guest to reuse the hash value to make receive steering > > consistent between the host and guest, and to save hash computation. > > > > RSS > > === > > > > RSS is a receive steering algorithm that can be negotiated to use with > > virtio_net. Conventionally the hash calculation was done by the VMM. > > However, computing the hash after the queue was chosen defeats the > > purpose of RSS. > > > > Another approach is to use eBPF steering program. This approach has > > another downside: it cannot report the calculated hash due to the > > restrictive nature of eBPF steering program. > > > > Introduce the code to perform RSS to the kernel in order to overcome > > these challenges. An alternative solution is to extend the eBPF steering > > program so that it will be able to report to the userspace, but I didn't > > opt for it because the current mechanism of eBPF steering > > program relies on legacy context rewriting, and > > introducing kfunc-based eBPF will result in non-UAPI dependency while > > the other relevant virtualization APIs such as KVM and vhost_net are > > UAPIs. > > > > Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com> > > Tested-by: Lei Yang <leiyang@redhat.com> > > --- > > Documentation/networking/tuntap.rst | 7 ++ > > drivers/net/Kconfig | 1 + > > drivers/net/tap.c | 68 ++++++++++++++- > > drivers/net/tun.c | 98 +++++++++++++++++----- > > drivers/net/tun_vnet.h | 159 ++++++++++++++++++++++++++++++++++-- > > include/linux/if_tap.h | 2 + > > include/linux/skbuff.h | 3 + > > include/uapi/linux/if_tun.h | 75 +++++++++++++++++ > > net/core/skbuff.c | 4 + > > 9 files changed, 386 insertions(+), 31 deletions(-) [...]
> > + * > > + * The %TUN_VNET_HASH_REPORT flag set with this ioctl will be effective only > > + * after calling the %TUNSETVNETHDRSZ ioctl with a number greater than or equal > > + * to the size of &struct virtio_net_hdr_v1_hash. > > So you already have a dependency check for the vnet hdr len. I'd still > suggest splitting this into rss and hash as they are separate > features. Then we can use separate data structures for them instead of > a container struct. > Besides this, I think we still need to add new bits to TUNGETIFF to let userspace know about the new ability. Thanks
On 2025/03/09 4:32, Willem de Bruijn wrote: > Akihiko Odaki wrote: >> Hash reporting >> ============== >> >> Allow the guest to reuse the hash value to make receive steering >> consistent between the host and guest, and to save hash computation. >> >> RSS >> === >> >> RSS is a receive steering algorithm that can be negotiated to use with >> virtio_net. Conventionally the hash calculation was done by the VMM. >> However, computing the hash after the queue was chosen defeats the >> purpose of RSS. >> >> Another approach is to use eBPF steering program. This approach has >> another downside: it cannot report the calculated hash due to the >> restrictive nature of eBPF steering program. >> >> Introduce the code to perform RSS to the kernel in order to overcome >> these challenges. An alternative solution is to extend the eBPF steering >> program so that it will be able to report to the userspace, but I didn't >> opt for it because the current mechanism of eBPF steering >> program relies on legacy context rewriting, and >> introducing kfunc-based eBPF will result in non-UAPI dependency while >> the other relevant virtualization APIs such as KVM and vhost_net are >> UAPIs. >> >> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com> >> Tested-by: Lei Yang <leiyang@redhat.com> >> --- >> Documentation/networking/tuntap.rst | 7 ++ >> drivers/net/Kconfig | 1 + >> drivers/net/tap.c | 68 ++++++++++++++- >> drivers/net/tun.c | 98 +++++++++++++++++----- >> drivers/net/tun_vnet.h | 159 ++++++++++++++++++++++++++++++++++-- >> include/linux/if_tap.h | 2 + >> include/linux/skbuff.h | 3 + >> include/uapi/linux/if_tun.h | 75 +++++++++++++++++ >> net/core/skbuff.c | 4 + >> 9 files changed, 386 insertions(+), 31 deletions(-) > > This is arguably still doing too much in a single patch. > > Can you split tap from tun? Move ioctl control operations out to their > own patch?
I'll split changes for the code specific to TUN and TAP from the changes for the common code in the next version. > >> >> diff --git a/Documentation/networking/tuntap.rst b/Documentation/networking/tuntap.rst >> index 4d7087f727be5e37dfbf5066a9e9c872cc98898d..86b4ae8caa8ad062c1e558920be42ce0d4217465 100644 >> --- a/Documentation/networking/tuntap.rst >> +++ b/Documentation/networking/tuntap.rst >> @@ -206,6 +206,13 @@ enable is true we enable it, otherwise we disable it:: >> return ioctl(fd, TUNSETQUEUE, (void *)&ifr); >> } >> >> +3.4 Reference >> +------------- >> + >> +``linux/if_tun.h`` defines the interface described below: >> + >> +.. kernel-doc:: include/uapi/linux/if_tun.h >> + >> Universal TUN/TAP device driver Frequently Asked Question >> ========================================================= >> >> diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig >> index 1fd5acdc73c6af0e1a861867039c3624fc618e25..aecfd244dd83585fea2c5b815dcd787c58166c28 100644 >> --- a/drivers/net/Kconfig >> +++ b/drivers/net/Kconfig >> @@ -395,6 +395,7 @@ config TUN >> tristate "Universal TUN/TAP device driver support" >> depends on INET >> select CRC32 >> + select SKB_EXTENSIONS >> help >> TUN/TAP provides packet reception and transmission for user space >> programs. 
It can be viewed as a simple Point-to-Point or Ethernet >> diff --git a/drivers/net/tap.c b/drivers/net/tap.c >> index d4ece538f1b23789ca60caa6232690e4d0a4d14a..9428b63ec27e7f92e78a78afcb5e24383862c00d 100644 >> --- a/drivers/net/tap.c >> +++ b/drivers/net/tap.c >> @@ -49,6 +49,10 @@ struct major_info { >> struct list_head next; >> }; >> >> +struct tap_skb_cb { >> + struct virtio_net_hash hash; >> +}; >> + >> #define GOODCOPY_LEN 128 >> >> static const struct proto_ops tap_socket_ops; >> @@ -179,6 +183,22 @@ static void tap_put_queue(struct tap_queue *q) >> sock_put(&q->sk); >> } >> >> +static struct tap_skb_cb *tap_skb_cb(const struct sk_buff *skb) >> +{ >> + BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct tap_skb_cb)); >> + return (struct tap_skb_cb *)skb->cb; >> +} >> + >> +static struct virtio_net_hash *tap_add_hash(struct sk_buff *skb) >> +{ >> + return &tap_skb_cb(skb)->hash; >> +} >> + >> +static const struct virtio_net_hash *tap_find_hash(const struct sk_buff *skb) >> +{ >> + return &tap_skb_cb(skb)->hash; >> +} >> + > > These two helpers do the same thing. They have different signatures, which matter because they are passed as function pointers to common interfaces. > >> /* >> * Select a queue based on the rxq of the device on which this packet >> * arrived. If the incoming device is not mq, calculate a flow hash >> @@ -189,6 +209,7 @@ static void tap_put_queue(struct tap_queue *q) >> static struct tap_queue *tap_get_queue(struct tap_dev *tap, >> struct sk_buff *skb) >> { >> + struct flow_keys_basic keys_basic; >> struct tap_queue *queue = NULL; >> /* Access to taps array is protected by rcu, but access to numvtaps >> * isn't. Below we use it to lookup a queue, but treat it as a hint >> @@ -196,17 +217,47 @@ static struct tap_queue *tap_get_queue(struct tap_dev *tap, >> * racing against queue removal. 
>> */ >> int numvtaps = READ_ONCE(tap->numvtaps); >> + struct tun_vnet_hash_container *vnet_hash = rcu_dereference(tap->vnet_hash); >> __u32 rxq; >> >> + *tap_skb_cb(skb) = (struct tap_skb_cb) { >> + .hash = { .report = VIRTIO_NET_HASH_REPORT_NONE } >> + }; >> + >> if (!numvtaps) >> goto out; >> >> if (numvtaps == 1) >> goto single; >> >> + if (vnet_hash) { >> + if ((vnet_hash->common.flags & TUN_VNET_HASH_RSS)) { >> + rxq = tun_vnet_rss_select_queue(numvtaps, vnet_hash, skb, tap_add_hash); >> + queue = rcu_dereference(tap->taps[rxq]); >> + goto out; > > so tun_vnet_hash_report does not work in this case? tun_vnet_rss_select_queue() adds the hash to skb if necessary so it has tap_add_hash as its argument. > >> + } >> + >> + if (!skb->l4_hash && !skb->sw_hash) { >> + struct flow_keys keys; >> + >> + skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); >> + rxq = flow_hash_from_keys(&keys); >> + keys_basic = (struct flow_keys_basic) { >> + .control = keys.control, >> + .basic = keys.basic >> + }; >> + } else { >> + skb_flow_dissect_flow_keys_basic(NULL, skb, &keys_basic, NULL, 0, 0, 0, >> + FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); >> + rxq = skb->hash; >> + } >> + } else { >> + rxq = skb_get_hash(skb); >> + } >> + >> /* Check if we can use flow to select a queue */ >> - rxq = skb_get_hash(skb); >> if (rxq) { >> + tun_vnet_hash_report(vnet_hash, skb, &keys_basic, rxq, tap_add_hash); >> queue = rcu_dereference(tap->taps[rxq % numvtaps]); >> goto out; >> } >> @@ -711,11 +762,12 @@ static ssize_t tap_put_user(struct tap_queue *q, >> int total; >> >> if (q->flags & IFF_VNET_HDR) { >> - struct virtio_net_hdr vnet_hdr; >> + struct virtio_net_hdr_v1_hash vnet_hdr; >> >> vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz); >> >> - ret = tun_vnet_hdr_from_skb(q->flags, NULL, skb, &vnet_hdr); >> + ret = tun_vnet_hdr_from_skb(vnet_hdr_len, q->flags, NULL, skb, >> + tap_find_hash, &vnet_hdr); >> if (ret) >> return ret; >> >> @@ -992,6 +1044,16 @@ static long 
tap_ioctl(struct file *file, unsigned int cmd, >> rtnl_unlock(); >> return ret; >> >> + case TUNGETVNETHASHCAP: >> + return tun_vnet_ioctl_gethashcap(argp); >> + >> + case TUNSETVNETHASH: >> + rtnl_lock(); >> + tap = rtnl_dereference(q->tap); >> + ret = tap ? tun_vnet_ioctl_sethash(&tap->vnet_hash, true, argp) : -EBADFD; >> + rtnl_unlock(); >> + return ret; >> + >> case SIOCGIFHWADDR: >> rtnl_lock(); >> tap = tap_get_tap_dev(q); >> diff --git a/drivers/net/tun.c b/drivers/net/tun.c >> index d8f4d3e996a7a81d1f8b04635054081671a14f07..520013df416e93d3a50b46be9b53ae9ab410eab4 100644 >> --- a/drivers/net/tun.c >> +++ b/drivers/net/tun.c >> @@ -209,6 +209,7 @@ struct tun_struct { >> struct bpf_prog __rcu *xdp_prog; >> struct tun_prog __rcu *steering_prog; >> struct tun_prog __rcu *filter_prog; >> + struct tun_vnet_hash_container __rcu *vnet_hash; >> struct ethtool_link_ksettings link_ksettings; >> /* init args */ >> struct file *file; >> @@ -451,20 +452,37 @@ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash) >> e->rps_rxhash = hash; >> } >> >> +static struct virtio_net_hash *tun_add_hash(struct sk_buff *skb) >> +{ >> + return skb_ext_add(skb, SKB_EXT_TUN_VNET_HASH); >> +} >> + >> +static const struct virtio_net_hash *tun_find_hash(const struct sk_buff *skb) >> +{ >> + return skb_ext_find(skb, SKB_EXT_TUN_VNET_HASH); >> +} >> + >> /* We try to identify a flow through its rxhash. The reason that >> * we do not check rxq no. is because some cards(e.g 82599), chooses >> * the rxq based on the txq where the last packet of the flow comes. As >> * the userspace application move between processors, we may get a >> * different rxq no. here. 
>> */ >> -static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb) >> +static u16 tun_automq_select_queue(struct tun_struct *tun, >> + const struct tun_vnet_hash_container *vnet_hash, >> + struct sk_buff *skb) >> { >> + struct flow_keys keys; >> + struct flow_keys_basic keys_basic; >> struct tun_flow_entry *e; >> u32 txq, numqueues; >> >> numqueues = READ_ONCE(tun->numqueues); >> >> - txq = __skb_get_hash_symmetric(skb); >> + memset(&keys, 0, sizeof(keys)); >> + skb_flow_dissect(skb, &flow_keys_dissector_symmetric, &keys, 0); >> + >> + txq = flow_hash_from_keys(&keys); >> e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq); >> if (e) { >> tun_flow_save_rps_rxhash(e, txq); >> @@ -473,6 +491,13 @@ static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb) >> txq = reciprocal_scale(txq, numqueues); >> } >> >> + keys_basic = (struct flow_keys_basic) { >> + .control = keys.control, >> + .basic = keys.basic >> + }; >> + tun_vnet_hash_report(vnet_hash, skb, &keys_basic, skb->l4_hash ? 
skb->hash : txq, >> + tun_add_hash); >> + >> return txq; >> } >> >> @@ -500,10 +525,17 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, >> u16 ret; >> >> rcu_read_lock(); >> - if (rcu_dereference(tun->steering_prog)) >> + if (rcu_dereference(tun->steering_prog)) { >> ret = tun_ebpf_select_queue(tun, skb); >> - else >> - ret = tun_automq_select_queue(tun, skb); >> + } else { >> + struct tun_vnet_hash_container *vnet_hash = rcu_dereference(tun->vnet_hash); >> + >> + if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS)) >> + ret = tun_vnet_rss_select_queue(READ_ONCE(tun->numqueues), vnet_hash, >> + skb, tun_add_hash); >> + else >> + ret = tun_automq_select_queue(tun, vnet_hash, skb); >> + } >> rcu_read_unlock(); >> >> return ret; >> @@ -1987,7 +2019,7 @@ static ssize_t tun_put_user_xdp(struct tun_struct *tun, >> ssize_t ret; >> >> if (tun->flags & IFF_VNET_HDR) { >> - struct virtio_net_hdr gso = { 0 }; >> + struct virtio_net_hdr_v1_hash gso = { 0 }; >> >> vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); >> ret = tun_vnet_hdr_put(vnet_hdr_sz, iter, &gso); >> @@ -2040,9 +2072,10 @@ static ssize_t tun_put_user(struct tun_struct *tun, >> } >> >> if (vnet_hdr_sz) { >> - struct virtio_net_hdr gso; >> + struct virtio_net_hdr_v1_hash gso; >> >> - ret = tun_vnet_hdr_from_skb(tun->flags, tun->dev, skb, &gso); >> + ret = tun_vnet_hdr_from_skb(vnet_hdr_sz, tun->flags, tun->dev, >> + skb, tun_find_hash, &gso); >> if (ret) >> return ret; >> >> @@ -2223,6 +2256,7 @@ static void tun_free_netdev(struct net_device *dev) >> security_tun_dev_free_security(tun->security); >> __tun_set_ebpf(tun, &tun->steering_prog, NULL); >> __tun_set_ebpf(tun, &tun->filter_prog, NULL); >> + kfree_rcu_mightsleep(rcu_access_pointer(tun->vnet_hash)); >> } >> >> static void tun_setup(struct net_device *dev) >> @@ -2921,13 +2955,9 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr) >> } >> >> static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu 
**prog_p, >> - void __user *data) >> + int fd) >> { >> struct bpf_prog *prog; >> - int fd; >> - >> - if (copy_from_user(&fd, data, sizeof(fd))) >> - return -EFAULT; >> >> if (fd == -1) { >> prog = NULL; >> @@ -2993,7 +3023,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >> int ifindex; >> int sndbuf; >> int ret; >> + int fd; >> bool do_notify = false; >> + struct tun_vnet_hash_container *vnet_hash; >> >> if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || >> (_IOC_TYPE(cmd) == SOCK_IOC_TYPE && cmd != SIOCGSKNS)) { >> @@ -3020,7 +3052,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >> rtnl_lock(); >> >> tun = tun_get(tfile); >> - if (cmd == TUNSETIFF) { >> + switch (cmd) { >> + case TUNSETIFF: >> ret = -EEXIST; >> if (tun) >> goto unlock; >> @@ -3035,8 +3068,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >> if (copy_to_user(argp, &ifr, ifreq_len)) >> ret = -EFAULT; >> goto unlock; >> - } >> - if (cmd == TUNSETIFINDEX) { >> + >> + case TUNSETIFINDEX: >> ret = -EPERM; >> if (tun) >> goto unlock; >> @@ -3050,6 +3083,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >> ret = 0; >> tfile->ifindex = ifindex; >> goto unlock; >> + >> + case TUNGETVNETHASHCAP: >> + ret = tun_vnet_ioctl_gethashcap(argp); >> + goto unlock; >> } >> >> ret = -EBADFD; >> @@ -3230,11 +3267,27 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >> break; >> >> case TUNSETSTEERINGEBPF: >> - ret = tun_set_ebpf(tun, &tun->steering_prog, argp); >> + if (get_user(fd, (int __user *)argp)) { >> + ret = -EFAULT; >> + break; >> + } >> + >> + vnet_hash = rtnl_dereference(tun->vnet_hash); >> + if (fd != -1 && vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS)) { >> + ret = -EBUSY; >> + break; >> + } >> + >> + ret = tun_set_ebpf(tun, &tun->steering_prog, fd); >> break; >> >> case TUNSETFILTEREBPF: >> - ret = tun_set_ebpf(tun, &tun->filter_prog, argp); >> + if (get_user(fd, (int __user *)argp)) { >> + ret = 
-EFAULT; >> + break; >> + } >> + >> + ret = tun_set_ebpf(tun, &tun->filter_prog, fd); >> break; >> >> case TUNSETCARRIER: >> @@ -3252,8 +3305,15 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >> ret = open_related_ns(&net->ns, get_net_ns); >> break; >> >> + case TUNSETVNETHASH: >> + ret = tun_vnet_ioctl_sethash(&tun->vnet_hash, >> + !rtnl_dereference(tun->steering_prog), >> + argp); >> + break; >> + >> default: >> - ret = tun_vnet_ioctl(&tun->vnet_hdr_sz, &tun->flags, cmd, argp); >> + ret = tun_vnet_ioctl(&tun->vnet_hdr_sz, &tun->flags, >> + cmd, argp); > > no need to touch this > >> break; >> } >> >> diff --git a/drivers/net/tun_vnet.h b/drivers/net/tun_vnet.h >> index 58b9ac7a5fc4084c789fe94fe36b5f8631bf1fa4..8e7d51fb0b4742cef56e7c5ad778b156cc654bed 100644 >> --- a/drivers/net/tun_vnet.h >> +++ b/drivers/net/tun_vnet.h >> @@ -6,6 +6,16 @@ >> #define TUN_VNET_LE 0x80000000 >> #define TUN_VNET_BE 0x40000000 >> >> +typedef struct virtio_net_hash *(*tun_vnet_hash_add)(struct sk_buff *); >> +typedef const struct virtio_net_hash *(*tun_vnet_hash_find)(const struct sk_buff *); >> + >> +struct tun_vnet_hash_container { >> + struct tun_vnet_hash common; >> + struct tun_vnet_hash_rss rss; >> + u32 rss_key[VIRTIO_NET_RSS_MAX_KEY_SIZE]; >> + u16 rss_indirection_table[]; >> +}; >> + >> static inline bool tun_vnet_legacy_is_little_endian(unsigned int flags) >> { >> bool be = IS_ENABLED(CONFIG_TUN_VNET_CROSS_LE) && >> @@ -107,6 +117,123 @@ static inline long tun_vnet_ioctl(int *vnet_hdr_sz, unsigned int *flags, >> } >> } >> >> +static inline long tun_vnet_ioctl_gethashcap(void __user *argp) >> +{ >> + static const struct tun_vnet_hash cap = { >> + .flags = TUN_VNET_HASH_REPORT | TUN_VNET_HASH_RSS, >> + .types = VIRTIO_NET_SUPPORTED_HASH_TYPES >> + }; >> + >> + return copy_to_user(argp, &cap, sizeof(cap)) ? 
-EFAULT : 0; >> +} >> + >> +static inline long tun_vnet_ioctl_sethash(struct tun_vnet_hash_container __rcu **hashp, >> + bool can_rss, void __user *argp) >> +{ >> + struct tun_vnet_hash hash_buf; >> + struct tun_vnet_hash_container *hash; >> + >> + if (copy_from_user(&hash_buf, argp, sizeof(hash_buf))) >> + return -EFAULT; >> + argp = (struct tun_vnet_hash __user *)argp + 1; >> + >> + if (hash_buf.flags & TUN_VNET_HASH_RSS) { >> + struct tun_vnet_hash_rss rss; >> + size_t indirection_table_size; >> + size_t key_size; >> + size_t size; >> + >> + if (!can_rss) >> + return -EBUSY; >> + >> + if (copy_from_user(&rss, argp, sizeof(rss))) >> + return -EFAULT; >> + argp = (struct tun_vnet_hash_rss __user *)argp + 1; >> + >> + indirection_table_size = ((size_t)rss.indirection_table_mask + 1) * 2; >> + key_size = virtio_net_hash_key_length(hash_buf.types); >> + size = struct_size(hash, rss_indirection_table, >> + (size_t)rss.indirection_table_mask + 1); >> + >> + hash = kmalloc(size, GFP_KERNEL); >> + if (!hash) >> + return -ENOMEM; >> + >> + if (copy_from_user(hash->rss_indirection_table, >> + argp, indirection_table_size)) { >> + kfree(hash); >> + return -EFAULT; >> + } >> + argp = (u16 __user *)argp + rss.indirection_table_mask + 1; >> + >> + if (copy_from_user(hash->rss_key, argp, key_size)) { >> + kfree(hash); >> + return -EFAULT; >> + } >> + >> + virtio_net_toeplitz_convert_key(hash->rss_key, key_size); >> + hash->rss = rss; >> + } else { >> + hash = kmalloc(sizeof(hash->common), GFP_KERNEL); >> + if (!hash) >> + return -ENOMEM; >> + } >> + >> + hash->common = hash_buf; >> + kfree_rcu_mightsleep(rcu_replace_pointer_rtnl(*hashp, hash)); >> + return 0; >> +} >> + >> +static void tun_vnet_hash_report(const struct tun_vnet_hash_container *hash, >> + struct sk_buff *skb, >> + const struct flow_keys_basic *keys, >> + u32 value, >> + tun_vnet_hash_add vnet_hash_add) >> +{ >> + struct virtio_net_hash *report; >> + >> + if (!hash || !(hash->common.flags & TUN_VNET_HASH_REPORT)) 
>> + return; >> + >> + report = vnet_hash_add(skb); >> + if (!report) >> + return; >> + >> + *report = (struct virtio_net_hash) { >> + .report = virtio_net_hash_report(hash->common.types, keys), >> + .value = value >> + }; >> +} >> + >> +static u16 tun_vnet_rss_select_queue(u32 numqueues, >> + const struct tun_vnet_hash_container *hash, >> + struct sk_buff *skb, >> + tun_vnet_hash_add vnet_hash_add) >> +{ >> + struct virtio_net_hash *report; >> + struct virtio_net_hash ret; >> + u16 txq, index; >> + >> + if (!numqueues) >> + return 0; >> + >> + virtio_net_hash_rss(skb, hash->common.types, hash->rss_key, &ret); >> + >> + if (!ret.report) >> + return hash->rss.unclassified_queue % numqueues; >> + >> + if (hash->common.flags & TUN_VNET_HASH_REPORT) { >> + report = vnet_hash_add(skb); >> + if (report) >> + *report = ret; >> + } >> + >> + index = ret.value & hash->rss.indirection_table_mask; >> + txq = READ_ONCE(hash->rss_indirection_table[index]); >> + >> + return txq % numqueues; >> +} >> + >> static inline int tun_vnet_hdr_get(int sz, unsigned int flags, >> struct iov_iter *from, >> struct virtio_net_hdr *hdr) >> @@ -135,15 +262,17 @@ static inline int tun_vnet_hdr_get(int sz, unsigned int flags, >> } >> >> static inline int tun_vnet_hdr_put(int sz, struct iov_iter *iter, >> - const struct virtio_net_hdr *hdr) >> + const struct virtio_net_hdr_v1_hash *hdr) >> { >> + int content_sz = MIN(sizeof(*hdr), sz); >> + >> if (unlikely(iov_iter_count(iter) < sz)) >> return -EINVAL; >> >> - if (unlikely(copy_to_iter(hdr, sizeof(*hdr), iter) != sizeof(*hdr))) >> + if (unlikely(copy_to_iter(hdr, content_sz, iter) != content_sz)) >> return -EFAULT; >> >> - if (iov_iter_zero(sz - sizeof(*hdr), iter) != sz - sizeof(*hdr)) >> + if (iov_iter_zero(sz - content_sz, iter) != sz - content_sz) >> return -EFAULT; >> >> return 0; >> @@ -155,26 +284,38 @@ static inline int tun_vnet_hdr_to_skb(unsigned int flags, struct sk_buff *skb, >> return virtio_net_hdr_to_skb(skb, hdr, 
tun_vnet_is_little_endian(flags)); >> } >> >> -static inline int tun_vnet_hdr_from_skb(unsigned int flags, >> +static inline int tun_vnet_hdr_from_skb(int sz, unsigned int flags, >> const struct net_device *dev, >> const struct sk_buff *skb, >> - struct virtio_net_hdr *hdr) >> + tun_vnet_hash_find vnet_hash_find, >> + struct virtio_net_hdr_v1_hash *hdr) >> { >> int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0; >> + const struct virtio_net_hash *report = sz < sizeof(struct virtio_net_hdr_v1_hash) ? >> + NULL : vnet_hash_find(skb); >> + >> + *hdr = (struct virtio_net_hdr_v1_hash) { >> + .hash_report = VIRTIO_NET_HASH_REPORT_NONE >> + }; >> + >> + if (report) { >> + hdr->hash_value = cpu_to_le32(report->value); >> + hdr->hash_report = cpu_to_le16(report->report); >> + } >> >> - if (virtio_net_hdr_from_skb(skb, hdr, >> + if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)hdr, >> tun_vnet_is_little_endian(flags), true, >> vlan_hlen)) { >> struct skb_shared_info *sinfo = skb_shinfo(skb); >> >> if (net_ratelimit()) { >> netdev_err(dev, "unexpected GSO type: 0x%x, gso_size %d, hdr_len %d\n", >> - sinfo->gso_type, tun_vnet16_to_cpu(flags, hdr->gso_size), >> - tun_vnet16_to_cpu(flags, hdr->hdr_len)); >> + sinfo->gso_type, tun_vnet16_to_cpu(flags, hdr->hdr.gso_size), >> + tun_vnet16_to_cpu(flags, hdr->hdr.hdr_len)); >> print_hex_dump(KERN_ERR, "tun: ", >> DUMP_PREFIX_NONE, >> 16, 1, skb->head, >> - min(tun_vnet16_to_cpu(flags, hdr->hdr_len), 64), true); >> + min(tun_vnet16_to_cpu(flags, hdr->hdr.hdr_len), 64), true); >> } >> WARN_ON_ONCE(1); >> return -EINVAL; >> diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h >> index 553552fa635c3e1e53d1a63c203d32e4c4fd5a4f..7334c46a3f101675a0d4e5a036987cfe18842f9f 100644 >> --- a/include/linux/if_tap.h >> +++ b/include/linux/if_tap.h >> @@ -31,6 +31,7 @@ static inline struct ptr_ring *tap_get_ptr_ring(struct file *f) >> #define MAX_TAP_QUEUES 256 >> >> struct tap_queue; >> +struct tun_vnet_hash_container; >> >> 
struct tap_dev { >> struct net_device *dev; >> @@ -43,6 +44,7 @@ struct tap_dev { >> int numqueues; >> netdev_features_t tap_features; >> int minor; >> + struct tun_vnet_hash_container __rcu *vnet_hash; >> >> void (*update_features)(struct tap_dev *tap, netdev_features_t features); >> void (*count_tx_dropped)(struct tap_dev *tap); >> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h >> index bb2b751d274acff931281a72e8b4b0c699b4e8af..cdd793f1c360ad5f63fcc4cbf67d845f5e2ccf6f 100644 >> --- a/include/linux/skbuff.h >> +++ b/include/linux/skbuff.h >> @@ -4842,6 +4842,9 @@ enum skb_ext_id { >> #endif >> #if IS_ENABLED(CONFIG_MCTP_FLOWS) >> SKB_EXT_MCTP, >> +#endif >> +#if IS_ENABLED(CONFIG_TUN) >> + SKB_EXT_TUN_VNET_HASH, >> #endif >> SKB_EXT_NUM, /* must be last */ >> }; >> diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h >> index 287cdc81c9390c289a30545aa7ed23d81c3329d3..4887f97500a870c7ef3c96a5837b2d0a5a225040 100644 >> --- a/include/uapi/linux/if_tun.h >> +++ b/include/uapi/linux/if_tun.h >> @@ -62,6 +62,42 @@ >> #define TUNSETCARRIER _IOW('T', 226, int) >> #define TUNGETDEVNETNS _IO('T', 227) >> >> +/** >> + * define TUNGETVNETHASHCAP - ioctl to get virtio_net hashing capability. >> + * >> + * The argument is a pointer to &struct tun_vnet_hash which will store the >> + * maximal virtio_net hashing configuration. >> + */ >> +#define TUNGETVNETHASHCAP _IOR('T', 228, struct tun_vnet_hash) >> + >> +/** >> + * define TUNSETVNETHASH - ioctl to configure virtio_net hashing >> + * >> + * The argument is a pointer to &struct tun_vnet_hash. >> + * >> + * The argument is a pointer to the compound of the following in order if >> + * %TUN_VNET_HASH_RSS is set: >> + * >> + * 1. &struct tun_vnet_hash >> + * 2. &struct tun_vnet_hash_rss >> + * 3. Indirection table >> + * 4. 
Key >> + * >> + * The %TUN_VNET_HASH_REPORT flag set with this ioctl will be effective only >> + * after calling the %TUNSETVNETHDRSZ ioctl with a number greater than or equal >> + * to the size of &struct virtio_net_hdr_v1_hash. >> + * >> + * The members added to the legacy header by %TUN_VNET_HASH_REPORT flag will >> + * always be little-endian. >> + * >> + * This ioctl results in %EBADFD if the underlying device is deleted. It affects >> + * all queues attached to the same device. >> + * >> + * This ioctl currently has no effect on XDP packets and packets with >> + * queue_mapping set by TC. >> + */ >> +#define TUNSETVNETHASH _IOW('T', 229, struct tun_vnet_hash) >> + >> /* TUNSETIFF ifr flags */ >> #define IFF_TUN 0x0001 >> #define IFF_TAP 0x0002 >> @@ -115,4 +151,43 @@ struct tun_filter { >> __u8 addr[][ETH_ALEN]; >> }; >> >> +/** >> + * define TUN_VNET_HASH_REPORT - Request virtio_net hash reporting for vhost >> + */ >> +#define TUN_VNET_HASH_REPORT 0x0001 >> + >> +/** >> + * define TUN_VNET_HASH_RSS - Request virtio_net RSS >> + * >> + * This is mutually exclusive with eBPF steering program. 
>> + */ >> +#define TUN_VNET_HASH_RSS 0x0002 >> + >> +/** >> + * struct tun_vnet_hash - virtio_net hashing configuration >> + * @flags: >> + * Bitmask consists of %TUN_VNET_HASH_REPORT and %TUN_VNET_HASH_RSS >> + * @pad: >> + * Should be filled with zero before passing to %TUNSETVNETHASH >> + * @types: >> + * Bitmask of allowed hash types >> + */ >> +struct tun_vnet_hash { >> + __u16 flags; >> + __u8 pad[2]; >> + __u32 types; >> +}; >> + >> +/** >> + * struct tun_vnet_hash_rss - virtio_net RSS configuration >> + * @indirection_table_mask: >> + * Bitmask to be applied to the indirection table index >> + * @unclassified_queue: >> + * The index of the queue to place unclassified packets in >> + */ >> +struct tun_vnet_hash_rss { >> + __u16 indirection_table_mask; >> + __u16 unclassified_queue; >> +}; >> + >> #endif /* _UAPI__IF_TUN_H */ >> diff --git a/net/core/skbuff.c b/net/core/skbuff.c >> index 7b03b64fdcb276f68ce881d1d8da8e4c6b897efc..aa2a091b649f0c9d6e0196f34f345ba78b5498fb 100644 >> --- a/net/core/skbuff.c >> +++ b/net/core/skbuff.c >> @@ -64,6 +64,7 @@ >> #include <linux/mpls.h> >> #include <linux/kcov.h> >> #include <linux/iov_iter.h> >> +#include <linux/virtio_net.h> >> >> #include <net/protocol.h> >> #include <net/dst.h> >> @@ -4969,6 +4970,9 @@ static const u8 skb_ext_type_len[] = { >> #if IS_ENABLED(CONFIG_MCTP_FLOWS) >> [SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow), >> #endif >> +#if IS_ENABLED(CONFIG_TUN) >> + [SKB_EXT_TUN_VNET_HASH] = SKB_EXT_CHUNKSIZEOF(struct virtio_net_hash), >> +#endif >> }; >> >> static __always_inline unsigned int skb_ext_total_length(void) >> >> -- >> 2.48.1 >> > >
On 2025/03/10 12:55, Jason Wang wrote: > On Fri, Mar 7, 2025 at 7:01 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >> >> Hash reporting >> ============== >> >> Allow the guest to reuse the hash value to make receive steering >> consistent between the host and guest, and to save hash computation. >> >> RSS >> === >> >> RSS is a receive steering algorithm that can be negotiated to use with >> virtio_net. Conventionally the hash calculation was done by the VMM. >> However, computing the hash after the queue was chosen defeats the >> purpose of RSS. >> >> Another approach is to use eBPF steering program. This approach has >> another downside: it cannot report the calculated hash due to the >> restrictive nature of eBPF steering program. >> >> Introduce the code to perform RSS to the kernel in order to overcome >> thse challenges. An alternative solution is to extend the eBPF steering >> program so that it will be able to report to the userspace, but I didn't >> opt for it because extending the current mechanism of eBPF steering >> program as is because it relies on legacy context rewriting, and >> introducing kfunc-based eBPF will result in non-UAPI dependency while >> the other relevant virtualization APIs such as KVM and vhost_net are >> UAPIs. 
>> >> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com> >> Tested-by: Lei Yang <leiyang@redhat.com> >> --- >> Documentation/networking/tuntap.rst | 7 ++ >> drivers/net/Kconfig | 1 + >> drivers/net/tap.c | 68 ++++++++++++++- >> drivers/net/tun.c | 98 +++++++++++++++++----- >> drivers/net/tun_vnet.h | 159 ++++++++++++++++++++++++++++++++++-- >> include/linux/if_tap.h | 2 + >> include/linux/skbuff.h | 3 + >> include/uapi/linux/if_tun.h | 75 +++++++++++++++++ >> net/core/skbuff.c | 4 + >> 9 files changed, 386 insertions(+), 31 deletions(-) >> >> diff --git a/Documentation/networking/tuntap.rst b/Documentation/networking/tuntap.rst >> index 4d7087f727be5e37dfbf5066a9e9c872cc98898d..86b4ae8caa8ad062c1e558920be42ce0d4217465 100644 >> --- a/Documentation/networking/tuntap.rst >> +++ b/Documentation/networking/tuntap.rst >> @@ -206,6 +206,13 @@ enable is true we enable it, otherwise we disable it:: >> return ioctl(fd, TUNSETQUEUE, (void *)&ifr); >> } >> >> +3.4 Reference >> +------------- >> + >> +``linux/if_tun.h`` defines the interface described below: >> + >> +.. kernel-doc:: include/uapi/linux/if_tun.h >> + >> Universal TUN/TAP device driver Frequently Asked Question >> ========================================================= >> >> diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig >> index 1fd5acdc73c6af0e1a861867039c3624fc618e25..aecfd244dd83585fea2c5b815dcd787c58166c28 100644 >> --- a/drivers/net/Kconfig >> +++ b/drivers/net/Kconfig >> @@ -395,6 +395,7 @@ config TUN >> tristate "Universal TUN/TAP device driver support" >> depends on INET >> select CRC32 >> + select SKB_EXTENSIONS >> help >> TUN/TAP provides packet reception and transmission for user space >> programs. 
It can be viewed as a simple Point-to-Point or Ethernet >> diff --git a/drivers/net/tap.c b/drivers/net/tap.c >> index d4ece538f1b23789ca60caa6232690e4d0a4d14a..9428b63ec27e7f92e78a78afcb5e24383862c00d 100644 >> --- a/drivers/net/tap.c >> +++ b/drivers/net/tap.c >> @@ -49,6 +49,10 @@ struct major_info { >> struct list_head next; >> }; >> >> +struct tap_skb_cb { >> + struct virtio_net_hash hash; >> +}; >> + >> #define GOODCOPY_LEN 128 >> >> static const struct proto_ops tap_socket_ops; >> @@ -179,6 +183,22 @@ static void tap_put_queue(struct tap_queue *q) >> sock_put(&q->sk); >> } >> >> +static struct tap_skb_cb *tap_skb_cb(const struct sk_buff *skb) >> +{ >> + BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct tap_skb_cb)); >> + return (struct tap_skb_cb *)skb->cb; >> +} >> + >> +static struct virtio_net_hash *tap_add_hash(struct sk_buff *skb) >> +{ >> + return &tap_skb_cb(skb)->hash; >> +} >> + >> +static const struct virtio_net_hash *tap_find_hash(const struct sk_buff *skb) >> +{ >> + return &tap_skb_cb(skb)->hash; >> +} >> + >> /* >> * Select a queue based on the rxq of the device on which this packet >> * arrived. If the incoming device is not mq, calculate a flow hash >> @@ -189,6 +209,7 @@ static void tap_put_queue(struct tap_queue *q) >> static struct tap_queue *tap_get_queue(struct tap_dev *tap, >> struct sk_buff *skb) >> { >> + struct flow_keys_basic keys_basic; >> struct tap_queue *queue = NULL; >> /* Access to taps array is protected by rcu, but access to numvtaps >> * isn't. Below we use it to lookup a queue, but treat it as a hint >> @@ -196,17 +217,47 @@ static struct tap_queue *tap_get_queue(struct tap_dev *tap, >> * racing against queue removal. 
>> */ >> int numvtaps = READ_ONCE(tap->numvtaps); >> + struct tun_vnet_hash_container *vnet_hash = rcu_dereference(tap->vnet_hash); >> __u32 rxq; >> >> + *tap_skb_cb(skb) = (struct tap_skb_cb) { >> + .hash = { .report = VIRTIO_NET_HASH_REPORT_NONE } >> + }; >> + >> if (!numvtaps) >> goto out; >> >> if (numvtaps == 1) >> goto single; >> >> + if (vnet_hash) { >> + if ((vnet_hash->common.flags & TUN_VNET_HASH_RSS)) { >> + rxq = tun_vnet_rss_select_queue(numvtaps, vnet_hash, skb, tap_add_hash); >> + queue = rcu_dereference(tap->taps[rxq]); >> + goto out; >> + } >> + >> + if (!skb->l4_hash && !skb->sw_hash) { >> + struct flow_keys keys; >> + >> + skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); >> + rxq = flow_hash_from_keys(&keys); >> + keys_basic = (struct flow_keys_basic) { >> + .control = keys.control, >> + .basic = keys.basic >> + }; >> + } else { >> + skb_flow_dissect_flow_keys_basic(NULL, skb, &keys_basic, NULL, 0, 0, 0, >> + FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); >> + rxq = skb->hash; >> + } >> + } else { >> + rxq = skb_get_hash(skb); >> + } >> + >> /* Check if we can use flow to select a queue */ >> - rxq = skb_get_hash(skb); >> if (rxq) { >> + tun_vnet_hash_report(vnet_hash, skb, &keys_basic, rxq, tap_add_hash); >> queue = rcu_dereference(tap->taps[rxq % numvtaps]); >> goto out; >> } >> @@ -711,11 +762,12 @@ static ssize_t tap_put_user(struct tap_queue *q, >> int total; >> >> if (q->flags & IFF_VNET_HDR) { >> - struct virtio_net_hdr vnet_hdr; >> + struct virtio_net_hdr_v1_hash vnet_hdr; >> >> vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz); >> >> - ret = tun_vnet_hdr_from_skb(q->flags, NULL, skb, &vnet_hdr); >> + ret = tun_vnet_hdr_from_skb(vnet_hdr_len, q->flags, NULL, skb, >> + tap_find_hash, &vnet_hdr); >> if (ret) >> return ret; >> >> @@ -992,6 +1044,16 @@ static long tap_ioctl(struct file *file, unsigned int cmd, >> rtnl_unlock(); >> return ret; >> >> + case TUNGETVNETHASHCAP: >> + return tun_vnet_ioctl_gethashcap(argp); >> + >> + case 
TUNSETVNETHASH: >> + rtnl_lock(); >> + tap = rtnl_dereference(q->tap); >> + ret = tap ? tun_vnet_ioctl_sethash(&tap->vnet_hash, true, argp) : -EBADFD; >> + rtnl_unlock(); >> + return ret; >> + >> case SIOCGIFHWADDR: >> rtnl_lock(); >> tap = tap_get_tap_dev(q); >> diff --git a/drivers/net/tun.c b/drivers/net/tun.c >> index d8f4d3e996a7a81d1f8b04635054081671a14f07..520013df416e93d3a50b46be9b53ae9ab410eab4 100644 >> --- a/drivers/net/tun.c >> +++ b/drivers/net/tun.c >> @@ -209,6 +209,7 @@ struct tun_struct { >> struct bpf_prog __rcu *xdp_prog; >> struct tun_prog __rcu *steering_prog; >> struct tun_prog __rcu *filter_prog; >> + struct tun_vnet_hash_container __rcu *vnet_hash; >> struct ethtool_link_ksettings link_ksettings; >> /* init args */ >> struct file *file; >> @@ -451,20 +452,37 @@ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash) >> e->rps_rxhash = hash; >> } >> >> +static struct virtio_net_hash *tun_add_hash(struct sk_buff *skb) >> +{ >> + return skb_ext_add(skb, SKB_EXT_TUN_VNET_HASH); >> +} >> + >> +static const struct virtio_net_hash *tun_find_hash(const struct sk_buff *skb) >> +{ >> + return skb_ext_find(skb, SKB_EXT_TUN_VNET_HASH); >> +} >> + >> /* We try to identify a flow through its rxhash. The reason that >> * we do not check rxq no. is because some cards(e.g 82599), chooses >> * the rxq based on the txq where the last packet of the flow comes. As >> * the userspace application move between processors, we may get a >> * different rxq no. here. 
>> */ >> -static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb) >> +static u16 tun_automq_select_queue(struct tun_struct *tun, >> + const struct tun_vnet_hash_container *vnet_hash, >> + struct sk_buff *skb) >> { >> + struct flow_keys keys; >> + struct flow_keys_basic keys_basic; >> struct tun_flow_entry *e; >> u32 txq, numqueues; >> >> numqueues = READ_ONCE(tun->numqueues); >> >> - txq = __skb_get_hash_symmetric(skb); >> + memset(&keys, 0, sizeof(keys)); >> + skb_flow_dissect(skb, &flow_keys_dissector_symmetric, &keys, 0); >> + >> + txq = flow_hash_from_keys(&keys); >> e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq); >> if (e) { >> tun_flow_save_rps_rxhash(e, txq); >> @@ -473,6 +491,13 @@ static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb) >> txq = reciprocal_scale(txq, numqueues); >> } >> >> + keys_basic = (struct flow_keys_basic) { >> + .control = keys.control, >> + .basic = keys.basic >> + }; >> + tun_vnet_hash_report(vnet_hash, skb, &keys_basic, skb->l4_hash ? 
skb->hash : txq, >> + tun_add_hash); >> + >> return txq; >> } >> >> @@ -500,10 +525,17 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, >> u16 ret; >> >> rcu_read_lock(); >> - if (rcu_dereference(tun->steering_prog)) >> + if (rcu_dereference(tun->steering_prog)) { >> ret = tun_ebpf_select_queue(tun, skb); >> - else >> - ret = tun_automq_select_queue(tun, skb); >> + } else { >> + struct tun_vnet_hash_container *vnet_hash = rcu_dereference(tun->vnet_hash); >> + >> + if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS)) >> + ret = tun_vnet_rss_select_queue(READ_ONCE(tun->numqueues), vnet_hash, >> + skb, tun_add_hash); >> + else >> + ret = tun_automq_select_queue(tun, vnet_hash, skb); >> + } >> rcu_read_unlock(); >> >> return ret; >> @@ -1987,7 +2019,7 @@ static ssize_t tun_put_user_xdp(struct tun_struct *tun, >> ssize_t ret; >> >> if (tun->flags & IFF_VNET_HDR) { >> - struct virtio_net_hdr gso = { 0 }; >> + struct virtio_net_hdr_v1_hash gso = { 0 }; >> >> vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); >> ret = tun_vnet_hdr_put(vnet_hdr_sz, iter, &gso); >> @@ -2040,9 +2072,10 @@ static ssize_t tun_put_user(struct tun_struct *tun, >> } >> >> if (vnet_hdr_sz) { >> - struct virtio_net_hdr gso; >> + struct virtio_net_hdr_v1_hash gso; >> >> - ret = tun_vnet_hdr_from_skb(tun->flags, tun->dev, skb, &gso); >> + ret = tun_vnet_hdr_from_skb(vnet_hdr_sz, tun->flags, tun->dev, >> + skb, tun_find_hash, &gso); >> if (ret) >> return ret; >> >> @@ -2223,6 +2256,7 @@ static void tun_free_netdev(struct net_device *dev) >> security_tun_dev_free_security(tun->security); >> __tun_set_ebpf(tun, &tun->steering_prog, NULL); >> __tun_set_ebpf(tun, &tun->filter_prog, NULL); >> + kfree_rcu_mightsleep(rcu_access_pointer(tun->vnet_hash)); >> } >> >> static void tun_setup(struct net_device *dev) >> @@ -2921,13 +2955,9 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr) >> } >> >> static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu 
**prog_p, >> - void __user *data) >> + int fd) >> { >> struct bpf_prog *prog; >> - int fd; >> - >> - if (copy_from_user(&fd, data, sizeof(fd))) >> - return -EFAULT; >> >> if (fd == -1) { >> prog = NULL; >> @@ -2993,7 +3023,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >> int ifindex; >> int sndbuf; >> int ret; >> + int fd; >> bool do_notify = false; >> + struct tun_vnet_hash_container *vnet_hash; >> >> if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || >> (_IOC_TYPE(cmd) == SOCK_IOC_TYPE && cmd != SIOCGSKNS)) { >> @@ -3020,7 +3052,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >> rtnl_lock(); >> >> tun = tun_get(tfile); >> - if (cmd == TUNSETIFF) { >> + switch (cmd) { >> + case TUNSETIFF: >> ret = -EEXIST; >> if (tun) >> goto unlock; >> @@ -3035,8 +3068,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >> if (copy_to_user(argp, &ifr, ifreq_len)) >> ret = -EFAULT; >> goto unlock; >> - } >> - if (cmd == TUNSETIFINDEX) { >> + >> + case TUNSETIFINDEX: >> ret = -EPERM; >> if (tun) >> goto unlock; >> @@ -3050,6 +3083,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >> ret = 0; >> tfile->ifindex = ifindex; >> goto unlock; >> + >> + case TUNGETVNETHASHCAP: >> + ret = tun_vnet_ioctl_gethashcap(argp); >> + goto unlock; >> } >> >> ret = -EBADFD; >> @@ -3230,11 +3267,27 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >> break; >> >> case TUNSETSTEERINGEBPF: >> - ret = tun_set_ebpf(tun, &tun->steering_prog, argp); >> + if (get_user(fd, (int __user *)argp)) { >> + ret = -EFAULT; >> + break; >> + } >> + >> + vnet_hash = rtnl_dereference(tun->vnet_hash); >> + if (fd != -1 && vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS)) { >> + ret = -EBUSY; >> + break; >> + } >> + >> + ret = tun_set_ebpf(tun, &tun->steering_prog, fd); >> break; >> >> case TUNSETFILTEREBPF: >> - ret = tun_set_ebpf(tun, &tun->filter_prog, argp); >> + if (get_user(fd, (int __user *)argp)) { >> + ret = 
-EFAULT; >> + break; >> + } >> + >> + ret = tun_set_ebpf(tun, &tun->filter_prog, fd); >> break; >> >> case TUNSETCARRIER: >> @@ -3252,8 +3305,15 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >> ret = open_related_ns(&net->ns, get_net_ns); >> break; >> >> + case TUNSETVNETHASH: >> + ret = tun_vnet_ioctl_sethash(&tun->vnet_hash, >> + !rtnl_dereference(tun->steering_prog), >> + argp); >> + break; >> + >> default: >> - ret = tun_vnet_ioctl(&tun->vnet_hdr_sz, &tun->flags, cmd, argp); >> + ret = tun_vnet_ioctl(&tun->vnet_hdr_sz, &tun->flags, >> + cmd, argp); >> break; >> } >> >> diff --git a/drivers/net/tun_vnet.h b/drivers/net/tun_vnet.h >> index 58b9ac7a5fc4084c789fe94fe36b5f8631bf1fa4..8e7d51fb0b4742cef56e7c5ad778b156cc654bed 100644 >> --- a/drivers/net/tun_vnet.h >> +++ b/drivers/net/tun_vnet.h >> @@ -6,6 +6,16 @@ >> #define TUN_VNET_LE 0x80000000 >> #define TUN_VNET_BE 0x40000000 >> >> +typedef struct virtio_net_hash *(*tun_vnet_hash_add)(struct sk_buff *); >> +typedef const struct virtio_net_hash *(*tun_vnet_hash_find)(const struct sk_buff *); >> + >> +struct tun_vnet_hash_container { >> + struct tun_vnet_hash common; > > I'd rename this as hash. > >> + struct tun_vnet_hash_rss rss; >> + u32 rss_key[VIRTIO_NET_RSS_MAX_KEY_SIZE]; >> + u16 rss_indirection_table[]; >> +}; > > Besides the separated ioctl, I'd split this structure into rss and > hash part as well. > >> + >> static inline bool tun_vnet_legacy_is_little_endian(unsigned int flags) >> { >> bool be = IS_ENABLED(CONFIG_TUN_VNET_CROSS_LE) && >> @@ -107,6 +117,123 @@ static inline long tun_vnet_ioctl(int *vnet_hdr_sz, unsigned int *flags, >> } >> } >> >> +static inline long tun_vnet_ioctl_gethashcap(void __user *argp) >> +{ >> + static const struct tun_vnet_hash cap = { >> + .flags = TUN_VNET_HASH_REPORT | TUN_VNET_HASH_RSS, >> + .types = VIRTIO_NET_SUPPORTED_HASH_TYPES >> + }; >> + >> + return copy_to_user(argp, &cap, sizeof(cap)) ? 
-EFAULT : 0; > > Let's has a consistent name for this and the uapi to be consistent > with TUNSETIFF/TUNGETIFF. Probably TUNSETVNETHASH and > tun_vnet_ioctl_gethash(). They have different semantics so they should have different names. TUNGETIFF reports the value currently set while TUNGETVNETHASHCAP reports the value that can be set later. > >> +} >> + >> +static inline long tun_vnet_ioctl_sethash(struct tun_vnet_hash_container __rcu **hashp, >> + bool can_rss, void __user *argp) > > So again, can_rss seems to be tricky. Looking at its caller, it tires > to make eBPF and RSS mutually exclusive. I still don't understand why > we need this. Allow eBPF program to override some of the path seems to > be common practice. > > What's more, we didn't try (or even can't) to make automq and eBPF to > be mutually exclusive. So I still didn't see what we gain from this > and it complicates the codes and may lead to ambiguous uAPI/behaviour. automq and eBPF are mutually exclusive; automq is disabled when an eBPF steering program is set so I followed the example here. We don't even have an interface for eBPF to let it fall back to another algorithm. I could make it fall back to RSS if the eBPF steering program is designed to fall back to automq when it returns e.g., -1. But such an interface is currently not defined and defining one is out of scope of this patch series. 
> >> +{ >> + struct tun_vnet_hash hash_buf; >> + struct tun_vnet_hash_container *hash; >> + >> + if (copy_from_user(&hash_buf, argp, sizeof(hash_buf))) >> + return -EFAULT; >> + argp = (struct tun_vnet_hash __user *)argp + 1; >> + >> + if (hash_buf.flags & TUN_VNET_HASH_RSS) { >> + struct tun_vnet_hash_rss rss; >> + size_t indirection_table_size; >> + size_t key_size; >> + size_t size; >> + >> + if (!can_rss) >> + return -EBUSY; >> + >> + if (copy_from_user(&rss, argp, sizeof(rss))) >> + return -EFAULT; >> + argp = (struct tun_vnet_hash_rss __user *)argp + 1; >> + >> + indirection_table_size = ((size_t)rss.indirection_table_mask + 1) * 2; >> + key_size = virtio_net_hash_key_length(hash_buf.types); >> + size = struct_size(hash, rss_indirection_table, >> + (size_t)rss.indirection_table_mask + 1); >> + >> + hash = kmalloc(size, GFP_KERNEL); >> + if (!hash) >> + return -ENOMEM; >> + >> + if (copy_from_user(hash->rss_indirection_table, >> + argp, indirection_table_size)) { >> + kfree(hash); >> + return -EFAULT; >> + } >> + argp = (u16 __user *)argp + rss.indirection_table_mask + 1; >> + >> + if (copy_from_user(hash->rss_key, argp, key_size)) { >> + kfree(hash); >> + return -EFAULT; >> + } >> + >> + virtio_net_toeplitz_convert_key(hash->rss_key, key_size); >> + hash->rss = rss; >> + } else { >> + hash = kmalloc(sizeof(hash->common), GFP_KERNEL); >> + if (!hash) >> + return -ENOMEM; > > Do we need to validate the hash here (at least against the types we supported?) > >> + } >> + >> + hash->common = hash_buf; >> + kfree_rcu_mightsleep(rcu_replace_pointer_rtnl(*hashp, hash)); > > I still didn't understand the trick here. E.g we use very simple > primitives in synchronizing ebpf program through RCU in > __tun_set_ebpf(). It is even simpler than __tun_set_ebpf(). The differences from __tun_set_ebpf() are: 1. This uses the rtnl lock instead of TUN-specific one. It makes the code simpler as the rtnl lock is already taken in __tun_chr_ioctl(). 2. 
This does not add rcu_head and uses blocking APIs for simplicity. > >> + return 0; >> +} >> + >> +static void tun_vnet_hash_report(const struct tun_vnet_hash_container *hash, >> + struct sk_buff *skb, >> + const struct flow_keys_basic *keys, >> + u32 value, >> + tun_vnet_hash_add vnet_hash_add) >> +{ >> + struct virtio_net_hash *report; >> + >> + if (!hash || !(hash->common.flags & TUN_VNET_HASH_REPORT)) >> + return; >> + >> + report = vnet_hash_add(skb); >> + if (!report) >> + return; >> + >> + *report = (struct virtio_net_hash) { >> + .report = virtio_net_hash_report(hash->common.types, keys), >> + .value = value >> + }; > > What's the advantage of using Designated Initializers here? Simple > assignment can save two lines of code. It automatically fills other fields with zero. Simple assignments will need more tokens for zeroing. > >> +} >> + >> +static u16 tun_vnet_rss_select_queue(u32 numqueues, >> + const struct tun_vnet_hash_container *hash, >> + struct sk_buff *skb, >> + tun_vnet_hash_add vnet_hash_add) >> +{ >> + struct virtio_net_hash *report; >> + struct virtio_net_hash ret; >> + u16 txq, index; >> + >> + if (!numqueues) >> + return 0; >> + >> + virtio_net_hash_rss(skb, hash->common.types, hash->rss_key, &ret); >> + >> + if (!ret.report) >> + return hash->rss.unclassified_queue % numqueues; >> + >> + if (hash->common.flags & TUN_VNET_HASH_REPORT) { >> + report = vnet_hash_add(skb); >> + if (report) >> + *report = ret; >> + } > > Is there a chance that we can reach here without TUN_VNET_HASH_REPORT? > If yes, it should be a bug. It is possible to use RSS without TUN_VNET_HASH_REPORT. It is more of a feature than a bug; it behaves like QEMU's eBPF program but requires no privilege and is more optimized with native code and ffs(). 
> >> + >> + index = ret.value & hash->rss.indirection_table_mask; >> + txq = READ_ONCE(hash->rss_indirection_table[index]); > > So vnet_hash is accessed via rcu_dereference(), I don't get any reason > we need READ_ONCE here, is this paired with something? If yes, let's > add a comment here. If rss_indirection_table need why > indirection_table_mask doesn't need this? I'll drop it. I think it's just a left-over of previous versions without RCU. > >> + >> + return txq % numqueues; >> +} >> + >> static inline int tun_vnet_hdr_get(int sz, unsigned int flags, >> struct iov_iter *from, >> struct virtio_net_hdr *hdr) >> @@ -135,15 +262,17 @@ static inline int tun_vnet_hdr_get(int sz, unsigned int flags, >> } >> >> static inline int tun_vnet_hdr_put(int sz, struct iov_iter *iter, >> - const struct virtio_net_hdr *hdr) >> + const struct virtio_net_hdr_v1_hash *hdr) >> { > > To be more robust, we can tweak the function to accept a vnet_hdr_len > parameter then we can avoid touching this every time when we need to > extend vnet hdr in the future? I think you meant vnet_hdr_sz instead of vnet_hdr_len. It is already passed just as "sz" here as the function name already says it's about the header. It is possible to add another parameter for sizeof(*hdr) and convert the hdr parameter to void * to avoid future changes. But I'd rather keep it as is because the current form ensures the hdr is large enough and statically avoids buffer overrun. 
> >> + int content_sz = MIN(sizeof(*hdr), sz); >> + >> if (unlikely(iov_iter_count(iter) < sz)) >> return -EINVAL; >> >> - if (unlikely(copy_to_iter(hdr, sizeof(*hdr), iter) != sizeof(*hdr))) >> + if (unlikely(copy_to_iter(hdr, content_sz, iter) != content_sz)) >> return -EFAULT; >> >> - if (iov_iter_zero(sz - sizeof(*hdr), iter) != sz - sizeof(*hdr)) >> + if (iov_iter_zero(sz - content_sz, iter) != sz - content_sz) >> return -EFAULT; >> >> return 0; >> @@ -155,26 +284,38 @@ static inline int tun_vnet_hdr_to_skb(unsigned int flags, struct sk_buff *skb, >> return virtio_net_hdr_to_skb(skb, hdr, tun_vnet_is_little_endian(flags)); >> } >> >> -static inline int tun_vnet_hdr_from_skb(unsigned int flags, >> +static inline int tun_vnet_hdr_from_skb(int sz, unsigned int flags, >> const struct net_device *dev, >> const struct sk_buff *skb, >> - struct virtio_net_hdr *hdr) >> + tun_vnet_hash_find vnet_hash_find, >> + struct virtio_net_hdr_v1_hash *hdr) >> { >> int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0; >> + const struct virtio_net_hash *report = sz < sizeof(struct virtio_net_hdr_v1_hash) ? 
>> + NULL : vnet_hash_find(skb); >> + >> + *hdr = (struct virtio_net_hdr_v1_hash) { >> + .hash_report = VIRTIO_NET_HASH_REPORT_NONE >> + }; >> + >> + if (report) { >> + hdr->hash_value = cpu_to_le32(report->value); >> + hdr->hash_report = cpu_to_le16(report->report); >> + } >> >> - if (virtio_net_hdr_from_skb(skb, hdr, >> + if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)hdr, >> tun_vnet_is_little_endian(flags), true, >> vlan_hlen)) { >> struct skb_shared_info *sinfo = skb_shinfo(skb); >> >> if (net_ratelimit()) { >> netdev_err(dev, "unexpected GSO type: 0x%x, gso_size %d, hdr_len %d\n", >> - sinfo->gso_type, tun_vnet16_to_cpu(flags, hdr->gso_size), >> - tun_vnet16_to_cpu(flags, hdr->hdr_len)); >> + sinfo->gso_type, tun_vnet16_to_cpu(flags, hdr->hdr.gso_size), >> + tun_vnet16_to_cpu(flags, hdr->hdr.hdr_len)); >> print_hex_dump(KERN_ERR, "tun: ", >> DUMP_PREFIX_NONE, >> 16, 1, skb->head, >> - min(tun_vnet16_to_cpu(flags, hdr->hdr_len), 64), true); >> + min(tun_vnet16_to_cpu(flags, hdr->hdr.hdr_len), 64), true); >> } >> WARN_ON_ONCE(1); >> return -EINVAL; >> diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h >> index 553552fa635c3e1e53d1a63c203d32e4c4fd5a4f..7334c46a3f101675a0d4e5a036987cfe18842f9f 100644 >> --- a/include/linux/if_tap.h >> +++ b/include/linux/if_tap.h >> @@ -31,6 +31,7 @@ static inline struct ptr_ring *tap_get_ptr_ring(struct file *f) >> #define MAX_TAP_QUEUES 256 >> >> struct tap_queue; >> +struct tun_vnet_hash_container; >> >> struct tap_dev { >> struct net_device *dev; >> @@ -43,6 +44,7 @@ struct tap_dev { >> int numqueues; >> netdev_features_t tap_features; >> int minor; >> + struct tun_vnet_hash_container __rcu *vnet_hash; >> >> void (*update_features)(struct tap_dev *tap, netdev_features_t features); >> void (*count_tx_dropped)(struct tap_dev *tap); >> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h >> index bb2b751d274acff931281a72e8b4b0c699b4e8af..cdd793f1c360ad5f63fcc4cbf67d845f5e2ccf6f 100644 >> --- 
a/include/linux/skbuff.h >> +++ b/include/linux/skbuff.h >> @@ -4842,6 +4842,9 @@ enum skb_ext_id { >> #endif >> #if IS_ENABLED(CONFIG_MCTP_FLOWS) >> SKB_EXT_MCTP, >> +#endif >> +#if IS_ENABLED(CONFIG_TUN) >> + SKB_EXT_TUN_VNET_HASH, >> #endif >> SKB_EXT_NUM, /* must be last */ >> }; >> diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h >> index 287cdc81c9390c289a30545aa7ed23d81c3329d3..4887f97500a870c7ef3c96a5837b2d0a5a225040 100644 >> --- a/include/uapi/linux/if_tun.h >> +++ b/include/uapi/linux/if_tun.h >> @@ -62,6 +62,42 @@ >> #define TUNSETCARRIER _IOW('T', 226, int) >> #define TUNGETDEVNETNS _IO('T', 227) >> >> +/** >> + * define TUNGETVNETHASHCAP - ioctl to get virtio_net hashing capability. >> + * >> + * The argument is a pointer to &struct tun_vnet_hash which will store the >> + * maximal virtio_net hashing configuration. >> + */ >> +#define TUNGETVNETHASHCAP _IOR('T', 228, struct tun_vnet_hash) >> + >> +/** >> + * define TUNSETVNETHASH - ioctl to configure virtio_net hashing >> + * >> + * The argument is a pointer to &struct tun_vnet_hash. >> + * >> + * The argument is a pointer to the compound of the following in order if >> + * %TUN_VNET_HASH_RSS is set: >> + * >> + * 1. &struct tun_vnet_hash >> + * 2. &struct tun_vnet_hash_rss >> + * 3. Indirection table >> + * 4. Key >> + * >> + * The %TUN_VNET_HASH_REPORT flag set with this ioctl will be effective only >> + * after calling the %TUNSETVNETHDRSZ ioctl with a number greater than or equal >> + * to the size of &struct virtio_net_hdr_v1_hash. > > So you had a dependency check already for vnet hdr len. I'd still > suggest to split this into rss and hash as they are separated > features. Then we can use separate data structure for them instead of > a container struct. I added a dependency check and found it is complicating the code and requires additional tests. I need a reason to justify the complexity if we are going to split it. 
Regards, Akihiko Odaki > >> + * >> + * The members added to the legacy header by %TUN_VNET_HASH_REPORT flag will >> + * always be little-endian. >> + * >> + * This ioctl results in %EBADFD if the underlying device is deleted. It affects >> + * all queues attached to the same device. >> + * >> + * This ioctl currently has no effect on XDP packets and packets with >> + * queue_mapping set by TC. >> + */ >> +#define TUNSETVNETHASH _IOW('T', 229, struct tun_vnet_hash) >> + >> /* TUNSETIFF ifr flags */ >> #define IFF_TUN 0x0001 >> #define IFF_TAP 0x0002 >> @@ -115,4 +151,43 @@ struct tun_filter { >> __u8 addr[][ETH_ALEN]; >> }; >> >> +/** >> + * define TUN_VNET_HASH_REPORT - Request virtio_net hash reporting for vhost >> + */ >> +#define TUN_VNET_HASH_REPORT 0x0001 >> + >> +/** >> + * define TUN_VNET_HASH_RSS - Request virtio_net RSS >> + * >> + * This is mutually exclusive with eBPF steering program. >> + */ >> +#define TUN_VNET_HASH_RSS 0x0002 >> + >> +/** >> + * struct tun_vnet_hash - virtio_net hashing configuration >> + * @flags: >> + * Bitmask consists of %TUN_VNET_HASH_REPORT and %TUN_VNET_HASH_RSS >> + * @pad: >> + * Should be filled with zero before passing to %TUNSETVNETHASH >> + * @types: >> + * Bitmask of allowed hash types >> + */ >> +struct tun_vnet_hash { >> + __u16 flags; >> + __u8 pad[2]; >> + __u32 types; >> +}; > > Padding in the middle of the structure is not elegant. Any reason for this? > > And hash->types seems never used. 
> >> + >> +/** >> + * struct tun_vnet_hash_rss - virtio_net RSS configuration >> + * @indirection_table_mask: >> + * Bitmask to be applied to the indirection table index >> + * @unclassified_queue: >> + * The index of the queue to place unclassified packets in >> + */ >> +struct tun_vnet_hash_rss { >> + __u16 indirection_table_mask; >> + __u16 unclassified_queue; >> +}; >> + >> #endif /* _UAPI__IF_TUN_H */ >> diff --git a/net/core/skbuff.c b/net/core/skbuff.c >> index 7b03b64fdcb276f68ce881d1d8da8e4c6b897efc..aa2a091b649f0c9d6e0196f34f345ba78b5498fb 100644 >> --- a/net/core/skbuff.c >> +++ b/net/core/skbuff.c >> @@ -64,6 +64,7 @@ >> #include <linux/mpls.h> >> #include <linux/kcov.h> >> #include <linux/iov_iter.h> >> +#include <linux/virtio_net.h> >> >> #include <net/protocol.h> >> #include <net/dst.h> >> @@ -4969,6 +4970,9 @@ static const u8 skb_ext_type_len[] = { >> #if IS_ENABLED(CONFIG_MCTP_FLOWS) >> [SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow), >> #endif >> +#if IS_ENABLED(CONFIG_TUN) >> + [SKB_EXT_TUN_VNET_HASH] = SKB_EXT_CHUNKSIZEOF(struct virtio_net_hash), >> +#endif >> }; >> >> static __always_inline unsigned int skb_ext_total_length(void) >> >> -- >> 2.48.1 >> > > Thanks >
On 2025/03/10 12:55, Jason Wang wrote: > On Fri, Mar 7, 2025 at 7:01 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >> >> Hash reporting >> ============== >> >> Allow the guest to reuse the hash value to make receive steering >> consistent between the host and guest, and to save hash computation. >> >> RSS >> === >> >> RSS is a receive steering algorithm that can be negotiated to use with >> virtio_net. Conventionally the hash calculation was done by the VMM. >> However, computing the hash after the queue was chosen defeats the >> purpose of RSS. >> >> Another approach is to use eBPF steering program. This approach has >> another downside: it cannot report the calculated hash due to the >> restrictive nature of eBPF steering program. >> >> Introduce the code to perform RSS to the kernel in order to overcome >> these challenges. An alternative solution is to extend the eBPF steering >> program so that it will be able to report to the userspace, but I didn't >> opt for it because extending the current mechanism of eBPF steering >> program as is because it relies on legacy context rewriting, and >> introducing kfunc-based eBPF will result in non-UAPI dependency while >> the other relevant virtualization APIs such as KVM and vhost_net are >> UAPIs. 
>> >> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com> >> Tested-by: Lei Yang <leiyang@redhat.com> >> --- >> Documentation/networking/tuntap.rst | 7 ++ >> drivers/net/Kconfig | 1 + >> drivers/net/tap.c | 68 ++++++++++++++- >> drivers/net/tun.c | 98 +++++++++++++++++----- >> drivers/net/tun_vnet.h | 159 ++++++++++++++++++++++++++++++++++-- >> include/linux/if_tap.h | 2 + >> include/linux/skbuff.h | 3 + >> include/uapi/linux/if_tun.h | 75 +++++++++++++++++ >> net/core/skbuff.c | 4 + >> 9 files changed, 386 insertions(+), 31 deletions(-) >> >> diff --git a/Documentation/networking/tuntap.rst b/Documentation/networking/tuntap.rst >> index 4d7087f727be5e37dfbf5066a9e9c872cc98898d..86b4ae8caa8ad062c1e558920be42ce0d4217465 100644 >> --- a/Documentation/networking/tuntap.rst >> +++ b/Documentation/networking/tuntap.rst >> @@ -206,6 +206,13 @@ enable is true we enable it, otherwise we disable it:: >> return ioctl(fd, TUNSETQUEUE, (void *)&ifr); >> } >> >> +3.4 Reference >> +------------- >> + >> +``linux/if_tun.h`` defines the interface described below: >> + >> +.. kernel-doc:: include/uapi/linux/if_tun.h >> + >> Universal TUN/TAP device driver Frequently Asked Question >> ========================================================= >> >> diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig >> index 1fd5acdc73c6af0e1a861867039c3624fc618e25..aecfd244dd83585fea2c5b815dcd787c58166c28 100644 >> --- a/drivers/net/Kconfig >> +++ b/drivers/net/Kconfig >> @@ -395,6 +395,7 @@ config TUN >> tristate "Universal TUN/TAP device driver support" >> depends on INET >> select CRC32 >> + select SKB_EXTENSIONS >> help >> TUN/TAP provides packet reception and transmission for user space >> programs. 
It can be viewed as a simple Point-to-Point or Ethernet >> diff --git a/drivers/net/tap.c b/drivers/net/tap.c >> index d4ece538f1b23789ca60caa6232690e4d0a4d14a..9428b63ec27e7f92e78a78afcb5e24383862c00d 100644 >> --- a/drivers/net/tap.c >> +++ b/drivers/net/tap.c >> @@ -49,6 +49,10 @@ struct major_info { >> struct list_head next; >> }; >> >> +struct tap_skb_cb { >> + struct virtio_net_hash hash; >> +}; >> + >> #define GOODCOPY_LEN 128 >> >> static const struct proto_ops tap_socket_ops; >> @@ -179,6 +183,22 @@ static void tap_put_queue(struct tap_queue *q) >> sock_put(&q->sk); >> } >> >> +static struct tap_skb_cb *tap_skb_cb(const struct sk_buff *skb) >> +{ >> + BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct tap_skb_cb)); >> + return (struct tap_skb_cb *)skb->cb; >> +} >> + >> +static struct virtio_net_hash *tap_add_hash(struct sk_buff *skb) >> +{ >> + return &tap_skb_cb(skb)->hash; >> +} >> + >> +static const struct virtio_net_hash *tap_find_hash(const struct sk_buff *skb) >> +{ >> + return &tap_skb_cb(skb)->hash; >> +} >> + >> /* >> * Select a queue based on the rxq of the device on which this packet >> * arrived. If the incoming device is not mq, calculate a flow hash >> @@ -189,6 +209,7 @@ static void tap_put_queue(struct tap_queue *q) >> static struct tap_queue *tap_get_queue(struct tap_dev *tap, >> struct sk_buff *skb) >> { >> + struct flow_keys_basic keys_basic; >> struct tap_queue *queue = NULL; >> /* Access to taps array is protected by rcu, but access to numvtaps >> * isn't. Below we use it to lookup a queue, but treat it as a hint >> @@ -196,17 +217,47 @@ static struct tap_queue *tap_get_queue(struct tap_dev *tap, >> * racing against queue removal. 
>> */ >> int numvtaps = READ_ONCE(tap->numvtaps); >> + struct tun_vnet_hash_container *vnet_hash = rcu_dereference(tap->vnet_hash); >> __u32 rxq; >> >> + *tap_skb_cb(skb) = (struct tap_skb_cb) { >> + .hash = { .report = VIRTIO_NET_HASH_REPORT_NONE } >> + }; >> + >> if (!numvtaps) >> goto out; >> >> if (numvtaps == 1) >> goto single; >> >> + if (vnet_hash) { >> + if ((vnet_hash->common.flags & TUN_VNET_HASH_RSS)) { >> + rxq = tun_vnet_rss_select_queue(numvtaps, vnet_hash, skb, tap_add_hash); >> + queue = rcu_dereference(tap->taps[rxq]); >> + goto out; >> + } >> + >> + if (!skb->l4_hash && !skb->sw_hash) { >> + struct flow_keys keys; >> + >> + skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); >> + rxq = flow_hash_from_keys(&keys); >> + keys_basic = (struct flow_keys_basic) { >> + .control = keys.control, >> + .basic = keys.basic >> + }; >> + } else { >> + skb_flow_dissect_flow_keys_basic(NULL, skb, &keys_basic, NULL, 0, 0, 0, >> + FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); >> + rxq = skb->hash; >> + } >> + } else { >> + rxq = skb_get_hash(skb); >> + } >> + >> /* Check if we can use flow to select a queue */ >> - rxq = skb_get_hash(skb); >> if (rxq) { >> + tun_vnet_hash_report(vnet_hash, skb, &keys_basic, rxq, tap_add_hash); >> queue = rcu_dereference(tap->taps[rxq % numvtaps]); >> goto out; >> } >> @@ -711,11 +762,12 @@ static ssize_t tap_put_user(struct tap_queue *q, >> int total; >> >> if (q->flags & IFF_VNET_HDR) { >> - struct virtio_net_hdr vnet_hdr; >> + struct virtio_net_hdr_v1_hash vnet_hdr; >> >> vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz); >> >> - ret = tun_vnet_hdr_from_skb(q->flags, NULL, skb, &vnet_hdr); >> + ret = tun_vnet_hdr_from_skb(vnet_hdr_len, q->flags, NULL, skb, >> + tap_find_hash, &vnet_hdr); >> if (ret) >> return ret; >> >> @@ -992,6 +1044,16 @@ static long tap_ioctl(struct file *file, unsigned int cmd, >> rtnl_unlock(); >> return ret; >> >> + case TUNGETVNETHASHCAP: >> + return tun_vnet_ioctl_gethashcap(argp); >> + >> + case 
TUNSETVNETHASH: >> + rtnl_lock(); >> + tap = rtnl_dereference(q->tap); >> + ret = tap ? tun_vnet_ioctl_sethash(&tap->vnet_hash, true, argp) : -EBADFD; >> + rtnl_unlock(); >> + return ret; >> + >> case SIOCGIFHWADDR: >> rtnl_lock(); >> tap = tap_get_tap_dev(q); >> diff --git a/drivers/net/tun.c b/drivers/net/tun.c >> index d8f4d3e996a7a81d1f8b04635054081671a14f07..520013df416e93d3a50b46be9b53ae9ab410eab4 100644 >> --- a/drivers/net/tun.c >> +++ b/drivers/net/tun.c >> @@ -209,6 +209,7 @@ struct tun_struct { >> struct bpf_prog __rcu *xdp_prog; >> struct tun_prog __rcu *steering_prog; >> struct tun_prog __rcu *filter_prog; >> + struct tun_vnet_hash_container __rcu *vnet_hash; >> struct ethtool_link_ksettings link_ksettings; >> /* init args */ >> struct file *file; >> @@ -451,20 +452,37 @@ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash) >> e->rps_rxhash = hash; >> } >> >> +static struct virtio_net_hash *tun_add_hash(struct sk_buff *skb) >> +{ >> + return skb_ext_add(skb, SKB_EXT_TUN_VNET_HASH); >> +} >> + >> +static const struct virtio_net_hash *tun_find_hash(const struct sk_buff *skb) >> +{ >> + return skb_ext_find(skb, SKB_EXT_TUN_VNET_HASH); >> +} >> + >> /* We try to identify a flow through its rxhash. The reason that >> * we do not check rxq no. is because some cards(e.g 82599), chooses >> * the rxq based on the txq where the last packet of the flow comes. As >> * the userspace application move between processors, we may get a >> * different rxq no. here. 
>> */ >> -static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb) >> +static u16 tun_automq_select_queue(struct tun_struct *tun, >> + const struct tun_vnet_hash_container *vnet_hash, >> + struct sk_buff *skb) >> { >> + struct flow_keys keys; >> + struct flow_keys_basic keys_basic; >> struct tun_flow_entry *e; >> u32 txq, numqueues; >> >> numqueues = READ_ONCE(tun->numqueues); >> >> - txq = __skb_get_hash_symmetric(skb); >> + memset(&keys, 0, sizeof(keys)); >> + skb_flow_dissect(skb, &flow_keys_dissector_symmetric, &keys, 0); >> + >> + txq = flow_hash_from_keys(&keys); >> e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq); >> if (e) { >> tun_flow_save_rps_rxhash(e, txq); >> @@ -473,6 +491,13 @@ static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb) >> txq = reciprocal_scale(txq, numqueues); >> } >> >> + keys_basic = (struct flow_keys_basic) { >> + .control = keys.control, >> + .basic = keys.basic >> + }; >> + tun_vnet_hash_report(vnet_hash, skb, &keys_basic, skb->l4_hash ? 
skb->hash : txq, >> + tun_add_hash); >> + >> return txq; >> } >> >> @@ -500,10 +525,17 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, >> u16 ret; >> >> rcu_read_lock(); >> - if (rcu_dereference(tun->steering_prog)) >> + if (rcu_dereference(tun->steering_prog)) { >> ret = tun_ebpf_select_queue(tun, skb); >> - else >> - ret = tun_automq_select_queue(tun, skb); >> + } else { >> + struct tun_vnet_hash_container *vnet_hash = rcu_dereference(tun->vnet_hash); >> + >> + if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS)) >> + ret = tun_vnet_rss_select_queue(READ_ONCE(tun->numqueues), vnet_hash, >> + skb, tun_add_hash); >> + else >> + ret = tun_automq_select_queue(tun, vnet_hash, skb); >> + } >> rcu_read_unlock(); >> >> return ret; >> @@ -1987,7 +2019,7 @@ static ssize_t tun_put_user_xdp(struct tun_struct *tun, >> ssize_t ret; >> >> if (tun->flags & IFF_VNET_HDR) { >> - struct virtio_net_hdr gso = { 0 }; >> + struct virtio_net_hdr_v1_hash gso = { 0 }; >> >> vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); >> ret = tun_vnet_hdr_put(vnet_hdr_sz, iter, &gso); >> @@ -2040,9 +2072,10 @@ static ssize_t tun_put_user(struct tun_struct *tun, >> } >> >> if (vnet_hdr_sz) { >> - struct virtio_net_hdr gso; >> + struct virtio_net_hdr_v1_hash gso; >> >> - ret = tun_vnet_hdr_from_skb(tun->flags, tun->dev, skb, &gso); >> + ret = tun_vnet_hdr_from_skb(vnet_hdr_sz, tun->flags, tun->dev, >> + skb, tun_find_hash, &gso); >> if (ret) >> return ret; >> >> @@ -2223,6 +2256,7 @@ static void tun_free_netdev(struct net_device *dev) >> security_tun_dev_free_security(tun->security); >> __tun_set_ebpf(tun, &tun->steering_prog, NULL); >> __tun_set_ebpf(tun, &tun->filter_prog, NULL); >> + kfree_rcu_mightsleep(rcu_access_pointer(tun->vnet_hash)); >> } >> >> static void tun_setup(struct net_device *dev) >> @@ -2921,13 +2955,9 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr) >> } >> >> static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu 
**prog_p, >> - void __user *data) >> + int fd) >> { >> struct bpf_prog *prog; >> - int fd; >> - >> - if (copy_from_user(&fd, data, sizeof(fd))) >> - return -EFAULT; >> >> if (fd == -1) { >> prog = NULL; >> @@ -2993,7 +3023,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >> int ifindex; >> int sndbuf; >> int ret; >> + int fd; >> bool do_notify = false; >> + struct tun_vnet_hash_container *vnet_hash; >> >> if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || >> (_IOC_TYPE(cmd) == SOCK_IOC_TYPE && cmd != SIOCGSKNS)) { >> @@ -3020,7 +3052,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >> rtnl_lock(); >> >> tun = tun_get(tfile); >> - if (cmd == TUNSETIFF) { >> + switch (cmd) { >> + case TUNSETIFF: >> ret = -EEXIST; >> if (tun) >> goto unlock; >> @@ -3035,8 +3068,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >> if (copy_to_user(argp, &ifr, ifreq_len)) >> ret = -EFAULT; >> goto unlock; >> - } >> - if (cmd == TUNSETIFINDEX) { >> + >> + case TUNSETIFINDEX: >> ret = -EPERM; >> if (tun) >> goto unlock; >> @@ -3050,6 +3083,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >> ret = 0; >> tfile->ifindex = ifindex; >> goto unlock; >> + >> + case TUNGETVNETHASHCAP: >> + ret = tun_vnet_ioctl_gethashcap(argp); >> + goto unlock; >> } >> >> ret = -EBADFD; >> @@ -3230,11 +3267,27 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >> break; >> >> case TUNSETSTEERINGEBPF: >> - ret = tun_set_ebpf(tun, &tun->steering_prog, argp); >> + if (get_user(fd, (int __user *)argp)) { >> + ret = -EFAULT; >> + break; >> + } >> + >> + vnet_hash = rtnl_dereference(tun->vnet_hash); >> + if (fd != -1 && vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS)) { >> + ret = -EBUSY; >> + break; >> + } >> + >> + ret = tun_set_ebpf(tun, &tun->steering_prog, fd); >> break; >> >> case TUNSETFILTEREBPF: >> - ret = tun_set_ebpf(tun, &tun->filter_prog, argp); >> + if (get_user(fd, (int __user *)argp)) { >> + ret = 
-EFAULT; >> + break; >> + } >> + >> + ret = tun_set_ebpf(tun, &tun->filter_prog, fd); >> break; >> >> case TUNSETCARRIER: >> @@ -3252,8 +3305,15 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >> ret = open_related_ns(&net->ns, get_net_ns); >> break; >> >> + case TUNSETVNETHASH: >> + ret = tun_vnet_ioctl_sethash(&tun->vnet_hash, >> + !rtnl_dereference(tun->steering_prog), >> + argp); >> + break; >> + >> default: >> - ret = tun_vnet_ioctl(&tun->vnet_hdr_sz, &tun->flags, cmd, argp); >> + ret = tun_vnet_ioctl(&tun->vnet_hdr_sz, &tun->flags, >> + cmd, argp); >> break; >> } >> >> diff --git a/drivers/net/tun_vnet.h b/drivers/net/tun_vnet.h >> index 58b9ac7a5fc4084c789fe94fe36b5f8631bf1fa4..8e7d51fb0b4742cef56e7c5ad778b156cc654bed 100644 >> --- a/drivers/net/tun_vnet.h >> +++ b/drivers/net/tun_vnet.h >> @@ -6,6 +6,16 @@ >> #define TUN_VNET_LE 0x80000000 >> #define TUN_VNET_BE 0x40000000 >> >> +typedef struct virtio_net_hash *(*tun_vnet_hash_add)(struct sk_buff *); >> +typedef const struct virtio_net_hash *(*tun_vnet_hash_find)(const struct sk_buff *); >> + >> +struct tun_vnet_hash_container { >> + struct tun_vnet_hash common; > > I'd rename this as hash. Everything in this structure is about hash. "common" represents its feature well. I see a few alternative options though I don't prefer them either; they make the code verbose and I don't think they are worthwhile: 1. Rename tun_vnet_hash to tun_vnet_hash_common. 2. Prefix the other fields with "hash_" for consistency. > >> + struct tun_vnet_hash_rss rss; >> + u32 rss_key[VIRTIO_NET_RSS_MAX_KEY_SIZE]; >> + u16 rss_indirection_table[]; >> +}; > > Besides the separated ioctl, I'd split this structure into rss and > hash part as well. 
> >> + >> static inline bool tun_vnet_legacy_is_little_endian(unsigned int flags) >> { >> bool be = IS_ENABLED(CONFIG_TUN_VNET_CROSS_LE) && >> @@ -107,6 +117,123 @@ static inline long tun_vnet_ioctl(int *vnet_hdr_sz, unsigned int *flags, >> } >> } >> >> +static inline long tun_vnet_ioctl_gethashcap(void __user *argp) >> +{ >> + static const struct tun_vnet_hash cap = { >> + .flags = TUN_VNET_HASH_REPORT | TUN_VNET_HASH_RSS, >> + .types = VIRTIO_NET_SUPPORTED_HASH_TYPES >> + }; >> + >> + return copy_to_user(argp, &cap, sizeof(cap)) ? -EFAULT : 0; > > Let's has a consistent name for this and the uapi to be consistent > with TUNSETIFF/TUNGETIFF. Probably TUNSETVNETHASH and > tun_vnet_ioctl_gethash(). > >> +} >> + >> +static inline long tun_vnet_ioctl_sethash(struct tun_vnet_hash_container __rcu **hashp, >> + bool can_rss, void __user *argp) > > So again, can_rss seems to be tricky. Looking at its caller, it tires > to make eBPF and RSS mutually exclusive. I still don't understand why > we need this. Allow eBPF program to override some of the path seems to > be common practice. > > What's more, we didn't try (or even can't) to make automq and eBPF to > be mutually exclusive. So I still didn't see what we gain from this > and it complicates the codes and may lead to ambiguous uAPI/behaviour. 
> >> +{ >> + struct tun_vnet_hash hash_buf; >> + struct tun_vnet_hash_container *hash; >> + >> + if (copy_from_user(&hash_buf, argp, sizeof(hash_buf))) >> + return -EFAULT; >> + argp = (struct tun_vnet_hash __user *)argp + 1; >> + >> + if (hash_buf.flags & TUN_VNET_HASH_RSS) { >> + struct tun_vnet_hash_rss rss; >> + size_t indirection_table_size; >> + size_t key_size; >> + size_t size; >> + >> + if (!can_rss) >> + return -EBUSY; >> + >> + if (copy_from_user(&rss, argp, sizeof(rss))) >> + return -EFAULT; >> + argp = (struct tun_vnet_hash_rss __user *)argp + 1; >> + >> + indirection_table_size = ((size_t)rss.indirection_table_mask + 1) * 2; >> + key_size = virtio_net_hash_key_length(hash_buf.types); >> + size = struct_size(hash, rss_indirection_table, >> + (size_t)rss.indirection_table_mask + 1); >> + >> + hash = kmalloc(size, GFP_KERNEL); >> + if (!hash) >> + return -ENOMEM; >> + >> + if (copy_from_user(hash->rss_indirection_table, >> + argp, indirection_table_size)) { >> + kfree(hash); >> + return -EFAULT; >> + } >> + argp = (u16 __user *)argp + rss.indirection_table_mask + 1; >> + >> + if (copy_from_user(hash->rss_key, argp, key_size)) { >> + kfree(hash); >> + return -EFAULT; >> + } >> + >> + virtio_net_toeplitz_convert_key(hash->rss_key, key_size); >> + hash->rss = rss; >> + } else { >> + hash = kmalloc(sizeof(hash->common), GFP_KERNEL); >> + if (!hash) >> + return -ENOMEM; > > Do we need to validate the hash here (at least against the types we supported?) > >> + } >> + >> + hash->common = hash_buf; >> + kfree_rcu_mightsleep(rcu_replace_pointer_rtnl(*hashp, hash)); > > I still didn't understand the trick here. E.g we use very simple > primitives in synchronizing ebpf program through RCU in > __tun_set_ebpf(). 
> >> + return 0; >> +} >> + >> +static void tun_vnet_hash_report(const struct tun_vnet_hash_container *hash, >> + struct sk_buff *skb, >> + const struct flow_keys_basic *keys, >> + u32 value, >> + tun_vnet_hash_add vnet_hash_add) >> +{ >> + struct virtio_net_hash *report; >> + >> + if (!hash || !(hash->common.flags & TUN_VNET_HASH_REPORT)) >> + return; >> + >> + report = vnet_hash_add(skb); >> + if (!report) >> + return; >> + >> + *report = (struct virtio_net_hash) { >> + .report = virtio_net_hash_report(hash->common.types, keys), >> + .value = value >> + }; > > What's the advantage of using Designated Initializers here? Simple > assignment can save two lines of code. > >> +} >> + >> +static u16 tun_vnet_rss_select_queue(u32 numqueues, >> + const struct tun_vnet_hash_container *hash, >> + struct sk_buff *skb, >> + tun_vnet_hash_add vnet_hash_add) >> +{ >> + struct virtio_net_hash *report; >> + struct virtio_net_hash ret; >> + u16 txq, index; >> + >> + if (!numqueues) >> + return 0; >> + >> + virtio_net_hash_rss(skb, hash->common.types, hash->rss_key, &ret); >> + >> + if (!ret.report) >> + return hash->rss.unclassified_queue % numqueues; >> + >> + if (hash->common.flags & TUN_VNET_HASH_REPORT) { >> + report = vnet_hash_add(skb); >> + if (report) >> + *report = ret; >> + } > > Is there a chance that we can reach here without TUN_VNET_HASH_REPORT? > If yes, it should be a bug. > >> + >> + index = ret.value & hash->rss.indirection_table_mask; >> + txq = READ_ONCE(hash->rss_indirection_table[index]); > > So vnet_hash is accessed via rcu_dereference(), I don't get any reason > we need READ_ONCE here, is this paired with something? If yes, let's > add a comment here. If rss_indirection_table need why > indirection_table_mask doesn't need this? 
> >> + >> + return txq % numqueues; >> +} >> + >> static inline int tun_vnet_hdr_get(int sz, unsigned int flags, >> struct iov_iter *from, >> struct virtio_net_hdr *hdr) >> @@ -135,15 +262,17 @@ static inline int tun_vnet_hdr_get(int sz, unsigned int flags, >> } >> >> static inline int tun_vnet_hdr_put(int sz, struct iov_iter *iter, >> - const struct virtio_net_hdr *hdr) >> + const struct virtio_net_hdr_v1_hash *hdr) >> { > > To be more robust, we can tweak the function to accept a vnet_hdr_len > parameter then we can avoid touching this every time when we need to > extend vnet hdr in the future? > >> + int content_sz = MIN(sizeof(*hdr), sz); >> + >> if (unlikely(iov_iter_count(iter) < sz)) >> return -EINVAL; >> >> - if (unlikely(copy_to_iter(hdr, sizeof(*hdr), iter) != sizeof(*hdr))) >> + if (unlikely(copy_to_iter(hdr, content_sz, iter) != content_sz)) >> return -EFAULT; >> >> - if (iov_iter_zero(sz - sizeof(*hdr), iter) != sz - sizeof(*hdr)) >> + if (iov_iter_zero(sz - content_sz, iter) != sz - content_sz) >> return -EFAULT; >> >> return 0; >> @@ -155,26 +284,38 @@ static inline int tun_vnet_hdr_to_skb(unsigned int flags, struct sk_buff *skb, >> return virtio_net_hdr_to_skb(skb, hdr, tun_vnet_is_little_endian(flags)); >> } >> >> -static inline int tun_vnet_hdr_from_skb(unsigned int flags, >> +static inline int tun_vnet_hdr_from_skb(int sz, unsigned int flags, >> const struct net_device *dev, >> const struct sk_buff *skb, >> - struct virtio_net_hdr *hdr) >> + tun_vnet_hash_find vnet_hash_find, >> + struct virtio_net_hdr_v1_hash *hdr) >> { >> int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0; >> + const struct virtio_net_hash *report = sz < sizeof(struct virtio_net_hdr_v1_hash) ? 
>> + NULL : vnet_hash_find(skb); >> + >> + *hdr = (struct virtio_net_hdr_v1_hash) { >> + .hash_report = VIRTIO_NET_HASH_REPORT_NONE >> + }; >> + >> + if (report) { >> + hdr->hash_value = cpu_to_le32(report->value); >> + hdr->hash_report = cpu_to_le16(report->report); >> + } >> >> - if (virtio_net_hdr_from_skb(skb, hdr, >> + if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)hdr, >> tun_vnet_is_little_endian(flags), true, >> vlan_hlen)) { >> struct skb_shared_info *sinfo = skb_shinfo(skb); >> >> if (net_ratelimit()) { >> netdev_err(dev, "unexpected GSO type: 0x%x, gso_size %d, hdr_len %d\n", >> - sinfo->gso_type, tun_vnet16_to_cpu(flags, hdr->gso_size), >> - tun_vnet16_to_cpu(flags, hdr->hdr_len)); >> + sinfo->gso_type, tun_vnet16_to_cpu(flags, hdr->hdr.gso_size), >> + tun_vnet16_to_cpu(flags, hdr->hdr.hdr_len)); >> print_hex_dump(KERN_ERR, "tun: ", >> DUMP_PREFIX_NONE, >> 16, 1, skb->head, >> - min(tun_vnet16_to_cpu(flags, hdr->hdr_len), 64), true); >> + min(tun_vnet16_to_cpu(flags, hdr->hdr.hdr_len), 64), true); >> } >> WARN_ON_ONCE(1); >> return -EINVAL; >> diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h >> index 553552fa635c3e1e53d1a63c203d32e4c4fd5a4f..7334c46a3f101675a0d4e5a036987cfe18842f9f 100644 >> --- a/include/linux/if_tap.h >> +++ b/include/linux/if_tap.h >> @@ -31,6 +31,7 @@ static inline struct ptr_ring *tap_get_ptr_ring(struct file *f) >> #define MAX_TAP_QUEUES 256 >> >> struct tap_queue; >> +struct tun_vnet_hash_container; >> >> struct tap_dev { >> struct net_device *dev; >> @@ -43,6 +44,7 @@ struct tap_dev { >> int numqueues; >> netdev_features_t tap_features; >> int minor; >> + struct tun_vnet_hash_container __rcu *vnet_hash; >> >> void (*update_features)(struct tap_dev *tap, netdev_features_t features); >> void (*count_tx_dropped)(struct tap_dev *tap); >> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h >> index bb2b751d274acff931281a72e8b4b0c699b4e8af..cdd793f1c360ad5f63fcc4cbf67d845f5e2ccf6f 100644 >> --- 
a/include/linux/skbuff.h >> +++ b/include/linux/skbuff.h >> @@ -4842,6 +4842,9 @@ enum skb_ext_id { >> #endif >> #if IS_ENABLED(CONFIG_MCTP_FLOWS) >> SKB_EXT_MCTP, >> +#endif >> +#if IS_ENABLED(CONFIG_TUN) >> + SKB_EXT_TUN_VNET_HASH, >> #endif >> SKB_EXT_NUM, /* must be last */ >> }; >> diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h >> index 287cdc81c9390c289a30545aa7ed23d81c3329d3..4887f97500a870c7ef3c96a5837b2d0a5a225040 100644 >> --- a/include/uapi/linux/if_tun.h >> +++ b/include/uapi/linux/if_tun.h >> @@ -62,6 +62,42 @@ >> #define TUNSETCARRIER _IOW('T', 226, int) >> #define TUNGETDEVNETNS _IO('T', 227) >> >> +/** >> + * define TUNGETVNETHASHCAP - ioctl to get virtio_net hashing capability. >> + * >> + * The argument is a pointer to &struct tun_vnet_hash which will store the >> + * maximal virtio_net hashing configuration. >> + */ >> +#define TUNGETVNETHASHCAP _IOR('T', 228, struct tun_vnet_hash) >> + >> +/** >> + * define TUNSETVNETHASH - ioctl to configure virtio_net hashing >> + * >> + * The argument is a pointer to &struct tun_vnet_hash. >> + * >> + * The argument is a pointer to the compound of the following in order if >> + * %TUN_VNET_HASH_RSS is set: >> + * >> + * 1. &struct tun_vnet_hash >> + * 2. &struct tun_vnet_hash_rss >> + * 3. Indirection table >> + * 4. Key >> + * >> + * The %TUN_VNET_HASH_REPORT flag set with this ioctl will be effective only >> + * after calling the %TUNSETVNETHDRSZ ioctl with a number greater than or equal >> + * to the size of &struct virtio_net_hdr_v1_hash. > > So you had a dependency check already for vnet hdr len. I'd still > suggest to split this into rss and hash as they are separated > features. Then we can use separate data structure for them instead of > a container struct. > >> + * >> + * The members added to the legacy header by %TUN_VNET_HASH_REPORT flag will >> + * always be little-endian. >> + * >> + * This ioctl results in %EBADFD if the underlying device is deleted. 
It affects >> + * all queues attached to the same device. >> + * >> + * This ioctl currently has no effect on XDP packets and packets with >> + * queue_mapping set by TC. >> + */ >> +#define TUNSETVNETHASH _IOW('T', 229, struct tun_vnet_hash) >> + >> /* TUNSETIFF ifr flags */ >> #define IFF_TUN 0x0001 >> #define IFF_TAP 0x0002 >> @@ -115,4 +151,43 @@ struct tun_filter { >> __u8 addr[][ETH_ALEN]; >> }; >> >> +/** >> + * define TUN_VNET_HASH_REPORT - Request virtio_net hash reporting for vhost >> + */ >> +#define TUN_VNET_HASH_REPORT 0x0001 >> + >> +/** >> + * define TUN_VNET_HASH_RSS - Request virtio_net RSS >> + * >> + * This is mutually exclusive with eBPF steering program. >> + */ >> +#define TUN_VNET_HASH_RSS 0x0002 >> + >> +/** >> + * struct tun_vnet_hash - virtio_net hashing configuration >> + * @flags: >> + * Bitmask consists of %TUN_VNET_HASH_REPORT and %TUN_VNET_HASH_RSS >> + * @pad: >> + * Should be filled with zero before passing to %TUNSETVNETHASH >> + * @types: >> + * Bitmask of allowed hash types >> + */ >> +struct tun_vnet_hash { >> + __u16 flags; >> + __u8 pad[2]; >> + __u32 types; >> +}; > > Padding in the middle of the structure is not elegant. Any reason for this? > > And hash->types seems never used. 
> >> + >> +/** >> + * struct tun_vnet_hash_rss - virtio_net RSS configuration >> + * @indirection_table_mask: >> + * Bitmask to be applied to the indirection table index >> + * @unclassified_queue: >> + * The index of the queue to place unclassified packets in >> + */ >> +struct tun_vnet_hash_rss { >> + __u16 indirection_table_mask; >> + __u16 unclassified_queue; >> +}; >> + >> #endif /* _UAPI__IF_TUN_H */ >> diff --git a/net/core/skbuff.c b/net/core/skbuff.c >> index 7b03b64fdcb276f68ce881d1d8da8e4c6b897efc..aa2a091b649f0c9d6e0196f34f345ba78b5498fb 100644 >> --- a/net/core/skbuff.c >> +++ b/net/core/skbuff.c >> @@ -64,6 +64,7 @@ >> #include <linux/mpls.h> >> #include <linux/kcov.h> >> #include <linux/iov_iter.h> >> +#include <linux/virtio_net.h> >> >> #include <net/protocol.h> >> #include <net/dst.h> >> @@ -4969,6 +4970,9 @@ static const u8 skb_ext_type_len[] = { >> #if IS_ENABLED(CONFIG_MCTP_FLOWS) >> [SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow), >> #endif >> +#if IS_ENABLED(CONFIG_TUN) >> + [SKB_EXT_TUN_VNET_HASH] = SKB_EXT_CHUNKSIZEOF(struct virtio_net_hash), >> +#endif >> }; >> >> static __always_inline unsigned int skb_ext_total_length(void) >> >> -- >> 2.48.1 >> > > Thanks >
On 2025/03/10 12:55, Jason Wang wrote: > On Fri, Mar 7, 2025 at 7:01 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >> >> Hash reporting >> ============== >> >> Allow the guest to reuse the hash value to make receive steering >> consistent between the host and guest, and to save hash computation. >> >> RSS >> === >> >> RSS is a receive steering algorithm that can be negotiated to use with >> virtio_net. Conventionally the hash calculation was done by the VMM. >> However, computing the hash after the queue was chosen defeats the >> purpose of RSS. >> >> Another approach is to use eBPF steering program. This approach has >> another downside: it cannot report the calculated hash due to the >> restrictive nature of eBPF steering program. >> >> Introduce the code to perform RSS to the kernel in order to overcome >> these challenges. An alternative solution is to extend the eBPF steering >> program so that it will be able to report to the userspace, but I didn't >> opt for it because extending the current mechanism of eBPF steering >> program as is would rely on legacy context rewriting, and >> introducing kfunc-based eBPF will result in non-UAPI dependency while >> the other relevant virtualization APIs such as KVM and vhost_net are >> UAPIs. 
>> >> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com> >> Tested-by: Lei Yang <leiyang@redhat.com> >> --- >> Documentation/networking/tuntap.rst | 7 ++ >> drivers/net/Kconfig | 1 + >> drivers/net/tap.c | 68 ++++++++++++++- >> drivers/net/tun.c | 98 +++++++++++++++++----- >> drivers/net/tun_vnet.h | 159 ++++++++++++++++++++++++++++++++++-- >> include/linux/if_tap.h | 2 + >> include/linux/skbuff.h | 3 + >> include/uapi/linux/if_tun.h | 75 +++++++++++++++++ >> net/core/skbuff.c | 4 + >> 9 files changed, 386 insertions(+), 31 deletions(-) >> >> diff --git a/Documentation/networking/tuntap.rst b/Documentation/networking/tuntap.rst >> index 4d7087f727be5e37dfbf5066a9e9c872cc98898d..86b4ae8caa8ad062c1e558920be42ce0d4217465 100644 >> --- a/Documentation/networking/tuntap.rst >> +++ b/Documentation/networking/tuntap.rst >> @@ -206,6 +206,13 @@ enable is true we enable it, otherwise we disable it:: >> return ioctl(fd, TUNSETQUEUE, (void *)&ifr); >> } >> >> +3.4 Reference >> +------------- >> + >> +``linux/if_tun.h`` defines the interface described below: >> + >> +.. kernel-doc:: include/uapi/linux/if_tun.h >> + >> Universal TUN/TAP device driver Frequently Asked Question >> ========================================================= >> >> diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig >> index 1fd5acdc73c6af0e1a861867039c3624fc618e25..aecfd244dd83585fea2c5b815dcd787c58166c28 100644 >> --- a/drivers/net/Kconfig >> +++ b/drivers/net/Kconfig >> @@ -395,6 +395,7 @@ config TUN >> tristate "Universal TUN/TAP device driver support" >> depends on INET >> select CRC32 >> + select SKB_EXTENSIONS >> help >> TUN/TAP provides packet reception and transmission for user space >> programs. 
It can be viewed as a simple Point-to-Point or Ethernet >> diff --git a/drivers/net/tap.c b/drivers/net/tap.c >> index d4ece538f1b23789ca60caa6232690e4d0a4d14a..9428b63ec27e7f92e78a78afcb5e24383862c00d 100644 >> --- a/drivers/net/tap.c >> +++ b/drivers/net/tap.c >> @@ -49,6 +49,10 @@ struct major_info { >> struct list_head next; >> }; >> >> +struct tap_skb_cb { >> + struct virtio_net_hash hash; >> +}; >> + >> #define GOODCOPY_LEN 128 >> >> static const struct proto_ops tap_socket_ops; >> @@ -179,6 +183,22 @@ static void tap_put_queue(struct tap_queue *q) >> sock_put(&q->sk); >> } >> >> +static struct tap_skb_cb *tap_skb_cb(const struct sk_buff *skb) >> +{ >> + BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct tap_skb_cb)); >> + return (struct tap_skb_cb *)skb->cb; >> +} >> + >> +static struct virtio_net_hash *tap_add_hash(struct sk_buff *skb) >> +{ >> + return &tap_skb_cb(skb)->hash; >> +} >> + >> +static const struct virtio_net_hash *tap_find_hash(const struct sk_buff *skb) >> +{ >> + return &tap_skb_cb(skb)->hash; >> +} >> + >> /* >> * Select a queue based on the rxq of the device on which this packet >> * arrived. If the incoming device is not mq, calculate a flow hash >> @@ -189,6 +209,7 @@ static void tap_put_queue(struct tap_queue *q) >> static struct tap_queue *tap_get_queue(struct tap_dev *tap, >> struct sk_buff *skb) >> { >> + struct flow_keys_basic keys_basic; >> struct tap_queue *queue = NULL; >> /* Access to taps array is protected by rcu, but access to numvtaps >> * isn't. Below we use it to lookup a queue, but treat it as a hint >> @@ -196,17 +217,47 @@ static struct tap_queue *tap_get_queue(struct tap_dev *tap, >> * racing against queue removal. 
>> */ >> int numvtaps = READ_ONCE(tap->numvtaps); >> + struct tun_vnet_hash_container *vnet_hash = rcu_dereference(tap->vnet_hash); >> __u32 rxq; >> >> + *tap_skb_cb(skb) = (struct tap_skb_cb) { >> + .hash = { .report = VIRTIO_NET_HASH_REPORT_NONE } >> + }; >> + >> if (!numvtaps) >> goto out; >> >> if (numvtaps == 1) >> goto single; >> >> + if (vnet_hash) { >> + if ((vnet_hash->common.flags & TUN_VNET_HASH_RSS)) { >> + rxq = tun_vnet_rss_select_queue(numvtaps, vnet_hash, skb, tap_add_hash); >> + queue = rcu_dereference(tap->taps[rxq]); >> + goto out; >> + } >> + >> + if (!skb->l4_hash && !skb->sw_hash) { >> + struct flow_keys keys; >> + >> + skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); >> + rxq = flow_hash_from_keys(&keys); >> + keys_basic = (struct flow_keys_basic) { >> + .control = keys.control, >> + .basic = keys.basic >> + }; >> + } else { >> + skb_flow_dissect_flow_keys_basic(NULL, skb, &keys_basic, NULL, 0, 0, 0, >> + FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); >> + rxq = skb->hash; >> + } >> + } else { >> + rxq = skb_get_hash(skb); >> + } >> + >> /* Check if we can use flow to select a queue */ >> - rxq = skb_get_hash(skb); >> if (rxq) { >> + tun_vnet_hash_report(vnet_hash, skb, &keys_basic, rxq, tap_add_hash); >> queue = rcu_dereference(tap->taps[rxq % numvtaps]); >> goto out; >> } >> @@ -711,11 +762,12 @@ static ssize_t tap_put_user(struct tap_queue *q, >> int total; >> >> if (q->flags & IFF_VNET_HDR) { >> - struct virtio_net_hdr vnet_hdr; >> + struct virtio_net_hdr_v1_hash vnet_hdr; >> >> vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz); >> >> - ret = tun_vnet_hdr_from_skb(q->flags, NULL, skb, &vnet_hdr); >> + ret = tun_vnet_hdr_from_skb(vnet_hdr_len, q->flags, NULL, skb, >> + tap_find_hash, &vnet_hdr); >> if (ret) >> return ret; >> >> @@ -992,6 +1044,16 @@ static long tap_ioctl(struct file *file, unsigned int cmd, >> rtnl_unlock(); >> return ret; >> >> + case TUNGETVNETHASHCAP: >> + return tun_vnet_ioctl_gethashcap(argp); >> + >> + case 
TUNSETVNETHASH: >> + rtnl_lock(); >> + tap = rtnl_dereference(q->tap); >> + ret = tap ? tun_vnet_ioctl_sethash(&tap->vnet_hash, true, argp) : -EBADFD; >> + rtnl_unlock(); >> + return ret; >> + >> case SIOCGIFHWADDR: >> rtnl_lock(); >> tap = tap_get_tap_dev(q); >> diff --git a/drivers/net/tun.c b/drivers/net/tun.c >> index d8f4d3e996a7a81d1f8b04635054081671a14f07..520013df416e93d3a50b46be9b53ae9ab410eab4 100644 >> --- a/drivers/net/tun.c >> +++ b/drivers/net/tun.c >> @@ -209,6 +209,7 @@ struct tun_struct { >> struct bpf_prog __rcu *xdp_prog; >> struct tun_prog __rcu *steering_prog; >> struct tun_prog __rcu *filter_prog; >> + struct tun_vnet_hash_container __rcu *vnet_hash; >> struct ethtool_link_ksettings link_ksettings; >> /* init args */ >> struct file *file; >> @@ -451,20 +452,37 @@ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash) >> e->rps_rxhash = hash; >> } >> >> +static struct virtio_net_hash *tun_add_hash(struct sk_buff *skb) >> +{ >> + return skb_ext_add(skb, SKB_EXT_TUN_VNET_HASH); >> +} >> + >> +static const struct virtio_net_hash *tun_find_hash(const struct sk_buff *skb) >> +{ >> + return skb_ext_find(skb, SKB_EXT_TUN_VNET_HASH); >> +} >> + >> /* We try to identify a flow through its rxhash. The reason that >> * we do not check rxq no. is because some cards(e.g 82599), chooses >> * the rxq based on the txq where the last packet of the flow comes. As >> * the userspace application move between processors, we may get a >> * different rxq no. here. 
>> */ >> -static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb) >> +static u16 tun_automq_select_queue(struct tun_struct *tun, >> + const struct tun_vnet_hash_container *vnet_hash, >> + struct sk_buff *skb) >> { >> + struct flow_keys keys; >> + struct flow_keys_basic keys_basic; >> struct tun_flow_entry *e; >> u32 txq, numqueues; >> >> numqueues = READ_ONCE(tun->numqueues); >> >> - txq = __skb_get_hash_symmetric(skb); >> + memset(&keys, 0, sizeof(keys)); >> + skb_flow_dissect(skb, &flow_keys_dissector_symmetric, &keys, 0); >> + >> + txq = flow_hash_from_keys(&keys); >> e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq); >> if (e) { >> tun_flow_save_rps_rxhash(e, txq); >> @@ -473,6 +491,13 @@ static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb) >> txq = reciprocal_scale(txq, numqueues); >> } >> >> + keys_basic = (struct flow_keys_basic) { >> + .control = keys.control, >> + .basic = keys.basic >> + }; >> + tun_vnet_hash_report(vnet_hash, skb, &keys_basic, skb->l4_hash ? 
skb->hash : txq, >> + tun_add_hash); >> + >> return txq; >> } >> >> @@ -500,10 +525,17 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, >> u16 ret; >> >> rcu_read_lock(); >> - if (rcu_dereference(tun->steering_prog)) >> + if (rcu_dereference(tun->steering_prog)) { >> ret = tun_ebpf_select_queue(tun, skb); >> - else >> - ret = tun_automq_select_queue(tun, skb); >> + } else { >> + struct tun_vnet_hash_container *vnet_hash = rcu_dereference(tun->vnet_hash); >> + >> + if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS)) >> + ret = tun_vnet_rss_select_queue(READ_ONCE(tun->numqueues), vnet_hash, >> + skb, tun_add_hash); >> + else >> + ret = tun_automq_select_queue(tun, vnet_hash, skb); >> + } >> rcu_read_unlock(); >> >> return ret; >> @@ -1987,7 +2019,7 @@ static ssize_t tun_put_user_xdp(struct tun_struct *tun, >> ssize_t ret; >> >> if (tun->flags & IFF_VNET_HDR) { >> - struct virtio_net_hdr gso = { 0 }; >> + struct virtio_net_hdr_v1_hash gso = { 0 }; >> >> vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); >> ret = tun_vnet_hdr_put(vnet_hdr_sz, iter, &gso); >> @@ -2040,9 +2072,10 @@ static ssize_t tun_put_user(struct tun_struct *tun, >> } >> >> if (vnet_hdr_sz) { >> - struct virtio_net_hdr gso; >> + struct virtio_net_hdr_v1_hash gso; >> >> - ret = tun_vnet_hdr_from_skb(tun->flags, tun->dev, skb, &gso); >> + ret = tun_vnet_hdr_from_skb(vnet_hdr_sz, tun->flags, tun->dev, >> + skb, tun_find_hash, &gso); >> if (ret) >> return ret; >> >> @@ -2223,6 +2256,7 @@ static void tun_free_netdev(struct net_device *dev) >> security_tun_dev_free_security(tun->security); >> __tun_set_ebpf(tun, &tun->steering_prog, NULL); >> __tun_set_ebpf(tun, &tun->filter_prog, NULL); >> + kfree_rcu_mightsleep(rcu_access_pointer(tun->vnet_hash)); >> } >> >> static void tun_setup(struct net_device *dev) >> @@ -2921,13 +2955,9 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr) >> } >> >> static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu 
**prog_p, >> - void __user *data) >> + int fd) >> { >> struct bpf_prog *prog; >> - int fd; >> - >> - if (copy_from_user(&fd, data, sizeof(fd))) >> - return -EFAULT; >> >> if (fd == -1) { >> prog = NULL; >> @@ -2993,7 +3023,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >> int ifindex; >> int sndbuf; >> int ret; >> + int fd; >> bool do_notify = false; >> + struct tun_vnet_hash_container *vnet_hash; >> >> if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || >> (_IOC_TYPE(cmd) == SOCK_IOC_TYPE && cmd != SIOCGSKNS)) { >> @@ -3020,7 +3052,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >> rtnl_lock(); >> >> tun = tun_get(tfile); >> - if (cmd == TUNSETIFF) { >> + switch (cmd) { >> + case TUNSETIFF: >> ret = -EEXIST; >> if (tun) >> goto unlock; >> @@ -3035,8 +3068,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >> if (copy_to_user(argp, &ifr, ifreq_len)) >> ret = -EFAULT; >> goto unlock; >> - } >> - if (cmd == TUNSETIFINDEX) { >> + >> + case TUNSETIFINDEX: >> ret = -EPERM; >> if (tun) >> goto unlock; >> @@ -3050,6 +3083,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >> ret = 0; >> tfile->ifindex = ifindex; >> goto unlock; >> + >> + case TUNGETVNETHASHCAP: >> + ret = tun_vnet_ioctl_gethashcap(argp); >> + goto unlock; >> } >> >> ret = -EBADFD; >> @@ -3230,11 +3267,27 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >> break; >> >> case TUNSETSTEERINGEBPF: >> - ret = tun_set_ebpf(tun, &tun->steering_prog, argp); >> + if (get_user(fd, (int __user *)argp)) { >> + ret = -EFAULT; >> + break; >> + } >> + >> + vnet_hash = rtnl_dereference(tun->vnet_hash); >> + if (fd != -1 && vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS)) { >> + ret = -EBUSY; >> + break; >> + } >> + >> + ret = tun_set_ebpf(tun, &tun->steering_prog, fd); >> break; >> >> case TUNSETFILTEREBPF: >> - ret = tun_set_ebpf(tun, &tun->filter_prog, argp); >> + if (get_user(fd, (int __user *)argp)) { >> + ret = 
-EFAULT; >> + break; >> + } >> + >> + ret = tun_set_ebpf(tun, &tun->filter_prog, fd); >> break; >> >> case TUNSETCARRIER: >> @@ -3252,8 +3305,15 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >> ret = open_related_ns(&net->ns, get_net_ns); >> break; >> >> + case TUNSETVNETHASH: >> + ret = tun_vnet_ioctl_sethash(&tun->vnet_hash, >> + !rtnl_dereference(tun->steering_prog), >> + argp); >> + break; >> + >> default: >> - ret = tun_vnet_ioctl(&tun->vnet_hdr_sz, &tun->flags, cmd, argp); >> + ret = tun_vnet_ioctl(&tun->vnet_hdr_sz, &tun->flags, >> + cmd, argp); >> break; >> } >> >> diff --git a/drivers/net/tun_vnet.h b/drivers/net/tun_vnet.h >> index 58b9ac7a5fc4084c789fe94fe36b5f8631bf1fa4..8e7d51fb0b4742cef56e7c5ad778b156cc654bed 100644 >> --- a/drivers/net/tun_vnet.h >> +++ b/drivers/net/tun_vnet.h >> @@ -6,6 +6,16 @@ >> #define TUN_VNET_LE 0x80000000 >> #define TUN_VNET_BE 0x40000000 >> >> +typedef struct virtio_net_hash *(*tun_vnet_hash_add)(struct sk_buff *); >> +typedef const struct virtio_net_hash *(*tun_vnet_hash_find)(const struct sk_buff *); >> + >> +struct tun_vnet_hash_container { >> + struct tun_vnet_hash common; > > I'd rename this as hash. > >> + struct tun_vnet_hash_rss rss; >> + u32 rss_key[VIRTIO_NET_RSS_MAX_KEY_SIZE]; >> + u16 rss_indirection_table[]; >> +}; > > Besides the separated ioctl, I'd split this structure into rss and > hash part as well. > >> + >> static inline bool tun_vnet_legacy_is_little_endian(unsigned int flags) >> { >> bool be = IS_ENABLED(CONFIG_TUN_VNET_CROSS_LE) && >> @@ -107,6 +117,123 @@ static inline long tun_vnet_ioctl(int *vnet_hdr_sz, unsigned int *flags, >> } >> } >> >> +static inline long tun_vnet_ioctl_gethashcap(void __user *argp) >> +{ >> + static const struct tun_vnet_hash cap = { >> + .flags = TUN_VNET_HASH_REPORT | TUN_VNET_HASH_RSS, >> + .types = VIRTIO_NET_SUPPORTED_HASH_TYPES >> + }; >> + >> + return copy_to_user(argp, &cap, sizeof(cap)) ? 
-EFAULT : 0; > > Let's have a consistent name for this and the uapi to be consistent > with TUNSETIFF/TUNGETIFF. Probably TUNSETVNETHASH and > tun_vnet_ioctl_gethash(). > >> +} >> + >> +static inline long tun_vnet_ioctl_sethash(struct tun_vnet_hash_container __rcu **hashp, >> + bool can_rss, void __user *argp) > > So again, can_rss seems to be tricky. Looking at its caller, it tries > to make eBPF and RSS mutually exclusive. I still don't understand why > we need this. Allowing an eBPF program to override some of the path seems to > be common practice. > > What's more, we didn't try (or even can't) to make automq and eBPF to > be mutually exclusive. So I still didn't see what we gain from this > and it complicates the code and may lead to ambiguous uAPI/behaviour. > >> +{ >> + struct tun_vnet_hash hash_buf; >> + struct tun_vnet_hash_container *hash; >> + >> + if (copy_from_user(&hash_buf, argp, sizeof(hash_buf))) >> + return -EFAULT; >> + argp = (struct tun_vnet_hash __user *)argp + 1; >> + >> + if (hash_buf.flags & TUN_VNET_HASH_RSS) { >> + struct tun_vnet_hash_rss rss; >> + size_t indirection_table_size; >> + size_t key_size; >> + size_t size; >> + >> + if (!can_rss) >> + return -EBUSY; >> + >> + if (copy_from_user(&rss, argp, sizeof(rss))) >> + return -EFAULT; >> + argp = (struct tun_vnet_hash_rss __user *)argp + 1; >> + >> + indirection_table_size = ((size_t)rss.indirection_table_mask + 1) * 2; >> + key_size = virtio_net_hash_key_length(hash_buf.types); >> + size = struct_size(hash, rss_indirection_table, >> + (size_t)rss.indirection_table_mask + 1); >> + >> + hash = kmalloc(size, GFP_KERNEL); >> + if (!hash) >> + return -ENOMEM; >> + >> + if (copy_from_user(hash->rss_indirection_table, >> + argp, indirection_table_size)) { >> + kfree(hash); >> + return -EFAULT; >> + } >> + argp = (u16 __user *)argp + rss.indirection_table_mask + 1; >> + >> + if (copy_from_user(hash->rss_key, argp, key_size)) { >> + kfree(hash); >> + return -EFAULT; >> + } >> + >> + 
virtio_net_toeplitz_convert_key(hash->rss_key, key_size); >> + hash->rss = rss; >> + } else { >> + hash = kmalloc(sizeof(hash->common), GFP_KERNEL); >> + if (!hash) >> + return -ENOMEM; > > Do we need to validate the hash here (at least against the types we supported?) Sorry for repeatedly sending emails; I find comments I have not addressed each time I read this email. This ioctl does not check unknown bits in the flags, pad, and types. Please tell me if you want to have them rejected instead. Existing ioctls behave differently in such a situation. TUNSETOFFLOAD rejects unknown bits so that userspace can learn which features are unsupported. On the other hand, TUNSETIFF ignores unknown bits, and provides the TUNGETFEATURES ioctl for feature detection. This patch follows the example of TUNSETIFF; TUNSETVNETHASH ignores unknown bits, and TUNGETVNETHASHCAP allows feature detection. > >> + } >> + >> + hash->common = hash_buf; >> + kfree_rcu_mightsleep(rcu_replace_pointer_rtnl(*hashp, hash)); > > I still didn't understand the trick here. E.g we use very simple > primitives in synchronizing ebpf program through RCU in > __tun_set_ebpf(). > >> + return 0; >> +} >> + >> +static void tun_vnet_hash_report(const struct tun_vnet_hash_container *hash, >> + struct sk_buff *skb, >> + const struct flow_keys_basic *keys, >> + u32 value, >> + tun_vnet_hash_add vnet_hash_add) >> +{ >> + struct virtio_net_hash *report; >> + >> + if (!hash || !(hash->common.flags & TUN_VNET_HASH_REPORT)) >> + return; >> + >> + report = vnet_hash_add(skb); >> + if (!report) >> + return; >> + >> + *report = (struct virtio_net_hash) { >> + .report = virtio_net_hash_report(hash->common.types, keys), >> + .value = value >> + }; > > What's the advantage of using Designated Initializers here? Simple > assignment can save two lines of code. 
> >> +} >> + >> +static u16 tun_vnet_rss_select_queue(u32 numqueues, >> + const struct tun_vnet_hash_container *hash, >> + struct sk_buff *skb, >> + tun_vnet_hash_add vnet_hash_add) >> +{ >> + struct virtio_net_hash *report; >> + struct virtio_net_hash ret; >> + u16 txq, index; >> + >> + if (!numqueues) >> + return 0; >> + >> + virtio_net_hash_rss(skb, hash->common.types, hash->rss_key, &ret); >> + >> + if (!ret.report) >> + return hash->rss.unclassified_queue % numqueues; >> + >> + if (hash->common.flags & TUN_VNET_HASH_REPORT) { >> + report = vnet_hash_add(skb); >> + if (report) >> + *report = ret; >> + } > > Is there a chance that we can reach here without TUN_VNET_HASH_REPORT? > If yes, it should be a bug. > >> + >> + index = ret.value & hash->rss.indirection_table_mask; >> + txq = READ_ONCE(hash->rss_indirection_table[index]); > > So vnet_hash is accessed via rcu_dereference(), I don't get any reason > we need READ_ONCE here, is this paired with something? If yes, let's > add a comment here. If rss_indirection_table need why > indirection_table_mask doesn't need this? > >> + >> + return txq % numqueues; >> +} >> + >> static inline int tun_vnet_hdr_get(int sz, unsigned int flags, >> struct iov_iter *from, >> struct virtio_net_hdr *hdr) >> @@ -135,15 +262,17 @@ static inline int tun_vnet_hdr_get(int sz, unsigned int flags, >> } >> >> static inline int tun_vnet_hdr_put(int sz, struct iov_iter *iter, >> - const struct virtio_net_hdr *hdr) >> + const struct virtio_net_hdr_v1_hash *hdr) >> { > > To be more robust, we can tweak the function to accept a vnet_hdr_len > parameter then we can avoid touching this every time when we need to > extend vnet hdr in the future? 
> >> + int content_sz = MIN(sizeof(*hdr), sz); >> + >> if (unlikely(iov_iter_count(iter) < sz)) >> return -EINVAL; >> >> - if (unlikely(copy_to_iter(hdr, sizeof(*hdr), iter) != sizeof(*hdr))) >> + if (unlikely(copy_to_iter(hdr, content_sz, iter) != content_sz)) >> return -EFAULT; >> >> - if (iov_iter_zero(sz - sizeof(*hdr), iter) != sz - sizeof(*hdr)) >> + if (iov_iter_zero(sz - content_sz, iter) != sz - content_sz) >> return -EFAULT; >> >> return 0; >> @@ -155,26 +284,38 @@ static inline int tun_vnet_hdr_to_skb(unsigned int flags, struct sk_buff *skb, >> return virtio_net_hdr_to_skb(skb, hdr, tun_vnet_is_little_endian(flags)); >> } >> >> -static inline int tun_vnet_hdr_from_skb(unsigned int flags, >> +static inline int tun_vnet_hdr_from_skb(int sz, unsigned int flags, >> const struct net_device *dev, >> const struct sk_buff *skb, >> - struct virtio_net_hdr *hdr) >> + tun_vnet_hash_find vnet_hash_find, >> + struct virtio_net_hdr_v1_hash *hdr) >> { >> int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0; >> + const struct virtio_net_hash *report = sz < sizeof(struct virtio_net_hdr_v1_hash) ? 
>> + NULL : vnet_hash_find(skb); >> + >> + *hdr = (struct virtio_net_hdr_v1_hash) { >> + .hash_report = VIRTIO_NET_HASH_REPORT_NONE >> + }; >> + >> + if (report) { >> + hdr->hash_value = cpu_to_le32(report->value); >> + hdr->hash_report = cpu_to_le16(report->report); >> + } >> >> - if (virtio_net_hdr_from_skb(skb, hdr, >> + if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)hdr, >> tun_vnet_is_little_endian(flags), true, >> vlan_hlen)) { >> struct skb_shared_info *sinfo = skb_shinfo(skb); >> >> if (net_ratelimit()) { >> netdev_err(dev, "unexpected GSO type: 0x%x, gso_size %d, hdr_len %d\n", >> - sinfo->gso_type, tun_vnet16_to_cpu(flags, hdr->gso_size), >> - tun_vnet16_to_cpu(flags, hdr->hdr_len)); >> + sinfo->gso_type, tun_vnet16_to_cpu(flags, hdr->hdr.gso_size), >> + tun_vnet16_to_cpu(flags, hdr->hdr.hdr_len)); >> print_hex_dump(KERN_ERR, "tun: ", >> DUMP_PREFIX_NONE, >> 16, 1, skb->head, >> - min(tun_vnet16_to_cpu(flags, hdr->hdr_len), 64), true); >> + min(tun_vnet16_to_cpu(flags, hdr->hdr.hdr_len), 64), true); >> } >> WARN_ON_ONCE(1); >> return -EINVAL; >> diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h >> index 553552fa635c3e1e53d1a63c203d32e4c4fd5a4f..7334c46a3f101675a0d4e5a036987cfe18842f9f 100644 >> --- a/include/linux/if_tap.h >> +++ b/include/linux/if_tap.h >> @@ -31,6 +31,7 @@ static inline struct ptr_ring *tap_get_ptr_ring(struct file *f) >> #define MAX_TAP_QUEUES 256 >> >> struct tap_queue; >> +struct tun_vnet_hash_container; >> >> struct tap_dev { >> struct net_device *dev; >> @@ -43,6 +44,7 @@ struct tap_dev { >> int numqueues; >> netdev_features_t tap_features; >> int minor; >> + struct tun_vnet_hash_container __rcu *vnet_hash; >> >> void (*update_features)(struct tap_dev *tap, netdev_features_t features); >> void (*count_tx_dropped)(struct tap_dev *tap); >> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h >> index bb2b751d274acff931281a72e8b4b0c699b4e8af..cdd793f1c360ad5f63fcc4cbf67d845f5e2ccf6f 100644 >> --- 
a/include/linux/skbuff.h >> +++ b/include/linux/skbuff.h >> @@ -4842,6 +4842,9 @@ enum skb_ext_id { >> #endif >> #if IS_ENABLED(CONFIG_MCTP_FLOWS) >> SKB_EXT_MCTP, >> +#endif >> +#if IS_ENABLED(CONFIG_TUN) >> + SKB_EXT_TUN_VNET_HASH, >> #endif >> SKB_EXT_NUM, /* must be last */ >> }; >> diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h >> index 287cdc81c9390c289a30545aa7ed23d81c3329d3..4887f97500a870c7ef3c96a5837b2d0a5a225040 100644 >> --- a/include/uapi/linux/if_tun.h >> +++ b/include/uapi/linux/if_tun.h >> @@ -62,6 +62,42 @@ >> #define TUNSETCARRIER _IOW('T', 226, int) >> #define TUNGETDEVNETNS _IO('T', 227) >> >> +/** >> + * define TUNGETVNETHASHCAP - ioctl to get virtio_net hashing capability. >> + * >> + * The argument is a pointer to &struct tun_vnet_hash which will store the >> + * maximal virtio_net hashing configuration. >> + */ >> +#define TUNGETVNETHASHCAP _IOR('T', 228, struct tun_vnet_hash) >> + >> +/** >> + * define TUNSETVNETHASH - ioctl to configure virtio_net hashing >> + * >> + * The argument is a pointer to &struct tun_vnet_hash. >> + * >> + * The argument is a pointer to the compound of the following in order if >> + * %TUN_VNET_HASH_RSS is set: >> + * >> + * 1. &struct tun_vnet_hash >> + * 2. &struct tun_vnet_hash_rss >> + * 3. Indirection table >> + * 4. Key >> + * >> + * The %TUN_VNET_HASH_REPORT flag set with this ioctl will be effective only >> + * after calling the %TUNSETVNETHDRSZ ioctl with a number greater than or equal >> + * to the size of &struct virtio_net_hdr_v1_hash. > > So you had a dependency check already for vnet hdr len. I'd still > suggest to split this into rss and hash as they are separated > features. Then we can use separate data structure for them instead of > a container struct. > >> + * >> + * The members added to the legacy header by %TUN_VNET_HASH_REPORT flag will >> + * always be little-endian. >> + * >> + * This ioctl results in %EBADFD if the underlying device is deleted. 
It affects >> + * all queues attached to the same device. >> + * >> + * This ioctl currently has no effect on XDP packets and packets with >> + * queue_mapping set by TC. >> + */ >> +#define TUNSETVNETHASH _IOW('T', 229, struct tun_vnet_hash) >> + >> /* TUNSETIFF ifr flags */ >> #define IFF_TUN 0x0001 >> #define IFF_TAP 0x0002 >> @@ -115,4 +151,43 @@ struct tun_filter { >> __u8 addr[][ETH_ALEN]; >> }; >> >> +/** >> + * define TUN_VNET_HASH_REPORT - Request virtio_net hash reporting for vhost >> + */ >> +#define TUN_VNET_HASH_REPORT 0x0001 >> + >> +/** >> + * define TUN_VNET_HASH_RSS - Request virtio_net RSS >> + * >> + * This is mutually exclusive with eBPF steering program. >> + */ >> +#define TUN_VNET_HASH_RSS 0x0002 >> + >> +/** >> + * struct tun_vnet_hash - virtio_net hashing configuration >> + * @flags: >> + * Bitmask consists of %TUN_VNET_HASH_REPORT and %TUN_VNET_HASH_RSS >> + * @pad: >> + * Should be filled with zero before passing to %TUNSETVNETHASH >> + * @types: >> + * Bitmask of allowed hash types >> + */ >> +struct tun_vnet_hash { >> + __u16 flags; >> + __u8 pad[2]; >> + __u32 types; >> +}; > > Padding in the middle of the structure is not elegant. Any reason for this? > > And hash->types seems never used. 
> >> + >> +/** >> + * struct tun_vnet_hash_rss - virtio_net RSS configuration >> + * @indirection_table_mask: >> + * Bitmask to be applied to the indirection table index >> + * @unclassified_queue: >> + * The index of the queue to place unclassified packets in >> + */ >> +struct tun_vnet_hash_rss { >> + __u16 indirection_table_mask; >> + __u16 unclassified_queue; >> +}; >> + >> #endif /* _UAPI__IF_TUN_H */ >> diff --git a/net/core/skbuff.c b/net/core/skbuff.c >> index 7b03b64fdcb276f68ce881d1d8da8e4c6b897efc..aa2a091b649f0c9d6e0196f34f345ba78b5498fb 100644 >> --- a/net/core/skbuff.c >> +++ b/net/core/skbuff.c >> @@ -64,6 +64,7 @@ >> #include <linux/mpls.h> >> #include <linux/kcov.h> >> #include <linux/iov_iter.h> >> +#include <linux/virtio_net.h> >> >> #include <net/protocol.h> >> #include <net/dst.h> >> @@ -4969,6 +4970,9 @@ static const u8 skb_ext_type_len[] = { >> #if IS_ENABLED(CONFIG_MCTP_FLOWS) >> [SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow), >> #endif >> +#if IS_ENABLED(CONFIG_TUN) >> + [SKB_EXT_TUN_VNET_HASH] = SKB_EXT_CHUNKSIZEOF(struct virtio_net_hash), >> +#endif >> }; >> >> static __always_inline unsigned int skb_ext_total_length(void) >> >> -- >> 2.48.1 >> > > Thanks >
On 2025/03/10 13:01, Jason Wang wrote: > On Mon, Mar 10, 2025 at 11:55 AM Jason Wang <jasowang@redhat.com> wrote: >> >> On Fri, Mar 7, 2025 at 7:01 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>> >>> Hash reporting >>> ============== >>> >>> Allow the guest to reuse the hash value to make receive steering >>> consistent between the host and guest, and to save hash computation. >>> >>> RSS >>> === >>> >>> RSS is a receive steering algorithm that can be negotiated to use with >>> virtio_net. Conventionally the hash calculation was done by the VMM. >>> However, computing the hash after the queue was chosen defeats the >>> purpose of RSS. >>> >>> Another approach is to use eBPF steering program. This approach has >>> another downside: it cannot report the calculated hash due to the >>> restrictive nature of eBPF steering program. >>> >>> Introduce the code to perform RSS to the kernel in order to overcome >>> thse challenges. An alternative solution is to extend the eBPF steering >>> program so that it will be able to report to the userspace, but I didn't >>> opt for it because extending the current mechanism of eBPF steering >>> program as is because it relies on legacy context rewriting, and >>> introducing kfunc-based eBPF will result in non-UAPI dependency while >>> the other relevant virtualization APIs such as KVM and vhost_net are >>> UAPIs. >>> >>> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com> >>> Tested-by: Lei Yang <leiyang@redhat.com> >>> --- >>> Documentation/networking/tuntap.rst | 7 ++ >>> drivers/net/Kconfig | 1 + >>> drivers/net/tap.c | 68 ++++++++++++++- >>> drivers/net/tun.c | 98 +++++++++++++++++----- >>> drivers/net/tun_vnet.h | 159 ++++++++++++++++++++++++++++++++++-- >>> include/linux/if_tap.h | 2 + >>> include/linux/skbuff.h | 3 + >>> include/uapi/linux/if_tun.h | 75 +++++++++++++++++ >>> net/core/skbuff.c | 4 + >>> 9 files changed, 386 insertions(+), 31 deletions(-) > > [...] 
> >>> + * >>> + * The %TUN_VNET_HASH_REPORT flag set with this ioctl will be effective only >>> + * after calling the %TUNSETVNETHDRSZ ioctl with a number greater than or equal >>> + * to the size of &struct virtio_net_hdr_v1_hash. >> >> So you had a dependency check already for vnet hdr len. I'd still >> suggest to split this into rss and hash as they are separated >> features. Then we can use separate data structure for them instead of >> a container struct. >> > > Besides this, I think we still need to add new bits to TUNGETIFF to > let userspace know about the new ability. The userspace can perform TUNGETVNETHASHCAP and see if it results in EINVAL. Regards, Akihiko Odaki > > Thanks >
On Mon, Mar 10, 2025 at 3:45 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > > On 2025/03/10 12:55, Jason Wang wrote: > > On Fri, Mar 7, 2025 at 7:01 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >> > >> Hash reporting > >> ============== > >> > >> Allow the guest to reuse the hash value to make receive steering > >> consistent between the host and guest, and to save hash computation. > >> > >> RSS > >> === > >> > >> RSS is a receive steering algorithm that can be negotiated to use with > >> virtio_net. Conventionally the hash calculation was done by the VMM. > >> However, computing the hash after the queue was chosen defeats the > >> purpose of RSS. > >> > >> Another approach is to use eBPF steering program. This approach has > >> another downside: it cannot report the calculated hash due to the > >> restrictive nature of eBPF steering program. > >> > >> Introduce the code to perform RSS to the kernel in order to overcome > >> thse challenges. An alternative solution is to extend the eBPF steering > >> program so that it will be able to report to the userspace, but I didn't > >> opt for it because extending the current mechanism of eBPF steering > >> program as is because it relies on legacy context rewriting, and > >> introducing kfunc-based eBPF will result in non-UAPI dependency while > >> the other relevant virtualization APIs such as KVM and vhost_net are > >> UAPIs. 
> >> > >> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com> > >> Tested-by: Lei Yang <leiyang@redhat.com> > >> --- > >> Documentation/networking/tuntap.rst | 7 ++ > >> drivers/net/Kconfig | 1 + > >> drivers/net/tap.c | 68 ++++++++++++++- > >> drivers/net/tun.c | 98 +++++++++++++++++----- > >> drivers/net/tun_vnet.h | 159 ++++++++++++++++++++++++++++++++++-- > >> include/linux/if_tap.h | 2 + > >> include/linux/skbuff.h | 3 + > >> include/uapi/linux/if_tun.h | 75 +++++++++++++++++ > >> net/core/skbuff.c | 4 + > >> 9 files changed, 386 insertions(+), 31 deletions(-) > >> > >> diff --git a/Documentation/networking/tuntap.rst b/Documentation/networking/tuntap.rst > >> index 4d7087f727be5e37dfbf5066a9e9c872cc98898d..86b4ae8caa8ad062c1e558920be42ce0d4217465 100644 > >> --- a/Documentation/networking/tuntap.rst > >> +++ b/Documentation/networking/tuntap.rst > >> @@ -206,6 +206,13 @@ enable is true we enable it, otherwise we disable it:: > >> return ioctl(fd, TUNSETQUEUE, (void *)&ifr); > >> } > >> > >> +3.4 Reference > >> +------------- > >> + > >> +``linux/if_tun.h`` defines the interface described below: > >> + > >> +.. kernel-doc:: include/uapi/linux/if_tun.h > >> + > >> Universal TUN/TAP device driver Frequently Asked Question > >> ========================================================= > >> > >> diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig > >> index 1fd5acdc73c6af0e1a861867039c3624fc618e25..aecfd244dd83585fea2c5b815dcd787c58166c28 100644 > >> --- a/drivers/net/Kconfig > >> +++ b/drivers/net/Kconfig > >> @@ -395,6 +395,7 @@ config TUN > >> tristate "Universal TUN/TAP device driver support" > >> depends on INET > >> select CRC32 > >> + select SKB_EXTENSIONS > >> help > >> TUN/TAP provides packet reception and transmission for user space > >> programs. 
It can be viewed as a simple Point-to-Point or Ethernet > >> diff --git a/drivers/net/tap.c b/drivers/net/tap.c > >> index d4ece538f1b23789ca60caa6232690e4d0a4d14a..9428b63ec27e7f92e78a78afcb5e24383862c00d 100644 > >> --- a/drivers/net/tap.c > >> +++ b/drivers/net/tap.c > >> @@ -49,6 +49,10 @@ struct major_info { > >> struct list_head next; > >> }; > >> > >> +struct tap_skb_cb { > >> + struct virtio_net_hash hash; > >> +}; > >> + > >> #define GOODCOPY_LEN 128 > >> > >> static const struct proto_ops tap_socket_ops; > >> @@ -179,6 +183,22 @@ static void tap_put_queue(struct tap_queue *q) > >> sock_put(&q->sk); > >> } > >> > >> +static struct tap_skb_cb *tap_skb_cb(const struct sk_buff *skb) > >> +{ > >> + BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct tap_skb_cb)); > >> + return (struct tap_skb_cb *)skb->cb; > >> +} > >> + > >> +static struct virtio_net_hash *tap_add_hash(struct sk_buff *skb) > >> +{ > >> + return &tap_skb_cb(skb)->hash; > >> +} > >> + > >> +static const struct virtio_net_hash *tap_find_hash(const struct sk_buff *skb) > >> +{ > >> + return &tap_skb_cb(skb)->hash; > >> +} > >> + > >> /* > >> * Select a queue based on the rxq of the device on which this packet > >> * arrived. If the incoming device is not mq, calculate a flow hash > >> @@ -189,6 +209,7 @@ static void tap_put_queue(struct tap_queue *q) > >> static struct tap_queue *tap_get_queue(struct tap_dev *tap, > >> struct sk_buff *skb) > >> { > >> + struct flow_keys_basic keys_basic; > >> struct tap_queue *queue = NULL; > >> /* Access to taps array is protected by rcu, but access to numvtaps > >> * isn't. Below we use it to lookup a queue, but treat it as a hint > >> @@ -196,17 +217,47 @@ static struct tap_queue *tap_get_queue(struct tap_dev *tap, > >> * racing against queue removal. 
> >> */ > >> int numvtaps = READ_ONCE(tap->numvtaps); > >> + struct tun_vnet_hash_container *vnet_hash = rcu_dereference(tap->vnet_hash); > >> __u32 rxq; > >> > >> + *tap_skb_cb(skb) = (struct tap_skb_cb) { > >> + .hash = { .report = VIRTIO_NET_HASH_REPORT_NONE } > >> + }; > >> + > >> if (!numvtaps) > >> goto out; > >> > >> if (numvtaps == 1) > >> goto single; > >> > >> + if (vnet_hash) { > >> + if ((vnet_hash->common.flags & TUN_VNET_HASH_RSS)) { > >> + rxq = tun_vnet_rss_select_queue(numvtaps, vnet_hash, skb, tap_add_hash); > >> + queue = rcu_dereference(tap->taps[rxq]); > >> + goto out; > >> + } > >> + > >> + if (!skb->l4_hash && !skb->sw_hash) { > >> + struct flow_keys keys; > >> + > >> + skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); > >> + rxq = flow_hash_from_keys(&keys); > >> + keys_basic = (struct flow_keys_basic) { > >> + .control = keys.control, > >> + .basic = keys.basic > >> + }; > >> + } else { > >> + skb_flow_dissect_flow_keys_basic(NULL, skb, &keys_basic, NULL, 0, 0, 0, > >> + FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); > >> + rxq = skb->hash; > >> + } > >> + } else { > >> + rxq = skb_get_hash(skb); > >> + } > >> + > >> /* Check if we can use flow to select a queue */ > >> - rxq = skb_get_hash(skb); > >> if (rxq) { > >> + tun_vnet_hash_report(vnet_hash, skb, &keys_basic, rxq, tap_add_hash); > >> queue = rcu_dereference(tap->taps[rxq % numvtaps]); > >> goto out; > >> } > >> @@ -711,11 +762,12 @@ static ssize_t tap_put_user(struct tap_queue *q, > >> int total; > >> > >> if (q->flags & IFF_VNET_HDR) { > >> - struct virtio_net_hdr vnet_hdr; > >> + struct virtio_net_hdr_v1_hash vnet_hdr; > >> > >> vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz); > >> > >> - ret = tun_vnet_hdr_from_skb(q->flags, NULL, skb, &vnet_hdr); > >> + ret = tun_vnet_hdr_from_skb(vnet_hdr_len, q->flags, NULL, skb, > >> + tap_find_hash, &vnet_hdr); > >> if (ret) > >> return ret; > >> > >> @@ -992,6 +1044,16 @@ static long tap_ioctl(struct file *file, unsigned int 
cmd, > >> rtnl_unlock(); > >> return ret; > >> > >> + case TUNGETVNETHASHCAP: > >> + return tun_vnet_ioctl_gethashcap(argp); > >> + > >> + case TUNSETVNETHASH: > >> + rtnl_lock(); > >> + tap = rtnl_dereference(q->tap); > >> + ret = tap ? tun_vnet_ioctl_sethash(&tap->vnet_hash, true, argp) : -EBADFD; > >> + rtnl_unlock(); > >> + return ret; > >> + > >> case SIOCGIFHWADDR: > >> rtnl_lock(); > >> tap = tap_get_tap_dev(q); > >> diff --git a/drivers/net/tun.c b/drivers/net/tun.c > >> index d8f4d3e996a7a81d1f8b04635054081671a14f07..520013df416e93d3a50b46be9b53ae9ab410eab4 100644 > >> --- a/drivers/net/tun.c > >> +++ b/drivers/net/tun.c > >> @@ -209,6 +209,7 @@ struct tun_struct { > >> struct bpf_prog __rcu *xdp_prog; > >> struct tun_prog __rcu *steering_prog; > >> struct tun_prog __rcu *filter_prog; > >> + struct tun_vnet_hash_container __rcu *vnet_hash; > >> struct ethtool_link_ksettings link_ksettings; > >> /* init args */ > >> struct file *file; > >> @@ -451,20 +452,37 @@ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash) > >> e->rps_rxhash = hash; > >> } > >> > >> +static struct virtio_net_hash *tun_add_hash(struct sk_buff *skb) > >> +{ > >> + return skb_ext_add(skb, SKB_EXT_TUN_VNET_HASH); > >> +} > >> + > >> +static const struct virtio_net_hash *tun_find_hash(const struct sk_buff *skb) > >> +{ > >> + return skb_ext_find(skb, SKB_EXT_TUN_VNET_HASH); > >> +} > >> + > >> /* We try to identify a flow through its rxhash. The reason that > >> * we do not check rxq no. is because some cards(e.g 82599), chooses > >> * the rxq based on the txq where the last packet of the flow comes. As > >> * the userspace application move between processors, we may get a > >> * different rxq no. here. 
> >> */ > >> -static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb) > >> +static u16 tun_automq_select_queue(struct tun_struct *tun, > >> + const struct tun_vnet_hash_container *vnet_hash, > >> + struct sk_buff *skb) > >> { > >> + struct flow_keys keys; > >> + struct flow_keys_basic keys_basic; > >> struct tun_flow_entry *e; > >> u32 txq, numqueues; > >> > >> numqueues = READ_ONCE(tun->numqueues); > >> > >> - txq = __skb_get_hash_symmetric(skb); > >> + memset(&keys, 0, sizeof(keys)); > >> + skb_flow_dissect(skb, &flow_keys_dissector_symmetric, &keys, 0); > >> + > >> + txq = flow_hash_from_keys(&keys); > >> e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq); > >> if (e) { > >> tun_flow_save_rps_rxhash(e, txq); > >> @@ -473,6 +491,13 @@ static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb) > >> txq = reciprocal_scale(txq, numqueues); > >> } > >> > >> + keys_basic = (struct flow_keys_basic) { > >> + .control = keys.control, > >> + .basic = keys.basic > >> + }; > >> + tun_vnet_hash_report(vnet_hash, skb, &keys_basic, skb->l4_hash ? 
skb->hash : txq, > >> + tun_add_hash); > >> + > >> return txq; > >> } > >> > >> @@ -500,10 +525,17 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, > >> u16 ret; > >> > >> rcu_read_lock(); > >> - if (rcu_dereference(tun->steering_prog)) > >> + if (rcu_dereference(tun->steering_prog)) { > >> ret = tun_ebpf_select_queue(tun, skb); > >> - else > >> - ret = tun_automq_select_queue(tun, skb); > >> + } else { > >> + struct tun_vnet_hash_container *vnet_hash = rcu_dereference(tun->vnet_hash); > >> + > >> + if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS)) > >> + ret = tun_vnet_rss_select_queue(READ_ONCE(tun->numqueues), vnet_hash, > >> + skb, tun_add_hash); > >> + else > >> + ret = tun_automq_select_queue(tun, vnet_hash, skb); > >> + } > >> rcu_read_unlock(); > >> > >> return ret; > >> @@ -1987,7 +2019,7 @@ static ssize_t tun_put_user_xdp(struct tun_struct *tun, > >> ssize_t ret; > >> > >> if (tun->flags & IFF_VNET_HDR) { > >> - struct virtio_net_hdr gso = { 0 }; > >> + struct virtio_net_hdr_v1_hash gso = { 0 }; > >> > >> vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); > >> ret = tun_vnet_hdr_put(vnet_hdr_sz, iter, &gso); > >> @@ -2040,9 +2072,10 @@ static ssize_t tun_put_user(struct tun_struct *tun, > >> } > >> > >> if (vnet_hdr_sz) { > >> - struct virtio_net_hdr gso; > >> + struct virtio_net_hdr_v1_hash gso; > >> > >> - ret = tun_vnet_hdr_from_skb(tun->flags, tun->dev, skb, &gso); > >> + ret = tun_vnet_hdr_from_skb(vnet_hdr_sz, tun->flags, tun->dev, > >> + skb, tun_find_hash, &gso); > >> if (ret) > >> return ret; > >> > >> @@ -2223,6 +2256,7 @@ static void tun_free_netdev(struct net_device *dev) > >> security_tun_dev_free_security(tun->security); > >> __tun_set_ebpf(tun, &tun->steering_prog, NULL); > >> __tun_set_ebpf(tun, &tun->filter_prog, NULL); > >> + kfree_rcu_mightsleep(rcu_access_pointer(tun->vnet_hash)); > >> } > >> > >> static void tun_setup(struct net_device *dev) > >> @@ -2921,13 +2955,9 @@ static int tun_set_queue(struct 
file *file, struct ifreq *ifr) > >> } > >> > >> static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p, > >> - void __user *data) > >> + int fd) > >> { > >> struct bpf_prog *prog; > >> - int fd; > >> - > >> - if (copy_from_user(&fd, data, sizeof(fd))) > >> - return -EFAULT; > >> > >> if (fd == -1) { > >> prog = NULL; > >> @@ -2993,7 +3023,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, > >> int ifindex; > >> int sndbuf; > >> int ret; > >> + int fd; > >> bool do_notify = false; > >> + struct tun_vnet_hash_container *vnet_hash; > >> > >> if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || > >> (_IOC_TYPE(cmd) == SOCK_IOC_TYPE && cmd != SIOCGSKNS)) { > >> @@ -3020,7 +3052,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, > >> rtnl_lock(); > >> > >> tun = tun_get(tfile); > >> - if (cmd == TUNSETIFF) { > >> + switch (cmd) { > >> + case TUNSETIFF: > >> ret = -EEXIST; > >> if (tun) > >> goto unlock; > >> @@ -3035,8 +3068,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, > >> if (copy_to_user(argp, &ifr, ifreq_len)) > >> ret = -EFAULT; > >> goto unlock; > >> - } > >> - if (cmd == TUNSETIFINDEX) { > >> + > >> + case TUNSETIFINDEX: > >> ret = -EPERM; > >> if (tun) > >> goto unlock; > >> @@ -3050,6 +3083,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, > >> ret = 0; > >> tfile->ifindex = ifindex; > >> goto unlock; > >> + > >> + case TUNGETVNETHASHCAP: > >> + ret = tun_vnet_ioctl_gethashcap(argp); > >> + goto unlock; > >> } > >> > >> ret = -EBADFD; > >> @@ -3230,11 +3267,27 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, > >> break; > >> > >> case TUNSETSTEERINGEBPF: > >> - ret = tun_set_ebpf(tun, &tun->steering_prog, argp); > >> + if (get_user(fd, (int __user *)argp)) { > >> + ret = -EFAULT; > >> + break; > >> + } > >> + > >> + vnet_hash = rtnl_dereference(tun->vnet_hash); > >> + if (fd != -1 && vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS)) { 
> >> + ret = -EBUSY; > >> + break; > >> + } > >> + > >> + ret = tun_set_ebpf(tun, &tun->steering_prog, fd); > >> break; > >> > >> case TUNSETFILTEREBPF: > >> - ret = tun_set_ebpf(tun, &tun->filter_prog, argp); > >> + if (get_user(fd, (int __user *)argp)) { > >> + ret = -EFAULT; > >> + break; > >> + } > >> + > >> + ret = tun_set_ebpf(tun, &tun->filter_prog, fd); > >> break; > >> > >> case TUNSETCARRIER: > >> @@ -3252,8 +3305,15 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, > >> ret = open_related_ns(&net->ns, get_net_ns); > >> break; > >> > >> + case TUNSETVNETHASH: > >> + ret = tun_vnet_ioctl_sethash(&tun->vnet_hash, > >> + !rtnl_dereference(tun->steering_prog), > >> + argp); > >> + break; > >> + > >> default: > >> - ret = tun_vnet_ioctl(&tun->vnet_hdr_sz, &tun->flags, cmd, argp); > >> + ret = tun_vnet_ioctl(&tun->vnet_hdr_sz, &tun->flags, > >> + cmd, argp); > >> break; > >> } > >> > >> diff --git a/drivers/net/tun_vnet.h b/drivers/net/tun_vnet.h > >> index 58b9ac7a5fc4084c789fe94fe36b5f8631bf1fa4..8e7d51fb0b4742cef56e7c5ad778b156cc654bed 100644 > >> --- a/drivers/net/tun_vnet.h > >> +++ b/drivers/net/tun_vnet.h > >> @@ -6,6 +6,16 @@ > >> #define TUN_VNET_LE 0x80000000 > >> #define TUN_VNET_BE 0x40000000 > >> > >> +typedef struct virtio_net_hash *(*tun_vnet_hash_add)(struct sk_buff *); > >> +typedef const struct virtio_net_hash *(*tun_vnet_hash_find)(const struct sk_buff *); > >> + > >> +struct tun_vnet_hash_container { > >> + struct tun_vnet_hash common; > > > > I'd rename this as hash. > > > >> + struct tun_vnet_hash_rss rss; > >> + u32 rss_key[VIRTIO_NET_RSS_MAX_KEY_SIZE]; > >> + u16 rss_indirection_table[]; > >> +}; > > > > Besides the separated ioctl, I'd split this structure into rss and > > hash part as well. 
> > > >> + > >> static inline bool tun_vnet_legacy_is_little_endian(unsigned int flags) > >> { > >> bool be = IS_ENABLED(CONFIG_TUN_VNET_CROSS_LE) && > >> @@ -107,6 +117,123 @@ static inline long tun_vnet_ioctl(int *vnet_hdr_sz, unsigned int *flags, > >> } > >> } > >> > >> +static inline long tun_vnet_ioctl_gethashcap(void __user *argp) > >> +{ > >> + static const struct tun_vnet_hash cap = { > >> + .flags = TUN_VNET_HASH_REPORT | TUN_VNET_HASH_RSS, > >> + .types = VIRTIO_NET_SUPPORTED_HASH_TYPES > >> + }; > >> + > >> + return copy_to_user(argp, &cap, sizeof(cap)) ? -EFAULT : 0; > > > > Let's has a consistent name for this and the uapi to be consistent > > with TUNSETIFF/TUNGETIFF. Probably TUNSETVNETHASH and > > tun_vnet_ioctl_gethash(). > > They have different semantics so they should have different names. > TUNGETIFF reports the value currently set while TUNGETVNETHASHCAP > reports the value that can be set later. I'm not sure I follow here. I meant a symmetric name TUNSETVNETHASH and TUNGETVNETHASH. > > > > >> +} > >> + > >> +static inline long tun_vnet_ioctl_sethash(struct tun_vnet_hash_container __rcu **hashp, > >> + bool can_rss, void __user *argp) > > > > So again, can_rss seems to be tricky. Looking at its caller, it tires > > to make eBPF and RSS mutually exclusive. I still don't understand why > > we need this. Allow eBPF program to override some of the path seems to > > be common practice. > > > > What's more, we didn't try (or even can't) to make automq and eBPF to > > be mutually exclusive. So I still didn't see what we gain from this > > and it complicates the codes and may lead to ambiguous uAPI/behaviour. > > automq and eBPF are mutually exclusive; automq is disabled when an eBPF > steering program is set so I followed the example here. I meant from the view of uAPI, the kernel doesn't or can't reject eBPF while using automq. > > We don't even have an interface for eBPF to let it fall back to another > alogirhtm.
It doesn't even need this, e.g XDP overrides the default receiving path. > I could make it fall back to RSS if the eBPF steeering > program is designed to fall back to automq when it returns e.g., -1. But > such an interface is currently not defined and defining one is out of > scope of this patch series. Just to make sure we are on the same page, I meant we just need to make the behaviour consistent: allow eBPF to override the behaviour of both automq and rss. > > > > >> +{ > >> + struct tun_vnet_hash hash_buf; > >> + struct tun_vnet_hash_container *hash; > >> + > >> + if (copy_from_user(&hash_buf, argp, sizeof(hash_buf))) > >> + return -EFAULT; > >> + argp = (struct tun_vnet_hash __user *)argp + 1; > >> + > >> + if (hash_buf.flags & TUN_VNET_HASH_RSS) { > >> + struct tun_vnet_hash_rss rss; > >> + size_t indirection_table_size; > >> + size_t key_size; > >> + size_t size; > >> + > >> + if (!can_rss) > >> + return -EBUSY; > >> + > >> + if (copy_from_user(&rss, argp, sizeof(rss))) > >> + return -EFAULT; > >> + argp = (struct tun_vnet_hash_rss __user *)argp + 1; > >> + > >> + indirection_table_size = ((size_t)rss.indirection_table_mask + 1) * 2; > >> + key_size = virtio_net_hash_key_length(hash_buf.types); > >> + size = struct_size(hash, rss_indirection_table, > >> + (size_t)rss.indirection_table_mask + 1); > >> + > >> + hash = kmalloc(size, GFP_KERNEL); > >> + if (!hash) > >> + return -ENOMEM; > >> + > >> + if (copy_from_user(hash->rss_indirection_table, > >> + argp, indirection_table_size)) { > >> + kfree(hash); > >> + return -EFAULT; > >> + } > >> + argp = (u16 __user *)argp + rss.indirection_table_mask + 1; > >> + > >> + if (copy_from_user(hash->rss_key, argp, key_size)) { > >> + kfree(hash); > >> + return -EFAULT; > >> + } > >> + > >> + virtio_net_toeplitz_convert_key(hash->rss_key, key_size); > >> + hash->rss = rss; > >> + } else { > >> + hash = kmalloc(sizeof(hash->common), GFP_KERNEL); > >> + if (!hash) > >> + return -ENOMEM; > > > > Do we need to validate the 
hash here (at least against the types we supported?) > > > >> + } > >> + > >> + hash->common = hash_buf; > >> + kfree_rcu_mightsleep(rcu_replace_pointer_rtnl(*hashp, hash)); > > > > I still didn't understand the trick here. E.g we use very simple > > primitives in synchronizing ebpf program through RCU in > > __tun_set_ebpf(). > > It is even simpler than __tun_set_ebpf(). The differences from > __tun_set_ebpf() are: > 1. This uses the rtnl lock instead of TUN-specific one. It makes the > code simpler as the rtnl lock is already taken in __tun_chr_ioctl(). It can be tweaked to use rtnl as well. > 2. This does not add rcu_head and uses blocking APIs for simplicity. Right. > > > > >> + return 0; > >> +} > >> + > >> +static void tun_vnet_hash_report(const struct tun_vnet_hash_container *hash, > >> + struct sk_buff *skb, > >> + const struct flow_keys_basic *keys, > >> + u32 value, > >> + tun_vnet_hash_add vnet_hash_add) > >> +{ > >> + struct virtio_net_hash *report; > >> + > >> + if (!hash || !(hash->common.flags & TUN_VNET_HASH_REPORT)) > >> + return; > >> + > >> + report = vnet_hash_add(skb); > >> + if (!report) > >> + return; > >> + > >> + *report = (struct virtio_net_hash) { > >> + .report = virtio_net_hash_report(hash->common.types, keys), > >> + .value = value > >> + }; > > > > What's the advantage of using Designated Initializers here? Simple > > assignment can save two lines of code. > > It automatically fills other fileds with zero. Simple assignments will > need more tokens for zeroing. Ok. 
> > > > >> +} > >> + > >> +static u16 tun_vnet_rss_select_queue(u32 numqueues, > >> + const struct tun_vnet_hash_container *hash, > >> + struct sk_buff *skb, > >> + tun_vnet_hash_add vnet_hash_add) > >> +{ > >> + struct virtio_net_hash *report; > >> + struct virtio_net_hash ret; > >> + u16 txq, index; > >> + > >> + if (!numqueues) > >> + return 0; > >> + > >> + virtio_net_hash_rss(skb, hash->common.types, hash->rss_key, &ret); > >> + > >> + if (!ret.report) > >> + return hash->rss.unclassified_queue % numqueues; > >> + > >> + if (hash->common.flags & TUN_VNET_HASH_REPORT) { > >> + report = vnet_hash_add(skb); > >> + if (report) > >> + *report = ret; > >> + } > > > > Is there a chance that we can reach here without TUN_VNET_HASH_REPORT? > > If yes, it should be a bug. > > It is possible to use RSS without TUN_VNET_HASH_REPORT. Another call to separate the ioctls then. > It is more of a > feature instead of a bug; it behaves like QEMU's eBPF program but > requires no privilege and is more optimized with native code and ffs(). > > > > >> + > >> + index = ret.value & hash->rss.indirection_table_mask; > >> + txq = READ_ONCE(hash->rss_indirection_table[index]); > > > > So vnet_hash is accessed via rcu_dereference(), I don't get any reason > > we need READ_ONCE here, is this paired with something? If yes, let's > > add a comment here. If rss_indirection_table need why > > indirection_table_mask doesn't need this? > > I'll drop it. I think it's just a left-over of previous versions without > RCU. 
> > > > >> + > >> + return txq % numqueues; > >> +} > >> + > >> static inline int tun_vnet_hdr_get(int sz, unsigned int flags, > >> struct iov_iter *from, > >> struct virtio_net_hdr *hdr) > >> @@ -135,15 +262,17 @@ static inline int tun_vnet_hdr_get(int sz, unsigned int flags, > >> } > >> > >> static inline int tun_vnet_hdr_put(int sz, struct iov_iter *iter, > >> - const struct virtio_net_hdr *hdr) > >> + const struct virtio_net_hdr_v1_hash *hdr) > >> { > > > > To be more robust, we can tweak the function to accept a vnet_hdr_len > > parameter then we can avoid touching this every time when we need to > > extend vnet hdr in the future? > > I think you meant vnet_hdr_sz instead of vnet_hdr_len. It is already > passed just as "sz" here as the function name already says it's about > the header. > > It is possible to add another parameter for sizeof(*hdr) and convert the > hdr parameter to void * to avoid future changes. But I rather keep it as > is because the current form ensures the hdr is large enough and > statically avoids buffer overrun. Right. 
> > > > >> + int content_sz = MIN(sizeof(*hdr), sz); > >> + > >> if (unlikely(iov_iter_count(iter) < sz)) > >> return -EINVAL; > >> > >> - if (unlikely(copy_to_iter(hdr, sizeof(*hdr), iter) != sizeof(*hdr))) > >> + if (unlikely(copy_to_iter(hdr, content_sz, iter) != content_sz)) > >> return -EFAULT; > >> > >> - if (iov_iter_zero(sz - sizeof(*hdr), iter) != sz - sizeof(*hdr)) > >> + if (iov_iter_zero(sz - content_sz, iter) != sz - content_sz) > >> return -EFAULT; > >> > >> return 0; > >> @@ -155,26 +284,38 @@ static inline int tun_vnet_hdr_to_skb(unsigned int flags, struct sk_buff *skb, > >> return virtio_net_hdr_to_skb(skb, hdr, tun_vnet_is_little_endian(flags)); > >> } > >> > >> -static inline int tun_vnet_hdr_from_skb(unsigned int flags, > >> +static inline int tun_vnet_hdr_from_skb(int sz, unsigned int flags, > >> const struct net_device *dev, > >> const struct sk_buff *skb, > >> - struct virtio_net_hdr *hdr) > >> + tun_vnet_hash_find vnet_hash_find, > >> + struct virtio_net_hdr_v1_hash *hdr) > >> { > >> int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0; > >> + const struct virtio_net_hash *report = sz < sizeof(struct virtio_net_hdr_v1_hash) ? 
> >> + NULL : vnet_hash_find(skb); > >> + > >> + *hdr = (struct virtio_net_hdr_v1_hash) { > >> + .hash_report = VIRTIO_NET_HASH_REPORT_NONE > >> + }; > >> + > >> + if (report) { > >> + hdr->hash_value = cpu_to_le32(report->value); > >> + hdr->hash_report = cpu_to_le16(report->report); > >> + } > >> > >> - if (virtio_net_hdr_from_skb(skb, hdr, > >> + if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)hdr, > >> tun_vnet_is_little_endian(flags), true, > >> vlan_hlen)) { > >> struct skb_shared_info *sinfo = skb_shinfo(skb); > >> > >> if (net_ratelimit()) { > >> netdev_err(dev, "unexpected GSO type: 0x%x, gso_size %d, hdr_len %d\n", > >> - sinfo->gso_type, tun_vnet16_to_cpu(flags, hdr->gso_size), > >> - tun_vnet16_to_cpu(flags, hdr->hdr_len)); > >> + sinfo->gso_type, tun_vnet16_to_cpu(flags, hdr->hdr.gso_size), > >> + tun_vnet16_to_cpu(flags, hdr->hdr.hdr_len)); > >> print_hex_dump(KERN_ERR, "tun: ", > >> DUMP_PREFIX_NONE, > >> 16, 1, skb->head, > >> - min(tun_vnet16_to_cpu(flags, hdr->hdr_len), 64), true); > >> + min(tun_vnet16_to_cpu(flags, hdr->hdr.hdr_len), 64), true); > >> } > >> WARN_ON_ONCE(1); > >> return -EINVAL; > >> diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h > >> index 553552fa635c3e1e53d1a63c203d32e4c4fd5a4f..7334c46a3f101675a0d4e5a036987cfe18842f9f 100644 > >> --- a/include/linux/if_tap.h > >> +++ b/include/linux/if_tap.h > >> @@ -31,6 +31,7 @@ static inline struct ptr_ring *tap_get_ptr_ring(struct file *f) > >> #define MAX_TAP_QUEUES 256 > >> > >> struct tap_queue; > >> +struct tun_vnet_hash_container; > >> > >> struct tap_dev { > >> struct net_device *dev; > >> @@ -43,6 +44,7 @@ struct tap_dev { > >> int numqueues; > >> netdev_features_t tap_features; > >> int minor; > >> + struct tun_vnet_hash_container __rcu *vnet_hash; > >> > >> void (*update_features)(struct tap_dev *tap, netdev_features_t features); > >> void (*count_tx_dropped)(struct tap_dev *tap); > >> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h > >> index 
bb2b751d274acff931281a72e8b4b0c699b4e8af..cdd793f1c360ad5f63fcc4cbf67d845f5e2ccf6f 100644 > >> --- a/include/linux/skbuff.h > >> +++ b/include/linux/skbuff.h > >> @@ -4842,6 +4842,9 @@ enum skb_ext_id { > >> #endif > >> #if IS_ENABLED(CONFIG_MCTP_FLOWS) > >> SKB_EXT_MCTP, > >> +#endif > >> +#if IS_ENABLED(CONFIG_TUN) > >> + SKB_EXT_TUN_VNET_HASH, > >> #endif > >> SKB_EXT_NUM, /* must be last */ > >> }; > >> diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h > >> index 287cdc81c9390c289a30545aa7ed23d81c3329d3..4887f97500a870c7ef3c96a5837b2d0a5a225040 100644 > >> --- a/include/uapi/linux/if_tun.h > >> +++ b/include/uapi/linux/if_tun.h > >> @@ -62,6 +62,42 @@ > >> #define TUNSETCARRIER _IOW('T', 226, int) > >> #define TUNGETDEVNETNS _IO('T', 227) > >> > >> +/** > >> + * define TUNGETVNETHASHCAP - ioctl to get virtio_net hashing capability. > >> + * > >> + * The argument is a pointer to &struct tun_vnet_hash which will store the > >> + * maximal virtio_net hashing configuration. > >> + */ > >> +#define TUNGETVNETHASHCAP _IOR('T', 228, struct tun_vnet_hash) > >> + > >> +/** > >> + * define TUNSETVNETHASH - ioctl to configure virtio_net hashing > >> + * > >> + * The argument is a pointer to &struct tun_vnet_hash. > >> + * > >> + * The argument is a pointer to the compound of the following in order if > >> + * %TUN_VNET_HASH_RSS is set: > >> + * > >> + * 1. &struct tun_vnet_hash > >> + * 2. &struct tun_vnet_hash_rss > >> + * 3. Indirection table > >> + * 4. Key > >> + * > >> + * The %TUN_VNET_HASH_REPORT flag set with this ioctl will be effective only > >> + * after calling the %TUNSETVNETHDRSZ ioctl with a number greater than or equal > >> + * to the size of &struct virtio_net_hdr_v1_hash. > > > > So you had a dependency check already for vnet hdr len. I'd still > > suggest to split this into rss and hash as they are separated > > features. Then we can use separate data structure for them instead of > > a container struct. 
> > I added a dependency check and found it is complicating the code and > requires additional tests. I need a reason to justify the complexity if > we are going to split it. As we discussed above: They don't depend on each other. > > Regards, > Akihiko Odaki > > > > >> + * > >> + * The members added to the legacy header by %TUN_VNET_HASH_REPORT flag will > >> + * always be little-endian. > >> + * > >> + * This ioctl results in %EBADFD if the underlying device is deleted. It affects > >> + * all queues attached to the same device. > >> + * > >> + * This ioctl currently has no effect on XDP packets and packets with > >> + * queue_mapping set by TC. > >> + */ > >> +#define TUNSETVNETHASH _IOW('T', 229, struct tun_vnet_hash) > >> + > >> /* TUNSETIFF ifr flags */ > >> #define IFF_TUN 0x0001 > >> #define IFF_TAP 0x0002 > >> @@ -115,4 +151,43 @@ struct tun_filter { > >> __u8 addr[][ETH_ALEN]; > >> }; > >> > >> +/** > >> + * define TUN_VNET_HASH_REPORT - Request virtio_net hash reporting for vhost > >> + */ > >> +#define TUN_VNET_HASH_REPORT 0x0001 > >> + > >> +/** > >> + * define TUN_VNET_HASH_RSS - Request virtio_net RSS > >> + * > >> + * This is mutually exclusive with eBPF steering program. > >> + */ > >> +#define TUN_VNET_HASH_RSS 0x0002 > >> + > >> +/** > >> + * struct tun_vnet_hash - virtio_net hashing configuration > >> + * @flags: > >> + * Bitmask consists of %TUN_VNET_HASH_REPORT and %TUN_VNET_HASH_RSS > >> + * @pad: > >> + * Should be filled with zero before passing to %TUNSETVNETHASH > >> + * @types: > >> + * Bitmask of allowed hash types > >> + */ > >> +struct tun_vnet_hash { > >> + __u16 flags; > >> + __u8 pad[2]; > >> + __u32 types; > >> +}; > > > > Padding in the middle of the structure is not elegant. Any reason for this? > > > > And hash->types seems never used. 
> > > >> + > >> +/** > >> + * struct tun_vnet_hash_rss - virtio_net RSS configuration > >> + * @indirection_table_mask: > >> + * Bitmask to be applied to the indirection table index > >> + * @unclassified_queue: > >> + * The index of the queue to place unclassified packets in > >> + */ > >> +struct tun_vnet_hash_rss { > >> + __u16 indirection_table_mask; > >> + __u16 unclassified_queue; > >> +}; > >> + > >> #endif /* _UAPI__IF_TUN_H */ > >> diff --git a/net/core/skbuff.c b/net/core/skbuff.c > >> index 7b03b64fdcb276f68ce881d1d8da8e4c6b897efc..aa2a091b649f0c9d6e0196f34f345ba78b5498fb 100644 > >> --- a/net/core/skbuff.c > >> +++ b/net/core/skbuff.c > >> @@ -64,6 +64,7 @@ > >> #include <linux/mpls.h> > >> #include <linux/kcov.h> > >> #include <linux/iov_iter.h> > >> +#include <linux/virtio_net.h> > >> > >> #include <net/protocol.h> > >> #include <net/dst.h> > >> @@ -4969,6 +4970,9 @@ static const u8 skb_ext_type_len[] = { > >> #if IS_ENABLED(CONFIG_MCTP_FLOWS) > >> [SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow), > >> #endif > >> +#if IS_ENABLED(CONFIG_TUN) > >> + [SKB_EXT_TUN_VNET_HASH] = SKB_EXT_CHUNKSIZEOF(struct virtio_net_hash), > >> +#endif > >> }; > >> > >> static __always_inline unsigned int skb_ext_total_length(void) > >> > >> -- > >> 2.48.1 > >> > > > > Thanks > > >
On Mon, Mar 10, 2025 at 3:59 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > > On 2025/03/10 12:55, Jason Wang wrote: > > On Fri, Mar 7, 2025 at 7:01 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >> > >> Hash reporting > >> ============== > >> > >> Allow the guest to reuse the hash value to make receive steering > >> consistent between the host and guest, and to save hash computation. > >> > >> RSS > >> === > >> > >> RSS is a receive steering algorithm that can be negotiated to use with > >> virtio_net. Conventionally the hash calculation was done by the VMM. > >> However, computing the hash after the queue was chosen defeats the > >> purpose of RSS. > >> > >> Another approach is to use eBPF steering program. This approach has > >> another downside: it cannot report the calculated hash due to the > >> restrictive nature of eBPF steering program. > >> > >> Introduce the code to perform RSS to the kernel in order to overcome > >> thse challenges. An alternative solution is to extend the eBPF steering > >> program so that it will be able to report to the userspace, but I didn't > >> opt for it because extending the current mechanism of eBPF steering > >> program as is because it relies on legacy context rewriting, and > >> introducing kfunc-based eBPF will result in non-UAPI dependency while > >> the other relevant virtualization APIs such as KVM and vhost_net are > >> UAPIs. 
> >> > >> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com> > >> Tested-by: Lei Yang <leiyang@redhat.com> > >> --- > >> Documentation/networking/tuntap.rst | 7 ++ > >> drivers/net/Kconfig | 1 + > >> drivers/net/tap.c | 68 ++++++++++++++- > >> drivers/net/tun.c | 98 +++++++++++++++++----- > >> drivers/net/tun_vnet.h | 159 ++++++++++++++++++++++++++++++++++-- > >> include/linux/if_tap.h | 2 + > >> include/linux/skbuff.h | 3 + > >> include/uapi/linux/if_tun.h | 75 +++++++++++++++++ > >> net/core/skbuff.c | 4 + > >> 9 files changed, 386 insertions(+), 31 deletions(-) > >> > >> diff --git a/Documentation/networking/tuntap.rst b/Documentation/networking/tuntap.rst > >> index 4d7087f727be5e37dfbf5066a9e9c872cc98898d..86b4ae8caa8ad062c1e558920be42ce0d4217465 100644 > >> --- a/Documentation/networking/tuntap.rst > >> +++ b/Documentation/networking/tuntap.rst > >> @@ -206,6 +206,13 @@ enable is true we enable it, otherwise we disable it:: > >> return ioctl(fd, TUNSETQUEUE, (void *)&ifr); > >> } > >> [...] > >> > >> diff --git a/drivers/net/tun_vnet.h b/drivers/net/tun_vnet.h > >> index 58b9ac7a5fc4084c789fe94fe36b5f8631bf1fa4..8e7d51fb0b4742cef56e7c5ad778b156cc654bed 100644 > >> --- a/drivers/net/tun_vnet.h > >> +++ b/drivers/net/tun_vnet.h > >> @@ -6,6 +6,16 @@ > >> #define TUN_VNET_LE 0x80000000 > >> #define TUN_VNET_BE 0x40000000 > >> > >> +typedef struct virtio_net_hash *(*tun_vnet_hash_add)(struct sk_buff *); > >> +typedef const struct virtio_net_hash *(*tun_vnet_hash_find)(const struct sk_buff *); > >> + > >> +struct tun_vnet_hash_container { > >> + struct tun_vnet_hash common; > > > > I'd rename this as hash. > > Everything in this structure is about hash. "common" represents its > feature well. > > I see a few alternative options though I don't prefer them either; they > make the code verbose and I don't think they are worthwhile: > 1. Rename tun_vnet_hash to tun_vnet_hash_common. > 2. Prefix the other fields with "hash_" for consistency. 
Or use different structures, one for hash_report and another for rss. > > > > >> + struct tun_vnet_hash_rss rss; > >> + u32 rss_key[VIRTIO_NET_RSS_MAX_KEY_SIZE]; > >> + u16 rss_indirection_table[]; > >> +}; > > > > Besides the separated ioctl, I'd split this structure into rss and > > hash part as well. Like this. Thanks
On 2025/03/11 9:38, Jason Wang wrote: > On Mon, Mar 10, 2025 at 3:45 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >> >> On 2025/03/10 12:55, Jason Wang wrote: >>> On Fri, Mar 7, 2025 at 7:01 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>> >>>> Hash reporting >>>> ============== >>>> >>>> Allow the guest to reuse the hash value to make receive steering >>>> consistent between the host and guest, and to save hash computation. >>>> >>>> RSS >>>> === >>>> >>>> RSS is a receive steering algorithm that can be negotiated to use with >>>> virtio_net. Conventionally the hash calculation was done by the VMM. >>>> However, computing the hash after the queue was chosen defeats the >>>> purpose of RSS. >>>> >>>> Another approach is to use eBPF steering program. This approach has >>>> another downside: it cannot report the calculated hash due to the >>>> restrictive nature of eBPF steering program. >>>> >>>> Introduce the code to perform RSS to the kernel in order to overcome >>>> thse challenges. An alternative solution is to extend the eBPF steering >>>> program so that it will be able to report to the userspace, but I didn't >>>> opt for it because extending the current mechanism of eBPF steering >>>> program as is because it relies on legacy context rewriting, and >>>> introducing kfunc-based eBPF will result in non-UAPI dependency while >>>> the other relevant virtualization APIs such as KVM and vhost_net are >>>> UAPIs. 
>>>> >>>> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com> >>>> Tested-by: Lei Yang <leiyang@redhat.com> >>>> --- >>>> Documentation/networking/tuntap.rst | 7 ++ >>>> drivers/net/Kconfig | 1 + >>>> drivers/net/tap.c | 68 ++++++++++++++- >>>> drivers/net/tun.c | 98 +++++++++++++++++----- >>>> drivers/net/tun_vnet.h | 159 ++++++++++++++++++++++++++++++++++-- >>>> include/linux/if_tap.h | 2 + >>>> include/linux/skbuff.h | 3 + >>>> include/uapi/linux/if_tun.h | 75 +++++++++++++++++ >>>> net/core/skbuff.c | 4 + >>>> 9 files changed, 386 insertions(+), 31 deletions(-) >>>> >>>> diff --git a/Documentation/networking/tuntap.rst b/Documentation/networking/tuntap.rst >>>> index 4d7087f727be5e37dfbf5066a9e9c872cc98898d..86b4ae8caa8ad062c1e558920be42ce0d4217465 100644 >>>> --- a/Documentation/networking/tuntap.rst >>>> +++ b/Documentation/networking/tuntap.rst >>>> @@ -206,6 +206,13 @@ enable is true we enable it, otherwise we disable it:: >>>> return ioctl(fd, TUNSETQUEUE, (void *)&ifr); >>>> } >>>> >>>> +3.4 Reference >>>> +------------- >>>> + >>>> +``linux/if_tun.h`` defines the interface described below: >>>> + >>>> +.. kernel-doc:: include/uapi/linux/if_tun.h >>>> + >>>> Universal TUN/TAP device driver Frequently Asked Question >>>> ========================================================= >>>> >>>> diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig >>>> index 1fd5acdc73c6af0e1a861867039c3624fc618e25..aecfd244dd83585fea2c5b815dcd787c58166c28 100644 >>>> --- a/drivers/net/Kconfig >>>> +++ b/drivers/net/Kconfig >>>> @@ -395,6 +395,7 @@ config TUN >>>> tristate "Universal TUN/TAP device driver support" >>>> depends on INET >>>> select CRC32 >>>> + select SKB_EXTENSIONS >>>> help >>>> TUN/TAP provides packet reception and transmission for user space >>>> programs. 
It can be viewed as a simple Point-to-Point or Ethernet >>>> diff --git a/drivers/net/tap.c b/drivers/net/tap.c >>>> index d4ece538f1b23789ca60caa6232690e4d0a4d14a..9428b63ec27e7f92e78a78afcb5e24383862c00d 100644 >>>> --- a/drivers/net/tap.c >>>> +++ b/drivers/net/tap.c >>>> @@ -49,6 +49,10 @@ struct major_info { >>>> struct list_head next; >>>> }; >>>> >>>> +struct tap_skb_cb { >>>> + struct virtio_net_hash hash; >>>> +}; >>>> + >>>> #define GOODCOPY_LEN 128 >>>> >>>> static const struct proto_ops tap_socket_ops; >>>> @@ -179,6 +183,22 @@ static void tap_put_queue(struct tap_queue *q) >>>> sock_put(&q->sk); >>>> } >>>> >>>> +static struct tap_skb_cb *tap_skb_cb(const struct sk_buff *skb) >>>> +{ >>>> + BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct tap_skb_cb)); >>>> + return (struct tap_skb_cb *)skb->cb; >>>> +} >>>> + >>>> +static struct virtio_net_hash *tap_add_hash(struct sk_buff *skb) >>>> +{ >>>> + return &tap_skb_cb(skb)->hash; >>>> +} >>>> + >>>> +static const struct virtio_net_hash *tap_find_hash(const struct sk_buff *skb) >>>> +{ >>>> + return &tap_skb_cb(skb)->hash; >>>> +} >>>> + >>>> /* >>>> * Select a queue based on the rxq of the device on which this packet >>>> * arrived. If the incoming device is not mq, calculate a flow hash >>>> @@ -189,6 +209,7 @@ static void tap_put_queue(struct tap_queue *q) >>>> static struct tap_queue *tap_get_queue(struct tap_dev *tap, >>>> struct sk_buff *skb) >>>> { >>>> + struct flow_keys_basic keys_basic; >>>> struct tap_queue *queue = NULL; >>>> /* Access to taps array is protected by rcu, but access to numvtaps >>>> * isn't. Below we use it to lookup a queue, but treat it as a hint >>>> @@ -196,17 +217,47 @@ static struct tap_queue *tap_get_queue(struct tap_dev *tap, >>>> * racing against queue removal. 
>>>> */ >>>> int numvtaps = READ_ONCE(tap->numvtaps); >>>> + struct tun_vnet_hash_container *vnet_hash = rcu_dereference(tap->vnet_hash); >>>> __u32 rxq; >>>> >>>> + *tap_skb_cb(skb) = (struct tap_skb_cb) { >>>> + .hash = { .report = VIRTIO_NET_HASH_REPORT_NONE } >>>> + }; >>>> + >>>> if (!numvtaps) >>>> goto out; >>>> >>>> if (numvtaps == 1) >>>> goto single; >>>> >>>> + if (vnet_hash) { >>>> + if ((vnet_hash->common.flags & TUN_VNET_HASH_RSS)) { >>>> + rxq = tun_vnet_rss_select_queue(numvtaps, vnet_hash, skb, tap_add_hash); >>>> + queue = rcu_dereference(tap->taps[rxq]); >>>> + goto out; >>>> + } >>>> + >>>> + if (!skb->l4_hash && !skb->sw_hash) { >>>> + struct flow_keys keys; >>>> + >>>> + skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); >>>> + rxq = flow_hash_from_keys(&keys); >>>> + keys_basic = (struct flow_keys_basic) { >>>> + .control = keys.control, >>>> + .basic = keys.basic >>>> + }; >>>> + } else { >>>> + skb_flow_dissect_flow_keys_basic(NULL, skb, &keys_basic, NULL, 0, 0, 0, >>>> + FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); >>>> + rxq = skb->hash; >>>> + } >>>> + } else { >>>> + rxq = skb_get_hash(skb); >>>> + } >>>> + >>>> /* Check if we can use flow to select a queue */ >>>> - rxq = skb_get_hash(skb); >>>> if (rxq) { >>>> + tun_vnet_hash_report(vnet_hash, skb, &keys_basic, rxq, tap_add_hash); >>>> queue = rcu_dereference(tap->taps[rxq % numvtaps]); >>>> goto out; >>>> } >>>> @@ -711,11 +762,12 @@ static ssize_t tap_put_user(struct tap_queue *q, >>>> int total; >>>> >>>> if (q->flags & IFF_VNET_HDR) { >>>> - struct virtio_net_hdr vnet_hdr; >>>> + struct virtio_net_hdr_v1_hash vnet_hdr; >>>> >>>> vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz); >>>> >>>> - ret = tun_vnet_hdr_from_skb(q->flags, NULL, skb, &vnet_hdr); >>>> + ret = tun_vnet_hdr_from_skb(vnet_hdr_len, q->flags, NULL, skb, >>>> + tap_find_hash, &vnet_hdr); >>>> if (ret) >>>> return ret; >>>> >>>> @@ -992,6 +1044,16 @@ static long tap_ioctl(struct file *file, unsigned int 
cmd, >>>> rtnl_unlock(); >>>> return ret; >>>> >>>> + case TUNGETVNETHASHCAP: >>>> + return tun_vnet_ioctl_gethashcap(argp); >>>> + >>>> + case TUNSETVNETHASH: >>>> + rtnl_lock(); >>>> + tap = rtnl_dereference(q->tap); >>>> + ret = tap ? tun_vnet_ioctl_sethash(&tap->vnet_hash, true, argp) : -EBADFD; >>>> + rtnl_unlock(); >>>> + return ret; >>>> + >>>> case SIOCGIFHWADDR: >>>> rtnl_lock(); >>>> tap = tap_get_tap_dev(q); >>>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c >>>> index d8f4d3e996a7a81d1f8b04635054081671a14f07..520013df416e93d3a50b46be9b53ae9ab410eab4 100644 >>>> --- a/drivers/net/tun.c >>>> +++ b/drivers/net/tun.c >>>> @@ -209,6 +209,7 @@ struct tun_struct { >>>> struct bpf_prog __rcu *xdp_prog; >>>> struct tun_prog __rcu *steering_prog; >>>> struct tun_prog __rcu *filter_prog; >>>> + struct tun_vnet_hash_container __rcu *vnet_hash; >>>> struct ethtool_link_ksettings link_ksettings; >>>> /* init args */ >>>> struct file *file; >>>> @@ -451,20 +452,37 @@ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash) >>>> e->rps_rxhash = hash; >>>> } >>>> >>>> +static struct virtio_net_hash *tun_add_hash(struct sk_buff *skb) >>>> +{ >>>> + return skb_ext_add(skb, SKB_EXT_TUN_VNET_HASH); >>>> +} >>>> + >>>> +static const struct virtio_net_hash *tun_find_hash(const struct sk_buff *skb) >>>> +{ >>>> + return skb_ext_find(skb, SKB_EXT_TUN_VNET_HASH); >>>> +} >>>> + >>>> /* We try to identify a flow through its rxhash. The reason that >>>> * we do not check rxq no. is because some cards(e.g 82599), chooses >>>> * the rxq based on the txq where the last packet of the flow comes. As >>>> * the userspace application move between processors, we may get a >>>> * different rxq no. here. 
>>>> */ >>>> -static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb) >>>> +static u16 tun_automq_select_queue(struct tun_struct *tun, >>>> + const struct tun_vnet_hash_container *vnet_hash, >>>> + struct sk_buff *skb) >>>> { >>>> + struct flow_keys keys; >>>> + struct flow_keys_basic keys_basic; >>>> struct tun_flow_entry *e; >>>> u32 txq, numqueues; >>>> >>>> numqueues = READ_ONCE(tun->numqueues); >>>> >>>> - txq = __skb_get_hash_symmetric(skb); >>>> + memset(&keys, 0, sizeof(keys)); >>>> + skb_flow_dissect(skb, &flow_keys_dissector_symmetric, &keys, 0); >>>> + >>>> + txq = flow_hash_from_keys(&keys); >>>> e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq); >>>> if (e) { >>>> tun_flow_save_rps_rxhash(e, txq); >>>> @@ -473,6 +491,13 @@ static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb) >>>> txq = reciprocal_scale(txq, numqueues); >>>> } >>>> >>>> + keys_basic = (struct flow_keys_basic) { >>>> + .control = keys.control, >>>> + .basic = keys.basic >>>> + }; >>>> + tun_vnet_hash_report(vnet_hash, skb, &keys_basic, skb->l4_hash ? 
skb->hash : txq, >>>> + tun_add_hash); >>>> + >>>> return txq; >>>> } >>>> >>>> @@ -500,10 +525,17 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, >>>> u16 ret; >>>> >>>> rcu_read_lock(); >>>> - if (rcu_dereference(tun->steering_prog)) >>>> + if (rcu_dereference(tun->steering_prog)) { >>>> ret = tun_ebpf_select_queue(tun, skb); >>>> - else >>>> - ret = tun_automq_select_queue(tun, skb); >>>> + } else { >>>> + struct tun_vnet_hash_container *vnet_hash = rcu_dereference(tun->vnet_hash); >>>> + >>>> + if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS)) >>>> + ret = tun_vnet_rss_select_queue(READ_ONCE(tun->numqueues), vnet_hash, >>>> + skb, tun_add_hash); >>>> + else >>>> + ret = tun_automq_select_queue(tun, vnet_hash, skb); >>>> + } >>>> rcu_read_unlock(); >>>> >>>> return ret; >>>> @@ -1987,7 +2019,7 @@ static ssize_t tun_put_user_xdp(struct tun_struct *tun, >>>> ssize_t ret; >>>> >>>> if (tun->flags & IFF_VNET_HDR) { >>>> - struct virtio_net_hdr gso = { 0 }; >>>> + struct virtio_net_hdr_v1_hash gso = { 0 }; >>>> >>>> vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); >>>> ret = tun_vnet_hdr_put(vnet_hdr_sz, iter, &gso); >>>> @@ -2040,9 +2072,10 @@ static ssize_t tun_put_user(struct tun_struct *tun, >>>> } >>>> >>>> if (vnet_hdr_sz) { >>>> - struct virtio_net_hdr gso; >>>> + struct virtio_net_hdr_v1_hash gso; >>>> >>>> - ret = tun_vnet_hdr_from_skb(tun->flags, tun->dev, skb, &gso); >>>> + ret = tun_vnet_hdr_from_skb(vnet_hdr_sz, tun->flags, tun->dev, >>>> + skb, tun_find_hash, &gso); >>>> if (ret) >>>> return ret; >>>> >>>> @@ -2223,6 +2256,7 @@ static void tun_free_netdev(struct net_device *dev) >>>> security_tun_dev_free_security(tun->security); >>>> __tun_set_ebpf(tun, &tun->steering_prog, NULL); >>>> __tun_set_ebpf(tun, &tun->filter_prog, NULL); >>>> + kfree_rcu_mightsleep(rcu_access_pointer(tun->vnet_hash)); >>>> } >>>> >>>> static void tun_setup(struct net_device *dev) >>>> @@ -2921,13 +2955,9 @@ static int tun_set_queue(struct 
file *file, struct ifreq *ifr) >>>> } >>>> >>>> static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p, >>>> - void __user *data) >>>> + int fd) >>>> { >>>> struct bpf_prog *prog; >>>> - int fd; >>>> - >>>> - if (copy_from_user(&fd, data, sizeof(fd))) >>>> - return -EFAULT; >>>> >>>> if (fd == -1) { >>>> prog = NULL; >>>> @@ -2993,7 +3023,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >>>> int ifindex; >>>> int sndbuf; >>>> int ret; >>>> + int fd; >>>> bool do_notify = false; >>>> + struct tun_vnet_hash_container *vnet_hash; >>>> >>>> if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || >>>> (_IOC_TYPE(cmd) == SOCK_IOC_TYPE && cmd != SIOCGSKNS)) { >>>> @@ -3020,7 +3052,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >>>> rtnl_lock(); >>>> >>>> tun = tun_get(tfile); >>>> - if (cmd == TUNSETIFF) { >>>> + switch (cmd) { >>>> + case TUNSETIFF: >>>> ret = -EEXIST; >>>> if (tun) >>>> goto unlock; >>>> @@ -3035,8 +3068,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >>>> if (copy_to_user(argp, &ifr, ifreq_len)) >>>> ret = -EFAULT; >>>> goto unlock; >>>> - } >>>> - if (cmd == TUNSETIFINDEX) { >>>> + >>>> + case TUNSETIFINDEX: >>>> ret = -EPERM; >>>> if (tun) >>>> goto unlock; >>>> @@ -3050,6 +3083,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >>>> ret = 0; >>>> tfile->ifindex = ifindex; >>>> goto unlock; >>>> + >>>> + case TUNGETVNETHASHCAP: >>>> + ret = tun_vnet_ioctl_gethashcap(argp); >>>> + goto unlock; >>>> } >>>> >>>> ret = -EBADFD; >>>> @@ -3230,11 +3267,27 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >>>> break; >>>> >>>> case TUNSETSTEERINGEBPF: >>>> - ret = tun_set_ebpf(tun, &tun->steering_prog, argp); >>>> + if (get_user(fd, (int __user *)argp)) { >>>> + ret = -EFAULT; >>>> + break; >>>> + } >>>> + >>>> + vnet_hash = rtnl_dereference(tun->vnet_hash); >>>> + if (fd != -1 && vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS)) { 
>>>> + ret = -EBUSY; >>>> + break; >>>> + } >>>> + >>>> + ret = tun_set_ebpf(tun, &tun->steering_prog, fd); >>>> break; >>>> >>>> case TUNSETFILTEREBPF: >>>> - ret = tun_set_ebpf(tun, &tun->filter_prog, argp); >>>> + if (get_user(fd, (int __user *)argp)) { >>>> + ret = -EFAULT; >>>> + break; >>>> + } >>>> + >>>> + ret = tun_set_ebpf(tun, &tun->filter_prog, fd); >>>> break; >>>> >>>> case TUNSETCARRIER: >>>> @@ -3252,8 +3305,15 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >>>> ret = open_related_ns(&net->ns, get_net_ns); >>>> break; >>>> >>>> + case TUNSETVNETHASH: >>>> + ret = tun_vnet_ioctl_sethash(&tun->vnet_hash, >>>> + !rtnl_dereference(tun->steering_prog), >>>> + argp); >>>> + break; >>>> + >>>> default: >>>> - ret = tun_vnet_ioctl(&tun->vnet_hdr_sz, &tun->flags, cmd, argp); >>>> + ret = tun_vnet_ioctl(&tun->vnet_hdr_sz, &tun->flags, >>>> + cmd, argp); >>>> break; >>>> } >>>> >>>> diff --git a/drivers/net/tun_vnet.h b/drivers/net/tun_vnet.h >>>> index 58b9ac7a5fc4084c789fe94fe36b5f8631bf1fa4..8e7d51fb0b4742cef56e7c5ad778b156cc654bed 100644 >>>> --- a/drivers/net/tun_vnet.h >>>> +++ b/drivers/net/tun_vnet.h >>>> @@ -6,6 +6,16 @@ >>>> #define TUN_VNET_LE 0x80000000 >>>> #define TUN_VNET_BE 0x40000000 >>>> >>>> +typedef struct virtio_net_hash *(*tun_vnet_hash_add)(struct sk_buff *); >>>> +typedef const struct virtio_net_hash *(*tun_vnet_hash_find)(const struct sk_buff *); >>>> + >>>> +struct tun_vnet_hash_container { >>>> + struct tun_vnet_hash common; >>> >>> I'd rename this as hash. >>> >>>> + struct tun_vnet_hash_rss rss; >>>> + u32 rss_key[VIRTIO_NET_RSS_MAX_KEY_SIZE]; >>>> + u16 rss_indirection_table[]; >>>> +}; >>> >>> Besides the separated ioctl, I'd split this structure into rss and >>> hash part as well. 
>>> >>>> + >>>> static inline bool tun_vnet_legacy_is_little_endian(unsigned int flags) >>>> { >>>> bool be = IS_ENABLED(CONFIG_TUN_VNET_CROSS_LE) && >>>> @@ -107,6 +117,123 @@ static inline long tun_vnet_ioctl(int *vnet_hdr_sz, unsigned int *flags, >>>> } >>>> } >>>> >>>> +static inline long tun_vnet_ioctl_gethashcap(void __user *argp) >>>> +{ >>>> + static const struct tun_vnet_hash cap = { >>>> + .flags = TUN_VNET_HASH_REPORT | TUN_VNET_HASH_RSS, >>>> + .types = VIRTIO_NET_SUPPORTED_HASH_TYPES >>>> + }; >>>> + >>>> + return copy_to_user(argp, &cap, sizeof(cap)) ? -EFAULT : 0; >>> >>> Let's has a consistent name for this and the uapi to be consistent >>> with TUNSETIFF/TUNGETIFF. Probably TUNSETVNETHASH and >>> tun_vnet_ioctl_gethash(). >> >> They have different semantics so they should have different names. >> TUNGETIFF reports the value currently set while TUNGETVNETHASHCAP >> reports the value that can be set later. > > I'm not sure I will get here. I meant a symmetric name > > TUNSETVNETHASH and TUNVETVNETHASH. > >> >>> >>>> +} >>>> + >>>> +static inline long tun_vnet_ioctl_sethash(struct tun_vnet_hash_container __rcu **hashp, >>>> + bool can_rss, void __user *argp) >>> >>> So again, can_rss seems to be tricky. Looking at its caller, it tires >>> to make eBPF and RSS mutually exclusive. I still don't understand why >>> we need this. Allow eBPF program to override some of the path seems to >>> be common practice. >>> >>> What's more, we didn't try (or even can't) to make automq and eBPF to >>> be mutually exclusive. So I still didn't see what we gain from this >>> and it complicates the codes and may lead to ambiguous uAPI/behaviour. >> >> automq and eBPF are mutually exclusive; automq is disabled when an eBPF >> steering program is set so I followed the example here. > > I meant from the view of uAPI, the kernel doesn't or can't reject eBPF > while using automq. > >> >> We don't even have an interface for eBPF to let it fall back to another >> alogirhtm. 
> > It doesn't even need this, e.g XDP overrides the default receiving path. > >> I could make it fall back to RSS if the eBPF steeering >> program is designed to fall back to automq when it returns e.g., -1. But >> such an interface is currently not defined and defining one is out of >> scope of this patch series. > > Just to make sure we are on the same page, I meant we just need to > make the behaviour consistent: allow eBPF to override the behaviour of > both automq and rss. That assumes eBPF takes precedence over RSS, which is not obvious to me. Let's add an interface for the eBPF steering program to fall back to another steering algorithm. I said it is out of scope before, but it makes clear that the eBPF steering program takes precedence over other algorithms and allows us to delete the code for the configuration validation in this patch. > >> >>> >>>> +{ >>>> + struct tun_vnet_hash hash_buf; >>>> + struct tun_vnet_hash_container *hash; >>>> + >>>> + if (copy_from_user(&hash_buf, argp, sizeof(hash_buf))) >>>> + return -EFAULT; >>>> + argp = (struct tun_vnet_hash __user *)argp + 1; >>>> + >>>> + if (hash_buf.flags & TUN_VNET_HASH_RSS) { >>>> + struct tun_vnet_hash_rss rss; >>>> + size_t indirection_table_size; >>>> + size_t key_size; >>>> + size_t size; >>>> + >>>> + if (!can_rss) >>>> + return -EBUSY; >>>> + >>>> + if (copy_from_user(&rss, argp, sizeof(rss))) >>>> + return -EFAULT; >>>> + argp = (struct tun_vnet_hash_rss __user *)argp + 1; >>>> + >>>> + indirection_table_size = ((size_t)rss.indirection_table_mask + 1) * 2; >>>> + key_size = virtio_net_hash_key_length(hash_buf.types); >>>> + size = struct_size(hash, rss_indirection_table, >>>> + (size_t)rss.indirection_table_mask + 1); >>>> + >>>> + hash = kmalloc(size, GFP_KERNEL); >>>> + if (!hash) >>>> + return -ENOMEM; >>>> + >>>> + if (copy_from_user(hash->rss_indirection_table, >>>> + argp, indirection_table_size)) { >>>> + kfree(hash); >>>> + return -EFAULT; >>>> + } >>>> + argp = (u16 __user *)argp + 
rss.indirection_table_mask + 1; >>>> + >>>> + if (copy_from_user(hash->rss_key, argp, key_size)) { >>>> + kfree(hash); >>>> + return -EFAULT; >>>> + } >>>> + >>>> + virtio_net_toeplitz_convert_key(hash->rss_key, key_size); >>>> + hash->rss = rss; >>>> + } else { >>>> + hash = kmalloc(sizeof(hash->common), GFP_KERNEL); >>>> + if (!hash) >>>> + return -ENOMEM; >>> >>> Do we need to validate the hash here (at least against the types we supported?) >>> >>>> + } >>>> + >>>> + hash->common = hash_buf; >>>> + kfree_rcu_mightsleep(rcu_replace_pointer_rtnl(*hashp, hash)); >>> >>> I still didn't understand the trick here. E.g we use very simple >>> primitives in synchronizing ebpf program through RCU in >>> __tun_set_ebpf(). >> >> It is even simpler than __tun_set_ebpf(). The differences from >> __tun_set_ebpf() are: >> 1. This uses the rtnl lock instead of TUN-specific one. It makes the >> code simpler as the rtnl lock is already taken in __tun_chr_ioctl(). > > It can be tweaked to use rtnl as well. > >> 2. This does not add rcu_head and uses blocking APIs for simplicity. > > Right. > >> >>> >>>> + return 0; >>>> +} >>>> + >>>> +static void tun_vnet_hash_report(const struct tun_vnet_hash_container *hash, >>>> + struct sk_buff *skb, >>>> + const struct flow_keys_basic *keys, >>>> + u32 value, >>>> + tun_vnet_hash_add vnet_hash_add) >>>> +{ >>>> + struct virtio_net_hash *report; >>>> + >>>> + if (!hash || !(hash->common.flags & TUN_VNET_HASH_REPORT)) >>>> + return; >>>> + >>>> + report = vnet_hash_add(skb); >>>> + if (!report) >>>> + return; >>>> + >>>> + *report = (struct virtio_net_hash) { >>>> + .report = virtio_net_hash_report(hash->common.types, keys), >>>> + .value = value >>>> + }; >>> >>> What's the advantage of using Designated Initializers here? Simple >>> assignment can save two lines of code. >> >> It automatically fills other fileds with zero. Simple assignments will >> need more tokens for zeroing. > > Ok. 
> >> >>> >>>> +} >>>> + >>>> +static u16 tun_vnet_rss_select_queue(u32 numqueues, >>>> + const struct tun_vnet_hash_container *hash, >>>> + struct sk_buff *skb, >>>> + tun_vnet_hash_add vnet_hash_add) >>>> +{ >>>> + struct virtio_net_hash *report; >>>> + struct virtio_net_hash ret; >>>> + u16 txq, index; >>>> + >>>> + if (!numqueues) >>>> + return 0; >>>> + >>>> + virtio_net_hash_rss(skb, hash->common.types, hash->rss_key, &ret); >>>> + >>>> + if (!ret.report) >>>> + return hash->rss.unclassified_queue % numqueues; >>>> + >>>> + if (hash->common.flags & TUN_VNET_HASH_REPORT) { >>>> + report = vnet_hash_add(skb); >>>> + if (report) >>>> + *report = ret; >>>> + } >>> >>> Is there a chance that we can reach here without TUN_VNET_HASH_REPORT? >>> If yes, it should be a bug. >> >> It is possible to use RSS without TUN_VNET_HASH_REPORT. > > Another call to separate the ioctls then. RSS and hash reporting are not completely independent though. A plot twist is the "types" parameter; it is a parameter that is "common" for RSS and hash reporting. RSS and hash reporting must share this parameter when both are enabled at the same time; otherwise RSS may compute hash values that are not suited for hash reporting. The parameter will be duplicated if we have separate ioctls for RSS and hash reporting, and the kernel will have a chicken-and-egg problem when ensuring they are synchronized; when the ioctl for RSS is issued, should the kernel ensure the "types" parameter is identical to the one specified for hash reporting? It will not work if the userspace may decide to configure hash reporting after RSS. > >> It is more of a >> feature instead of a bug; it behaves like QEMU's eBPF program but >> requires no privilege and is more optimized with native code and ffs(). 
>> >>> >>>> + >>>> + index = ret.value & hash->rss.indirection_table_mask; >>>> + txq = READ_ONCE(hash->rss_indirection_table[index]); >>> >>> So vnet_hash is accessed via rcu_dereference(), I don't get any reason >>> we need READ_ONCE here, is this paired with something? If yes, let's >>> add a comment here. If rss_indirection_table need why >>> indirection_table_mask doesn't need this? >> >> I'll drop it. I think it's just a left-over of previous versions without >> RCU. >> >>> >>>> + >>>> + return txq % numqueues; >>>> +} >>>> + >>>> static inline int tun_vnet_hdr_get(int sz, unsigned int flags, >>>> struct iov_iter *from, >>>> struct virtio_net_hdr *hdr) >>>> @@ -135,15 +262,17 @@ static inline int tun_vnet_hdr_get(int sz, unsigned int flags, >>>> } >>>> >>>> static inline int tun_vnet_hdr_put(int sz, struct iov_iter *iter, >>>> - const struct virtio_net_hdr *hdr) >>>> + const struct virtio_net_hdr_v1_hash *hdr) >>>> { >>> >>> To be more robust, we can tweak the function to accept a vnet_hdr_len >>> parameter then we can avoid touching this every time when we need to >>> extend vnet hdr in the future? >> >> I think you meant vnet_hdr_sz instead of vnet_hdr_len. It is already >> passed just as "sz" here as the function name already says it's about >> the header. >> >> It is possible to add another parameter for sizeof(*hdr) and convert the >> hdr parameter to void * to avoid future changes. But I rather keep it as >> is because the current form ensures the hdr is large enough and >> statically avoids buffer overrun. > > Right. 
> >> >>> >>>> + int content_sz = MIN(sizeof(*hdr), sz); >>>> + >>>> if (unlikely(iov_iter_count(iter) < sz)) >>>> return -EINVAL; >>>> >>>> - if (unlikely(copy_to_iter(hdr, sizeof(*hdr), iter) != sizeof(*hdr))) >>>> + if (unlikely(copy_to_iter(hdr, content_sz, iter) != content_sz)) >>>> return -EFAULT; >>>> >>>> - if (iov_iter_zero(sz - sizeof(*hdr), iter) != sz - sizeof(*hdr)) >>>> + if (iov_iter_zero(sz - content_sz, iter) != sz - content_sz) >>>> return -EFAULT; >>>> >>>> return 0; >>>> @@ -155,26 +284,38 @@ static inline int tun_vnet_hdr_to_skb(unsigned int flags, struct sk_buff *skb, >>>> return virtio_net_hdr_to_skb(skb, hdr, tun_vnet_is_little_endian(flags)); >>>> } >>>> >>>> -static inline int tun_vnet_hdr_from_skb(unsigned int flags, >>>> +static inline int tun_vnet_hdr_from_skb(int sz, unsigned int flags, >>>> const struct net_device *dev, >>>> const struct sk_buff *skb, >>>> - struct virtio_net_hdr *hdr) >>>> + tun_vnet_hash_find vnet_hash_find, >>>> + struct virtio_net_hdr_v1_hash *hdr) >>>> { >>>> int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0; >>>> + const struct virtio_net_hash *report = sz < sizeof(struct virtio_net_hdr_v1_hash) ? 
>>>> + NULL : vnet_hash_find(skb); >>>> + >>>> + *hdr = (struct virtio_net_hdr_v1_hash) { >>>> + .hash_report = VIRTIO_NET_HASH_REPORT_NONE >>>> + }; >>>> + >>>> + if (report) { >>>> + hdr->hash_value = cpu_to_le32(report->value); >>>> + hdr->hash_report = cpu_to_le16(report->report); >>>> + } >>>> >>>> - if (virtio_net_hdr_from_skb(skb, hdr, >>>> + if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)hdr, >>>> tun_vnet_is_little_endian(flags), true, >>>> vlan_hlen)) { >>>> struct skb_shared_info *sinfo = skb_shinfo(skb); >>>> >>>> if (net_ratelimit()) { >>>> netdev_err(dev, "unexpected GSO type: 0x%x, gso_size %d, hdr_len %d\n", >>>> - sinfo->gso_type, tun_vnet16_to_cpu(flags, hdr->gso_size), >>>> - tun_vnet16_to_cpu(flags, hdr->hdr_len)); >>>> + sinfo->gso_type, tun_vnet16_to_cpu(flags, hdr->hdr.gso_size), >>>> + tun_vnet16_to_cpu(flags, hdr->hdr.hdr_len)); >>>> print_hex_dump(KERN_ERR, "tun: ", >>>> DUMP_PREFIX_NONE, >>>> 16, 1, skb->head, >>>> - min(tun_vnet16_to_cpu(flags, hdr->hdr_len), 64), true); >>>> + min(tun_vnet16_to_cpu(flags, hdr->hdr.hdr_len), 64), true); >>>> } >>>> WARN_ON_ONCE(1); >>>> return -EINVAL; >>>> diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h >>>> index 553552fa635c3e1e53d1a63c203d32e4c4fd5a4f..7334c46a3f101675a0d4e5a036987cfe18842f9f 100644 >>>> --- a/include/linux/if_tap.h >>>> +++ b/include/linux/if_tap.h >>>> @@ -31,6 +31,7 @@ static inline struct ptr_ring *tap_get_ptr_ring(struct file *f) >>>> #define MAX_TAP_QUEUES 256 >>>> >>>> struct tap_queue; >>>> +struct tun_vnet_hash_container; >>>> >>>> struct tap_dev { >>>> struct net_device *dev; >>>> @@ -43,6 +44,7 @@ struct tap_dev { >>>> int numqueues; >>>> netdev_features_t tap_features; >>>> int minor; >>>> + struct tun_vnet_hash_container __rcu *vnet_hash; >>>> >>>> void (*update_features)(struct tap_dev *tap, netdev_features_t features); >>>> void (*count_tx_dropped)(struct tap_dev *tap); >>>> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h >>>> index 
bb2b751d274acff931281a72e8b4b0c699b4e8af..cdd793f1c360ad5f63fcc4cbf67d845f5e2ccf6f 100644 >>>> --- a/include/linux/skbuff.h >>>> +++ b/include/linux/skbuff.h >>>> @@ -4842,6 +4842,9 @@ enum skb_ext_id { >>>> #endif >>>> #if IS_ENABLED(CONFIG_MCTP_FLOWS) >>>> SKB_EXT_MCTP, >>>> +#endif >>>> +#if IS_ENABLED(CONFIG_TUN) >>>> + SKB_EXT_TUN_VNET_HASH, >>>> #endif >>>> SKB_EXT_NUM, /* must be last */ >>>> }; >>>> diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h >>>> index 287cdc81c9390c289a30545aa7ed23d81c3329d3..4887f97500a870c7ef3c96a5837b2d0a5a225040 100644 >>>> --- a/include/uapi/linux/if_tun.h >>>> +++ b/include/uapi/linux/if_tun.h >>>> @@ -62,6 +62,42 @@ >>>> #define TUNSETCARRIER _IOW('T', 226, int) >>>> #define TUNGETDEVNETNS _IO('T', 227) >>>> >>>> +/** >>>> + * define TUNGETVNETHASHCAP - ioctl to get virtio_net hashing capability. >>>> + * >>>> + * The argument is a pointer to &struct tun_vnet_hash which will store the >>>> + * maximal virtio_net hashing configuration. >>>> + */ >>>> +#define TUNGETVNETHASHCAP _IOR('T', 228, struct tun_vnet_hash) >>>> + >>>> +/** >>>> + * define TUNSETVNETHASH - ioctl to configure virtio_net hashing >>>> + * >>>> + * The argument is a pointer to &struct tun_vnet_hash. >>>> + * >>>> + * The argument is a pointer to the compound of the following in order if >>>> + * %TUN_VNET_HASH_RSS is set: >>>> + * >>>> + * 1. &struct tun_vnet_hash >>>> + * 2. &struct tun_vnet_hash_rss >>>> + * 3. Indirection table >>>> + * 4. Key >>>> + * >>>> + * The %TUN_VNET_HASH_REPORT flag set with this ioctl will be effective only >>>> + * after calling the %TUNSETVNETHDRSZ ioctl with a number greater than or equal >>>> + * to the size of &struct virtio_net_hdr_v1_hash. >>> >>> So you had a dependency check already for vnet hdr len. I'd still >>> suggest to split this into rss and hash as they are separated >>> features. Then we can use separate data structure for them instead of >>> a container struct. 
>> >> I added a dependency check and found it is complicating the code and >> requires additional tests. I need a reason to justify the complexity if >> we are going to split it. > > As we discussed above: They don't depend on each other. > >> >> Regards, >> Akihiko Odaki >> >>> >>>> + >>>> + * The members added to the legacy header by %TUN_VNET_HASH_REPORT flag will >>>> + * always be little-endian. >>>> + * >>>> + * This ioctl results in %EBADFD if the underlying device is deleted. It affects >>>> + * all queues attached to the same device. >>>> + * >>>> + * This ioctl currently has no effect on XDP packets and packets with >>>> + * queue_mapping set by TC. >>>> + */ >>>> +#define TUNSETVNETHASH _IOW('T', 229, struct tun_vnet_hash) >>>> + >>>> /* TUNSETIFF ifr flags */ >>>> #define IFF_TUN 0x0001 >>>> #define IFF_TAP 0x0002 >>>> @@ -115,4 +151,43 @@ struct tun_filter { >>>> __u8 addr[][ETH_ALEN]; >>>> }; >>>> >>>> +/** >>>> + * define TUN_VNET_HASH_REPORT - Request virtio_net hash reporting for vhost >>>> + */ >>>> +#define TUN_VNET_HASH_REPORT 0x0001 >>>> + >>>> +/** >>>> + * define TUN_VNET_HASH_RSS - Request virtio_net RSS >>>> + * >>>> + * This is mutually exclusive with eBPF steering program. >>>> + */ >>>> +#define TUN_VNET_HASH_RSS 0x0002 >>>> + >>>> +/** >>>> + * struct tun_vnet_hash - virtio_net hashing configuration >>>> + * @flags: >>>> + * Bitmask consists of %TUN_VNET_HASH_REPORT and %TUN_VNET_HASH_RSS >>>> + * @pad: >>>> + * Should be filled with zero before passing to %TUNSETVNETHASH >>>> + * @types: >>>> + * Bitmask of allowed hash types >>>> + */ >>>> +struct tun_vnet_hash { >>>> + __u16 flags; >>>> + __u8 pad[2]; >>>> + __u32 types; >>>> +}; >>> >>> Padding in the middle of the structure is not elegant. Any reason for this? >>> >>> And hash->types seems never used. 
>>> >>>> + >>>> +/** >>>> + * struct tun_vnet_hash_rss - virtio_net RSS configuration >>>> + * @indirection_table_mask: >>>> + * Bitmask to be applied to the indirection table index >>>> + * @unclassified_queue: >>>> + * The index of the queue to place unclassified packets in >>>> + */ >>>> +struct tun_vnet_hash_rss { >>>> + __u16 indirection_table_mask; >>>> + __u16 unclassified_queue; >>>> +}; >>>> + >>>> #endif /* _UAPI__IF_TUN_H */ >>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c >>>> index 7b03b64fdcb276f68ce881d1d8da8e4c6b897efc..aa2a091b649f0c9d6e0196f34f345ba78b5498fb 100644 >>>> --- a/net/core/skbuff.c >>>> +++ b/net/core/skbuff.c >>>> @@ -64,6 +64,7 @@ >>>> #include <linux/mpls.h> >>>> #include <linux/kcov.h> >>>> #include <linux/iov_iter.h> >>>> +#include <linux/virtio_net.h> >>>> >>>> #include <net/protocol.h> >>>> #include <net/dst.h> >>>> @@ -4969,6 +4970,9 @@ static const u8 skb_ext_type_len[] = { >>>> #if IS_ENABLED(CONFIG_MCTP_FLOWS) >>>> [SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow), >>>> #endif >>>> +#if IS_ENABLED(CONFIG_TUN) >>>> + [SKB_EXT_TUN_VNET_HASH] = SKB_EXT_CHUNKSIZEOF(struct virtio_net_hash), >>>> +#endif >>>> }; >>>> >>>> static __always_inline unsigned int skb_ext_total_length(void) >>>> >>>> -- >>>> 2.48.1 >>>> >>> >>> Thanks >>> >> >
On 2025/03/11 9:38, Jason Wang wrote: > On Mon, Mar 10, 2025 at 3:45 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >> >> On 2025/03/10 12:55, Jason Wang wrote: >>> On Fri, Mar 7, 2025 at 7:01 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>> >>>> Hash reporting >>>> ============== >>>> >>>> Allow the guest to reuse the hash value to make receive steering >>>> consistent between the host and guest, and to save hash computation. >>>> >>>> RSS >>>> === >>>> >>>> RSS is a receive steering algorithm that can be negotiated to use with >>>> virtio_net. Conventionally the hash calculation was done by the VMM. >>>> However, computing the hash after the queue was chosen defeats the >>>> purpose of RSS. >>>> >>>> Another approach is to use eBPF steering program. This approach has >>>> another downside: it cannot report the calculated hash due to the >>>> restrictive nature of eBPF steering program. >>>> >>>> Introduce the code to perform RSS to the kernel in order to overcome >>>> thse challenges. An alternative solution is to extend the eBPF steering >>>> program so that it will be able to report to the userspace, but I didn't >>>> opt for it because extending the current mechanism of eBPF steering >>>> program as is because it relies on legacy context rewriting, and >>>> introducing kfunc-based eBPF will result in non-UAPI dependency while >>>> the other relevant virtualization APIs such as KVM and vhost_net are >>>> UAPIs. 
>>>> >>>> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com> >>>> Tested-by: Lei Yang <leiyang@redhat.com> >>>> --- >>>> Documentation/networking/tuntap.rst | 7 ++ >>>> drivers/net/Kconfig | 1 + >>>> drivers/net/tap.c | 68 ++++++++++++++- >>>> drivers/net/tun.c | 98 +++++++++++++++++----- >>>> drivers/net/tun_vnet.h | 159 ++++++++++++++++++++++++++++++++++-- >>>> include/linux/if_tap.h | 2 + >>>> include/linux/skbuff.h | 3 + >>>> include/uapi/linux/if_tun.h | 75 +++++++++++++++++ >>>> net/core/skbuff.c | 4 + >>>> 9 files changed, 386 insertions(+), 31 deletions(-) >>>> >>>> diff --git a/Documentation/networking/tuntap.rst b/Documentation/networking/tuntap.rst >>>> index 4d7087f727be5e37dfbf5066a9e9c872cc98898d..86b4ae8caa8ad062c1e558920be42ce0d4217465 100644 >>>> --- a/Documentation/networking/tuntap.rst >>>> +++ b/Documentation/networking/tuntap.rst >>>> @@ -206,6 +206,13 @@ enable is true we enable it, otherwise we disable it:: >>>> return ioctl(fd, TUNSETQUEUE, (void *)&ifr); >>>> } >>>> >>>> +3.4 Reference >>>> +------------- >>>> + >>>> +``linux/if_tun.h`` defines the interface described below: >>>> + >>>> +.. kernel-doc:: include/uapi/linux/if_tun.h >>>> + >>>> Universal TUN/TAP device driver Frequently Asked Question >>>> ========================================================= >>>> >>>> diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig >>>> index 1fd5acdc73c6af0e1a861867039c3624fc618e25..aecfd244dd83585fea2c5b815dcd787c58166c28 100644 >>>> --- a/drivers/net/Kconfig >>>> +++ b/drivers/net/Kconfig >>>> @@ -395,6 +395,7 @@ config TUN >>>> tristate "Universal TUN/TAP device driver support" >>>> depends on INET >>>> select CRC32 >>>> + select SKB_EXTENSIONS >>>> help >>>> TUN/TAP provides packet reception and transmission for user space >>>> programs. 
It can be viewed as a simple Point-to-Point or Ethernet >>>> diff --git a/drivers/net/tap.c b/drivers/net/tap.c >>>> index d4ece538f1b23789ca60caa6232690e4d0a4d14a..9428b63ec27e7f92e78a78afcb5e24383862c00d 100644 >>>> --- a/drivers/net/tap.c >>>> +++ b/drivers/net/tap.c >>>> @@ -49,6 +49,10 @@ struct major_info { >>>> struct list_head next; >>>> }; >>>> >>>> +struct tap_skb_cb { >>>> + struct virtio_net_hash hash; >>>> +}; >>>> + >>>> #define GOODCOPY_LEN 128 >>>> >>>> static const struct proto_ops tap_socket_ops; >>>> @@ -179,6 +183,22 @@ static void tap_put_queue(struct tap_queue *q) >>>> sock_put(&q->sk); >>>> } >>>> >>>> +static struct tap_skb_cb *tap_skb_cb(const struct sk_buff *skb) >>>> +{ >>>> + BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct tap_skb_cb)); >>>> + return (struct tap_skb_cb *)skb->cb; >>>> +} >>>> + >>>> +static struct virtio_net_hash *tap_add_hash(struct sk_buff *skb) >>>> +{ >>>> + return &tap_skb_cb(skb)->hash; >>>> +} >>>> + >>>> +static const struct virtio_net_hash *tap_find_hash(const struct sk_buff *skb) >>>> +{ >>>> + return &tap_skb_cb(skb)->hash; >>>> +} >>>> + >>>> /* >>>> * Select a queue based on the rxq of the device on which this packet >>>> * arrived. If the incoming device is not mq, calculate a flow hash >>>> @@ -189,6 +209,7 @@ static void tap_put_queue(struct tap_queue *q) >>>> static struct tap_queue *tap_get_queue(struct tap_dev *tap, >>>> struct sk_buff *skb) >>>> { >>>> + struct flow_keys_basic keys_basic; >>>> struct tap_queue *queue = NULL; >>>> /* Access to taps array is protected by rcu, but access to numvtaps >>>> * isn't. Below we use it to lookup a queue, but treat it as a hint >>>> @@ -196,17 +217,47 @@ static struct tap_queue *tap_get_queue(struct tap_dev *tap, >>>> * racing against queue removal. 
>>>> */ >>>> int numvtaps = READ_ONCE(tap->numvtaps); >>>> + struct tun_vnet_hash_container *vnet_hash = rcu_dereference(tap->vnet_hash); >>>> __u32 rxq; >>>> >>>> + *tap_skb_cb(skb) = (struct tap_skb_cb) { >>>> + .hash = { .report = VIRTIO_NET_HASH_REPORT_NONE } >>>> + }; >>>> + >>>> if (!numvtaps) >>>> goto out; >>>> >>>> if (numvtaps == 1) >>>> goto single; >>>> >>>> + if (vnet_hash) { >>>> + if ((vnet_hash->common.flags & TUN_VNET_HASH_RSS)) { >>>> + rxq = tun_vnet_rss_select_queue(numvtaps, vnet_hash, skb, tap_add_hash); >>>> + queue = rcu_dereference(tap->taps[rxq]); >>>> + goto out; >>>> + } >>>> + >>>> + if (!skb->l4_hash && !skb->sw_hash) { >>>> + struct flow_keys keys; >>>> + >>>> + skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); >>>> + rxq = flow_hash_from_keys(&keys); >>>> + keys_basic = (struct flow_keys_basic) { >>>> + .control = keys.control, >>>> + .basic = keys.basic >>>> + }; >>>> + } else { >>>> + skb_flow_dissect_flow_keys_basic(NULL, skb, &keys_basic, NULL, 0, 0, 0, >>>> + FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); >>>> + rxq = skb->hash; >>>> + } >>>> + } else { >>>> + rxq = skb_get_hash(skb); >>>> + } >>>> + >>>> /* Check if we can use flow to select a queue */ >>>> - rxq = skb_get_hash(skb); >>>> if (rxq) { >>>> + tun_vnet_hash_report(vnet_hash, skb, &keys_basic, rxq, tap_add_hash); >>>> queue = rcu_dereference(tap->taps[rxq % numvtaps]); >>>> goto out; >>>> } >>>> @@ -711,11 +762,12 @@ static ssize_t tap_put_user(struct tap_queue *q, >>>> int total; >>>> >>>> if (q->flags & IFF_VNET_HDR) { >>>> - struct virtio_net_hdr vnet_hdr; >>>> + struct virtio_net_hdr_v1_hash vnet_hdr; >>>> >>>> vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz); >>>> >>>> - ret = tun_vnet_hdr_from_skb(q->flags, NULL, skb, &vnet_hdr); >>>> + ret = tun_vnet_hdr_from_skb(vnet_hdr_len, q->flags, NULL, skb, >>>> + tap_find_hash, &vnet_hdr); >>>> if (ret) >>>> return ret; >>>> >>>> @@ -992,6 +1044,16 @@ static long tap_ioctl(struct file *file, unsigned int 
cmd, >>>> rtnl_unlock(); >>>> return ret; >>>> >>>> + case TUNGETVNETHASHCAP: >>>> + return tun_vnet_ioctl_gethashcap(argp); >>>> + >>>> + case TUNSETVNETHASH: >>>> + rtnl_lock(); >>>> + tap = rtnl_dereference(q->tap); >>>> + ret = tap ? tun_vnet_ioctl_sethash(&tap->vnet_hash, true, argp) : -EBADFD; >>>> + rtnl_unlock(); >>>> + return ret; >>>> + >>>> case SIOCGIFHWADDR: >>>> rtnl_lock(); >>>> tap = tap_get_tap_dev(q); >>>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c >>>> index d8f4d3e996a7a81d1f8b04635054081671a14f07..520013df416e93d3a50b46be9b53ae9ab410eab4 100644 >>>> --- a/drivers/net/tun.c >>>> +++ b/drivers/net/tun.c >>>> @@ -209,6 +209,7 @@ struct tun_struct { >>>> struct bpf_prog __rcu *xdp_prog; >>>> struct tun_prog __rcu *steering_prog; >>>> struct tun_prog __rcu *filter_prog; >>>> + struct tun_vnet_hash_container __rcu *vnet_hash; >>>> struct ethtool_link_ksettings link_ksettings; >>>> /* init args */ >>>> struct file *file; >>>> @@ -451,20 +452,37 @@ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash) >>>> e->rps_rxhash = hash; >>>> } >>>> >>>> +static struct virtio_net_hash *tun_add_hash(struct sk_buff *skb) >>>> +{ >>>> + return skb_ext_add(skb, SKB_EXT_TUN_VNET_HASH); >>>> +} >>>> + >>>> +static const struct virtio_net_hash *tun_find_hash(const struct sk_buff *skb) >>>> +{ >>>> + return skb_ext_find(skb, SKB_EXT_TUN_VNET_HASH); >>>> +} >>>> + >>>> /* We try to identify a flow through its rxhash. The reason that >>>> * we do not check rxq no. is because some cards(e.g 82599), chooses >>>> * the rxq based on the txq where the last packet of the flow comes. As >>>> * the userspace application move between processors, we may get a >>>> * different rxq no. here. 
>>>> */ >>>> -static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb) >>>> +static u16 tun_automq_select_queue(struct tun_struct *tun, >>>> + const struct tun_vnet_hash_container *vnet_hash, >>>> + struct sk_buff *skb) >>>> { >>>> + struct flow_keys keys; >>>> + struct flow_keys_basic keys_basic; >>>> struct tun_flow_entry *e; >>>> u32 txq, numqueues; >>>> >>>> numqueues = READ_ONCE(tun->numqueues); >>>> >>>> - txq = __skb_get_hash_symmetric(skb); >>>> + memset(&keys, 0, sizeof(keys)); >>>> + skb_flow_dissect(skb, &flow_keys_dissector_symmetric, &keys, 0); >>>> + >>>> + txq = flow_hash_from_keys(&keys); >>>> e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq); >>>> if (e) { >>>> tun_flow_save_rps_rxhash(e, txq); >>>> @@ -473,6 +491,13 @@ static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb) >>>> txq = reciprocal_scale(txq, numqueues); >>>> } >>>> >>>> + keys_basic = (struct flow_keys_basic) { >>>> + .control = keys.control, >>>> + .basic = keys.basic >>>> + }; >>>> + tun_vnet_hash_report(vnet_hash, skb, &keys_basic, skb->l4_hash ? 
skb->hash : txq, >>>> + tun_add_hash); >>>> + >>>> return txq; >>>> } >>>> >>>> @@ -500,10 +525,17 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, >>>> u16 ret; >>>> >>>> rcu_read_lock(); >>>> - if (rcu_dereference(tun->steering_prog)) >>>> + if (rcu_dereference(tun->steering_prog)) { >>>> ret = tun_ebpf_select_queue(tun, skb); >>>> - else >>>> - ret = tun_automq_select_queue(tun, skb); >>>> + } else { >>>> + struct tun_vnet_hash_container *vnet_hash = rcu_dereference(tun->vnet_hash); >>>> + >>>> + if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS)) >>>> + ret = tun_vnet_rss_select_queue(READ_ONCE(tun->numqueues), vnet_hash, >>>> + skb, tun_add_hash); >>>> + else >>>> + ret = tun_automq_select_queue(tun, vnet_hash, skb); >>>> + } >>>> rcu_read_unlock(); >>>> >>>> return ret; >>>> @@ -1987,7 +2019,7 @@ static ssize_t tun_put_user_xdp(struct tun_struct *tun, >>>> ssize_t ret; >>>> >>>> if (tun->flags & IFF_VNET_HDR) { >>>> - struct virtio_net_hdr gso = { 0 }; >>>> + struct virtio_net_hdr_v1_hash gso = { 0 }; >>>> >>>> vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); >>>> ret = tun_vnet_hdr_put(vnet_hdr_sz, iter, &gso); >>>> @@ -2040,9 +2072,10 @@ static ssize_t tun_put_user(struct tun_struct *tun, >>>> } >>>> >>>> if (vnet_hdr_sz) { >>>> - struct virtio_net_hdr gso; >>>> + struct virtio_net_hdr_v1_hash gso; >>>> >>>> - ret = tun_vnet_hdr_from_skb(tun->flags, tun->dev, skb, &gso); >>>> + ret = tun_vnet_hdr_from_skb(vnet_hdr_sz, tun->flags, tun->dev, >>>> + skb, tun_find_hash, &gso); >>>> if (ret) >>>> return ret; >>>> >>>> @@ -2223,6 +2256,7 @@ static void tun_free_netdev(struct net_device *dev) >>>> security_tun_dev_free_security(tun->security); >>>> __tun_set_ebpf(tun, &tun->steering_prog, NULL); >>>> __tun_set_ebpf(tun, &tun->filter_prog, NULL); >>>> + kfree_rcu_mightsleep(rcu_access_pointer(tun->vnet_hash)); >>>> } >>>> >>>> static void tun_setup(struct net_device *dev) >>>> @@ -2921,13 +2955,9 @@ static int tun_set_queue(struct 
file *file, struct ifreq *ifr) >>>> } >>>> >>>> static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p, >>>> - void __user *data) >>>> + int fd) >>>> { >>>> struct bpf_prog *prog; >>>> - int fd; >>>> - >>>> - if (copy_from_user(&fd, data, sizeof(fd))) >>>> - return -EFAULT; >>>> >>>> if (fd == -1) { >>>> prog = NULL; >>>> @@ -2993,7 +3023,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >>>> int ifindex; >>>> int sndbuf; >>>> int ret; >>>> + int fd; >>>> bool do_notify = false; >>>> + struct tun_vnet_hash_container *vnet_hash; >>>> >>>> if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || >>>> (_IOC_TYPE(cmd) == SOCK_IOC_TYPE && cmd != SIOCGSKNS)) { >>>> @@ -3020,7 +3052,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >>>> rtnl_lock(); >>>> >>>> tun = tun_get(tfile); >>>> - if (cmd == TUNSETIFF) { >>>> + switch (cmd) { >>>> + case TUNSETIFF: >>>> ret = -EEXIST; >>>> if (tun) >>>> goto unlock; >>>> @@ -3035,8 +3068,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >>>> if (copy_to_user(argp, &ifr, ifreq_len)) >>>> ret = -EFAULT; >>>> goto unlock; >>>> - } >>>> - if (cmd == TUNSETIFINDEX) { >>>> + >>>> + case TUNSETIFINDEX: >>>> ret = -EPERM; >>>> if (tun) >>>> goto unlock; >>>> @@ -3050,6 +3083,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >>>> ret = 0; >>>> tfile->ifindex = ifindex; >>>> goto unlock; >>>> + >>>> + case TUNGETVNETHASHCAP: >>>> + ret = tun_vnet_ioctl_gethashcap(argp); >>>> + goto unlock; >>>> } >>>> >>>> ret = -EBADFD; >>>> @@ -3230,11 +3267,27 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >>>> break; >>>> >>>> case TUNSETSTEERINGEBPF: >>>> - ret = tun_set_ebpf(tun, &tun->steering_prog, argp); >>>> + if (get_user(fd, (int __user *)argp)) { >>>> + ret = -EFAULT; >>>> + break; >>>> + } >>>> + >>>> + vnet_hash = rtnl_dereference(tun->vnet_hash); >>>> + if (fd != -1 && vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS)) { 
>>>> + ret = -EBUSY; >>>> + break; >>>> + } >>>> + >>>> + ret = tun_set_ebpf(tun, &tun->steering_prog, fd); >>>> break; >>>> >>>> case TUNSETFILTEREBPF: >>>> - ret = tun_set_ebpf(tun, &tun->filter_prog, argp); >>>> + if (get_user(fd, (int __user *)argp)) { >>>> + ret = -EFAULT; >>>> + break; >>>> + } >>>> + >>>> + ret = tun_set_ebpf(tun, &tun->filter_prog, fd); >>>> break; >>>> >>>> case TUNSETCARRIER: >>>> @@ -3252,8 +3305,15 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >>>> ret = open_related_ns(&net->ns, get_net_ns); >>>> break; >>>> >>>> + case TUNSETVNETHASH: >>>> + ret = tun_vnet_ioctl_sethash(&tun->vnet_hash, >>>> + !rtnl_dereference(tun->steering_prog), >>>> + argp); >>>> + break; >>>> + >>>> default: >>>> - ret = tun_vnet_ioctl(&tun->vnet_hdr_sz, &tun->flags, cmd, argp); >>>> + ret = tun_vnet_ioctl(&tun->vnet_hdr_sz, &tun->flags, >>>> + cmd, argp); >>>> break; >>>> } >>>> >>>> diff --git a/drivers/net/tun_vnet.h b/drivers/net/tun_vnet.h >>>> index 58b9ac7a5fc4084c789fe94fe36b5f8631bf1fa4..8e7d51fb0b4742cef56e7c5ad778b156cc654bed 100644 >>>> --- a/drivers/net/tun_vnet.h >>>> +++ b/drivers/net/tun_vnet.h >>>> @@ -6,6 +6,16 @@ >>>> #define TUN_VNET_LE 0x80000000 >>>> #define TUN_VNET_BE 0x40000000 >>>> >>>> +typedef struct virtio_net_hash *(*tun_vnet_hash_add)(struct sk_buff *); >>>> +typedef const struct virtio_net_hash *(*tun_vnet_hash_find)(const struct sk_buff *); >>>> + >>>> +struct tun_vnet_hash_container { >>>> + struct tun_vnet_hash common; >>> >>> I'd rename this as hash. >>> >>>> + struct tun_vnet_hash_rss rss; >>>> + u32 rss_key[VIRTIO_NET_RSS_MAX_KEY_SIZE]; >>>> + u16 rss_indirection_table[]; >>>> +}; >>> >>> Besides the separated ioctl, I'd split this structure into rss and >>> hash part as well. 
>>> >>>> + >>>> static inline bool tun_vnet_legacy_is_little_endian(unsigned int flags) >>>> { >>>> bool be = IS_ENABLED(CONFIG_TUN_VNET_CROSS_LE) && >>>> @@ -107,6 +117,123 @@ static inline long tun_vnet_ioctl(int *vnet_hdr_sz, unsigned int *flags, >>>> } >>>> } >>>> >>>> +static inline long tun_vnet_ioctl_gethashcap(void __user *argp) >>>> +{ >>>> + static const struct tun_vnet_hash cap = { >>>> + .flags = TUN_VNET_HASH_REPORT | TUN_VNET_HASH_RSS, >>>> + .types = VIRTIO_NET_SUPPORTED_HASH_TYPES >>>> + }; >>>> + >>>> + return copy_to_user(argp, &cap, sizeof(cap)) ? -EFAULT : 0; >>> >>> Let's has a consistent name for this and the uapi to be consistent >>> with TUNSETIFF/TUNGETIFF. Probably TUNSETVNETHASH and >>> tun_vnet_ioctl_gethash(). >> >> They have different semantics so they should have different names. >> TUNGETIFF reports the value currently set while TUNGETVNETHASHCAP >> reports the value that can be set later. > > I'm not sure I will get here. I meant a symmetric name > > TUNSETVNETHASH and TUNVETVNETHASH. TUNGETVNETHASHCAP does not correspond to TUNGETIFF. The correspondence of ioctl names is as follows: TUNGETFEATURES - TUNGETVNETHASHCAP TUNSETIFF - TUNSETVNETHASH TUNGETIFF - no corresponding ioctl for the virtio-net hash features Regards, Akihiko Odaki
On Tue, Mar 11, 2025 at 2:11 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > > On 2025/03/11 9:38, Jason Wang wrote: > > On Mon, Mar 10, 2025 at 3:45 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >> > >> On 2025/03/10 12:55, Jason Wang wrote: > >>> On Fri, Mar 7, 2025 at 7:01 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>> > >>>> Hash reporting > >>>> ============== > >>>> > >>>> Allow the guest to reuse the hash value to make receive steering > >>>> consistent between the host and guest, and to save hash computation. > >>>> > >>>> RSS > >>>> === > >>>> > >>>> RSS is a receive steering algorithm that can be negotiated to use with > >>>> virtio_net. Conventionally the hash calculation was done by the VMM. > >>>> However, computing the hash after the queue was chosen defeats the > >>>> purpose of RSS. > >>>> > >>>> Another approach is to use eBPF steering program. This approach has > >>>> another downside: it cannot report the calculated hash due to the > >>>> restrictive nature of eBPF steering program. > >>>> > >>>> Introduce the code to perform RSS to the kernel in order to overcome > >>>> thse challenges. An alternative solution is to extend the eBPF steering > >>>> program so that it will be able to report to the userspace, but I didn't > >>>> opt for it because extending the current mechanism of eBPF steering > >>>> program as is because it relies on legacy context rewriting, and > >>>> introducing kfunc-based eBPF will result in non-UAPI dependency while > >>>> the other relevant virtualization APIs such as KVM and vhost_net are > >>>> UAPIs. 
> >>>> > >>>> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com> > >>>> Tested-by: Lei Yang <leiyang@redhat.com> > >>>> --- > >>>> Documentation/networking/tuntap.rst | 7 ++ > >>>> drivers/net/Kconfig | 1 + > >>>> drivers/net/tap.c | 68 ++++++++++++++- > >>>> drivers/net/tun.c | 98 +++++++++++++++++----- > >>>> drivers/net/tun_vnet.h | 159 ++++++++++++++++++++++++++++++++++-- > >>>> include/linux/if_tap.h | 2 + > >>>> include/linux/skbuff.h | 3 + > >>>> include/uapi/linux/if_tun.h | 75 +++++++++++++++++ > >>>> net/core/skbuff.c | 4 + > >>>> 9 files changed, 386 insertions(+), 31 deletions(-) > >>>> > >>>> diff --git a/Documentation/networking/tuntap.rst b/Documentation/networking/tuntap.rst > >>>> index 4d7087f727be5e37dfbf5066a9e9c872cc98898d..86b4ae8caa8ad062c1e558920be42ce0d4217465 100644 > >>>> --- a/Documentation/networking/tuntap.rst > >>>> +++ b/Documentation/networking/tuntap.rst > >>>> @@ -206,6 +206,13 @@ enable is true we enable it, otherwise we disable it:: > >>>> return ioctl(fd, TUNSETQUEUE, (void *)&ifr); > >>>> } > >>>> [...] > >>>> +static inline long tun_vnet_ioctl_sethash(struct tun_vnet_hash_container __rcu **hashp, > >>>> + bool can_rss, void __user *argp) > >>> > >>> So again, can_rss seems to be tricky. Looking at its caller, it tires > >>> to make eBPF and RSS mutually exclusive. I still don't understand why > >>> we need this. Allow eBPF program to override some of the path seems to > >>> be common practice. > >>> > >>> What's more, we didn't try (or even can't) to make automq and eBPF to > >>> be mutually exclusive. So I still didn't see what we gain from this > >>> and it complicates the codes and may lead to ambiguous uAPI/behaviour. > >> > >> automq and eBPF are mutually exclusive; automq is disabled when an eBPF > >> steering program is set so I followed the example here. > > > > I meant from the view of uAPI, the kernel doesn't or can't reject eBPF > > while using automq. 
> > >> > >> We don't even have an interface for eBPF to let it fall back to another > >> algorithm. > > > > It doesn't even need this, e.g., XDP overrides the default receiving path. > > > >> I could make it fall back to RSS if the eBPF steering > >> program is designed to fall back to automq when it returns e.g., -1. But > >> such an interface is currently not defined and defining one is out of > >> scope of this patch series. > > > > Just to make sure we are on the same page, I meant we just need to > > make the behaviour consistent: allow eBPF to override the behaviour of > > both automq and rss. > > That assumes eBPF takes precedence over RSS, which is not obvious to me. Well, it's kind of obvious. Not to mention the eBPF selector, we have other eBPF stuff like skbedit etc. > > Let's add an interface for the eBPF steering program to fall back to > another steering algorithm. I said it is out of scope before, but it > makes clear that the eBPF steering program takes precedence over other > algorithms and allows us to delete the code for the configuration > validation in this patch. Fallback is out of scope but it's not what I meant. I meant that in the current uAPI, eBPF takes precedence over automq. It's much simpler to stick to this precedence unless we see an obvious advantage. > > > > >> > >>> [...] > >>> Is there a chance that we can reach here without TUN_VNET_HASH_REPORT? > >>> If yes, it should be a bug. > >> > >> It is possible to use RSS without TUN_VNET_HASH_REPORT. > > > > Another call to separate the ioctls then. > > RSS and hash reporting are not completely independent though. Spec said: """ VIRTIO_NET_F_RSS Requires VIRTIO_NET_F_CTRL_VQ. """ > > A plot twist is the "types" parameter; it is a parameter that is > "common" for RSS and hash reporting. So we can share part of the structure through the uAPI. 
> RSS and hash reporting must share > this parameter when both are enabled at the same time; otherwise RSS may > compute hash values that are not suited for hash reporting. Is this mandated by the spec? If yes, we can add a check. If not, userspace takes the risk of a misconfiguration, which we don't need to bother with. Note that the spec uses different commands for hash_report and rss. > > The parameter will be duplicated if we have separate ioctls for RSS and > hash reporting, and the kernel will have a chicken-and-egg problem when > ensuring they are synchronized; when the ioctl for RSS is issued, should > the kernel ensure the "types" parameter is identical with one specified > for hash reporting? It will not work if the userspace may decide to > configure hash reporting after RSS. > See my reply above. Thanks
On Tue, Mar 11, 2025 at 2:17 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > > On 2025/03/11 9:38, Jason Wang wrote: > > On Mon, Mar 10, 2025 at 3:45 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >> > >> On 2025/03/10 12:55, Jason Wang wrote: > >>> On Fri, Mar 7, 2025 at 7:01 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>> > >>>> Hash reporting > >>>> ============== > >>>> > >>>> Allow the guest to reuse the hash value to make receive steering > >>>> consistent between the host and guest, and to save hash computation. > >>>> > >>>> RSS > >>>> === > >>>> > >>>> RSS is a receive steering algorithm that can be negotiated to use with > >>>> virtio_net. Conventionally the hash calculation was done by the VMM. > >>>> However, computing the hash after the queue was chosen defeats the > >>>> purpose of RSS. > >>>> > >>>> Another approach is to use eBPF steering program. This approach has > >>>> another downside: it cannot report the calculated hash due to the > >>>> restrictive nature of eBPF steering program. > >>>> > >>>> Introduce the code to perform RSS to the kernel in order to overcome > >>>> thse challenges. An alternative solution is to extend the eBPF steering > >>>> program so that it will be able to report to the userspace, but I didn't > >>>> opt for it because extending the current mechanism of eBPF steering > >>>> program as is because it relies on legacy context rewriting, and > >>>> introducing kfunc-based eBPF will result in non-UAPI dependency while > >>>> the other relevant virtualization APIs such as KVM and vhost_net are > >>>> UAPIs. 
> >>>> > >>>> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com> > >>>> Tested-by: Lei Yang <leiyang@redhat.com> > >>>> --- > >>>> Documentation/networking/tuntap.rst | 7 ++ > >>>> drivers/net/Kconfig | 1 + > >>>> drivers/net/tap.c | 68 ++++++++++++++- > >>>> drivers/net/tun.c | 98 +++++++++++++++++----- > >>>> drivers/net/tun_vnet.h | 159 ++++++++++++++++++++++++++++++++++-- > >>>> include/linux/if_tap.h | 2 + > >>>> include/linux/skbuff.h | 3 + > >>>> include/uapi/linux/if_tun.h | 75 +++++++++++++++++ > >>>> net/core/skbuff.c | 4 + > >>>> 9 files changed, 386 insertions(+), 31 deletions(-) > >>>> [...] > >>> Let's has a consistent name for this and the uapi to be consistent > >>> with TUNSETIFF/TUNGETIFF. Probably TUNSETVNETHASH and > >>> tun_vnet_ioctl_gethash(). > >> > >> They have different semantics so they should have different names. > >> TUNGETIFF reports the value currently set while TUNGETVNETHASHCAP > >> reports the value that can be set later. > > > > I'm not sure I will get here. I meant a symmetric name > > > > TUNSETVNETHASH and TUNVETVNETHASH. > > TUNGETVNETHASHCAP does not correspond to TUNGETIFF. The correspondence > of ioctl names is as follows: > TUNGETFEATURES - TUNGETVNETHASHCAP TUNGETFEATURES returns the value set from TUNSETIFF. This differs from the TUNGETVNETHASHCAP semantics, which just return the capabilities. +static inline long tun_vnet_ioctl_gethashcap(void __user *argp) +{ + static const struct tun_vnet_hash cap = { + .flags = TUN_VNET_HASH_REPORT | TUN_VNET_HASH_RSS, + .types = VIRTIO_NET_SUPPORTED_HASH_TYPES + }; + + return copy_to_user(argp, &cap, sizeof(cap)) ? -EFAULT : 0; +} TUNGETFEATURES doesn't help too much for a non-persistent TAP as userspace knows what value it set before. > TUNSETIFF - TUNSETVNETHASH > TUNGETIFF - no corresponding ioctl for the virtio-net hash features And this sounds odd and hints at an incomplete uAPI, as userspace needs to know what can be set before doing TUNSETVNETHASH. 
> > Regards, > Akihiko Odaki > Thanks
On 2025/03/12 11:35, Jason Wang wrote: > On Tue, Mar 11, 2025 at 2:11 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >> >> On 2025/03/11 9:38, Jason Wang wrote: >>> On Mon, Mar 10, 2025 at 3:45 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>> >>>> On 2025/03/10 12:55, Jason Wang wrote: >>>>> On Fri, Mar 7, 2025 at 7:01 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>>>> >>>>>> Hash reporting >>>>>> ============== >>>>>> >>>>>> Allow the guest to reuse the hash value to make receive steering >>>>>> consistent between the host and guest, and to save hash computation. >>>>>> >>>>>> RSS >>>>>> === >>>>>> >>>>>> RSS is a receive steering algorithm that can be negotiated to use with >>>>>> virtio_net. Conventionally the hash calculation was done by the VMM. >>>>>> However, computing the hash after the queue was chosen defeats the >>>>>> purpose of RSS. >>>>>> >>>>>> Another approach is to use eBPF steering program. This approach has >>>>>> another downside: it cannot report the calculated hash due to the >>>>>> restrictive nature of eBPF steering program. >>>>>> >>>>>> Introduce the code to perform RSS to the kernel in order to overcome >>>>>> thse challenges. An alternative solution is to extend the eBPF steering >>>>>> program so that it will be able to report to the userspace, but I didn't >>>>>> opt for it because extending the current mechanism of eBPF steering >>>>>> program as is because it relies on legacy context rewriting, and >>>>>> introducing kfunc-based eBPF will result in non-UAPI dependency while >>>>>> the other relevant virtualization APIs such as KVM and vhost_net are >>>>>> UAPIs. 
>>>>>> >>>>>> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com> >>>>>> Tested-by: Lei Yang <leiyang@redhat.com> >>>>>> --- >>>>>> Documentation/networking/tuntap.rst | 7 ++ >>>>>> drivers/net/Kconfig | 1 + >>>>>> drivers/net/tap.c | 68 ++++++++++++++- >>>>>> drivers/net/tun.c | 98 +++++++++++++++++----- >>>>>> drivers/net/tun_vnet.h | 159 ++++++++++++++++++++++++++++++++++-- >>>>>> include/linux/if_tap.h | 2 + >>>>>> include/linux/skbuff.h | 3 + >>>>>> include/uapi/linux/if_tun.h | 75 +++++++++++++++++ >>>>>> net/core/skbuff.c | 4 + >>>>>> 9 files changed, 386 insertions(+), 31 deletions(-) >>>>>> >>>>>> diff --git a/Documentation/networking/tuntap.rst b/Documentation/networking/tuntap.rst >>>>>> index 4d7087f727be5e37dfbf5066a9e9c872cc98898d..86b4ae8caa8ad062c1e558920be42ce0d4217465 100644 >>>>>> --- a/Documentation/networking/tuntap.rst >>>>>> +++ b/Documentation/networking/tuntap.rst >>>>>> @@ -206,6 +206,13 @@ enable is true we enable it, otherwise we disable it:: >>>>>> return ioctl(fd, TUNSETQUEUE, (void *)&ifr); >>>>>> } >>>>>> > > [...] > >>>>>> +static inline long tun_vnet_ioctl_sethash(struct tun_vnet_hash_container __rcu **hashp, >>>>>> + bool can_rss, void __user *argp) >>>>> >>>>> So again, can_rss seems to be tricky. Looking at its caller, it tires >>>>> to make eBPF and RSS mutually exclusive. I still don't understand why >>>>> we need this. Allow eBPF program to override some of the path seems to >>>>> be common practice. >>>>> >>>>> What's more, we didn't try (or even can't) to make automq and eBPF to >>>>> be mutually exclusive. So I still didn't see what we gain from this >>>>> and it complicates the codes and may lead to ambiguous uAPI/behaviour. >>>> >>>> automq and eBPF are mutually exclusive; automq is disabled when an eBPF >>>> steering program is set so I followed the example here. >>> >>> I meant from the view of uAPI, the kernel doesn't or can't reject eBPF >>> while using automq. 
>> > >> >>>> We don't even have an interface for eBPF to let it fall back to another >>>> alogirhtm. >>> >>> It doesn't even need this, e.g XDP overrides the default receiving path. >>> >>>> I could make it fall back to RSS if the eBPF steeering >>>> program is designed to fall back to automq when it returns e.g., -1. But >>>> such an interface is currently not defined and defining one is out of >>>> scope of this patch series. >>> >>> Just to make sure we are on the same page, I meant we just need to >>> make the behaviour consistent: allow eBPF to override the behaviour of >>> both automq and rss. >> >> That assumes eBPF takes precedence over RSS, which is not obvious to me. > > Well, it's kind of obvious. Not speaking the eBPF selector, we have > other eBPF stuffs like skbedit etc. > >> >> Let's add an interface for the eBPF steering program to fall back to >> another steering algorithm. I said it is out of scope before, but it >> makes clear that the eBPF steering program takes precedence over other >> algorithms and allows us to delete the code for the configuration >> validation in this patch. > > Fallback is out of scope but it's not what I meant. > > I meant in the current uAPI take eBPF precedence over automq. It's > much more simpler to stick this precedence unless we see obvious > advanatge. We still have three different design options that preserve the current precedence: 1) Precedence order: eBPF -> RSS -> automq 2) Precedence order: RSS -> eBPF -> automq 3) Precedence order: eBPF OR RSS -> automq where eBPF and RSS are mutually exclusive I think this is a unique situation for this steering program and I could not find another example in other eBPF stuffs. The current version implements 3) because it is not obvious whether we should choose either 1) or 2). But 1) will be the most capable option if eBPF has a fall-back feature. > >> >>> >>>> >>>>> > > [...] > >>>>> Is there a chance that we can reach here without TUN_VNET_HASH_REPORT? 
>>>>> If yes, it should be a bug. >>>> >>>> It is possible to use RSS without TUN_VNET_HASH_REPORT. >>> >>> Another call to separate the ioctls then. >> >> RSS and hash reporting are not completely independent though. > > Spec said: > > """ > VIRTIO_NET_F_RSSRequires VIRTIO_NET_F_CTRL_VQ. > """ I meant the features can be enabled independently, but they will share the hash type set when they are enabled at the same time. > >> >> A plot twist is the "types" parameter; it is a parameter that is >> "common" for RSS and hash reporting. > > So we can share part of the structure through the uAPI. Isn't that what this patch does? > >> RSS and hash reporting must share >> this parameter when both are enabled at the same time; otherwise RSS may >> compute hash values that are not suited for hash reporting. > > Is this mandated by the spec? If yes, we can add a check. If not, > userspace risk themselves as a mis-configuration which we don't need > to bother. Yes, it is mandated. 5.1.6.4.3 Hash calculation for incoming packets says: > A device attempts to calculate a per-packet hash in the following > cases: > > - The feature VIRTIO_NET_F_RSS was negotiated. The device uses the > hash to determine the receive virtqueue to place incoming packets. > - The feature VIRTIO_NET_F_HASH_REPORT was negotiated. The device > reports the hash value and the hash type with the packet. > > If the feature VIRTIO_NET_F_RSS was negotiated: > > - The device uses hash_types of the virtio_net_rss_config structure > as ’Enabled hash types’ bitmask. > - The device uses a key as defined in hash_key_data and hash_key_length of the virtio_net_rss_config structure (see > 5.1.6.5.7.1). > > If the feature VIRTIO_NET_F_RSS was not negotiated: > > - The device uses hash_types of the virtio_net_hash_config structure > as ’Enabled hash types’ bitmask. > - The device uses a key as defined in hash_key_data and > hash_key_length of the virtio_net_hash_config structure (see > 5.1.6.5.6.4). 
So when both VIRTIO_NET_F_RSS and VIRTIO_NET_F_HASH_REPORT are negotiated, virtio_net_rss_config controls not only RSS but also the reported hash values and types. They cannot diverge. > > Note that spec use different commands for hash_report and rss. TUNSETVNETHASH is different from these commands in that it also negotiates VIRTIO_NET_F_HASH_REPORT and VIRTIO_NET_F_RSS. In the virtio-net specification, it is not defined what would happen if these features are negotiated but the VIRTIO_NET_CTRL_MQ_RSS_CONFIG or VIRTIO_NET_CTRL_MQ_HASH_CONFIG commands are not sent. There is no such ambiguity with TUNSETVNETHASH. Regards, Akihiko Odaki > >> >> The paramter will be duplicated if we have separate ioctls for RSS and >> hash reporting, and the kernel will have a chiken-egg problem when >> ensuring they are synchronized; when the ioctl for RSS is issued, should >> the kernel ensure the "types" parameter is identical with one specified >> for hash reporting? It will not work if the userspace may decide to >> configure hash reporting after RSS. >> > > See my reply above. > > Thanks >
On 2025/03/12 11:59, Jason Wang wrote: > On Tue, Mar 11, 2025 at 2:17 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >> >> On 2025/03/11 9:38, Jason Wang wrote: >>> On Mon, Mar 10, 2025 at 3:45 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>> >>>> On 2025/03/10 12:55, Jason Wang wrote: >>>>> On Fri, Mar 7, 2025 at 7:01 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>>>> >>>>>> Hash reporting >>>>>> ============== >>>>>> >>>>>> Allow the guest to reuse the hash value to make receive steering >>>>>> consistent between the host and guest, and to save hash computation. >>>>>> >>>>>> RSS >>>>>> === >>>>>> >>>>>> RSS is a receive steering algorithm that can be negotiated to use with >>>>>> virtio_net. Conventionally the hash calculation was done by the VMM. >>>>>> However, computing the hash after the queue was chosen defeats the >>>>>> purpose of RSS. >>>>>> >>>>>> Another approach is to use eBPF steering program. This approach has >>>>>> another downside: it cannot report the calculated hash due to the >>>>>> restrictive nature of eBPF steering program. >>>>>> >>>>>> Introduce the code to perform RSS to the kernel in order to overcome >>>>>> thse challenges. An alternative solution is to extend the eBPF steering >>>>>> program so that it will be able to report to the userspace, but I didn't >>>>>> opt for it because extending the current mechanism of eBPF steering >>>>>> program as is because it relies on legacy context rewriting, and >>>>>> introducing kfunc-based eBPF will result in non-UAPI dependency while >>>>>> the other relevant virtualization APIs such as KVM and vhost_net are >>>>>> UAPIs. 
>>>>>> >>>>>> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com> >>>>>> Tested-by: Lei Yang <leiyang@redhat.com> >>>>>> --- >>>>>> Documentation/networking/tuntap.rst | 7 ++ >>>>>> drivers/net/Kconfig | 1 + >>>>>> drivers/net/tap.c | 68 ++++++++++++++- >>>>>> drivers/net/tun.c | 98 +++++++++++++++++----- >>>>>> drivers/net/tun_vnet.h | 159 ++++++++++++++++++++++++++++++++++-- >>>>>> include/linux/if_tap.h | 2 + >>>>>> include/linux/skbuff.h | 3 + >>>>>> include/uapi/linux/if_tun.h | 75 +++++++++++++++++ >>>>>> net/core/skbuff.c | 4 + >>>>>> 9 files changed, 386 insertions(+), 31 deletions(-) >>>>>> > > [...] > >>>>> Let's has a consistent name for this and the uapi to be consistent >>>>> with TUNSETIFF/TUNGETIFF. Probably TUNSETVNETHASH and >>>>> tun_vnet_ioctl_gethash(). >>>> >>>> They have different semantics so they should have different names. >>>> TUNGETIFF reports the value currently set while TUNGETVNETHASHCAP >>>> reports the value that can be set later. >>> >>> I'm not sure I will get here. I meant a symmetric name >>> >>> TUNSETVNETHASH and TUNVETVNETHASH. >> >> TUNGETVNETHASHCAP does not correspond to TUNGETIFF. The correspondence >> of ioctl names is as follows: >> TUNGETFEATURES - TUNGETVNETHASHCAP > > TUNGETFEATURES returns the value set from TUNSETIFF. This differs from > TUNGETVNETHASHCAP semantic which just return the capabilities. > > +static inline long tun_vnet_ioctl_gethashcap(void __user *argp) > +{ > + static const struct tun_vnet_hash cap = { > + .flags = TUN_VNET_HASH_REPORT | TUN_VNET_HASH_RSS, > + .types = VIRTIO_NET_SUPPORTED_HASH_TYPES > + }; > + > + return copy_to_user(argp, &cap, sizeof(cap)) ? -EFAULT : 0; > +} > > TUNGETFEATURES doesn't' help too much for non-persist TAP as userspace > knows what value it set before. 
> >> TUNSETIFF - TUNSETVNETHASH >> TUNGETIFF - no corresponding ioctl for the virtio-net hash features > > And this sounds odd and a hint for a incomplete uAPI as userspace > needs to know knowing what can set before doing TUNSETVNETHASH. You are confusing TUNGETFEATURES with TUNGETIFF. Below is the code that implements TUNGETFEATURES: if (cmd == TUNGETFEATURES) { /* Currently this just means: "what IFF flags are valid?". * This is needed because we never checked for invalid flags on * TUNSETIFF. */ return put_user(IFF_TUN | IFF_TAP | IFF_NO_CARRIER | TUN_FEATURES, (unsigned int __user*)argp); } else if (cmd == TUNSETQUEUE) { Regards, Akihiko Odaki > >> >> Regards, >> Akihiko Odaki >> > > Thanks >
On Wed, Mar 12, 2025 at 1:03 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > > On 2025/03/12 11:35, Jason Wang wrote: > > On Tue, Mar 11, 2025 at 2:11 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >> > >> On 2025/03/11 9:38, Jason Wang wrote: > >>> On Mon, Mar 10, 2025 at 3:45 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>> > >>>> On 2025/03/10 12:55, Jason Wang wrote: > >>>>> On Fri, Mar 7, 2025 at 7:01 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>>>> > >>>>>> Hash reporting > >>>>>> ============== > >>>>>> > >>>>>> Allow the guest to reuse the hash value to make receive steering > >>>>>> consistent between the host and guest, and to save hash computation. > >>>>>> > >>>>>> RSS > >>>>>> === > >>>>>> > >>>>>> RSS is a receive steering algorithm that can be negotiated to use with > >>>>>> virtio_net. Conventionally the hash calculation was done by the VMM. > >>>>>> However, computing the hash after the queue was chosen defeats the > >>>>>> purpose of RSS. > >>>>>> > >>>>>> Another approach is to use eBPF steering program. This approach has > >>>>>> another downside: it cannot report the calculated hash due to the > >>>>>> restrictive nature of eBPF steering program. > >>>>>> > >>>>>> Introduce the code to perform RSS to the kernel in order to overcome > >>>>>> thse challenges. An alternative solution is to extend the eBPF steering > >>>>>> program so that it will be able to report to the userspace, but I didn't > >>>>>> opt for it because extending the current mechanism of eBPF steering > >>>>>> program as is because it relies on legacy context rewriting, and > >>>>>> introducing kfunc-based eBPF will result in non-UAPI dependency while > >>>>>> the other relevant virtualization APIs such as KVM and vhost_net are > >>>>>> UAPIs. 
> >>>>>> > >>>>>> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com> > >>>>>> Tested-by: Lei Yang <leiyang@redhat.com> > >>>>>> --- > >>>>>> Documentation/networking/tuntap.rst | 7 ++ > >>>>>> drivers/net/Kconfig | 1 + > >>>>>> drivers/net/tap.c | 68 ++++++++++++++- > >>>>>> drivers/net/tun.c | 98 +++++++++++++++++----- > >>>>>> drivers/net/tun_vnet.h | 159 ++++++++++++++++++++++++++++++++++-- > >>>>>> include/linux/if_tap.h | 2 + > >>>>>> include/linux/skbuff.h | 3 + > >>>>>> include/uapi/linux/if_tun.h | 75 +++++++++++++++++ > >>>>>> net/core/skbuff.c | 4 + > >>>>>> 9 files changed, 386 insertions(+), 31 deletions(-) > >>>>>> > >>>>>> diff --git a/Documentation/networking/tuntap.rst b/Documentation/networking/tuntap.rst > >>>>>> index 4d7087f727be5e37dfbf5066a9e9c872cc98898d..86b4ae8caa8ad062c1e558920be42ce0d4217465 100644 > >>>>>> --- a/Documentation/networking/tuntap.rst > >>>>>> +++ b/Documentation/networking/tuntap.rst > >>>>>> @@ -206,6 +206,13 @@ enable is true we enable it, otherwise we disable it:: > >>>>>> return ioctl(fd, TUNSETQUEUE, (void *)&ifr); > >>>>>> } > >>>>>> > > > > [...] > > > >>>>>> +static inline long tun_vnet_ioctl_sethash(struct tun_vnet_hash_container __rcu **hashp, > >>>>>> + bool can_rss, void __user *argp) > >>>>> > >>>>> So again, can_rss seems to be tricky. Looking at its caller, it tires > >>>>> to make eBPF and RSS mutually exclusive. I still don't understand why > >>>>> we need this. Allow eBPF program to override some of the path seems to > >>>>> be common practice. > >>>>> > >>>>> What's more, we didn't try (or even can't) to make automq and eBPF to > >>>>> be mutually exclusive. So I still didn't see what we gain from this > >>>>> and it complicates the codes and may lead to ambiguous uAPI/behaviour. > >>>> > >>>> automq and eBPF are mutually exclusive; automq is disabled when an eBPF > >>>> steering program is set so I followed the example here. 
> >>> > >>> I meant from the view of uAPI, the kernel doesn't or can't reject eBPF > >>> while using automq. > >> > >> > >>>> We don't even have an interface for eBPF to let it fall back to another > >>>> alogirhtm. > >>> > >>> It doesn't even need this, e.g XDP overrides the default receiving path. > >>> > >>>> I could make it fall back to RSS if the eBPF steeering > >>>> program is designed to fall back to automq when it returns e.g., -1. But > >>>> such an interface is currently not defined and defining one is out of > >>>> scope of this patch series. > >>> > >>> Just to make sure we are on the same page, I meant we just need to > >>> make the behaviour consistent: allow eBPF to override the behaviour of > >>> both automq and rss. > >> > >> That assumes eBPF takes precedence over RSS, which is not obvious to me. > > > > Well, it's kind of obvious. Not speaking the eBPF selector, we have > > other eBPF stuffs like skbedit etc. > > > >> > >> Let's add an interface for the eBPF steering program to fall back to > >> another steering algorithm. I said it is out of scope before, but it > >> makes clear that the eBPF steering program takes precedence over other > >> algorithms and allows us to delete the code for the configuration > >> validation in this patch. > > > > Fallback is out of scope but it's not what I meant. > > > > I meant in the current uAPI take eBPF precedence over automq. It's > > much more simpler to stick this precedence unless we see obvious > > advanatge. > > We still have three different design options that preserve the current > precedence: > > 1) Precedence order: eBPF -> RSS -> automq > 2) Precedence order: RSS -> eBPF -> automq > 3) Precedence order: eBPF OR RSS -> automq where eBPF and RSS are > mutually exclusive > > I think this is a unique situation for this steering program and I could > not find another example in other eBPF stuffs. As described above, queue mapping could be overridden by tc-ebpf. 
So there's no way to guarantee the RSS will work: https://github.com/DPDK/dpdk/blob/main/drivers/net/tap/bpf/tap_rss.c#L262 Making eBPF first leaves a chance for the management layer to override the choice of Qemu. > > The current version implements 3) because it is not obvious whether we > should choose either 1) or 2). But you didn't explain why you chose 3), and it leads to tricky code (e.g the can_rss stuff etc). > But 1) will be the most capable option if > eBPF has a fall-back feature. > > > > >> > >>> > >>>> > >>>>> > > > > [...] > > > >>>>> Is there a chance that we can reach here without TUN_VNET_HASH_REPORT? > >>>>> If yes, it should be a bug. > >>>> > >>>> It is possible to use RSS without TUN_VNET_HASH_REPORT. > >>> > >>> Another call to separate the ioctls then. > >> > >> RSS and hash reporting are not completely independent though. > > > > Spec said: > > > > """ > > VIRTIO_NET_F_RSSRequires VIRTIO_NET_F_CTRL_VQ. > > """ > > I meant the features can be enabled independently, but they will share > the hash type set when they are enabled at the same time. Looking at the spec: Hash report uses: """ struct virtio_net_hash_config { le32 hash_types; le16 reserved[4]; u8 hash_key_length; u8 hash_key_data[hash_key_length]; }; """ RSS uses """ struct rss_rq_id { le16 vq_index_1_16: 15; /* Bits 1 to 16 of the virtqueue index */ le16 reserved: 1; /* Set to zero */ }; struct virtio_net_rss_config { le32 hash_types; le16 indirection_table_mask; struct rss_rq_id unclassified_queue; struct rss_rq_id indirection_table[indirection_table_length]; le16 max_tx_vq; u8 hash_key_length; u8 hash_key_data[hash_key_length]; }; """ Instead of trying to figure out whether we can share some data structures, why not simply start from what has been done in the spec? This would also make things easier for userspace, which could simply do a 1:1 mapping between ctrl vq commands and the tun uAPI. 
> > > > >> > >> A plot twist is the "types" parameter; it is a parameter that is > >> "common" for RSS and hash reporting. > > > > So we can share part of the structure through the uAPI. > > Isn't that what this patch does? I don't see that; basically, I see only one TUNSETVNETHASH that is used to set both hash report and rss: """ +/** + * define TUNSETVNETHASH - ioctl to configure virtio_net hashing + * + * The argument is a pointer to &struct tun_vnet_hash. + * + * The argument is a pointer to the compound of the following in order if + * %TUN_VNET_HASH_RSS is set: + * + * 1. &struct tun_vnet_hash + * 2. &struct tun_vnet_hash_rss + * 3. Indirection table + * 4. Key + * """ And it seems to lack parameters like max_tx_vq. What's more, we already have the virtio-net uAPI. Why not simply reuse it? > > > > >> RSS and hash reporting must share > >> this parameter when both are enabled at the same time; otherwise RSS may > >> compute hash values that are not suited for hash reporting. > > > > Is this mandated by the spec? If yes, we can add a check. If not, > > userspace risk themselves as a mis-configuration which we don't need > > to bother. > > Yes, it is mandated. 5.1.6.4.3 Hash calculation for incoming packets says: > > A device attempts to calculate a per-packet hash in the following > > cases: > > > > - The feature VIRTIO_NET_F_RSS was negotiated. The device uses the > > hash to determine the receive virtqueue to place incoming packets. > > - The feature VIRTIO_NET_F_HASH_REPORT was negotiated. The device > > reports the hash value and the hash type with the packet. > > > > If the feature VIRTIO_NET_F_RSS was negotiated: > > > > - The device uses hash_types of the virtio_net_rss_config structure > > as ’Enabled hash types’ bitmask. > > - The device uses a key as defined in hash_key_data and > hash_key_length of the virtio_net_rss_config structure (see > > 5.1.6.5.7.1). 
> > > > If the feature VIRTIO_NET_F_RSS was not negotiated: > > > > - The device uses hash_types of the virtio_net_hash_config structure > > as ’Enabled hash types’ bitmask. > > - The device uses a key as defined in hash_key_data and > > hash_key_length of the virtio_net_hash_config structure (see > > .1.6.5.6.4). > > So when both VIRTIO_NET_F_RSS and VIRTIO_NET_F_HASH_REPORT are > negotiated, virtio_net_rss_config not only controls RSS but also the > reported hash values and types. They cannot be divergent. > > > > > Note that spec use different commands for hash_report and rss. > > TUNSETVNETHASH is different from these commands in terms that it also > negotiates VIRTIO_NET_F_HASH_REPORT and VIRTIO_NET_F_RSS. > There are two different "issues" here: 1) Whether or not we need to use a unified API for negotiating RSS and HASH_REPORT features 2) Whether or not we need to use a unified API for setting RSS and HASH_REPORT configuration What I want to say is point 2. But what you raise is point 1. For simplicity, it looks to me like it's a call for having separated ioctls for feature negotiation (for example via TUNSETIFF). You may argue that either RSS or HASH_REPORT requires configurations, we can just follow what spec defines or not (e.g what happens if RSS/HASH_REPORT were negotiated but no configurations were set). > In the virtio-net specification, it is not defined what would happen if > these features are negotiated but the VIRTIO_NET_CTRL_MQ_RSS_CONFIG or > VIRTIO_NET_CTRL_MQ_HASH_CONFIG commands are not sent. There is no such > ambiguity with TUNSETVNETHASH. So I don't see advantages of unifying hash reports and rss into a single ioctl. Let's just follow what has been done in the spec, which uses separate commands. Tuntap is not a good place to debate whether those commands could be unified or not. We would need to move that to the spec, but given that the spec is already done, it might be too late, or there may be too few advantages to justify another design. 
Thanks > > Regards, > Akihiko Odaki > > > > >> > >> The paramter will be duplicated if we have separate ioctls for RSS and > >> hash reporting, and the kernel will have a chiken-egg problem when > >> ensuring they are synchronized; when the ioctl for RSS is issued, should > >> the kernel ensure the "types" parameter is identical with one specified > >> for hash reporting? It will not work if the userspace may decide to > >> configure hash reporting after RSS. > >> > > > > See my reply above. > > > > Thanks > > >
On Wed, Mar 12, 2025 at 1:55 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > > On 2025/03/12 11:59, Jason Wang wrote: > > On Tue, Mar 11, 2025 at 2:17 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >> > >> On 2025/03/11 9:38, Jason Wang wrote: > >>> On Mon, Mar 10, 2025 at 3:45 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>> > >>>> On 2025/03/10 12:55, Jason Wang wrote: > >>>>> On Fri, Mar 7, 2025 at 7:01 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>>>> > >>>>>> Hash reporting > >>>>>> ============== > >>>>>> > >>>>>> Allow the guest to reuse the hash value to make receive steering > >>>>>> consistent between the host and guest, and to save hash computation. > >>>>>> > >>>>>> RSS > >>>>>> === > >>>>>> > >>>>>> RSS is a receive steering algorithm that can be negotiated to use with > >>>>>> virtio_net. Conventionally the hash calculation was done by the VMM. > >>>>>> However, computing the hash after the queue was chosen defeats the > >>>>>> purpose of RSS. > >>>>>> > >>>>>> Another approach is to use eBPF steering program. This approach has > >>>>>> another downside: it cannot report the calculated hash due to the > >>>>>> restrictive nature of eBPF steering program. > >>>>>> > >>>>>> Introduce the code to perform RSS to the kernel in order to overcome > >>>>>> thse challenges. An alternative solution is to extend the eBPF steering > >>>>>> program so that it will be able to report to the userspace, but I didn't > >>>>>> opt for it because extending the current mechanism of eBPF steering > >>>>>> program as is because it relies on legacy context rewriting, and > >>>>>> introducing kfunc-based eBPF will result in non-UAPI dependency while > >>>>>> the other relevant virtualization APIs such as KVM and vhost_net are > >>>>>> UAPIs. 
> >>>>>> > >>>>>> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com> > >>>>>> Tested-by: Lei Yang <leiyang@redhat.com> > >>>>>> --- > >>>>>> Documentation/networking/tuntap.rst | 7 ++ > >>>>>> drivers/net/Kconfig | 1 + > >>>>>> drivers/net/tap.c | 68 ++++++++++++++- > >>>>>> drivers/net/tun.c | 98 +++++++++++++++++----- > >>>>>> drivers/net/tun_vnet.h | 159 ++++++++++++++++++++++++++++++++++-- > >>>>>> include/linux/if_tap.h | 2 + > >>>>>> include/linux/skbuff.h | 3 + > >>>>>> include/uapi/linux/if_tun.h | 75 +++++++++++++++++ > >>>>>> net/core/skbuff.c | 4 + > >>>>>> 9 files changed, 386 insertions(+), 31 deletions(-) > >>>>>> > > > > [...] > > > >>>>> Let's has a consistent name for this and the uapi to be consistent > >>>>> with TUNSETIFF/TUNGETIFF. Probably TUNSETVNETHASH and > >>>>> tun_vnet_ioctl_gethash(). > >>>> > >>>> They have different semantics so they should have different names. > >>>> TUNGETIFF reports the value currently set while TUNGETVNETHASHCAP > >>>> reports the value that can be set later. > >>> > >>> I'm not sure I will get here. I meant a symmetric name > >>> > >>> TUNSETVNETHASH and TUNVETVNETHASH. > >> > >> TUNGETVNETHASHCAP does not correspond to TUNGETIFF. The correspondence > >> of ioctl names is as follows: > >> TUNGETFEATURES - TUNGETVNETHASHCAP > > > > TUNGETFEATURES returns the value set from TUNSETIFF. This differs from > > TUNGETVNETHASHCAP semantic which just return the capabilities. > > > > +static inline long tun_vnet_ioctl_gethashcap(void __user *argp) > > +{ > > + static const struct tun_vnet_hash cap = { > > + .flags = TUN_VNET_HASH_REPORT | TUN_VNET_HASH_RSS, > > + .types = VIRTIO_NET_SUPPORTED_HASH_TYPES > > + }; > > + > > + return copy_to_user(argp, &cap, sizeof(cap)) ? -EFAULT : 0; > > +} > > > > TUNGETFEATURES doesn't' help too much for non-persist TAP as userspace > > knows what value it set before. 
> > > >> TUNSETIFF - TUNSETVNETHASH > >> TUNGETIFF - no corresponding ioctl for the virtio-net hash features > > > > And this sounds odd and a hint for a incomplete uAPI as userspace > > needs to know knowing what can set before doing TUNSETVNETHASH. > > You are confused with TUNGETFEATURES and TUNGETIFF. Below is the code > that implements TUNGETFEATURES: > if (cmd == TUNGETFEATURES) { > /* Currently this just means: "what IFF flags are valid?". > * This is needed because we never checked for invalid flags on > * TUNSETIFF. > */ > return put_user(IFF_TUN | IFF_TAP | IFF_NO_CARRIER | > TUN_FEATURES, (unsigned int __user*)argp); > } else if (cmd == TUNSETQUEUE) { Right. Thanks > > Regards, > Akihiko Odaki > > > > >> > >> Regards, > >> Akihiko Odaki > >> > > > > Thanks > > >
On 2025/03/17 10:12, Jason Wang wrote: > On Wed, Mar 12, 2025 at 1:03 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >> >> On 2025/03/12 11:35, Jason Wang wrote: >>> On Tue, Mar 11, 2025 at 2:11 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>> >>>> On 2025/03/11 9:38, Jason Wang wrote: >>>>> On Mon, Mar 10, 2025 at 3:45 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>>>> >>>>>> On 2025/03/10 12:55, Jason Wang wrote: >>>>>>> On Fri, Mar 7, 2025 at 7:01 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>>>>>> >>>>>>>> Hash reporting >>>>>>>> ============== >>>>>>>> >>>>>>>> Allow the guest to reuse the hash value to make receive steering >>>>>>>> consistent between the host and guest, and to save hash computation. >>>>>>>> >>>>>>>> RSS >>>>>>>> === >>>>>>>> >>>>>>>> RSS is a receive steering algorithm that can be negotiated to use with >>>>>>>> virtio_net. Conventionally the hash calculation was done by the VMM. >>>>>>>> However, computing the hash after the queue was chosen defeats the >>>>>>>> purpose of RSS. >>>>>>>> >>>>>>>> Another approach is to use eBPF steering program. This approach has >>>>>>>> another downside: it cannot report the calculated hash due to the >>>>>>>> restrictive nature of eBPF steering program. >>>>>>>> >>>>>>>> Introduce the code to perform RSS to the kernel in order to overcome >>>>>>>> thse challenges. An alternative solution is to extend the eBPF steering >>>>>>>> program so that it will be able to report to the userspace, but I didn't >>>>>>>> opt for it because extending the current mechanism of eBPF steering >>>>>>>> program as is because it relies on legacy context rewriting, and >>>>>>>> introducing kfunc-based eBPF will result in non-UAPI dependency while >>>>>>>> the other relevant virtualization APIs such as KVM and vhost_net are >>>>>>>> UAPIs. 
>>>>>>>> >>>>>>>> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com> >>>>>>>> Tested-by: Lei Yang <leiyang@redhat.com> >>>>>>>> --- >>>>>>>> Documentation/networking/tuntap.rst | 7 ++ >>>>>>>> drivers/net/Kconfig | 1 + >>>>>>>> drivers/net/tap.c | 68 ++++++++++++++- >>>>>>>> drivers/net/tun.c | 98 +++++++++++++++++----- >>>>>>>> drivers/net/tun_vnet.h | 159 ++++++++++++++++++++++++++++++++++-- >>>>>>>> include/linux/if_tap.h | 2 + >>>>>>>> include/linux/skbuff.h | 3 + >>>>>>>> include/uapi/linux/if_tun.h | 75 +++++++++++++++++ >>>>>>>> net/core/skbuff.c | 4 + >>>>>>>> 9 files changed, 386 insertions(+), 31 deletions(-) >>>>>>>> >>>>>>>> diff --git a/Documentation/networking/tuntap.rst b/Documentation/networking/tuntap.rst >>>>>>>> index 4d7087f727be5e37dfbf5066a9e9c872cc98898d..86b4ae8caa8ad062c1e558920be42ce0d4217465 100644 >>>>>>>> --- a/Documentation/networking/tuntap.rst >>>>>>>> +++ b/Documentation/networking/tuntap.rst >>>>>>>> @@ -206,6 +206,13 @@ enable is true we enable it, otherwise we disable it:: >>>>>>>> return ioctl(fd, TUNSETQUEUE, (void *)&ifr); >>>>>>>> } >>>>>>>> >>> >>> [...] >>> >>>>>>>> +static inline long tun_vnet_ioctl_sethash(struct tun_vnet_hash_container __rcu **hashp, >>>>>>>> + bool can_rss, void __user *argp) >>>>>>> >>>>>>> So again, can_rss seems to be tricky. Looking at its caller, it tires >>>>>>> to make eBPF and RSS mutually exclusive. I still don't understand why >>>>>>> we need this. Allow eBPF program to override some of the path seems to >>>>>>> be common practice. >>>>>>> >>>>>>> What's more, we didn't try (or even can't) to make automq and eBPF to >>>>>>> be mutually exclusive. So I still didn't see what we gain from this >>>>>>> and it complicates the codes and may lead to ambiguous uAPI/behaviour. >>>>>> >>>>>> automq and eBPF are mutually exclusive; automq is disabled when an eBPF >>>>>> steering program is set so I followed the example here. 
>>>>> >>>>> I meant from the view of uAPI, the kernel doesn't or can't reject eBPF >>>>> while using automq. >>>> > >> >>>>>> We don't even have an interface for eBPF to let it fall back to another >>>>>> alogirhtm. >>>>> >>>>> It doesn't even need this, e.g XDP overrides the default receiving path. >>>>> >>>>>> I could make it fall back to RSS if the eBPF steeering >>>>>> program is designed to fall back to automq when it returns e.g., -1. But >>>>>> such an interface is currently not defined and defining one is out of >>>>>> scope of this patch series. >>>>> >>>>> Just to make sure we are on the same page, I meant we just need to >>>>> make the behaviour consistent: allow eBPF to override the behaviour of >>>>> both automq and rss. >>>> >>>> That assumes eBPF takes precedence over RSS, which is not obvious to me. >>> >>> Well, it's kind of obvious. Not speaking the eBPF selector, we have >>> other eBPF stuffs like skbedit etc. >>> >>>> >>>> Let's add an interface for the eBPF steering program to fall back to >>>> another steering algorithm. I said it is out of scope before, but it >>>> makes clear that the eBPF steering program takes precedence over other >>>> algorithms and allows us to delete the code for the configuration >>>> validation in this patch. >>> >>> Fallback is out of scope but it's not what I meant. >>> >>> I meant in the current uAPI take eBPF precedence over automq. It's >>> much more simpler to stick this precedence unless we see obvious >>> advanatge. >> >> We still have three different design options that preserve the current >> precedence: >> >> 1) Precedence order: eBPF -> RSS -> automq >> 2) Precedence order: RSS -> eBPF -> automq >> 3) Precedence order: eBPF OR RSS -> automq where eBPF and RSS are >> mutually exclusive >> >> I think this is a unique situation for this steering program and I could >> not find another example in other eBPF stuffs. > > As described above, queue mapping could be overridden by tc-ebpf. 
So > there's no way to guarantee the RSS will work: > > https://github.com/DPDK/dpdk/blob/main/drivers/net/tap/bpf/tap_rss.c#L262 > > Making eBPF first leaves a chance for the management layer to override > the choice of Qemu. I referred to the eBPF steering program instead of tc-ebpf. tc-ebpf is nothing to do with the TUNSETSTEERINGEBPF ioctl, which this patch changes. > >> >> The current version implements 3) because it is not obvious whether we >> should choose either 1) or 2). > > But you didn't explain why you choose 3), and it leads to tricky code > (e.g the can_rss stuff etc). I wrote: "because it is not obvious whether we should choose either 1) or 2)", but I think I can explain it better: When an eBPF steering program cannot implement a fallback, it means the eBPF steering program requests the full control over the steering. On the other hand, RSS also requests the same control. So these two will conflict and the entity controlling the steering will be undefined when both are enabled. 3) eliminates the undefined semantics by rejecting to enable both. An alternative approach is to allow eBPF steering programs to fall back. When both the eBPF program and RSS are enabled, RSS will gain the control of steering under the well-defined situation where the eBPF steering program decides to fall back. > >> But 1) will be the most capable option if >> eBPF has a fall-back feature. >> >>> >>>> >>>>> >>>>>> >>>>>>> >>> >>> [...] >>> >>>>>>> Is there a chance that we can reach here without TUN_VNET_HASH_REPORT? >>>>>>> If yes, it should be a bug. >>>>>> >>>>>> It is possible to use RSS without TUN_VNET_HASH_REPORT. >>>>> >>>>> Another call to separate the ioctls then. >>>> >>>> RSS and hash reporting are not completely independent though. >>> >>> Spec said: >>> >>> """ >>> VIRTIO_NET_F_RSSRequires VIRTIO_NET_F_CTRL_VQ. >>> """ >> >> I meant the features can be enabled independently, but they will share >> the hash type set when they are enabled at the same time. 
> > Looking at the spec: > > Hash repot uses: > > """ > struct virtio_net_hash_config { > le32 hash_types; > le16 reserved[4]; > u8 hash_key_length; > u8 hash_key_data[hash_key_length]; > }; > """ > > RSS uses > > """ > struct rss_rq_id { > le16 vq_index_1_16: 15; /* Bits 1 to 16 of the virtqueue index */ > le16 reserved: 1; /* Set to zero */ > }; > > struct virtio_net_rss_config { > le32 hash_types; > le16 indirection_table_mask; > struct rss_rq_id unclassified_queue; > struct rss_rq_id indirection_table[indirection_table_length]; > le16 max_tx_vq; > u8 hash_key_length; > u8 hash_key_data[hash_key_length]; > }; > """ > > Instead of trying to figure out whether we can share some data > structures, why not simply start from what has been done in the spec? > This would ease the usersapce as well where it can simply do 1:1 > mapping between ctrl vq command and tun uAPI. The spec also defines struct virtio_net_hash_config (which will be used when RSS is disabled) and struct virtio_net_rss_config to match the layout to share some fields. However, the UAPI does not follow the interface design of virtio due to some problems with these structures. Below is the definition of struct virtio_net_hash_config: struct virtio_net_hash_config { le32 hash_types; le16 reserved[4]; u8 hash_key_length; u8 hash_key_data[hash_key_length]; }; Here, hash_types, hash_key_length, and hash_key_data are shared with struct virtio_net_rss_config. One problem is that struct virtio_net_rss_config has a flexible array (indirection_table) between hash_types and hash_key_length. This is something we cannot express with C. Another problem is that the semantics of the key in struct virtio_net_hash_config is not defined in the spec. To solve these problems, I defined the UAPI structures that do not include indiretion_table. > >> >>> >>>> >>>> A plot twist is the "types" parameter; it is a parameter that is >>>> "common" for RSS and hash reporting. 
>>> >>> So we can share part of the structure through the uAPI. >> >> Isn't that what this patch does? > > I didn't see, basically I see only one TUNSETVNETHASH that is used to > set both hash report and rss: The UAPI shares struct tun_vnet_hash for both hash report and rss. > > """ > +/** > + * define TUNSETVNETHASH - ioctl to configure virtio_net hashing > + * > + * The argument is a pointer to &struct tun_vnet_hash. > + * > + * The argument is a pointer to the compound of the following in order if > + * %TUN_VNET_HASH_RSS is set: > + * > + * 1. &struct tun_vnet_hash > + * 2. &struct tun_vnet_hash_rss > + * 3. Indirection table > + * 4. Key > + * > """ > > And it seems to lack parameters like max_tx_vq. max_tx_vq is not relevant with hashing. > > What's more, we've already had virito-net uAPI. Why not simply reusing them? See the above. > >> >>> >>>> RSS and hash reporting must share >>>> this parameter when both are enabled at the same time; otherwise RSS may >>>> compute hash values that are not suited for hash reporting. >>> >>> Is this mandated by the spec? If yes, we can add a check. If not, >>> userspace risk themselves as a mis-configuration which we don't need >>> to bother. >> >> Yes, it is mandated. 5.1.6.4.3 Hash calculation for incoming packets says: >> > A device attempts to calculate a per-packet hash in the following >> > cases: >> > >> > - The feature VIRTIO_NET_F_RSS was negotiated. The device uses the >> > hash to determine the receive virtqueue to place incoming packets. >> > - The feature VIRTIO_NET_F_HASH_REPORT was negotiated. The device >> > reports the hash value and the hash type with the packet. >> > >> > If the feature VIRTIO_NET_F_RSS was negotiated: >> > >> > - The device uses hash_types of the virtio_net_rss_config structure >> > as ’Enabled hash types’ bitmask. >> > - The device uses a key as defined in hash_key_data and >> hash_key_length of the virtio_net_rss_config structure (see >> > 5.1.6.5.7.1). 
>> > >> > If the feature VIRTIO_NET_F_RSS was not negotiated: >> > >> > - The device uses hash_types of the virtio_net_hash_config structure >> > as ’Enabled hash types’ bitmask. >> > - The device uses a key as defined in hash_key_data and >> > hash_key_length of the virtio_net_hash_config structure (see >> > .1.6.5.6.4). >> >> So when both VIRTIO_NET_F_RSS and VIRTIO_NET_F_HASH_REPORT are >> negotiated, virtio_net_rss_config not only controls RSS but also the >> reported hash values and types. They cannot be divergent. >> >>> >>> Note that spec use different commands for hash_report and rss. >> >> TUNSETVNETHASH is different from these commands in terms that it also >> negotiates VIRTIO_NET_F_HASH_REPORT and VIRTIO_NET_F_RSS. >> > > There Are different "issues" here: > > 1) Whether or not we need to use a unified API for negotiating RSS and > HASH_REPORT features > 2) Whether or not we need to sue a unified API for setting RSS and > HASH_REPORT configuration > > What I want to say is point 2. But what you raise is point 1. > > For simplicity, it looks to me like it's a call for having separated > ioctls for feature negotiation (for example via TUNSETIFF). You may > argue that either RSS or HASH_REPORT requires configurations, we can > just follow what spec defines or not (e.g what happens if > RSS/HASH_REPORT were negotiated but no configurations were set). Unfortunately TUNSETIFF does not fit in this use case. The flags set with TUNSETIFF are fixed, but the guest can request a different feature set anytime by resetting the device. > >> In the virtio-net specification, it is not defined what would happen if >> these features are negotiated but the VIRTIO_NET_CTRL_MQ_RSS_CONFIG or >> VIRTIO_NET_CTRL_MQ_HASH_CONFIG commands are not sent. There is no such >> ambiguity with TUNSETVNETHASH. > > So I don't see advantages of unifying hash reports and rss into a > single ioctl. Let's just follow what has been done in the spec that > uses separated commands. 
Tuntap is not a good place to debate whether > those commands could be unified or not. We need to move it to the spec > but assuming spec has been done, it might be too late or too few > advantages for having another design. It makes sense for the spec to reuse the generic feature negotiation mechanism, but the situation is different for tuntap; we cannot use TUNSETIFF and need to define another. Then why don't we exploit this opportunity to have an interface with well-defined semantics? The virtio spec does its best as an interface between the host and guest and tuntap does its best as an UAPI. I don't think there is an advantage to split ioctls to follow the spec after all. It makes sense if we can pass-through virtio commands to tuntap, but it is not possible as ioctl operation codes are different from virtio commands. The best possibility is to share structures, not commands, and I don't think even sharing structures makes sense here because of the reasons described above. Regards, Akihiko Odaki > > Thanks > >> >> Regards, >> Akihiko Odaki >> >>> >>>> >>>> The paramter will be duplicated if we have separate ioctls for RSS and >>>> hash reporting, and the kernel will have a chiken-egg problem when >>>> ensuring they are synchronized; when the ioctl for RSS is issued, should >>>> the kernel ensure the "types" parameter is identical with one specified >>>> for hash reporting? It will not work if the userspace may decide to >>>> configure hash reporting after RSS. >>>> >>> >>> See my reply above. >>> >>> Thanks >>> >> >
On Mon, Mar 17, 2025 at 3:07 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > > On 2025/03/17 10:12, Jason Wang wrote: > > On Wed, Mar 12, 2025 at 1:03 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >> > >> On 2025/03/12 11:35, Jason Wang wrote: > >>> On Tue, Mar 11, 2025 at 2:11 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>> > >>>> On 2025/03/11 9:38, Jason Wang wrote: > >>>>> On Mon, Mar 10, 2025 at 3:45 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>>>> > >>>>>> On 2025/03/10 12:55, Jason Wang wrote: > >>>>>>> On Fri, Mar 7, 2025 at 7:01 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>>>>>> > >>>>>>>> Hash reporting > >>>>>>>> ============== > >>>>>>>> > >>>>>>>> Allow the guest to reuse the hash value to make receive steering > >>>>>>>> consistent between the host and guest, and to save hash computation. > >>>>>>>> > >>>>>>>> RSS > >>>>>>>> === > >>>>>>>> > >>>>>>>> RSS is a receive steering algorithm that can be negotiated to use with > >>>>>>>> virtio_net. Conventionally the hash calculation was done by the VMM. > >>>>>>>> However, computing the hash after the queue was chosen defeats the > >>>>>>>> purpose of RSS. > >>>>>>>> > >>>>>>>> Another approach is to use eBPF steering program. This approach has > >>>>>>>> another downside: it cannot report the calculated hash due to the > >>>>>>>> restrictive nature of eBPF steering program. > >>>>>>>> > >>>>>>>> Introduce the code to perform RSS to the kernel in order to overcome > >>>>>>>> thse challenges. An alternative solution is to extend the eBPF steering > >>>>>>>> program so that it will be able to report to the userspace, but I didn't > >>>>>>>> opt for it because extending the current mechanism of eBPF steering > >>>>>>>> program as is because it relies on legacy context rewriting, and > >>>>>>>> introducing kfunc-based eBPF will result in non-UAPI dependency while > >>>>>>>> the other relevant virtualization APIs such as KVM and vhost_net are > >>>>>>>> UAPIs. 
> >>>>>>>> > >>>>>>>> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com> > >>>>>>>> Tested-by: Lei Yang <leiyang@redhat.com> > >>>>>>>> --- > >>>>>>>> Documentation/networking/tuntap.rst | 7 ++ > >>>>>>>> drivers/net/Kconfig | 1 + > >>>>>>>> drivers/net/tap.c | 68 ++++++++++++++- > >>>>>>>> drivers/net/tun.c | 98 +++++++++++++++++----- > >>>>>>>> drivers/net/tun_vnet.h | 159 ++++++++++++++++++++++++++++++++++-- > >>>>>>>> include/linux/if_tap.h | 2 + > >>>>>>>> include/linux/skbuff.h | 3 + > >>>>>>>> include/uapi/linux/if_tun.h | 75 +++++++++++++++++ > >>>>>>>> net/core/skbuff.c | 4 + > >>>>>>>> 9 files changed, 386 insertions(+), 31 deletions(-) > >>>>>>>> > >>>>>>>> diff --git a/Documentation/networking/tuntap.rst b/Documentation/networking/tuntap.rst > >>>>>>>> index 4d7087f727be5e37dfbf5066a9e9c872cc98898d..86b4ae8caa8ad062c1e558920be42ce0d4217465 100644 > >>>>>>>> --- a/Documentation/networking/tuntap.rst > >>>>>>>> +++ b/Documentation/networking/tuntap.rst > >>>>>>>> @@ -206,6 +206,13 @@ enable is true we enable it, otherwise we disable it:: > >>>>>>>> return ioctl(fd, TUNSETQUEUE, (void *)&ifr); > >>>>>>>> } > >>>>>>>> > >>> > >>> [...] > >>> > >>>>>>>> +static inline long tun_vnet_ioctl_sethash(struct tun_vnet_hash_container __rcu **hashp, > >>>>>>>> + bool can_rss, void __user *argp) > >>>>>>> > >>>>>>> So again, can_rss seems to be tricky. Looking at its caller, it tires > >>>>>>> to make eBPF and RSS mutually exclusive. I still don't understand why > >>>>>>> we need this. Allow eBPF program to override some of the path seems to > >>>>>>> be common practice. > >>>>>>> > >>>>>>> What's more, we didn't try (or even can't) to make automq and eBPF to > >>>>>>> be mutually exclusive. So I still didn't see what we gain from this > >>>>>>> and it complicates the codes and may lead to ambiguous uAPI/behaviour. 
> >>>>>> > >>>>>> automq and eBPF are mutually exclusive; automq is disabled when an eBPF > >>>>>> steering program is set so I followed the example here. > >>>>> > >>>>> I meant from the view of uAPI, the kernel doesn't or can't reject eBPF > >>>>> while using automq. > >>>> > >> > >>>>>> We don't even have an interface for eBPF to let it fall back to another > >>>>>> alogirhtm. > >>>>> > >>>>> It doesn't even need this, e.g XDP overrides the default receiving path. > >>>>> > >>>>>> I could make it fall back to RSS if the eBPF steeering > >>>>>> program is designed to fall back to automq when it returns e.g., -1. But > >>>>>> such an interface is currently not defined and defining one is out of > >>>>>> scope of this patch series. > >>>>> > >>>>> Just to make sure we are on the same page, I meant we just need to > >>>>> make the behaviour consistent: allow eBPF to override the behaviour of > >>>>> both automq and rss. > >>>> > >>>> That assumes eBPF takes precedence over RSS, which is not obvious to me. > >>> > >>> Well, it's kind of obvious. Not speaking the eBPF selector, we have > >>> other eBPF stuffs like skbedit etc. > >>> > >>>> > >>>> Let's add an interface for the eBPF steering program to fall back to > >>>> another steering algorithm. I said it is out of scope before, but it > >>>> makes clear that the eBPF steering program takes precedence over other > >>>> algorithms and allows us to delete the code for the configuration > >>>> validation in this patch. > >>> > >>> Fallback is out of scope but it's not what I meant. > >>> > >>> I meant in the current uAPI take eBPF precedence over automq. It's > >>> much more simpler to stick this precedence unless we see obvious > >>> advanatge. 
> >> > >> We still have three different design options that preserve the current > >> precedence: > >> > >> 1) Precedence order: eBPF -> RSS -> automq > >> 2) Precedence order: RSS -> eBPF -> automq > >> 3) Precedence order: eBPF OR RSS -> automq where eBPF and RSS are > >> mutually exclusive > >> > >> I think this is a unique situation for this steering program and I could > >> not find another example in other eBPF stuffs. > > > > As described above, queue mapping could be overridden by tc-ebpf. So > > there's no way to guarantee the RSS will work: > > > > https://github.com/DPDK/dpdk/blob/main/drivers/net/tap/bpf/tap_rss.c#L262 > > > > Making eBPF first leaves a chance for the management layer to override > > the choice of Qemu. > > I referred to the eBPF steering program instead of tc-ebpf. tc-ebpf is > nothing to do with the TUNSETSTEERINGEBPF ioctl, which this patch changes. I meant you can't do "full control" in any case, the point below doesn't stand. Queue mapping could be restored even if RSS is set. > > > > >> > >> The current version implements 3) because it is not obvious whether we > >> should choose either 1) or 2). > > > > But you didn't explain why you choose 3), and it leads to tricky code > > (e.g the can_rss stuff etc). > > I wrote: "because it is not obvious whether we should choose either 1) > or 2)", but I think I can explain it better: > > When an eBPF steering program cannot implement a fallback, it means the > eBPF steering program requests the full control over the steering. On > the other hand, RSS also requests the same control. So these two will > conflict and the entity controlling the steering will be undefined when > both are enabled. Well, the fallback is orthogonal to the proposal here. We haven't had that since the introduction of the eBPF steering program. This means automq has been in "conflict" with eBPF for years. 
Again, there is another advantage: letting the eBPF program take precedence allows the management layer to override Qemu's steering. > > 3) eliminates the undefined semantics by rejecting to enable both. Would this lead to a userspace-noticeable change of behaviour? And what do you mean by "rejecting to enable both"? > An > alternative approach is to allow eBPF steering programs to fall back. > When both the eBPF program and RSS are enabled, RSS will gain the > control of steering under the well-defined situation where the eBPF > steering program decides to fall back. How about just sticking to the eBPF precedence in this proposal and introducing the fallback on top? This helps to speed up the iteration (as the version has been iterated to 11). > > > > >> But 1) will be the most capable option if > >> eBPF has a fall-back feature. > >> > >>> > >>>> > >>>>> > >>>>>> > >>>>>>> > >>> > >>> [...] > >>> > >>>>>>> Is there a chance that we can reach here without TUN_VNET_HASH_REPORT? > >>>>>>> If yes, it should be a bug. > >>>>>> > >>>>>> It is possible to use RSS without TUN_VNET_HASH_REPORT. > >>>>> > >>>>> Another call to separate the ioctls then. > >>>> > >>>> RSS and hash reporting are not completely independent though. > >>> > >>> Spec said: > >>> > >>> """ > >>> VIRTIO_NET_F_RSSRequires VIRTIO_NET_F_CTRL_VQ. > >>> """ > >> > >> I meant the features can be enabled independently, but they will share > >> the hash type set when they are enabled at the same time.
> > > > Looking at the spec: > > > > Hash repot uses: > > > > """ > > struct virtio_net_hash_config { > > le32 hash_types; > > le16 reserved[4]; > > u8 hash_key_length; > > u8 hash_key_data[hash_key_length]; > > }; > > """ > > > > RSS uses > > > > """ > > struct rss_rq_id { > > le16 vq_index_1_16: 15; /* Bits 1 to 16 of the virtqueue index */ > > le16 reserved: 1; /* Set to zero */ > > }; > > > > struct virtio_net_rss_config { > > le32 hash_types; > > le16 indirection_table_mask; > > struct rss_rq_id unclassified_queue; > > struct rss_rq_id indirection_table[indirection_table_length]; > > le16 max_tx_vq; > > u8 hash_key_length; > > u8 hash_key_data[hash_key_length]; > > }; > > """ > > > > Instead of trying to figure out whether we can share some data > > structures, why not simply start from what has been done in the spec? > > This would ease the usersapce as well where it can simply do 1:1 > > mapping between ctrl vq command and tun uAPI. > > The spec also defines struct virtio_net_hash_config (which will be used > when RSS is disabled) and struct virtio_net_rss_config to match the > layout to share some fields. However, the UAPI does not follow the > interface design of virtio due to some problems with these structures. Copy-paste error. The above is copied from the virtio spec, but I meant the existing uAPI in virtio_net.h: /* * The command VIRTIO_NET_CTRL_MQ_RSS_CONFIG has the same effect as * VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET does and additionally configures * the receive steering to use a hash calculated for incoming packet * to decide on receive virtqueue to place the packet. The command * also provides parameters to calculate a hash and receive virtqueue. 
*/ struct virtio_net_rss_config { __le32 hash_types; __le16 indirection_table_mask; __le16 unclassified_queue; __le16 indirection_table[1/* + indirection_table_mask */]; __le16 max_tx_vq; __u8 hash_key_length; __u8 hash_key_data[/* hash_key_length */]; }; #define VIRTIO_NET_CTRL_MQ_RSS_CONFIG 1 /* * The command VIRTIO_NET_CTRL_MQ_HASH_CONFIG requests the device * to include in the virtio header of the packet the value of the * calculated hash and the report type of hash. It also provides * parameters for hash calculation. The command requires feature * VIRTIO_NET_F_HASH_REPORT to be negotiated to extend the * layout of virtio header as defined in virtio_net_hdr_v1_hash. */ struct virtio_net_hash_config { __le32 hash_types; /* for compatibility with virtio_net_rss_config */ __le16 reserved[4]; __u8 hash_key_length; __u8 hash_key_data[/* hash_key_length */]; }; This has been used by Qemu but I see a virtio-net version of: struct virtio_net_ctrl_rss { u32 hash_types; u16 indirection_table_mask; u16 unclassified_queue; u16 hash_cfg_reserved; /* for HASH_CONFIG (see virtio_net_hash_config for details) */ u16 max_tx_vq; u8 hash_key_length; u8 key[VIRTIO_NET_RSS_MAX_KEY_SIZE]; u16 *indirection_table; }; This is ugly and results in a tricky code when trying to submit RSS/HASH commands to the device: if (vi->has_rss) { sg_buf_size = sizeof(uint16_t) * vi->rss_indir_table_size; sg_set_buf(&sgs[1], vi->rss.indirection_table, sg_buf_size); } else { sg_set_buf(&sgs[1], &vi->rss.hash_cfg_reserved, sizeof(uint16_t)); } > > Below is the definition of struct virtio_net_hash_config: > > struct virtio_net_hash_config { > le32 hash_types; > le16 reserved[4]; > u8 hash_key_length; > u8 hash_key_data[hash_key_length]; > }; > > Here, hash_types, hash_key_length, and hash_key_data are shared with > struct virtio_net_rss_config. > > One problem is that struct virtio_net_rss_config has a flexible array > (indirection_table) between hash_types and hash_key_length. 
This is > something we cannot express with C. We can split virtio_net_rss_config to make the arrays easier to deal with; more below. > > Another problem is that the semantics of the key in struct > virtio_net_hash_config is not defined in the spec. If this is the case, let's fix that in the spec first to make sure our uAPI aligns with the spec without ambiguity. It would be a nightmare to deal with inconsistency between the virtio spec and Linux uAPIs. > > To solve these problems, I defined the UAPI structures that do not > include indiretion_table. > > > > >> > >>> > >>>> > >>>> A plot twist is the "types" parameter; it is a parameter that is > >>>> "common" for RSS and hash reporting. > >>> > >>> So we can share part of the structure through the uAPI. > >> > >> Isn't that what this patch does? > > > > I didn't see, basically I see only one TUNSETVNETHASH that is used to > > set both hash report and rss: > > The UAPI shares struct tun_vnet_hash for both hash report and rss. I meant sharing a structure between two ioctls, if possible, instead of reusing one specific structure for two semantics in a single ioctl. Though I don't think we need any sharing. > > > > > """ > > +/** > > + * define TUNSETVNETHASH - ioctl to configure virtio_net hashing > > + * > > + * The argument is a pointer to &struct tun_vnet_hash. > > + * > > + * The argument is a pointer to the compound of the following in order if > > + * %TUN_VNET_HASH_RSS is set: > > + * > > + * 1. &struct tun_vnet_hash > > + * 2. &struct tun_vnet_hash_rss > > + * 3. Indirection table > > + * 4. Key > > + * > > """ > > > > And it seems to lack parameters like max_tx_vq. > > max_tx_vq is not relevant with hashing. It is needed for RSS and we don't have that, no? > > > > > What's more, we've already had virito-net uAPI. Why not simply reusing them? > > See the above.
> > > > >> > >>> > >>>> RSS and hash reporting must share > >>>> this parameter when both are enabled at the same time; otherwise RSS may > >>>> compute hash values that are not suited for hash reporting. > >>> > >>> Is this mandated by the spec? If yes, we can add a check. If not, > >>> userspace risk themselves as a mis-configuration which we don't need > >>> to bother. > >> > >> Yes, it is mandated. 5.1.6.4.3 Hash calculation for incoming packets says: > >> > A device attempts to calculate a per-packet hash in the following > >> > cases: > >> > > >> > - The feature VIRTIO_NET_F_RSS was negotiated. The device uses the > >> > hash to determine the receive virtqueue to place incoming packets. > >> > - The feature VIRTIO_NET_F_HASH_REPORT was negotiated. The device > >> > reports the hash value and the hash type with the packet. > >> > > >> > If the feature VIRTIO_NET_F_RSS was negotiated: > >> > > >> > - The device uses hash_types of the virtio_net_rss_config structure > >> > as ’Enabled hash types’ bitmask. > >> > - The device uses a key as defined in hash_key_data and > >> hash_key_length of the virtio_net_rss_config structure (see > >> > 5.1.6.5.7.1). > >> > > >> > If the feature VIRTIO_NET_F_RSS was not negotiated: > >> > > >> > - The device uses hash_types of the virtio_net_hash_config structure > >> > as ’Enabled hash types’ bitmask. > >> > - The device uses a key as defined in hash_key_data and > >> > hash_key_length of the virtio_net_hash_config structure (see > >> > .1.6.5.6.4). > >> > >> So when both VIRTIO_NET_F_RSS and VIRTIO_NET_F_HASH_REPORT are > >> negotiated, virtio_net_rss_config not only controls RSS but also the > >> reported hash values and types. They cannot be divergent. > >> > >>> > >>> Note that spec use different commands for hash_report and rss. > >> > >> TUNSETVNETHASH is different from these commands in terms that it also > >> negotiates VIRTIO_NET_F_HASH_REPORT and VIRTIO_NET_F_RSS. 
> >> > > > > There Are different "issues" here: > > > > 1) Whether or not we need to use a unified API for negotiating RSS and > > HASH_REPORT features > > 2) Whether or not we need to sue a unified API for setting RSS and > > HASH_REPORT configuration > > > > What I want to say is point 2. But what you raise is point 1. > > > > For simplicity, it looks to me like it's a call for having separated > > ioctls for feature negotiation (for example via TUNSETIFF). You may > > argue that either RSS or HASH_REPORT requires configurations, we can > > just follow what spec defines or not (e.g what happens if > > RSS/HASH_REPORT were negotiated but no configurations were set). > > Unfortunately TUNSETIFF does not fit in this use case. The flags set > with TUNSETIFF are fixed, but the guest can request a different feature > set anytime by resetting the device. TUNSETIFF enables the device to handle RSS/HASH_REPORT, while TUNSETHASH/RSS deals with the RSS/HASH commands from userspace. This is what we already do for multiqueue and the vnet header. TUNSETIFF requires CAP_NET_ADMIN, so this could be an extra safeguard against unprivileged userspace. > > > >> In the virtio-net specification, it is not defined what would > happen if > >> these features are negotiated but the VIRTIO_NET_CTRL_MQ_RSS_CONFIG or > >> VIRTIO_NET_CTRL_MQ_HASH_CONFIG commands are not sent. There is no such > >> ambiguity with TUNSETVNETHASH. > > > > So I don't see advantages of unifying hash reports and rss into a > > single ioctl. Let's just follow what has been done in the spec that > > uses separated commands. Tuntap is not a good place to debate whether > > those commands could be unified or not. We need to move it to the spec > > but assuming spec has been done, it might be too late or too few > > advantages for having another design.
> > It makes sense for the spec to reuse the generic feature negotiation > mechanism, but the situation is different for tuntap; we cannot use > TUNSETIFF and need to define another. Then why don't we exploit this > opportunity to have an interface with well-defined semantics? That's perfectly fine, but it needs to be done in virtio-net's uAPI, not tun's. What's more, if you think the two commands are not well-defined, let's fix that in the virtio spec first. > The virtio > spec does its best as an interface between the host and guest and tuntap > does its best as an UAPI. See above, let's fix the uAPI first. We don't want DPDK to use tun's uAPI for RSS. > > I don't think there is an advantage to split ioctls to follow the spec > after all. It makes sense if we can pass-through virtio commands to > tuntap, but it is not possible as ioctl operation codes are different > from virtio commands. I don't see a connection with the operation code. For example, we can add new uAPIs in virtio-net which could be something like: struct virtio_net_rss_config_header { __le32 hash_types; __le16 indirection_table_mask; __le16 unclassified_queue; __le16 indirection_table[]; }; struct virtio_net_rss_config_tailer { __le16 max_tx_vq; u8 hash_key_length; u8 hash_key_data[]; }; These two would be used by TUNSETVNETRSS, and we can simply reuse virtio_net_hash_config for TUNSETVNETHASH. With this, we can tweak the virtio-net driver to use this new uAPI. Then tap* can reuse this. > The best possibility is to share structures, not > commands, and I don't think even sharing structures makes sense here > because of the reasons described above. I don't want to share structures; I meant starting from something that is simple and has already been sorted out in the virtio spec. Optimization could be done on top.
Thanks > > Regards, > Akihiko Odaki > > > > > Thanks > > > >> > >> Regards, > >> Akihiko Odaki > >> > >>> > >>>> > >>>> The paramter will be duplicated if we have separate ioctls for RSS and > >>>> hash reporting, and the kernel will have a chiken-egg problem when > >>>> ensuring they are synchronized; when the ioctl for RSS is issued, should > >>>> the kernel ensure the "types" parameter is identical with one specified > >>>> for hash reporting? It will not work if the userspace may decide to > >>>> configure hash reporting after RSS. > >>>> > >>> > >>> See my reply above. > >>> > >>> Thanks > >>> > >> > > >
On 2025/03/18 9:15, Jason Wang wrote: > On Mon, Mar 17, 2025 at 3:07 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >> >> On 2025/03/17 10:12, Jason Wang wrote: >>> On Wed, Mar 12, 2025 at 1:03 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>> >>>> On 2025/03/12 11:35, Jason Wang wrote: >>>>> On Tue, Mar 11, 2025 at 2:11 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>>>> >>>>>> On 2025/03/11 9:38, Jason Wang wrote: >>>>>>> On Mon, Mar 10, 2025 at 3:45 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>>>>>> >>>>>>>> On 2025/03/10 12:55, Jason Wang wrote: >>>>>>>>> On Fri, Mar 7, 2025 at 7:01 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>>>>>>>> >>>>>>>>>> Hash reporting >>>>>>>>>> ============== >>>>>>>>>> >>>>>>>>>> Allow the guest to reuse the hash value to make receive steering >>>>>>>>>> consistent between the host and guest, and to save hash computation. >>>>>>>>>> >>>>>>>>>> RSS >>>>>>>>>> === >>>>>>>>>> >>>>>>>>>> RSS is a receive steering algorithm that can be negotiated to use with >>>>>>>>>> virtio_net. Conventionally the hash calculation was done by the VMM. >>>>>>>>>> However, computing the hash after the queue was chosen defeats the >>>>>>>>>> purpose of RSS. >>>>>>>>>> >>>>>>>>>> Another approach is to use eBPF steering program. This approach has >>>>>>>>>> another downside: it cannot report the calculated hash due to the >>>>>>>>>> restrictive nature of eBPF steering program. >>>>>>>>>> >>>>>>>>>> Introduce the code to perform RSS to the kernel in order to overcome >>>>>>>>>> thse challenges. 
An alternative solution is to extend the eBPF steering >>>>>>>>>> program so that it will be able to report to the userspace, but I didn't >>>>>>>>>> opt for it because extending the current mechanism of eBPF steering >>>>>>>>>> program as is because it relies on legacy context rewriting, and >>>>>>>>>> introducing kfunc-based eBPF will result in non-UAPI dependency while >>>>>>>>>> the other relevant virtualization APIs such as KVM and vhost_net are >>>>>>>>>> UAPIs. >>>>>>>>>> >>>>>>>>>> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com> >>>>>>>>>> Tested-by: Lei Yang <leiyang@redhat.com> >>>>>>>>>> --- >>>>>>>>>> Documentation/networking/tuntap.rst | 7 ++ >>>>>>>>>> drivers/net/Kconfig | 1 + >>>>>>>>>> drivers/net/tap.c | 68 ++++++++++++++- >>>>>>>>>> drivers/net/tun.c | 98 +++++++++++++++++----- >>>>>>>>>> drivers/net/tun_vnet.h | 159 ++++++++++++++++++++++++++++++++++-- >>>>>>>>>> include/linux/if_tap.h | 2 + >>>>>>>>>> include/linux/skbuff.h | 3 + >>>>>>>>>> include/uapi/linux/if_tun.h | 75 +++++++++++++++++ >>>>>>>>>> net/core/skbuff.c | 4 + >>>>>>>>>> 9 files changed, 386 insertions(+), 31 deletions(-) >>>>>>>>>> >>>>>>>>>> diff --git a/Documentation/networking/tuntap.rst b/Documentation/networking/tuntap.rst >>>>>>>>>> index 4d7087f727be5e37dfbf5066a9e9c872cc98898d..86b4ae8caa8ad062c1e558920be42ce0d4217465 100644 >>>>>>>>>> --- a/Documentation/networking/tuntap.rst >>>>>>>>>> +++ b/Documentation/networking/tuntap.rst >>>>>>>>>> @@ -206,6 +206,13 @@ enable is true we enable it, otherwise we disable it:: >>>>>>>>>> return ioctl(fd, TUNSETQUEUE, (void *)&ifr); >>>>>>>>>> } >>>>>>>>>> >>>>> >>>>> [...] >>>>> >>>>>>>>>> +static inline long tun_vnet_ioctl_sethash(struct tun_vnet_hash_container __rcu **hashp, >>>>>>>>>> + bool can_rss, void __user *argp) >>>>>>>>> >>>>>>>>> So again, can_rss seems to be tricky. Looking at its caller, it tires >>>>>>>>> to make eBPF and RSS mutually exclusive. I still don't understand why >>>>>>>>> we need this. 
Allow eBPF program to override some of the path seems to >>>>>>>>> be common practice. >>>>>>>>> >>>>>>>>> What's more, we didn't try (or even can't) to make automq and eBPF to >>>>>>>>> be mutually exclusive. So I still didn't see what we gain from this >>>>>>>>> and it complicates the codes and may lead to ambiguous uAPI/behaviour. >>>>>>>> >>>>>>>> automq and eBPF are mutually exclusive; automq is disabled when an eBPF >>>>>>>> steering program is set so I followed the example here. >>>>>>> >>>>>>> I meant from the view of uAPI, the kernel doesn't or can't reject eBPF >>>>>>> while using automq. >>>>>> > >> >>>>>>>> We don't even have an interface for eBPF to let it fall back to another >>>>>>>> alogirhtm. >>>>>>> >>>>>>> It doesn't even need this, e.g XDP overrides the default receiving path. >>>>>>> >>>>>>>> I could make it fall back to RSS if the eBPF steeering >>>>>>>> program is designed to fall back to automq when it returns e.g., -1. But >>>>>>>> such an interface is currently not defined and defining one is out of >>>>>>>> scope of this patch series. >>>>>>> >>>>>>> Just to make sure we are on the same page, I meant we just need to >>>>>>> make the behaviour consistent: allow eBPF to override the behaviour of >>>>>>> both automq and rss. >>>>>> >>>>>> That assumes eBPF takes precedence over RSS, which is not obvious to me. >>>>> >>>>> Well, it's kind of obvious. Not speaking the eBPF selector, we have >>>>> other eBPF stuffs like skbedit etc. >>>>> >>>>>> >>>>>> Let's add an interface for the eBPF steering program to fall back to >>>>>> another steering algorithm. I said it is out of scope before, but it >>>>>> makes clear that the eBPF steering program takes precedence over other >>>>>> algorithms and allows us to delete the code for the configuration >>>>>> validation in this patch. >>>>> >>>>> Fallback is out of scope but it's not what I meant. >>>>> >>>>> I meant in the current uAPI take eBPF precedence over automq. 
It's >>>>> much simpler to stick this precedence unless we see obvious >>>>> advantage. >>>> >>>> We still have three different design options that preserve the current >>>> precedence: >>>> >>>> 1) Precedence order: eBPF -> RSS -> automq >>>> 2) Precedence order: RSS -> eBPF -> automq >>>> 3) Precedence order: eBPF OR RSS -> automq where eBPF and RSS are >>>> mutually exclusive >>>> >>>> I think this is a unique situation for this steering program and I could >>>> not find another example in other eBPF stuffs. >>> >>> As described above, queue mapping could be overridden by tc-ebpf. So >>> there's no way to guarantee the RSS will work: >>> >>> https://github.com/DPDK/dpdk/blob/main/drivers/net/tap/bpf/tap_rss.c#L262 >>> >>> Making eBPF first leaves a chance for the management layer to override >>> the choice of Qemu. >> >> I referred to the eBPF steering program instead of tc-ebpf. tc-ebpf is >> nothing to do with the TUNSETSTEERINGEBPF ioctl, which this patch changes. > > I meant you can't do "full control" in any case, the point below > doesn't stand. Queue mapping could be restored even if RSS is set. What matters here is how we handle the control when tc didn't take it. eBPF, RSS, or automq may take all of it; I referred to that as "full control". > >> >>> >>>> >>>> The current version implements 3) because it is not obvious whether we >>>> should choose either 1) or 2). >>> >>> But you didn't explain why you choose 3), and it leads to tricky code >>> (e.g the can_rss stuff etc). >> >> I wrote: "because it is not obvious whether we should choose either 1) >> or 2)", but I think I can explain it better: >> >> When an eBPF steering program cannot implement a fallback, it means the >> eBPF steering program requests the full control over the steering. On >> the other hand, RSS also requests the same control. So these two will >> conflict and the entity controlling the steering will be undefined when >> both are enabled. 
> > Well, the fallback is orthogonal to the proposal here. We haven't had > that since the introduction of the eBPF steering program. This means > automq has been in "conflict" with eBPF for years. Again, another > advantage, allowing the eBPF program to be the first to allow the > management layer to override Qemu's steering. What if a VMM uses eBPF steering program and the management layer decides to override it with RSS? eBPF obviously takes precedence over automq as eBPF is an opt-in feature and automq is the implicit default. But this logic cannot be applied to decide the order of eBPF and RSS because they are both opt-in features. > >> >> 3) eliminates the undefined semantics by rejecting to enable both. > > This would lead a userspace noticeable change of the behaviour? And > what do you mean by "rejecting to enable both"? Existing userspace code should see no change as it only cares about the case where RSS is enabled. Here, rejecting to enable both means to deny setting an eBPF steering program when RSS is enabled, and vice versa. > >> An >> alternative approach is to allow eBPF steering programs to fall back. >> When both the eBPF program and RSS are enabled, RSS will gain the >> control of steering under the well-defined situation where the eBPF >> steering program decides to fall back. > > How about just stick the eBPF precedence in this proposal and > introduce the fallback on top? This helps to speed up the iteration > (as the version has been iterated to 11). I don't think that helps much since we have another ongoing discussion below and it is not the sole roadblock. > >> >>> >>>> But 1) will be the most capable option if >>>> eBPF has a fall-back feature. >>>> >>>>> >>>>>> >>>>>>> >>>>>>>> >>>>>>>>> >>>>> >>>>> [...] >>>>> >>>>>>>>> Is there a chance that we can reach here without TUN_VNET_HASH_REPORT? >>>>>>>>> If yes, it should be a bug. >>>>>>>> >>>>>>>> It is possible to use RSS without TUN_VNET_HASH_REPORT. 
>>>>>>> >>>>>>> Another call to separate the ioctls then. >>>>>> >>>>>> RSS and hash reporting are not completely independent though. >>>>> >>>>> Spec said: >>>>> >>>>> """ >>>>> VIRTIO_NET_F_RSSRequires VIRTIO_NET_F_CTRL_VQ. >>>>> """ >>>> >>>> I meant the features can be enabled independently, but they will share >>>> the hash type set when they are enabled at the same time. >>> >>> Looking at the spec: >>> >>> Hash repot uses: >>> >>> """ >>> struct virtio_net_hash_config { >>> le32 hash_types; >>> le16 reserved[4]; >>> u8 hash_key_length; >>> u8 hash_key_data[hash_key_length]; >>> }; >>> """ >>> >>> RSS uses >>> >>> """ >>> struct rss_rq_id { >>> le16 vq_index_1_16: 15; /* Bits 1 to 16 of the virtqueue index */ >>> le16 reserved: 1; /* Set to zero */ >>> }; >>> >>> struct virtio_net_rss_config { >>> le32 hash_types; >>> le16 indirection_table_mask; >>> struct rss_rq_id unclassified_queue; >>> struct rss_rq_id indirection_table[indirection_table_length]; >>> le16 max_tx_vq; >>> u8 hash_key_length; >>> u8 hash_key_data[hash_key_length]; >>> }; >>> """ >>> >>> Instead of trying to figure out whether we can share some data >>> structures, why not simply start from what has been done in the spec? >>> This would ease the usersapce as well where it can simply do 1:1 >>> mapping between ctrl vq command and tun uAPI. >> >> The spec also defines struct virtio_net_hash_config (which will be used >> when RSS is disabled) and struct virtio_net_rss_config to match the >> layout to share some fields. However, the UAPI does not follow the >> interface design of virtio due to some problems with these structures. > > Copy-paste error. 
The above is copied from the virtio spec, but I > meant the existing uAPI in virtio_net.h: > > /* > * The command VIRTIO_NET_CTRL_MQ_RSS_CONFIG has the same effect as > * VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET does and additionally configures > * the receive steering to use a hash calculated for incoming packet > * to decide on receive virtqueue to place the packet. The command > * also provides parameters to calculate a hash and receive virtqueue. > */ > struct virtio_net_rss_config { > __le32 hash_types; > __le16 indirection_table_mask; > __le16 unclassified_queue; > __le16 indirection_table[1/* + indirection_table_mask */]; > __le16 max_tx_vq; > __u8 hash_key_length; > __u8 hash_key_data[/* hash_key_length */]; > }; > > #define VIRTIO_NET_CTRL_MQ_RSS_CONFIG 1 > > /* > * The command VIRTIO_NET_CTRL_MQ_HASH_CONFIG requests the device > * to include in the virtio header of the packet the value of the > * calculated hash and the report type of hash. It also provides > * parameters for hash calculation. The command requires feature > * VIRTIO_NET_F_HASH_REPORT to be negotiated to extend the > * layout of virtio header as defined in virtio_net_hdr_v1_hash. 
> */ > struct virtio_net_hash_config { > __le32 hash_types; > /* for compatibility with virtio_net_rss_config */ > __le16 reserved[4]; > __u8 hash_key_length; > __u8 hash_key_data[/* hash_key_length */]; > }; > > This has been used by Qemu but I see a virtio-net version of: > > struct virtio_net_ctrl_rss { > u32 hash_types; > u16 indirection_table_mask; > u16 unclassified_queue; > u16 hash_cfg_reserved; /* for HASH_CONFIG (see > virtio_net_hash_config for details) */ > u16 max_tx_vq; > u8 hash_key_length; > u8 key[VIRTIO_NET_RSS_MAX_KEY_SIZE]; > > u16 *indirection_table; > }; > > This is ugly and results in a tricky code when trying to submit > RSS/HASH commands to the device: > > if (vi->has_rss) { > sg_buf_size = sizeof(uint16_t) * vi->rss_indir_table_size; > sg_set_buf(&sgs[1], vi->rss.indirection_table, sg_buf_size); > } else { > sg_set_buf(&sgs[1], &vi->rss.hash_cfg_reserved, > sizeof(uint16_t)); > } The only reference to struct virtio_net_rss_config in QEMU is to derive the offset of indirection_table. This is because the definition in virtio_net.h also includes indirection_table in the middle and the offsets of later part are unusable. QEMU internally has a structure named VirtioNetRssData which just looks like struct virtio_net_ctrl_rss. > >> >> Below is the definition of struct virtio_net_hash_config: >> >> struct virtio_net_hash_config { >> le32 hash_types; >> le16 reserved[4]; >> u8 hash_key_length; >> u8 hash_key_data[hash_key_length]; >> }; >> >> Here, hash_types, hash_key_length, and hash_key_data are shared with >> struct virtio_net_rss_config. >> >> One problem is that struct virtio_net_rss_config has a flexible array >> (indirection_table) between hash_types and hash_key_length. This is >> something we cannot express with C. > > We can split the virtio_net_rss_config to ease the dealing with > arrays, more below. > >> >> Another problem is that the semantics of the key in struct >> virtio_net_hash_config is not defined in the spec. 
> > If this is the case. Let's fix that in the spec first to make sure our > uAPI aligns with spec without ambiguity. It would be a nightmare to > deal with the inconsistency between virtio spec and Linux uAPIs. The userspace doesn't need to do anything to deal with inconsistency since these fields are unused. > >> >> To solve these problems, I defined the UAPI structures that do not >> include indirection_table. >> >>> >>>> >>>>> >>>>>> >>>>>> A plot twist is the "types" parameter; it is a parameter that is >>>>>> "common" for RSS and hash reporting. >>>>> >>>>> So we can share part of the structure through the uAPI. >>>> >>>> Isn't that what this patch does? >>> >>> I didn't see, basically I see only one TUNSETVNETHASH that is used to >>> set both hash report and rss: >> >> The UAPI shares struct tun_vnet_hash for both hash report and rss. > > I meant sharing structure in two ioctls instead of reusing a specific > structure for two semantics in one ioctl if possible. Though I don't > think we need any sharing. The UAPI implemented in this patch already shares struct tun_vnet_hash and having two ioctls doesn't change that. > >> >>> >>> """ >>> +/** >>> + * define TUNSETVNETHASH - ioctl to configure virtio_net hashing >>> + * >>> + * The argument is a pointer to &struct tun_vnet_hash. >>> + * >>> + * The argument is a pointer to the compound of the following in order if >>> + * %TUN_VNET_HASH_RSS is set: >>> + * >>> + * 1. &struct tun_vnet_hash >>> + * 2. &struct tun_vnet_hash_rss >>> + * 3. Indirection table >>> + * 4. Key >>> + * >>> """ >>> >>> And it seems to lack parameters like max_tx_vq. >> >> max_tx_vq is not relevant with hashing. > > It is needed for RSS and we don't have that, no? No. RSS is Receive Side Scaling, so it is about receiving, not transmission. > >> >>> >>> What's more, we've already had virtio-net uAPI. Why not simply reusing them? >> >> See the above. 
>> >>> >>>> >>>>> >>>>>> RSS and hash reporting must share >>>>>> this parameter when both are enabled at the same time; otherwise RSS may >>>>>> compute hash values that are not suited for hash reporting. >>>>> >>>>> Is this mandated by the spec? If yes, we can add a check. If not, >>>>> userspace risk themselves as a mis-configuration which we don't need >>>>> to bother. >>>> >>>> Yes, it is mandated. 5.1.6.4.3 Hash calculation for incoming packets says: >>>> > A device attempts to calculate a per-packet hash in the following >>>> > cases: >>>> > >>>> > - The feature VIRTIO_NET_F_RSS was negotiated. The device uses the >>>> > hash to determine the receive virtqueue to place incoming packets. >>>> > - The feature VIRTIO_NET_F_HASH_REPORT was negotiated. The device >>>> > reports the hash value and the hash type with the packet. >>>> > >>>> > If the feature VIRTIO_NET_F_RSS was negotiated: >>>> > >>>> > - The device uses hash_types of the virtio_net_rss_config structure >>>> > as ’Enabled hash types’ bitmask. >>>> > - The device uses a key as defined in hash_key_data and >>>> hash_key_length of the virtio_net_rss_config structure (see >>>> > 5.1.6.5.7.1). >>>> > >>>> > If the feature VIRTIO_NET_F_RSS was not negotiated: >>>> > >>>> > - The device uses hash_types of the virtio_net_hash_config structure >>>> > as ’Enabled hash types’ bitmask. >>>> > - The device uses a key as defined in hash_key_data and >>>> > hash_key_length of the virtio_net_hash_config structure (see >>>> > .1.6.5.6.4). >>>> >>>> So when both VIRTIO_NET_F_RSS and VIRTIO_NET_F_HASH_REPORT are >>>> negotiated, virtio_net_rss_config not only controls RSS but also the >>>> reported hash values and types. They cannot be divergent. >>>> >>>>> >>>>> Note that spec use different commands for hash_report and rss. >>>> >>>> TUNSETVNETHASH is different from these commands in terms that it also >>>> negotiates VIRTIO_NET_F_HASH_REPORT and VIRTIO_NET_F_RSS. 
>>>> >>> >>> There are different "issues" here: >>> >>> 1) Whether or not we need to use a unified API for negotiating RSS and >>> HASH_REPORT features >>> 2) Whether or not we need to use a unified API for setting RSS and >>> HASH_REPORT configuration >>> >>> What I want to say is point 2. But what you raise is point 1. >>> >>> For simplicity, it looks to me like it's a call for having separated >>> ioctls for feature negotiation (for example via TUNSETIFF). You may >>> argue that either RSS or HASH_REPORT requires configurations, we can >>> just follow what spec defines or not (e.g what happens if >>> RSS/HASH_REPORT were negotiated but no configurations were set). >> >> Unfortunately TUNSETIFF does not fit in this use case. The flags set >> with TUNSETIFF are fixed, but the guest can request a different feature >> set anytime by resetting the device. > > TUNSETIFF, enables the device to be able to handle RSS/HASH_REPORT. > TUNSETHASH/RSS. dealing with RSS/HASH command from userspace. We also need to be able to disable them at runtime so that we can handle resets. > > This is the way we used to do for multi queue and vnet header. > TUNSETIFF requires CAP_NET_ADMIN, this could be an extra safe guard > for unprivileged userspace. I intend to allow using this feature without privilege. A VMM is usually unprivileged and requiring a privilege to configure tuntap is too prohibitive. > >> >> > >> In the virtio-net specification, it is not defined what would >> happen if >>>> these features are negotiated but the VIRTIO_NET_CTRL_MQ_RSS_CONFIG or >>>> VIRTIO_NET_CTRL_MQ_HASH_CONFIG commands are not sent. There is no such >>>> ambiguity with TUNSETVNETHASH. >>> >>> So I don't see advantages of unifying hash reports and rss into a >>> single ioctl. Let's just follow what has been done in the spec that >>> uses separated commands. Tuntap is not a good place to debate whether >>> those commands could be unified or not. 
We need to move it to the spec >>> but assuming spec has been done, it might be too late or too few >>> advantages for having another design. >> >> It makes sense for the spec to reuse the generic feature negotiation >> mechanism, but the situation is different for tuntap; we cannot use >> TUNSETIFF and need to define another. Then why don't we exploit this >> opportunity to have an interface with well-defined semantics? > > That's perfectly fine, but it needs to be done in virtio-net's uAPI > not tun's. What's more, if you think two commands are not > well-defined, let's fix that in the virtio spec first. > >> The virtio >> spec does its best as an interface between the host and guest and tuntap >> does its best as an UAPI. > > See above, let's fix the uAPI first. We don't want DPDK to use tun's > uAPI for RSS virtio-net's UAPI is for the virtio spec which has a capable generic feature negotiation mechanism. tuntap needs its own feature negotiation and it's nothing to do with virtio-net's UAPI. The structures for two commands have unused or redundant fields and a flexible array in the middle of the structure, but they are ABIs so we can't change it. DPDK is another reason to define tuntap's own UAPIs. They don't care unused or redundant fields and a flexible array in middle that are present in the virtio spec. It will also not want to deal with the requirement of little endian. Constructing struct virtio_net_rss_config is an extra burden for DPDK. On the other hand, Constructing tuntap-specific structures is not that complicated for VMMs. A VMM will need to inspect struct virtio_net_rss_config anyway to handle migration and check its size so it can store the values it inspected to struct tun_vnet_hash and struct tun_vnet_hash_rss and pass them to the kernel. The overall userspace implementation will be simpler by having structures specifically tailored for the communication between the userspace and kernel. 
> >> >> I don't think there is an advantage to split ioctls to follow the spec >> after all. It makes sense if we can pass-through virtio commands to >> tuntap, but it is not possible as ioctl operation codes are different >> from virtio commands. > > I don't see a connection with the operation code. For example, we can > add new uAPIs in virtio-net which could be something like: > > struct virtio_net_rss_config_header { > __le32 hash_types; > __le16 indirection_table_mask; > __le16 unclassified_queue; > __le16 indirection_table[]; > } > > struct virtio_net_rss_config_tailer { > __le16 max_tx_vq; > u8 hash_key_length; > u8 hash_key_data[]; > } > > These two are used by TUNSETVNETRSS. And simply reuse the > virtio_net_hash_config for TUNSETVETHASH. > > With this, we can tweak the virtio-net driver with this new uAPI. Then > tap* can reuse this. I implemented a UAPI and driver change accordingly: https://lore.kernel.org/r/20250318-virtio-v1-0-344caf336ddd@daynix.com This is a nice improvement for the driver, but I still don't think it is suited for the UAPI of tuntap. The requirements of extra fields and little endian cannot be removed from the virtio spec but they are irrelevant for tuntap. > >> The best possibility is to share structures, not >> commands, and I don't think even sharing structures makes sense here >> because of the reasons described above. > > I don't want to share structures, I meant starting from something that > is simple and has been sorted in the virtio spec. Optimization could > be done on top. I meant to reuse the structures in virtio_net.h. 
Regards, Akihiko Odaki > > Thanks > > >> >> Regards, >> Akihiko Odaki >> >>> >>> Thanks >>> >>>> >>>> Regards, >>>> Akihiko Odaki >>>> >>>>> >>>>>> >>>>>> The parameter will be duplicated if we have separate ioctls for RSS and >>>>>> hash reporting, and the kernel will have a chicken-and-egg problem when >>>>>> ensuring they are synchronized; when the ioctl for RSS is issued, should >>>>>> the kernel ensure the "types" parameter is identical with one specified >>>>>> for hash reporting? It will not work if the userspace may decide to >>>>>> configure hash reporting after RSS. >>>>>> >>>>> >>>>> See my reply above. >>>>> >>>>> Thanks >>>>> >>>> >>> >> >
On Tue, Mar 18, 2025 at 6:10 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > > On 2025/03/18 9:15, Jason Wang wrote: > > On Mon, Mar 17, 2025 at 3:07 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >> > >> On 2025/03/17 10:12, Jason Wang wrote: > >>> On Wed, Mar 12, 2025 at 1:03 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>> > >>>> On 2025/03/12 11:35, Jason Wang wrote: > >>>>> On Tue, Mar 11, 2025 at 2:11 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>>>> > >>>>>> On 2025/03/11 9:38, Jason Wang wrote: > >>>>>>> On Mon, Mar 10, 2025 at 3:45 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>>>>>> > >>>>>>>> On 2025/03/10 12:55, Jason Wang wrote: > >>>>>>>>> On Fri, Mar 7, 2025 at 7:01 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>>>>>>>> > >>>>>>>>>> Hash reporting > >>>>>>>>>> ============== > >>>>>>>>>> > >>>>>>>>>> Allow the guest to reuse the hash value to make receive steering > >>>>>>>>>> consistent between the host and guest, and to save hash computation. > >>>>>>>>>> > >>>>>>>>>> RSS > >>>>>>>>>> === > >>>>>>>>>> > >>>>>>>>>> RSS is a receive steering algorithm that can be negotiated to use with > >>>>>>>>>> virtio_net. Conventionally the hash calculation was done by the VMM. > >>>>>>>>>> However, computing the hash after the queue was chosen defeats the > >>>>>>>>>> purpose of RSS. > >>>>>>>>>> > >>>>>>>>>> Another approach is to use eBPF steering program. This approach has > >>>>>>>>>> another downside: it cannot report the calculated hash due to the > >>>>>>>>>> restrictive nature of eBPF steering program. > >>>>>>>>>> > >>>>>>>>>> Introduce the code to perform RSS to the kernel in order to overcome > >>>>>>>>>> thse challenges. 
An alternative solution is to extend the eBPF steering > >>>>>>>>>> program so that it will be able to report to the userspace, but I didn't > >>>>>>>>>> opt for it because extending the current mechanism of eBPF steering > >>>>>>>>>> program as is because it relies on legacy context rewriting, and > >>>>>>>>>> introducing kfunc-based eBPF will result in non-UAPI dependency while > >>>>>>>>>> the other relevant virtualization APIs such as KVM and vhost_net are > >>>>>>>>>> UAPIs. > >>>>>>>>>> > >>>>>>>>>> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com> > >>>>>>>>>> Tested-by: Lei Yang <leiyang@redhat.com> > >>>>>>>>>> --- > >>>>>>>>>> Documentation/networking/tuntap.rst | 7 ++ > >>>>>>>>>> drivers/net/Kconfig | 1 + > >>>>>>>>>> drivers/net/tap.c | 68 ++++++++++++++- > >>>>>>>>>> drivers/net/tun.c | 98 +++++++++++++++++----- > >>>>>>>>>> drivers/net/tun_vnet.h | 159 ++++++++++++++++++++++++++++++++++-- > >>>>>>>>>> include/linux/if_tap.h | 2 + > >>>>>>>>>> include/linux/skbuff.h | 3 + > >>>>>>>>>> include/uapi/linux/if_tun.h | 75 +++++++++++++++++ > >>>>>>>>>> net/core/skbuff.c | 4 + > >>>>>>>>>> 9 files changed, 386 insertions(+), 31 deletions(-) > >>>>>>>>>> > >>>>>>>>>> diff --git a/Documentation/networking/tuntap.rst b/Documentation/networking/tuntap.rst > >>>>>>>>>> index 4d7087f727be5e37dfbf5066a9e9c872cc98898d..86b4ae8caa8ad062c1e558920be42ce0d4217465 100644 > >>>>>>>>>> --- a/Documentation/networking/tuntap.rst > >>>>>>>>>> +++ b/Documentation/networking/tuntap.rst > >>>>>>>>>> @@ -206,6 +206,13 @@ enable is true we enable it, otherwise we disable it:: > >>>>>>>>>> return ioctl(fd, TUNSETQUEUE, (void *)&ifr); > >>>>>>>>>> } > >>>>>>>>>> > >>>>> > >>>>> [...] > >>>>> > >>>>>>>>>> +static inline long tun_vnet_ioctl_sethash(struct tun_vnet_hash_container __rcu **hashp, > >>>>>>>>>> + bool can_rss, void __user *argp) > >>>>>>>>> > >>>>>>>>> So again, can_rss seems to be tricky. 
Looking at its caller, it tires > >>>>>>>>> to make eBPF and RSS mutually exclusive. I still don't understand why > >>>>>>>>> we need this. Allow eBPF program to override some of the path seems to > >>>>>>>>> be common practice. > >>>>>>>>> > >>>>>>>>> What's more, we didn't try (or even can't) to make automq and eBPF to > >>>>>>>>> be mutually exclusive. So I still didn't see what we gain from this > >>>>>>>>> and it complicates the codes and may lead to ambiguous uAPI/behaviour. > >>>>>>>> > >>>>>>>> automq and eBPF are mutually exclusive; automq is disabled when an eBPF > >>>>>>>> steering program is set so I followed the example here. > >>>>>>> > >>>>>>> I meant from the view of uAPI, the kernel doesn't or can't reject eBPF > >>>>>>> while using automq. > >>>>>> > >> > >>>>>>>> We don't even have an interface for eBPF to let it fall back to another > >>>>>>>> alogirhtm. > >>>>>>> > >>>>>>> It doesn't even need this, e.g XDP overrides the default receiving path. > >>>>>>> > >>>>>>>> I could make it fall back to RSS if the eBPF steeering > >>>>>>>> program is designed to fall back to automq when it returns e.g., -1. But > >>>>>>>> such an interface is currently not defined and defining one is out of > >>>>>>>> scope of this patch series. > >>>>>>> > >>>>>>> Just to make sure we are on the same page, I meant we just need to > >>>>>>> make the behaviour consistent: allow eBPF to override the behaviour of > >>>>>>> both automq and rss. > >>>>>> > >>>>>> That assumes eBPF takes precedence over RSS, which is not obvious to me. > >>>>> > >>>>> Well, it's kind of obvious. Not speaking the eBPF selector, we have > >>>>> other eBPF stuffs like skbedit etc. > >>>>> > >>>>>> > >>>>>> Let's add an interface for the eBPF steering program to fall back to > >>>>>> another steering algorithm. 
I said it is out of scope before, but it > >>>>>> makes clear that the eBPF steering program takes precedence over other > >>>>>> algorithms and allows us to delete the code for the configuration > >>>>>> validation in this patch. > >>>>> > >>>>> Fallback is out of scope but it's not what I meant. > >>>>> > >>>>> I meant in the current uAPI take eBPF precedence over automq. It's > >>>>> much more simpler to stick this precedence unless we see obvious > >>>>> advanatge. > >>>> > >>>> We still have three different design options that preserve the current > >>>> precedence: > >>>> > >>>> 1) Precedence order: eBPF -> RSS -> automq > >>>> 2) Precedence order: RSS -> eBPF -> automq > >>>> 3) Precedence order: eBPF OR RSS -> automq where eBPF and RSS are > >>>> mutually exclusive > >>>> > >>>> I think this is a unique situation for this steering program and I could > >>>> not find another example in other eBPF stuffs. > >>> > >>> As described above, queue mapping could be overridden by tc-ebpf. So > >>> there's no way to guarantee the RSS will work: > >>> > >>> https://github.com/DPDK/dpdk/blob/main/drivers/net/tap/bpf/tap_rss.c#L262 > >>> > >>> Making eBPF first leaves a chance for the management layer to override > >>> the choice of Qemu. > >> > >> I referred to the eBPF steering program instead of tc-ebpf. tc-ebpf is > >> nothing to do with the TUNSETSTEERINGEBPF ioctl, which this patch changes. > > > > I meant you can't do "full control" in any case, the point below > > doesn't stand. Queue mapping could be restored even if RSS is set. > > What matters here is how we handle the control when tc didn't take it. > eBPF, RSS, or automq make take all of it; I referred that as "full control". > > > > >> > >>> > >>>> > >>>> The current version implements 3) because it is not obvious whether we > >>>> should choose either 1) or 2). > >>> > >>> But you didn't explain why you choose 3), and it leads to tricky code > >>> (e.g the can_rss stuff etc). 
> >> > >> I wrote: "because it is not obvious whether we should choose either 1) > >> or 2)", but I think I can explain it better: > >> > >> When an eBPF steering program cannot implement a fallback, it means the > >> eBPF steering program requests the full control over the steering. On > >> the other hand, RSS also requests the same control. So these two will > >> conflict and the entity controlling the steering will be undefined when > >> both are enabled. > > > > Well, the fallback is orthogonal to the proposal here. We haven't had > > that since the introduction of the eBPF steering program. This means > > automq has been in "conflict" with eBPF for years. Again, another > > advantage, allowing the eBPF program to be the first to allow the > > management layer to override Qemu's steering. > > What if a VMM uses eBPF steering program and the management layer > decides to override it with RSS? That's possible but I think we're seeking which approach is better. In this case, RSS could be implemented in eBPF but not the reverse. So my point is to start from something that is simpler. Simply allow eBPF on top of RSS as automq. And optimize on top. > > eBPF is obviously predecedent to automq as eBPF is an opt-in feature and > automq is the implicit default. But this logic cannot be applied to > decide the order of eBPF and RSS because they are both opt-in features. This is from the perspective of kernel development. But let's try to think from the userspace: A well written user space knows what it does, rejecting eBPF while RSS is set doesn't help. But anyhow if you stick, it doesn't harm. > > > > >> > >> 3) eliminates the undefined semantics by rejecting to enable both. > > > > This would lead a usersapce noticeable change of the behaviour? And > > what do you mean by "rejecting to enable both"? > > Existing userspace code should see no change as it only cares the case > where RSS is enabled. 
> > Here, rejecting to enable both means to deny setting an eBPF steering > program when RSS is enabled, and visa-versa. > > > > >> An > >> alternative approach is to allow eBPF steering programs to fall back. > >> When both the eBPF program and RSS are enabled, RSS will gain the > >> control of steering under the well-defined situation where the eBPF > >> steering program decides to fall back. > > > > How about just stick the eBPF precedence in this proposal and > > introduce the fallback on top? This helps to speed up the iteration > > (as the version has been iterated to 11). > > I don't think that helps much since we have another ongoing discussion > below and it is not the sole roadblock. > > > > >> > >>> > >>>> But 1) will be the most capable option if > >>>> eBPF has a fall-back feature. > >>>> > >>>>> > >>>>>> > >>>>>>> > >>>>>>>> > >>>>>>>>> > >>>>> > >>>>> [...] > >>>>> > >>>>>>>>> Is there a chance that we can reach here without TUN_VNET_HASH_REPORT? > >>>>>>>>> If yes, it should be a bug. > >>>>>>>> > >>>>>>>> It is possible to use RSS without TUN_VNET_HASH_REPORT. > >>>>>>> > >>>>>>> Another call to separate the ioctls then. > >>>>>> > >>>>>> RSS and hash reporting are not completely independent though. > >>>>> > >>>>> Spec said: > >>>>> > >>>>> """ > >>>>> VIRTIO_NET_F_RSSRequires VIRTIO_NET_F_CTRL_VQ. > >>>>> """ > >>>> > >>>> I meant the features can be enabled independently, but they will share > >>>> the hash type set when they are enabled at the same time. 
> >>> > >>> Looking at the spec: > >>> > >>> Hash repot uses: > >>> > >>> """ > >>> struct virtio_net_hash_config { > >>> le32 hash_types; > >>> le16 reserved[4]; > >>> u8 hash_key_length; > >>> u8 hash_key_data[hash_key_length]; > >>> }; > >>> """ > >>> > >>> RSS uses > >>> > >>> """ > >>> struct rss_rq_id { > >>> le16 vq_index_1_16: 15; /* Bits 1 to 16 of the virtqueue index */ > >>> le16 reserved: 1; /* Set to zero */ > >>> }; > >>> > >>> struct virtio_net_rss_config { > >>> le32 hash_types; > >>> le16 indirection_table_mask; > >>> struct rss_rq_id unclassified_queue; > >>> struct rss_rq_id indirection_table[indirection_table_length]; > >>> le16 max_tx_vq; > >>> u8 hash_key_length; > >>> u8 hash_key_data[hash_key_length]; > >>> }; > >>> """ > >>> > >>> Instead of trying to figure out whether we can share some data > >>> structures, why not simply start from what has been done in the spec? > >>> This would ease the usersapce as well where it can simply do 1:1 > >>> mapping between ctrl vq command and tun uAPI. > >> > >> The spec also defines struct virtio_net_hash_config (which will be used > >> when RSS is disabled) and struct virtio_net_rss_config to match the > >> layout to share some fields. However, the UAPI does not follow the > >> interface design of virtio due to some problems with these structures. > > > > Copy-paste error. The above is copied from the virtio spec, but I > > meant the existing uAPI in virtio_net.h: > > > > /* > > * The command VIRTIO_NET_CTRL_MQ_RSS_CONFIG has the same effect as > > * VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET does and additionally configures > > * the receive steering to use a hash calculated for incoming packet > > * to decide on receive virtqueue to place the packet. The command > > * also provides parameters to calculate a hash and receive virtqueue. 
> > */ > > struct virtio_net_rss_config { > > __le32 hash_types; > > __le16 indirection_table_mask; > > __le16 unclassified_queue; > > __le16 indirection_table[1/* + indirection_table_mask */]; > > __le16 max_tx_vq; > > __u8 hash_key_length; > > __u8 hash_key_data[/* hash_key_length */]; > > }; > > > #define VIRTIO_NET_CTRL_MQ_RSS_CONFIG 1 > > > > /* > > * The command VIRTIO_NET_CTRL_MQ_HASH_CONFIG requests the device > > * to include in the virtio header of the packet the value of the > > * calculated hash and the report type of hash. It also provides > > * parameters for hash calculation. The command requires feature > > * VIRTIO_NET_F_HASH_REPORT to be negotiated to extend the > > * layout of virtio header as defined in virtio_net_hdr_v1_hash. > > */ > > struct virtio_net_hash_config { > > __le32 hash_types; > > /* for compatibility with virtio_net_rss_config */ > > __le16 reserved[4]; > > __u8 hash_key_length; > > __u8 hash_key_data[/* hash_key_length */]; > > }; > > > > This has been used by Qemu but I see a virtio-net version of: > > > > struct virtio_net_ctrl_rss { > > u32 hash_types; > > u16 indirection_table_mask; > > u16 unclassified_queue; > > u16 hash_cfg_reserved; /* for HASH_CONFIG (see > > virtio_net_hash_config for details) */ > > u16 max_tx_vq; > > u8 hash_key_length; > > u8 key[VIRTIO_NET_RSS_MAX_KEY_SIZE]; > > > > u16 *indirection_table; > > }; > > > > This is ugly and results in a tricky code when trying to submit > > RSS/HASH commands to the device: > > > > if (vi->has_rss) { > > sg_buf_size = sizeof(uint16_t) * vi->rss_indir_table_size; > > sg_set_buf(&sgs[1], vi->rss.indirection_table, sg_buf_size); > > } else { > > sg_set_buf(&sgs[1], &vi->rss.hash_cfg_reserved, > > sizeof(uint16_t)); > > } > > The only reference to struct virtio_net_rss_config in QEMU is to derive > the offset of indirection_table. This is because the definition in > virtio_net.h also includes indirection_table in the middle and the > offsets of later part are unusable. 
Yes. > > QEMU internally has a structure named VirtioNetRssData which just looks > like struct virtio_net_ctrl_rss. It's a pity that it doesn't use uAPI. We might need to fix them. > > > > >> > >> Below is the definition of struct virtio_net_hash_config: > >> > >> struct virtio_net_hash_config { > >> le32 hash_types; > >> le16 reserved[4]; > >> u8 hash_key_length; > >> u8 hash_key_data[hash_key_length]; > >> }; > >> > >> Here, hash_types, hash_key_length, and hash_key_data are shared with > >> struct virtio_net_rss_config. > >> > >> One problem is that struct virtio_net_rss_config has a flexible array > >> (indirection_table) between hash_types and hash_key_length. This is > >> something we cannot express with C. > > > > We can split the virtio_net_rss_config to ease the dealing with > > arrays, more below. > > > >> > >> Another problem is that the semantics of the key in struct > >> virtio_net_hash_config is not defined in the spec. > > > > If this is the case. Let's fix that in the spec first to make sure our > > uAPI aligns with spec without ambiguity. It would be a nightmare to > > deal with the in-consistency between virtio spec and Linux uAPIs. > > The userspace doesn't need to do anything to deal with inconsistency > since these fields are unused. > > > > >> > >> To solve these problems, I defined the UAPI structures that do not > >> include indiretion_table. > >> > >>> > >>>> > >>>>> > >>>>>> > >>>>>> A plot twist is the "types" parameter; it is a parameter that is > >>>>>> "common" for RSS and hash reporting. > >>>>> > >>>>> So we can share part of the structure through the uAPI. > >>>> > >>>> Isn't that what this patch does? > >>> > >>> I didn't see, basically I see only one TUNSETVNETHASH that is used to > >>> set both hash report and rss: > >> > >> The UAPI shares struct tun_vnet_hash for both hash report and rss. > > > > I meant sharing structure in two ioctls instead of reusing a specific > > structure for two semantics in one ioctl if possible. 
Though I don't > > think we need any sharing. > > The UAPI implemented in this patch already shares struct tun_vnet_hash > and having two ioctls doesn't change that. > > > > >> > >>> > >>> """ > >>> +/** > >>> + * define TUNSETVNETHASH - ioctl to configure virtio_net hashing > >>> + * > >>> + * The argument is a pointer to &struct tun_vnet_hash. > >>> + * > >>> + * The argument is a pointer to the compound of the following in order if > >>> + * %TUN_VNET_HASH_RSS is set: > >>> + * > >>> + * 1. &struct tun_vnet_hash > >>> + * 2. &struct tun_vnet_hash_rss > >>> + * 3. Indirection table > >>> + * 4. Key > >>> + * > >>> """ > >>> > >>> And it seems to lack parameters like max_tx_vq. > >> > >> max_tx_vq is not relevant with hashing. > > > > It is needed for RSS and we don't have that, no? > > No. RSS is Receive Side Scaling but it's not about receiving. Just to make sure I understand this, max_tx_vq is part of the virtio_net_rss_config, how would Qemu behave when it receives this from guest? """ A driver sets max_tx_vq to inform a device how many transmit virtqueues it may use (transmitq1…transmitq max_tx_vq). """ > > > > >> > >>> > >>> What's more, we've already had virito-net uAPI. Why not simply reusing them? > >> > >> See the above. > >> > >>> > >>>> > >>>>> > >>>>>> RSS and hash reporting must share > >>>>>> this parameter when both are enabled at the same time; otherwise RSS may > >>>>>> compute hash values that are not suited for hash reporting. > >>>>> > >>>>> Is this mandated by the spec? If yes, we can add a check. If not, > >>>>> userspace risk themselves as a mis-configuration which we don't need > >>>>> to bother. > >>>> > >>>> Yes, it is mandated. 5.1.6.4.3 Hash calculation for incoming packets says: > >>>> > A device attempts to calculate a per-packet hash in the following > >>>> > cases: > >>>> > > >>>> > - The feature VIRTIO_NET_F_RSS was negotiated. The device uses the > >>>> > hash to determine the receive virtqueue to place incoming packets. 
> >>>> > - The feature VIRTIO_NET_F_HASH_REPORT was negotiated. The device > >>>> > reports the hash value and the hash type with the packet. > >>>> > > >>>> > If the feature VIRTIO_NET_F_RSS was negotiated: > >>>> > > >>>> > - The device uses hash_types of the virtio_net_rss_config structure > >>>> > as ’Enabled hash types’ bitmask. > >>>> > - The device uses a key as defined in hash_key_data and > >>>> hash_key_length of the virtio_net_rss_config structure (see > >>>> > 5.1.6.5.7.1). > >>>> > > >>>> > If the feature VIRTIO_NET_F_RSS was not negotiated: > >>>> > > >>>> > - The device uses hash_types of the virtio_net_hash_config structure > >>>> > as ’Enabled hash types’ bitmask. > >>>> > - The device uses a key as defined in hash_key_data and > >>>> > hash_key_length of the virtio_net_hash_config structure (see > >>>> > .1.6.5.6.4). > >>>> > >>>> So when both VIRTIO_NET_F_RSS and VIRTIO_NET_F_HASH_REPORT are > >>>> negotiated, virtio_net_rss_config not only controls RSS but also the > >>>> reported hash values and types. They cannot be divergent. > >>>> > >>>>> > >>>>> Note that spec use different commands for hash_report and rss. > >>>> > >>>> TUNSETVNETHASH is different from these commands in terms that it also > >>>> negotiates VIRTIO_NET_F_HASH_REPORT and VIRTIO_NET_F_RSS. > >>>> > >>> > >>> There Are different "issues" here: > >>> > >>> 1) Whether or not we need to use a unified API for negotiating RSS and > >>> HASH_REPORT features > >>> 2) Whether or not we need to sue a unified API for setting RSS and > >>> HASH_REPORT configuration > >>> > >>> What I want to say is point 2. But what you raise is point 1. > >>> > >>> For simplicity, it looks to me like it's a call for having separated > >>> ioctls for feature negotiation (for example via TUNSETIFF). 
You may > >>> argue that either RSS or HASH_REPORT requires configurations, we can > >>> just follow what spec defines or not (e.g what happens if > >>> RSS/HASH_REPORT were negotiated but no configurations were set). > >> > >> Unfortunately TUNSETIFF does not fit in this use case. The flags set > >> with TUNSETIFF are fixed, but the guest can request a different feature > >> set anytime by resetting the device. > > > > TUNSETIFF, enables the device to be able to handle RSS/HASREPORT. > > TUNSETHASH/RSS. dealing with RSS/HASH command from userspace. > > We also needs to be able to disable them at runtime so that we can > handle resets. Via TUNSETHASH/RSS? I think it should have a way to accept parameters that disable RSS or hash report. > > > > > This is the way we used to do for multi queue and vnet header. > > TUNSETIFF requires CAP_NET_ADMIN, this could be an extra safe guard > > for unprivileged userspace. > > I intend to allow using this feature without privilege. A VMM is usually > unprivileged and requiring a privilege to configure tuntap is too > prohibitive. For safety, tun is not allowed to be created by unprivileged users. And it's not to configure the tuntap dynamically, it's about telling the function that tuntap can have (not necessarily enabled though) . > > > > >> > >> > >> In the virtio-net specification, it is not defined what would > >> happen if > >>>> these features are negotiated but the VIRTIO_NET_CTRL_MQ_RSS_CONFIG or > >>>> VIRTIO_NET_CTRL_MQ_HASH_CONFIG commands are not sent. There is no such > >>>> ambiguity with TUNSETVNETHASH. > >>> > >>> So I don't see advantages of unifying hash reports and rss into a > >>> single ioctl. Let's just follow what has been done in the spec that > >>> uses separated commands. Tuntap is not a good place to debate whether > >>> those commands could be unified or not. 
We need to move it to the spec > >>> but assuming spec has been done, it might be too late or too few > >>> advantages for having another design. > >> > >> It makes sense for the spec to reuse the generic feature negotiation > >> mechanism, but the situation is different for tuntap; we cannot use > >> TUNSETIFF and need to define another. Then why don't we exploit this > >> opportunity to have an interface with well-defined semantics? > > > > That's perfectly fine, but it needs to be done in virtio-net's uAPI > > not tun's. What's more, if you think two commands are not > > well-defined, let's fix that in the virtio spec first. > > > >> The virtio > >> spec does its best as an interface between the host and guest and tuntap > >> does its best as an UAPI. > > > > See above, let's fix the uAPI first. We don't want DPDK to use tun's > > uAPI for RSS > > virtio-net's UAPI is for the virtio spec which has a capable generic > feature negotiation mechanism. tuntap needs its own feature negotiation > and it's nothing to do with virtio-net's UAPI. Well, I don't mean the part of the feature negotiation. I mean the part for rss and hash report configuration. > > The structures for two commands have unused or redundant fields and a > flexible array in the middle of the structure, but they are ABIs so we > can't change it. > > DPDK is another reason to define tuntap's own UAPIs. They don't care > unused or redundant fields and a flexible array in middle that are > present in the virtio spec. It will also not want to deal with the > requirement of little endian. Constructing struct virtio_net_rss_config > is an extra burden for DPDK. I meant for vhost-user implementation in DPDK, it needs to use virtio-net uAPI not tuntap's for example. > > On the other hand, Constructing tuntap-specific structures is not that > complicated for VMMs. Not complicated but redundant. 
> A VMM will need to inspect struct > virtio_net_rss_config anyway to handle migration and check its size so > it can store the values it inspected to struct tun_vnet_hash and struct > tun_vnet_hash_rss and pass them to the kernel. I don't see how rss and hash reports differ from what we have now. Those inspections must be done anyhow for compatibility for example the check of offloading features. Such steps could not be eliminated no matter how we design the uAPI. > > The overall userspace implementation will be simpler by having > structures specifically tailored for the communication between the > userspace and kernel. This is exactly how a good uAPI should behave. If uAPI in virtio-net can't do this, I don't understand why uAPI in tuntap can solve it. > > > > >> > >> I don't think there is an advantage to split ioctls to follow the spec > >> after all. It makes sense if we can pass-through virtio commands to > >> tuntap, but it is not possible as ioctl operation codes are different > >> from virtio commands. > > > > I don't see a connection with the operation code. For example, we can > > add new uAPIs in virtio-net which could be something like: > > > > struct virtio_net_rss_config_header { > > __le32 hash_types; > > __le16 indirection_table_mask; > > __le16 unclassified_queue; > > __le16 indirection_table[]; > > } > > > > struct virtio_net_rss_config_tailer { > > __le16 max_tx_vq; > > u8 hash_key_length; > > u8 hash_key_data[]; > > } > > > > These two are used by TUNSETVNETRSS. And simply reuse the > > virtio_net_hash_config for TUNSETVETHASH. > > > With this, we can tweak the virtio-net driver with this new uAPI. Then > > tap* can reuse this. > > I implemented a UAPI and driver change accordingly: > https://lore.kernel.org/r/20250318-virtio-v1-0-344caf336ddd@daynix.com > > This is a nice improvement for the driver, but I still don't think it is > suited for the UAPI of tuntap. Any reason for this? It should work like virtio_net_hdr. 
> The requirements of extra fields and > little endian cannot be removed from the virtio spec but they are > irrelevant for tuntap. I don't understand this part. What fields are "extra" and need to be removed from the spec? > > > > >> The best possibility is to share structures, not > >> commands, and I don't think even sharing structures makes sense here > >> because of the reasons described above. > > > > I don't want to share structures, I meant starting from something that > > is simple and has been sorted in the virtio spec. Optimization could > > be done on top. > > I meant to reuse the structures in virtio_net.h. > > Regards, > Akihiko Odaki Thanks > > > > > Thanks > > > > > >> > >> Regards, > >> Akihiko Odaki > >> > >>> > >>> Thanks > >>> > >>>> > >>>> Regards, > >>>> Akihiko Odaki > >>>> > >>>>> > >>>>>> > >>>>>> The parameter will be duplicated if we have separate ioctls for RSS and > >>>>>> hash reporting, and the kernel will have a chicken-and-egg problem when > >>>>>> ensuring they are synchronized; when the ioctl for RSS is issued, should > >>>>>> the kernel ensure the "types" parameter is identical with one specified > >>>>>> for hash reporting? It will not work if the userspace may decide to > >>>>>> configure hash reporting after RSS. > >>>>>> > >>>>> > >>>>> See my reply above. > >>>>> > >>>>> Thanks > >>>>> > >>>> > >>> > >> > > >
On 2025/03/19 9:58, Jason Wang wrote: > On Tue, Mar 18, 2025 at 6:10 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >> >> On 2025/03/18 9:15, Jason Wang wrote: >>> On Mon, Mar 17, 2025 at 3:07 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>> >>>> On 2025/03/17 10:12, Jason Wang wrote: >>>>> On Wed, Mar 12, 2025 at 1:03 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>>>> >>>>>> On 2025/03/12 11:35, Jason Wang wrote: >>>>>>> On Tue, Mar 11, 2025 at 2:11 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>>>>>> >>>>>>>> On 2025/03/11 9:38, Jason Wang wrote: >>>>>>>>> On Mon, Mar 10, 2025 at 3:45 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>>>>>>>> >>>>>>>>>> On 2025/03/10 12:55, Jason Wang wrote: >>>>>>>>>>> On Fri, Mar 7, 2025 at 7:01 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>>>>>>>>>> >>>>>>>>>>>> Hash reporting >>>>>>>>>>>> ============== >>>>>>>>>>>> >>>>>>>>>>>> Allow the guest to reuse the hash value to make receive steering >>>>>>>>>>>> consistent between the host and guest, and to save hash computation. >>>>>>>>>>>> >>>>>>>>>>>> RSS >>>>>>>>>>>> === >>>>>>>>>>>> >>>>>>>>>>>> RSS is a receive steering algorithm that can be negotiated to use with >>>>>>>>>>>> virtio_net. Conventionally the hash calculation was done by the VMM. >>>>>>>>>>>> However, computing the hash after the queue was chosen defeats the >>>>>>>>>>>> purpose of RSS. >>>>>>>>>>>> >>>>>>>>>>>> Another approach is to use eBPF steering program. This approach has >>>>>>>>>>>> another downside: it cannot report the calculated hash due to the >>>>>>>>>>>> restrictive nature of eBPF steering program. >>>>>>>>>>>> >>>>>>>>>>>> Introduce the code to perform RSS to the kernel in order to overcome >>>>>>>>>>>> thse challenges. 
An alternative solution is to extend the eBPF steering >>>>>>>>>>>> program so that it will be able to report to the userspace, but I didn't >>>>>>>>>>>> opt for it because extending the current mechanism of eBPF steering >>>>>>>>>>>> program as is because it relies on legacy context rewriting, and >>>>>>>>>>>> introducing kfunc-based eBPF will result in non-UAPI dependency while >>>>>>>>>>>> the other relevant virtualization APIs such as KVM and vhost_net are >>>>>>>>>>>> UAPIs. >>>>>>>>>>>> >>>>>>>>>>>> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com> >>>>>>>>>>>> Tested-by: Lei Yang <leiyang@redhat.com> >>>>>>>>>>>> --- >>>>>>>>>>>> Documentation/networking/tuntap.rst | 7 ++ >>>>>>>>>>>> drivers/net/Kconfig | 1 + >>>>>>>>>>>> drivers/net/tap.c | 68 ++++++++++++++- >>>>>>>>>>>> drivers/net/tun.c | 98 +++++++++++++++++----- >>>>>>>>>>>> drivers/net/tun_vnet.h | 159 ++++++++++++++++++++++++++++++++++-- >>>>>>>>>>>> include/linux/if_tap.h | 2 + >>>>>>>>>>>> include/linux/skbuff.h | 3 + >>>>>>>>>>>> include/uapi/linux/if_tun.h | 75 +++++++++++++++++ >>>>>>>>>>>> net/core/skbuff.c | 4 + >>>>>>>>>>>> 9 files changed, 386 insertions(+), 31 deletions(-) >>>>>>>>>>>> >>>>>>>>>>>> diff --git a/Documentation/networking/tuntap.rst b/Documentation/networking/tuntap.rst >>>>>>>>>>>> index 4d7087f727be5e37dfbf5066a9e9c872cc98898d..86b4ae8caa8ad062c1e558920be42ce0d4217465 100644 >>>>>>>>>>>> --- a/Documentation/networking/tuntap.rst >>>>>>>>>>>> +++ b/Documentation/networking/tuntap.rst >>>>>>>>>>>> @@ -206,6 +206,13 @@ enable is true we enable it, otherwise we disable it:: >>>>>>>>>>>> return ioctl(fd, TUNSETQUEUE, (void *)&ifr); >>>>>>>>>>>> } >>>>>>>>>>>> >>>>>>> >>>>>>> [...] >>>>>>> >>>>>>>>>>>> +static inline long tun_vnet_ioctl_sethash(struct tun_vnet_hash_container __rcu **hashp, >>>>>>>>>>>> + bool can_rss, void __user *argp) >>>>>>>>>>> >>>>>>>>>>> So again, can_rss seems to be tricky. 
Looking at its caller, it tries >>>>>>>>>>> to make eBPF and RSS mutually exclusive. I still don't understand why >>>>>>>>>>> we need this. Allow eBPF program to override some of the path seems to >>>>>>>>>>> be common practice. >>>>>>>>>>> >>>>>>>>>>> What's more, we didn't try (or even can't) to make automq and eBPF to >>>>>>>>>>> be mutually exclusive. So I still didn't see what we gain from this >>>>>>>>>>> and it complicates the codes and may lead to ambiguous uAPI/behaviour. >>>>>>>>>> >>>>>>>>>> automq and eBPF are mutually exclusive; automq is disabled when an eBPF >>>>>>>>>> steering program is set so I followed the example here. >>>>>>>>> >>>>>>>>> I meant from the view of uAPI, the kernel doesn't or can't reject eBPF >>>>>>>>> while using automq. >>>>>>>> > >> >>>>>>>>>> We don't even have an interface for eBPF to let it fall back to another >>>>>>>>>> algorithm. >>>>>>>>> >>>>>>>>> It doesn't even need this, e.g XDP overrides the default receiving path. >>>>>>>>> >>>>>>>>>> I could make it fall back to RSS if the eBPF steering >>>>>>>>>> program is designed to fall back to automq when it returns e.g., -1. But >>>>>>>>>> such an interface is currently not defined and defining one is out of >>>>>>>>>> scope of this patch series. >>>>>>>>> >>>>>>>>> Just to make sure we are on the same page, I meant we just need to >>>>>>>>> make the behaviour consistent: allow eBPF to override the behaviour of >>>>>>>>> both automq and rss. >>>>>>>> >>>>>>>> That assumes eBPF takes precedence over RSS, which is not obvious to me. >>>>>>> >>>>>>> Well, it's kind of obvious. Not speaking the eBPF selector, we have >>>>>>> other eBPF stuffs like skbedit etc. >>>>>>> >>>>>>>> >>>>>>>> Let's add an interface for the eBPF steering program to fall back to >>>>>>>> another steering algorithm.
I said it is out of scope before, but it >>>>>>>> makes clear that the eBPF steering program takes precedence over other >>>>>>>> algorithms and allows us to delete the code for the configuration >>>>>>>> validation in this patch. >>>>>>> >>>>>>> Fallback is out of scope but it's not what I meant. >>>>>>> >>>>>>> I meant in the current uAPI take eBPF precedence over automq. It's >>>>>>> much more simpler to stick this precedence unless we see obvious >>>>>>> advanatge. >>>>>> >>>>>> We still have three different design options that preserve the current >>>>>> precedence: >>>>>> >>>>>> 1) Precedence order: eBPF -> RSS -> automq >>>>>> 2) Precedence order: RSS -> eBPF -> automq >>>>>> 3) Precedence order: eBPF OR RSS -> automq where eBPF and RSS are >>>>>> mutually exclusive >>>>>> >>>>>> I think this is a unique situation for this steering program and I could >>>>>> not find another example in other eBPF stuffs. >>>>> >>>>> As described above, queue mapping could be overridden by tc-ebpf. So >>>>> there's no way to guarantee the RSS will work: >>>>> >>>>> https://github.com/DPDK/dpdk/blob/main/drivers/net/tap/bpf/tap_rss.c#L262 >>>>> >>>>> Making eBPF first leaves a chance for the management layer to override >>>>> the choice of Qemu. >>>> >>>> I referred to the eBPF steering program instead of tc-ebpf. tc-ebpf is >>>> nothing to do with the TUNSETSTEERINGEBPF ioctl, which this patch changes. >>> >>> I meant you can't do "full control" in any case, the point below >>> doesn't stand. Queue mapping could be restored even if RSS is set. >> >> What matters here is how we handle the control when tc didn't take it. >> eBPF, RSS, or automq make take all of it; I referred that as "full control". >> >>> >>>> >>>>> >>>>>> >>>>>> The current version implements 3) because it is not obvious whether we >>>>>> should choose either 1) or 2). >>>>> >>>>> But you didn't explain why you choose 3), and it leads to tricky code >>>>> (e.g the can_rss stuff etc). 
>>>> >>>> I wrote: "because it is not obvious whether we should choose either 1) >>>> or 2)", but I think I can explain it better: >>>> >>>> When an eBPF steering program cannot implement a fallback, it means the >>>> eBPF steering program requests the full control over the steering. On >>>> the other hand, RSS also requests the same control. So these two will >>>> conflict and the entity controlling the steering will be undefined when >>>> both are enabled. >>> >>> Well, the fallback is orthogonal to the proposal here. We haven't had >>> that since the introduction of the eBPF steering program. This means >>> automq has been in "conflict" with eBPF for years. Again, another >>> advantage, allowing the eBPF program to be the first to allow the >>> management layer to override Qemu's steering. >> >> What if a VMM uses eBPF steering program and the management layer >> decides to override it with RSS? > > That's possible but I think we're seeking which approach is better. In > this case, RSS could be implemented in eBPF but not the reverse. > > So my point is to start from something that is simpler. Simply allow > eBPF on top of RSS as automq. And optimize on top. The in-kernel RSS implementation is more optimized and capable of hash reporting. I don't think either eBPF steering program or in-kernel RSS is more capable than the other and there is a reason to place eBPF on top of RSS. > >> >> eBPF is obviously predecedent to automq as eBPF is an opt-in feature and >> automq is the implicit default. But this logic cannot be applied to >> decide the order of eBPF and RSS because they are both opt-in features. > > This is from the perspective of kernel development. But let's try to > think from the userspace: A well written user space knows what it > does, rejecting eBPF while RSS is set doesn't help. But anyhow if you > stick, it doesn't harm. 
Yes, it is not for the current userspace but for the future kernel development; the kernel can reserve the freedom to decide the priority of eBPF and RSS by rejecting eBPF while RSS is enabled. > >> >>> >>>> >>>> 3) eliminates the undefined semantics by rejecting to enable both. >>> >>> This would lead a userspace noticeable change of the behaviour? And >>> what do you mean by "rejecting to enable both"? >> >> Existing userspace code should see no change as it only cares the case >> where RSS is enabled. >> >> Here, rejecting to enable both means to deny setting an eBPF steering >> program when RSS is enabled, and vice versa. >> >>> >>>> An >>>> alternative approach is to allow eBPF steering programs to fall back. >>>> When both the eBPF program and RSS are enabled, RSS will gain the >>>> control of steering under the well-defined situation where the eBPF >>>> steering program decides to fall back. >>> >>> How about just stick the eBPF precedence in this proposal and >>> introduce the fallback on top? This helps to speed up the iteration >>> (as the version has been iterated to 11). >> >> I don't think that helps much since we have another ongoing discussion >> below and it is not the sole roadblock. >> >>> >>>> >>>>> >>>>>> But 1) will be the most capable option if >>>>>> eBPF has a fall-back feature. >>>>>> >>>>>>> >>>>>>>> >>>>>>>>> >>>>>>>>>> >>>>>>>>>>> >>>>>>> >>>>>>> [...] >>>>>>> >>>>>>>>>>> Is there a chance that we can reach here without TUN_VNET_HASH_REPORT? >>>>>>>>>>> If yes, it should be a bug. >>>>>>>>>> >>>>>>>>>> It is possible to use RSS without TUN_VNET_HASH_REPORT. >>>>>>>>> >>>>>>>>> Another call to separate the ioctls then. >>>>>>>> >>>>>>>> RSS and hash reporting are not completely independent though. >>>>>>> >>>>>>> Spec said: >>>>>>> >>>>>>> """ >>>>>>> VIRTIO_NET_F_RSS Requires VIRTIO_NET_F_CTRL_VQ.
>>>>>>> """ >>>>>> >>>>>> I meant the features can be enabled independently, but they will share >>>>>> the hash type set when they are enabled at the same time. >>>>> >>>>> Looking at the spec: >>>>> >>>>> Hash repot uses: >>>>> >>>>> """ >>>>> struct virtio_net_hash_config { >>>>> le32 hash_types; >>>>> le16 reserved[4]; >>>>> u8 hash_key_length; >>>>> u8 hash_key_data[hash_key_length]; >>>>> }; >>>>> """ >>>>> >>>>> RSS uses >>>>> >>>>> """ >>>>> struct rss_rq_id { >>>>> le16 vq_index_1_16: 15; /* Bits 1 to 16 of the virtqueue index */ >>>>> le16 reserved: 1; /* Set to zero */ >>>>> }; >>>>> >>>>> struct virtio_net_rss_config { >>>>> le32 hash_types; >>>>> le16 indirection_table_mask; >>>>> struct rss_rq_id unclassified_queue; >>>>> struct rss_rq_id indirection_table[indirection_table_length]; >>>>> le16 max_tx_vq; >>>>> u8 hash_key_length; >>>>> u8 hash_key_data[hash_key_length]; >>>>> }; >>>>> """ >>>>> >>>>> Instead of trying to figure out whether we can share some data >>>>> structures, why not simply start from what has been done in the spec? >>>>> This would ease the usersapce as well where it can simply do 1:1 >>>>> mapping between ctrl vq command and tun uAPI. >>>> >>>> The spec also defines struct virtio_net_hash_config (which will be used >>>> when RSS is disabled) and struct virtio_net_rss_config to match the >>>> layout to share some fields. However, the UAPI does not follow the >>>> interface design of virtio due to some problems with these structures. >>> >>> Copy-paste error. The above is copied from the virtio spec, but I >>> meant the existing uAPI in virtio_net.h: >>> >>> /* >>> * The command VIRTIO_NET_CTRL_MQ_RSS_CONFIG has the same effect as >>> * VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET does and additionally configures >>> * the receive steering to use a hash calculated for incoming packet >>> * to decide on receive virtqueue to place the packet. The command >>> * also provides parameters to calculate a hash and receive virtqueue. 
>>> */ >>> struct virtio_net_rss_config { >>> __le32 hash_types; >>> __le16 indirection_table_mask; >>> __le16 unclassified_queue; >>> __le16 indirection_table[1/* + indirection_table_mask */]; >>> __le16 max_tx_vq; >>> __u8 hash_key_length; >>> __u8 hash_key_data[/* hash_key_length */]; >>> }; >>>> #define VIRTIO_NET_CTRL_MQ_RSS_CONFIG 1 >>> >>> /* >>> * The command VIRTIO_NET_CTRL_MQ_HASH_CONFIG requests the device >>> * to include in the virtio header of the packet the value of the >>> * calculated hash and the report type of hash. It also provides >>> * parameters for hash calculation. The command requires feature >>> * VIRTIO_NET_F_HASH_REPORT to be negotiated to extend the >>> * layout of virtio header as defined in virtio_net_hdr_v1_hash. >>> */ >>> struct virtio_net_hash_config { >>> __le32 hash_types; >>> /* for compatibility with virtio_net_rss_config */ >>> __le16 reserved[4]; >>> __u8 hash_key_length; >>> __u8 hash_key_data[/* hash_key_length */]; >>> }; >>> >>> This has been used by Qemu but I see a virtio-net version of: >>> >>> struct virtio_net_ctrl_rss { >>> u32 hash_types; >>> u16 indirection_table_mask; >>> u16 unclassified_queue; >>> u16 hash_cfg_reserved; /* for HASH_CONFIG (see >>> virtio_net_hash_config for details) */ >>> u16 max_tx_vq; >>> u8 hash_key_length; >>> u8 key[VIRTIO_NET_RSS_MAX_KEY_SIZE]; >>> >>> u16 *indirection_table; >>> }; >>> >>> This is ugly and results in a tricky code when trying to submit >>> RSS/HASH commands to the device: >>> >>> if (vi->has_rss) { >>> sg_buf_size = sizeof(uint16_t) * vi->rss_indir_table_size; >>> sg_set_buf(&sgs[1], vi->rss.indirection_table, sg_buf_size); >>> } else { >>> sg_set_buf(&sgs[1], &vi->rss.hash_cfg_reserved, >>> sizeof(uint16_t)); >>> } >> >> The only reference to struct virtio_net_rss_config in QEMU is to derive >> the offset of indirection_table. This is because the definition in >> virtio_net.h also includes indirection_table in the middle and the >> offsets of later part are unusable. 
> > Yes. > >> >> QEMU internally has a structure named VirtioNetRssData which just looks >> like struct virtio_net_ctrl_rss. > > It's a pity that it doesn't use uAPI. We might need to fix them. It doesn't want to use the UAPI structures for the internal storage because it wants to store them in native endians and QEMU is not interested in some fields in the UAPI structures. struct tun_vnet_hash and struct tun_vnet_hash_rss are easy to fill using VirtioNetRssData. > >> >>> >>>> >>>> Below is the definition of struct virtio_net_hash_config: >>>> >>>> struct virtio_net_hash_config { >>>> le32 hash_types; >>>> le16 reserved[4]; >>>> u8 hash_key_length; >>>> u8 hash_key_data[hash_key_length]; >>>> }; >>>> >>>> Here, hash_types, hash_key_length, and hash_key_data are shared with >>>> struct virtio_net_rss_config. >>>> >>>> One problem is that struct virtio_net_rss_config has a flexible array >>>> (indirection_table) between hash_types and hash_key_length. This is >>>> something we cannot express with C. >>> >>> We can split the virtio_net_rss_config to ease the dealing with >>> arrays, more below. >>> >>>> >>>> Another problem is that the semantics of the key in struct >>>> virtio_net_hash_config is not defined in the spec. >>> >>> If this is the case. Let's fix that in the spec first to make sure our >>> uAPI aligns with spec without ambiguity. It would be a nightmare to >>> deal with the in-consistency between virtio spec and Linux uAPIs. >> >> The userspace doesn't need to do anything to deal with inconsistency >> since these fields are unused. >> >>> >>>> >>>> To solve these problems, I defined the UAPI structures that do not >>>> include indiretion_table. >>>> >>>>> >>>>>> >>>>>>> >>>>>>>> >>>>>>>> A plot twist is the "types" parameter; it is a parameter that is >>>>>>>> "common" for RSS and hash reporting. >>>>>>> >>>>>>> So we can share part of the structure through the uAPI. >>>>>> >>>>>> Isn't that what this patch does? 
>>>>> >>>>> I didn't see, basically I see only one TUNSETVNETHASH that is used to >>>>> set both hash report and rss: >>>> >>>> The UAPI shares struct tun_vnet_hash for both hash report and rss. >>> >>> I meant sharing structure in two ioctls instead of reusing a specific >>> structure for two semantics in one ioctl if possible. Though I don't >>> think we need any sharing. >> >> The UAPI implemented in this patch already shares struct tun_vnet_hash >> and having two ioctls doesn't change that. >> >>> >>>> >>>>> >>>>> """ >>>>> +/** >>>>> + * define TUNSETVNETHASH - ioctl to configure virtio_net hashing >>>>> + * >>>>> + * The argument is a pointer to &struct tun_vnet_hash. >>>>> + * >>>>> + * The argument is a pointer to the compound of the following in order if >>>>> + * %TUN_VNET_HASH_RSS is set: >>>>> + * >>>>> + * 1. &struct tun_vnet_hash >>>>> + * 2. &struct tun_vnet_hash_rss >>>>> + * 3. Indirection table >>>>> + * 4. Key >>>>> + * >>>>> """ >>>>> >>>>> And it seems to lack parameters like max_tx_vq. >>>> >>>> max_tx_vq is not relevant with hashing. >>> >>> It is needed for RSS and we don't have that, no? >> >> No. RSS is Receive Side Scaling but it's not about receiving. > > Just to make sure I understand this, max_tx_vq is part of the > virtio_net_rss_config, how would Qemu behave when it receives this > from guest? > > """ > A driver sets max_tx_vq to inform a device how many transmit > virtqueues it may use (transmitq1…transmitq max_tx_vq). > """ It does nothing. > >> >>> >>>> >>>>> >>>>> What's more, we've already had virito-net uAPI. Why not simply reusing them? >>>> >>>> See the above. >>>> >>>>> >>>>>> >>>>>>> >>>>>>>> RSS and hash reporting must share >>>>>>>> this parameter when both are enabled at the same time; otherwise RSS may >>>>>>>> compute hash values that are not suited for hash reporting. >>>>>>> >>>>>>> Is this mandated by the spec? If yes, we can add a check. 
If not, >>>>>>> userspace risk themselves as a mis-configuration which we don't need >>>>>>> to bother. >>>>>> >>>>>> Yes, it is mandated. 5.1.6.4.3 Hash calculation for incoming packets says: >>>>>> > A device attempts to calculate a per-packet hash in the following >>>>>> > cases: >>>>>> > >>>>>> > - The feature VIRTIO_NET_F_RSS was negotiated. The device uses the >>>>>> > hash to determine the receive virtqueue to place incoming packets. >>>>>> > - The feature VIRTIO_NET_F_HASH_REPORT was negotiated. The device >>>>>> > reports the hash value and the hash type with the packet. >>>>>> > >>>>>> > If the feature VIRTIO_NET_F_RSS was negotiated: >>>>>> > >>>>>> > - The device uses hash_types of the virtio_net_rss_config structure >>>>>> > as ’Enabled hash types’ bitmask. >>>>>> > - The device uses a key as defined in hash_key_data and >>>>>> hash_key_length of the virtio_net_rss_config structure (see >>>>>> > 5.1.6.5.7.1). >>>>>> > >>>>>> > If the feature VIRTIO_NET_F_RSS was not negotiated: >>>>>> > >>>>>> > - The device uses hash_types of the virtio_net_hash_config structure >>>>>> > as ’Enabled hash types’ bitmask. >>>>>> > - The device uses a key as defined in hash_key_data and >>>>>> > hash_key_length of the virtio_net_hash_config structure (see >>>>>> > .1.6.5.6.4). >>>>>> >>>>>> So when both VIRTIO_NET_F_RSS and VIRTIO_NET_F_HASH_REPORT are >>>>>> negotiated, virtio_net_rss_config not only controls RSS but also the >>>>>> reported hash values and types. They cannot be divergent. >>>>>> >>>>>>> >>>>>>> Note that spec use different commands for hash_report and rss. >>>>>> >>>>>> TUNSETVNETHASH is different from these commands in terms that it also >>>>>> negotiates VIRTIO_NET_F_HASH_REPORT and VIRTIO_NET_F_RSS. 
>>>>>> >>>>> >>>>> There are different "issues" here: >>>>> >>>>> 1) Whether or not we need to use a unified API for negotiating RSS and >>>>> HASH_REPORT features >>>>> 2) Whether or not we need to use a unified API for setting RSS and >>>>> HASH_REPORT configuration >>>>> >>>>> What I want to say is point 2. But what you raise is point 1. >>>>> >>>>> For simplicity, it looks to me like it's a call for having separated >>>>> ioctls for feature negotiation (for example via TUNSETIFF). You may >>>>> argue that either RSS or HASH_REPORT requires configurations, we can >>>>> just follow what spec defines or not (e.g what happens if >>>>> RSS/HASH_REPORT were negotiated but no configurations were set). >>>> >>>> Unfortunately TUNSETIFF does not fit in this use case. The flags set >>>> with TUNSETIFF are fixed, but the guest can request a different feature >>>> set anytime by resetting the device. >>> >>> TUNSETIFF, enables the device to be able to handle RSS/HASH_REPORT. >>> TUNSETHASH/RSS. dealing with RSS/HASH command from userspace. >> >> We also need to be able to disable them at runtime so that we can >> handle resets. > > Via TUNSETHASH/RSS? I think it should have a way to accept parameters > that disable RSS or hash report. That's what this patch implements. TUNSETVNETHASH accepts parameters to choose what features to be enabled. > >> >>> >>> This is the way we used to do for multi queue and vnet header. >>> TUNSETIFF requires CAP_NET_ADMIN, this could be an extra safe guard >>> for unprivileged userspace. >> >> I intend to allow using this feature without privilege. A VMM is usually >> unprivileged and requiring a privilege to configure tuntap is too >> prohibitive. > > For safety, tun is not allowed to be created by unprivileged users. > And it's not to configure the tuntap dynamically, it's about telling > the function that tuntap can have (not necessarily enabled though) . I don't think we need another barrier for the new functions.
Once an unprivileged user gets a file descriptor of tuntap from a privileged user, they are free to enable RSS and/or hash reporting. > >> >>> >>>> >>>> > >> In the virtio-net specification, it is not defined what would >>>> happen if >>>>>> these features are negotiated but the VIRTIO_NET_CTRL_MQ_RSS_CONFIG or >>>>>> VIRTIO_NET_CTRL_MQ_HASH_CONFIG commands are not sent. There is no such >>>>>> ambiguity with TUNSETVNETHASH. >>>>> >>>>> So I don't see advantages of unifying hash reports and rss into a >>>>> single ioctl. Let's just follow what has been done in the spec that >>>>> uses separated commands. Tuntap is not a good place to debate whether >>>>> those commands could be unified or not. We need to move it to the spec >>>>> but assuming spec has been done, it might be too late or too few >>>>> advantages for having another design. >>>> >>>> It makes sense for the spec to reuse the generic feature negotiation >>>> mechanism, but the situation is different for tuntap; we cannot use >>>> TUNSETIFF and need to define another. Then why don't we exploit this >>>> opportunity to have an interface with well-defined semantics? >>> >>> That's perfectly fine, but it needs to be done in virtio-net's uAPI >>> not tun's. What's more, if you think two commands are not >>> well-defined, let's fix that in the virtio spec first. >>> >>>> The virtio >>>> spec does its best as an interface between the host and guest and tuntap >>>> does its best as an UAPI. >>> >>> See above, let's fix the uAPI first. We don't want DPDK to use tun's >>> uAPI for RSS >> >> virtio-net's UAPI is for the virtio spec which has a capable generic >> feature negotiation mechanism. tuntap needs its own feature negotiation >> and it's nothing to do with virtio-net's UAPI. > > Well, I don't mean the part of the feature negotiation. I mean the > part for rss and hash report configuration. The feature negotiation still matters when deciding the granularity of ioctls.
We need one ioctl for a feature negotiation, and to avoid having an intermediate state, the ioctl should also do the configuration. Hence that one ioctl should do all of the feature negotiation and configuration. > >> >> The structures for two commands have unused or redundant fields and a >> flexible array in the middle of the structure, but they are ABIs so we >> can't change it. >> >> DPDK is another reason to define tuntap's own UAPIs. They don't care >> unused or redundant fields and a flexible array in middle that are >> present in the virtio spec. It will also not want to deal with the >> requirement of little endian. Constructing struct virtio_net_rss_config >> is an extra burden for DPDK. > > I meant for vhost-user implementation in DPDK, it needs to use > virtio-net uAPI not tuntap's for example. The vhost-user implementation will use tuntap's UAPIs for its ethernet device backend. It uses the generic interface of ethernet device so for RSS it will use functions like rte_eth_dev_rss_hash_update() for example. tuntap's UAPIs are more suited to implement these interfaces as they operate in native endian and don't have extra fields. DPDK applications other than vhost-user also matter; they do not care what virtio does at all. > >> >> On the other hand, Constructing tuntap-specific structures is not that >> complicated for VMMs. > > Not complicated but redundant. > >> A VMM will need to inspect struct >> virtio_net_rss_config anyway to handle migration and check its size so >> it can store the values it inspected to struct tun_vnet_hash and struct >> tun_vnet_hash_rss and pass them to the kernel. > > I don't see how rss and hash reports differ from what we have now. > Those inspections must be done anyhow for compatibility for example > the check of offloading features. Such steps could not be eliminated > no matter how we design the uAPI. I explained the difference between the virtio and tuntap UAPIs, not between RSS and hash reporting.
> >> >> The overall userspace implementation will be simpler by having >> structures specifically tailored for the communication between the >> userspace and kernel. > > This is exactly how a good uAPI should behave. If uAPI in virtio-net > can't do this, I don't understand why uAPI in tuntap can solve it. The UAPI in virtio-net cannot do it because it's already fixed and it also needs to perform endian conversion for the VM use case. tuntap doesn't have these restrictions. > >> >>> >>>> >>>> I don't think there is an advantage to split ioctls to follow the spec >>>> after all. It makes sense if we can pass-through virtio commands to >>>> tuntap, but it is not possible as ioctl operation codes are different >>>> from virtio commands. >>> >>> I don't see a connection with the operation code. For example, we can >>> add new uAPIs in virtio-net which could be something like: >>> >>> struct virtio_net_rss_config_header { >>> __le32 hash_types; >>> __le16 indirection_table_mask; >>> __le16 unclassified_queue; >>> __le16 indirection_table[]; >>> } >>> >>> struct virtio_net_rss_config_tailer { >>> __le16 max_tx_vq; >>> u8 hash_key_length; >>> u8 hash_key_data[]; >>> } >>> >>> These two are used by TUNSETVNETRSS. And simply reuse the >>> virtio_net_hash_config for TUNSETVETHASH. >> > > With this, we can tweak the virtio-net driver with this new uAPI. Then >>> tap* can reuse this. >> >> I implemented a UAPI and driver change accordingly: >> https://lore.kernel.org/r/20250318-virtio-v1-0-344caf336ddd@daynix.com >> >> This is a nice improvement for the driver, but I still don't think it is >> suited for the UAPI of tuntap. > > Any reason for this? It should work like virtio_net_hdr. > >> The requirements of extra fields and >> little endian cannot be removed from the virtio spec but they are >> irrelevant for tuntap. > > I don't understand this part. What fields are "extra" and need to be > removed from the spec? 
All fields not included in struct tun_vnet_hash and struct tun_vnet_hash_rss. Namely, for struct virtio_net_hash_config: - reserved - hash_key_length - hash_key_data For struct virtio_net_rss_config: - max_tx_vq - hash_key_length Regards, Akihiko Odaki > >> >>> >>>> The best possibility is to share structures, not >>>> commands, and I don't think even sharing structures makes sense here >>>> because of the reasons described above. >>> >>> I don't want to share structures, I meant starting from something that >>> is simple and has been sorted in the virtio spec. Optimization could >>> be done on top. >> >> I meant to reuse the structures in virtio_net.h. >> >> Regards, >> Akihiko Odaki > > Thanks > >> >>> >>> Thanks >>> >>> >>>> >>>> Regards, >>>> Akihiko Odaki >>>> >>>>> >>>>> Thanks >>>>> >>>>>> >>>>>> Regards, >>>>>> Akihiko Odaki >>>>>> >>>>>>> >>>>>>>> >>>>>>>> The parameter will be duplicated if we have separate ioctls for RSS and >>>>>>>> hash reporting, and the kernel will have a chicken-egg problem when >>>>>>>> ensuring they are synchronized; when the ioctl for RSS is issued, should >>>>>>>> the kernel ensure the "types" parameter is identical with one specified >>>>>>>> for hash reporting? It will not work if the userspace may decide to >>>>>>>> configure hash reporting after RSS. >>>>>>>> >>>>>>> >>>>>>> See my reply above. >>>>>>> >>>>>>> Thanks >>>>>>> >>>>>> >>>>> >>>> >>> >> >
On Wed, Mar 19, 2025 at 1:29 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > > On 2025/03/19 9:58, Jason Wang wrote: > > On Tue, Mar 18, 2025 at 6:10 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >> > >> On 2025/03/18 9:15, Jason Wang wrote: > >>> On Mon, Mar 17, 2025 at 3:07 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>> > >>>> On 2025/03/17 10:12, Jason Wang wrote: > >>>>> On Wed, Mar 12, 2025 at 1:03 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>>>> > >>>>>> On 2025/03/12 11:35, Jason Wang wrote: > >>>>>>> On Tue, Mar 11, 2025 at 2:11 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>>>>>> > >>>>>>>> On 2025/03/11 9:38, Jason Wang wrote: > >>>>>>>>> On Mon, Mar 10, 2025 at 3:45 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>>>>>>>> > >>>>>>>>>> On 2025/03/10 12:55, Jason Wang wrote: > >>>>>>>>>>> On Fri, Mar 7, 2025 at 7:01 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>>>>>>>>>> > >>>>>>>>>>>> Hash reporting > >>>>>>>>>>>> ============== > >>>>>>>>>>>> > >>>>>>>>>>>> Allow the guest to reuse the hash value to make receive steering > >>>>>>>>>>>> consistent between the host and guest, and to save hash computation. > >>>>>>>>>>>> > >>>>>>>>>>>> RSS > >>>>>>>>>>>> === > >>>>>>>>>>>> > >>>>>>>>>>>> RSS is a receive steering algorithm that can be negotiated to use with > >>>>>>>>>>>> virtio_net. Conventionally the hash calculation was done by the VMM. > >>>>>>>>>>>> However, computing the hash after the queue was chosen defeats the > >>>>>>>>>>>> purpose of RSS. > >>>>>>>>>>>> > >>>>>>>>>>>> Another approach is to use eBPF steering program. This approach has > >>>>>>>>>>>> another downside: it cannot report the calculated hash due to the > >>>>>>>>>>>> restrictive nature of eBPF steering program. > >>>>>>>>>>>> > >>>>>>>>>>>> Introduce the code to perform RSS to the kernel in order to overcome > >>>>>>>>>>>> thse challenges. 
An alternative solution is to extend the eBPF steering > >>>>>>>>>>>> program so that it will be able to report to the userspace, but I didn't > >>>>>>>>>>>> opt for it because extending the current mechanism of eBPF steering > >>>>>>>>>>>> program as is because it relies on legacy context rewriting, and > >>>>>>>>>>>> introducing kfunc-based eBPF will result in non-UAPI dependency while > >>>>>>>>>>>> the other relevant virtualization APIs such as KVM and vhost_net are > >>>>>>>>>>>> UAPIs. > >>>>>>>>>>>> > >>>>>>>>>>>> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com> > >>>>>>>>>>>> Tested-by: Lei Yang <leiyang@redhat.com> > >>>>>>>>>>>> --- > >>>>>>>>>>>> Documentation/networking/tuntap.rst | 7 ++ > >>>>>>>>>>>> drivers/net/Kconfig | 1 + > >>>>>>>>>>>> drivers/net/tap.c | 68 ++++++++++++++- > >>>>>>>>>>>> drivers/net/tun.c | 98 +++++++++++++++++----- > >>>>>>>>>>>> drivers/net/tun_vnet.h | 159 ++++++++++++++++++++++++++++++++++-- > >>>>>>>>>>>> include/linux/if_tap.h | 2 + > >>>>>>>>>>>> include/linux/skbuff.h | 3 + > >>>>>>>>>>>> include/uapi/linux/if_tun.h | 75 +++++++++++++++++ > >>>>>>>>>>>> net/core/skbuff.c | 4 + > >>>>>>>>>>>> 9 files changed, 386 insertions(+), 31 deletions(-) > >>>>>>>>>>>> > >>>>>>>>>>>> diff --git a/Documentation/networking/tuntap.rst b/Documentation/networking/tuntap.rst > >>>>>>>>>>>> index 4d7087f727be5e37dfbf5066a9e9c872cc98898d..86b4ae8caa8ad062c1e558920be42ce0d4217465 100644 > >>>>>>>>>>>> --- a/Documentation/networking/tuntap.rst > >>>>>>>>>>>> +++ b/Documentation/networking/tuntap.rst > >>>>>>>>>>>> @@ -206,6 +206,13 @@ enable is true we enable it, otherwise we disable it:: > >>>>>>>>>>>> return ioctl(fd, TUNSETQUEUE, (void *)&ifr); > >>>>>>>>>>>> } > >>>>>>>>>>>> > >>>>>>> > >>>>>>> [...] > >>>>>>> > >>>>>>>>>>>> +static inline long tun_vnet_ioctl_sethash(struct tun_vnet_hash_container __rcu **hashp, > >>>>>>>>>>>> + bool can_rss, void __user *argp) > >>>>>>>>>>> > >>>>>>>>>>> So again, can_rss seems to be tricky. 
Looking at its caller, it tires > >>>>>>>>>>> to make eBPF and RSS mutually exclusive. I still don't understand why > >>>>>>>>>>> we need this. Allow eBPF program to override some of the path seems to > >>>>>>>>>>> be common practice. > >>>>>>>>>>> > >>>>>>>>>>> What's more, we didn't try (or even can't) to make automq and eBPF to > >>>>>>>>>>> be mutually exclusive. So I still didn't see what we gain from this > >>>>>>>>>>> and it complicates the codes and may lead to ambiguous uAPI/behaviour. > >>>>>>>>>> > >>>>>>>>>> automq and eBPF are mutually exclusive; automq is disabled when an eBPF > >>>>>>>>>> steering program is set so I followed the example here. > >>>>>>>>> > >>>>>>>>> I meant from the view of uAPI, the kernel doesn't or can't reject eBPF > >>>>>>>>> while using automq. > >>>>>>>> > >> > >>>>>>>>>> We don't even have an interface for eBPF to let it fall back to another > >>>>>>>>>> alogirhtm. > >>>>>>>>> > >>>>>>>>> It doesn't even need this, e.g XDP overrides the default receiving path. > >>>>>>>>> > >>>>>>>>>> I could make it fall back to RSS if the eBPF steeering > >>>>>>>>>> program is designed to fall back to automq when it returns e.g., -1. But > >>>>>>>>>> such an interface is currently not defined and defining one is out of > >>>>>>>>>> scope of this patch series. > >>>>>>>>> > >>>>>>>>> Just to make sure we are on the same page, I meant we just need to > >>>>>>>>> make the behaviour consistent: allow eBPF to override the behaviour of > >>>>>>>>> both automq and rss. > >>>>>>>> > >>>>>>>> That assumes eBPF takes precedence over RSS, which is not obvious to me. > >>>>>>> > >>>>>>> Well, it's kind of obvious. Not speaking the eBPF selector, we have > >>>>>>> other eBPF stuffs like skbedit etc. > >>>>>>> > >>>>>>>> > >>>>>>>> Let's add an interface for the eBPF steering program to fall back to > >>>>>>>> another steering algorithm. 
I said it is out of scope before, but it > >>>>>>>> makes clear that the eBPF steering program takes precedence over other > >>>>>>>> algorithms and allows us to delete the code for the configuration > >>>>>>>> validation in this patch. > >>>>>>> > >>>>>>> Fallback is out of scope but it's not what I meant. > >>>>>>> > >>>>>>> I meant in the current uAPI take eBPF precedence over automq. It's > >>>>>>> much more simpler to stick this precedence unless we see obvious > >>>>>>> advanatge. > >>>>>> > >>>>>> We still have three different design options that preserve the current > >>>>>> precedence: > >>>>>> > >>>>>> 1) Precedence order: eBPF -> RSS -> automq > >>>>>> 2) Precedence order: RSS -> eBPF -> automq > >>>>>> 3) Precedence order: eBPF OR RSS -> automq where eBPF and RSS are > >>>>>> mutually exclusive > >>>>>> > >>>>>> I think this is a unique situation for this steering program and I could > >>>>>> not find another example in other eBPF stuffs. > >>>>> > >>>>> As described above, queue mapping could be overridden by tc-ebpf. So > >>>>> there's no way to guarantee the RSS will work: > >>>>> > >>>>> https://github.com/DPDK/dpdk/blob/main/drivers/net/tap/bpf/tap_rss.c#L262 > >>>>> > >>>>> Making eBPF first leaves a chance for the management layer to override > >>>>> the choice of Qemu. > >>>> > >>>> I referred to the eBPF steering program instead of tc-ebpf. tc-ebpf is > >>>> nothing to do with the TUNSETSTEERINGEBPF ioctl, which this patch changes. > >>> > >>> I meant you can't do "full control" in any case, the point below > >>> doesn't stand. Queue mapping could be restored even if RSS is set. > >> > >> What matters here is how we handle the control when tc didn't take it. > >> eBPF, RSS, or automq make take all of it; I referred that as "full control". > >> > >>> > >>>> > >>>>> > >>>>>> > >>>>>> The current version implements 3) because it is not obvious whether we > >>>>>> should choose either 1) or 2). 
> >>>>> > >>>>> But you didn't explain why you choose 3), and it leads to tricky code > >>>>> (e.g the can_rss stuff etc). > >>>> > >>>> I wrote: "because it is not obvious whether we should choose either 1) > >>>> or 2)", but I think I can explain it better: > >>>> > >>>> When an eBPF steering program cannot implement a fallback, it means the > >>>> eBPF steering program requests the full control over the steering. On > >>>> the other hand, RSS also requests the same control. So these two will > >>>> conflict and the entity controlling the steering will be undefined when > >>>> both are enabled. > >>> > >>> Well, the fallback is orthogonal to the proposal here. We haven't had > >>> that since the introduction of the eBPF steering program. This means > >>> automq has been in "conflict" with eBPF for years. Again, another > >>> advantage, allowing the eBPF program to be the first to allow the > >>> management layer to override Qemu's steering. > >> > >> What if a VMM uses eBPF steering program and the management layer > >> decides to override it with RSS? > > > > That's possible but I think we're seeking which approach is better. In > > this case, RSS could be implemented in eBPF but not the reverse. > > > > So my point is to start from something that is simpler. Simply allow > > eBPF on top of RSS as automq. And optimize on top. > > > The in-kernel RSS implementation is more optimized and capable of hash > reporting. I don't think either eBPF steering program or in-kernel RSS > is more capable than the other and there is a reason to place eBPF on > top of RSS. > > > > >> > >> eBPF is obviously predecedent to automq as eBPF is an opt-in feature and > >> automq is the implicit default. But this logic cannot be applied to > >> decide the order of eBPF and RSS because they are both opt-in features. > > > > This is from the perspective of kernel development. 
But let's try to > > think from the userspace: A well written user space knows what it > > does, rejecting eBPF while RSS is set doesn't help. But anyhow if you > > stick, it doesn't harm. > > Yes, it not for the current userspace but for the future kernel > development; the kernel can reserve the freedom to decide the priority > of eBPF and RSS by rejecting eBPF while RSS. > > > > >> > >>> > >>>> > >>>> 3) eliminates the undefined semantics by rejecting to enable both. > >>> > >>> This would lead a usersapce noticeable change of the behaviour? And > >>> what do you mean by "rejecting to enable both"? > >> > >> Existing userspace code should see no change as it only cares the case > >> where RSS is enabled. > >> > >> Here, rejecting to enable both means to deny setting an eBPF steering > >> program when RSS is enabled, and visa-versa. > >> > >>> > >>>> An > >>>> alternative approach is to allow eBPF steering programs to fall back. > >>>> When both the eBPF program and RSS are enabled, RSS will gain the > >>>> control of steering under the well-defined situation where the eBPF > >>>> steering program decides to fall back. > >>> > >>> How about just stick the eBPF precedence in this proposal and > >>> introduce the fallback on top? This helps to speed up the iteration > >>> (as the version has been iterated to 11). > >> > >> I don't think that helps much since we have another ongoing discussion > >> below and it is not the sole roadblock. > >> > >>> > >>>> > >>>>> > >>>>>> But 1) will be the most capable option if > >>>>>> eBPF has a fall-back feature. > >>>>>> > >>>>>>> > >>>>>>>> > >>>>>>>>> > >>>>>>>>>> > >>>>>>>>>>> > >>>>>>> > >>>>>>> [...] > >>>>>>> > >>>>>>>>>>> Is there a chance that we can reach here without TUN_VNET_HASH_REPORT? > >>>>>>>>>>> If yes, it should be a bug. > >>>>>>>>>> > >>>>>>>>>> It is possible to use RSS without TUN_VNET_HASH_REPORT. > >>>>>>>>> > >>>>>>>>> Another call to separate the ioctls then. 
> >>>>>>>> > >>>>>>>> RSS and hash reporting are not completely independent though. > >>>>>>> > >>>>>>> Spec said: > >>>>>>> > >>>>>>> """ > >>>>>>> VIRTIO_NET_F_RSSRequires VIRTIO_NET_F_CTRL_VQ. > >>>>>>> """ > >>>>>> > >>>>>> I meant the features can be enabled independently, but they will share > >>>>>> the hash type set when they are enabled at the same time. > >>>>> > >>>>> Looking at the spec: > >>>>> > >>>>> Hash repot uses: > >>>>> > >>>>> """ > >>>>> struct virtio_net_hash_config { > >>>>> le32 hash_types; > >>>>> le16 reserved[4]; > >>>>> u8 hash_key_length; > >>>>> u8 hash_key_data[hash_key_length]; > >>>>> }; > >>>>> """ > >>>>> > >>>>> RSS uses > >>>>> > >>>>> """ > >>>>> struct rss_rq_id { > >>>>> le16 vq_index_1_16: 15; /* Bits 1 to 16 of the virtqueue index */ > >>>>> le16 reserved: 1; /* Set to zero */ > >>>>> }; > >>>>> > >>>>> struct virtio_net_rss_config { > >>>>> le32 hash_types; > >>>>> le16 indirection_table_mask; > >>>>> struct rss_rq_id unclassified_queue; > >>>>> struct rss_rq_id indirection_table[indirection_table_length]; > >>>>> le16 max_tx_vq; > >>>>> u8 hash_key_length; > >>>>> u8 hash_key_data[hash_key_length]; > >>>>> }; > >>>>> """ > >>>>> > >>>>> Instead of trying to figure out whether we can share some data > >>>>> structures, why not simply start from what has been done in the spec? > >>>>> This would ease the usersapce as well where it can simply do 1:1 > >>>>> mapping between ctrl vq command and tun uAPI. > >>>> > >>>> The spec also defines struct virtio_net_hash_config (which will be used > >>>> when RSS is disabled) and struct virtio_net_rss_config to match the > >>>> layout to share some fields. However, the UAPI does not follow the > >>>> interface design of virtio due to some problems with these structures. > >>> > >>> Copy-paste error. 
The above is copied from the virtio spec, but I > >>> meant the existing uAPI in virtio_net.h: > >>> > >>> /* > >>> * The command VIRTIO_NET_CTRL_MQ_RSS_CONFIG has the same effect as > >>> * VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET does and additionally configures > >>> * the receive steering to use a hash calculated for incoming packet > >>> * to decide on receive virtqueue to place the packet. The command > >>> * also provides parameters to calculate a hash and receive virtqueue. > >>> */ > >>> struct virtio_net_rss_config { > >>> __le32 hash_types; > >>> __le16 indirection_table_mask; > >>> __le16 unclassified_queue; > >>> __le16 indirection_table[1/* + indirection_table_mask */]; > >>> __le16 max_tx_vq; > >>> __u8 hash_key_length; > >>> __u8 hash_key_data[/* hash_key_length */]; > >>> }; > >>>> #define VIRTIO_NET_CTRL_MQ_RSS_CONFIG 1 > >>> > >>> /* > >>> * The command VIRTIO_NET_CTRL_MQ_HASH_CONFIG requests the device > >>> * to include in the virtio header of the packet the value of the > >>> * calculated hash and the report type of hash. It also provides > >>> * parameters for hash calculation. The command requires feature > >>> * VIRTIO_NET_F_HASH_REPORT to be negotiated to extend the > >>> * layout of virtio header as defined in virtio_net_hdr_v1_hash. 
> >>> */ > >>> struct virtio_net_hash_config { > >>> __le32 hash_types; > >>> /* for compatibility with virtio_net_rss_config */ > >>> __le16 reserved[4]; > >>> __u8 hash_key_length; > >>> __u8 hash_key_data[/* hash_key_length */]; > >>> }; > >>> > >>> This has been used by Qemu but I see a virtio-net version of: > >>> > >>> struct virtio_net_ctrl_rss { > >>> u32 hash_types; > >>> u16 indirection_table_mask; > >>> u16 unclassified_queue; > >>> u16 hash_cfg_reserved; /* for HASH_CONFIG (see > >>> virtio_net_hash_config for details) */ > >>> u16 max_tx_vq; > >>> u8 hash_key_length; > >>> u8 key[VIRTIO_NET_RSS_MAX_KEY_SIZE]; > >>> > >>> u16 *indirection_table; > >>> }; > >>> > >>> This is ugly and results in a tricky code when trying to submit > >>> RSS/HASH commands to the device: > >>> > >>> if (vi->has_rss) { > >>> sg_buf_size = sizeof(uint16_t) * vi->rss_indir_table_size; > >>> sg_set_buf(&sgs[1], vi->rss.indirection_table, sg_buf_size); > >>> } else { > >>> sg_set_buf(&sgs[1], &vi->rss.hash_cfg_reserved, > >>> sizeof(uint16_t)); > >>> } > >> > >> The only reference to struct virtio_net_rss_config in QEMU is to derive > >> the offset of indirection_table. This is because the definition in > >> virtio_net.h also includes indirection_table in the middle and the > >> offsets of later part are unusable. > > > > Yes. > > > >> > >> QEMU internally has a structure named VirtioNetRssData which just looks > >> like struct virtio_net_ctrl_rss. > > > > It's a pity that it doesn't use uAPI. We might need to fix them. > > It doesn't want to use the UAPI structures for the internal storage > because it wants to store them in native endians and QEMU is not > interested in some fields in the UAPI structures. struct tun_vnet_hash > and struct tun_vnet_hash_rss are easy to fill using VirtioNetRssData. 
> > > > >> > >>> > >>>> > >>>> Below is the definition of struct virtio_net_hash_config: > >>>> > >>>> struct virtio_net_hash_config { > >>>> le32 hash_types; > >>>> le16 reserved[4]; > >>>> u8 hash_key_length; > >>>> u8 hash_key_data[hash_key_length]; > >>>> }; > >>>> > >>>> Here, hash_types, hash_key_length, and hash_key_data are shared with > >>>> struct virtio_net_rss_config. > >>>> > >>>> One problem is that struct virtio_net_rss_config has a flexible array > >>>> (indirection_table) between hash_types and hash_key_length. This is > >>>> something we cannot express with C. > >>> > >>> We can split the virtio_net_rss_config to ease the dealing with > >>> arrays, more below. > >>> > >>>> > >>>> Another problem is that the semantics of the key in struct > >>>> virtio_net_hash_config is not defined in the spec. > >>> > >>> If this is the case. Let's fix that in the spec first to make sure our > >>> uAPI aligns with spec without ambiguity. It would be a nightmare to > >>> deal with the in-consistency between virtio spec and Linux uAPIs. > >> > >> The userspace doesn't need to do anything to deal with inconsistency > >> since these fields are unused. > >> > >>> > >>>> > >>>> To solve these problems, I defined the UAPI structures that do not > >>>> include indiretion_table. > >>>> > >>>>> > >>>>>> > >>>>>>> > >>>>>>>> > >>>>>>>> A plot twist is the "types" parameter; it is a parameter that is > >>>>>>>> "common" for RSS and hash reporting. > >>>>>>> > >>>>>>> So we can share part of the structure through the uAPI. > >>>>>> > >>>>>> Isn't that what this patch does? > >>>>> > >>>>> I didn't see, basically I see only one TUNSETVNETHASH that is used to > >>>>> set both hash report and rss: > >>>> > >>>> The UAPI shares struct tun_vnet_hash for both hash report and rss. > >>> > >>> I meant sharing structure in two ioctls instead of reusing a specific > >>> structure for two semantics in one ioctl if possible. Though I don't > >>> think we need any sharing. 
> >> > >> The UAPI implemented in this patch already shares struct tun_vnet_hash > >> and having two ioctls doesn't change that. > >> > >>> > >>>> > >>>>> > >>>>> """ > >>>>> +/** > >>>>> + * define TUNSETVNETHASH - ioctl to configure virtio_net hashing > >>>>> + * > >>>>> + * The argument is a pointer to &struct tun_vnet_hash. > >>>>> + * > >>>>> + * The argument is a pointer to the compound of the following in order if > >>>>> + * %TUN_VNET_HASH_RSS is set: > >>>>> + * > >>>>> + * 1. &struct tun_vnet_hash > >>>>> + * 2. &struct tun_vnet_hash_rss > >>>>> + * 3. Indirection table > >>>>> + * 4. Key > >>>>> + * > >>>>> """ > >>>>> > >>>>> And it seems to lack parameters like max_tx_vq. > >>>> > >>>> max_tx_vq is not relevant with hashing. > >>> > >>> It is needed for RSS and we don't have that, no? > >> > >> No. RSS is Receive Side Scaling but it's not about receiving. > > > > Just to make sure I understand this, max_tx_vq is part of the > > virtio_net_rss_config, how would Qemu behave when it receives this > > from guest? > > > > """ > > A driver sets max_tx_vq to inform a device how many transmit > > virtqueues it may use (transmitq1…transmitq max_tx_vq). > > """ > > It does nothing. Nope, see: commit 50bfcaedd78e53135ec0504302269b3b65bf1eff Author: Philo Lu <lulie@linux.alibaba.com> Date: Mon Nov 4 16:57:06 2024 +0800 virtio_net: Update rss when set queue RSS configuration should be updated with queue number. In particular, it should be updated when (1) rss enabled and (2) default rss configuration is used without user modification. During rss command processing, device updates queue_pairs using rss.max_tx_vq. That is, the device updates queue_pairs together with rss, so we can skip the sperate queue_pairs update (VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET below) and return directly. Also remove the `vi->has_rss ?` check when setting vi->rss.max_tx_vq, because this is not used in the other hash_report case. 
Fixes: c7114b1249fa ("drivers/net/virtio_net: Added basic RSS support.") Signed-off-by: Philo Lu <lulie@linux.alibaba.com> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com> Acked-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Paolo Abeni <pabeni@redhat.com> RSS doesn't depend on MQ. If it is not handled by Qemu, it should be a bug? > > > > >> > >>> > >>>> > >>>>> > >>>>> What's more, we've already had virito-net uAPI. Why not simply reusing them? > >>>> > >>>> See the above. > >>>> > >>>>> > >>>>>> > >>>>>>> > >>>>>>>> RSS and hash reporting must share > >>>>>>>> this parameter when both are enabled at the same time; otherwise RSS may > >>>>>>>> compute hash values that are not suited for hash reporting. > >>>>>>> > >>>>>>> Is this mandated by the spec? If yes, we can add a check. If not, > >>>>>>> userspace risk themselves as a mis-configuration which we don't need > >>>>>>> to bother. > >>>>>> > >>>>>> Yes, it is mandated. 5.1.6.4.3 Hash calculation for incoming packets says: > >>>>>> > A device attempts to calculate a per-packet hash in the following > >>>>>> > cases: > >>>>>> > > >>>>>> > - The feature VIRTIO_NET_F_RSS was negotiated. The device uses the > >>>>>> > hash to determine the receive virtqueue to place incoming packets. > >>>>>> > - The feature VIRTIO_NET_F_HASH_REPORT was negotiated. The device > >>>>>> > reports the hash value and the hash type with the packet. > >>>>>> > > >>>>>> > If the feature VIRTIO_NET_F_RSS was negotiated: > >>>>>> > > >>>>>> > - The device uses hash_types of the virtio_net_rss_config structure > >>>>>> > as ’Enabled hash types’ bitmask. > >>>>>> > - The device uses a key as defined in hash_key_data and > >>>>>> hash_key_length of the virtio_net_rss_config structure (see > >>>>>> > 5.1.6.5.7.1). > >>>>>> > > >>>>>> > If the feature VIRTIO_NET_F_RSS was not negotiated: > >>>>>> > > >>>>>> > - The device uses hash_types of the virtio_net_hash_config structure > >>>>>> > as ’Enabled hash types’ bitmask. 
> >>>>>> > - The device uses a key as defined in hash_key_data and > >>>>>> > hash_key_length of the virtio_net_hash_config structure (see > >>>>>> > 5.1.6.5.6.4). > >>>>>> > >>>>>> So when both VIRTIO_NET_F_RSS and VIRTIO_NET_F_HASH_REPORT are > >>>>>> negotiated, virtio_net_rss_config not only controls RSS but also the > >>>>>> reported hash values and types. They cannot be divergent. > >>>>>> > >>>>>>> > >>>>>>> Note that spec use different commands for hash_report and rss. > >>>>>> > >>>>>> TUNSETVNETHASH is different from these commands in terms that it also > >>>>>> negotiates VIRTIO_NET_F_HASH_REPORT and VIRTIO_NET_F_RSS. > >>>>>> > >>>>> > >>>>> There are different "issues" here: > >>>>> > >>>>> 1) Whether or not we need to use a unified API for negotiating RSS and > >>>>> HASH_REPORT features > >>>>> 2) Whether or not we need to use a unified API for setting RSS and > >>>>> HASH_REPORT configuration > >>>>> > >>>>> What I want to say is point 2. But what you raise is point 1. > >>>>> > >>>>> For simplicity, it looks to me like it's a call for having separated > >>>>> ioctls for feature negotiation (for example via TUNSETIFF). You may > >>>>> argue that either RSS or HASH_REPORT requires configurations, we can > >>>>> just follow what spec defines or not (e.g what happens if > >>>>> RSS/HASH_REPORT were negotiated but no configurations were set). > >>>> > >>>> Unfortunately TUNSETIFF does not fit in this use case. The flags set > >>>> with TUNSETIFF are fixed, but the guest can request a different feature > >>>> set anytime by resetting the device. > >>> > >>> TUNSETIFF, enables the device to be able to handle RSS/HASH_REPORT. > >>> TUNSETHASH/RSS. dealing with RSS/HASH command from userspace. > >> > >> We also need to be able to disable them at runtime so that we can > >> handle resets. > > > > Via TUNSETHASH/RSS? I think it should have a way to accept parameters > > that disable RSS or hash report. > > That's what this patch implements.
TUNSETVNETHASH accepts parameters to > choose what features to be enabled. > > > > >> > >>> > >>> This is the way we used to do for multi queue and vnet header. > >>> TUNSETIFF requires CAP_NET_ADMIN, this could be an extra safe guard > >>> for unprivileged userspace. > >> > >> I intend to allow using this feature without privilege. A VMM is usually > >> unprivileged and requiring a privilege to configure tuntap is too > >> prohibitive. > > > > For safety, tun is not allowed to be created by unprivileged users. > > And it's not to configure the tuntap dynamically, it's about telling > > the function that tuntap can have (not necessarily enabled though) . > > I don't think we need another barrier for the new functions. Once an > unprivileged user get a file descriptor of tuntap from a privileged > user, they are free to enable RSS and/or hash reporting. Only if such a feature is allowed by the privileged user. > > > > >> > >>> > >>>> > >>>> > >> In the virtio-net specification, it is not defined what would > >>>> happen if > >>>>>> these features are negotiated but the VIRTIO_NET_CTRL_MQ_RSS_CONFIG or > >>>>>> VIRTIO_NET_CTRL_MQ_HASH_CONFIG commands are not sent. There is no such > >>>>>> ambiguity with TUNSETVNETHASH. > >>>>> > >>>>> So I don't see advantages of unifying hash reports and rss into a > >>>>> single ioctl. Let's just follow what has been done in the spec that > >>>>> uses separated commands. Tuntap is not a good place to debate whether > >>>>> those commands could be unified or not. We need to move it to the spec > >>>>> but assuming spec has been done, it might be too late or too few > >>>>> advantages for having another design. > >>>> > >>>> It makes sense for the spec to reuse the generic feature negotiation > >>>> mechanism, but the situation is different for tuntap; we cannot use > >>>> TUNSETIFF and need to define another. Then why don't we exploit this > >>>> opportunity to have an interface with well-defined semantics? 
> >>> > >>> That's perfectly fine, but it needs to be done in virtio-net's uAPI > >>> not tun's. What's more, if you think two commands are not > >>> well-defined, let's fix that in the virtio spec first. > >>> > >>>> The virtio > >>>> spec does its best as an interface between the host and guest and tuntap > >>>> does its best as an UAPI. > >>> > >>> See above, let's fix the uAPI first. We don't want DPDK to use tun's > >>> uAPI for RSS > >> > >> virtio-net's UAPI is for the virtio spec which has a capable generic > >> feature negotiation mechanism. tuntap needs its own feature negotiation > >> and it's nothing to do with virtio-net's UAPI. > > > > Well, I don't mean the part of the feature negotiation. I mean the > > part for rss and hash report configuration. > > The feature negotiation still matters when deciding the granularity of > ioctls. We need one ioctl for a feature negotiation, and to avoid having > an intermediate state, I don't understand this. For example, driver can choose to 1) negotiate RSS 2) do something else. 3) configure RSS Spec doesn't require those two to be configured at the same time, so "intermediate state" is allowed. > the ioctl should also do the configuration. Hence > that one ioctl should do all of the feature negotiation and configuration. > > > > >> > >> The structures for two commands have unused or redundant fields and a > >> flexible array in the middle of the structure, but they are ABIs so we > >> can't change it. > >> > >> DPDK is another reason to define tuntap's own UAPIs. They don't care > >> unused or redundant fields and a flexible array in middle that are > >> present in the virtio spec. It will also not want to deal with the > >> requirement of little endian. Constructing struct virtio_net_rss_config > >> is an extra burden for DPDK. > > > > I meant for vhost-user implementation in DPDK, it needs to use > > virtio-net uAPI not tuntap's for example. 
> > The vhost-user implementation will use tuntap's UAPIs for its ethernet > device backend. That sounds pretty weird, vhost-user has nothing related to tuntap. > It uses the generic interface of ethernet device so for > RSS it will use functions like rte_eth_dev_rss_hash_update() for > example. tuntap's UAPIs are more suited to implement these interfaces as > they operate in native endian and don't have extra fields. Nope, for example it needs to use le for virtio_net_hdr if a modern device is used. But it needs a "native" endian according to the guest endian via TUNSETVNETLE/BE. We don't have a choice as virtio-net hdr support in tuntap is much earlier than modern devices. Let's not do the same thing (native endian) for tuntap as RSS depends on modern, so we know it must be le. > > DPDK applications other than vhost-user also matter; they do not care > what virtio does at all. > > > >> > >> On the other hand, Constructing tuntap-specific structures is not that > >> complicated for VMMs. > > > > Not complicated but redundant. > > > >> A VMM will need to inspect struct > >> virtio_net_rss_config anyway to handle migration and check its size so > >> it can store the values it inspected to struct tun_vnet_hash and struct > >> tun_vnet_hash_rss and pass them to the kernel. > > > > I don't see how rss and hash reports differ from what we have now. > > Those inspections must be done anyhow for compatibility for example > > the check of offloading features. Such steps could not be eliminated > > no matter how we design the uAPI. > > I explained the difference between the virtio and tuntap UAPIs, not > between RSS and hash reporting. See above. > > > > >> > >> The overall userspace implementation will be simpler by having > >> structures specifically tailored for the communication between the > >> userspace and kernel. > > > > This is exactly how a good uAPI should behave. If uAPI in virtio-net > > can't do this, I don't understand why uAPI in tuntap can solve it.
> > The UAPI in virtio-net cannot do it because it's already fixed and it > also needs to perform endian conversion for the VM use case. tuntap > doesn't have these restrictions. Same here. > > > > >> > >>> > >>>> > >>>> I don't think there is an advantage to split ioctls to follow the spec > >>>> after all. It makes sense if we can pass-through virtio commands to > >>>> tuntap, but it is not possible as ioctl operation codes are different > >>>> from virtio commands. > >>> > >>> I don't see a connection with the operation code. For example, we can > >>> add new uAPIs in virtio-net which could be something like: > >>> > >>> struct virtio_net_rss_config_header { > >>> __le32 hash_types; > >>> __le16 indirection_table_mask; > >>> __le16 unclassified_queue; > >>> __le16 indirection_table[]; > >>> } > >>> > >>> struct virtio_net_rss_config_tailer { > >>> __le16 max_tx_vq; > >>> u8 hash_key_length; > >>> u8 hash_key_data[]; > >>> } > >>> > >>> These two are used by TUNSETVNETRSS. And simply reuse the > >>> virtio_net_hash_config for TUNSETVETHASH. > >> > > With this, we can tweak the virtio-net driver with this new uAPI. Then > >>> tap* can reuse this. > >> > >> I implemented a UAPI and driver change accordingly: > >> https://lore.kernel.org/r/20250318-virtio-v1-0-344caf336ddd@daynix.com > >> > >> This is a nice improvement for the driver, but I still don't think it is > >> suited for the UAPI of tuntap. > > > > Any reason for this? It should work like virtio_net_hdr. > > > >> The requirements of extra fields and > >> little endian cannot be removed from the virtio spec but they are > >> irrelevant for tuntap. > > > > I don't understand this part. What fields are "extra" and need to be > > removed from the spec? > > All fields not included in struct tun_vnet_hash and struct > tun_vnet_hash_rss. 
Namely, for struct virtio_net_hash_config: > - reserved > - hash_key_length > - hash_key_data > > For struct virtio_net_rss_config: > - max_tx_vq > - hash_key_length See my above reply, and I basically meant TUNSETVNETHASH accepts struct virtio_net_hash_config; TUNSETVNETRSS accepts struct virtio_net_rss_config_hdr + struct virtio_net_rss_config_trailer; Thanks > > Regards, > Akihiko Odaki > > > > >> > >>> > >>>> The best possibility is to share structures, not > >>>> commands, and I don't think even sharing structures makes sense here > >>>> because of the reasons described above. > >>> > >>> I don't want to share structures, I meant starting from something that > >>> is simple and has been sorted in the virtio spec. Optimization could > >>> be done on top. > >> > >> I meant to reuse the structures in virtio_net.h. > >> > >> Regards, > >> Akihiko Odaki > > > > Thanks > > > >> > >>> > >>> Thanks > >>> > >>> > >>>> > >>>> Regards, > >>>> Akihiko Odaki > >>>> > >>>>> > >>>>> Thanks > >>>>> > >>>>>> > >>>>>> Regards, > >>>>>> Akihiko Odaki > >>>>>> > >>>>>>> > >>>>>>>> > >>>>>>>> The parameter will be duplicated if we have separate ioctls for RSS and > >>>>>>>> hash reporting, and the kernel will have a chicken-and-egg problem when > >>>>>>>> ensuring they are synchronized; when the ioctl for RSS is issued, should > >>>>>>>> the kernel ensure the "types" parameter is identical with one specified > >>>>>>>> for hash reporting? It will not work if the userspace may decide to > >>>>>>>> configure hash reporting after RSS. > >>>>>>>> > >>>>>>> > >>>>>>> See my reply above. > >>>>>>> > >>>>>>> Thanks > >>>>>>> > >>>>>> > >>>>> > >>>> > >>> > >> > > >
On 2025/03/20 10:31, Jason Wang wrote: > On Wed, Mar 19, 2025 at 1:29 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >> >> On 2025/03/19 9:58, Jason Wang wrote: >>> On Tue, Mar 18, 2025 at 6:10 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>> >>>> On 2025/03/18 9:15, Jason Wang wrote: >>>>> On Mon, Mar 17, 2025 at 3:07 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>>>> >>>>>> On 2025/03/17 10:12, Jason Wang wrote: >>>>>>> On Wed, Mar 12, 2025 at 1:03 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>>>>>> >>>>>>>> On 2025/03/12 11:35, Jason Wang wrote: >>>>>>>>> On Tue, Mar 11, 2025 at 2:11 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>>>>>>>> >>>>>>>>>> On 2025/03/11 9:38, Jason Wang wrote: >>>>>>>>>>> On Mon, Mar 10, 2025 at 3:45 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>>>>>>>>>> >>>>>>>>>>>> On 2025/03/10 12:55, Jason Wang wrote: >>>>>>>>>>>>> On Fri, Mar 7, 2025 at 7:01 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>>>>>>>>>>>> >>>>>>>>>>>>>> Hash reporting >>>>>>>>>>>>>> ============== >>>>>>>>>>>>>> >>>>>>>>>>>>>> Allow the guest to reuse the hash value to make receive steering >>>>>>>>>>>>>> consistent between the host and guest, and to save hash computation. >>>>>>>>>>>>>> >>>>>>>>>>>>>> RSS >>>>>>>>>>>>>> === >>>>>>>>>>>>>> >>>>>>>>>>>>>> RSS is a receive steering algorithm that can be negotiated to use with >>>>>>>>>>>>>> virtio_net. Conventionally the hash calculation was done by the VMM. >>>>>>>>>>>>>> However, computing the hash after the queue was chosen defeats the >>>>>>>>>>>>>> purpose of RSS. >>>>>>>>>>>>>> >>>>>>>>>>>>>> Another approach is to use eBPF steering program. This approach has >>>>>>>>>>>>>> another downside: it cannot report the calculated hash due to the >>>>>>>>>>>>>> restrictive nature of eBPF steering program. >>>>>>>>>>>>>> >>>>>>>>>>>>>> Introduce the code to perform RSS to the kernel in order to overcome >>>>>>>>>>>>>> thse challenges. 
An alternative solution is to extend the eBPF steering >>>>>>>>>>>>>> program so that it will be able to report to the userspace, but I didn't >>>>>>>>>>>>>> opt for it because extending the current mechanism of eBPF steering >>>>>>>>>>>>>> program as is because it relies on legacy context rewriting, and >>>>>>>>>>>>>> introducing kfunc-based eBPF will result in non-UAPI dependency while >>>>>>>>>>>>>> the other relevant virtualization APIs such as KVM and vhost_net are >>>>>>>>>>>>>> UAPIs. >>>>>>>>>>>>>> >>>>>>>>>>>>>> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com> >>>>>>>>>>>>>> Tested-by: Lei Yang <leiyang@redhat.com> >>>>>>>>>>>>>> --- >>>>>>>>>>>>>> Documentation/networking/tuntap.rst | 7 ++ >>>>>>>>>>>>>> drivers/net/Kconfig | 1 + >>>>>>>>>>>>>> drivers/net/tap.c | 68 ++++++++++++++- >>>>>>>>>>>>>> drivers/net/tun.c | 98 +++++++++++++++++----- >>>>>>>>>>>>>> drivers/net/tun_vnet.h | 159 ++++++++++++++++++++++++++++++++++-- >>>>>>>>>>>>>> include/linux/if_tap.h | 2 + >>>>>>>>>>>>>> include/linux/skbuff.h | 3 + >>>>>>>>>>>>>> include/uapi/linux/if_tun.h | 75 +++++++++++++++++ >>>>>>>>>>>>>> net/core/skbuff.c | 4 + >>>>>>>>>>>>>> 9 files changed, 386 insertions(+), 31 deletions(-) >>>>>>>>>>>>>> >>>>>>>>>>>>>> diff --git a/Documentation/networking/tuntap.rst b/Documentation/networking/tuntap.rst >>>>>>>>>>>>>> index 4d7087f727be5e37dfbf5066a9e9c872cc98898d..86b4ae8caa8ad062c1e558920be42ce0d4217465 100644 >>>>>>>>>>>>>> --- a/Documentation/networking/tuntap.rst >>>>>>>>>>>>>> +++ b/Documentation/networking/tuntap.rst >>>>>>>>>>>>>> @@ -206,6 +206,13 @@ enable is true we enable it, otherwise we disable it:: >>>>>>>>>>>>>> return ioctl(fd, TUNSETQUEUE, (void *)&ifr); >>>>>>>>>>>>>> } >>>>>>>>>>>>>> >>>>>>>>> >>>>>>>>> [...] >>>>>>>>> >>>>>>>>>>>>>> +static inline long tun_vnet_ioctl_sethash(struct tun_vnet_hash_container __rcu **hashp, >>>>>>>>>>>>>> + bool can_rss, void __user *argp) >>>>>>>>>>>>> >>>>>>>>>>>>> So again, can_rss seems to be tricky. 
Looking at its caller, it tires >>>>>>>>>>>>> to make eBPF and RSS mutually exclusive. I still don't understand why >>>>>>>>>>>>> we need this. Allow eBPF program to override some of the path seems to >>>>>>>>>>>>> be common practice. >>>>>>>>>>>>> >>>>>>>>>>>>> What's more, we didn't try (or even can't) to make automq and eBPF to >>>>>>>>>>>>> be mutually exclusive. So I still didn't see what we gain from this >>>>>>>>>>>>> and it complicates the codes and may lead to ambiguous uAPI/behaviour. >>>>>>>>>>>> >>>>>>>>>>>> automq and eBPF are mutually exclusive; automq is disabled when an eBPF >>>>>>>>>>>> steering program is set so I followed the example here. >>>>>>>>>>> >>>>>>>>>>> I meant from the view of uAPI, the kernel doesn't or can't reject eBPF >>>>>>>>>>> while using automq. >>>>>>>>>> > >> >>>>>>>>>>>> We don't even have an interface for eBPF to let it fall back to another >>>>>>>>>>>> alogirhtm. >>>>>>>>>>> >>>>>>>>>>> It doesn't even need this, e.g XDP overrides the default receiving path. >>>>>>>>>>> >>>>>>>>>>>> I could make it fall back to RSS if the eBPF steeering >>>>>>>>>>>> program is designed to fall back to automq when it returns e.g., -1. But >>>>>>>>>>>> such an interface is currently not defined and defining one is out of >>>>>>>>>>>> scope of this patch series. >>>>>>>>>>> >>>>>>>>>>> Just to make sure we are on the same page, I meant we just need to >>>>>>>>>>> make the behaviour consistent: allow eBPF to override the behaviour of >>>>>>>>>>> both automq and rss. >>>>>>>>>> >>>>>>>>>> That assumes eBPF takes precedence over RSS, which is not obvious to me. >>>>>>>>> >>>>>>>>> Well, it's kind of obvious. Not speaking the eBPF selector, we have >>>>>>>>> other eBPF stuffs like skbedit etc. >>>>>>>>> >>>>>>>>>> >>>>>>>>>> Let's add an interface for the eBPF steering program to fall back to >>>>>>>>>> another steering algorithm. 
I said it is out of scope before, but it >>>>>>>>>> makes clear that the eBPF steering program takes precedence over other >>>>>>>>>> algorithms and allows us to delete the code for the configuration >>>>>>>>>> validation in this patch. >>>>>>>>> >>>>>>>>> Fallback is out of scope but it's not what I meant. >>>>>>>>> >>>>>>>>> I meant in the current uAPI take eBPF precedence over automq. It's >>>>>>>>> much more simpler to stick this precedence unless we see obvious >>>>>>>>> advanatge. >>>>>>>> >>>>>>>> We still have three different design options that preserve the current >>>>>>>> precedence: >>>>>>>> >>>>>>>> 1) Precedence order: eBPF -> RSS -> automq >>>>>>>> 2) Precedence order: RSS -> eBPF -> automq >>>>>>>> 3) Precedence order: eBPF OR RSS -> automq where eBPF and RSS are >>>>>>>> mutually exclusive >>>>>>>> >>>>>>>> I think this is a unique situation for this steering program and I could >>>>>>>> not find another example in other eBPF stuffs. >>>>>>> >>>>>>> As described above, queue mapping could be overridden by tc-ebpf. So >>>>>>> there's no way to guarantee the RSS will work: >>>>>>> >>>>>>> https://github.com/DPDK/dpdk/blob/main/drivers/net/tap/bpf/tap_rss.c#L262 >>>>>>> >>>>>>> Making eBPF first leaves a chance for the management layer to override >>>>>>> the choice of Qemu. >>>>>> >>>>>> I referred to the eBPF steering program instead of tc-ebpf. tc-ebpf is >>>>>> nothing to do with the TUNSETSTEERINGEBPF ioctl, which this patch changes. >>>>> >>>>> I meant you can't do "full control" in any case, the point below >>>>> doesn't stand. Queue mapping could be restored even if RSS is set. >>>> >>>> What matters here is how we handle the control when tc didn't take it. >>>> eBPF, RSS, or automq make take all of it; I referred that as "full control". >>>> >>>>> >>>>>> >>>>>>> >>>>>>>> >>>>>>>> The current version implements 3) because it is not obvious whether we >>>>>>>> should choose either 1) or 2). 
>>>>>>> >>>>>>> But you didn't explain why you choose 3), and it leads to tricky code >>>>>>> (e.g the can_rss stuff etc). >>>>>> >>>>>> I wrote: "because it is not obvious whether we should choose either 1) >>>>>> or 2)", but I think I can explain it better: >>>>>> >>>>>> When an eBPF steering program cannot implement a fallback, it means the >>>>>> eBPF steering program requests the full control over the steering. On >>>>>> the other hand, RSS also requests the same control. So these two will >>>>>> conflict and the entity controlling the steering will be undefined when >>>>>> both are enabled. >>>>> >>>>> Well, the fallback is orthogonal to the proposal here. We haven't had >>>>> that since the introduction of the eBPF steering program. This means >>>>> automq has been in "conflict" with eBPF for years. Again, another >>>>> advantage, allowing the eBPF program to be the first to allow the >>>>> management layer to override Qemu's steering. >>>> >>>> What if a VMM uses eBPF steering program and the management layer >>>> decides to override it with RSS? >>> >>> That's possible but I think we're seeking which approach is better. In >>> this case, RSS could be implemented in eBPF but not the reverse. >>> >>> So my point is to start from something that is simpler. Simply allow >>> eBPF on top of RSS as automq. And optimize on top. >> >> >> The in-kernel RSS implementation is more optimized and capable of hash >> reporting. I don't think either eBPF steering program or in-kernel RSS >> is more capable than the other and there is a reason to place eBPF on >> top of RSS. >> >>> >>>> >>>> eBPF is obviously predecedent to automq as eBPF is an opt-in feature and >>>> automq is the implicit default. But this logic cannot be applied to >>>> decide the order of eBPF and RSS because they are both opt-in features. >>> >>> This is from the perspective of kernel development. 
But let's try to >>> think from the userspace: A well written user space knows what it >>> does, rejecting eBPF while RSS is set doesn't help. But anyhow if you >>> stick, it doesn't harm. >> >> Yes, it not for the current userspace but for the future kernel >> development; the kernel can reserve the freedom to decide the priority >> of eBPF and RSS by rejecting eBPF while RSS. >> >>> >>>> >>>>> >>>>>> >>>>>> 3) eliminates the undefined semantics by rejecting to enable both. >>>>> >>>>> This would lead a usersapce noticeable change of the behaviour? And >>>>> what do you mean by "rejecting to enable both"? >>>> >>>> Existing userspace code should see no change as it only cares the case >>>> where RSS is enabled. >>>> >>>> Here, rejecting to enable both means to deny setting an eBPF steering >>>> program when RSS is enabled, and visa-versa. >>>> >>>>> >>>>>> An >>>>>> alternative approach is to allow eBPF steering programs to fall back. >>>>>> When both the eBPF program and RSS are enabled, RSS will gain the >>>>>> control of steering under the well-defined situation where the eBPF >>>>>> steering program decides to fall back. >>>>> >>>>> How about just stick the eBPF precedence in this proposal and >>>>> introduce the fallback on top? This helps to speed up the iteration >>>>> (as the version has been iterated to 11). >>>> >>>> I don't think that helps much since we have another ongoing discussion >>>> below and it is not the sole roadblock. >>>> >>>>> >>>>>> >>>>>>> >>>>>>>> But 1) will be the most capable option if >>>>>>>> eBPF has a fall-back feature. >>>>>>>> >>>>>>>>> >>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>> >>>>>>>>> [...] >>>>>>>>> >>>>>>>>>>>>> Is there a chance that we can reach here without TUN_VNET_HASH_REPORT? >>>>>>>>>>>>> If yes, it should be a bug. >>>>>>>>>>>> >>>>>>>>>>>> It is possible to use RSS without TUN_VNET_HASH_REPORT. >>>>>>>>>>> >>>>>>>>>>> Another call to separate the ioctls then. 
>>>>>>>>>> >>>>>>>>>> RSS and hash reporting are not completely independent though. >>>>>>>>> >>>>>>>>> Spec said: >>>>>>>>> >>>>>>>>> """ >>>>>>>>> VIRTIO_NET_F_RSSRequires VIRTIO_NET_F_CTRL_VQ. >>>>>>>>> """ >>>>>>>> >>>>>>>> I meant the features can be enabled independently, but they will share >>>>>>>> the hash type set when they are enabled at the same time. >>>>>>> >>>>>>> Looking at the spec: >>>>>>> >>>>>>> Hash repot uses: >>>>>>> >>>>>>> """ >>>>>>> struct virtio_net_hash_config { >>>>>>> le32 hash_types; >>>>>>> le16 reserved[4]; >>>>>>> u8 hash_key_length; >>>>>>> u8 hash_key_data[hash_key_length]; >>>>>>> }; >>>>>>> """ >>>>>>> >>>>>>> RSS uses >>>>>>> >>>>>>> """ >>>>>>> struct rss_rq_id { >>>>>>> le16 vq_index_1_16: 15; /* Bits 1 to 16 of the virtqueue index */ >>>>>>> le16 reserved: 1; /* Set to zero */ >>>>>>> }; >>>>>>> >>>>>>> struct virtio_net_rss_config { >>>>>>> le32 hash_types; >>>>>>> le16 indirection_table_mask; >>>>>>> struct rss_rq_id unclassified_queue; >>>>>>> struct rss_rq_id indirection_table[indirection_table_length]; >>>>>>> le16 max_tx_vq; >>>>>>> u8 hash_key_length; >>>>>>> u8 hash_key_data[hash_key_length]; >>>>>>> }; >>>>>>> """ >>>>>>> >>>>>>> Instead of trying to figure out whether we can share some data >>>>>>> structures, why not simply start from what has been done in the spec? >>>>>>> This would ease the usersapce as well where it can simply do 1:1 >>>>>>> mapping between ctrl vq command and tun uAPI. >>>>>> >>>>>> The spec also defines struct virtio_net_hash_config (which will be used >>>>>> when RSS is disabled) and struct virtio_net_rss_config to match the >>>>>> layout to share some fields. However, the UAPI does not follow the >>>>>> interface design of virtio due to some problems with these structures. >>>>> >>>>> Copy-paste error. 
The above is copied from the virtio spec, but I >>>>> meant the existing uAPI in virtio_net.h: >>>>> >>>>> /* >>>>> * The command VIRTIO_NET_CTRL_MQ_RSS_CONFIG has the same effect as >>>>> * VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET does and additionally configures >>>>> * the receive steering to use a hash calculated for incoming packet >>>>> * to decide on receive virtqueue to place the packet. The command >>>>> * also provides parameters to calculate a hash and receive virtqueue. >>>>> */ >>>>> struct virtio_net_rss_config { >>>>> __le32 hash_types; >>>>> __le16 indirection_table_mask; >>>>> __le16 unclassified_queue; >>>>> __le16 indirection_table[1/* + indirection_table_mask */]; >>>>> __le16 max_tx_vq; >>>>> __u8 hash_key_length; >>>>> __u8 hash_key_data[/* hash_key_length */]; >>>>> }; >>>>>> #define VIRTIO_NET_CTRL_MQ_RSS_CONFIG 1 >>>>> >>>>> /* >>>>> * The command VIRTIO_NET_CTRL_MQ_HASH_CONFIG requests the device >>>>> * to include in the virtio header of the packet the value of the >>>>> * calculated hash and the report type of hash. It also provides >>>>> * parameters for hash calculation. The command requires feature >>>>> * VIRTIO_NET_F_HASH_REPORT to be negotiated to extend the >>>>> * layout of virtio header as defined in virtio_net_hdr_v1_hash. 
>>>>> */ >>>>> struct virtio_net_hash_config { >>>>> __le32 hash_types; >>>>> /* for compatibility with virtio_net_rss_config */ >>>>> __le16 reserved[4]; >>>>> __u8 hash_key_length; >>>>> __u8 hash_key_data[/* hash_key_length */]; >>>>> }; >>>>> >>>>> This has been used by Qemu but I see a virtio-net version of: >>>>> >>>>> struct virtio_net_ctrl_rss { >>>>> u32 hash_types; >>>>> u16 indirection_table_mask; >>>>> u16 unclassified_queue; >>>>> u16 hash_cfg_reserved; /* for HASH_CONFIG (see >>>>> virtio_net_hash_config for details) */ >>>>> u16 max_tx_vq; >>>>> u8 hash_key_length; >>>>> u8 key[VIRTIO_NET_RSS_MAX_KEY_SIZE]; >>>>> >>>>> u16 *indirection_table; >>>>> }; >>>>> >>>>> This is ugly and results in a tricky code when trying to submit >>>>> RSS/HASH commands to the device: >>>>> >>>>> if (vi->has_rss) { >>>>> sg_buf_size = sizeof(uint16_t) * vi->rss_indir_table_size; >>>>> sg_set_buf(&sgs[1], vi->rss.indirection_table, sg_buf_size); >>>>> } else { >>>>> sg_set_buf(&sgs[1], &vi->rss.hash_cfg_reserved, >>>>> sizeof(uint16_t)); >>>>> } >>>> >>>> The only reference to struct virtio_net_rss_config in QEMU is to derive >>>> the offset of indirection_table. This is because the definition in >>>> virtio_net.h also includes indirection_table in the middle and the >>>> offsets of later part are unusable. >>> >>> Yes. >>> >>>> >>>> QEMU internally has a structure named VirtioNetRssData which just looks >>>> like struct virtio_net_ctrl_rss. >>> >>> It's a pity that it doesn't use uAPI. We might need to fix them. >> >> It doesn't want to use the UAPI structures for the internal storage >> because it wants to store them in native endians and QEMU is not >> interested in some fields in the UAPI structures. struct tun_vnet_hash >> and struct tun_vnet_hash_rss are easy to fill using VirtioNetRssData. 
>> >>> >>>> >>>>> >>>>>> >>>>>> Below is the definition of struct virtio_net_hash_config: >>>>>> >>>>>> struct virtio_net_hash_config { >>>>>> le32 hash_types; >>>>>> le16 reserved[4]; >>>>>> u8 hash_key_length; >>>>>> u8 hash_key_data[hash_key_length]; >>>>>> }; >>>>>> >>>>>> Here, hash_types, hash_key_length, and hash_key_data are shared with >>>>>> struct virtio_net_rss_config. >>>>>> >>>>>> One problem is that struct virtio_net_rss_config has a flexible array >>>>>> (indirection_table) between hash_types and hash_key_length. This is >>>>>> something we cannot express with C. >>>>> >>>>> We can split the virtio_net_rss_config to ease the dealing with >>>>> arrays, more below. >>>>> >>>>>> >>>>>> Another problem is that the semantics of the key in struct >>>>>> virtio_net_hash_config is not defined in the spec. >>>>> >>>>> If this is the case. Let's fix that in the spec first to make sure our >>>>> uAPI aligns with spec without ambiguity. It would be a nightmare to >>>>> deal with the in-consistency between virtio spec and Linux uAPIs. >>>> >>>> The userspace doesn't need to do anything to deal with inconsistency >>>> since these fields are unused. >>>> >>>>> >>>>>> >>>>>> To solve these problems, I defined the UAPI structures that do not >>>>>> include indiretion_table. >>>>>> >>>>>>> >>>>>>>> >>>>>>>>> >>>>>>>>>> >>>>>>>>>> A plot twist is the "types" parameter; it is a parameter that is >>>>>>>>>> "common" for RSS and hash reporting. >>>>>>>>> >>>>>>>>> So we can share part of the structure through the uAPI. >>>>>>>> >>>>>>>> Isn't that what this patch does? >>>>>>> >>>>>>> I didn't see, basically I see only one TUNSETVNETHASH that is used to >>>>>>> set both hash report and rss: >>>>>> >>>>>> The UAPI shares struct tun_vnet_hash for both hash report and rss. >>>>> >>>>> I meant sharing structure in two ioctls instead of reusing a specific >>>>> structure for two semantics in one ioctl if possible. Though I don't >>>>> think we need any sharing. 
>>>> >>>> The UAPI implemented in this patch already shares struct tun_vnet_hash >>>> and having two ioctls doesn't change that. >>>> >>>>> >>>>>> >>>>>>> >>>>>>> """ >>>>>>> +/** >>>>>>> + * define TUNSETVNETHASH - ioctl to configure virtio_net hashing >>>>>>> + * >>>>>>> + * The argument is a pointer to &struct tun_vnet_hash. >>>>>>> + * >>>>>>> + * The argument is a pointer to the compound of the following in order if >>>>>>> + * %TUN_VNET_HASH_RSS is set: >>>>>>> + * >>>>>>> + * 1. &struct tun_vnet_hash >>>>>>> + * 2. &struct tun_vnet_hash_rss >>>>>>> + * 3. Indirection table >>>>>>> + * 4. Key >>>>>>> + * >>>>>>> """ >>>>>>> >>>>>>> And it seems to lack parameters like max_tx_vq. >>>>>> >>>>>> max_tx_vq is not relevant with hashing. >>>>> >>>>> It is needed for RSS and we don't have that, no? >>>> >>>> No. RSS is Receive Side Scaling but it's not about receiving. >>> >>> Just to make sure I understand this, max_tx_vq is part of the >>> virtio_net_rss_config, how would Qemu behave when it receives this >>> from guest? >>> >>> """ >>> A driver sets max_tx_vq to inform a device how many transmit >>> virtqueues it may use (transmitq1…transmitq max_tx_vq). >>> """ >> >> It does nothing. > > Nope, see: > > commit 50bfcaedd78e53135ec0504302269b3b65bf1eff > Author: Philo Lu <lulie@linux.alibaba.com> > Date: Mon Nov 4 16:57:06 2024 +0800 > > virtio_net: Update rss when set queue > > RSS configuration should be updated with queue number. In particular, it > should be updated when (1) rss enabled and (2) default rss configuration > is used without user modification. > > During rss command processing, device updates queue_pairs using > rss.max_tx_vq. That is, the device updates queue_pairs together with > rss, so we can skip the sperate queue_pairs update > (VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET below) and return directly. > > Also remove the `vi->has_rss ?` check when setting vi->rss.max_tx_vq, > because this is not used in the other hash_report case. 
> > Fixes: c7114b1249fa ("drivers/net/virtio_net: Added basic RSS support.") > Signed-off-by: Philo Lu <lulie@linux.alibaba.com> > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com> > Acked-by: Michael S. Tsirkin <mst@redhat.com> > Signed-off-by: Paolo Abeni <pabeni@redhat.com> > > RSS doesn't depend on MQ. > > If it is not handled by Qemu, it should be a bug? I was wrong; QEMU does handle this field, but it doesn't use the definition of struct virtio_net_rss_config and name it queue_pairs instead of max_tx_vq so I could not find it by grep. For tap, max_tx_vq is handled by changing the number of open file descriptors so passing it via an ioctl is redundant. > >> >>> >>>> >>>>> >>>>>> >>>>>>> >>>>>>> What's more, we've already had virito-net uAPI. Why not simply reusing them? >>>>>> >>>>>> See the above. >>>>>> >>>>>>> >>>>>>>> >>>>>>>>> >>>>>>>>>> RSS and hash reporting must share >>>>>>>>>> this parameter when both are enabled at the same time; otherwise RSS may >>>>>>>>>> compute hash values that are not suited for hash reporting. >>>>>>>>> >>>>>>>>> Is this mandated by the spec? If yes, we can add a check. If not, >>>>>>>>> userspace risk themselves as a mis-configuration which we don't need >>>>>>>>> to bother. >>>>>>>> >>>>>>>> Yes, it is mandated. 5.1.6.4.3 Hash calculation for incoming packets says: >>>>>>>> > A device attempts to calculate a per-packet hash in the following >>>>>>>> > cases: >>>>>>>> > >>>>>>>> > - The feature VIRTIO_NET_F_RSS was negotiated. The device uses the >>>>>>>> > hash to determine the receive virtqueue to place incoming packets. >>>>>>>> > - The feature VIRTIO_NET_F_HASH_REPORT was negotiated. The device >>>>>>>> > reports the hash value and the hash type with the packet. >>>>>>>> > >>>>>>>> > If the feature VIRTIO_NET_F_RSS was negotiated: >>>>>>>> > >>>>>>>> > - The device uses hash_types of the virtio_net_rss_config structure >>>>>>>> > as ’Enabled hash types’ bitmask. 
>>>>>>>> > - The device uses a key as defined in hash_key_data and >>>>>>>> hash_key_length of the virtio_net_rss_config structure (see >>>>>>>> > 5.1.6.5.7.1). >>>>>>>> > >>>>>>>> > If the feature VIRTIO_NET_F_RSS was not negotiated: >>>>>>>> > >>>>>>>> > - The device uses hash_types of the virtio_net_hash_config structure >>>>>>>> > as ’Enabled hash types’ bitmask. >>>>>>>> > - The device uses a key as defined in hash_key_data and >>>>>>>> > hash_key_length of the virtio_net_hash_config structure (see >>>>>>>> > .1.6.5.6.4). >>>>>>>> >>>>>>>> So when both VIRTIO_NET_F_RSS and VIRTIO_NET_F_HASH_REPORT are >>>>>>>> negotiated, virtio_net_rss_config not only controls RSS but also the >>>>>>>> reported hash values and types. They cannot be divergent. >>>>>>>> >>>>>>>>> >>>>>>>>> Note that spec use different commands for hash_report and rss. >>>>>>>> >>>>>>>> TUNSETVNETHASH is different from these commands in terms that it also >>>>>>>> negotiates VIRTIO_NET_F_HASH_REPORT and VIRTIO_NET_F_RSS. >>>>>>>> >>>>>>> >>>>>>> There Are different "issues" here: >>>>>>> >>>>>>> 1) Whether or not we need to use a unified API for negotiating RSS and >>>>>>> HASH_REPORT features >>>>>>> 2) Whether or not we need to sue a unified API for setting RSS and >>>>>>> HASH_REPORT configuration >>>>>>> >>>>>>> What I want to say is point 2. But what you raise is point 1. >>>>>>> >>>>>>> For simplicity, it looks to me like it's a call for having separated >>>>>>> ioctls for feature negotiation (for example via TUNSETIFF). You may >>>>>>> argue that either RSS or HASH_REPORT requires configurations, we can >>>>>>> just follow what spec defines or not (e.g what happens if >>>>>>> RSS/HASH_REPORT were negotiated but no configurations were set). >>>>>> >>>>>> Unfortunately TUNSETIFF does not fit in this use case. The flags set >>>>>> with TUNSETIFF are fixed, but the guest can request a different feature >>>>>> set anytime by resetting the device. 
>>>>> >>>>> TUNSETIFF, enables the device to be able to handle RSS/HASREPORT. >>>>> TUNSETHASH/RSS. dealing with RSS/HASH command from userspace. >>>> >>>> We also needs to be able to disable them at runtime so that we can >>>> handle resets. >>> >>> Via TUNSETHASH/RSS? I think it should have a way to accept parameters >>> that disable RSS or hash report. >> >> That's what this patch implements. TUNSETVNETHASH accepts parameters to >> choose what features to be enabled. >> >>> >>>> >>>>> >>>>> This is the way we used to do for multi queue and vnet header. >>>>> TUNSETIFF requires CAP_NET_ADMIN, this could be an extra safe guard >>>>> for unprivileged userspace. >>>> >>>> I intend to allow using this feature without privilege. A VMM is usually >>>> unprivileged and requiring a privilege to configure tuntap is too >>>> prohibitive. >>> >>> For safety, tun is not allowed to be created by unprivileged users. >>> And it's not to configure the tuntap dynamically, it's about telling >>> the function that tuntap can have (not necessarily enabled though) . >> >> I don't think we need another barrier for the new functions. Once an >> unprivileged user get a file descriptor of tuntap from a privileged >> user, they are free to enable RSS and/or hash reporting. > > Only if such a feature is allowed by the privileged user. I don't see a reason not to allow the feature to unprivileged users. It only complicates the setup. > >> >>> >>>> >>>>> >>>>>> >>>>>> > >> In the virtio-net specification, it is not defined what would >>>>>> happen if >>>>>>>> these features are negotiated but the VIRTIO_NET_CTRL_MQ_RSS_CONFIG or >>>>>>>> VIRTIO_NET_CTRL_MQ_HASH_CONFIG commands are not sent. There is no such >>>>>>>> ambiguity with TUNSETVNETHASH. >>>>>>> >>>>>>> So I don't see advantages of unifying hash reports and rss into a >>>>>>> single ioctl. Let's just follow what has been done in the spec that >>>>>>> uses separated commands. 
Tuntap is not a good place to debate whether >>>>>>> those commands could be unified or not. We need to move it to the spec >>>>>>> but assuming spec has been done, it might be too late or too few >>>>>>> advantages for having another design. >>>>>> >>>>>> It makes sense for the spec to reuse the generic feature negotiation >>>>>> mechanism, but the situation is different for tuntap; we cannot use >>>>>> TUNSETIFF and need to define another. Then why don't we exploit this >>>>>> opportunity to have an interface with well-defined semantics? >>>>> >>>>> That's perfectly fine, but it needs to be done in virtio-net's uAPI >>>>> not tun's. What's more, if you think two commands are not >>>>> well-defined, let's fix that in the virtio spec first. >>>>> >>>>>> The virtio >>>>>> spec does its best as an interface between the host and guest and tuntap >>>>>> does its best as an UAPI. >>>>> >>>>> See above, let's fix the uAPI first. We don't want DPDK to use tun's >>>>> uAPI for RSS >>>> >>>> virtio-net's UAPI is for the virtio spec which has a capable generic >>>> feature negotiation mechanism. tuntap needs its own feature negotiation >>>> and it's nothing to do with virtio-net's UAPI. >>> >>> Well, I don't mean the part of the feature negotiation. I mean the >>> part for rss and hash report configuration. >> >> The feature negotiation still matters when deciding the granularity of >> ioctls. We need one ioctl for a feature negotiation, and to avoid having >> an intermediate state, > > I don't understand this. For example, driver can choose to > > 1) negotiate RSS > 2) do something else. > 3) configure RSS > > Spec doesn't require those two to be configured at the same time, so > "intermediate state" is allowed. The spec doesn't define what should happen in the intermediate state either. For a hardware implementation I think it's fine whatever the implementation defines as the intermediate state. 
But for the UAPI, it's better to avoid having such a definition to keep the interface minimal and maximize the UAPI stability. > >> the ioctl should also do the configuration. Hence >> that one ioctl should do all of the feature negotiation and configuration. >> >>> >>>> >>>> The structures for two commands have unused or redundant fields and a >>>> flexible array in the middle of the structure, but they are ABIs so we >>>> can't change it. >>>> >>>> DPDK is another reason to define tuntap's own UAPIs. They don't care about >>>> unused or redundant fields and a flexible array in the middle that are >>>> present in the virtio spec. It will also not want to deal with the >>>> requirement of little endian. Constructing struct virtio_net_rss_config >>>> is an extra burden for DPDK. >>> >>> I meant for vhost-user implementation in DPDK, it needs to use >>> virtio-net uAPI not tuntap's for example. >> >> The vhost-user implementation will use tuntap's UAPIs for its ethernet >> device backend. > > That sounds pretty weird, vhost-user has nothing related to tuntap. My wording in the last email was unclear. More precisely, the ethernet backend of tuntap will use the UAPIs, and the vhost-user will use the ethernet backend in turn. > >> It uses the generic interface of ethernet device so for >> RSS it will use functions like rte_eth_dev_rss_hash_update() for >> example. tuntap's UAPIs are more suited to implement these interfaces as >> they operate in native endian and don't have extra fields. > > Nope, for example it needs to use le for virtio_net_hdr if a modern > device is used. But it needs a "native" endian according to the guest > endian via TUNSETVNETLE/BE. We don't have a choice as virtio-net hdr > support in tuntap is much earlier than modern devices. > > Let's not do the same thing (native endian) for tuntap as RSS > depends on modern, so we know it must be le. virtio_net_hdr is the data path while the current discussion is about the control path.
All configuration knobs of tuntap operate in native endian. So I think we should stick to the little endian for the data path while we should stick to the native endian for the control path to maximize the consistency. > > >> >> DPDK applications other than vhost-user also matter; they do not care >> what virtio does at all. >> >> > >> >>>> On the other hand, constructing tuntap-specific structures is not that >>>> complicated for VMMs. >>> >>> Not complicated but redundant. >>> >>>> A VMM will need to inspect struct >>>> virtio_net_rss_config anyway to handle migration and check its size so >>>> it can store the values it inspected to struct tun_vnet_hash and struct >>>> tun_vnet_hash_rss and pass them to the kernel. >>> >>> I don't see how rss and hash reports differ from what we have now. >>> Those inspections must be done anyhow for compatibility for example >>> the check of offloading features. Such steps could not be eliminated >>> no matter how we design the uAPI. >> >> I explained the difference between the virtio and tuntap UAPIs, not >> between RSS and hash reporting. > > See above. > >> >>> >>>> >>>> The overall userspace implementation will be simpler by having >>>> structures specifically tailored for the communication between the >>>> userspace and kernel. >>> >>> This is exactly how a good uAPI should behave. If uAPI in virtio-net >>> can't do this, I don't understand why uAPI in tuntap can solve it. >> >> The UAPI in virtio-net cannot do it because it's already fixed and it >> also needs to perform endian conversion for the VM use case. tuntap >> doesn't have these restrictions. > > Same here. > >> >>> >>>> >>>>> >>>>>> >>>>>> I don't think there is an advantage to split ioctls to follow the spec >>>>>> after all. It makes sense if we can pass-through virtio commands to >>>>>> tuntap, but it is not possible as ioctl operation codes are different >>>>>> from virtio commands. >>>>> >>>>> I don't see a connection with the operation code.
For example, we can >>>>> add new uAPIs in virtio-net which could be something like: >>>>> >>>>> struct virtio_net_rss_config_header { >>>>> __le32 hash_types; >>>>> __le16 indirection_table_mask; >>>>> __le16 unclassified_queue; >>>>> __le16 indirection_table[]; >>>>> } >>>>> >>>>> struct virtio_net_rss_config_trailer { >>>>> __le16 max_tx_vq; >>>>> u8 hash_key_length; >>>>> u8 hash_key_data[]; >>>>> } >>>>> >>>>> These two are used by TUNSETVNETRSS. And simply reuse the >>>>> virtio_net_hash_config for TUNSETVNETHASH. >>>>> >>>>> With this, we can tweak the virtio-net driver with this new uAPI. Then >>>>> tap* can reuse this. >>>> >>>> I implemented a UAPI and driver change accordingly: >>>> https://lore.kernel.org/r/20250318-virtio-v1-0-344caf336ddd@daynix.com >>>> >>>> This is a nice improvement for the driver, but I still don't think it is >>>> suited for the UAPI of tuntap. >>> >>> Any reason for this? It should work like virtio_net_hdr. >>> >>>> The requirements of extra fields and >>>> little endian cannot be removed from the virtio spec but they are >>>> irrelevant for tuntap. >>> >>> I don't understand this part. What fields are "extra" and need to be >>> removed from the spec? >> >> All fields not included in struct tun_vnet_hash and struct >> tun_vnet_hash_rss. Namely, for struct virtio_net_hash_config: >> - reserved >> - hash_key_length >> - hash_key_data >> >> For struct virtio_net_rss_config: >> - max_tx_vq >> - hash_key_length > > See my above reply, and I basically meant > > TUNSETVNETHASH accept struct virtio_net_hash_config; > TUNSETVNETRSS accept struct virtio_net_rss_config_hdr + struct > virtio_net_rss_config_trailer; That still brings in the extra fields I noted in the last email. Regards, Akihiko Odaki > > Thanks > >> >> Regards, >> Akihiko Odaki >> >>> >>>> >>>>> >>>>>> The best possibility is to share structures, not >>>>>> commands, and I don't think even sharing structures makes sense here >>>>>> because of the reasons described above.
>>>>> >>>>> I don't want to share structures, I meant starting from something that >>>>> is simple and has been sorted in the virtio spec. Optimization could >>>>> be done on top. >>>> >>>> I meant to reuse the structures in virtio_net.h. >>>> >>>> Regards, >>>> Akihiko Odaki >>> >>> Thanks >>> >>>> >>>>> >>>>> Thanks >>>>> >>>>> >>>>>> >>>>>> Regards, >>>>>> Akihiko Odaki >>>>>> >>>>>>> >>>>>>> Thanks >>>>>>> >>>>>>>> >>>>>>>> Regards, >>>>>>>> Akihiko Odaki >>>>>>>> >>>>>>>>> >>>>>>>>>> >>>>>>>>>> The parameter will be duplicated if we have separate ioctls for RSS and >>>>>>>>>> hash reporting, and the kernel will have a chicken-and-egg problem when >>>>>>>>>> ensuring they are synchronized; when the ioctl for RSS is issued, should >>>>>>>>>> the kernel ensure the "types" parameter is identical with one specified >>>>>>>>>> for hash reporting? It will not work if the userspace may decide to >>>>>>>>>> configure hash reporting after RSS. >>>>>>>>>> >>>>>>>>> >>>>>>>>> See my reply above. >>>>>>>>> >>>>>>>>> Thanks >>>>>>>>> >>>>>>>> >>>>>>> >>>>>> >>>>> >>>> >>> >> >
On Thu, Mar 20, 2025 at 1:33 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > > On 2025/03/20 10:31, Jason Wang wrote: > > On Wed, Mar 19, 2025 at 1:29 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >> > >> On 2025/03/19 9:58, Jason Wang wrote: > >>> On Tue, Mar 18, 2025 at 6:10 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>> > >>>> On 2025/03/18 9:15, Jason Wang wrote: > >>>>> On Mon, Mar 17, 2025 at 3:07 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>>>> > >>>>>> On 2025/03/17 10:12, Jason Wang wrote: > >>>>>>> On Wed, Mar 12, 2025 at 1:03 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>>>>>> > >>>>>>>> On 2025/03/12 11:35, Jason Wang wrote: > >>>>>>>>> On Tue, Mar 11, 2025 at 2:11 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>>>>>>>> > >>>>>>>>>> On 2025/03/11 9:38, Jason Wang wrote: > >>>>>>>>>>> On Mon, Mar 10, 2025 at 3:45 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>>>>>>>>>> > >>>>>>>>>>>> On 2025/03/10 12:55, Jason Wang wrote: > >>>>>>>>>>>>> On Fri, Mar 7, 2025 at 7:01 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>>>>>>>>>>>> > >>>>>>>>>>>>>> Hash reporting > >>>>>>>>>>>>>> ============== > >>>>>>>>>>>>>> > >>>>>>>>>>>>>> Allow the guest to reuse the hash value to make receive steering > >>>>>>>>>>>>>> consistent between the host and guest, and to save hash computation. > >>>>>>>>>>>>>> > >>>>>>>>>>>>>> RSS > >>>>>>>>>>>>>> === > >>>>>>>>>>>>>> > >>>>>>>>>>>>>> RSS is a receive steering algorithm that can be negotiated to use with > >>>>>>>>>>>>>> virtio_net. Conventionally the hash calculation was done by the VMM. > >>>>>>>>>>>>>> However, computing the hash after the queue was chosen defeats the > >>>>>>>>>>>>>> purpose of RSS. > >>>>>>>>>>>>>> > >>>>>>>>>>>>>> Another approach is to use eBPF steering program. This approach has > >>>>>>>>>>>>>> another downside: it cannot report the calculated hash due to the > >>>>>>>>>>>>>> restrictive nature of eBPF steering program. 
> >>>>>>>>>>>>>> > >>>>>>>>>>>>>> Introduce the code to perform RSS to the kernel in order to overcome > >>>>>>>>>>>>>> thse challenges. An alternative solution is to extend the eBPF steering > >>>>>>>>>>>>>> program so that it will be able to report to the userspace, but I didn't > >>>>>>>>>>>>>> opt for it because extending the current mechanism of eBPF steering > >>>>>>>>>>>>>> program as is because it relies on legacy context rewriting, and > >>>>>>>>>>>>>> introducing kfunc-based eBPF will result in non-UAPI dependency while > >>>>>>>>>>>>>> the other relevant virtualization APIs such as KVM and vhost_net are > >>>>>>>>>>>>>> UAPIs. > >>>>>>>>>>>>>> > >>>>>>>>>>>>>> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com> > >>>>>>>>>>>>>> Tested-by: Lei Yang <leiyang@redhat.com> > >>>>>>>>>>>>>> --- > >>>>>>>>>>>>>> Documentation/networking/tuntap.rst | 7 ++ > >>>>>>>>>>>>>> drivers/net/Kconfig | 1 + > >>>>>>>>>>>>>> drivers/net/tap.c | 68 ++++++++++++++- > >>>>>>>>>>>>>> drivers/net/tun.c | 98 +++++++++++++++++----- > >>>>>>>>>>>>>> drivers/net/tun_vnet.h | 159 ++++++++++++++++++++++++++++++++++-- > >>>>>>>>>>>>>> include/linux/if_tap.h | 2 + > >>>>>>>>>>>>>> include/linux/skbuff.h | 3 + > >>>>>>>>>>>>>> include/uapi/linux/if_tun.h | 75 +++++++++++++++++ > >>>>>>>>>>>>>> net/core/skbuff.c | 4 + > >>>>>>>>>>>>>> 9 files changed, 386 insertions(+), 31 deletions(-) > >>>>>>>>>>>>>> > >>>>>>>>>>>>>> diff --git a/Documentation/networking/tuntap.rst b/Documentation/networking/tuntap.rst > >>>>>>>>>>>>>> index 4d7087f727be5e37dfbf5066a9e9c872cc98898d..86b4ae8caa8ad062c1e558920be42ce0d4217465 100644 > >>>>>>>>>>>>>> --- a/Documentation/networking/tuntap.rst > >>>>>>>>>>>>>> +++ b/Documentation/networking/tuntap.rst > >>>>>>>>>>>>>> @@ -206,6 +206,13 @@ enable is true we enable it, otherwise we disable it:: > >>>>>>>>>>>>>> return ioctl(fd, TUNSETQUEUE, (void *)&ifr); > >>>>>>>>>>>>>> } > >>>>>>>>>>>>>> > >>>>>>>>> > >>>>>>>>> [...] 
> >>>>>>>>> > >>>>>>>>>>>>>> +static inline long tun_vnet_ioctl_sethash(struct tun_vnet_hash_container __rcu **hashp, > >>>>>>>>>>>>>> + bool can_rss, void __user *argp) > >>>>>>>>>>>>> > >>>>>>>>>>>>> So again, can_rss seems to be tricky. Looking at its caller, it tires > >>>>>>>>>>>>> to make eBPF and RSS mutually exclusive. I still don't understand why > >>>>>>>>>>>>> we need this. Allow eBPF program to override some of the path seems to > >>>>>>>>>>>>> be common practice. > >>>>>>>>>>>>> > >>>>>>>>>>>>> What's more, we didn't try (or even can't) to make automq and eBPF to > >>>>>>>>>>>>> be mutually exclusive. So I still didn't see what we gain from this > >>>>>>>>>>>>> and it complicates the codes and may lead to ambiguous uAPI/behaviour. > >>>>>>>>>>>> > >>>>>>>>>>>> automq and eBPF are mutually exclusive; automq is disabled when an eBPF > >>>>>>>>>>>> steering program is set so I followed the example here. > >>>>>>>>>>> > >>>>>>>>>>> I meant from the view of uAPI, the kernel doesn't or can't reject eBPF > >>>>>>>>>>> while using automq. > >>>>>>>>>> > >> > >>>>>>>>>>>> We don't even have an interface for eBPF to let it fall back to another > >>>>>>>>>>>> alogirhtm. > >>>>>>>>>>> > >>>>>>>>>>> It doesn't even need this, e.g XDP overrides the default receiving path. > >>>>>>>>>>> > >>>>>>>>>>>> I could make it fall back to RSS if the eBPF steeering > >>>>>>>>>>>> program is designed to fall back to automq when it returns e.g., -1. But > >>>>>>>>>>>> such an interface is currently not defined and defining one is out of > >>>>>>>>>>>> scope of this patch series. > >>>>>>>>>>> > >>>>>>>>>>> Just to make sure we are on the same page, I meant we just need to > >>>>>>>>>>> make the behaviour consistent: allow eBPF to override the behaviour of > >>>>>>>>>>> both automq and rss. > >>>>>>>>>> > >>>>>>>>>> That assumes eBPF takes precedence over RSS, which is not obvious to me. > >>>>>>>>> > >>>>>>>>> Well, it's kind of obvious. 
Not speaking the eBPF selector, we have > >>>>>>>>> other eBPF stuffs like skbedit etc. > >>>>>>>>> > >>>>>>>>>> > >>>>>>>>>> Let's add an interface for the eBPF steering program to fall back to > >>>>>>>>>> another steering algorithm. I said it is out of scope before, but it > >>>>>>>>>> makes clear that the eBPF steering program takes precedence over other > >>>>>>>>>> algorithms and allows us to delete the code for the configuration > >>>>>>>>>> validation in this patch. > >>>>>>>>> > >>>>>>>>> Fallback is out of scope but it's not what I meant. > >>>>>>>>> > >>>>>>>>> I meant in the current uAPI take eBPF precedence over automq. It's > >>>>>>>>> much more simpler to stick this precedence unless we see obvious > >>>>>>>>> advanatge. > >>>>>>>> > >>>>>>>> We still have three different design options that preserve the current > >>>>>>>> precedence: > >>>>>>>> > >>>>>>>> 1) Precedence order: eBPF -> RSS -> automq > >>>>>>>> 2) Precedence order: RSS -> eBPF -> automq > >>>>>>>> 3) Precedence order: eBPF OR RSS -> automq where eBPF and RSS are > >>>>>>>> mutually exclusive > >>>>>>>> > >>>>>>>> I think this is a unique situation for this steering program and I could > >>>>>>>> not find another example in other eBPF stuffs. > >>>>>>> > >>>>>>> As described above, queue mapping could be overridden by tc-ebpf. So > >>>>>>> there's no way to guarantee the RSS will work: > >>>>>>> > >>>>>>> https://github.com/DPDK/dpdk/blob/main/drivers/net/tap/bpf/tap_rss.c#L262 > >>>>>>> > >>>>>>> Making eBPF first leaves a chance for the management layer to override > >>>>>>> the choice of Qemu. > >>>>>> > >>>>>> I referred to the eBPF steering program instead of tc-ebpf. tc-ebpf is > >>>>>> nothing to do with the TUNSETSTEERINGEBPF ioctl, which this patch changes. > >>>>> > >>>>> I meant you can't do "full control" in any case, the point below > >>>>> doesn't stand. Queue mapping could be restored even if RSS is set. 
> >>>> > >>>> What matters here is how we handle the control when tc didn't take it. > >>>> eBPF, RSS, or automq make take all of it; I referred that as "full control". > >>>> > >>>>> > >>>>>> > >>>>>>> > >>>>>>>> > >>>>>>>> The current version implements 3) because it is not obvious whether we > >>>>>>>> should choose either 1) or 2). > >>>>>>> > >>>>>>> But you didn't explain why you choose 3), and it leads to tricky code > >>>>>>> (e.g the can_rss stuff etc). > >>>>>> > >>>>>> I wrote: "because it is not obvious whether we should choose either 1) > >>>>>> or 2)", but I think I can explain it better: > >>>>>> > >>>>>> When an eBPF steering program cannot implement a fallback, it means the > >>>>>> eBPF steering program requests the full control over the steering. On > >>>>>> the other hand, RSS also requests the same control. So these two will > >>>>>> conflict and the entity controlling the steering will be undefined when > >>>>>> both are enabled. > >>>>> > >>>>> Well, the fallback is orthogonal to the proposal here. We haven't had > >>>>> that since the introduction of the eBPF steering program. This means > >>>>> automq has been in "conflict" with eBPF for years. Again, another > >>>>> advantage, allowing the eBPF program to be the first to allow the > >>>>> management layer to override Qemu's steering. > >>>> > >>>> What if a VMM uses eBPF steering program and the management layer > >>>> decides to override it with RSS? > >>> > >>> That's possible but I think we're seeking which approach is better. In > >>> this case, RSS could be implemented in eBPF but not the reverse. > >>> > >>> So my point is to start from something that is simpler. Simply allow > >>> eBPF on top of RSS as automq. And optimize on top. > >> > >> > >> The in-kernel RSS implementation is more optimized and capable of hash > >> reporting. I don't think either eBPF steering program or in-kernel RSS > >> is more capable than the other and there is a reason to place eBPF on > >> top of RSS. 
> >> > >>> > >>>> > >>>> eBPF is obviously predecedent to automq as eBPF is an opt-in feature and > >>>> automq is the implicit default. But this logic cannot be applied to > >>>> decide the order of eBPF and RSS because they are both opt-in features. > >>> > >>> This is from the perspective of kernel development. But let's try to > >>> think from the userspace: A well written user space knows what it > >>> does, rejecting eBPF while RSS is set doesn't help. But anyhow if you > >>> stick, it doesn't harm. > >> > >> Yes, it not for the current userspace but for the future kernel > >> development; the kernel can reserve the freedom to decide the priority > >> of eBPF and RSS by rejecting eBPF while RSS. > >> > >>> > >>>> > >>>>> > >>>>>> > >>>>>> 3) eliminates the undefined semantics by rejecting to enable both. > >>>>> > >>>>> This would lead a usersapce noticeable change of the behaviour? And > >>>>> what do you mean by "rejecting to enable both"? > >>>> > >>>> Existing userspace code should see no change as it only cares the case > >>>> where RSS is enabled. > >>>> > >>>> Here, rejecting to enable both means to deny setting an eBPF steering > >>>> program when RSS is enabled, and visa-versa. > >>>> > >>>>> > >>>>>> An > >>>>>> alternative approach is to allow eBPF steering programs to fall back. > >>>>>> When both the eBPF program and RSS are enabled, RSS will gain the > >>>>>> control of steering under the well-defined situation where the eBPF > >>>>>> steering program decides to fall back. > >>>>> > >>>>> How about just stick the eBPF precedence in this proposal and > >>>>> introduce the fallback on top? This helps to speed up the iteration > >>>>> (as the version has been iterated to 11). > >>>> > >>>> I don't think that helps much since we have another ongoing discussion > >>>> below and it is not the sole roadblock. > >>>> > >>>>> > >>>>>> > >>>>>>> > >>>>>>>> But 1) will be the most capable option if > >>>>>>>> eBPF has a fall-back feature. 
> >>>>>>>> > >>>>>>>>> > >>>>>>>>>> > >>>>>>>>>>> > >>>>>>>>>>>> > >>>>>>>>>>>>> > >>>>>>>>> > >>>>>>>>> [...] > >>>>>>>>> > >>>>>>>>>>>>> Is there a chance that we can reach here without TUN_VNET_HASH_REPORT? > >>>>>>>>>>>>> If yes, it should be a bug. > >>>>>>>>>>>> > >>>>>>>>>>>> It is possible to use RSS without TUN_VNET_HASH_REPORT. > >>>>>>>>>>> > >>>>>>>>>>> Another call to separate the ioctls then. > >>>>>>>>>> > >>>>>>>>>> RSS and hash reporting are not completely independent though. > >>>>>>>>> > >>>>>>>>> Spec said: > >>>>>>>>> > >>>>>>>>> """ > >>>>>>>>> VIRTIO_NET_F_RSSRequires VIRTIO_NET_F_CTRL_VQ. > >>>>>>>>> """ > >>>>>>>> > >>>>>>>> I meant the features can be enabled independently, but they will share > >>>>>>>> the hash type set when they are enabled at the same time. > >>>>>>> > >>>>>>> Looking at the spec: > >>>>>>> > >>>>>>> Hash repot uses: > >>>>>>> > >>>>>>> """ > >>>>>>> struct virtio_net_hash_config { > >>>>>>> le32 hash_types; > >>>>>>> le16 reserved[4]; > >>>>>>> u8 hash_key_length; > >>>>>>> u8 hash_key_data[hash_key_length]; > >>>>>>> }; > >>>>>>> """ > >>>>>>> > >>>>>>> RSS uses > >>>>>>> > >>>>>>> """ > >>>>>>> struct rss_rq_id { > >>>>>>> le16 vq_index_1_16: 15; /* Bits 1 to 16 of the virtqueue index */ > >>>>>>> le16 reserved: 1; /* Set to zero */ > >>>>>>> }; > >>>>>>> > >>>>>>> struct virtio_net_rss_config { > >>>>>>> le32 hash_types; > >>>>>>> le16 indirection_table_mask; > >>>>>>> struct rss_rq_id unclassified_queue; > >>>>>>> struct rss_rq_id indirection_table[indirection_table_length]; > >>>>>>> le16 max_tx_vq; > >>>>>>> u8 hash_key_length; > >>>>>>> u8 hash_key_data[hash_key_length]; > >>>>>>> }; > >>>>>>> """ > >>>>>>> > >>>>>>> Instead of trying to figure out whether we can share some data > >>>>>>> structures, why not simply start from what has been done in the spec? > >>>>>>> This would ease the usersapce as well where it can simply do 1:1 > >>>>>>> mapping between ctrl vq command and tun uAPI. 
> >>>>>> > >>>>>> The spec also defines struct virtio_net_hash_config (which will be used > >>>>>> when RSS is disabled) and struct virtio_net_rss_config to match the > >>>>>> layout to share some fields. However, the UAPI does not follow the > >>>>>> interface design of virtio due to some problems with these structures. > >>>>> > >>>>> Copy-paste error. The above is copied from the virtio spec, but I > >>>>> meant the existing uAPI in virtio_net.h: > >>>>> > >>>>> /* > >>>>> * The command VIRTIO_NET_CTRL_MQ_RSS_CONFIG has the same effect as > >>>>> * VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET does and additionally configures > >>>>> * the receive steering to use a hash calculated for incoming packet > >>>>> * to decide on receive virtqueue to place the packet. The command > >>>>> * also provides parameters to calculate a hash and receive virtqueue. > >>>>> */ > >>>>> struct virtio_net_rss_config { > >>>>> __le32 hash_types; > >>>>> __le16 indirection_table_mask; > >>>>> __le16 unclassified_queue; > >>>>> __le16 indirection_table[1/* + indirection_table_mask */]; > >>>>> __le16 max_tx_vq; > >>>>> __u8 hash_key_length; > >>>>> __u8 hash_key_data[/* hash_key_length */]; > >>>>> }; > >>>>>> #define VIRTIO_NET_CTRL_MQ_RSS_CONFIG 1 > >>>>> > >>>>> /* > >>>>> * The command VIRTIO_NET_CTRL_MQ_HASH_CONFIG requests the device > >>>>> * to include in the virtio header of the packet the value of the > >>>>> * calculated hash and the report type of hash. It also provides > >>>>> * parameters for hash calculation. The command requires feature > >>>>> * VIRTIO_NET_F_HASH_REPORT to be negotiated to extend the > >>>>> * layout of virtio header as defined in virtio_net_hdr_v1_hash. 
> >>>>> */ > >>>>> struct virtio_net_hash_config { > >>>>> __le32 hash_types; > >>>>> /* for compatibility with virtio_net_rss_config */ > >>>>> __le16 reserved[4]; > >>>>> __u8 hash_key_length; > >>>>> __u8 hash_key_data[/* hash_key_length */]; > >>>>> }; > >>>>> > >>>>> This has been used by Qemu but I see a virtio-net version of: > >>>>> > >>>>> struct virtio_net_ctrl_rss { > >>>>> u32 hash_types; > >>>>> u16 indirection_table_mask; > >>>>> u16 unclassified_queue; > >>>>> u16 hash_cfg_reserved; /* for HASH_CONFIG (see > >>>>> virtio_net_hash_config for details) */ > >>>>> u16 max_tx_vq; > >>>>> u8 hash_key_length; > >>>>> u8 key[VIRTIO_NET_RSS_MAX_KEY_SIZE]; > >>>>> > >>>>> u16 *indirection_table; > >>>>> }; > >>>>> > >>>>> This is ugly and results in a tricky code when trying to submit > >>>>> RSS/HASH commands to the device: > >>>>> > >>>>> if (vi->has_rss) { > >>>>> sg_buf_size = sizeof(uint16_t) * vi->rss_indir_table_size; > >>>>> sg_set_buf(&sgs[1], vi->rss.indirection_table, sg_buf_size); > >>>>> } else { > >>>>> sg_set_buf(&sgs[1], &vi->rss.hash_cfg_reserved, > >>>>> sizeof(uint16_t)); > >>>>> } > >>>> > >>>> The only reference to struct virtio_net_rss_config in QEMU is to derive > >>>> the offset of indirection_table. This is because the definition in > >>>> virtio_net.h also includes indirection_table in the middle and the > >>>> offsets of later part are unusable. > >>> > >>> Yes. > >>> > >>>> > >>>> QEMU internally has a structure named VirtioNetRssData which just looks > >>>> like struct virtio_net_ctrl_rss. > >>> > >>> It's a pity that it doesn't use uAPI. We might need to fix them. > >> > >> It doesn't want to use the UAPI structures for the internal storage > >> because it wants to store them in native endians and QEMU is not > >> interested in some fields in the UAPI structures. struct tun_vnet_hash > >> and struct tun_vnet_hash_rss are easy to fill using VirtioNetRssData. 
> >> > >>> > >>>> > >>>>> > >>>>>> > >>>>>> Below is the definition of struct virtio_net_hash_config: > >>>>>> > >>>>>> struct virtio_net_hash_config { > >>>>>> le32 hash_types; > >>>>>> le16 reserved[4]; > >>>>>> u8 hash_key_length; > >>>>>> u8 hash_key_data[hash_key_length]; > >>>>>> }; > >>>>>> > >>>>>> Here, hash_types, hash_key_length, and hash_key_data are shared with > >>>>>> struct virtio_net_rss_config. > >>>>>> > >>>>>> One problem is that struct virtio_net_rss_config has a flexible array > >>>>>> (indirection_table) between hash_types and hash_key_length. This is > >>>>>> something we cannot express with C. > >>>>> > >>>>> We can split the virtio_net_rss_config to ease the dealing with > >>>>> arrays, more below. > >>>>> > >>>>>> > >>>>>> Another problem is that the semantics of the key in struct > >>>>>> virtio_net_hash_config is not defined in the spec. > >>>>> > >>>>> If this is the case. Let's fix that in the spec first to make sure our > >>>>> uAPI aligns with spec without ambiguity. It would be a nightmare to > >>>>> deal with the in-consistency between virtio spec and Linux uAPIs. > >>>> > >>>> The userspace doesn't need to do anything to deal with inconsistency > >>>> since these fields are unused. > >>>> > >>>>> > >>>>>> > >>>>>> To solve these problems, I defined the UAPI structures that do not > >>>>>> include indiretion_table. > >>>>>> > >>>>>>> > >>>>>>>> > >>>>>>>>> > >>>>>>>>>> > >>>>>>>>>> A plot twist is the "types" parameter; it is a parameter that is > >>>>>>>>>> "common" for RSS and hash reporting. > >>>>>>>>> > >>>>>>>>> So we can share part of the structure through the uAPI. > >>>>>>>> > >>>>>>>> Isn't that what this patch does? > >>>>>>> > >>>>>>> I didn't see, basically I see only one TUNSETVNETHASH that is used to > >>>>>>> set both hash report and rss: > >>>>>> > >>>>>> The UAPI shares struct tun_vnet_hash for both hash report and rss. 
> >>>>> > >>>>> I meant sharing structure in two ioctls instead of reusing a specific > >>>>> structure for two semantics in one ioctl if possible. Though I don't > >>>>> think we need any sharing. > >>>> > >>>> The UAPI implemented in this patch already shares struct tun_vnet_hash > >>>> and having two ioctls doesn't change that. > >>>> > >>>>> > >>>>>> > >>>>>>> > >>>>>>> """ > >>>>>>> +/** > >>>>>>> + * define TUNSETVNETHASH - ioctl to configure virtio_net hashing > >>>>>>> + * > >>>>>>> + * The argument is a pointer to &struct tun_vnet_hash. > >>>>>>> + * > >>>>>>> + * The argument is a pointer to the compound of the following in order if > >>>>>>> + * %TUN_VNET_HASH_RSS is set: > >>>>>>> + * > >>>>>>> + * 1. &struct tun_vnet_hash > >>>>>>> + * 2. &struct tun_vnet_hash_rss > >>>>>>> + * 3. Indirection table > >>>>>>> + * 4. Key > >>>>>>> + * > >>>>>>> """ > >>>>>>> > >>>>>>> And it seems to lack parameters like max_tx_vq. > >>>>>> > >>>>>> max_tx_vq is not relevant with hashing. > >>>>> > >>>>> It is needed for RSS and we don't have that, no? > >>>> > >>>> No. RSS is Receive Side Scaling but it's not about receiving. > >>> > >>> Just to make sure I understand this, max_tx_vq is part of the > >>> virtio_net_rss_config, how would Qemu behave when it receives this > >>> from guest? > >>> > >>> """ > >>> A driver sets max_tx_vq to inform a device how many transmit > >>> virtqueues it may use (transmitq1…transmitq max_tx_vq). > >>> """ > >> > >> It does nothing. > > > > Nope, see: > > > > commit 50bfcaedd78e53135ec0504302269b3b65bf1eff > > Author: Philo Lu <lulie@linux.alibaba.com> > > Date: Mon Nov 4 16:57:06 2024 +0800 > > > > virtio_net: Update rss when set queue > > > > RSS configuration should be updated with queue number. In particular, it > > should be updated when (1) rss enabled and (2) default rss configuration > > is used without user modification. > > > > During rss command processing, device updates queue_pairs using > > rss.max_tx_vq. 
That is, the device updates queue_pairs together with > > rss, so we can skip the sperate queue_pairs update > > (VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET below) and return directly. > > > > Also remove the `vi->has_rss ?` check when setting vi->rss.max_tx_vq, > > because this is not used in the other hash_report case. > > > > Fixes: c7114b1249fa ("drivers/net/virtio_net: Added basic RSS support.") > > Signed-off-by: Philo Lu <lulie@linux.alibaba.com> > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com> > > Acked-by: Michael S. Tsirkin <mst@redhat.com> > > Signed-off-by: Paolo Abeni <pabeni@redhat.com> > > > > RSS doesn't depend on MQ. > > > > If it is not handled by Qemu, it should be a bug? > > I was wrong; QEMU does handle this field, but it doesn't use the > definition of struct virtio_net_rss_config and name it queue_pairs > instead of max_tx_vq so I could not find it by grep. Yes, another side effect is that uAPI is not even used there... > > For tap, max_tx_vq is handled by changing the number of open file > descriptors so passing it via an ioctl is redundant. See my reply below. > > > > >> > >>> > >>>> > >>>>> > >>>>>> > >>>>>>> > >>>>>>> What's more, we've already had virito-net uAPI. Why not simply reusing them? > >>>>>> > >>>>>> See the above. > >>>>>> > >>>>>>> > >>>>>>>> > >>>>>>>>> > >>>>>>>>>> RSS and hash reporting must share > >>>>>>>>>> this parameter when both are enabled at the same time; otherwise RSS may > >>>>>>>>>> compute hash values that are not suited for hash reporting. > >>>>>>>>> > >>>>>>>>> Is this mandated by the spec? If yes, we can add a check. If not, > >>>>>>>>> userspace risk themselves as a mis-configuration which we don't need > >>>>>>>>> to bother. > >>>>>>>> > >>>>>>>> Yes, it is mandated. 5.1.6.4.3 Hash calculation for incoming packets says: > >>>>>>>> > A device attempts to calculate a per-packet hash in the following > >>>>>>>> > cases: > >>>>>>>> > > >>>>>>>> > - The feature VIRTIO_NET_F_RSS was negotiated. 
The device uses the > >>>>>>>> > hash to determine the receive virtqueue to place incoming packets. > >>>>>>>> > - The feature VIRTIO_NET_F_HASH_REPORT was negotiated. The device > >>>>>>>> > reports the hash value and the hash type with the packet. > >>>>>>>> > > >>>>>>>> > If the feature VIRTIO_NET_F_RSS was negotiated: > >>>>>>>> > > >>>>>>>> > - The device uses hash_types of the virtio_net_rss_config structure > >>>>>>>> > as ’Enabled hash types’ bitmask. > >>>>>>>> > - The device uses a key as defined in hash_key_data and > >>>>>>>> > hash_key_length of the virtio_net_rss_config structure (see > >>>>>>>> > 5.1.6.5.7.1). > >>>>>>>> > > >>>>>>>> > If the feature VIRTIO_NET_F_RSS was not negotiated: > >>>>>>>> > > >>>>>>>> > - The device uses hash_types of the virtio_net_hash_config structure > >>>>>>>> > as ’Enabled hash types’ bitmask. > >>>>>>>> > - The device uses a key as defined in hash_key_data and > >>>>>>>> > hash_key_length of the virtio_net_hash_config structure (see > >>>>>>>> > 5.1.6.5.6.4). > >>>>>>>> > >>>>>>>> So when both VIRTIO_NET_F_RSS and VIRTIO_NET_F_HASH_REPORT are > >>>>>>>> negotiated, virtio_net_rss_config not only controls RSS but also the > >>>>>>>> reported hash values and types. They cannot be divergent. > >>>>>>>> > >>>>>>>>> > >>>>>>>>> Note that the spec uses different commands for hash_report and rss. > >>>>>>>> > >>>>>>>> TUNSETVNETHASH is different from these commands in terms that it also > >>>>>>>> negotiates VIRTIO_NET_F_HASH_REPORT and VIRTIO_NET_F_RSS. > >>>>>>>> > >>>>>>> > >>>>>>> There are different "issues" here: > >>>>>>> > >>>>>>> 1) Whether or not we need to use a unified API for negotiating RSS and > >>>>>>> HASH_REPORT features > >>>>>>> 2) Whether or not we need to use a unified API for setting RSS and > >>>>>>> HASH_REPORT configuration > >>>>>>> > >>>>>>> What I want to say is point 2. But what you raise is point 1.
> >>>>>>> > >>>>>>> For simplicity, it looks to me like it's a call for having separated > >>>>>>> ioctls for feature negotiation (for example via TUNSETIFF). You may > >>>>>>> argue that either RSS or HASH_REPORT requires configurations, we can > >>>>>>> just follow what spec defines or not (e.g what happens if > >>>>>>> RSS/HASH_REPORT were negotiated but no configurations were set). > >>>>>> > >>>>>> Unfortunately TUNSETIFF does not fit in this use case. The flags set > >>>>>> with TUNSETIFF are fixed, but the guest can request a different feature > >>>>>> set anytime by resetting the device. > >>>>> > >>>>> TUNSETIFF, enables the device to be able to handle RSS/HASREPORT. > >>>>> TUNSETHASH/RSS. dealing with RSS/HASH command from userspace. > >>>> > >>>> We also needs to be able to disable them at runtime so that we can > >>>> handle resets. > >>> > >>> Via TUNSETHASH/RSS? I think it should have a way to accept parameters > >>> that disable RSS or hash report. > >> > >> That's what this patch implements. TUNSETVNETHASH accepts parameters to > >> choose what features to be enabled. > >> > >>> > >>>> > >>>>> > >>>>> This is the way we used to do for multi queue and vnet header. > >>>>> TUNSETIFF requires CAP_NET_ADMIN, this could be an extra safe guard > >>>>> for unprivileged userspace. > >>>> > >>>> I intend to allow using this feature without privilege. A VMM is usually > >>>> unprivileged and requiring a privilege to configure tuntap is too > >>>> prohibitive. > >>> > >>> For safety, tun is not allowed to be created by unprivileged users. > >>> And it's not to configure the tuntap dynamically, it's about telling > >>> the function that tuntap can have (not necessarily enabled though) . > >> > >> I don't think we need another barrier for the new functions. Once an > >> unprivileged user get a file descriptor of tuntap from a privileged > >> user, they are free to enable RSS and/or hash reporting. 
> > > > Only if such a feature is allowed by the privileged user. > > I don't see a reason not to allow the feature to unprivileged users. It > only complicates the setup. For safety, e.g reduce the chance for unprivileged user to explore part of the kernel codes. > > > > >> > >>> > >>>> > >>>>> > >>>>>> > >>>>>> > >> In the virtio-net specification, it is not defined what would > >>>>>> happen if > >>>>>>>> these features are negotiated but the VIRTIO_NET_CTRL_MQ_RSS_CONFIG or > >>>>>>>> VIRTIO_NET_CTRL_MQ_HASH_CONFIG commands are not sent. There is no such > >>>>>>>> ambiguity with TUNSETVNETHASH. > >>>>>>> > >>>>>>> So I don't see advantages of unifying hash reports and rss into a > >>>>>>> single ioctl. Let's just follow what has been done in the spec that > >>>>>>> uses separated commands. Tuntap is not a good place to debate whether > >>>>>>> those commands could be unified or not. We need to move it to the spec > >>>>>>> but assuming spec has been done, it might be too late or too few > >>>>>>> advantages for having another design. > >>>>>> > >>>>>> It makes sense for the spec to reuse the generic feature negotiation > >>>>>> mechanism, but the situation is different for tuntap; we cannot use > >>>>>> TUNSETIFF and need to define another. Then why don't we exploit this > >>>>>> opportunity to have an interface with well-defined semantics? > >>>>> > >>>>> That's perfectly fine, but it needs to be done in virtio-net's uAPI > >>>>> not tun's. What's more, if you think two commands are not > >>>>> well-defined, let's fix that in the virtio spec first. > >>>>> > >>>>>> The virtio > >>>>>> spec does its best as an interface between the host and guest and tuntap > >>>>>> does its best as an UAPI. > >>>>> > >>>>> See above, let's fix the uAPI first. We don't want DPDK to use tun's > >>>>> uAPI for RSS > >>>> > >>>> virtio-net's UAPI is for the virtio spec which has a capable generic > >>>> feature negotiation mechanism. 
tuntap needs its own feature negotiation > >>>> and it has nothing to do with virtio-net's UAPI. > >>> > >>> Well, I don't mean the part of the feature negotiation. I mean the > >>> part for rss and hash report configuration. > >> > >> The feature negotiation still matters when deciding the granularity of > >> ioctls. We need one ioctl for a feature negotiation, and to avoid having > >> an intermediate state, > > > > I don't understand this. For example, driver can choose to > > > > 1) negotiate RSS > > 2) do something else. > > 3) configure RSS > > > > Spec doesn't require those two to be configured at the same time, so > > "intermediate state" is allowed. > > The spec doesn't define what should happen in the intermediate state either. Yes but my point is that in the uAPI layer we don't need to care about the intermediate state. It can just work as other features, e.g. having a default state after feature negotiation is more than enough. This is the way we deal with other features like vnet header etc. > > For a hardware implementation I think it's fine whatever the > implementation defines as the intermediate state. But for the UAPI, it's > better avoiding having such a definition to keep the interface minimal > and maximize the UAPI stability. Well, even if you think there's an issue: 1) I don't see how we can avoid the intermediate state considering the guest has such a state 2) We need to "fix" virtio spec and virtio-net first, tuntap is not the right place to work around virtio specific issues > > > > >> the ioctl should also do the configuration. Hence > >> that one ioctl should do all of the feature negotiation and configuration. > >> > >>> > >>>> > >>>> The structures for two commands have unused or redundant fields and a > >>>> flexible array in the middle of the structure, but they are ABIs so we > >>>> can't change it. > >>>> > >>>> DPDK is another reason to define tuntap's own UAPIs.
They don't care > >>>> unused or redundant fields and a flexible array in middle that are > >>>> present in the virtio spec. It will also not want to deal with the > >>>> requirement of little endian. Constructing struct virtio_net_rss_config > >>>> is an extra burden for DPDK. > >>> > >>> I meant for vhost-user implementation in DPDK, it needs to use > >>> virtio-net uAPI not tuntap's for example. > >> > >> The vhost-user implementation will use tuntap's UAPIs for its ethernet > >> device backend. > > > > That sounds pretty weird, vhost-user has nothing related to tuntap. > > My expression in the last email was weird. More precisely, the ethernet > backend of tuntap will use the UAPIs, and the vhost-user will use the > ethernet backend in turn. I don't understand what "ethernet backend" means here. > > > > >> It uses the generic interface of ethernet device so for > >> RSS it will use functions like rte_eth_dev_rss_hash_update() for > >> example. tuntap's UAPIs are more suited to implement these interfaces as > >> they operate in native endian and don't have extra fields. > > > > Nope, for example it needs to use le for virtio_net_hdr if a modern > > device is used. But it needs a "native" endian according to the guest > > endian via TUNSETVNETLE/BE. We don't have a choice as virtio-net hdr > > support in tuntap is much earlier than modern devices. > > > > Let's don't do the same thing (native endian) for tuntap as RSS > > depends on modern, so we know it must be le. > > virtio_net_hdr is the data path while the current discussion is about > the control path. All configuration knobs of tuntap operates in the > native endian. Because they are not directly related to virtio specification. We don't want to duplicate virtio-net with our own version every time E.g once RSSv2 or aRFS were implemented. Or I would even introduce a single uAPI to transport possible cvq commands then we can avoid inventing new ioctls that just transport cvq commands. 
> > So I think we should stick to the little endian for the data path while > we should stick to the native endian for the control path to maximize > the consistency. I don't see a reason to differ datapath from control path. Virtio-net uAPI has been reused by tuntap for more than a decade. > > > > > > >> > >> DPDk applications other than vhost-user also matter; they do not care > >> what virtio does at all. > >> > >> > >> > >>>> On the other hand, Constructing tuntap-specific structures is not that > >>>> complicated for VMMs. > >>> > >>> Not complicated but redundant. > >>> > >>>> A VMM will need to inspect struct > >>>> virtio_net_rss_config anyway to handle migration and check its size so > >>>> it can store the values it inspected to struct tun_vnet_hash and struct > >>>> tun_vnet_hash_rss and pass them to the kernel. > >>> > >>> I don't see how rss and hash reports differ from what we have now. > >>> Those inspections must be done anyhow for compatibility for example > >>> the check of offloading features. Such steps could not be eliminated > >>> no matter how we design the uAPI. > >> > >> I explained the difference between the virtio and tuntap UAPIs, not > >> between RSS and hash reporting. > > > > See above. > > > >> > >>> > >>>> > >>>> The overall userspace implementation will be simpler by having > >>>> structures specifically tailored for the communication between the > >>>> userspace and kernel. > >>> > >>> This is exactly how a good uAPI should behave. If uAPI in virtio-net > >>> can't do this, I don't understand why uAPI in tuntap can solve it. > >> > >> The UAPI in virtio-net cannot do it because it's already fixed and it > >> also needs to perform endian conversion for the VM use case. tuntap > >> doesn't have these restrictions. > > > > Same here. > > > >> > >>> > >>>> > >>>>> > >>>>>> > >>>>>> I don't think there is an advantage to split ioctls to follow the spec > >>>>>> after all. 
It makes sense if we can pass-through virtio commands to > >>>>>> tuntap, but it is not possible as ioctl operation codes are different > >>>>>> from virtio commands. > >>>>> > >>>>> I don't see a connection with the operation code. For example, we can > >>>>> add new uAPIs in virtio-net which could be something like: > >>>>> > >>>>> struct virtio_net_rss_config_header { > >>>>> __le32 hash_types; > >>>>> __le16 indirection_table_mask; > >>>>> __le16 unclassified_queue; > >>>>> __le16 indirection_table[]; > >>>>> } > >>>>> > >>>>> struct virtio_net_rss_config_tailer { > >>>>> __le16 max_tx_vq; > >>>>> u8 hash_key_length; > >>>>> u8 hash_key_data[]; > >>>>> } > >>>>> > >>>>> These two are used by TUNSETVNETRSS. And simply reuse the > >>>>> virtio_net_hash_config for TUNSETVETHASH. > >>>> > > With this, we can tweak the virtio-net driver with this new uAPI. Then > >>>>> tap* can reuse this. > >>>> > >>>> I implemented a UAPI and driver change accordingly: > >>>> https://lore.kernel.org/r/20250318-virtio-v1-0-344caf336ddd@daynix.com > >>>> > >>>> This is a nice improvement for the driver, but I still don't think it is > >>>> suited for the UAPI of tuntap. > >>> > >>> Any reason for this? It should work like virtio_net_hdr. > >>> > >>>> The requirements of extra fields and > >>>> little endian cannot be removed from the virtio spec but they are > >>>> irrelevant for tuntap. > >>> > >>> I don't understand this part. What fields are "extra" and need to be > >>> removed from the spec? > >> > >> All fields not included in struct tun_vnet_hash and struct > >> tun_vnet_hash_rss. 
Namely, for struct virtio_net_hash_config: > >> - reserved > >> - hash_key_length > >> - hash_key_data > >> > >> For struct virtio_net_rss_config: > >> - max_tx_vq > >> - hash_key_length > > > > See my above reply, and I basically meant > > > > TUNSETVNETHASH accept struct virtio_net_hash_config; > > TUNSETVNETRSS accept struct virtio_net_rss_config_hdr + struct > > virtio_net_rss_config_trailer; > > That still brings the extra fields I noted in the last email. I don't know how to define "extra" here. Let's summarize here: Method A: 1) virtio specification uses separate commands for hash_report and rss 2) hash_report and rss don't depend on each other 3) reuse virtio-net uAPI Method B: 1) trying to define and remove the "extra" fields in tuntap, and redefine it in TUNTAP It would always be much easier to start from simply reusing the virtio-net uAPI. Method B makes both the implementation and reviewing harder, as we need to 1) revisit the design of the virtio spec, this needs to be done in the virtio community not here 2) audit the difference between virtio spec and TUN/TAP, that's why we have a very long discussion here For example, the root cause of why you think the max_tx_vq is "extra" is: 1) The spec defines VIRTIO_NET_F_RSS and VIRTIO_NET_F_MQ as independent features 2) Your code tries to re-use IFF_MULTI_QUEUE for both VIRTIO_NET_F_RSS and VIRTIO_NET_F_MQ, this would have a lot of implications, e.g. automatic steering might be applied when only RSS is negotiated etc The correct way to implement this is: 1) Introduce IFF_RSS and only set it during TUNSETIFF when device only offers RSS 2) reuse virtio-net uAPI and accept max_tx_vq and use that to change the queue (or queue pairs) if necessary Then we have a clean and well defined behaviour (for example when devices only support RSS but not MQ).
Thanks > > Regards, > Akihiko Odaki > > > > > Thanks > > > >> > >> Regards, > >> Akihiko Odaki > >> > >>> > >>>> > >>>>> > >>>>>> The best possibility is to share structures, not > >>>>>> commands, and I don't think even sharing structures makes sense here > >>>>>> because of the reasons described above. > >>>>> > >>>>> I don't want to share structures, I meant starting from something that > >>>>> is simple and has been sorted in the virtio spec. Optimization could > >>>>> be done on top. > >>>> > >>>> I meant to reuse the structures in virtio_net.h. > >>>> > >>>> Regards, > >>>> Akihiko Odaki > >>> > >>> Thanks > >>> > >>>> > >>>>> > >>>>> Thanks > >>>>> > >>>>> > >>>>>> > >>>>>> Regards, > >>>>>> Akihiko Odaki > >>>>>> > >>>>>>> > >>>>>>> Thanks > >>>>>>> > >>>>>>>> > >>>>>>>> Regards, > >>>>>>>> Akihiko Odaki > >>>>>>>> > >>>>>>>>> > >>>>>>>>>> > >>>>>>>>>> The parameter will be duplicated if we have separate ioctls for RSS and > >>>>>>>>>> hash reporting, and the kernel will have a chicken-egg problem when > >>>>>>>>>> ensuring they are synchronized; when the ioctl for RSS is issued, should > >>>>>>>>>> the kernel ensure the "types" parameter is identical with one specified > >>>>>>>>>> for hash reporting? It will not work if the userspace may decide to > >>>>>>>>>> configure hash reporting after RSS. > >>>>>>>>>> > >>>>>>>>> > >>>>>>>>> See my reply above. > >>>>>>>>> > >>>>>>>>> Thanks > >>>>>>>>> > >>>>>>>> > >>>>>>> > >>>>>> > >>>>> > >>>> > >>> > >> > > >
On 2025/03/21 10:13, Jason Wang wrote: > On Thu, Mar 20, 2025 at 1:33 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >> >> On 2025/03/20 10:31, Jason Wang wrote: >>> On Wed, Mar 19, 2025 at 1:29 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>> >>>> On 2025/03/19 9:58, Jason Wang wrote: >>>>> On Tue, Mar 18, 2025 at 6:10 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>>>> >>>>>> On 2025/03/18 9:15, Jason Wang wrote: >>>>>>> On Mon, Mar 17, 2025 at 3:07 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>>>>>> >>>>>>>> On 2025/03/17 10:12, Jason Wang wrote: >>>>>>>>> On Wed, Mar 12, 2025 at 1:03 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>>>>>>>> >>>>>>>>>> On 2025/03/12 11:35, Jason Wang wrote: >>>>>>>>>>> On Tue, Mar 11, 2025 at 2:11 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>>>>>>>>>> >>>>>>>>>>>> On 2025/03/11 9:38, Jason Wang wrote: >>>>>>>>>>>>> On Mon, Mar 10, 2025 at 3:45 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>>>>>>>>>>>> >>>>>>>>>>>>>> On 2025/03/10 12:55, Jason Wang wrote: >>>>>>>>>>>>>>> On Fri, Mar 7, 2025 at 7:01 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> Hash reporting >>>>>>>>>>>>>>>> ============== >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> Allow the guest to reuse the hash value to make receive steering >>>>>>>>>>>>>>>> consistent between the host and guest, and to save hash computation. >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> RSS >>>>>>>>>>>>>>>> === >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> RSS is a receive steering algorithm that can be negotiated to use with >>>>>>>>>>>>>>>> virtio_net. Conventionally the hash calculation was done by the VMM. >>>>>>>>>>>>>>>> However, computing the hash after the queue was chosen defeats the >>>>>>>>>>>>>>>> purpose of RSS. >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> Another approach is to use eBPF steering program. 
This approach has >>>>>>>>>>>>>>>> another downside: it cannot report the calculated hash due to the >>>>>>>>>>>>>>>> restrictive nature of eBPF steering program. >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> Introduce the code to perform RSS to the kernel in order to overcome >>>>>>>>>>>>>>>> thse challenges. An alternative solution is to extend the eBPF steering >>>>>>>>>>>>>>>> program so that it will be able to report to the userspace, but I didn't >>>>>>>>>>>>>>>> opt for it because extending the current mechanism of eBPF steering >>>>>>>>>>>>>>>> program as is because it relies on legacy context rewriting, and >>>>>>>>>>>>>>>> introducing kfunc-based eBPF will result in non-UAPI dependency while >>>>>>>>>>>>>>>> the other relevant virtualization APIs such as KVM and vhost_net are >>>>>>>>>>>>>>>> UAPIs. >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com> >>>>>>>>>>>>>>>> Tested-by: Lei Yang <leiyang@redhat.com> >>>>>>>>>>>>>>>> --- >>>>>>>>>>>>>>>> Documentation/networking/tuntap.rst | 7 ++ >>>>>>>>>>>>>>>> drivers/net/Kconfig | 1 + >>>>>>>>>>>>>>>> drivers/net/tap.c | 68 ++++++++++++++- >>>>>>>>>>>>>>>> drivers/net/tun.c | 98 +++++++++++++++++----- >>>>>>>>>>>>>>>> drivers/net/tun_vnet.h | 159 ++++++++++++++++++++++++++++++++++-- >>>>>>>>>>>>>>>> include/linux/if_tap.h | 2 + >>>>>>>>>>>>>>>> include/linux/skbuff.h | 3 + >>>>>>>>>>>>>>>> include/uapi/linux/if_tun.h | 75 +++++++++++++++++ >>>>>>>>>>>>>>>> net/core/skbuff.c | 4 + >>>>>>>>>>>>>>>> 9 files changed, 386 insertions(+), 31 deletions(-) >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> diff --git a/Documentation/networking/tuntap.rst b/Documentation/networking/tuntap.rst >>>>>>>>>>>>>>>> index 4d7087f727be5e37dfbf5066a9e9c872cc98898d..86b4ae8caa8ad062c1e558920be42ce0d4217465 100644 >>>>>>>>>>>>>>>> --- a/Documentation/networking/tuntap.rst >>>>>>>>>>>>>>>> +++ b/Documentation/networking/tuntap.rst >>>>>>>>>>>>>>>> @@ -206,6 +206,13 @@ enable is true we enable it, otherwise we disable it:: 
>>>>>>>>>>>>>>>> return ioctl(fd, TUNSETQUEUE, (void *)&ifr); >>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>> [...] >>>>>>>>>>> >>>>>>>>>>>>>>>> +static inline long tun_vnet_ioctl_sethash(struct tun_vnet_hash_container __rcu **hashp, >>>>>>>>>>>>>>>> + bool can_rss, void __user *argp) >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> So again, can_rss seems to be tricky. Looking at its caller, it tires >>>>>>>>>>>>>>> to make eBPF and RSS mutually exclusive. I still don't understand why >>>>>>>>>>>>>>> we need this. Allow eBPF program to override some of the path seems to >>>>>>>>>>>>>>> be common practice. >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> What's more, we didn't try (or even can't) to make automq and eBPF to >>>>>>>>>>>>>>> be mutually exclusive. So I still didn't see what we gain from this >>>>>>>>>>>>>>> and it complicates the codes and may lead to ambiguous uAPI/behaviour. >>>>>>>>>>>>>> >>>>>>>>>>>>>> automq and eBPF are mutually exclusive; automq is disabled when an eBPF >>>>>>>>>>>>>> steering program is set so I followed the example here. >>>>>>>>>>>>> >>>>>>>>>>>>> I meant from the view of uAPI, the kernel doesn't or can't reject eBPF >>>>>>>>>>>>> while using automq. >>>>>>>>>>>> > >> >>>>>>>>>>>>>> We don't even have an interface for eBPF to let it fall back to another >>>>>>>>>>>>>> alogirhtm. >>>>>>>>>>>>> >>>>>>>>>>>>> It doesn't even need this, e.g XDP overrides the default receiving path. >>>>>>>>>>>>> >>>>>>>>>>>>>> I could make it fall back to RSS if the eBPF steeering >>>>>>>>>>>>>> program is designed to fall back to automq when it returns e.g., -1. But >>>>>>>>>>>>>> such an interface is currently not defined and defining one is out of >>>>>>>>>>>>>> scope of this patch series. >>>>>>>>>>>>> >>>>>>>>>>>>> Just to make sure we are on the same page, I meant we just need to >>>>>>>>>>>>> make the behaviour consistent: allow eBPF to override the behaviour of >>>>>>>>>>>>> both automq and rss. 
>>>>>>>>>>>> >>>>>>>>>>>> That assumes eBPF takes precedence over RSS, which is not obvious to me. >>>>>>>>>>> >>>>>>>>>>> Well, it's kind of obvious. Not speaking the eBPF selector, we have >>>>>>>>>>> other eBPF stuffs like skbedit etc. >>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>> Let's add an interface for the eBPF steering program to fall back to >>>>>>>>>>>> another steering algorithm. I said it is out of scope before, but it >>>>>>>>>>>> makes clear that the eBPF steering program takes precedence over other >>>>>>>>>>>> algorithms and allows us to delete the code for the configuration >>>>>>>>>>>> validation in this patch. >>>>>>>>>>> >>>>>>>>>>> Fallback is out of scope but it's not what I meant. >>>>>>>>>>> >>>>>>>>>>> I meant in the current uAPI take eBPF precedence over automq. It's >>>>>>>>>>> much more simpler to stick this precedence unless we see obvious >>>>>>>>>>> advanatge. >>>>>>>>>> >>>>>>>>>> We still have three different design options that preserve the current >>>>>>>>>> precedence: >>>>>>>>>> >>>>>>>>>> 1) Precedence order: eBPF -> RSS -> automq >>>>>>>>>> 2) Precedence order: RSS -> eBPF -> automq >>>>>>>>>> 3) Precedence order: eBPF OR RSS -> automq where eBPF and RSS are >>>>>>>>>> mutually exclusive >>>>>>>>>> >>>>>>>>>> I think this is a unique situation for this steering program and I could >>>>>>>>>> not find another example in other eBPF stuffs. >>>>>>>>> >>>>>>>>> As described above, queue mapping could be overridden by tc-ebpf. So >>>>>>>>> there's no way to guarantee the RSS will work: >>>>>>>>> >>>>>>>>> https://github.com/DPDK/dpdk/blob/main/drivers/net/tap/bpf/tap_rss.c#L262 >>>>>>>>> >>>>>>>>> Making eBPF first leaves a chance for the management layer to override >>>>>>>>> the choice of Qemu. >>>>>>>> >>>>>>>> I referred to the eBPF steering program instead of tc-ebpf. tc-ebpf is >>>>>>>> nothing to do with the TUNSETSTEERINGEBPF ioctl, which this patch changes. 
>>>>>>> >>>>>>> I meant you can't do "full control" in any case, the point below >>>>>>> doesn't stand. Queue mapping could be restored even if RSS is set. >>>>>> >>>>>> What matters here is how we handle the control when tc didn't take it. >>>>>> eBPF, RSS, or automq make take all of it; I referred that as "full control". >>>>>> >>>>>>> >>>>>>>> >>>>>>>>> >>>>>>>>>> >>>>>>>>>> The current version implements 3) because it is not obvious whether we >>>>>>>>>> should choose either 1) or 2). >>>>>>>>> >>>>>>>>> But you didn't explain why you choose 3), and it leads to tricky code >>>>>>>>> (e.g the can_rss stuff etc). >>>>>>>> >>>>>>>> I wrote: "because it is not obvious whether we should choose either 1) >>>>>>>> or 2)", but I think I can explain it better: >>>>>>>> >>>>>>>> When an eBPF steering program cannot implement a fallback, it means the >>>>>>>> eBPF steering program requests the full control over the steering. On >>>>>>>> the other hand, RSS also requests the same control. So these two will >>>>>>>> conflict and the entity controlling the steering will be undefined when >>>>>>>> both are enabled. >>>>>>> >>>>>>> Well, the fallback is orthogonal to the proposal here. We haven't had >>>>>>> that since the introduction of the eBPF steering program. This means >>>>>>> automq has been in "conflict" with eBPF for years. Again, another >>>>>>> advantage, allowing the eBPF program to be the first to allow the >>>>>>> management layer to override Qemu's steering. >>>>>> >>>>>> What if a VMM uses eBPF steering program and the management layer >>>>>> decides to override it with RSS? >>>>> >>>>> That's possible but I think we're seeking which approach is better. In >>>>> this case, RSS could be implemented in eBPF but not the reverse. >>>>> >>>>> So my point is to start from something that is simpler. Simply allow >>>>> eBPF on top of RSS as automq. And optimize on top. 
>>>> >>>> >>>> The in-kernel RSS implementation is more optimized and capable of hash >>>> reporting. I don't think either eBPF steering program or in-kernel RSS >>>> is more capable than the other and there is a reason to place eBPF on >>>> top of RSS. >>>> >>>>> >>>>>> >>>>>> eBPF is obviously predecedent to automq as eBPF is an opt-in feature and >>>>>> automq is the implicit default. But this logic cannot be applied to >>>>>> decide the order of eBPF and RSS because they are both opt-in features. >>>>> >>>>> This is from the perspective of kernel development. But let's try to >>>>> think from the userspace: A well written user space knows what it >>>>> does, rejecting eBPF while RSS is set doesn't help. But anyhow if you >>>>> stick, it doesn't harm. >>>> >>>> Yes, it not for the current userspace but for the future kernel >>>> development; the kernel can reserve the freedom to decide the priority >>>> of eBPF and RSS by rejecting eBPF while RSS. >>>> >>>>> >>>>>> >>>>>>> >>>>>>>> >>>>>>>> 3) eliminates the undefined semantics by rejecting to enable both. >>>>>>> >>>>>>> This would lead a usersapce noticeable change of the behaviour? And >>>>>>> what do you mean by "rejecting to enable both"? >>>>>> >>>>>> Existing userspace code should see no change as it only cares the case >>>>>> where RSS is enabled. >>>>>> >>>>>> Here, rejecting to enable both means to deny setting an eBPF steering >>>>>> program when RSS is enabled, and visa-versa. >>>>>> >>>>>>> >>>>>>>> An >>>>>>>> alternative approach is to allow eBPF steering programs to fall back. >>>>>>>> When both the eBPF program and RSS are enabled, RSS will gain the >>>>>>>> control of steering under the well-defined situation where the eBPF >>>>>>>> steering program decides to fall back. >>>>>>> >>>>>>> How about just stick the eBPF precedence in this proposal and >>>>>>> introduce the fallback on top? This helps to speed up the iteration >>>>>>> (as the version has been iterated to 11). 
>>>>>> >>>>>> I don't think that helps much since we have another ongoing discussion >>>>>> below and it is not the sole roadblock. >>>>>> >>>>>>> >>>>>>>> >>>>>>>>> >>>>>>>>>> But 1) will be the most capable option if >>>>>>>>>> eBPF has a fall-back feature. >>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>>> >>>>>>>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>> [...] >>>>>>>>>>> >>>>>>>>>>>>>>> Is there a chance that we can reach here without TUN_VNET_HASH_REPORT? >>>>>>>>>>>>>>> If yes, it should be a bug. >>>>>>>>>>>>>> >>>>>>>>>>>>>> It is possible to use RSS without TUN_VNET_HASH_REPORT. >>>>>>>>>>>>> >>>>>>>>>>>>> Another call to separate the ioctls then. >>>>>>>>>>>> >>>>>>>>>>>> RSS and hash reporting are not completely independent though. >>>>>>>>>>> >>>>>>>>>>> Spec said: >>>>>>>>>>> >>>>>>>>>>> """ >>>>>>>>>>> VIRTIO_NET_F_RSSRequires VIRTIO_NET_F_CTRL_VQ. >>>>>>>>>>> """ >>>>>>>>>> >>>>>>>>>> I meant the features can be enabled independently, but they will share >>>>>>>>>> the hash type set when they are enabled at the same time. 
>>>>>>>>> >>>>>>>>> Looking at the spec: >>>>>>>>> >>>>>>>>> Hash repot uses: >>>>>>>>> >>>>>>>>> """ >>>>>>>>> struct virtio_net_hash_config { >>>>>>>>> le32 hash_types; >>>>>>>>> le16 reserved[4]; >>>>>>>>> u8 hash_key_length; >>>>>>>>> u8 hash_key_data[hash_key_length]; >>>>>>>>> }; >>>>>>>>> """ >>>>>>>>> >>>>>>>>> RSS uses >>>>>>>>> >>>>>>>>> """ >>>>>>>>> struct rss_rq_id { >>>>>>>>> le16 vq_index_1_16: 15; /* Bits 1 to 16 of the virtqueue index */ >>>>>>>>> le16 reserved: 1; /* Set to zero */ >>>>>>>>> }; >>>>>>>>> >>>>>>>>> struct virtio_net_rss_config { >>>>>>>>> le32 hash_types; >>>>>>>>> le16 indirection_table_mask; >>>>>>>>> struct rss_rq_id unclassified_queue; >>>>>>>>> struct rss_rq_id indirection_table[indirection_table_length]; >>>>>>>>> le16 max_tx_vq; >>>>>>>>> u8 hash_key_length; >>>>>>>>> u8 hash_key_data[hash_key_length]; >>>>>>>>> }; >>>>>>>>> """ >>>>>>>>> >>>>>>>>> Instead of trying to figure out whether we can share some data >>>>>>>>> structures, why not simply start from what has been done in the spec? >>>>>>>>> This would ease the usersapce as well where it can simply do 1:1 >>>>>>>>> mapping between ctrl vq command and tun uAPI. >>>>>>>> >>>>>>>> The spec also defines struct virtio_net_hash_config (which will be used >>>>>>>> when RSS is disabled) and struct virtio_net_rss_config to match the >>>>>>>> layout to share some fields. However, the UAPI does not follow the >>>>>>>> interface design of virtio due to some problems with these structures. >>>>>>> >>>>>>> Copy-paste error. The above is copied from the virtio spec, but I >>>>>>> meant the existing uAPI in virtio_net.h: >>>>>>> >>>>>>> /* >>>>>>> * The command VIRTIO_NET_CTRL_MQ_RSS_CONFIG has the same effect as >>>>>>> * VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET does and additionally configures >>>>>>> * the receive steering to use a hash calculated for incoming packet >>>>>>> * to decide on receive virtqueue to place the packet. 
The command >>>>>>> * also provides parameters to calculate a hash and receive virtqueue. >>>>>>> */ >>>>>>> struct virtio_net_rss_config { >>>>>>> __le32 hash_types; >>>>>>> __le16 indirection_table_mask; >>>>>>> __le16 unclassified_queue; >>>>>>> __le16 indirection_table[1/* + indirection_table_mask */]; >>>>>>> __le16 max_tx_vq; >>>>>>> __u8 hash_key_length; >>>>>>> __u8 hash_key_data[/* hash_key_length */]; >>>>>>> }; >>>>>>>> #define VIRTIO_NET_CTRL_MQ_RSS_CONFIG 1 >>>>>>> >>>>>>> /* >>>>>>> * The command VIRTIO_NET_CTRL_MQ_HASH_CONFIG requests the device >>>>>>> * to include in the virtio header of the packet the value of the >>>>>>> * calculated hash and the report type of hash. It also provides >>>>>>> * parameters for hash calculation. The command requires feature >>>>>>> * VIRTIO_NET_F_HASH_REPORT to be negotiated to extend the >>>>>>> * layout of virtio header as defined in virtio_net_hdr_v1_hash. >>>>>>> */ >>>>>>> struct virtio_net_hash_config { >>>>>>> __le32 hash_types; >>>>>>> /* for compatibility with virtio_net_rss_config */ >>>>>>> __le16 reserved[4]; >>>>>>> __u8 hash_key_length; >>>>>>> __u8 hash_key_data[/* hash_key_length */]; >>>>>>> }; >>>>>>> >>>>>>> This has been used by Qemu but I see a virtio-net version of: >>>>>>> >>>>>>> struct virtio_net_ctrl_rss { >>>>>>> u32 hash_types; >>>>>>> u16 indirection_table_mask; >>>>>>> u16 unclassified_queue; >>>>>>> u16 hash_cfg_reserved; /* for HASH_CONFIG (see >>>>>>> virtio_net_hash_config for details) */ >>>>>>> u16 max_tx_vq; >>>>>>> u8 hash_key_length; >>>>>>> u8 key[VIRTIO_NET_RSS_MAX_KEY_SIZE]; >>>>>>> >>>>>>> u16 *indirection_table; >>>>>>> }; >>>>>>> >>>>>>> This is ugly and results in a tricky code when trying to submit >>>>>>> RSS/HASH commands to the device: >>>>>>> >>>>>>> if (vi->has_rss) { >>>>>>> sg_buf_size = sizeof(uint16_t) * vi->rss_indir_table_size; >>>>>>> sg_set_buf(&sgs[1], vi->rss.indirection_table, sg_buf_size); >>>>>>> } else { >>>>>>> sg_set_buf(&sgs[1], 
&vi->rss.hash_cfg_reserved, >>>>>>> sizeof(uint16_t)); >>>>>>> } >>>>>> >>>>>> The only reference to struct virtio_net_rss_config in QEMU is to derive >>>>>> the offset of indirection_table. This is because the definition in >>>>>> virtio_net.h also includes indirection_table in the middle and the >>>>>> offsets of later part are unusable. >>>>> >>>>> Yes. >>>>> >>>>>> >>>>>> QEMU internally has a structure named VirtioNetRssData which just looks >>>>>> like struct virtio_net_ctrl_rss. >>>>> >>>>> It's a pity that it doesn't use uAPI. We might need to fix them. >>>> >>>> It doesn't want to use the UAPI structures for the internal storage >>>> because it wants to store them in native endians and QEMU is not >>>> interested in some fields in the UAPI structures. struct tun_vnet_hash >>>> and struct tun_vnet_hash_rss are easy to fill using VirtioNetRssData. >>>> >>>>> >>>>>> >>>>>>> >>>>>>>> >>>>>>>> Below is the definition of struct virtio_net_hash_config: >>>>>>>> >>>>>>>> struct virtio_net_hash_config { >>>>>>>> le32 hash_types; >>>>>>>> le16 reserved[4]; >>>>>>>> u8 hash_key_length; >>>>>>>> u8 hash_key_data[hash_key_length]; >>>>>>>> }; >>>>>>>> >>>>>>>> Here, hash_types, hash_key_length, and hash_key_data are shared with >>>>>>>> struct virtio_net_rss_config. >>>>>>>> >>>>>>>> One problem is that struct virtio_net_rss_config has a flexible array >>>>>>>> (indirection_table) between hash_types and hash_key_length. This is >>>>>>>> something we cannot express with C. >>>>>>> >>>>>>> We can split the virtio_net_rss_config to ease the dealing with >>>>>>> arrays, more below. >>>>>>> >>>>>>>> >>>>>>>> Another problem is that the semantics of the key in struct >>>>>>>> virtio_net_hash_config is not defined in the spec. >>>>>>> >>>>>>> If this is the case. Let's fix that in the spec first to make sure our >>>>>>> uAPI aligns with spec without ambiguity. It would be a nightmare to >>>>>>> deal with the in-consistency between virtio spec and Linux uAPIs. 
>>>>>> >>>>>> The userspace doesn't need to do anything to deal with inconsistency >>>>>> since these fields are unused. >>>>>> >>>>>>> >>>>>>>> >>>>>>>> To solve these problems, I defined the UAPI structures that do not >>>>>>>> include indiretion_table. >>>>>>>> >>>>>>>>> >>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>> A plot twist is the "types" parameter; it is a parameter that is >>>>>>>>>>>> "common" for RSS and hash reporting. >>>>>>>>>>> >>>>>>>>>>> So we can share part of the structure through the uAPI. >>>>>>>>>> >>>>>>>>>> Isn't that what this patch does? >>>>>>>>> >>>>>>>>> I didn't see, basically I see only one TUNSETVNETHASH that is used to >>>>>>>>> set both hash report and rss: >>>>>>>> >>>>>>>> The UAPI shares struct tun_vnet_hash for both hash report and rss. >>>>>>> >>>>>>> I meant sharing structure in two ioctls instead of reusing a specific >>>>>>> structure for two semantics in one ioctl if possible. Though I don't >>>>>>> think we need any sharing. >>>>>> >>>>>> The UAPI implemented in this patch already shares struct tun_vnet_hash >>>>>> and having two ioctls doesn't change that. >>>>>> >>>>>>> >>>>>>>> >>>>>>>>> >>>>>>>>> """ >>>>>>>>> +/** >>>>>>>>> + * define TUNSETVNETHASH - ioctl to configure virtio_net hashing >>>>>>>>> + * >>>>>>>>> + * The argument is a pointer to &struct tun_vnet_hash. >>>>>>>>> + * >>>>>>>>> + * The argument is a pointer to the compound of the following in order if >>>>>>>>> + * %TUN_VNET_HASH_RSS is set: >>>>>>>>> + * >>>>>>>>> + * 1. &struct tun_vnet_hash >>>>>>>>> + * 2. &struct tun_vnet_hash_rss >>>>>>>>> + * 3. Indirection table >>>>>>>>> + * 4. Key >>>>>>>>> + * >>>>>>>>> """ >>>>>>>>> >>>>>>>>> And it seems to lack parameters like max_tx_vq. >>>>>>>> >>>>>>>> max_tx_vq is not relevant with hashing. >>>>>>> >>>>>>> It is needed for RSS and we don't have that, no? >>>>>> >>>>>> No. RSS is Receive Side Scaling but it's not about receiving. 
>>>>> >>>>> Just to make sure I understand this, max_tx_vq is part of the >>>>> virtio_net_rss_config, how would Qemu behave when it receives this >>>>> from guest? >>>>> >>>>> """ >>>>> A driver sets max_tx_vq to inform a device how many transmit >>>>> virtqueues it may use (transmitq1…transmitq max_tx_vq). >>>>> """ >>>> >>>> It does nothing. >>> >>> Nope, see: >>> >>> commit 50bfcaedd78e53135ec0504302269b3b65bf1eff >>> Author: Philo Lu <lulie@linux.alibaba.com> >>> Date: Mon Nov 4 16:57:06 2024 +0800 >>> >>> virtio_net: Update rss when set queue >>> >>> RSS configuration should be updated with queue number. In particular, it >>> should be updated when (1) rss enabled and (2) default rss configuration >>> is used without user modification. >>> >>> During rss command processing, device updates queue_pairs using >>> rss.max_tx_vq. That is, the device updates queue_pairs together with >>> rss, so we can skip the sperate queue_pairs update >>> (VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET below) and return directly. >>> >>> Also remove the `vi->has_rss ?` check when setting vi->rss.max_tx_vq, >>> because this is not used in the other hash_report case. >>> >>> Fixes: c7114b1249fa ("drivers/net/virtio_net: Added basic RSS support.") >>> Signed-off-by: Philo Lu <lulie@linux.alibaba.com> >>> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com> >>> Acked-by: Michael S. Tsirkin <mst@redhat.com> >>> Signed-off-by: Paolo Abeni <pabeni@redhat.com> >>> >>> RSS doesn't depend on MQ. >>> >>> If it is not handled by Qemu, it should be a bug? >> >> I was wrong; QEMU does handle this field, but it doesn't use the >> definition of struct virtio_net_rss_config and name it queue_pairs >> instead of max_tx_vq so I could not find it by grep. > > Yes, another side effect is that uAPI is not even used there... I hope the split structures you proposed will improve it too. > >> >> For tap, max_tx_vq is handled by changing the number of open file >> descriptors so passing it via an ioctl is redundant. 
> > See my reply below. > >> >>> >>>> >>>>> >>>>>> >>>>>>> >>>>>>>> >>>>>>>>> >>>>>>>>> What's more, we've already had virito-net uAPI. Why not simply reusing them? >>>>>>>> >>>>>>>> See the above. >>>>>>>> >>>>>>>>> >>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>>> RSS and hash reporting must share >>>>>>>>>>>> this parameter when both are enabled at the same time; otherwise RSS may >>>>>>>>>>>> compute hash values that are not suited for hash reporting. >>>>>>>>>>> >>>>>>>>>>> Is this mandated by the spec? If yes, we can add a check. If not, >>>>>>>>>>> userspace risk themselves as a mis-configuration which we don't need >>>>>>>>>>> to bother. >>>>>>>>>> >>>>>>>>>> Yes, it is mandated. 5.1.6.4.3 Hash calculation for incoming packets says: >>>>>>>>>> > A device attempts to calculate a per-packet hash in the following >>>>>>>>>> > cases: >>>>>>>>>> > >>>>>>>>>> > - The feature VIRTIO_NET_F_RSS was negotiated. The device uses the >>>>>>>>>> > hash to determine the receive virtqueue to place incoming packets. >>>>>>>>>> > - The feature VIRTIO_NET_F_HASH_REPORT was negotiated. The device >>>>>>>>>> > reports the hash value and the hash type with the packet. >>>>>>>>>> > >>>>>>>>>> > If the feature VIRTIO_NET_F_RSS was negotiated: >>>>>>>>>> > >>>>>>>>>> > - The device uses hash_types of the virtio_net_rss_config structure >>>>>>>>>> > as ’Enabled hash types’ bitmask. >>>>>>>>>> > - The device uses a key as defined in hash_key_data and >>>>>>>>>> hash_key_length of the virtio_net_rss_config structure (see >>>>>>>>>> > 5.1.6.5.7.1). >>>>>>>>>> > >>>>>>>>>> > If the feature VIRTIO_NET_F_RSS was not negotiated: >>>>>>>>>> > >>>>>>>>>> > - The device uses hash_types of the virtio_net_hash_config structure >>>>>>>>>> > as ’Enabled hash types’ bitmask. >>>>>>>>>> > - The device uses a key as defined in hash_key_data and >>>>>>>>>> > hash_key_length of the virtio_net_hash_config structure (see >>>>>>>>>> > .1.6.5.6.4). 
>>>>>>>>>> >>>>>>>>>> So when both VIRTIO_NET_F_RSS and VIRTIO_NET_F_HASH_REPORT are >>>>>>>>>> negotiated, virtio_net_rss_config not only controls RSS but also the >>>>>>>>>> reported hash values and types. They cannot be divergent. >>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>> Note that spec use different commands for hash_report and rss. >>>>>>>>>> >>>>>>>>>> TUNSETVNETHASH is different from these commands in terms that it also >>>>>>>>>> negotiates VIRTIO_NET_F_HASH_REPORT and VIRTIO_NET_F_RSS. >>>>>>>>>> >>>>>>>>> >>>>>>>>> There Are different "issues" here: >>>>>>>>> >>>>>>>>> 1) Whether or not we need to use a unified API for negotiating RSS and >>>>>>>>> HASH_REPORT features >>>>>>>>> 2) Whether or not we need to sue a unified API for setting RSS and >>>>>>>>> HASH_REPORT configuration >>>>>>>>> >>>>>>>>> What I want to say is point 2. But what you raise is point 1. >>>>>>>>> >>>>>>>>> For simplicity, it looks to me like it's a call for having separated >>>>>>>>> ioctls for feature negotiation (for example via TUNSETIFF). You may >>>>>>>>> argue that either RSS or HASH_REPORT requires configurations, we can >>>>>>>>> just follow what spec defines or not (e.g what happens if >>>>>>>>> RSS/HASH_REPORT were negotiated but no configurations were set). >>>>>>>> >>>>>>>> Unfortunately TUNSETIFF does not fit in this use case. The flags set >>>>>>>> with TUNSETIFF are fixed, but the guest can request a different feature >>>>>>>> set anytime by resetting the device. >>>>>>> >>>>>>> TUNSETIFF, enables the device to be able to handle RSS/HASREPORT. >>>>>>> TUNSETHASH/RSS. dealing with RSS/HASH command from userspace. >>>>>> >>>>>> We also needs to be able to disable them at runtime so that we can >>>>>> handle resets. >>>>> >>>>> Via TUNSETHASH/RSS? I think it should have a way to accept parameters >>>>> that disable RSS or hash report. >>>> >>>> That's what this patch implements. TUNSETVNETHASH accepts parameters to >>>> choose what features to be enabled. 
>>>> >>>>> >>>>>> >>>>>>> >>>>>>> This is the way we used to do for multi queue and vnet header. >>>>>>> TUNSETIFF requires CAP_NET_ADMIN, this could be an extra safe guard >>>>>>> for unprivileged userspace. >>>>>> >>>>>> I intend to allow using this feature without privilege. A VMM is usually >>>>>> unprivileged and requiring a privilege to configure tuntap is too >>>>>> prohibitive. >>>>> >>>>> For safety, tun is not allowed to be created by unprivileged users. >>>>> And it's not to configure the tuntap dynamically, it's about telling >>>>> the function that tuntap can have (not necessarily enabled though) . >>>> >>>> I don't think we need another barrier for the new functions. Once an >>>> unprivileged user get a file descriptor of tuntap from a privileged >>>> user, they are free to enable RSS and/or hash reporting. >>> >>> Only if such a feature is allowed by the privileged user. >> >> I don't see a reason not to allow the feature to unprivileged users. It >> only complicates the setup. > > For safety, e.g reduce the chance for unprivileged user to explore > part of the kernel codes. It indeed reduces the attack surface, but it's fine without the reduction I guess? It's not a feature so complicated; I saw there were complicated changes like namespaces and io_uring that caused controversy when exposing them to unprivilged users, but this feature is not like them, I suppose. > >> >>> >>>> >>>>> >>>>>> >>>>>>> >>>>>>>> >>>>>>>> > >> In the virtio-net specification, it is not defined what would >>>>>>>> happen if >>>>>>>>>> these features are negotiated but the VIRTIO_NET_CTRL_MQ_RSS_CONFIG or >>>>>>>>>> VIRTIO_NET_CTRL_MQ_HASH_CONFIG commands are not sent. There is no such >>>>>>>>>> ambiguity with TUNSETVNETHASH. >>>>>>>>> >>>>>>>>> So I don't see advantages of unifying hash reports and rss into a >>>>>>>>> single ioctl. Let's just follow what has been done in the spec that >>>>>>>>> uses separated commands. 
Tuntap is not a good place to debate whether >>>>>>>>> those commands could be unified or not. We need to move it to the spec >>>>>>>>> but assuming spec has been done, it might be too late or too few >>>>>>>>> advantages for having another design. >>>>>>>> >>>>>>>> It makes sense for the spec to reuse the generic feature negotiation >>>>>>>> mechanism, but the situation is different for tuntap; we cannot use >>>>>>>> TUNSETIFF and need to define another. Then why don't we exploit this >>>>>>>> opportunity to have an interface with well-defined semantics? >>>>>>> >>>>>>> That's perfectly fine, but it needs to be done in virtio-net's uAPI >>>>>>> not tun's. What's more, if you think two commands are not >>>>>>> well-defined, let's fix that in the virtio spec first. >>>>>>> >>>>>>>> The virtio >>>>>>>> spec does its best as an interface between the host and guest and tuntap >>>>>>>> does its best as an UAPI. >>>>>>> >>>>>>> See above, let's fix the uAPI first. We don't want DPDK to use tun's >>>>>>> uAPI for RSS >>>>>> >>>>>> virtio-net's UAPI is for the virtio spec which has a capable generic >>>>>> feature negotiation mechanism. tuntap needs its own feature negotiation >>>>>> and it's nothing to do with virtio-net's UAPI. >>>>> >>>>> Well, I don't mean the part of the feature negotiation. I mean the >>>>> part for rss and hash report configuration. >>>> >>>> The feature negotiation still matters when deciding the granularity of >>>> ioctls. We need one ioctl for a feature negotiation, and to avoid having >>>> an intermediate state, >>> >>> I don't understand this. For example, driver can choose to >>> >>> 1) negotiate RSS >>> 2) do something else. >>> 3) configure RSS >>> >>> Spec doesn't require those two to be configured at the same time, so >>> "intermediate state" is allowed. >> >> The spec doesn't define what should happen in the intermediate state either. > > Yes but my point is that in the uAPI layer we don't need to care about > the intermediate state. 
It can just work as other features, e.g having > a default state after feature negotiation is more than enough. This is > the way we deal with other features like vnet header etc. > >> >> For a hardware implementation I think it's fine whatever the >> implementation defines as the intermediate state. But for the UAPI, it's >> better avoiding having such a definition to keep the interface minimal >> and maximize the UAPI stability. > > Well, even if you think there's an issue: > > 1) I don't see how we can avoid the intermediate state consider guest > have such state > 2) We need to "fix" virtio spec and virtio-net first, tuntap is not > the right place to workaround virtio specific issues Let me summarize my points that support having one ioctl to negotiate features and configuration: The virtio spec has a generic feature negotiation mechanism and reusing it resulted in having an intermediate state between the feature negotiation and configuration. There is nothing wrong about that so we don't need to "fix" the virtio spec. tuntap can also perform feature negotiation with TUNSETIFF, but TUNSETIFF has a few problems: 1. It requires a privilege. One can argue that it reduces the attack surface and it indeed does, but it's fine without the reduction I guess? It's not a feature so complicated; I saw there were complicated changes like namespaces and io_uring that caused controversy when exposing them to unprivileged users, but this feature is not like them. 2. It cannot change the enabled feature set at runtime. The virtio spec allows changing it by resetting. So we need to design a set of new ioctls for both feature negotiation and configuration. When doing so, eliminating the intermediate state is a good principle to determine the optimal size of ioctls. In theory, it is possible to have small ioctls that set only one scalar value or even one bit, but that doesn't make sense.
This principle helps determine the optimal size of ioctls; it minimizes the complexity of both the userspace and the kernel. > >> >>> >>>> the ioctl should also do the configuration. Hence >>>> that one ioctl should do all of the feature negotiation and configuration. >>>> >>>>> >>>>>> >>>>>> The structures for two commands have unused or redundant fields and a >>>>>> flexible array in the middle of the structure, but they are ABIs so we >>>>>> can't change it. >>>>>> >>>>>> DPDK is another reason to define tuntap's own UAPIs. They don't care >>>>>> unused or redundant fields and a flexible array in middle that are >>>>>> present in the virtio spec. It will also not want to deal with the >>>>>> requirement of little endian. Constructing struct virtio_net_rss_config >>>>>> is an extra burden for DPDK. >>>>> >>>>> I meant for vhost-user implementation in DPDK, it needs to use >>>>> virtio-net uAPI not tuntap's for example. >>>> >>>> The vhost-user implementation will use tuntap's UAPIs for its ethernet >>>> device backend. >>> >>> That sounds pretty weird, vhost-user has nothing related to tuntap. >> >> My expression in the last email was weird. More precisely, the ethernet >> backend of tuntap will use the UAPIs, and the vhost-user will use the >> ethernet backend in turn. > > I don't understand what "ethernet backend" means here. It is a driver that serves the Ethernet Device API, which is agnostic to the application and the driver. The Ethernet Device API, including RSS configuration, is documented at: https://doc.dpdk.org/api/rte__ethdev_8h.html The Ethernet Device API is not bound to the virtio spec since it is not specific to the vhost application or the tuntap driver. Hence it operates in native endian and does not have extra fields, and tuntap's structures are more suited to the ethernet backend than the virtio ones. > >> >>> >>>> It uses the generic interface of ethernet device so for >>>> RSS it will use functions like rte_eth_dev_rss_hash_update() for >>>> example.
tuntap's UAPIs are more suited to implement these interfaces as >>>> they operate in native endian and don't have extra fields. >>> >>> Nope, for example it needs to use le for virtio_net_hdr if a modern >>> device is used. But it needs a "native" endian according to the guest >>> endian via TUNSETVNETLE/BE. We don't have a choice as virtio-net hdr >>> support in tuntap is much earlier than modern devices. >>> >>> Let's don't do the same thing (native endian) for tuntap as RSS >>> depends on modern, so we know it must be le. >> >> virtio_net_hdr is the data path while the current discussion is about >> the control path. All configuration knobs of tuntap operates in the >> native endian. > > Because they are not directly related to virtio specification. We > don't want to duplicate virtio-net with our own version every time E.g > once RSSv2 or aRFS were implemented. Or I would even introduce a > single uAPI to transport possible cvq commands then we can avoid > inventing new ioctls that just transport cvq commands. > >> >> So I think we should stick to the little endian for the data path while >> we should stick to the native endian for the control path to maximize >> the consistency. > > I don't see a reason to differ datapath from control path. Virtio-net > uAPI has been reused by tuntap for more than a decade. tuntap's control path all operate in the native endian. They never used the endian of the data path in the control path. > >> >>> >>> >>>> >>>> DPDk applications other than vhost-user also matter; they do not care >>>> what virtio does at all. >>>> >>>> > >> >>>>>> On the other hand, Constructing tuntap-specific structures is not that >>>>>> complicated for VMMs. >>>>> >>>>> Not complicated but redundant. 
>>>>> >>>>>> A VMM will need to inspect struct >>>>>> virtio_net_rss_config anyway to handle migration and check its size so >>>>>> it can store the values it inspected to struct tun_vnet_hash and struct >>>>>> tun_vnet_hash_rss and pass them to the kernel. >>>>> >>>>> I don't see how rss and hash reports differ from what we have now. >>>>> Those inspections must be done anyhow for compatibility for example >>>>> the check of offloading features. Such steps could not be eliminated >>>>> no matter how we design the uAPI. >>>> >>>> I explained the difference between the virtio and tuntap UAPIs, not >>>> between RSS and hash reporting. >>> >>> See above. >>> >>>> >>>>> >>>>>> >>>>>> The overall userspace implementation will be simpler by having >>>>>> structures specifically tailored for the communication between the >>>>>> userspace and kernel. >>>>> >>>>> This is exactly how a good uAPI should behave. If uAPI in virtio-net >>>>> can't do this, I don't understand why uAPI in tuntap can solve it. >>>> >>>> The UAPI in virtio-net cannot do it because it's already fixed and it >>>> also needs to perform endian conversion for the VM use case. tuntap >>>> doesn't have these restrictions. >>> >>> Same here. >>> >>>> >>>>> >>>>>> >>>>>>> >>>>>>>> >>>>>>>> I don't think there is an advantage to split ioctls to follow the spec >>>>>>>> after all. It makes sense if we can pass-through virtio commands to >>>>>>>> tuntap, but it is not possible as ioctl operation codes are different >>>>>>>> from virtio commands. >>>>>>> >>>>>>> I don't see a connection with the operation code. 
For example, we can >>>>>>> add new uAPIs in virtio-net which could be something like: >>>>>>> >>>>>>> struct virtio_net_rss_config_header { >>>>>>> __le32 hash_types; >>>>>>> __le16 indirection_table_mask; >>>>>>> __le16 unclassified_queue; >>>>>>> __le16 indirection_table[]; >>>>>>> } >>>>>>> >>>>>>> struct virtio_net_rss_config_tailer { >>>>>>> __le16 max_tx_vq; >>>>>>> u8 hash_key_length; >>>>>>> u8 hash_key_data[]; >>>>>>> } >>>>>>> >>>>>>> These two are used by TUNSETVNETRSS. And simply reuse the >>>>>>> virtio_net_hash_config for TUNSETVETHASH. >>>>>> > > With this, we can tweak the virtio-net driver with this new uAPI. Then >>>>>>> tap* can reuse this. >>>>>> >>>>>> I implemented a UAPI and driver change accordingly: >>>>>> https://lore.kernel.org/r/20250318-virtio-v1-0-344caf336ddd@daynix.com >>>>>> >>>>>> This is a nice improvement for the driver, but I still don't think it is >>>>>> suited for the UAPI of tuntap. >>>>> >>>>> Any reason for this? It should work like virtio_net_hdr. >>>>> >>>>>> The requirements of extra fields and >>>>>> little endian cannot be removed from the virtio spec but they are >>>>>> irrelevant for tuntap. >>>>> >>>>> I don't understand this part. What fields are "extra" and need to be >>>>> removed from the spec? >>>> >>>> All fields not included in struct tun_vnet_hash and struct >>>> tun_vnet_hash_rss. Namely, for struct virtio_net_hash_config: >>>> - reserved >>>> - hash_key_length >>>> - hash_key_data >>>> >>>> For struct virtio_net_rss_config: >>>> - max_tx_vq >>>> - hash_key_length >>> >>> See my above reply, and I basically meant >>> >>> TUNSETVETHASH accept struct virtio_net_hash_config; >>> TUNSETVETRSS accept struct virtio_net_rss_config_hdr + struct >>> virtio_net_rss_config_trailer; >> >> That still bring the extra fields I noted in the last email. > > I don't know how to define "extra" here. 
Let's summarize here: > > Method A: > > 1) virtio specification use separate commands for hash_report and rss > 2) hash_report and rss don't depend on each other > 3) reuse virtio-net uAPI > > Method B: > > 1) trying to define and remove the "extra" fields in tuntap, and > redefine it in TUNTAP > > It would always be much easier to start from simply reusing the > virtio-net uAPI. Method B makes both the implementation and reviewing > harder, as we need to > > 1) revisit the design of the virtio spec, this needs to be done in the > virtio community not here > 2) audit the difference between virtio spec and TUN/TAP, that's why we > have a very long discussion here > > For example, the root cause of why you think the max_tx_vq is "extra" is: > > 1) The spec defines VIRTIO_NET_F_RSS and VIRTIO_NET_F_MQ as independent features > 2) Your code tries to re-use IFF_MULTI_QUEUE for both VIRTIO_NET_F_RSS > and VIRTIO_NET_F_MQ, this would have a lot of implications, e.g > automatic steering might be applied when only RSS is negotiated etc > > The correct way to implement this is: > > 1) Introduce IFF_RSS and only set it during TUNSETIFF when device only > offers RSS Please see the summary of "my points that support having one ioctl to negotiate features and configuration" I wrote above. > 2) reuse virtio-net uAPI and accept max_tx_vq and use that to change > the queue(or queue pairs) if necessary I don't think it's possible; we need file descriptors associated with queues, which is something you cannot express with the virtio-net structures. Regards, Akihiko Odaki > > Then we have a clean and well defined behaviour (for example when > devices only support RSS but not MQ).
> > Thanks > >> >> Regards, >> Akihiko Odaki >> >>> >>> Thanks >>> >>>> >>>> Regards, >>>> Akihiko Odaki >>>> >>>>> >>>>>> >>>>>>> >>>>>>>> The best possibility is to share structures, not >>>>>>>> commands, and I don't think even sharing structures makes sense here >>>>>>>> because of the reasons described above. >>>>>>> >>>>>>> I don't want to share structures, I meant starting from something that >>>>>>> is simple and has been sorted in the virtio spec. Optimization could >>>>>>> be done on top. >>>>>> >>>>>> I meant to reuse the structures in virtio_net.h. >>>>>> >>>>>> Regards, >>>>>> Akihiko Odaki >>>>> >>>>> Thanks >>>>> >>>>>> >>>>>>> >>>>>>> Thanks >>>>>>> >>>>>>> >>>>>>>> >>>>>>>> Regards, >>>>>>>> Akihiko Odaki >>>>>>>> >>>>>>>>> >>>>>>>>> Thanks >>>>>>>>> >>>>>>>>>> >>>>>>>>>> Regards, >>>>>>>>>> Akihiko Odaki >>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>> The paramter will be duplicated if we have separate ioctls for RSS and >>>>>>>>>>>> hash reporting, and the kernel will have a chiken-egg problem when >>>>>>>>>>>> ensuring they are synchronized; when the ioctl for RSS is issued, should >>>>>>>>>>>> the kernel ensure the "types" parameter is identical with one specified >>>>>>>>>>>> for hash reporting? It will not work if the userspace may decide to >>>>>>>>>>>> configure hash reporting after RSS. >>>>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>> See my reply above. >>>>>>>>>>> >>>>>>>>>>> Thanks >>>>>>>>>>> >>>>>>>>>> >>>>>>>>> >>>>>>>> >>>>>>> >>>>>> >>>>> >>>> >>> >> >
On Fri, Mar 21, 2025 at 1:57 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > > On 2025/03/21 10:13, Jason Wang wrote: > > On Thu, Mar 20, 2025 at 1:33 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >> > >> On 2025/03/20 10:31, Jason Wang wrote: > >>> On Wed, Mar 19, 2025 at 1:29 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>> > >>>> On 2025/03/19 9:58, Jason Wang wrote: > >>>>> On Tue, Mar 18, 2025 at 6:10 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>>>> > >>>>>> On 2025/03/18 9:15, Jason Wang wrote: > >>>>>>> On Mon, Mar 17, 2025 at 3:07 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>>>>>> > >>>>>>>> On 2025/03/17 10:12, Jason Wang wrote: > >>>>>>>>> On Wed, Mar 12, 2025 at 1:03 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>>>>>>>> > >>>>>>>>>> On 2025/03/12 11:35, Jason Wang wrote: > >>>>>>>>>>> On Tue, Mar 11, 2025 at 2:11 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>>>>>>>>>> > >>>>>>>>>>>> On 2025/03/11 9:38, Jason Wang wrote: > >>>>>>>>>>>>> On Mon, Mar 10, 2025 at 3:45 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>>>>>>>>>>>> > >>>>>>>>>>>>>> On 2025/03/10 12:55, Jason Wang wrote: > >>>>>>>>>>>>>>> On Fri, Mar 7, 2025 at 7:01 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>>>>>>>>>>>>>> > >>>>>>>>>>>>>>>> Hash reporting > >>>>>>>>>>>>>>>> ============== > >>>>>>>>>>>>>>>> > >>>>>>>>>>>>>>>> Allow the guest to reuse the hash value to make receive steering > >>>>>>>>>>>>>>>> consistent between the host and guest, and to save hash computation. > >>>>>>>>>>>>>>>> > >>>>>>>>>>>>>>>> RSS > >>>>>>>>>>>>>>>> === > >>>>>>>>>>>>>>>> > >>>>>>>>>>>>>>>> RSS is a receive steering algorithm that can be negotiated to use with > >>>>>>>>>>>>>>>> virtio_net. Conventionally the hash calculation was done by the VMM. > >>>>>>>>>>>>>>>> However, computing the hash after the queue was chosen defeats the > >>>>>>>>>>>>>>>> purpose of RSS. 
> >>>>>>>>>>>>>>>> > >>>>>>>>>>>>>>>> Another approach is to use eBPF steering program. This approach has > >>>>>>>>>>>>>>>> another downside: it cannot report the calculated hash due to the > >>>>>>>>>>>>>>>> restrictive nature of eBPF steering program. > >>>>>>>>>>>>>>>> > >>>>>>>>>>>>>>>> Introduce the code to perform RSS to the kernel in order to overcome > >>>>>>>>>>>>>>>> thse challenges. An alternative solution is to extend the eBPF steering > >>>>>>>>>>>>>>>> program so that it will be able to report to the userspace, but I didn't > >>>>>>>>>>>>>>>> opt for it because extending the current mechanism of eBPF steering > >>>>>>>>>>>>>>>> program as is because it relies on legacy context rewriting, and > >>>>>>>>>>>>>>>> introducing kfunc-based eBPF will result in non-UAPI dependency while > >>>>>>>>>>>>>>>> the other relevant virtualization APIs such as KVM and vhost_net are > >>>>>>>>>>>>>>>> UAPIs. > >>>>>>>>>>>>>>>> > >>>>>>>>>>>>>>>> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com> > >>>>>>>>>>>>>>>> Tested-by: Lei Yang <leiyang@redhat.com> > >>>>>>>>>>>>>>>> --- [...] > > > >> > >>> > >>>> > >>>>> > >>>>>> > >>>>>>> > >>>>>>>> > >>>>>>>>> > >>>>>>>>> What's more, we've already had virito-net uAPI. Why not simply reusing them? > >>>>>>>> > >>>>>>>> See the above. > >>>>>>>> > >>>>>>>>> > >>>>>>>>>> > >>>>>>>>>>> > >>>>>>>>>>>> RSS and hash reporting must share > >>>>>>>>>>>> this parameter when both are enabled at the same time; otherwise RSS may > >>>>>>>>>>>> compute hash values that are not suited for hash reporting. > >>>>>>>>>>> > >>>>>>>>>>> Is this mandated by the spec? If yes, we can add a check. If not, > >>>>>>>>>>> userspace risk themselves as a mis-configuration which we don't need > >>>>>>>>>>> to bother. > >>>>>>>>>> > >>>>>>>>>> Yes, it is mandated. 
5.1.6.4.3 Hash calculation for incoming packets says: > >>>>>>>>>> > A device attempts to calculate a per-packet hash in the following > >>>>>>>>>> > cases: > >>>>>>>>>> > > >>>>>>>>>> > - The feature VIRTIO_NET_F_RSS was negotiated. The device uses the > >>>>>>>>>> > hash to determine the receive virtqueue to place incoming packets. > >>>>>>>>>> > - The feature VIRTIO_NET_F_HASH_REPORT was negotiated. The device > >>>>>>>>>> > reports the hash value and the hash type with the packet. > >>>>>>>>>> > > >>>>>>>>>> > If the feature VIRTIO_NET_F_RSS was negotiated: > >>>>>>>>>> > > >>>>>>>>>> > - The device uses hash_types of the virtio_net_rss_config structure > >>>>>>>>>> > as ’Enabled hash types’ bitmask. > >>>>>>>>>> > - The device uses a key as defined in hash_key_data and > >>>>>>>>>> hash_key_length of the virtio_net_rss_config structure (see > >>>>>>>>>> > 5.1.6.5.7.1). > >>>>>>>>>> > > >>>>>>>>>> > If the feature VIRTIO_NET_F_RSS was not negotiated: > >>>>>>>>>> > > >>>>>>>>>> > - The device uses hash_types of the virtio_net_hash_config structure > >>>>>>>>>> > as ’Enabled hash types’ bitmask. > >>>>>>>>>> > - The device uses a key as defined in hash_key_data and > >>>>>>>>>> > hash_key_length of the virtio_net_hash_config structure (see > >>>>>>>>>> > .1.6.5.6.4). > >>>>>>>>>> > >>>>>>>>>> So when both VIRTIO_NET_F_RSS and VIRTIO_NET_F_HASH_REPORT are > >>>>>>>>>> negotiated, virtio_net_rss_config not only controls RSS but also the > >>>>>>>>>> reported hash values and types. They cannot be divergent. > >>>>>>>>>> > >>>>>>>>>>> > >>>>>>>>>>> Note that spec use different commands for hash_report and rss. > >>>>>>>>>> > >>>>>>>>>> TUNSETVNETHASH is different from these commands in terms that it also > >>>>>>>>>> negotiates VIRTIO_NET_F_HASH_REPORT and VIRTIO_NET_F_RSS. 
> >>>>>>>>>> > >>>>>>>>> > >>>>>>>>> There Are different "issues" here: > >>>>>>>>> > >>>>>>>>> 1) Whether or not we need to use a unified API for negotiating RSS and > >>>>>>>>> HASH_REPORT features > >>>>>>>>> 2) Whether or not we need to sue a unified API for setting RSS and > >>>>>>>>> HASH_REPORT configuration > >>>>>>>>> > >>>>>>>>> What I want to say is point 2. But what you raise is point 1. > >>>>>>>>> > >>>>>>>>> For simplicity, it looks to me like it's a call for having separated > >>>>>>>>> ioctls for feature negotiation (for example via TUNSETIFF). You may > >>>>>>>>> argue that either RSS or HASH_REPORT requires configurations, we can > >>>>>>>>> just follow what spec defines or not (e.g what happens if > >>>>>>>>> RSS/HASH_REPORT were negotiated but no configurations were set). > >>>>>>>> > >>>>>>>> Unfortunately TUNSETIFF does not fit in this use case. The flags set > >>>>>>>> with TUNSETIFF are fixed, but the guest can request a different feature > >>>>>>>> set anytime by resetting the device. > >>>>>>> > >>>>>>> TUNSETIFF, enables the device to be able to handle RSS/HASREPORT. > >>>>>>> TUNSETHASH/RSS. dealing with RSS/HASH command from userspace. > >>>>>> > >>>>>> We also needs to be able to disable them at runtime so that we can > >>>>>> handle resets. > >>>>> > >>>>> Via TUNSETHASH/RSS? I think it should have a way to accept parameters > >>>>> that disable RSS or hash report. > >>>> > >>>> That's what this patch implements. TUNSETVNETHASH accepts parameters to > >>>> choose what features to be enabled. > >>>> > >>>>> > >>>>>> > >>>>>>> > >>>>>>> This is the way we used to do for multi queue and vnet header. > >>>>>>> TUNSETIFF requires CAP_NET_ADMIN, this could be an extra safe guard > >>>>>>> for unprivileged userspace. > >>>>>> > >>>>>> I intend to allow using this feature without privilege. A VMM is usually > >>>>>> unprivileged and requiring a privilege to configure tuntap is too > >>>>>> prohibitive. 
> >>>>> > >>>>> For safety, tun is not allowed to be created by unprivileged users. > >>>>> And it's not to configure the tuntap dynamically, it's about telling > >>>>> the function that tuntap can have (not necessarily enabled though) . > >>>> > >>>> I don't think we need another barrier for the new functions. Once an > >>>> unprivileged user get a file descriptor of tuntap from a privileged > >>>> user, they are free to enable RSS and/or hash reporting. > >>> > >>> Only if such a feature is allowed by the privileged user. > >> > >> I don't see a reason not to allow the feature to unprivileged users. It > >> only complicates the setup. > > > > For safety, e.g reduce the chance for unprivileged user to explore > > part of the kernel codes. > > It indeed reduces the attack surface, but it's fine without the > reduction I guess? It's not a feature so complicated; I don't know how to define complicated things here but simplicity doesn't necessarily mean safety. > I saw there were > complicated changes like namespaces and io_uring that caused controversy > when exposing them to unprivilged users, but this feature is not like > them, I suppose. We limit feature setting through tun_set_iff in the past. Instead of trying to argue if RSS is safe to be enabled without TUNSETIFF, following what has been used in the past is always simpler and easier. > > > > >> > >>> > >>>> > >>>>> > >>>>>> > >>>>>>> > >>>>>>>> > >>>>>>>> > >> In the virtio-net specification, it is not defined what would > >>>>>>>> happen if > >>>>>>>>>> these features are negotiated but the VIRTIO_NET_CTRL_MQ_RSS_CONFIG or > >>>>>>>>>> VIRTIO_NET_CTRL_MQ_HASH_CONFIG commands are not sent. There is no such > >>>>>>>>>> ambiguity with TUNSETVNETHASH. > >>>>>>>>> > >>>>>>>>> So I don't see advantages of unifying hash reports and rss into a > >>>>>>>>> single ioctl. Let's just follow what has been done in the spec that > >>>>>>>>> uses separated commands. 
Tuntap is not a good place to debate whether > >>>>>>>>> those commands could be unified or not. We need to move it to the spec > >>>>>>>>> but assuming spec has been done, it might be too late or too few > >>>>>>>>> advantages for having another design. > >>>>>>>> > >>>>>>>> It makes sense for the spec to reuse the generic feature negotiation > >>>>>>>> mechanism, but the situation is different for tuntap; we cannot use > >>>>>>>> TUNSETIFF and need to define another. Then why don't we exploit this > >>>>>>>> opportunity to have an interface with well-defined semantics? > >>>>>>> > >>>>>>> That's perfectly fine, but it needs to be done in virtio-net's uAPI > >>>>>>> not tun's. What's more, if you think two commands are not > >>>>>>> well-defined, let's fix that in the virtio spec first. > >>>>>>> > >>>>>>>> The virtio > >>>>>>>> spec does its best as an interface between the host and guest and tuntap > >>>>>>>> does its best as an UAPI. > >>>>>>> > >>>>>>> See above, let's fix the uAPI first. We don't want DPDK to use tun's > >>>>>>> uAPI for RSS > >>>>>> > >>>>>> virtio-net's UAPI is for the virtio spec which has a capable generic > >>>>>> feature negotiation mechanism. tuntap needs its own feature negotiation > >>>>>> and it's nothing to do with virtio-net's UAPI. > >>>>> > >>>>> Well, I don't mean the part of the feature negotiation. I mean the > >>>>> part for rss and hash report configuration. > >>>> > >>>> The feature negotiation still matters when deciding the granularity of > >>>> ioctls. We need one ioctl for a feature negotiation, and to avoid having > >>>> an intermediate state, > >>> > >>> I don't understand this. For example, driver can choose to > >>> > >>> 1) negotiate RSS > >>> 2) do something else. > >>> 3) configure RSS > >>> > >>> Spec doesn't require those two to be configured at the same time, so > >>> "intermediate state" is allowed. > >> > >> The spec doesn't define what should happen in the intermediate state either. 
> > > > Yes but my point is that in the uAPI layer we don't need to care about > > the intermediate state. It can just work as other features, e.g having > > a default state after feature negotiation is more than enough. This is > > the way we deal with other features like vnet header etc. > > >> > >> For a hardware implementation I think it's fine whatever the > >> implementation defines as the intermediate state. But for the UAPI, it's > >> better avoiding having such a definition to keep the interface minimal > >> and maximize the UAPI stability. > > > > Well, even if you think there's an issue: > > > > 1) I don't see how we can avoid the intermediate state consider guest > > have such state > > 2) We need to "fix" virtio spec and virito-net first, tuntap is not > > the right place to workaround virtio specific issues > > Let me summarize my points that support having one ioctl to negotiate > features and configuration: > > The virtio spec has a generic feature negotiation mechanism and reusing > it resulted in having an intermediate state between the feature > negotiation and configuration. There is nothing wrong about that so we > don't need to "fix" the virtio spec. Good to know that. > > tuntap can also perform feature negotitaion with TUNSETIFF, but > TUNSETIFF have a few problems: TUNSETIFF is not feature negotiation, it's about device or queue provisioning as well as the features. From the view of the virtio, it is used to provision the device_features. For example, qemu only calls TUNSETIFF when it tries to open the tap fd. > > 1. It requires a privilege. One can argue that it reduces the attack > surface and it indeed does, but it's fine without the reduction I guess? > It's not a feature so complicated; I saw there were complicated changes > like namespaces and io_uring that caused controversy when exposing them > to unprivilged users, but this feature is not like them. 
I'm not asking to invent something new, but just reuse the security stuff that has already been used for more than a decade. It would always be easier to relax the check than to enforce it later, which may break uAPI. I can imagine it would not take a lot of code to achieve this. > > 2. It cannot change the enabled feature set at runtime. The virtio spec > allows changing it by resetting. RSS is not the first feature of those requirements. TUN has implemented various virtio specific features in the past. > > So we need to design a set of new ioctls for both feature negotiation > and configuration. When doing so, eliminating the intermediate state is > a good principle to determine the optimal size of ioctls. As discussed, having a default state after TUNSETIFF is more than enough. That is how a multi queue/vnet header works: 1) for multiqueue, when IFF_MULTIQUEUE is set, starting with 1 queue 2) for vnet header, vnet header will be zero unless TUNSETVETHDR is called I don't see how RSS makes anything different. For intermediate states, with your proposal, it still requires the userspace to assume a default state when doing TUNSETVETRSS etc. > > In theory, it is possible to have small ioctls that set only one scalar > value or even one bit, but that doesn't make sense. This principle helps > determine the optimal size of ioctls; it minimizes the complexity of > both the userspace and the kernel. Well the complexity is not measured by the number of ioctls or structures. I basically meant: 1) IFF_RSS to provision the device with the RSS features, this could be fetched from TUNGETIFF 2) Having a default state implemented in TUN that complies with the spec 3) TUNSETVET/GETHASH to send and receive RSS configuration > > > > >> > >>> > >>>> the ioctl should also do the configuration. Hence > >>>> that one ioctl should do all of the feature negotiation and configuration.
> >>>> > >>>>> > >>>>>> > >>>>>> The structures for two commands have unused or redundant fields and a > >>>>>> flexible array in the middle of the structure, but they are ABIs so we > >>>>>> can't change it. > >>>>>> > >>>>>> DPDK is another reason to define tuntap's own UAPIs. They don't care > >>>>>> unused or redundant fields and a flexible array in middle that are > >>>>>> present in the virtio spec. It will also not want to deal with the > >>>>>> requirement of little endian. Constructing struct virtio_net_rss_config > >>>>>> is an extra burden for DPDK. > >>>>> > >>>>> I meant for vhost-user implementation in DPDK, it needs to use > >>>>> virtio-net uAPI not tuntap's for example. > >>>> > >>>> The vhost-user implementation will use tuntap's UAPIs for its ethernet > >>>> device backend. > >>> > >>> That sounds pretty weird, vhost-user has nothing related to tuntap. > >> > >> My expression in the last email was weird. More precisely, the ethernet > >> backend of tuntap will use the UAPIs, and the vhost-user will use the > >> ethernet backend in turn. > > > > I don't understand what "ethernet backend" means here. > > It is a driver that serves the Ethernet Device API, which is agnostic on > application and driver. The Ethernet Device API, including RSS > configuration is documented at: > https://doc.dpdk.org/api/rte__ethdev_8h.html > > The Ethernet API are not bound to the virtio spec since they are not > specific to the vhost application or the tuntap driver. Hence they > operate in native endian and do not have extra fields, and tuntap's > structures are more suited to the ethernet backend than the virtio ones. vhost-user is the device implementation not an ethernet driver. Why did it use tuntap's uAPI and do the useless endian conversion twice? > > > > >> > >>> > >>>> It uses the generic interface of ethernet device so for > >>>> RSS it will use functions like rte_eth_dev_rss_hash_update() for > >>>> example. 
tuntap's UAPIs are more suited to implement these interfaces as > >>>> they operate in native endian and don't have extra fields. > >>> > >>> Nope, for example it needs to use le for virtio_net_hdr if a modern > >>> device is used. But it needs a "native" endian according to the guest > >>> endian via TUNSETVNETLE/BE. We don't have a choice as virtio-net hdr > >>> support in tuntap is much earlier than modern devices. > >>> > >>> Let's don't do the same thing (native endian) for tuntap as RSS > >>> depends on modern, so we know it must be le. > >> > >> virtio_net_hdr is the data path while the current discussion is about > >> the control path. All configuration knobs of tuntap operates in the > >> native endian. > > > > Because they are not directly related to virtio specification. We > > don't want to duplicate virtio-net with our own version every time E.g > > once RSSv2 or aRFS were implemented. Or I would even introduce a > > single uAPI to transport possible cvq commands then we can avoid > > inventing new ioctls that just transport cvq commands. > > > >> > >> So I think we should stick to the little endian for the data path while > >> we should stick to the native endian for the control path to maximize > >> the consistency. > > > > I don't see a reason to differ datapath from control path. Virtio-net > > uAPI has been reused by tuntap for more than a decade. > > tuntap's control path all operate in the native endian. It's just a description of the current status, people can easily say tuntap's data path all operate in the native endian before the support of version 1.0. > They never used > the endian of the data path in the control path. Once virtio uAPI can be reused, we need to do that. > > > > >> > >>> > >>> > >>>> > >>>> DPDk applications other than vhost-user also matter; they do not care > >>>> what virtio does at all. > >>>> > >>>> > >> > >>>>>> On the other hand, Constructing tuntap-specific structures is not that > >>>>>> complicated for VMMs. 
> >>>>> > >>>>> Not complicated but redundant. > >>>>> > >>>>>> A VMM will need to inspect struct > >>>>>> virtio_net_rss_config anyway to handle migration and check its size so > >>>>>> it can store the values it inspected to struct tun_vnet_hash and struct > >>>>>> tun_vnet_hash_rss and pass them to the kernel. > >>>>> > >>>>> I don't see how rss and hash reports differ from what we have now. > >>>>> Those inspections must be done anyhow for compatibility for example > >>>>> the check of offloading features. Such steps could not be eliminated > >>>>> no matter how we design the uAPI. > >>>> > >>>> I explained the difference between the virtio and tuntap UAPIs, not > >>>> between RSS and hash reporting. > >>> > >>> See above. > >>> > >>>> > >>>>> > >>>>>> > >>>>>> The overall userspace implementation will be simpler by having > >>>>>> structures specifically tailored for the communication between the > >>>>>> userspace and kernel. > >>>>> > >>>>> This is exactly how a good uAPI should behave. If uAPI in virtio-net > >>>>> can't do this, I don't understand why uAPI in tuntap can solve it. > >>>> > >>>> The UAPI in virtio-net cannot do it because it's already fixed and it > >>>> also needs to perform endian conversion for the VM use case. tuntap > >>>> doesn't have these restrictions. > >>> > >>> Same here. > >>> > >>>> > >>>>> > >>>>>> > >>>>>>> > >>>>>>>> > >>>>>>>> I don't think there is an advantage to split ioctls to follow the spec > >>>>>>>> after all. It makes sense if we can pass-through virtio commands to > >>>>>>>> tuntap, but it is not possible as ioctl operation codes are different > >>>>>>>> from virtio commands. > >>>>>>> > >>>>>>> I don't see a connection with the operation code. 
For example, we can > >>>>>>> add new uAPIs in virtio-net which could be something like: > >>>>>>> > >>>>>>> struct virtio_net_rss_config_header { > >>>>>>> __le32 hash_types; > >>>>>>> __le16 indirection_table_mask; > >>>>>>> __le16 unclassified_queue; > >>>>>>> __le16 indirection_table[]; > >>>>>>> } > >>>>>>> > >>>>>>> struct virtio_net_rss_config_tailer { > >>>>>>> __le16 max_tx_vq; > >>>>>>> u8 hash_key_length; > >>>>>>> u8 hash_key_data[]; > >>>>>>> } > >>>>>>> > >>>>>>> These two are used by TUNSETVNETRSS. And simply reuse the > >>>>>>> virtio_net_hash_config for TUNSETVETHASH. > >>>>>> > > With this, we can tweak the virtio-net driver with this new uAPI. Then > >>>>>>> tap* can reuse this. > >>>>>> > >>>>>> I implemented a UAPI and driver change accordingly: > >>>>>> https://lore.kernel.org/r/20250318-virtio-v1-0-344caf336ddd@daynix.com > >>>>>> > >>>>>> This is a nice improvement for the driver, but I still don't think it is > >>>>>> suited for the UAPI of tuntap. > >>>>> > >>>>> Any reason for this? It should work like virtio_net_hdr. > >>>>> > >>>>>> The requirements of extra fields and > >>>>>> little endian cannot be removed from the virtio spec but they are > >>>>>> irrelevant for tuntap. > >>>>> > >>>>> I don't understand this part. What fields are "extra" and need to be > >>>>> removed from the spec? > >>>> > >>>> All fields not included in struct tun_vnet_hash and struct > >>>> tun_vnet_hash_rss. Namely, for struct virtio_net_hash_config: > >>>> - reserved > >>>> - hash_key_length > >>>> - hash_key_data > >>>> > >>>> For struct virtio_net_rss_config: > >>>> - max_tx_vq > >>>> - hash_key_length > >>> > >>> See my above reply, and I basically meant > >>> > >>> TUNSETVETHASH accept struct virtio_net_hash_config; > >>> TUNSETVETRSS accept struct virtio_net_rss_config_hdr + struct > >>> virtio_net_rss_config_trailer; > >> > >> That still bring the extra fields I noted in the last email. > > > > I don't know how to define "extra" here. 
Let's summarize here: > > > > Method A: > > > > 1) virtio specification uses separate commands for hash_report and rss > > 2) hash_report and rss don't depend on each other > > 3) reuse virtio-net uAPI > > > > Method B: > > > > 1) trying to define and remove the "extra" fields in tuntap, and > > redefine it in TUNTAP > > > > It would always be much easier to start from simply reusing the > > virtio-net uAPI. Method B makes both the implementation and reviewing > > harder, as we need to > > > > 1) revisit the design of the virtio spec, this needs to be done in the > > virtio community not here > > 2) audit the difference between virtio spec and TUN/TAP, that's why we > > have a very long discussion here > > > > For example, the root cause of why you think the max_tx_vq is "extra" is: > > > > 1) The spec defines VIRTIO_NET_F_RSS and VIRTIO_NET_F_MQ as independent features > > 2) Your code tries to re-use IFF_MULTI_QUEUE for both VIRTIO_NET_F_RSS > > and VIRTIO_NET_F_MQ, this would have a lot of implications, e.g > > automatic steering might be applied when only RSS is negotiated etc > > > > The correct way to implement this is: > > > > 1) Introduce IFF_RSS and only set it during TUNSETIFF when device only > > offers RSS > > Please see the summary of "my points that support having one ioctl to > negotiate features and configuration" I wrote above. > > > 2) reuse virtio-net uAPI and accept max_tx_vq and use that to change > > the queue(or queue pairs) if necessary > > I don't think it's possible; we need file descriptors associated with > queues, which is something you cannot express with the virtio-net > structures. So: 1) Provisioning queues is still done via TUNSETIFF 2) We just need to hook max_tx_vq (the helpers were already there) to the helpers to enable and disable a queue instead of depending on the TUNSETQUEUE > > Regards, > Akihiko Odaki Thanks
On 2025/03/24 13:40, Jason Wang wrote: > On Fri, Mar 21, 2025 at 1:57 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >> >> On 2025/03/21 10:13, Jason Wang wrote: >>> On Thu, Mar 20, 2025 at 1:33 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>> >>>> On 2025/03/20 10:31, Jason Wang wrote: >>>>> On Wed, Mar 19, 2025 at 1:29 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>>>> >>>>>> On 2025/03/19 9:58, Jason Wang wrote: >>>>>>> On Tue, Mar 18, 2025 at 6:10 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>>>>>> >>>>>>>> On 2025/03/18 9:15, Jason Wang wrote: >>>>>>>>> On Mon, Mar 17, 2025 at 3:07 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>>>>>>>> >>>>>>>>>> On 2025/03/17 10:12, Jason Wang wrote: >>>>>>>>>>> On Wed, Mar 12, 2025 at 1:03 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>>>>>>>>>> >>>>>>>>>>>> On 2025/03/12 11:35, Jason Wang wrote: >>>>>>>>>>>>> On Tue, Mar 11, 2025 at 2:11 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>>>>>>>>>>>> >>>>>>>>>>>>>> On 2025/03/11 9:38, Jason Wang wrote: >>>>>>>>>>>>>>> On Mon, Mar 10, 2025 at 3:45 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> On 2025/03/10 12:55, Jason Wang wrote: >>>>>>>>>>>>>>>>> On Fri, Mar 7, 2025 at 7:01 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> Hash reporting >>>>>>>>>>>>>>>>>> ============== >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> Allow the guest to reuse the hash value to make receive steering >>>>>>>>>>>>>>>>>> consistent between the host and guest, and to save hash computation. >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> RSS >>>>>>>>>>>>>>>>>> === >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> RSS is a receive steering algorithm that can be negotiated to use with >>>>>>>>>>>>>>>>>> virtio_net. Conventionally the hash calculation was done by the VMM. >>>>>>>>>>>>>>>>>> However, computing the hash after the queue was chosen defeats the >>>>>>>>>>>>>>>>>> purpose of RSS. 
>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> Another approach is to use eBPF steering program. This approach has >>>>>>>>>>>>>>>>>> another downside: it cannot report the calculated hash due to the >>>>>>>>>>>>>>>>>> restrictive nature of eBPF steering program. >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> Introduce the code to perform RSS to the kernel in order to overcome >>>>>>>>>>>>>>>>>> thse challenges. An alternative solution is to extend the eBPF steering >>>>>>>>>>>>>>>>>> program so that it will be able to report to the userspace, but I didn't >>>>>>>>>>>>>>>>>> opt for it because extending the current mechanism of eBPF steering >>>>>>>>>>>>>>>>>> program as is because it relies on legacy context rewriting, and >>>>>>>>>>>>>>>>>> introducing kfunc-based eBPF will result in non-UAPI dependency while >>>>>>>>>>>>>>>>>> the other relevant virtualization APIs such as KVM and vhost_net are >>>>>>>>>>>>>>>>>> UAPIs. >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com> >>>>>>>>>>>>>>>>>> Tested-by: Lei Yang <leiyang@redhat.com> >>>>>>>>>>>>>>>>>> --- > > [...] > >>> >>>> >>>>> >>>>>> >>>>>>> >>>>>>>> >>>>>>>>> >>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>> What's more, we've already had virito-net uAPI. Why not simply reusing them? >>>>>>>>>> >>>>>>>>>> See the above. >>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>>> RSS and hash reporting must share >>>>>>>>>>>>>> this parameter when both are enabled at the same time; otherwise RSS may >>>>>>>>>>>>>> compute hash values that are not suited for hash reporting. >>>>>>>>>>>>> >>>>>>>>>>>>> Is this mandated by the spec? If yes, we can add a check. If not, >>>>>>>>>>>>> userspace risk themselves as a mis-configuration which we don't need >>>>>>>>>>>>> to bother. >>>>>>>>>>>> >>>>>>>>>>>> Yes, it is mandated. 
5.1.6.4.3 Hash calculation for incoming packets says: >>>>>>>>>>>> > A device attempts to calculate a per-packet hash in the following >>>>>>>>>>>> > cases: >>>>>>>>>>>> > >>>>>>>>>>>> > - The feature VIRTIO_NET_F_RSS was negotiated. The device uses the >>>>>>>>>>>> > hash to determine the receive virtqueue to place incoming packets. >>>>>>>>>>>> > - The feature VIRTIO_NET_F_HASH_REPORT was negotiated. The device >>>>>>>>>>>> > reports the hash value and the hash type with the packet. >>>>>>>>>>>> > >>>>>>>>>>>> > If the feature VIRTIO_NET_F_RSS was negotiated: >>>>>>>>>>>> > >>>>>>>>>>>> > - The device uses hash_types of the virtio_net_rss_config structure >>>>>>>>>>>> > as ’Enabled hash types’ bitmask. >>>>>>>>>>>> > - The device uses a key as defined in hash_key_data and >>>>>>>>>>>> hash_key_length of the virtio_net_rss_config structure (see >>>>>>>>>>>> > 5.1.6.5.7.1). >>>>>>>>>>>> > >>>>>>>>>>>> > If the feature VIRTIO_NET_F_RSS was not negotiated: >>>>>>>>>>>> > >>>>>>>>>>>> > - The device uses hash_types of the virtio_net_hash_config structure >>>>>>>>>>>> > as ’Enabled hash types’ bitmask. >>>>>>>>>>>> > - The device uses a key as defined in hash_key_data and >>>>>>>>>>>> > hash_key_length of the virtio_net_hash_config structure (see >>>>>>>>>>>> > .1.6.5.6.4). >>>>>>>>>>>> >>>>>>>>>>>> So when both VIRTIO_NET_F_RSS and VIRTIO_NET_F_HASH_REPORT are >>>>>>>>>>>> negotiated, virtio_net_rss_config not only controls RSS but also the >>>>>>>>>>>> reported hash values and types. They cannot be divergent. >>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>> Note that spec use different commands for hash_report and rss. >>>>>>>>>>>> >>>>>>>>>>>> TUNSETVNETHASH is different from these commands in terms that it also >>>>>>>>>>>> negotiates VIRTIO_NET_F_HASH_REPORT and VIRTIO_NET_F_RSS. 
>>>>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>> There Are different "issues" here: >>>>>>>>>>> >>>>>>>>>>> 1) Whether or not we need to use a unified API for negotiating RSS and >>>>>>>>>>> HASH_REPORT features >>>>>>>>>>> 2) Whether or not we need to sue a unified API for setting RSS and >>>>>>>>>>> HASH_REPORT configuration >>>>>>>>>>> >>>>>>>>>>> What I want to say is point 2. But what you raise is point 1. >>>>>>>>>>> >>>>>>>>>>> For simplicity, it looks to me like it's a call for having separated >>>>>>>>>>> ioctls for feature negotiation (for example via TUNSETIFF). You may >>>>>>>>>>> argue that either RSS or HASH_REPORT requires configurations, we can >>>>>>>>>>> just follow what spec defines or not (e.g what happens if >>>>>>>>>>> RSS/HASH_REPORT were negotiated but no configurations were set). >>>>>>>>>> >>>>>>>>>> Unfortunately TUNSETIFF does not fit in this use case. The flags set >>>>>>>>>> with TUNSETIFF are fixed, but the guest can request a different feature >>>>>>>>>> set anytime by resetting the device. >>>>>>>>> >>>>>>>>> TUNSETIFF, enables the device to be able to handle RSS/HASREPORT. >>>>>>>>> TUNSETHASH/RSS. dealing with RSS/HASH command from userspace. >>>>>>>> >>>>>>>> We also needs to be able to disable them at runtime so that we can >>>>>>>> handle resets. >>>>>>> >>>>>>> Via TUNSETHASH/RSS? I think it should have a way to accept parameters >>>>>>> that disable RSS or hash report. >>>>>> >>>>>> That's what this patch implements. TUNSETVNETHASH accepts parameters to >>>>>> choose what features to be enabled. >>>>>> >>>>>>> >>>>>>>> >>>>>>>>> >>>>>>>>> This is the way we used to do for multi queue and vnet header. >>>>>>>>> TUNSETIFF requires CAP_NET_ADMIN, this could be an extra safe guard >>>>>>>>> for unprivileged userspace. >>>>>>>> >>>>>>>> I intend to allow using this feature without privilege. A VMM is usually >>>>>>>> unprivileged and requiring a privilege to configure tuntap is too >>>>>>>> prohibitive. 
>>>>>>> >>>>>>> For safety, tun is not allowed to be created by unprivileged users. >>>>>>> And it's not to configure the tuntap dynamically, it's about telling >>>>>>> the function that tuntap can have (not necessarily enabled though) . >>>>>> >>>>>> I don't think we need another barrier for the new functions. Once an >>>>>> unprivileged user get a file descriptor of tuntap from a privileged >>>>>> user, they are free to enable RSS and/or hash reporting. >>>>> >>>>> Only if such a feature is allowed by the privileged user. >>>> >>>> I don't see a reason not to allow the feature to unprivileged users. It >>>> only complicates the setup. >>> >>> For safety, e.g reduce the chance for unprivileged user to explore >>> part of the kernel codes. >> >> It indeed reduces the attack surface, but it's fine without the >> reduction I guess? It's not a feature so complicated; > > I don't know how to define complicated things here but simplicity > doesn't necessarily mean safety. > >> I saw there were >> complicated changes like namespaces and io_uring that caused controversy >> when exposing them to unprivilged users, but this feature is not like >> them, I suppose. > > We limit feature setting through tun_set_iff in the past. Instead of > trying to argue if RSS is safe to be enabled without TUNSETIFF, > following what has been used in the past is always simpler and easier. > >> >>> >>>> >>>>> >>>>>> >>>>>>> >>>>>>>> >>>>>>>>> >>>>>>>>>> >>>>>>>>>> > >> In the virtio-net specification, it is not defined what would >>>>>>>>>> happen if >>>>>>>>>>>> these features are negotiated but the VIRTIO_NET_CTRL_MQ_RSS_CONFIG or >>>>>>>>>>>> VIRTIO_NET_CTRL_MQ_HASH_CONFIG commands are not sent. There is no such >>>>>>>>>>>> ambiguity with TUNSETVNETHASH. >>>>>>>>>>> >>>>>>>>>>> So I don't see advantages of unifying hash reports and rss into a >>>>>>>>>>> single ioctl. Let's just follow what has been done in the spec that >>>>>>>>>>> uses separated commands. 
Tuntap is not a good place to debate whether >>>>>>>>>>> those commands could be unified or not. We need to move it to the spec >>>>>>>>>>> but assuming spec has been done, it might be too late or too few >>>>>>>>>>> advantages for having another design. >>>>>>>>>> >>>>>>>>>> It makes sense for the spec to reuse the generic feature negotiation >>>>>>>>>> mechanism, but the situation is different for tuntap; we cannot use >>>>>>>>>> TUNSETIFF and need to define another. Then why don't we exploit this >>>>>>>>>> opportunity to have an interface with well-defined semantics? >>>>>>>>> >>>>>>>>> That's perfectly fine, but it needs to be done in virtio-net's uAPI >>>>>>>>> not tun's. What's more, if you think two commands are not >>>>>>>>> well-defined, let's fix that in the virtio spec first. >>>>>>>>> >>>>>>>>>> The virtio >>>>>>>>>> spec does its best as an interface between the host and guest and tuntap >>>>>>>>>> does its best as an UAPI. >>>>>>>>> >>>>>>>>> See above, let's fix the uAPI first. We don't want DPDK to use tun's >>>>>>>>> uAPI for RSS >>>>>>>> >>>>>>>> virtio-net's UAPI is for the virtio spec which has a capable generic >>>>>>>> feature negotiation mechanism. tuntap needs its own feature negotiation >>>>>>>> and it's nothing to do with virtio-net's UAPI. >>>>>>> >>>>>>> Well, I don't mean the part of the feature negotiation. I mean the >>>>>>> part for rss and hash report configuration. >>>>>> >>>>>> The feature negotiation still matters when deciding the granularity of >>>>>> ioctls. We need one ioctl for a feature negotiation, and to avoid having >>>>>> an intermediate state, >>>>> >>>>> I don't understand this. For example, driver can choose to >>>>> >>>>> 1) negotiate RSS >>>>> 2) do something else. >>>>> 3) configure RSS >>>>> >>>>> Spec doesn't require those two to be configured at the same time, so >>>>> "intermediate state" is allowed. >>>> >>>> The spec doesn't define what should happen in the intermediate state either. 
>>> >>> Yes but my point is that in the uAPI layer we don't need to care about >>> the intermediate state. It can just work as other features, e.g having >>> a default state after feature negotiation is more than enough. This is >>> the way we deal with other features like vnet header etc. >> > >> >>>> For a hardware implementation I think it's fine whatever the >>>> implementation defines as the intermediate state. But for the UAPI, it's >>>> better avoiding having such a definition to keep the interface minimal >>>> and maximize the UAPI stability. >>> >>> Well, even if you think there's an issue: >>> >>> 1) I don't see how we can avoid the intermediate state considering the guest >>> has such state >>> 2) We need to "fix" virtio spec and virtio-net first, tuntap is not >>> the right place to workaround virtio specific issues >> >> Let me summarize my points that support having one ioctl to negotiate >> features and configuration: >> >> The virtio spec has a generic feature negotiation mechanism and reusing >> it resulted in having an intermediate state between the feature >> negotiation and configuration. There is nothing wrong about that so we >> don't need to "fix" the virtio spec. > > Good to know that. > >> >> tuntap can also perform feature negotiation with TUNSETIFF, but >> TUNSETIFF has a few problems: > > TUNSETIFF is not feature negotiation, it's about device or queue > provisioning as well as the features. From the view of the virtio, it > is used to provision the device_features. For example, qemu only calls > TUNSETIFF when it tries to open the tap fd. > >> >> 1. It requires a privilege. One can argue that it reduces the attack >> surface and it indeed does, but it's fine without the reduction I guess? >> It's not a feature so complicated; I saw there were complicated changes >> like namespaces and io_uring that caused controversy when exposing them >> to unprivileged users, but this feature is not like them. 
> > I'm not asking to invent something new, but just reuse the security > stuff that has been already used for more than a decade. It would be > always easier to relax the check instead of enforce the check which > may break uAPI. I can imagine it would not take a lot of codes to > achieve this. > >> >> 2. It cannot change the enabled feature set at runtime. The virtio spec >> allows changing it by resetting. > > RSS is not the first feature of those requirements. TUN has > implemented various virtio specific features in the past. > >> >> So we need to design a set of new ioctls for both feature negotiation >> and configuration. When doing so, eliminating the intermediate state is >> a good principle to determine the optimal size of ioctls. > > As discussed, having a default state after TUNSETIFF is more than > enough. That is how a multi queue/vnet header works: > > 1) for multiqueue, when IFF_MULTIQUEUE is set, starting with 1 queue > 2) for vnet header, vnet header will be zero unless TUNSETVETHDR is called > > I don't see how RSS makes anything different. > > For intermediate states, with your proposal, it still requires the > userspace to assume a default state when doing TUNSETVETRSS etc. > >> >> In theory, it is possible to have small ioctls that set only one scalar >> value or even one bit, but that doesn't make sense. This principle helps >> determine the optimal size of ioctls; it minimizes the complexity of >> both the userspace and the kernel. > > Well the complexity is not measured by the number of ioctls or > structures. I basically meant: > > 1) IF_RSS to provision the device with the RSS features, this could be > fetched from TUNGETIFF It requires changes for libvirt, qemu-bridge-helper, and potentially other VMMs and DPDK. I would like to avoid such chores if the only reason to do so is the presence of prior examples. The features available via TUNSETIFF is fetched with TUNGETFEATURES, not TUNGETIFF. 
> 2) Having a default state implemented in TUN that complies with the spec If TUNSETIFF is only for device_feature, a natural choice will be to initialize driver_feature with VIRTIO_NET_F_HASH_REPORT and VIRTIO_NET_F_RSS unset. > 3) TUNSETVET/GETHASH to send and receive RSS configuration If TUNSETIFF is only for device_feature, we need two other ioctls: - one to set driver_feature and the RSS/hash reporting configuration - one that tells supported hash types The former is implemented with TUNSETVNETHASH. The latter is implemented with TUNGETVNETHASHCAP. > >> >>> >>>> >>>>> >>>>>> the ioctl should also do the configuration. Hence >>>>>> that one ioctl should do all of the feature negotiation and configuration. >>>>>> >>>>>>> >>>>>>>> >>>>>>>> The structures for two commands have unused or redundant fields and a >>>>>>>> flexible array in the middle of the structure, but they are ABIs so we >>>>>>>> can't change it. >>>>>>>> >>>>>>>> DPDK is another reason to define tuntap's own UAPIs. They don't care >>>>>>>> unused or redundant fields and a flexible array in middle that are >>>>>>>> present in the virtio spec. It will also not want to deal with the >>>>>>>> requirement of little endian. Constructing struct virtio_net_rss_config >>>>>>>> is an extra burden for DPDK. >>>>>>> >>>>>>> I meant for vhost-user implementation in DPDK, it needs to use >>>>>>> virtio-net uAPI not tuntap's for example. >>>>>> >>>>>> The vhost-user implementation will use tuntap's UAPIs for its ethernet >>>>>> device backend. >>>>> >>>>> That sounds pretty weird, vhost-user has nothing related to tuntap. >>>> >>>> My expression in the last email was weird. More precisely, the ethernet >>>> backend of tuntap will use the UAPIs, and the vhost-user will use the >>>> ethernet backend in turn. >>> >>> I don't understand what "ethernet backend" means here. >> >> It is a driver that serves the Ethernet Device API, which is agnostic on >> application and driver. 
The Ethernet Device API, including RSS >> configuration is documented at: >> https://doc.dpdk.org/api/rte__ethdev_8h.html >> >> The Ethernet API are not bound to the virtio spec since they are not >> specific to the vhost application or the tuntap driver. Hence they >> operate in native endian and do not have extra fields, and tuntap's >> structures are more suited to the ethernet backend than the virtio ones. > > vhost-user is the device implementation not an ethernet driver. Why > did it use tuntap's uAPI and do the useless endian conversion twice? Here the ethernet backend refers to the code that interacts with tuntap instead of vhost-user. Please note that I wrote "the vhost-user will use the ethernet backend" earlier. > >> >>> >>>> >>>>> >>>>>> It uses the generic interface of ethernet device so for >>>>>> RSS it will use functions like rte_eth_dev_rss_hash_update() for >>>>>> example. tuntap's UAPIs are more suited to implement these interfaces as >>>>>> they operate in native endian and don't have extra fields. >>>>> >>>>> Nope, for example it needs to use le for virtio_net_hdr if a modern >>>>> device is used. But it needs a "native" endian according to the guest >>>>> endian via TUNSETVNETLE/BE. We don't have a choice as virtio-net hdr >>>>> support in tuntap is much earlier than modern devices. >>>>> >>>>> Let's don't do the same thing (native endian) for tuntap as RSS >>>>> depends on modern, so we know it must be le. >>>> >>>> virtio_net_hdr is the data path while the current discussion is about >>>> the control path. All configuration knobs of tuntap operates in the >>>> native endian. >>> >>> Because they are not directly related to virtio specification. We >>> don't want to duplicate virtio-net with our own version every time E.g >>> once RSSv2 or aRFS were implemented. Or I would even introduce a >>> single uAPI to transport possible cvq commands then we can avoid >>> inventing new ioctls that just transport cvq commands. 
>>> >>>> >>>> So I think we should stick to the little endian for the data path while >>>> we should stick to the native endian for the control path to maximize >>>> the consistency. >>> >>> I don't see a reason to differ datapath from control path. Virtio-net >>> uAPI has been reused by tuntap for more than a decade. >> >> tuntap's control path all operate in the native endian. > > It's just a description of the current status, people can easily say > tuntap's data path all operate in the native endian before the support > of version 1.0. > >> They never used >> the endian of the data path in the control path. > > Once virtio uAPI can be reused, we need to do that. It discourages the usage of virtio UAPI; when the userspace wants native endian, why will we want to force using virtio UAPI, which requires little endian? > >> >>> >>>> >>>>> >>>>> >>>>>> >>>>>> DPDk applications other than vhost-user also matter; they do not care >>>>>> what virtio does at all. >>>>>> >>>>>> > >> >>>>>>>> On the other hand, Constructing tuntap-specific structures is not that >>>>>>>> complicated for VMMs. >>>>>>> >>>>>>> Not complicated but redundant. >>>>>>> >>>>>>>> A VMM will need to inspect struct >>>>>>>> virtio_net_rss_config anyway to handle migration and check its size so >>>>>>>> it can store the values it inspected to struct tun_vnet_hash and struct >>>>>>>> tun_vnet_hash_rss and pass them to the kernel. >>>>>>> >>>>>>> I don't see how rss and hash reports differ from what we have now. >>>>>>> Those inspections must be done anyhow for compatibility for example >>>>>>> the check of offloading features. Such steps could not be eliminated >>>>>>> no matter how we design the uAPI. >>>>>> >>>>>> I explained the difference between the virtio and tuntap UAPIs, not >>>>>> between RSS and hash reporting. >>>>> >>>>> See above. 
>>>>> >>>>>> >>>>>>> >>>>>>>> >>>>>>>> The overall userspace implementation will be simpler by having >>>>>>>> structures specifically tailored for the communication between the >>>>>>>> userspace and kernel. >>>>>>> >>>>>>> This is exactly how a good uAPI should behave. If uAPI in virtio-net >>>>>>> can't do this, I don't understand why uAPI in tuntap can solve it. >>>>>> >>>>>> The UAPI in virtio-net cannot do it because it's already fixed and it >>>>>> also needs to perform endian conversion for the VM use case. tuntap >>>>>> doesn't have these restrictions. >>>>> >>>>> Same here. >>>>> >>>>>> >>>>>>> >>>>>>>> >>>>>>>>> >>>>>>>>>> >>>>>>>>>> I don't think there is an advantage to split ioctls to follow the spec >>>>>>>>>> after all. It makes sense if we can pass-through virtio commands to >>>>>>>>>> tuntap, but it is not possible as ioctl operation codes are different >>>>>>>>>> from virtio commands. >>>>>>>>> >>>>>>>>> I don't see a connection with the operation code. For example, we can >>>>>>>>> add new uAPIs in virtio-net which could be something like: >>>>>>>>> >>>>>>>>> struct virtio_net_rss_config_header { >>>>>>>>> __le32 hash_types; >>>>>>>>> __le16 indirection_table_mask; >>>>>>>>> __le16 unclassified_queue; >>>>>>>>> __le16 indirection_table[]; >>>>>>>>> } >>>>>>>>> >>>>>>>>> struct virtio_net_rss_config_tailer { >>>>>>>>> __le16 max_tx_vq; >>>>>>>>> u8 hash_key_length; >>>>>>>>> u8 hash_key_data[]; >>>>>>>>> } >>>>>>>>> >>>>>>>>> These two are used by TUNSETVNETRSS. And simply reuse the >>>>>>>>> virtio_net_hash_config for TUNSETVETHASH. >>>>>>>> > > With this, we can tweak the virtio-net driver with this new uAPI. Then >>>>>>>>> tap* can reuse this. >>>>>>>> >>>>>>>> I implemented a UAPI and driver change accordingly: >>>>>>>> https://lore.kernel.org/r/20250318-virtio-v1-0-344caf336ddd@daynix.com >>>>>>>> >>>>>>>> This is a nice improvement for the driver, but I still don't think it is >>>>>>>> suited for the UAPI of tuntap. 
>>>>>>> >>>>>>> Any reason for this? It should work like virtio_net_hdr. >>>>>>> >>>>>>>> The requirements of extra fields and >>>>>>>> little endian cannot be removed from the virtio spec but they are >>>>>>>> irrelevant for tuntap. >>>>>>> >>>>>>> I don't understand this part. What fields are "extra" and need to be >>>>>>> removed from the spec? >>>>>> >>>>>> All fields not included in struct tun_vnet_hash and struct >>>>>> tun_vnet_hash_rss. Namely, for struct virtio_net_hash_config: >>>>>> - reserved >>>>>> - hash_key_length >>>>>> - hash_key_data >>>>>> >>>>>> For struct virtio_net_rss_config: >>>>>> - max_tx_vq >>>>>> - hash_key_length >>>>> >>>>> See my above reply, and I basically meant >>>>> >>>>> TUNSETVETHASH accepts struct virtio_net_hash_config; >>>>> TUNSETVETRSS accepts struct virtio_net_rss_config_hdr + struct >>>>> virtio_net_rss_config_trailer; >>>> >>>> That still brings the extra fields I noted in the last email. >>> >>> I don't know how to define "extra" here. Let's summarize here: >>> >>> Method A: >>> >>> 1) virtio specification uses separate commands for hash_report and rss >>> 2) hash_report and rss don't depend on each other >>> 3) reuse virtio-net uAPI >>> >>> Method B: >>> >>> 1) trying to define and remove the "extra" fields in tuntap, and >>> redefine it in TUNTAP >>> >>> It would always be much easier to start from simply reusing the >>> virtio-net uAPI. 
Method B makes both the implementation and reviewing >>> harder, as we need to >>> >>> 1) revisit the design of the virtio spec, this needs to be done in the >>> virtio community not here >>> 2) audit the difference between virtio spec and TUN/TAP, that's why we >>> have a very long discussion here >>> >>> For example, the root cause of why you think the max_tx_vq is "extra" is: >>> >>> 1) The spec defines VIRTIO_NET_F_RSS and VIRTIO_NET_F_MQ as independent features >>> 2) Your code tries to re-use IFF_MULTI_QUEUE for both VIRTIO_NET_F_RSS >>> and VIRTIO_NET_F_MQ, this would have a lot of implications, e.g >>> automatic steering might be applied when only RSS is negotiated etc >>> >>> The correct way to implement this is: >>> >>> 1) Introduce IFF_RSS and only set it during TUNSETIFF when device only >>> offers RSS >> >> Please see the summary of "my points that support having one ioctl to >> negotiate features and configuration" I wrote above. >> >>> 2) reuse virtio-net uAPI and accept max_tx_vq and use that to change >>> the queue (or queue pairs) if necessary >> >> I don't think it's possible; we need file descriptors associated with >> queues, which is something you cannot express with the virtio-net >> structures. > > So: > > 1) Provisioning queues were still done via TUNSETIFF > 2) We just need to hook max_tx_vq (the helpers were already there) to > the helpers to enable and disable a queue instead of depending on the > TUNSETQUEUE 1) is sufficient and 2) is redundant. We cannot provision queues according to max_tx_vq so the only way to remove this redundancy is not to have the field in tuntap's UAPI in the first place. Regards, Akihiko Odaki > >> >> Regards, >> Akihiko Odaki > > Thanks >
diff --git a/Documentation/networking/tuntap.rst b/Documentation/networking/tuntap.rst index 4d7087f727be5e37dfbf5066a9e9c872cc98898d..86b4ae8caa8ad062c1e558920be42ce0d4217465 100644 --- a/Documentation/networking/tuntap.rst +++ b/Documentation/networking/tuntap.rst @@ -206,6 +206,13 @@ enable is true we enable it, otherwise we disable it:: return ioctl(fd, TUNSETQUEUE, (void *)&ifr); } +3.4 Reference +------------- + +``linux/if_tun.h`` defines the interface described below: + +.. kernel-doc:: include/uapi/linux/if_tun.h + Universal TUN/TAP device driver Frequently Asked Question ========================================================= diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 1fd5acdc73c6af0e1a861867039c3624fc618e25..aecfd244dd83585fea2c5b815dcd787c58166c28 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -395,6 +395,7 @@ config TUN tristate "Universal TUN/TAP device driver support" depends on INET select CRC32 + select SKB_EXTENSIONS help TUN/TAP provides packet reception and transmission for user space programs. 
It can be viewed as a simple Point-to-Point or Ethernet diff --git a/drivers/net/tap.c b/drivers/net/tap.c index d4ece538f1b23789ca60caa6232690e4d0a4d14a..9428b63ec27e7f92e78a78afcb5e24383862c00d 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -49,6 +49,10 @@ struct major_info { struct list_head next; }; +struct tap_skb_cb { + struct virtio_net_hash hash; +}; + #define GOODCOPY_LEN 128 static const struct proto_ops tap_socket_ops; @@ -179,6 +183,22 @@ static void tap_put_queue(struct tap_queue *q) sock_put(&q->sk); } +static struct tap_skb_cb *tap_skb_cb(const struct sk_buff *skb) +{ + BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct tap_skb_cb)); + return (struct tap_skb_cb *)skb->cb; +} + +static struct virtio_net_hash *tap_add_hash(struct sk_buff *skb) +{ + return &tap_skb_cb(skb)->hash; +} + +static const struct virtio_net_hash *tap_find_hash(const struct sk_buff *skb) +{ + return &tap_skb_cb(skb)->hash; +} + /* * Select a queue based on the rxq of the device on which this packet * arrived. If the incoming device is not mq, calculate a flow hash @@ -189,6 +209,7 @@ static void tap_put_queue(struct tap_queue *q) static struct tap_queue *tap_get_queue(struct tap_dev *tap, struct sk_buff *skb) { + struct flow_keys_basic keys_basic; struct tap_queue *queue = NULL; /* Access to taps array is protected by rcu, but access to numvtaps * isn't. Below we use it to lookup a queue, but treat it as a hint @@ -196,17 +217,47 @@ static struct tap_queue *tap_get_queue(struct tap_dev *tap, * racing against queue removal. 
*/ int numvtaps = READ_ONCE(tap->numvtaps); + struct tun_vnet_hash_container *vnet_hash = rcu_dereference(tap->vnet_hash); __u32 rxq; + *tap_skb_cb(skb) = (struct tap_skb_cb) { + .hash = { .report = VIRTIO_NET_HASH_REPORT_NONE } + }; + if (!numvtaps) goto out; if (numvtaps == 1) goto single; + if (vnet_hash) { + if ((vnet_hash->common.flags & TUN_VNET_HASH_RSS)) { + rxq = tun_vnet_rss_select_queue(numvtaps, vnet_hash, skb, tap_add_hash); + queue = rcu_dereference(tap->taps[rxq]); + goto out; + } + + if (!skb->l4_hash && !skb->sw_hash) { + struct flow_keys keys; + + skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); + rxq = flow_hash_from_keys(&keys); + keys_basic = (struct flow_keys_basic) { + .control = keys.control, + .basic = keys.basic + }; + } else { + skb_flow_dissect_flow_keys_basic(NULL, skb, &keys_basic, NULL, 0, 0, 0, + FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); + rxq = skb->hash; + } + } else { + rxq = skb_get_hash(skb); + } + /* Check if we can use flow to select a queue */ - rxq = skb_get_hash(skb); if (rxq) { + tun_vnet_hash_report(vnet_hash, skb, &keys_basic, rxq, tap_add_hash); queue = rcu_dereference(tap->taps[rxq % numvtaps]); goto out; } @@ -711,11 +762,12 @@ static ssize_t tap_put_user(struct tap_queue *q, int total; if (q->flags & IFF_VNET_HDR) { - struct virtio_net_hdr vnet_hdr; + struct virtio_net_hdr_v1_hash vnet_hdr; vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz); - ret = tun_vnet_hdr_from_skb(q->flags, NULL, skb, &vnet_hdr); + ret = tun_vnet_hdr_from_skb(vnet_hdr_len, q->flags, NULL, skb, + tap_find_hash, &vnet_hdr); if (ret) return ret; @@ -992,6 +1044,16 @@ static long tap_ioctl(struct file *file, unsigned int cmd, rtnl_unlock(); return ret; + case TUNGETVNETHASHCAP: + return tun_vnet_ioctl_gethashcap(argp); + + case TUNSETVNETHASH: + rtnl_lock(); + tap = rtnl_dereference(q->tap); + ret = tap ? 
tun_vnet_ioctl_sethash(&tap->vnet_hash, true, argp) : -EBADFD; + rtnl_unlock(); + return ret; + case SIOCGIFHWADDR: rtnl_lock(); tap = tap_get_tap_dev(q); diff --git a/drivers/net/tun.c b/drivers/net/tun.c index d8f4d3e996a7a81d1f8b04635054081671a14f07..520013df416e93d3a50b46be9b53ae9ab410eab4 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -209,6 +209,7 @@ struct tun_struct { struct bpf_prog __rcu *xdp_prog; struct tun_prog __rcu *steering_prog; struct tun_prog __rcu *filter_prog; + struct tun_vnet_hash_container __rcu *vnet_hash; struct ethtool_link_ksettings link_ksettings; /* init args */ struct file *file; @@ -451,20 +452,37 @@ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash) e->rps_rxhash = hash; } +static struct virtio_net_hash *tun_add_hash(struct sk_buff *skb) +{ + return skb_ext_add(skb, SKB_EXT_TUN_VNET_HASH); +} + +static const struct virtio_net_hash *tun_find_hash(const struct sk_buff *skb) +{ + return skb_ext_find(skb, SKB_EXT_TUN_VNET_HASH); +} + /* We try to identify a flow through its rxhash. The reason that * we do not check rxq no. is because some cards(e.g 82599), chooses * the rxq based on the txq where the last packet of the flow comes. As * the userspace application move between processors, we may get a * different rxq no. here. 
*/ -static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb) +static u16 tun_automq_select_queue(struct tun_struct *tun, + const struct tun_vnet_hash_container *vnet_hash, + struct sk_buff *skb) { + struct flow_keys keys; + struct flow_keys_basic keys_basic; struct tun_flow_entry *e; u32 txq, numqueues; numqueues = READ_ONCE(tun->numqueues); - txq = __skb_get_hash_symmetric(skb); + memset(&keys, 0, sizeof(keys)); + skb_flow_dissect(skb, &flow_keys_dissector_symmetric, &keys, 0); + + txq = flow_hash_from_keys(&keys); e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq); if (e) { tun_flow_save_rps_rxhash(e, txq); @@ -473,6 +491,13 @@ static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb) txq = reciprocal_scale(txq, numqueues); } + keys_basic = (struct flow_keys_basic) { + .control = keys.control, + .basic = keys.basic + }; + tun_vnet_hash_report(vnet_hash, skb, &keys_basic, skb->l4_hash ? skb->hash : txq, + tun_add_hash); + return txq; } @@ -500,10 +525,17 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, u16 ret; rcu_read_lock(); - if (rcu_dereference(tun->steering_prog)) + if (rcu_dereference(tun->steering_prog)) { ret = tun_ebpf_select_queue(tun, skb); - else - ret = tun_automq_select_queue(tun, skb); + } else { + struct tun_vnet_hash_container *vnet_hash = rcu_dereference(tun->vnet_hash); + + if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS)) + ret = tun_vnet_rss_select_queue(READ_ONCE(tun->numqueues), vnet_hash, + skb, tun_add_hash); + else + ret = tun_automq_select_queue(tun, vnet_hash, skb); + } rcu_read_unlock(); return ret; @@ -1987,7 +2019,7 @@ static ssize_t tun_put_user_xdp(struct tun_struct *tun, ssize_t ret; if (tun->flags & IFF_VNET_HDR) { - struct virtio_net_hdr gso = { 0 }; + struct virtio_net_hdr_v1_hash gso = { 0 }; vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); ret = tun_vnet_hdr_put(vnet_hdr_sz, iter, &gso); @@ -2040,9 +2072,10 @@ static ssize_t tun_put_user(struct 
tun_struct *tun, } if (vnet_hdr_sz) { - struct virtio_net_hdr gso; + struct virtio_net_hdr_v1_hash gso; - ret = tun_vnet_hdr_from_skb(tun->flags, tun->dev, skb, &gso); + ret = tun_vnet_hdr_from_skb(vnet_hdr_sz, tun->flags, tun->dev, + skb, tun_find_hash, &gso); if (ret) return ret; @@ -2223,6 +2256,7 @@ static void tun_free_netdev(struct net_device *dev) security_tun_dev_free_security(tun->security); __tun_set_ebpf(tun, &tun->steering_prog, NULL); __tun_set_ebpf(tun, &tun->filter_prog, NULL); + kfree_rcu_mightsleep(rcu_access_pointer(tun->vnet_hash)); } static void tun_setup(struct net_device *dev) @@ -2921,13 +2955,9 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr) } static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p, - void __user *data) + int fd) { struct bpf_prog *prog; - int fd; - - if (copy_from_user(&fd, data, sizeof(fd))) - return -EFAULT; if (fd == -1) { prog = NULL; @@ -2993,7 +3023,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, int ifindex; int sndbuf; int ret; + int fd; bool do_notify = false; + struct tun_vnet_hash_container *vnet_hash; if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || (_IOC_TYPE(cmd) == SOCK_IOC_TYPE && cmd != SIOCGSKNS)) { @@ -3020,7 +3052,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, rtnl_lock(); tun = tun_get(tfile); - if (cmd == TUNSETIFF) { + switch (cmd) { + case TUNSETIFF: ret = -EEXIST; if (tun) goto unlock; @@ -3035,8 +3068,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, if (copy_to_user(argp, &ifr, ifreq_len)) ret = -EFAULT; goto unlock; - } - if (cmd == TUNSETIFINDEX) { + + case TUNSETIFINDEX: ret = -EPERM; if (tun) goto unlock; @@ -3050,6 +3083,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, ret = 0; tfile->ifindex = ifindex; goto unlock; + + case TUNGETVNETHASHCAP: + ret = tun_vnet_ioctl_gethashcap(argp); + goto unlock; } ret = -EBADFD; @@ -3230,11 +3267,27 @@ static long 
__tun_chr_ioctl(struct file *file, unsigned int cmd, break; case TUNSETSTEERINGEBPF: - ret = tun_set_ebpf(tun, &tun->steering_prog, argp); + if (get_user(fd, (int __user *)argp)) { + ret = -EFAULT; + break; + } + + vnet_hash = rtnl_dereference(tun->vnet_hash); + if (fd != -1 && vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS)) { + ret = -EBUSY; + break; + } + + ret = tun_set_ebpf(tun, &tun->steering_prog, fd); break; case TUNSETFILTEREBPF: - ret = tun_set_ebpf(tun, &tun->filter_prog, argp); + if (get_user(fd, (int __user *)argp)) { + ret = -EFAULT; + break; + } + + ret = tun_set_ebpf(tun, &tun->filter_prog, fd); break; case TUNSETCARRIER: @@ -3252,8 +3305,15 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, ret = open_related_ns(&net->ns, get_net_ns); break; + case TUNSETVNETHASH: + ret = tun_vnet_ioctl_sethash(&tun->vnet_hash, + !rtnl_dereference(tun->steering_prog), + argp); + break; + default: - ret = tun_vnet_ioctl(&tun->vnet_hdr_sz, &tun->flags, cmd, argp); + ret = tun_vnet_ioctl(&tun->vnet_hdr_sz, &tun->flags, + cmd, argp); break; } diff --git a/drivers/net/tun_vnet.h b/drivers/net/tun_vnet.h index 58b9ac7a5fc4084c789fe94fe36b5f8631bf1fa4..8e7d51fb0b4742cef56e7c5ad778b156cc654bed 100644 --- a/drivers/net/tun_vnet.h +++ b/drivers/net/tun_vnet.h @@ -6,6 +6,16 @@ #define TUN_VNET_LE 0x80000000 #define TUN_VNET_BE 0x40000000 +typedef struct virtio_net_hash *(*tun_vnet_hash_add)(struct sk_buff *); +typedef const struct virtio_net_hash *(*tun_vnet_hash_find)(const struct sk_buff *); + +struct tun_vnet_hash_container { + struct tun_vnet_hash common; + struct tun_vnet_hash_rss rss; + u32 rss_key[VIRTIO_NET_RSS_MAX_KEY_SIZE]; + u16 rss_indirection_table[]; +}; + static inline bool tun_vnet_legacy_is_little_endian(unsigned int flags) { bool be = IS_ENABLED(CONFIG_TUN_VNET_CROSS_LE) && @@ -107,6 +117,123 @@ static inline long tun_vnet_ioctl(int *vnet_hdr_sz, unsigned int *flags, } } +static inline long tun_vnet_ioctl_gethashcap(void __user 
*argp) +{ + static const struct tun_vnet_hash cap = { + .flags = TUN_VNET_HASH_REPORT | TUN_VNET_HASH_RSS, + .types = VIRTIO_NET_SUPPORTED_HASH_TYPES + }; + + return copy_to_user(argp, &cap, sizeof(cap)) ? -EFAULT : 0; +} + +static inline long tun_vnet_ioctl_sethash(struct tun_vnet_hash_container __rcu **hashp, + bool can_rss, void __user *argp) +{ + struct tun_vnet_hash hash_buf; + struct tun_vnet_hash_container *hash; + + if (copy_from_user(&hash_buf, argp, sizeof(hash_buf))) + return -EFAULT; + argp = (struct tun_vnet_hash __user *)argp + 1; + + if (hash_buf.flags & TUN_VNET_HASH_RSS) { + struct tun_vnet_hash_rss rss; + size_t indirection_table_size; + size_t key_size; + size_t size; + + if (!can_rss) + return -EBUSY; + + if (copy_from_user(&rss, argp, sizeof(rss))) + return -EFAULT; + argp = (struct tun_vnet_hash_rss __user *)argp + 1; + + indirection_table_size = ((size_t)rss.indirection_table_mask + 1) * 2; + key_size = virtio_net_hash_key_length(hash_buf.types); + size = struct_size(hash, rss_indirection_table, + (size_t)rss.indirection_table_mask + 1); + + hash = kmalloc(size, GFP_KERNEL); + if (!hash) + return -ENOMEM; + + if (copy_from_user(hash->rss_indirection_table, + argp, indirection_table_size)) { + kfree(hash); + return -EFAULT; + } + argp = (u16 __user *)argp + rss.indirection_table_mask + 1; + + if (copy_from_user(hash->rss_key, argp, key_size)) { + kfree(hash); + return -EFAULT; + } + + virtio_net_toeplitz_convert_key(hash->rss_key, key_size); + hash->rss = rss; + } else { + hash = kmalloc(sizeof(hash->common), GFP_KERNEL); + if (!hash) + return -ENOMEM; + } + + hash->common = hash_buf; + kfree_rcu_mightsleep(rcu_replace_pointer_rtnl(*hashp, hash)); + return 0; +} + +static void tun_vnet_hash_report(const struct tun_vnet_hash_container *hash, + struct sk_buff *skb, + const struct flow_keys_basic *keys, + u32 value, + tun_vnet_hash_add vnet_hash_add) +{ + struct virtio_net_hash *report; + + if (!hash || !(hash->common.flags & 
TUN_VNET_HASH_REPORT)) + return; + + report = vnet_hash_add(skb); + if (!report) + return; + + *report = (struct virtio_net_hash) { + .report = virtio_net_hash_report(hash->common.types, keys), + .value = value + }; +} + +static u16 tun_vnet_rss_select_queue(u32 numqueues, + const struct tun_vnet_hash_container *hash, + struct sk_buff *skb, + tun_vnet_hash_add vnet_hash_add) +{ + struct virtio_net_hash *report; + struct virtio_net_hash ret; + u16 txq, index; + + if (!numqueues) + return 0; + + virtio_net_hash_rss(skb, hash->common.types, hash->rss_key, &ret); + + if (!ret.report) + return hash->rss.unclassified_queue % numqueues; + + if (hash->common.flags & TUN_VNET_HASH_REPORT) { + report = vnet_hash_add(skb); + if (report) + *report = ret; + } + + index = ret.value & hash->rss.indirection_table_mask; + txq = READ_ONCE(hash->rss_indirection_table[index]); + + return txq % numqueues; +} + static inline int tun_vnet_hdr_get(int sz, unsigned int flags, struct iov_iter *from, struct virtio_net_hdr *hdr) @@ -135,15 +262,17 @@ static inline int tun_vnet_hdr_get(int sz, unsigned int flags, } static inline int tun_vnet_hdr_put(int sz, struct iov_iter *iter, - const struct virtio_net_hdr *hdr) + const struct virtio_net_hdr_v1_hash *hdr) { + int content_sz = MIN(sizeof(*hdr), sz); + if (unlikely(iov_iter_count(iter) < sz)) return -EINVAL; - if (unlikely(copy_to_iter(hdr, sizeof(*hdr), iter) != sizeof(*hdr))) + if (unlikely(copy_to_iter(hdr, content_sz, iter) != content_sz)) return -EFAULT; - if (iov_iter_zero(sz - sizeof(*hdr), iter) != sz - sizeof(*hdr)) + if (iov_iter_zero(sz - content_sz, iter) != sz - content_sz) return -EFAULT; return 0; @@ -155,26 +284,38 @@ static inline int tun_vnet_hdr_to_skb(unsigned int flags, struct sk_buff *skb, return virtio_net_hdr_to_skb(skb, hdr, tun_vnet_is_little_endian(flags)); } -static inline int tun_vnet_hdr_from_skb(unsigned int flags, +static inline int tun_vnet_hdr_from_skb(int sz, unsigned int flags, const struct net_device *dev, 
const struct sk_buff *skb, - struct virtio_net_hdr *hdr) + tun_vnet_hash_find vnet_hash_find, + struct virtio_net_hdr_v1_hash *hdr) { int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0; + const struct virtio_net_hash *report = sz < sizeof(struct virtio_net_hdr_v1_hash) ? + NULL : vnet_hash_find(skb); + + *hdr = (struct virtio_net_hdr_v1_hash) { + .hash_report = VIRTIO_NET_HASH_REPORT_NONE + }; + + if (report) { + hdr->hash_value = cpu_to_le32(report->value); + hdr->hash_report = cpu_to_le16(report->report); + } - if (virtio_net_hdr_from_skb(skb, hdr, + if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)hdr, tun_vnet_is_little_endian(flags), true, vlan_hlen)) { struct skb_shared_info *sinfo = skb_shinfo(skb); if (net_ratelimit()) { netdev_err(dev, "unexpected GSO type: 0x%x, gso_size %d, hdr_len %d\n", - sinfo->gso_type, tun_vnet16_to_cpu(flags, hdr->gso_size), - tun_vnet16_to_cpu(flags, hdr->hdr_len)); + sinfo->gso_type, tun_vnet16_to_cpu(flags, hdr->hdr.gso_size), + tun_vnet16_to_cpu(flags, hdr->hdr.hdr_len)); print_hex_dump(KERN_ERR, "tun: ", DUMP_PREFIX_NONE, 16, 1, skb->head, - min(tun_vnet16_to_cpu(flags, hdr->hdr_len), 64), true); + min(tun_vnet16_to_cpu(flags, hdr->hdr.hdr_len), 64), true); } WARN_ON_ONCE(1); return -EINVAL; diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h index 553552fa635c3e1e53d1a63c203d32e4c4fd5a4f..7334c46a3f101675a0d4e5a036987cfe18842f9f 100644 --- a/include/linux/if_tap.h +++ b/include/linux/if_tap.h @@ -31,6 +31,7 @@ static inline struct ptr_ring *tap_get_ptr_ring(struct file *f) #define MAX_TAP_QUEUES 256 struct tap_queue; +struct tun_vnet_hash_container; struct tap_dev { struct net_device *dev; @@ -43,6 +44,7 @@ struct tap_dev { int numqueues; netdev_features_t tap_features; int minor; + struct tun_vnet_hash_container __rcu *vnet_hash; void (*update_features)(struct tap_dev *tap, netdev_features_t features); void (*count_tx_dropped)(struct tap_dev *tap); diff --git a/include/linux/skbuff.h 
b/include/linux/skbuff.h index bb2b751d274acff931281a72e8b4b0c699b4e8af..cdd793f1c360ad5f63fcc4cbf67d845f5e2ccf6f 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4842,6 +4842,9 @@ enum skb_ext_id { #endif #if IS_ENABLED(CONFIG_MCTP_FLOWS) SKB_EXT_MCTP, +#endif +#if IS_ENABLED(CONFIG_TUN) + SKB_EXT_TUN_VNET_HASH, #endif SKB_EXT_NUM, /* must be last */ }; diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index 287cdc81c9390c289a30545aa7ed23d81c3329d3..4887f97500a870c7ef3c96a5837b2d0a5a225040 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -62,6 +62,42 @@ #define TUNSETCARRIER _IOW('T', 226, int) #define TUNGETDEVNETNS _IO('T', 227) +/** + * define TUNGETVNETHASHCAP - ioctl to get virtio_net hashing capability. + * + * The argument is a pointer to &struct tun_vnet_hash which will store the + * maximal virtio_net hashing configuration. + */ +#define TUNGETVNETHASHCAP _IOR('T', 228, struct tun_vnet_hash) + +/** + * define TUNSETVNETHASH - ioctl to configure virtio_net hashing + * + * The argument is a pointer to &struct tun_vnet_hash. + * + * The argument is a pointer to the compound of the following in order if + * %TUN_VNET_HASH_RSS is set: + * + * 1. &struct tun_vnet_hash + * 2. &struct tun_vnet_hash_rss + * 3. Indirection table + * 4. Key + * + * The %TUN_VNET_HASH_REPORT flag set with this ioctl will be effective only + * after calling the %TUNSETVNETHDRSZ ioctl with a number greater than or equal + * to the size of &struct virtio_net_hdr_v1_hash. + * + * The members added to the legacy header by %TUN_VNET_HASH_REPORT flag will + * always be little-endian. + * + * This ioctl results in %EBADFD if the underlying device is deleted. It affects + * all queues attached to the same device. + * + * This ioctl currently has no effect on XDP packets and packets with + * queue_mapping set by TC. 
+ */ +#define TUNSETVNETHASH _IOW('T', 229, struct tun_vnet_hash) + /* TUNSETIFF ifr flags */ #define IFF_TUN 0x0001 #define IFF_TAP 0x0002 @@ -115,4 +151,43 @@ struct tun_filter { __u8 addr[][ETH_ALEN]; }; +/** + * define TUN_VNET_HASH_REPORT - Request virtio_net hash reporting for vhost + */ +#define TUN_VNET_HASH_REPORT 0x0001 + +/** + * define TUN_VNET_HASH_RSS - Request virtio_net RSS + * + * This is mutually exclusive with eBPF steering program. + */ +#define TUN_VNET_HASH_RSS 0x0002 + +/** + * struct tun_vnet_hash - virtio_net hashing configuration + * @flags: + * Bitmask consists of %TUN_VNET_HASH_REPORT and %TUN_VNET_HASH_RSS + * @pad: + * Should be filled with zero before passing to %TUNSETVNETHASH + * @types: + * Bitmask of allowed hash types + */ +struct tun_vnet_hash { + __u16 flags; + __u8 pad[2]; + __u32 types; +}; + +/** + * struct tun_vnet_hash_rss - virtio_net RSS configuration + * @indirection_table_mask: + * Bitmask to be applied to the indirection table index + * @unclassified_queue: + * The index of the queue to place unclassified packets in + */ +struct tun_vnet_hash_rss { + __u16 indirection_table_mask; + __u16 unclassified_queue; +}; + #endif /* _UAPI__IF_TUN_H */ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 7b03b64fdcb276f68ce881d1d8da8e4c6b897efc..aa2a091b649f0c9d6e0196f34f345ba78b5498fb 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -64,6 +64,7 @@ #include <linux/mpls.h> #include <linux/kcov.h> #include <linux/iov_iter.h> +#include <linux/virtio_net.h> #include <net/protocol.h> #include <net/dst.h> @@ -4969,6 +4970,9 @@ static const u8 skb_ext_type_len[] = { #if IS_ENABLED(CONFIG_MCTP_FLOWS) [SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow), #endif +#if IS_ENABLED(CONFIG_TUN) + [SKB_EXT_TUN_VNET_HASH] = SKB_EXT_CHUNKSIZEOF(struct virtio_net_hash), +#endif }; static __always_inline unsigned int skb_ext_total_length(void)