diff mbox series

[net-next,v6,06/12] net: homa: create homa_peer.h and homa_peer.c

Message ID 20250115185937.1324-7-ouster@cs.stanford.edu (mailing list archive)
State New
Delegated to: Netdev Maintainers
Headers show
Series Begin upstreaming Homa transport protocol | expand

Commit Message

John Ousterhout Jan. 15, 2025, 6:59 p.m. UTC
Homa needs to keep a small amount of information for each peer that
it has communicated with. These files define that state and provide
functions for storing and accessing it.

Signed-off-by: John Ousterhout <ouster@cs.stanford.edu>
---
 net/homa/homa_peer.c | 366 +++++++++++++++++++++++++++++++++++++++++++
 net/homa/homa_peer.h | 233 +++++++++++++++++++++++++++
 2 files changed, 599 insertions(+)
 create mode 100644 net/homa/homa_peer.c
 create mode 100644 net/homa/homa_peer.h
diff mbox series

Patch

diff --git a/net/homa/homa_peer.c b/net/homa/homa_peer.c
new file mode 100644
index 000000000000..04056936bf71
--- /dev/null
+++ b/net/homa/homa_peer.c
@@ -0,0 +1,366 @@ 
+// SPDX-License-Identifier: BSD-2-Clause
+
+/* This file provides functions related to homa_peer and homa_peertab
+ * objects.
+ */
+
+#include "homa_impl.h"
+#include "homa_peer.h"
+#include "homa_rpc.h"
+
+/**
+ * homa_peertab_init() - Constructor for homa_peertabs.
+ * @peertab:  The object to initialize; previous contents are discarded.
+ *
+ * Return:    0 in the normal case, or a negative errno if there was a problem.
+ */
+int homa_peertab_init(struct homa_peertab *peertab)
+{
+	/* Note: when we return, the object must be initialized so it's
+	 * safe to call homa_peertab_destroy, even if this function returns
+	 * an error.
+	 */
+	int i;
+
+	spin_lock_init(&peertab->write_lock);
+	INIT_LIST_HEAD(&peertab->dead_dsts);
+	peertab->buckets = vmalloc(HOMA_PEERTAB_BUCKETS *
+				   sizeof(*peertab->buckets));
+	if (!peertab->buckets)
+		return -ENOMEM;
+	for (i = 0; i < HOMA_PEERTAB_BUCKETS; i++)
+		INIT_HLIST_HEAD(&peertab->buckets[i]);
+	return 0;
+}
+
+/**
+ * homa_peertab_destroy() - Destructor for homa_peertabs. After this
+ * function returns, it is unsafe to use any results from previous calls
+ * to homa_peer_find, since all existing homa_peer objects will have been
+ * destroyed.
+ * @peertab:  The table to destroy.
+ */
+void homa_peertab_destroy(struct homa_peertab *peertab)
+{
+	struct hlist_node *next;
+	struct homa_peer *peer;
+	int i;
+
+	if (!peertab->buckets)
+		return;
+
+	for (i = 0; i < HOMA_PEERTAB_BUCKETS; i++) {
+		hlist_for_each_entry_safe(peer, next, &peertab->buckets[i],
+					  peertab_links) {
+			dst_release(peer->dst);
+			kfree(peer);
+		}
+	}
+	vfree(peertab->buckets);
+	homa_peertab_gc_dsts(peertab, ~0);
+}
+
+/**
+ * homa_peertab_get_peers() - Return information about all of the peers
+ * currently known
+ * @peertab:    The table to search for peers.
+ * @num_peers:  Modified to hold the number of peers returned.
+ * Return:      kmalloced array holding pointers to all known peers. The
+ *		caller must free this. If there is an error, or if there
+ *	        are no peers, NULL is returned.
+ */
+struct homa_peer **homa_peertab_get_peers(struct homa_peertab *peertab,
+					  int *num_peers)
+{
+	struct homa_peer **result;
+	struct hlist_node *next;
+	struct homa_peer *peer;
+	int i, count;
+
+	*num_peers = 0;
+	if (!peertab->buckets)
+		return NULL;
+
+	/* Figure out how many peers there are. */
+	count = 0;
+	for (i = 0; i < HOMA_PEERTAB_BUCKETS; i++) {
+		hlist_for_each_entry_safe(peer, next, &peertab->buckets[i],
+					  peertab_links)
+			count++;
+	}
+
+	if (count == 0)
+		return NULL;
+
+	result = kmalloc_array(count, sizeof(peer), GFP_KERNEL);
+	if (!result)
+		return NULL;
+	*num_peers = count;
+	count = 0;
+	for (i = 0; i < HOMA_PEERTAB_BUCKETS; i++) {
+		hlist_for_each_entry_safe(peer, next, &peertab->buckets[i],
+					  peertab_links) {
+			result[count] = peer;
+			count++;
+		}
+	}
+	return result;
+}
+
+/**
+ * homa_peertab_gc_dsts() - Invoked to free unused dst_entries, if it is
+ * safe to do so.
+ * @peertab:       The table in which to free entries.
+ * @now:           Current time, in sched_clock() units; entries with expiration
+ *                 dates no later than this will be freed. Specify ~0 to
+ *                 free all entries.
+ */
+void homa_peertab_gc_dsts(struct homa_peertab *peertab, __u64 now)
+{
+	while (!list_empty(&peertab->dead_dsts)) {
+		struct homa_dead_dst *dead =
+			list_first_entry(&peertab->dead_dsts,
+					 struct homa_dead_dst, dst_links);
+		if (dead->gc_time > now)
+			break;
+		dst_release(dead->dst);
+		list_del(&dead->dst_links);
+		kfree(dead);
+	}
+}
+
+/**
+ * homa_peer_find() - Returns the peer associated with a given host; creates
+ * a new homa_peer if one doesn't already exist.
+ * @peertab:    Peer table in which to perform lookup.
+ * @addr:       Address of the desired host: IPv4 addresses are represented
+ *              as IPv4-mapped IPv6 addresses.
+ * @inet:       Socket that will be used for sending packets.
+ *
+ * Return:      The peer associated with @addr, or a negative errno if an
+ *              error occurred. The caller can retain this pointer
+ *              indefinitely: peer entries are never deleted except in
+ *              homa_peertab_destroy.
+ */
+struct homa_peer *homa_peer_find(struct homa_peertab *peertab,
+				 const struct in6_addr *addr,
+				 struct inet_sock *inet)
+{
+	/* Note: this function uses RCU operators to ensure safety even
+	 * if a concurrent call is adding a new entry.
+	 */
+	struct homa_peer *peer;
+	struct dst_entry *dst;
+
+	__u32 bucket = hash_32((__force __u32)addr->in6_u.u6_addr32[0],
+			       HOMA_PEERTAB_BUCKET_BITS);
+
+	bucket ^= hash_32((__force __u32)addr->in6_u.u6_addr32[1],
+			  HOMA_PEERTAB_BUCKET_BITS);
+	bucket ^= hash_32((__force __u32)addr->in6_u.u6_addr32[2],
+			  HOMA_PEERTAB_BUCKET_BITS);
+	bucket ^= hash_32((__force __u32)addr->in6_u.u6_addr32[3],
+			  HOMA_PEERTAB_BUCKET_BITS);
+	hlist_for_each_entry_rcu(peer, &peertab->buckets[bucket],
+				 peertab_links) {
+		if (ipv6_addr_equal(&peer->addr, addr))
+			return peer;
+	}
+
+	/* No existing entry; create a new one.
+	 *
+	 * Note: after we acquire the lock, we have to check again to
+	 * make sure the entry still doesn't exist (it might have been
+	 * created by a concurrent invocation of this function).
+	 */
+	spin_lock_bh(&peertab->write_lock);
+	hlist_for_each_entry_rcu(peer, &peertab->buckets[bucket],
+				 peertab_links) {
+		if (ipv6_addr_equal(&peer->addr, addr))
+			goto done;
+	}
+	peer = kmalloc(sizeof(*peer), GFP_ATOMIC);
+	if (!peer) {
+		peer = (struct homa_peer *)ERR_PTR(-ENOMEM);
+		goto done;
+	}
+	peer->addr = *addr;
+	dst = homa_peer_get_dst(peer, inet);
+	if (IS_ERR(dst)) {
+		kfree(peer);
+		peer = (struct homa_peer *)PTR_ERR(dst);
+		goto done;
+	}
+	peer->dst = dst;
+	INIT_LIST_HEAD(&peer->grantable_rpcs);
+	INIT_LIST_HEAD(&peer->grantable_links);
+	hlist_add_head_rcu(&peer->peertab_links, &peertab->buckets[bucket]);
+	peer->outstanding_resends = 0;
+	peer->most_recent_resend = 0;
+	peer->least_recent_rpc = NULL;
+	peer->least_recent_ticks = 0;
+	peer->current_ticks = -1;
+	peer->resend_rpc = NULL;
+	peer->num_acks = 0;
+	spin_lock_init(&peer->ack_lock);
+
+done:
+	spin_unlock_bh(&peertab->write_lock);
+	return peer;
+}
+
+/**
+ * homa_dst_refresh() - This method is called when the dst for a peer is
+ * obsolete; it releases that dst and creates a new one.
+ * @peertab:  Table containing the peer.
+ * @peer:     Peer whose dst is obsolete.
+ * @hsk:      Socket that will be used to transmit data to the peer.
+ */
+void homa_dst_refresh(struct homa_peertab *peertab, struct homa_peer *peer,
+		      struct homa_sock *hsk)
+{
+	struct homa_dead_dst *save_dead;
+	struct dst_entry *dst;
+	__u64 now;
+
+	/* Need to keep around the current entry for a while in case
+	 * someone is using it. If we can't do that, then don't update
+	 * the entry.
+	 */
+	save_dead = kmalloc(sizeof(*save_dead), GFP_ATOMIC);
+	if (unlikely(!save_dead))
+		return;
+
+	dst = homa_peer_get_dst(peer, &hsk->inet);
+	if (IS_ERR(dst)) {
+		kfree(save_dead);
+		return;
+	}
+
+	spin_lock_bh(&peertab->write_lock);
+	now = sched_clock();
+	save_dead->dst = peer->dst;
+	save_dead->gc_time = now + 100000000;   /* 100 ms */
+	list_add_tail(&save_dead->dst_links, &peertab->dead_dsts);
+	homa_peertab_gc_dsts(peertab, now);
+	peer->dst = dst;
+	spin_unlock_bh(&peertab->write_lock);
+}
+
+/**
+ * homa_peer_get_dst() - Find an appropriate dst structure (either IPv4
+ * or IPv6) for a peer.
+ * @peer:   The peer for which a dst is needed. Note: this peer's flow
+ *          struct will be overwritten.
+ * @inet:   Socket that will be used for sending packets.
+ * Return:  The dst structure (or an ERR_PTR).
+ */
+struct dst_entry *homa_peer_get_dst(struct homa_peer *peer,
+				    struct inet_sock *inet)
+{
+	memset(&peer->flow, 0, sizeof(peer->flow));
+	if (inet->sk.sk_family == AF_INET) {
+		struct rtable *rt;
+
+		flowi4_init_output(&peer->flow.u.ip4, inet->sk.sk_bound_dev_if,
+				   inet->sk.sk_mark, inet->tos,
+				   RT_SCOPE_UNIVERSE, inet->sk.sk_protocol, 0,
+				   peer->addr.in6_u.u6_addr32[3],
+				   inet->inet_saddr, 0, 0, inet->sk.sk_uid);
+		security_sk_classify_flow(&inet->sk, &peer->flow.u.__fl_common);
+		rt = ip_route_output_flow(sock_net(&inet->sk),
+					  &peer->flow.u.ip4, &inet->sk);
+		if (IS_ERR(rt))
+			return (struct dst_entry *)(PTR_ERR(rt));
+		return &rt->dst;
+	}
+	peer->flow.u.ip6.flowi6_oif = inet->sk.sk_bound_dev_if;
+	peer->flow.u.ip6.flowi6_iif = LOOPBACK_IFINDEX;
+	peer->flow.u.ip6.flowi6_mark = inet->sk.sk_mark;
+	peer->flow.u.ip6.flowi6_scope = RT_SCOPE_UNIVERSE;
+	peer->flow.u.ip6.flowi6_proto = inet->sk.sk_protocol;
+	peer->flow.u.ip6.flowi6_flags = 0;
+	peer->flow.u.ip6.flowi6_secid = 0;
+	peer->flow.u.ip6.flowi6_tun_key.tun_id = 0;
+	peer->flow.u.ip6.flowi6_uid = inet->sk.sk_uid;
+	peer->flow.u.ip6.daddr = peer->addr;
+	peer->flow.u.ip6.saddr = inet->pinet6->saddr;
+	peer->flow.u.ip6.fl6_dport = 0;
+	peer->flow.u.ip6.fl6_sport = 0;
+	peer->flow.u.ip6.mp_hash = 0;
+	peer->flow.u.ip6.__fl_common.flowic_tos = inet->tos;
+	peer->flow.u.ip6.flowlabel = ip6_make_flowinfo(inet->tos, 0);
+	security_sk_classify_flow(&inet->sk, &peer->flow.u.__fl_common);
+	return ip6_dst_lookup_flow(sock_net(&inet->sk), &inet->sk,
+			&peer->flow.u.ip6, NULL);
+}
+
+/**
+ * homa_peer_lock_slow() - This function implements the slow path for
+ * acquiring a peer's @unacked_lock. It is invoked when the lock isn't
+ * immediately available. It waits for the lock, but also records statistics
+ * about the waiting time.
+ * @peer:    Peer to  lock.
+ */
+void homa_peer_lock_slow(struct homa_peer *peer)
+	__acquires(&peer->ack_lock)
+{
+	spin_lock_bh(&peer->ack_lock);
+}
+
+/**
+ * homa_peer_add_ack() - Add a given RPC to the list of unacked
+ * RPCs for its server. Once this method has been invoked, it's safe
+ * to delete the RPC, since it will eventually be acked to the server.
+ * @rpc:    Client RPC that has now completed.
+ */
+void homa_peer_add_ack(struct homa_rpc *rpc)
+{
+	struct homa_peer *peer = rpc->peer;
+	struct homa_ack_hdr ack;
+
+	homa_peer_lock(peer);
+	if (peer->num_acks < HOMA_MAX_ACKS_PER_PKT) {
+		peer->acks[peer->num_acks].client_id = cpu_to_be64(rpc->id);
+		peer->acks[peer->num_acks].server_port = htons(rpc->dport);
+		peer->num_acks++;
+		homa_peer_unlock(peer);
+		return;
+	}
+
+	/* The peer has filled up; send an ACK message to empty it. The
+	 * RPC in the message header will also be considered ACKed.
+	 */
+	memcpy(ack.acks, peer->acks, sizeof(peer->acks));
+	ack.num_acks = htons(peer->num_acks);
+	peer->num_acks = 0;
+	homa_peer_unlock(peer);
+	homa_xmit_control(ACK, &ack, sizeof(ack), rpc);
+}
+
+/**
+ * homa_peer_get_acks() - Copy acks out of a peer, and remove them from the
+ * peer.
+ * @peer:    Peer to check for possible unacked RPCs.
+ * @count:   Maximum number of acks to return.
+ * @dst:     The acks are copied to this location.
+ *
+ * Return:   The number of acks extracted from the peer (<= count).
+ */
+int homa_peer_get_acks(struct homa_peer *peer, int count, struct homa_ack *dst)
+{
+	/* Don't waste time acquiring the lock if there are no ids available. */
+	if (peer->num_acks == 0)
+		return 0;
+
+	homa_peer_lock(peer);
+
+	if (count > peer->num_acks)
+		count = peer->num_acks;
+	memcpy(dst, &peer->acks[peer->num_acks - count],
+	       count * sizeof(peer->acks[0]));
+	peer->num_acks -= count;
+
+	homa_peer_unlock(peer);
+	return count;
+}
diff --git a/net/homa/homa_peer.h b/net/homa/homa_peer.h
new file mode 100644
index 000000000000..556aeda49656
--- /dev/null
+++ b/net/homa/homa_peer.h
@@ -0,0 +1,233 @@ 
+/* SPDX-License-Identifier: BSD-2-Clause */
+
+/* This file contains definitions related to managing peers (homa_peer
+ * and homa_peertab).
+ */
+
+#ifndef _HOMA_PEER_H
+#define _HOMA_PEER_H
+
+#include "homa_wire.h"
+#include "homa_sock.h"
+
+struct homa_rpc;
+
+/**
+ * struct homa_dead_dst - Used to retain dst_entries that are no longer
+ * needed, until it is safe to delete them (I'm not confident that the RCU
+ * mechanism will be safe for these: the reference count could get incremented
+ * after it's on the RCU list?).
+ */
+struct homa_dead_dst {
+	/** @dst: Entry that is no longer used by a struct homa_peer. */
+	struct dst_entry *dst;
+
+	/**
+	 * @gc_time: Time (in units of sched_clock()) when it is safe
+	 * to free @dst.
+	 */
+	__u64 gc_time;
+
+	/** @dst_links: Used to link together entries in peertab->dead_dsts. */
+	struct list_head dst_links;
+};
+
+/**
+ * define HOMA_PEERTAB_BUCKET_BITS - Number of bits in the bucket index for a
+ * homa_peertab.  Should be large enough to hold an entry for every server
+ * in a datacenter without long hash chains.
+ */
+#define HOMA_PEERTAB_BUCKET_BITS 16
+
+/** define HOME_PEERTAB_BUCKETS - Number of buckets in a homa_peertab. */
+#define HOMA_PEERTAB_BUCKETS BIT(HOMA_PEERTAB_BUCKET_BITS)
+
+/**
+ * struct homa_peertab - A hash table that maps from IPv6 addresses
+ * to homa_peer objects. IPv4 entries are encapsulated as IPv6 addresses.
+ * Entries are gradually added to this table, but they are never removed
+ * except when the entire table is deleted. We can't safely delete because
+ * results returned by homa_peer_find may be retained indefinitely.
+ *
+ * This table is managed exclusively by homa_peertab.c, using RCU to
+ * permit efficient lookups.
+ */
+struct homa_peertab {
+	/**
+	 * @write_lock: Synchronizes addition of new entries; not needed
+	 * for lookups (RCU is used instead).
+	 */
+	spinlock_t write_lock;
+
+	/**
+	 * @dead_dsts: List of dst_entries that are waiting to be deleted.
+	 * Hold @write_lock when manipulating.
+	 */
+	struct list_head dead_dsts;
+
+	/**
+	 * @buckets: Pointer to heads of chains of homa_peers for each bucket.
+	 * Malloc-ed, and must eventually be freed. NULL means this structure
+	 * has not been initialized.
+	 */
+	struct hlist_head *buckets;
+};
+
+/**
+ * struct homa_peer - One of these objects exists for each machine that we
+ * have communicated with (either as client or server).
+ */
+struct homa_peer {
+	/**
+	 * @addr: IPv6 address for the machine (IPv4 addresses are stored
+	 * as IPv4-mapped IPv6 addresses).
+	 */
+	struct in6_addr addr;
+
+	/** @flow: Addressing info needed to send packets. */
+	struct flowi flow;
+
+	/**
+	 * @dst: Used to route packets to this peer; we own a reference
+	 * to this, which we must eventually release.
+	 */
+	struct dst_entry *dst;
+
+	/**
+	 * @grantable_rpcs: Contains all homa_rpcs (both requests and
+	 * responses) involving this peer whose msgins require (or required
+	 * them in the past) and have not been fully received. The list is
+	 * sorted in priority order (head has fewest bytes_remaining).
+	 * Locked with homa->grantable_lock.
+	 */
+	struct list_head grantable_rpcs;
+
+	/**
+	 * @grantable_links: Used to link this peer into homa->grantable_peers.
+	 * If this RPC is not linked into homa->grantable_peers, this is an
+	 * empty list pointing to itself.
+	 */
+	struct list_head grantable_links;
+
+	/**
+	 * @peertab_links: Links this object into a bucket of its
+	 * homa_peertab.
+	 */
+	struct hlist_node peertab_links;
+
+	/**
+	 * @outstanding_resends: the number of resend requests we have
+	 * sent to this server (spaced @homa.resend_interval apart) since
+	 * we received a packet from this peer.
+	 */
+	int outstanding_resends;
+
+	/**
+	 * @most_recent_resend: @homa->timer_ticks when the most recent
+	 * resend was sent to this peer.
+	 */
+	int most_recent_resend;
+
+	/**
+	 * @least_recent_rpc: of all the RPCs for this peer scanned at
+	 * @current_ticks, this is the RPC whose @resend_timer_ticks
+	 * is farthest in the past.
+	 */
+	struct homa_rpc *least_recent_rpc;
+
+	/**
+	 * @least_recent_ticks: the @resend_timer_ticks value for
+	 * @least_recent_rpc.
+	 */
+	__u32 least_recent_ticks;
+
+	/**
+	 * @current_ticks: the value of @homa->timer_ticks the last time
+	 * that @least_recent_rpc and @least_recent_ticks were computed.
+	 * Used to detect the start of a new homa_timer pass.
+	 */
+	__u32 current_ticks;
+
+	/**
+	 * @resend_rpc: the value of @least_recent_rpc computed in the
+	 * previous homa_timer pass. This RPC will be issued a RESEND
+	 * in the current pass, if it still needs one.
+	 */
+	struct homa_rpc *resend_rpc;
+
+	/**
+	 * @num_acks: the number of (initial) entries in @acks that
+	 * currently hold valid information.
+	 */
+	int num_acks;
+
+	/**
+	 * @acks: info about client RPCs whose results have been completely
+	 * received.
+	 */
+	struct homa_ack acks[HOMA_MAX_ACKS_PER_PKT];
+
+	/**
+	 * @ack_lock: used to synchronize access to @num_acks and @acks.
+	 */
+	spinlock_t ack_lock;
+};
+
+void     homa_dst_refresh(struct homa_peertab *peertab,
+			  struct homa_peer *peer, struct homa_sock *hsk);
+void     homa_peertab_destroy(struct homa_peertab *peertab);
+struct homa_peer **
+		homa_peertab_get_peers(struct homa_peertab *peertab,
+				       int *num_peers);
+int      homa_peertab_init(struct homa_peertab *peertab);
+void     homa_peer_add_ack(struct homa_rpc *rpc);
+struct homa_peer
+	       *homa_peer_find(struct homa_peertab *peertab,
+			       const struct in6_addr *addr,
+			       struct inet_sock *inet);
+int      homa_peer_get_acks(struct homa_peer *peer, int count,
+			    struct homa_ack *dst);
+struct dst_entry
+	       *homa_peer_get_dst(struct homa_peer *peer,
+				  struct inet_sock *inet);
+void     homa_peer_lock_slow(struct homa_peer *peer);
+void     homa_peertab_gc_dsts(struct homa_peertab *peertab, __u64 now);
+
+/**
+ * homa_peer_lock() - Acquire the lock for a peer's @unacked_lock. If the lock
+ * isn't immediately available, record stats on the waiting time.
+ * @peer:    Peer to lock.
+ */
+static inline void homa_peer_lock(struct homa_peer *peer)
+	__acquires(&peer->ack_lock)
+{
+	if (!spin_trylock_bh(&peer->ack_lock))
+		homa_peer_lock_slow(peer);
+}
+
+/**
+ * homa_peer_unlock() - Release the lock for a peer's @unacked_lock.
+ * @peer:   Peer to lock.
+ */
+static inline void homa_peer_unlock(struct homa_peer *peer)
+	__releases(&peer->ack_lock)
+{
+	spin_unlock_bh(&peer->ack_lock);
+}
+
+/**
+ * homa_get_dst() - Returns destination information associated with a peer,
+ * updating it if the cached information is stale.
+ * @peer:   Peer whose destination information is desired.
+ * @hsk:    Homa socket; needed by lower-level code to recreate the dst.
+ * Return:   Up-to-date destination for peer.
+ */
+static inline struct dst_entry *homa_get_dst(struct homa_peer *peer,
+					     struct homa_sock *hsk)
+{
+	if (unlikely(peer->dst->obsolete > 0))
+		homa_dst_refresh(hsk->homa->peers, peer, hsk);
+	return peer->dst;
+}
+
+#endif /* _HOMA_PEER_H */