diff mbox series

[for-next,1/1] RDMA-rxe: Allow retry sends for rdma read responses

Message ID 20230215224419.9195-2-rpearsonhpe@gmail.com (mailing list archive)
State Changes Requested
Delegated to: Jason Gunthorpe
Headers show
Series RDMA-rxe: Allow retry sends for rdma read responses | expand

Commit Message

Bob Pearson Feb. 15, 2023, 10:44 p.m. UTC
This patch modifies read_reply() in rxe_resp.c to retry the
send if rxe_xmit_packet() returns -EAGAIN. When the IP stack drops
a packet, it needs more time to recover than an immediate retry
allows, so a helper, read_retry_delay(), is added that dynamically
estimates the recovery time and inserts a delay before the retry.

Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
---
 drivers/infiniband/sw/rxe/rxe_resp.c  | 62 +++++++++++++++++++++++++--
 drivers/infiniband/sw/rxe/rxe_verbs.h |  9 ++++
 2 files changed, 68 insertions(+), 3 deletions(-)

Comments

Jason Gunthorpe April 12, 2023, 4:05 p.m. UTC | #1
On Wed, Feb 15, 2023 at 04:44:21PM -0600, Bob Pearson wrote:

> @@ -937,9 +987,15 @@ static enum resp_states read_reply(struct rxe_qp *qp,
>  	}
>  
>  	err = rxe_xmit_packet(qp, &ack_pkt, skb);
> -	if (err)
> +	delay = read_retry_delay(qp, err);
> +	if (err == -EAGAIN) {
> +		udelay(delay);

I'd be happier with this patch if it wasn't a udelay here; spinning
and hoping the IP stack progresses just feels wrong.

Can't this use a timer or something? Re-rx the packet generating the
reply?

Jason
diff mbox series

Patch

diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c
index cd2d88de287c..4e2fa2d72e70 100644
--- a/drivers/infiniband/sw/rxe/rxe_resp.c
+++ b/drivers/infiniband/sw/rxe/rxe_resp.c
@@ -863,6 +863,57 @@  static struct rxe_mr *rxe_recheck_mr(struct rxe_qp *qp, u32 rkey)
 	return mr;
 }
 
+/* Return the delay in microseconds to insert before retrying a dropped
+ * read reply (err == -EAGAIN). After every 128 drop bursts the delay is
+ * set to half the average accumulated delay per burst; it decays by 1
+ * each time more than 512 good packets are counted while delay > 1.
+ */
+static int read_retry_delay(struct rxe_qp *qp, int err)
+{
+	struct tune_read_drop *tune = &qp->resp.tune_read_drop;
+	u32 delay = tune->delay;
+	u32 num = tune->num_bursts;
+	u32 good = tune->num_good_pkts;
+	u32 burst = tune->burst_delay;
+	u32 tot = tune->total_delay;
+
+	if (err == -EAGAIN) {
+		burst += delay; /* dropped: extend the current burst */
+		good = 0;
+	} else if (burst) {
+		tot += burst; /* first non-drop result closes the burst */
+		burst = 0;
+		num++;
+	} else {
+		good++;
+	}
+
+	if (num >= (1 << 7)) {
+		delay = tot >> 8; /* (tot / 128) / 2: half the average */
+		tot = 0;
+		num = 0;
+		rxe_dbg_qp(qp, "delay = %u", delay); /* delay is u32 */
+	}
+
+	if (delay > 1 && good > 512) {
+		good = 0;
+		delay--;
+	}
+
+	/* keep delay at least 1: with delay == 0 a burst adds nothing,
+	 * so tot = burst = 0 -> delay = 0 and the estimate never recovers
+	 */
+	delay = delay ?: 1;
+
+	tune->delay = delay;
+	tune->num_bursts = num;
+	tune->num_good_pkts = good;
+	tune->burst_delay = burst;
+	tune->total_delay = tot;
+
+	return delay;
+}
+
 /* RDMA read response. If res is not NULL, then we have a current RDMA request
  * being processed or replayed.
  */
@@ -878,6 +929,7 @@  static enum resp_states read_reply(struct rxe_qp *qp,
 	int err;
 	struct resp_res *res = qp->resp.res;
 	struct rxe_mr *mr;
+	int delay;
 
 	if (!res) {
 		res = rxe_prepare_res(qp, req_pkt, RXE_READ_MASK);
@@ -909,8 +961,6 @@  static enum resp_states read_reply(struct rxe_qp *qp,
 			opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST;
 	}
 
-	res->state = rdatm_res_state_next;
-
 	payload = min_t(int, res->read.resid, mtu);
 
 	skb = prepare_ack_packet(qp, &ack_pkt, opcode, payload,
@@ -937,9 +987,15 @@  static enum resp_states read_reply(struct rxe_qp *qp,
 	}
 
 	err = rxe_xmit_packet(qp, &ack_pkt, skb);
-	if (err)
+	delay = read_retry_delay(qp, err);
+	if (err == -EAGAIN) {
+		udelay(delay);
+		return RESPST_READ_REPLY;
+	} else if (err) {
 		return RESPST_ERR_RNR;
+	}
 
+	res->state = rdatm_res_state_next;
 	res->read.va += payload;
 	res->read.resid -= payload;
 	res->cur_psn = (res->cur_psn + 1) & BTH_PSN_MASK;
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
index c269ae2a3224..84994a474e9a 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.h
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
@@ -203,6 +203,15 @@  struct rxe_resp_info {
 		struct ib_sge		sge[RXE_MAX_SGE];
 	} srq_wqe;
 
+	/* dynamic delay tuning for read reply drops */
+	struct tune_read_drop {
+		u32			total_delay; /* sum of closed bursts */
+		u32			burst_delay; /* delay in open burst */
+		u32			num_bursts; /* closed bursts counted */
+		u32			num_good_pkts; /* non-drop sends counted */
+		u32			delay; /* current retry delay, usec */
+	}			tune_read_drop;
+
 	/* Responder resources. It's a circular list where the oldest
 	 * resource is dropped first.
 	 */