@@ -863,6 +863,57 @@ static struct rxe_mr *rxe_recheck_mr(struct rxe_qp *qp, u32 rkey)
return mr;
}
+/* Compute the delay to insert before retrying sending a
+ * dropped read reply packet in microseconds. Compute as half
+ * the average burst delay over the last 128 delay bursts.
+ * Slowly decay the delay if many good packets are seen.
+ */
+static int read_retry_delay(struct rxe_qp *qp, int err)
+{
+	struct tune_read_drop *tune = &qp->resp.tune_read_drop;
+	u32 delay = tune->delay;
+	u32 num = tune->num_bursts;
+	u32 good = tune->num_good_pkts;
+	u32 burst = tune->burst_delay;
+	u32 tot = tune->total_delay;
+
+	if (err == -EAGAIN) {
+		burst += delay;		/* dropped pkt extends current burst */
+		good = 0;
+	} else if (burst) {
+		tot += burst;		/* good pkt after a burst closes it */
+		burst = 0;
+		num++;
+	} else {
+		good++;
+	}
+
+	if (num >= (1 << 7)) {
+		delay = tot >> 8;	/* (tot/128)/2: half avg burst delay */
+		tot = 0;
+		num = 0;
+		rxe_dbg_qp(qp, "delay = %u\n", delay);
+	}
+
+	if (delay > 1 && good > 512) {
+		good = 0;
+		delay--;		/* slowly decay after many good pkts */
+	}
+
+	/* make sure delay is at least 1 else algorithm breaks
+	 * with tot = burst = 0 -> delay = 0
+	 */
+	delay = delay ?: 1;
+
+	tune->delay = delay;
+	tune->num_bursts = num;
+	tune->num_good_pkts = good;
+	tune->burst_delay = burst;
+	tune->total_delay = tot;
+
+	return delay;
+}
+
/* RDMA read response. If res is not NULL, then we have a current RDMA request
* being processed or replayed.
*/
@@ -878,6 +929,7 @@ static enum resp_states read_reply(struct rxe_qp *qp,
int err;
struct resp_res *res = qp->resp.res;
struct rxe_mr *mr;
+ int delay;
if (!res) {
res = rxe_prepare_res(qp, req_pkt, RXE_READ_MASK);
@@ -909,8 +961,6 @@ static enum resp_states read_reply(struct rxe_qp *qp,
opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST;
}
- res->state = rdatm_res_state_next;
-
payload = min_t(int, res->read.resid, mtu);
skb = prepare_ack_packet(qp, &ack_pkt, opcode, payload,
@@ -937,9 +987,15 @@ static enum resp_states read_reply(struct rxe_qp *qp,
}
err = rxe_xmit_packet(qp, &ack_pkt, skb);
- if (err)
+ delay = read_retry_delay(qp, err);
+ if (err == -EAGAIN) {
+ udelay(delay);
+ return RESPST_READ_REPLY;
+ } else if (err) {
return RESPST_ERR_RNR;
+ }
+ res->state = rdatm_res_state_next;
res->read.va += payload;
res->read.resid -= payload;
res->cur_psn = (res->cur_psn + 1) & BTH_PSN_MASK;
@@ -203,6 +203,15 @@ struct rxe_resp_info {
struct ib_sge sge[RXE_MAX_SGE];
} srq_wqe;
+ /* dynamic delay tuning for read reply drops */
+ struct tune_read_drop {
+ u32 total_delay;
+ u32 burst_delay;
+ u32 num_bursts;
+ u32 num_good_pkts;
+ u32 delay;
+ } tune_read_drop;
+
/* Responder resources. It's a circular list where the oldest
* resource is dropped first.
*/
Modify read_reply() in rxe_resp.c to retry the send when rxe_xmit_packet() returns -EAGAIN. When IP drops a packet, it requires more time to recover than a simple retry takes, so add a subroutine, read_retry_delay(), that dynamically estimates the time required for this recovery and inserts a delay before the retry. Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com> --- drivers/infiniband/sw/rxe/rxe_resp.c | 62 +++++++++++++++++++++++++-- drivers/infiniband/sw/rxe/rxe_verbs.h | 9 ++++ 2 files changed, 68 insertions(+), 3 deletions(-)