diff mbox series

gve: Add retry logic for recoverable adminq errors

Message ID 20240628185446.262191-1-rushilg@google.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series gve: Add retry logic for recoverable adminq errors | expand

Checks

Context Check Description
netdev/series_format warning Single patches do not need cover letters; Target tree name not specified in the subject
netdev/tree_selection success Guessed tree name to be net-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 839 this patch: 839
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers success CCed 10 of 10 maintainers
netdev/build_clang success Errors and warnings before: 846 this patch: 846
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 846 this patch: 846
netdev/checkpatch warning WARNING: line length of 81 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: line length of 87 exceeds 80 columns WARNING: line length of 88 exceeds 80 columns WARNING: line length of 92 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Rushil Gupta June 28, 2024, 6:54 p.m. UTC
From: Jeroen de Borst <jeroendb@google.com>

An adminq command is retried if it fails with an ETIME error code
which translates to the deadline exceeded error for the device.
The create and destroy adminq commands are now managed via a common
method. This method keeps track of return codes for each queue and retries
the commands for the queues that failed with ETIME.
Other adminq commands that do not require queue level granularity are
simply retried in gve_adminq_execute_cmd.

Signed-off-by: Rushil Gupta <rushilg@google.com>
Signed-off-by: Jeroen de Borst <jeroendb@google.com>
Reviewed-by: Shailend Chand <shailend@google.com>
Reviewed-by: Ziwei Xiao <ziweixiao@google.com>
---
 drivers/net/ethernet/google/gve/gve_adminq.c | 197 ++++++++++++-------
 drivers/net/ethernet/google/gve/gve_adminq.h |   5 +
 2 files changed, 129 insertions(+), 73 deletions(-)
diff mbox series

Patch

diff --git a/drivers/net/ethernet/google/gve/gve_adminq.c b/drivers/net/ethernet/google/gve/gve_adminq.c
index c5bbc1b7524e..74c61b90ea45 100644
--- a/drivers/net/ethernet/google/gve/gve_adminq.c
+++ b/drivers/net/ethernet/google/gve/gve_adminq.c
@@ -12,7 +12,7 @@ 
 
 #define GVE_MAX_ADMINQ_RELEASE_CHECK	500
 #define GVE_ADMINQ_SLEEP_LEN		20
-#define GVE_MAX_ADMINQ_EVENT_COUNTER_CHECK	100
+#define GVE_MAX_ADMINQ_EVENT_COUNTER_CHECK	1000
 
 #define GVE_DEVICE_OPTION_ERROR_FMT "%s option error:\n" \
 "Expected: length=%d, feature_mask=%x.\n" \
@@ -415,14 +415,17 @@  static int gve_adminq_parse_err(struct gve_priv *priv, u32 status)
 /* Flushes all AQ commands currently queued and waits for them to complete.
  * If there are failures, it will return the first error.
  */
-static int gve_adminq_kick_and_wait(struct gve_priv *priv)
+static int gve_adminq_kick_and_wait(struct gve_priv *priv, int ret_cnt, int *ret_codes)
 {
 	int tail, head;
-	int i;
+	int i, j;
 
 	tail = ioread32be(&priv->reg_bar0->adminq_event_counter);
 	head = priv->adminq_prod_cnt;
 
+	if ((head - tail) > ret_cnt)
+		return -EINVAL;
+
 	gve_adminq_kick_cmd(priv, head);
 	if (!gve_adminq_wait_for_cmd(priv, head)) {
 		dev_err(&priv->pdev->dev, "AQ commands timed out, need to reset AQ\n");
@@ -430,16 +433,13 @@  static int gve_adminq_kick_and_wait(struct gve_priv *priv)
 		return -ENOTRECOVERABLE;
 	}
 
-	for (i = tail; i < head; i++) {
+	for (i = tail, j = 0; i < head; i++, j++) {
 		union gve_adminq_command *cmd;
-		u32 status, err;
+		u32 status;
 
 		cmd = &priv->adminq[i & priv->adminq_mask];
 		status = be32_to_cpu(READ_ONCE(cmd->status));
-		err = gve_adminq_parse_err(priv, status);
-		if (err)
-			// Return the first error if we failed.
-			return err;
+		ret_codes[j] = gve_adminq_parse_err(priv, status);
 	}
 
 	return 0;
@@ -458,24 +458,8 @@  static int gve_adminq_issue_cmd(struct gve_priv *priv,
 	tail = ioread32be(&priv->reg_bar0->adminq_event_counter);
 
 	// Check if next command will overflow the buffer.
-	if (((priv->adminq_prod_cnt + 1) & priv->adminq_mask) ==
-	    (tail & priv->adminq_mask)) {
-		int err;
-
-		// Flush existing commands to make room.
-		err = gve_adminq_kick_and_wait(priv);
-		if (err)
-			return err;
-
-		// Retry.
-		tail = ioread32be(&priv->reg_bar0->adminq_event_counter);
-		if (((priv->adminq_prod_cnt + 1) & priv->adminq_mask) ==
-		    (tail & priv->adminq_mask)) {
-			// This should never happen. We just flushed the
-			// command queue so there should be enough space.
-			return -ENOMEM;
-		}
-	}
+	if ((priv->adminq_prod_cnt - tail) > priv->adminq_mask)
+		return -ENOMEM;
 
 	cmd = &priv->adminq[priv->adminq_prod_cnt & priv->adminq_mask];
 	priv->adminq_prod_cnt++;
@@ -544,8 +528,9 @@  static int gve_adminq_issue_cmd(struct gve_priv *priv,
 static int gve_adminq_execute_cmd(struct gve_priv *priv,
 				  union gve_adminq_command *cmd_orig)
 {
+	int retry_cnt = 0;
 	u32 tail, head;
-	int err;
+	int err, ret;
 
 	mutex_lock(&priv->adminq_lock);
 	tail = ioread32be(&priv->reg_bar0->adminq_event_counter);
@@ -555,15 +540,21 @@  static int gve_adminq_execute_cmd(struct gve_priv *priv,
 		goto out;
 	}
 
-	err = gve_adminq_issue_cmd(priv, cmd_orig);
-	if (err)
-		goto out;
+	do {
+		err = gve_adminq_issue_cmd(priv, cmd_orig);
+		if (err)
+			goto out;
 
-	err = gve_adminq_kick_and_wait(priv);
+		err = gve_adminq_kick_and_wait(priv, 1, &ret);
+		if (err)
+			goto out;
+	} while (ret == -ETIME && ++retry_cnt < GVE_ADMINQ_RETRY_COUNT);
 
 out:
 	mutex_unlock(&priv->adminq_lock);
-	return err;
+	if (err)
+		return err;
+	return ret;
 }
 
 static int gve_adminq_execute_extended_cmd(struct gve_priv *priv, u32 opcode,
@@ -638,6 +629,98 @@  int gve_adminq_deconfigure_device_resources(struct gve_priv *priv)
 	return gve_adminq_execute_cmd(priv, &cmd);
 }
 
+typedef int (gve_adminq_queue_cmd) (struct gve_priv *priv, u32 queue_index);
+
+static int gve_adminq_manage_queues(struct gve_priv *priv,
+				    gve_adminq_queue_cmd *cmd,
+				    u32 start_id, u32 num_queues)
+{
+	u32 cmd_idx, queue_idx, ret_code_idx;
+	int queue_done = -1;
+	int *queues_waiting;
+	int retry_cnt = 0;
+	int retry_needed;
+	int *ret_codes;
+	int *commands;
+	int err;
+	int ret;
+
+	queues_waiting = kvcalloc(num_queues, sizeof(int), GFP_KERNEL);
+	if (!queues_waiting)
+		return -ENOMEM;
+	ret_codes = kvcalloc(num_queues, sizeof(int), GFP_KERNEL);
+	if (!ret_codes) {
+		err = -ENOMEM;
+		goto free_with_queues_waiting;
+	}
+	commands = kvcalloc(num_queues, sizeof(int), GFP_KERNEL);
+	if (!commands) {
+		err = -ENOMEM;
+		goto free_with_ret_codes;
+	}
+
+	for (queue_idx = 0; queue_idx < num_queues; queue_idx++)
+		queues_waiting[queue_idx] = start_id + queue_idx;
+	do {
+		retry_needed = 0;
+		queue_idx = 0;
+		while (queue_idx < num_queues) {
+			cmd_idx = 0;
+			while (queue_idx < num_queues) {
+				if (queues_waiting[queue_idx] != queue_done) {
+					err = cmd(priv, queues_waiting[queue_idx]);
+					if (err == -ENOMEM)
+						break;
+					if (err)
+						goto free_with_commands;
+					commands[cmd_idx++] = queue_idx;
+				}
+				queue_idx++;
+			}
+
+			if (queue_idx < num_queues)
+				dev_dbg(&priv->pdev->dev,
+					"Issued %d of %d batched commands\n",
+					queue_idx, num_queues);
+
+			err = gve_adminq_kick_and_wait(priv, cmd_idx, ret_codes);
+			if (err)
+				goto free_with_commands;
+
+			for (ret_code_idx = 0; ret_code_idx < cmd_idx; ret_code_idx++) {
+				if (ret_codes[ret_code_idx] == 0) {
+					queues_waiting[commands[ret_code_idx]] = queue_done;
+				} else if (ret_codes[ret_code_idx] != -ETIME) {
+					ret = ret_codes[ret_code_idx];
+					goto free_with_commands;
+				} else {
+					retry_needed++;
+				}
+			}
+
+			if (retry_needed)
+				dev_dbg(&priv->pdev->dev,
+					"Issued %d batched commands, %d needed a retry\n",
+					cmd_idx, retry_needed);
+		}
+	} while (retry_needed && ++retry_cnt < GVE_ADMINQ_RETRY_COUNT);
+
+	ret = retry_needed ? -ETIME : 0;
+
+free_with_commands:
+	kvfree(commands);
+	commands = NULL;
+free_with_ret_codes:
+	kvfree(ret_codes);
+	ret_codes = NULL;
+free_with_queues_waiting:
+	kvfree(queues_waiting);
+	queues_waiting = NULL;
+	if (err)
+		return err;
+	return ret;
+}
+
 static int gve_adminq_create_tx_queue(struct gve_priv *priv, u32 queue_index)
 {
 	struct gve_tx_ring *tx = &priv->tx[queue_index];
@@ -678,16 +761,8 @@  static int gve_adminq_create_tx_queue(struct gve_priv *priv, u32 queue_index)
 
 int gve_adminq_create_tx_queues(struct gve_priv *priv, u32 start_id, u32 num_queues)
 {
-	int err;
-	int i;
-
-	for (i = start_id; i < start_id + num_queues; i++) {
-		err = gve_adminq_create_tx_queue(priv, i);
-		if (err)
-			return err;
-	}
-
-	return gve_adminq_kick_and_wait(priv);
+	return gve_adminq_manage_queues(priv, &gve_adminq_create_tx_queue,
+					start_id, num_queues);
 }
 
 static void gve_adminq_get_create_rx_queue_cmd(struct gve_priv *priv,
@@ -759,16 +834,8 @@  int gve_adminq_create_single_rx_queue(struct gve_priv *priv, u32 queue_index)
 
 int gve_adminq_create_rx_queues(struct gve_priv *priv, u32 num_queues)
 {
-	int err;
-	int i;
-
-	for (i = 0; i < num_queues; i++) {
-		err = gve_adminq_create_rx_queue(priv, i);
-		if (err)
-			return err;
-	}
-
-	return gve_adminq_kick_and_wait(priv);
+	return gve_adminq_manage_queues(priv, &gve_adminq_create_rx_queue,
+					0, num_queues);
 }
 
 static int gve_adminq_destroy_tx_queue(struct gve_priv *priv, u32 queue_index)
@@ -791,16 +858,8 @@  static int gve_adminq_destroy_tx_queue(struct gve_priv *priv, u32 queue_index)
 
 int gve_adminq_destroy_tx_queues(struct gve_priv *priv, u32 start_id, u32 num_queues)
 {
-	int err;
-	int i;
-
-	for (i = start_id; i < start_id + num_queues; i++) {
-		err = gve_adminq_destroy_tx_queue(priv, i);
-		if (err)
-			return err;
-	}
-
-	return gve_adminq_kick_and_wait(priv);
+	return gve_adminq_manage_queues(priv, &gve_adminq_destroy_tx_queue,
+					start_id, num_queues);
 }
 
 static void gve_adminq_make_destroy_rx_queue_cmd(union gve_adminq_command *cmd,
@@ -832,16 +891,8 @@  int gve_adminq_destroy_single_rx_queue(struct gve_priv *priv, u32 queue_index)
 
 int gve_adminq_destroy_rx_queues(struct gve_priv *priv, u32 num_queues)
 {
-	int err;
-	int i;
-
-	for (i = 0; i < num_queues; i++) {
-		err = gve_adminq_destroy_rx_queue(priv, i);
-		if (err)
-			return err;
-	}
-
-	return gve_adminq_kick_and_wait(priv);
+	return gve_adminq_manage_queues(priv, &gve_adminq_destroy_rx_queue,
+					0, num_queues);
 }
 
 static void gve_set_default_desc_cnt(struct gve_priv *priv,
diff --git a/drivers/net/ethernet/google/gve/gve_adminq.h b/drivers/net/ethernet/google/gve/gve_adminq.h
index ed1370c9b197..96e98f65273c 100644
--- a/drivers/net/ethernet/google/gve/gve_adminq.h
+++ b/drivers/net/ethernet/google/gve/gve_adminq.h
@@ -62,6 +62,11 @@  enum gve_adminq_statuses {
 	GVE_ADMINQ_COMMAND_ERROR_UNKNOWN_ERROR		= 0xFFFFFFFF,
 };
 
+/* AdminQ commands (that aren't batched) will be retried if they encounter
+ * an recoverable error.
+ */
+#define GVE_ADMINQ_RETRY_COUNT 3
+
 #define GVE_ADMINQ_DEVICE_DESCRIPTOR_VERSION 1
 
 /* All AdminQ command structs should be naturally packed. The static_assert