gve: Add retry logic for recoverable adminq errors

Message ID	20240628185446.262191-1-rushilg@google.com (mailing list archive)
State	Superseded
Delegated to:	Netdev Maintainers
Headers	show Received: from mail-yw1-f201.google.com (mail-yw1-f201.google.com [209.85.128.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id BC4E658210 for <netdev@vger.kernel.org>; Fri, 28 Jun 2024 18:54:51 +0000 (UTC) Date: Fri, 28 Jun 2024 18:54:46 +0000 Precedence: bulk Mime-Version: 1.0 Message-ID: <20240628185446.262191-1-rushilg@google.com> Subject: [PATCH] gve: Add retry logic for recoverable adminq errors From: Rushil Gupta <rushilg@google.com> To: netdev@vger.kernel.org Cc: jeroendb@google.com, pkaligineedi@google.com, davem@davemloft.net, kuba@kernel.org, edumazet@google.com, pabeni@redhat.com, willemb@google.com, hramamurthy@google.com, Rushil Gupta <rushilg@google.com>, Shailend Chand <shailend@google.com>, Ziwei Xiao <ziweixiao@google.com> Content-Type: text/plain; charset="UTF-8"
Series	gve: Add retry logic for recoverable adminq errors \| expand gve: Add retry logic for recoverable adminq errors

Context	Check	Description
netdev/series_format	warning	Single patches do not need cover letters; Target tree name not specified in the subject
netdev/tree_selection	success	Guessed tree name to be net-next
netdev/ynl	success	Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present	success	Fixes tag not required for -next series
netdev/header_inline	success	No static functions without inline keyword in header files
netdev/build_32bit	success	Errors and warnings before: 839 this patch: 839
netdev/build_tools	success	No tools touched, skip
netdev/cc_maintainers	success	CCed 10 of 10 maintainers
netdev/build_clang	success	Errors and warnings before: 846 this patch: 846
netdev/verify_signedoff	success	Signed-off-by tag matches author and committer
netdev/deprecated_api	success	None detected
netdev/check_selftest	success	No net selftest shell script
netdev/verify_fixes	success	No Fixes tag
netdev/build_allmodconfig_warn	success	Errors and warnings before: 846 this patch: 846
netdev/checkpatch	warning	WARNING: line length of 81 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: line length of 87 exceeds 80 columns WARNING: line length of 88 exceeds 80 columns WARNING: line length of 92 exceeds 80 columns
netdev/build_clang_rust	success	No Rust files in patch. Skipping build
netdev/kdoc	success	Errors and warnings before: 0 this patch: 0
netdev/source_inline	success	Was 0 now: 0

diff --git a/drivers/net/ethernet/google/gve/gve_adminq.c b/drivers/net/ethernet/google/gve/gve_adminq.c index c5bbc1b7524e..74c61b90ea45 100644 --- a/drivers/net/ethernet/google/gve/gve_adminq.c +++ b/drivers/net/ethernet/google/gve/gve_adminq.c @@ -12,7 +12,7 @@ #define GVE_MAX_ADMINQ_RELEASE_CHECK 500 #define GVE_ADMINQ_SLEEP_LEN 20 -#define GVE_MAX_ADMINQ_EVENT_COUNTER_CHECK 100 +#define GVE_MAX_ADMINQ_EVENT_COUNTER_CHECK 1000 #define GVE_DEVICE_OPTION_ERROR_FMT "%s option error:\n" \ "Expected: length=%d, feature_mask=%x.\n" \ @@ -415,14 +415,17 @@ static int gve_adminq_parse_err(struct gve_priv *priv, u32 status) /* Flushes all AQ commands currently queued and waits for them to complete. * If there are failures, it will return the first error. */ -static int gve_adminq_kick_and_wait(struct gve_priv *priv) +static int gve_adminq_kick_and_wait(struct gve_priv *priv, int ret_cnt, int *ret_codes) { int tail, head; - int i; + int i, j; tail = ioread32be(&priv->reg_bar0->adminq_event_counter); head = priv->adminq_prod_cnt; + if ((head - tail) > ret_cnt) + return -EINVAL; + gve_adminq_kick_cmd(priv, head); if (!gve_adminq_wait_for_cmd(priv, head)) { dev_err(&priv->pdev->dev, "AQ commands timed out, need to reset AQ\n"); @@ -430,16 +433,13 @@ static int gve_adminq_kick_and_wait(struct gve_priv *priv) return -ENOTRECOVERABLE; } - for (i = tail; i < head; i++) { + for (i = tail, j = 0; i < head; i++, j++) { union gve_adminq_command *cmd; - u32 status, err; + u32 status; cmd = &priv->adminq[i & priv->adminq_mask]; status = be32_to_cpu(READ_ONCE(cmd->status)); - err = gve_adminq_parse_err(priv, status); - if (err) - // Return the first error if we failed. - return err; + ret_codes[j] = gve_adminq_parse_err(priv, status); } return 0; @@ -458,24 +458,8 @@ static int gve_adminq_issue_cmd(struct gve_priv *priv, tail = ioread32be(&priv->reg_bar0->adminq_event_counter); // Check if next command will overflow the buffer. - if (((priv->adminq_prod_cnt + 1) & priv->adminq_mask) == - (tail & priv->adminq_mask)) { - int err; - - // Flush existing commands to make room. - err = gve_adminq_kick_and_wait(priv); - if (err) - return err; - - // Retry. - tail = ioread32be(&priv->reg_bar0->adminq_event_counter); - if (((priv->adminq_prod_cnt + 1) & priv->adminq_mask) == - (tail & priv->adminq_mask)) { - // This should never happen. We just flushed the - // command queue so there should be enough space. - return -ENOMEM; - } - } + if ((priv->adminq_prod_cnt - tail) > priv->adminq_mask) + return -ENOMEM; cmd = &priv->adminq[priv->adminq_prod_cnt & priv->adminq_mask]; priv->adminq_prod_cnt++; @@ -544,8 +528,9 @@ static int gve_adminq_issue_cmd(struct gve_priv *priv, static int gve_adminq_execute_cmd(struct gve_priv *priv, union gve_adminq_command *cmd_orig) { + int retry_cnt = 0; u32 tail, head; - int err; + int err, ret; mutex_lock(&priv->adminq_lock); tail = ioread32be(&priv->reg_bar0->adminq_event_counter); @@ -555,15 +540,21 @@ static int gve_adminq_execute_cmd(struct gve_priv *priv, goto out; } - err = gve_adminq_issue_cmd(priv, cmd_orig); - if (err) - goto out; + do { + err = gve_adminq_issue_cmd(priv, cmd_orig); + if (err) + goto out; - err = gve_adminq_kick_and_wait(priv); + err = gve_adminq_kick_and_wait(priv, 1, &ret); + if (err) + goto out; + } while (ret == -ETIME && ++retry_cnt < GVE_ADMINQ_RETRY_COUNT); out: mutex_unlock(&priv->adminq_lock); - return err; + if (err) + return err; + return ret; } static int gve_adminq_execute_extended_cmd(struct gve_priv *priv, u32 opcode, @@ -638,6 +629,98 @@ int gve_adminq_deconfigure_device_resources(struct gve_priv *priv) return gve_adminq_execute_cmd(priv, &cmd); } +typedef int (gve_adminq_queue_cmd) (struct gve_priv *priv, u32 queue_index); + +static int gve_adminq_manage_queues(struct gve_priv *priv, + gve_adminq_queue_cmd *cmd, + u32 start_id, u32 num_queues) +{ + u32 cmd_idx, queue_idx, ret_code_idx; + int queue_done = -1; + int *queues_waiting; + int retry_cnt = 0; + int retry_needed; + int *ret_codes; + int *commands; + int err; + int ret; + + queues_waiting = kvcalloc(num_queues, sizeof(int), GFP_KERNEL); + if (!queues_waiting) + return -ENOMEM; + ret_codes = kvcalloc(num_queues, sizeof(int), GFP_KERNEL); + if (!ret_codes) { + err = -ENOMEM; + goto free_with_queues_waiting; + } + commands = kvcalloc(num_queues, sizeof(int), GFP_KERNEL); + if (!commands) { + err = -ENOMEM; + goto free_with_ret_codes; + } + + for (queue_idx = 0; queue_idx < num_queues; queue_idx++) + queues_waiting[queue_idx] = start_id + queue_idx; + do { + retry_needed = 0; + queue_idx = 0; + while (queue_idx < num_queues) { + cmd_idx = 0; + while (queue_idx < num_queues) { + if (queues_waiting[queue_idx] != queue_done) { + err = cmd(priv, queues_waiting[queue_idx]); + if (err == -ENOMEM) + break; + if (err) + goto free_with_commands; + commands[cmd_idx++] = queue_idx; + } + queue_idx++; + } + + if (queue_idx < num_queues) + dev_dbg(&priv->pdev->dev, + "Issued %d of %d batched commands\n", + queue_idx, num_queues); + + err = gve_adminq_kick_and_wait(priv, cmd_idx, ret_codes); + if (err) + goto free_with_commands; + + for (ret_code_idx = 0; ret_code_idx < cmd_idx; ret_code_idx++) { + if (ret_codes[ret_code_idx] == 0) { + queues_waiting[commands[ret_code_idx]] = queue_done; + } else if (ret_codes[ret_code_idx] != -ETIME) { + ret = ret_codes[ret_code_idx]; + goto free_with_commands; + } else { + retry_needed++; + } + } + + if (retry_needed) + dev_dbg(&priv->pdev->dev, + "Issued %d batched commands, %d needed a retry\n", + cmd_idx, retry_needed); + } + } while (retry_needed && ++retry_cnt < GVE_ADMINQ_RETRY_COUNT); + + ret = retry_needed ? -ETIME : 0; + +free_with_commands: + kvfree(commands); + commands = NULL; +free_with_ret_codes: + kvfree(ret_codes); + ret_codes = NULL; +free_with_queues_waiting: + kvfree(queues_waiting); + queues_waiting = NULL; + if (err) + return err; + return ret; +} + static int gve_adminq_create_tx_queue(struct gve_priv *priv, u32 queue_index) { struct gve_tx_ring *tx = &priv->tx[queue_index]; @@ -678,16 +761,8 @@ static int gve_adminq_create_tx_queue(struct gve_priv *priv, u32 queue_index) int gve_adminq_create_tx_queues(struct gve_priv *priv, u32 start_id, u32 num_queues) { - int err; - int i; - - for (i = start_id; i < start_id + num_queues; i++) { - err = gve_adminq_create_tx_queue(priv, i); - if (err) - return err; - } - - return gve_adminq_kick_and_wait(priv); + return gve_adminq_manage_queues(priv, &gve_adminq_create_tx_queue, + start_id, num_queues); } static void gve_adminq_get_create_rx_queue_cmd(struct gve_priv *priv, @@ -759,16 +834,8 @@ int gve_adminq_create_single_rx_queue(struct gve_priv *priv, u32 queue_index) int gve_adminq_create_rx_queues(struct gve_priv *priv, u32 num_queues) { - int err; - int i; - - for (i = 0; i < num_queues; i++) { - err = gve_adminq_create_rx_queue(priv, i); - if (err) - return err; - } - - return gve_adminq_kick_and_wait(priv); + return gve_adminq_manage_queues(priv, &gve_adminq_create_rx_queue, + 0, num_queues); } static int gve_adminq_destroy_tx_queue(struct gve_priv *priv, u32 queue_index) @@ -791,16 +858,8 @@ static int gve_adminq_destroy_tx_queue(struct gve_priv *priv, u32 queue_index) int gve_adminq_destroy_tx_queues(struct gve_priv *priv, u32 start_id, u32 num_queues) { - int err; - int i; - - for (i = start_id; i < start_id + num_queues; i++) { - err = gve_adminq_destroy_tx_queue(priv, i); - if (err) - return err; - } - - return gve_adminq_kick_and_wait(priv); + return gve_adminq_manage_queues(priv, &gve_adminq_destroy_tx_queue, + start_id, num_queues); } static void gve_adminq_make_destroy_rx_queue_cmd(union gve_adminq_command *cmd, @@ -832,16 +891,8 @@ int gve_adminq_destroy_single_rx_queue(struct gve_priv *priv, u32 queue_index) int gve_adminq_destroy_rx_queues(struct gve_priv *priv, u32 num_queues) { - int err; - int i; - - for (i = 0; i < num_queues; i++) { - err = gve_adminq_destroy_rx_queue(priv, i); - if (err) - return err; - } - - return gve_adminq_kick_and_wait(priv); + return gve_adminq_manage_queues(priv, &gve_adminq_destroy_rx_queue, + 0, num_queues); } static void gve_set_default_desc_cnt(struct gve_priv *priv, diff --git a/drivers/net/ethernet/google/gve/gve_adminq.h b/drivers/net/ethernet/google/gve/gve_adminq.h index ed1370c9b197..96e98f65273c 100644 --- a/drivers/net/ethernet/google/gve/gve_adminq.h +++ b/drivers/net/ethernet/google/gve/gve_adminq.h @@ -62,6 +62,11 @@ enum gve_adminq_statuses { GVE_ADMINQ_COMMAND_ERROR_UNKNOWN_ERROR = 0xFFFFFFFF, }; +/* AdminQ commands (that aren't batched) will be retried if they encounter + * an recoverable error. + */ +#define GVE_ADMINQ_RETRY_COUNT 3 + #define GVE_ADMINQ_DEVICE_DESCRIPTOR_VERSION 1 /* All AdminQ command structs should be naturally packed. The static_assert

gve: Add retry logic for recoverable adminq errors

Checks

Commit Message

Patch