Message ID | 1724149347-14430-1-git-send-email-schakrabarti@linux.microsoft.com (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | [net] net: mana: Fix error handling in mana_create_txq/rxq's NAPI cleanup | expand |
On Tue, Aug 20, 2024 at 03:22:27AM -0700, Souradeep Chakrabarti wrote: > Currently napi_disable() gets called during rxq and txq cleanup, > even before napi is enabled and hrtimer is initialized. It causes > kernel panic. > > ? page_fault_oops+0x136/0x2b0 > ? page_counter_cancel+0x2e/0x80 > ? do_user_addr_fault+0x2f2/0x640 > ? refill_obj_stock+0xc4/0x110 > ? exc_page_fault+0x71/0x160 > ? asm_exc_page_fault+0x27/0x30 > ? __mmdrop+0x10/0x180 > ? __mmdrop+0xec/0x180 > ? hrtimer_active+0xd/0x50 > hrtimer_try_to_cancel+0x2c/0xf0 > hrtimer_cancel+0x15/0x30 > napi_disable+0x65/0x90 > mana_destroy_rxq+0x4c/0x2f0 > mana_create_rxq.isra.0+0x56c/0x6d0 > ? mana_uncfg_vport+0x50/0x50 > mana_alloc_queues+0x21b/0x320 > ? skb_dequeue+0x5f/0x80 > > Fixes: e1b5683ff62e ("net: mana: Move NAPI from EQ to CQ") > > Signed-off-by: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com> > --- > drivers/net/ethernet/microsoft/mana/mana_en.c | 41 +++++++++++++------ > 1 file changed, 28 insertions(+), 13 deletions(-) > > diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c > index 39f56973746d..882b05e087b9 100644 > --- a/drivers/net/ethernet/microsoft/mana/mana_en.c > +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c > @@ -1862,7 +1862,7 @@ static void mana_deinit_txq(struct mana_port_context *apc, struct mana_txq *txq) > mana_gd_destroy_queue(gd->gdma_context, txq->gdma_sq); > } > > -static void mana_destroy_txq(struct mana_port_context *apc) > +static void mana_cleanup_napi_txq(struct mana_port_context *apc) > { > struct napi_struct *napi; > int i; > @@ -1875,7 +1875,17 @@ static void mana_destroy_txq(struct mana_port_context *apc) > napi_synchronize(napi); > napi_disable(napi); > netif_napi_del(napi); > + } > +} > + > +static void mana_destroy_txq(struct mana_port_context *apc) > +{ > + int i; > + > + if (!apc->tx_qp) > + return; > > + for (i = 0; i < apc->num_queues; i++) { > mana_destroy_wq_obj(apc, GDMA_SQ, 
apc->tx_qp[i].tx_object); > > mana_deinit_cq(apc, &apc->tx_qp[i].tx_cq); > @@ -2007,6 +2017,21 @@ static int mana_create_txq(struct mana_port_context *apc, > return err; > } I think the mana_cleanup_napi_txq() call should also be added in the out path of mana_create_txq(). Consider this: NAPI enable succeeds for the first few TX queues, but if queue creation then fails for a later SQ, we don't clean up the NAPIs of the previously successful ones. > > +static void mana_cleanup_napi_rxq(struct mana_port_context *apc, > + struct mana_rxq *rxq, bool validate_state) > +{ > + struct napi_struct *napi; > + > + if (!rxq) > + return; > + > + napi = &rxq->rx_cq.napi; > + if (validate_state) > + napi_synchronize(napi); > + napi_disable(napi); > + netif_napi_del(napi); > +} > + > static void mana_destroy_rxq(struct mana_port_context *apc, > struct mana_rxq *rxq, bool validate_state) > > @@ -2014,24 +2039,14 @@ static void mana_destroy_rxq(struct mana_port_context *apc, > struct gdma_context *gc = apc->ac->gdma_dev->gdma_context; > struct mana_recv_buf_oob *rx_oob; > struct device *dev = gc->dev; > - struct napi_struct *napi; > struct page *page; > int i; > > if (!rxq) > return; > > - napi = &rxq->rx_cq.napi; > - > - if (validate_state) > - napi_synchronize(napi); > - > - napi_disable(napi); > - > xdp_rxq_info_unreg(&rxq->xdp_rxq); > > - netif_napi_del(napi); > - > mana_destroy_wq_obj(apc, GDMA_RQ, rxq->rxobj); > > mana_deinit_cq(apc, &rxq->rx_cq); > @@ -2336,11 +2351,11 @@ static void mana_destroy_vport(struct mana_port_context *apc) > rxq = apc->rxqs[rxq_idx]; > if (!rxq) > continue; > - > + mana_cleanup_napi_rxq(apc, rxq, true); > mana_destroy_rxq(apc, rxq, true); > apc->rxqs[rxq_idx] = NULL; > } > - > + mana_cleanup_napi_txq(apc); > mana_destroy_txq(apc); > mana_uncfg_vport(apc); > > -- > 2.34.1 > >
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index 39f56973746d..882b05e087b9 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -1862,7 +1862,7 @@ static void mana_deinit_txq(struct mana_port_context *apc, struct mana_txq *txq) mana_gd_destroy_queue(gd->gdma_context, txq->gdma_sq); } -static void mana_destroy_txq(struct mana_port_context *apc) +static void mana_cleanup_napi_txq(struct mana_port_context *apc) { struct napi_struct *napi; int i; @@ -1875,7 +1875,17 @@ static void mana_destroy_txq(struct mana_port_context *apc) napi_synchronize(napi); napi_disable(napi); netif_napi_del(napi); + } +} + +static void mana_destroy_txq(struct mana_port_context *apc) +{ + int i; + + if (!apc->tx_qp) + return; + for (i = 0; i < apc->num_queues; i++) { mana_destroy_wq_obj(apc, GDMA_SQ, apc->tx_qp[i].tx_object); mana_deinit_cq(apc, &apc->tx_qp[i].tx_cq); @@ -2007,6 +2017,21 @@ static int mana_create_txq(struct mana_port_context *apc, return err; } +static void mana_cleanup_napi_rxq(struct mana_port_context *apc, + struct mana_rxq *rxq, bool validate_state) +{ + struct napi_struct *napi; + + if (!rxq) + return; + + napi = &rxq->rx_cq.napi; + if (validate_state) + napi_synchronize(napi); + napi_disable(napi); + netif_napi_del(napi); +} + static void mana_destroy_rxq(struct mana_port_context *apc, struct mana_rxq *rxq, bool validate_state) @@ -2014,24 +2039,14 @@ static void mana_destroy_rxq(struct mana_port_context *apc, struct gdma_context *gc = apc->ac->gdma_dev->gdma_context; struct mana_recv_buf_oob *rx_oob; struct device *dev = gc->dev; - struct napi_struct *napi; struct page *page; int i; if (!rxq) return; - napi = &rxq->rx_cq.napi; - - if (validate_state) - napi_synchronize(napi); - - napi_disable(napi); - xdp_rxq_info_unreg(&rxq->xdp_rxq); - netif_napi_del(napi); - mana_destroy_wq_obj(apc, GDMA_RQ, rxq->rxobj); mana_deinit_cq(apc, &rxq->rx_cq); @@ 
-2336,11 +2351,11 @@ static void mana_destroy_vport(struct mana_port_context *apc) rxq = apc->rxqs[rxq_idx]; if (!rxq) continue; - + mana_cleanup_napi_rxq(apc, rxq, true); mana_destroy_rxq(apc, rxq, true); apc->rxqs[rxq_idx] = NULL; } - + mana_cleanup_napi_txq(apc); mana_destroy_txq(apc); mana_uncfg_vport(apc);
Currently napi_disable() gets called during rxq and txq cleanup, even before NAPI is enabled and the hrtimer is initialized. This causes a kernel panic. ? page_fault_oops+0x136/0x2b0 ? page_counter_cancel+0x2e/0x80 ? do_user_addr_fault+0x2f2/0x640 ? refill_obj_stock+0xc4/0x110 ? exc_page_fault+0x71/0x160 ? asm_exc_page_fault+0x27/0x30 ? __mmdrop+0x10/0x180 ? __mmdrop+0xec/0x180 ? hrtimer_active+0xd/0x50 hrtimer_try_to_cancel+0x2c/0xf0 hrtimer_cancel+0x15/0x30 napi_disable+0x65/0x90 mana_destroy_rxq+0x4c/0x2f0 mana_create_rxq.isra.0+0x56c/0x6d0 ? mana_uncfg_vport+0x50/0x50 mana_alloc_queues+0x21b/0x320 ? skb_dequeue+0x5f/0x80 Fixes: e1b5683ff62e ("net: mana: Move NAPI from EQ to CQ") Signed-off-by: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com> --- drivers/net/ethernet/microsoft/mana/mana_en.c | 41 +++++++++++++------ 1 file changed, 28 insertions(+), 13 deletions(-)