diff mbox series

[v3,2/3] usb: xhci: Fix TD invalidation under pending Set TR Dequeue

Message ID 20241031104401.22a80b14@foxbook (mailing list archive)
State Superseded
Headers show
Series xhci: Fix Stop Endpoint problems | expand

Commit Message

Michał Pecio Oct. 31, 2024, 9:44 a.m. UTC
xhci_invalidate_cancelled_tds() may work incorrectly if the hardware
is modifying some Endpoint or Stream Context by executing a Set TR
Dequeue command at the same time. And even if it worked, it couldn't
queue another Set TR Dequeue if required, failing to clear xHC cache.

Yet this function is called by completion handlers of commands like Stop
Endpoint and Reset Endpoint without any testing for SET_DEQ_PENDING. The
problem remained undetected for long time because the UAS driver rarely
cancels URBs, but it does cancel them sometimes, e.g. on I/O errors.

Make it legal to call xhci_invalidate_cancelled_tds() under a pending
Set TR Dequeue. Bail out of the function when SET_DEQ_PENDING is set,
then make the completion handler call the function again and also call
xhci_giveback_invalidated_tds(), which typically must be called next.

Clean up some code which duplicates functionality of these functions.

Fixes: 5ceac4402f5d ("xhci: Handle TD clearing for multiple streams case")
CC: stable@vger.kernel.org
Signed-off-by: Michal Pecio <michal.pecio@gmail.com>
---
 drivers/usb/host/xhci-ring.c | 45 ++++++++++++------------------------
 1 file changed, 15 insertions(+), 30 deletions(-)

Comments

Mathias Nyman Oct. 31, 2024, 1:57 p.m. UTC | #1
On 31.10.2024 11.44, Michal Pecio wrote:
> xhci_invalidate_cancelled_tds() may work incorrectly if the hardware
> is modifying some Endpoint or Stream Context by executing a Set TR
> Dequeue command at the same time. And even if it worked, it couldn't
> queue another Set TR Dequeue if required, failing to clear xHC cache.
> 
> Yet this function is called by completion handlers of commands like Stop
> Endpoint and Reset Endpoint without any testing for SET_DEQ_PENDING. The
> problem remained undetected for long time because the UAS driver rarely
> cancels URBs, but it does cancel them sometimes, e.g. on I/O errors.
> 
> Make it legal to call xhci_invalidate_cancelled_tds() under a pending
> Set TR Dequeue. Bail out of the function when SET_DEQ_PENDING is set,
> then make the completion handler call the function again and also call
> xhci_giveback_invalidated_tds(), which typically must be called next.
> 
> Clean up some code which duplicates functionality of these functions.
> 
> Fixes: 5ceac4402f5d ("xhci: Handle TD clearing for multiple streams case")
> CC: stable@vger.kernel.org
> Signed-off-by: Michal Pecio <michal.pecio@gmail.com>
> ---
>   drivers/usb/host/xhci-ring.c | 45 ++++++++++++------------------------
>   1 file changed, 15 insertions(+), 30 deletions(-)
> 
> diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
> index 98e12267bbf6..52eb3ee1d7bf 100644
> --- a/drivers/usb/host/xhci-ring.c
> +++ b/drivers/usb/host/xhci-ring.c
> @@ -959,7 +959,7 @@ static int xhci_handle_halted_endpoint(struct xhci_hcd *xhci,
>    * We're also in the event handler, so we can't get re-interrupted if another
>    * Stop Endpoint command completes.
>    *
> - * only call this when ring is not in a running state
> + * only call this on a ring in Stopped state (we may need to queue Set TR Deq)
>    */
>   
>   static int xhci_invalidate_cancelled_tds(struct xhci_virt_ep *ep)
> @@ -973,6 +973,13 @@ static int xhci_invalidate_cancelled_tds(struct xhci_virt_ep *ep)
>   	unsigned int		slot_id = ep->vdev->slot_id;
>   	int			err;
>   
> +	/*
> +	 * This is not going to work if the hardware is changing its dequeue
> +	 * pointers as we look at them. Completion handler will call us later.
> +	 */
> +	if (ep->ep_state & SET_DEQ_PENDING)
> +		return 0;
> +
>   	xhci = ep->xhci;
>   
>   	list_for_each_entry_safe(td, tmp_td, &ep->cancelled_td_list, cancelled_td_list) {
> @@ -1001,7 +1008,9 @@ static int xhci_invalidate_cancelled_tds(struct xhci_virt_ep *ep)
>   		if (td->cancel_status == TD_HALTED || trb_in_td(xhci, td, hw_deq, false)) {
>   			switch (td->cancel_status) {
>   			case TD_CLEARED: /* TD is already no-op */
> -			case TD_CLEARING_CACHE: /* set TR deq command already queued */
> +				break;
> +			case TD_CLEARING_CACHE: /* set TR deq command completed */
> +				td->cancel_status = TD_CLEARED;

This should only happen if 'Set TR Deq' fails as in success cases the hw_deq is moved past this TD.
when it fails the cache may not be cleared. i.e. rd is not TD_CLEARED

We do have the same problem in current code, but moving it here makes that even harder to fix.

>   				break;
>   			case TD_DIRTY: /* TD is cached, clear it */
>   			case TD_HALTED:
> @@ -1358,8 +1367,6 @@ static void xhci_handle_cmd_set_deq(struct xhci_hcd *xhci, int slot_id,
>   	struct xhci_virt_ep *ep;
>   	struct xhci_ep_ctx *ep_ctx;
>   	struct xhci_slot_ctx *slot_ctx;
> -	struct xhci_td *td, *tmp_td;
> -	bool deferred = false;
>   
>   	ep_index = TRB_TO_EP_INDEX(le32_to_cpu(trb->generic.field[3]));
>   	stream_id = TRB_TO_STREAM_ID(le32_to_cpu(trb->generic.field[2]));
> @@ -1451,37 +1458,15 @@ static void xhci_handle_cmd_set_deq(struct xhci_hcd *xhci, int slot_id,
>   				  ep->queued_deq_seg, ep->queued_deq_ptr);
>   		}
>   	}
> -	/* HW cached TDs cleared from cache, give them back */
> -	list_for_each_entry_safe(td, tmp_td, &ep->cancelled_td_list,
> -				 cancelled_td_list) {
> -		ep_ring = xhci_urb_to_transfer_ring(ep->xhci, td->urb);
> -		if (td->cancel_status == TD_CLEARING_CACHE) {
> -			td->cancel_status = TD_CLEARED;
> -			xhci_dbg(ep->xhci, "%s: Giveback cancelled URB %p TD\n",
> -				 __func__, td->urb);
> -			xhci_td_cleanup(ep->xhci, td, ep_ring, td->status);
> -		} else if (td->cancel_status == TD_CLEARING_CACHE_DEFERRED) {
> -			deferred = true;
> -		} else {
> -			xhci_dbg(ep->xhci, "%s: Keep cancelled URB %p TD as cancel_status is %d\n",
> -				 __func__, td->urb, td->cancel_status);
> -		}
> -	}
>   cleanup:
>   	ep->ep_state &= ~SET_DEQ_PENDING;
>   	ep->queued_deq_seg = NULL;
>   	ep->queued_deq_ptr = NULL;
>   
> -	if (deferred) {
> -		/* We have more streams to clear */
> -		xhci_dbg(ep->xhci, "%s: Pending TDs to clear, continuing with invalidation\n",
> -			 __func__);
> -		xhci_invalidate_cancelled_tds(ep);
> -	} else {
> -		/* Restart any rings with pending URBs */
> -		xhci_dbg(ep->xhci, "%s: All TDs cleared, ring doorbell\n", __func__);
> -		ring_doorbell_for_active_rings(xhci, slot_id, ep_index);
> -	}
> +	/* Continue cancellations or restart the ring if done */
> +	xhci_invalidate_cancelled_tds(ep);

xhci_invalidate_cancelled_tds() now gets a secondary purpose which is not obvious from
reading the code.
its repurposed here mainly to turn cancelled tds from TD_CLEARING_CACHE to
TD_CLEARED in cases where hw_deq is _NOT_ in the td.

xhci_invalidate_cancelled_tds() will now also perform some extra work on each
already cleared td that is still in the ep->cancelled_td_list

I think its clearer to run though the cancelled td list above, and only
call xhci_invalidate_cancelled_tds() if needed.

i.e.

list_for_each_entry_safe(td, tmp_td, &ep->cancelled_td_list, cancelled_td_list) {
	switch (td->cancel_status) {
	case TD_CLEARING_CACHE:
		td->cancel_status = TD_CLEARED;
		break;
	case TD_CLEARING_CACHE_DEFERRED:
	case TD_DIRTY:
		pending_td_cancels = true;
		break;
	default:
		break;
	}
}
...
if (pending_td_cancels)
	xhci_invalidate_cancelled_tds(ep);

Thanks
Mathias
Michał Pecio Nov. 4, 2024, 9:43 a.m. UTC | #2
On Thu, 31 Oct 2024 15:57:16 +0200, Mathias Nyman wrote:
> On 31.10.2024 11.44, Michal Pecio wrote:
> > +			case TD_CLEARING_CACHE: /* set TR deq command completed */
> > +				td->cancel_status = TD_CLEARED;  
> 
> This should only happen if 'Set TR Deq' fails as in success cases the
> hw_deq is moved past this TD. when it fails the cache may not be
> cleared. i.e. rd is not TD_CLEARED
Thanks, that was a bug, completely unintended. I should have placed
this line elsewhere.
 
> We do have the same problem in current code, but moving it here makes
> that even harder to fix.
Right, I will no longer try to move this code at all.

> xhci_invalidate_cancelled_tds() will now also perform some extra work
> on each already cleared td that is still in the ep->cancelled_td_list
I think this shouldn't be a problem, because every call to invalidate()
is now followed by a call to giveback(), so a future invalidate() should
never seen stale cleared TDs left by a past invalidate().

> I think its clearer to run though the cancelled td list above, and
> only call xhci_invalidate_cancelled_tds() if needed.
> 
> i.e.
> 
> list_for_each_entry_safe(td, tmp_td, &ep->cancelled_td_list,
> cancelled_td_list) { switch (td->cancel_status) {
> 	case TD_CLEARING_CACHE:
> 		td->cancel_status = TD_CLEARED;
> 		break;
> 	case TD_CLEARING_CACHE_DEFERRED:
> 	case TD_DIRTY:
> 		pending_td_cancels = true;
> 		break;
> 	default:
> 		break;
> 	}
> }
> ...
> if (pending_td_cancels)
> 	xhci_invalidate_cancelled_tds(ep);
I have done something similar, but without an extra bool. Simply test
for unempty cancelled_td_list.


Regards,
Michal
diff mbox series

Patch

diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
index 98e12267bbf6..52eb3ee1d7bf 100644
--- a/drivers/usb/host/xhci-ring.c
+++ b/drivers/usb/host/xhci-ring.c
@@ -959,7 +959,7 @@  static int xhci_handle_halted_endpoint(struct xhci_hcd *xhci,
  * We're also in the event handler, so we can't get re-interrupted if another
  * Stop Endpoint command completes.
  *
- * only call this when ring is not in a running state
+ * only call this on a ring in Stopped state (we may need to queue Set TR Deq)
  */
 
 static int xhci_invalidate_cancelled_tds(struct xhci_virt_ep *ep)
@@ -973,6 +973,13 @@  static int xhci_invalidate_cancelled_tds(struct xhci_virt_ep *ep)
 	unsigned int		slot_id = ep->vdev->slot_id;
 	int			err;
 
+	/*
+	 * This is not going to work if the hardware is changing its dequeue
+	 * pointers as we look at them. Completion handler will call us later.
+	 */
+	if (ep->ep_state & SET_DEQ_PENDING)
+		return 0;
+
 	xhci = ep->xhci;
 
 	list_for_each_entry_safe(td, tmp_td, &ep->cancelled_td_list, cancelled_td_list) {
@@ -1001,7 +1008,9 @@  static int xhci_invalidate_cancelled_tds(struct xhci_virt_ep *ep)
 		if (td->cancel_status == TD_HALTED || trb_in_td(xhci, td, hw_deq, false)) {
 			switch (td->cancel_status) {
 			case TD_CLEARED: /* TD is already no-op */
-			case TD_CLEARING_CACHE: /* set TR deq command already queued */
+				break;
+			case TD_CLEARING_CACHE: /* set TR deq command completed */
+				td->cancel_status = TD_CLEARED;
 				break;
 			case TD_DIRTY: /* TD is cached, clear it */
 			case TD_HALTED:
@@ -1358,8 +1367,6 @@  static void xhci_handle_cmd_set_deq(struct xhci_hcd *xhci, int slot_id,
 	struct xhci_virt_ep *ep;
 	struct xhci_ep_ctx *ep_ctx;
 	struct xhci_slot_ctx *slot_ctx;
-	struct xhci_td *td, *tmp_td;
-	bool deferred = false;
 
 	ep_index = TRB_TO_EP_INDEX(le32_to_cpu(trb->generic.field[3]));
 	stream_id = TRB_TO_STREAM_ID(le32_to_cpu(trb->generic.field[2]));
@@ -1451,37 +1458,15 @@  static void xhci_handle_cmd_set_deq(struct xhci_hcd *xhci, int slot_id,
 				  ep->queued_deq_seg, ep->queued_deq_ptr);
 		}
 	}
-	/* HW cached TDs cleared from cache, give them back */
-	list_for_each_entry_safe(td, tmp_td, &ep->cancelled_td_list,
-				 cancelled_td_list) {
-		ep_ring = xhci_urb_to_transfer_ring(ep->xhci, td->urb);
-		if (td->cancel_status == TD_CLEARING_CACHE) {
-			td->cancel_status = TD_CLEARED;
-			xhci_dbg(ep->xhci, "%s: Giveback cancelled URB %p TD\n",
-				 __func__, td->urb);
-			xhci_td_cleanup(ep->xhci, td, ep_ring, td->status);
-		} else if (td->cancel_status == TD_CLEARING_CACHE_DEFERRED) {
-			deferred = true;
-		} else {
-			xhci_dbg(ep->xhci, "%s: Keep cancelled URB %p TD as cancel_status is %d\n",
-				 __func__, td->urb, td->cancel_status);
-		}
-	}
 cleanup:
 	ep->ep_state &= ~SET_DEQ_PENDING;
 	ep->queued_deq_seg = NULL;
 	ep->queued_deq_ptr = NULL;
 
-	if (deferred) {
-		/* We have more streams to clear */
-		xhci_dbg(ep->xhci, "%s: Pending TDs to clear, continuing with invalidation\n",
-			 __func__);
-		xhci_invalidate_cancelled_tds(ep);
-	} else {
-		/* Restart any rings with pending URBs */
-		xhci_dbg(ep->xhci, "%s: All TDs cleared, ring doorbell\n", __func__);
-		ring_doorbell_for_active_rings(xhci, slot_id, ep_index);
-	}
+	/* Continue cancellations or restart the ring if done */
+	xhci_invalidate_cancelled_tds(ep);
+	ring_doorbell_for_active_rings(xhci, slot_id, ep_index);
+	xhci_giveback_invalidated_tds(ep);
 }
 
 static void xhci_handle_cmd_reset_ep(struct xhci_hcd *xhci, int slot_id,