diff mbox series

WIP! media: uvcvideo: Use dma_alloc_noncontiguos API

Message ID 20201118142546.170621-1-ribalda@chromium.org (mailing list archive)
State New, archived
Headers show
Series WIP! media: uvcvideo: Use dma_alloc_noncontiguos API | expand

Commit Message

Ricardo Ribalda Nov. 18, 2020, 2:25 p.m. UTC
On architectures where the is no coherent caching such as ARM use the
dma_alloc_noncontiguos API and handle manually the cache flushing using
dma_sync_single().

With this patch on the affected architectures we can measure up to 20x
performance improvement in uvc_video_copy_data_work().

Signed-off-by: Ricardo Ribalda <ribalda@chromium.org>
---

This patch depends on dma_alloc_contiguous API1315351diffmboxseries

https://lore.kernel.org/patchwork/patch/1315351/#1535182

 drivers/media/usb/uvc/uvc_video.c | 69 +++++++++++++++++++++++++------
 drivers/media/usb/uvc/uvcvideo.h  |  1 +
 2 files changed, 58 insertions(+), 12 deletions(-)

Comments

Christoph Hellwig Nov. 24, 2020, 11:35 a.m. UTC | #1
On Wed, Nov 18, 2020 at 03:25:46PM +0100, Ricardo Ribalda wrote:
> On architectures where the is no coherent caching such as ARM use the
> dma_alloc_noncontiguos API and handle manually the cache flushing using
> dma_sync_single().
> 
> With this patch on the affected architectures we can measure up to 20x
> performance improvement in uvc_video_copy_data_work().

This has a bunch of crazy long lines, but otherwise looks fine to me.

> 
> Signed-off-by: Ricardo Ribalda <ribalda@chromium.org>
> ---
> 
> This patch depends on dma_alloc_contiguous API1315351diffmboxseries

How do we want to proceed?  Do the media maintainers want to pick up
that patch?  Should I pick up the media patch in the dma-mapping tree?

Can you respost a combined series to get started?
Ricardo Ribalda Nov. 24, 2020, 12:01 p.m. UTC | #2
HI Christoph

On Tue, Nov 24, 2020 at 12:35 PM Christoph Hellwig <hch@lst.de> wrote:
>
> On Wed, Nov 18, 2020 at 03:25:46PM +0100, Ricardo Ribalda wrote:
> > On architectures where the is no coherent caching such as ARM use the
> > dma_alloc_noncontiguos API and handle manually the cache flushing using
> > dma_sync_single().
> >
> > With this patch on the affected architectures we can measure up to 20x
> > performance improvement in uvc_video_copy_data_work().
>
> This has a bunch of crazy long lines, but otherwise looks fine to me.

That is easy to solve :)

https://github.com/ribalda/linux/commit/17ab65a08302e845ad7ae7775ce54b387a58a887

>
> >
> > Signed-off-by: Ricardo Ribalda <ribalda@chromium.org>
> > ---
> >
> > This patch depends on dma_alloc_contiguous API1315351diffmboxseries
>
> How do we want to proceed?  Do the media maintainers want to pick up
> that patch?  Should I pick up the media patch in the dma-mapping tree?

I was hoping that you could answer that question :).

Do you have other use-cases than linux-media in mind?

I think Sergey wants to experiment also with vb2, to figure out how
much it affects it.
His change will be much more complicated than mine thought, there are
more cornercases there.

>
> Can you respost a combined series to get started?

Sure. Shall I also include the profiling patch?


Best regards
Christoph Hellwig Nov. 24, 2020, 1:33 p.m. UTC | #3
On Tue, Nov 24, 2020 at 01:01:33PM +0100, Ricardo Ribalda wrote:
> I was hoping that you could answer that question :).
> 
> Do you have other use-cases than linux-media in mind?
> 
> I think Sergey wants to experiment also with vb2, to figure out how
> much it affects it.
> His change will be much more complicated than mine thought, there are
> more cornercases there.

I don't have anything urgend lined up, although I think there are plenty
other potential use cases.

> > Can you respost a combined series to get started?
> 
> Sure. Shall I also include the profiling patch?

That is in the media code, right?  I don't really care too much.
diff mbox series

Patch

diff --git a/drivers/media/usb/uvc/uvc_video.c b/drivers/media/usb/uvc/uvc_video.c
index ff624bb857d3..ef1b029b8576 100644
--- a/drivers/media/usb/uvc/uvc_video.c
+++ b/drivers/media/usb/uvc/uvc_video.c
@@ -1641,6 +1641,11 @@  static void uvc_video_encode_bulk(struct uvc_urb *uvc_urb,
 	urb->transfer_buffer_length = stream->urb_size - len;
 }
 
+static inline struct device *stream_to_dmadev(struct uvc_streaming *stream)
+{
+	return stream->dev->udev->bus->controller->parent;
+}
+
 static void uvc_video_complete(struct urb *urb)
 {
 	struct uvc_urb *uvc_urb = urb->context;
@@ -1693,6 +1698,11 @@  static void uvc_video_complete(struct urb *urb)
 	 * Process the URB headers, and optionally queue expensive memcpy tasks
 	 * to be deferred to a work queue.
 	 */
+	if (uvc_urb->pages)
+		dma_sync_single_for_cpu(stream_to_dmadev(stream),
+					urb->transfer_dma,
+					urb->transfer_buffer_length,
+					DMA_FROM_DEVICE);
 	stream->decode(uvc_urb, buf, buf_meta);
 
 	/* If no async work is needed, resubmit the URB immediately. */
@@ -1723,8 +1733,15 @@  static void uvc_free_urb_buffers(struct uvc_streaming *stream)
 			continue;
 
 #ifndef CONFIG_DMA_NONCOHERENT
-		usb_free_coherent(stream->dev->udev, stream->urb_size,
-				  uvc_urb->buffer, uvc_urb->dma);
+		if (uvc_urb->pages) {
+			vunmap(uvc_urb->buffer);
+			dma_free_noncontiguous(stream_to_dmadev(stream),
+					       stream->urb_size,
+					       uvc_urb->pages, uvc_urb->dma);
+		} else {
+			usb_free_coherent(stream->dev->udev, stream->urb_size,
+					  uvc_urb->buffer, uvc_urb->dma);
+		}
 #else
 		kfree(uvc_urb->buffer);
 #endif
@@ -1734,6 +1751,42 @@  static void uvc_free_urb_buffers(struct uvc_streaming *stream)
 	stream->urb_size = 0;
 }
 
+#ifndef CONFIG_DMA_NONCOHERENT
+static bool uvc_alloc_urb_buffer(struct uvc_streaming *stream, struct uvc_urb *uvc_urb,
+				 gfp_t gfp_flags)
+{
+	struct device *dma_dev = dma_dev = stream_to_dmadev(stream);
+
+	if (!dma_can_alloc_noncontiguous(dma_dev)) {
+		uvc_urb->buffer = usb_alloc_coherent(stream->dev->udev, stream->urb_size,
+						     gfp_flags | __GFP_NOWARN, &uvc_urb->dma);
+		return uvc_urb->buffer != NULL;
+	}
+
+	uvc_urb->pages = dma_alloc_noncontiguous(dma_dev, stream->urb_size,
+						 &uvc_urb->dma, gfp_flags | __GFP_NOWARN, 0);
+	if (!uvc_urb->pages)
+		return false;
+
+	uvc_urb->buffer = vmap(uvc_urb->pages, PAGE_ALIGN(stream->urb_size) >> PAGE_SHIFT,
+			       VM_DMA_COHERENT, PAGE_KERNEL);
+	if (!uvc_urb->buffer) {
+		dma_free_noncontiguous(dma_dev, stream->urb_size, uvc_urb->pages, uvc_urb->dma);
+		return false;
+	}
+
+	return true;
+}
+#else
+static bool uvc_alloc_urb_buffer(struct uvc_streaming *stream, struct uvc_urb *uvc_urb,
+				 gfp_t gfp_flags)
+{
+	uvc_urb->buffer = kmalloc(stream->urb_size, gfp_flags | __GFP_NOWARN);
+
+	return uvc_urb->buffer != NULL;
+}
+#endif
+
 /*
  * Allocate transfer buffers. This function can be called with buffers
  * already allocated when resuming from suspend, in which case it will
@@ -1764,19 +1817,11 @@  static int uvc_alloc_urb_buffers(struct uvc_streaming *stream,
 
 	/* Retry allocations until one succeed. */
 	for (; npackets > 1; npackets /= 2) {
+		stream->urb_size = psize * npackets;
 		for (i = 0; i < UVC_URBS; ++i) {
 			struct uvc_urb *uvc_urb = &stream->uvc_urb[i];
 
-			stream->urb_size = psize * npackets;
-#ifndef CONFIG_DMA_NONCOHERENT
-			uvc_urb->buffer = usb_alloc_coherent(
-				stream->dev->udev, stream->urb_size,
-				gfp_flags | __GFP_NOWARN, &uvc_urb->dma);
-#else
-			uvc_urb->buffer =
-			    kmalloc(stream->urb_size, gfp_flags | __GFP_NOWARN);
-#endif
-			if (!uvc_urb->buffer) {
+			if (!uvc_alloc_urb_buffer(stream, uvc_urb, gfp_flags)) {
 				uvc_free_urb_buffers(stream);
 				break;
 			}
diff --git a/drivers/media/usb/uvc/uvcvideo.h b/drivers/media/usb/uvc/uvcvideo.h
index 60d830d74ac1..80eeeaf3cd06 100644
--- a/drivers/media/usb/uvc/uvcvideo.h
+++ b/drivers/media/usb/uvc/uvcvideo.h
@@ -544,6 +544,7 @@  struct uvc_urb {
 
 	char *buffer;
 	dma_addr_t dma;
+	struct page **pages;
 
 	unsigned int async_operations;
 	struct uvc_copy_op copy_operations[UVC_MAX_PACKETS];