diff mbox series

[2/2] drm/gud: Use scatter-gather USB bulk transfer

Message ID 20210329180120.27380-2-noralf@tronnes.org (mailing list archive)
State New, archived
Headers show
Series [1/2] drm/gud: Free buffers on device removal | expand

Commit Message

Noralf Trønnes March 29, 2021, 6:01 p.m. UTC
There's a limit to how big a kmalloc buffer can be, and as memory gets
fragmented it becomes more difficult to get big buffers. The downside of
smaller buffers is that the driver has to split the transfer up which
hampers performance. Compression might also take a hit because of the
splitting.

Solve this by allocating the transfer buffer using vmalloc and create a
SG table to be passed on to the USB subsystem. vmalloc_32() is used to
avoid DMA bounce buffers on USB controllers that can only access 32-bit
addresses.

This also solves the problem that split transfers can give host side
tearing since flushing is decoupled from rendering.

Signed-off-by: Noralf Trønnes <noralf@tronnes.org>
---
 drivers/gpu/drm/gud/gud_drv.c      | 49 +++++++++++++++++++++---------
 drivers/gpu/drm/gud/gud_internal.h |  2 ++
 drivers/gpu/drm/gud/gud_pipe.c     | 47 ++++++++++++++++++++++++----
 3 files changed, 77 insertions(+), 21 deletions(-)

Comments

Linus Walleij June 14, 2021, 8:54 p.m. UTC | #1
Hi Noralf,

On Mon, Mar 29, 2021 at 8:01 PM Noralf Trønnes <noralf@tronnes.org> wrote:

> There's a limit to how big a kmalloc buffer can be, and as memory gets
> fragmented it becomes more difficult to get big buffers. The downside of
> smaller buffers is that the driver has to split the transfer up which
> hampers performance. Compression might also take a hit because of the
> splitting.
>
> Solve this by allocating the transfer buffer using vmalloc and create a
> SG table to be passed on to the USB subsystem. vmalloc_32() is used to
> avoid DMA bounce buffers on USB controllers that can only access 32-bit
> addresses.
>
> This also solves the problem that split transfers can give host side
> tearing since flushing is decoupled from rendering.
>
> Signed-off-by: Noralf Trønnes <noralf@tronnes.org>

> +       num_pages = PAGE_ALIGN(gdrm->bulk_len) >> PAGE_SHIFT;

Isn't it the same to write:

num_pages = round_up(gdrm->bulk_len, PAGE_SIZE)?

Slightly easier to read IMO.

> +       if (max_buffer_size > SZ_64M)
> +               max_buffer_size = SZ_64M; /* safeguard */

Explain this choice of max buffer in the commit message
or as a comment please because I don't get why this size
is the roof.

> +struct gud_usb_bulk_context {
> +       struct timer_list timer;
> +       struct usb_sg_request sgr;
> +};
> +
> +static void gud_usb_bulk_timeout(struct timer_list *t)
> +{
> +       struct gud_usb_bulk_context *timer = from_timer(timer, t, timer);
> +
> +       usb_sg_cancel(&timer->sgr);

Error message here? "Timeout on sg bulk transfer".

> +}
> +
> +static int gud_usb_bulk(struct gud_device *gdrm, size_t len)
> +{
> +       struct gud_usb_bulk_context ctx;
> +       int ret;
> +
> +       ret = usb_sg_init(&ctx.sgr, gud_to_usb_device(gdrm), gdrm->bulk_pipe, 0,
> +                         gdrm->bulk_sgt.sgl, gdrm->bulk_sgt.nents, len, GFP_KERNEL);
> +       if (ret)
> +               return ret;
> +
> +       timer_setup_on_stack(&ctx.timer, gud_usb_bulk_timeout, 0);
> +       mod_timer(&ctx.timer, jiffies + msecs_to_jiffies(3000));
> +
> +       usb_sg_wait(&ctx.sgr);
> +
> +       if (!del_timer_sync(&ctx.timer))
> +               ret = -ETIMEDOUT;
> +       else if (ctx.sgr.status < 0)
> +               ret = ctx.sgr.status;
> +       else if (ctx.sgr.bytes != len)
> +               ret = -EIO;
> +
> +       destroy_timer_on_stack(&ctx.timer);
> +
> +       return ret;
> +}

Mention in the commit message that sending USB bulk transfers with
an sglist could be unstable so you set up a timeout around
usb_sg_wait() (did this happen to you? then write that)

The other users of usb_sg_wait() in the kernel do not have these
timeout wrappers, I suspect the reasoning is something like
"it's graphics, not storage, so if we timeout and lose an update,
too bad but let's just continue hoping the lost graphics will be
less than noticeable" so then we should write that as a comment
in the code or something.

With these comments fixed up:
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>

Yours,
Linus Walleij
Noralf Trønnes June 15, 2021, 8:48 a.m. UTC | #2
Den 14.06.2021 22.54, skrev Linus Walleij:
> Hi Noralf,
> 
> On Mon, Mar 29, 2021 at 8:01 PM Noralf Trønnes <noralf@tronnes.org> wrote:
> 
>> There's a limit to how big a kmalloc buffer can be, and as memory gets
>> fragmented it becomes more difficult to get big buffers. The downside of
>> smaller buffers is that the driver has to split the transfer up which
>> hampers performance. Compression might also take a hit because of the
>> splitting.
>>
>> Solve this by allocating the transfer buffer using vmalloc and create a
>> SG table to be passed on to the USB subsystem. vmalloc_32() is used to
>> avoid DMA bounce buffers on USB controllers that can only access 32-bit
>> addresses.
>>
>> This also solves the problem that split transfers can give host side
>> tearing since flushing is decoupled from rendering.
>>
>> Signed-off-by: Noralf Trønnes <noralf@tronnes.org>
> 
>> +       num_pages = PAGE_ALIGN(gdrm->bulk_len) >> PAGE_SHIFT;
> 
> Isn't it the same to write:
> 
> num_pages = round_up(gdrm->bulk_len, PAGE_SIZE)?
> 
> Slightly easier to read IMO.
> 

Yes it's the same, I just copied this from elsewhere in the kernel where
a vmalloc buffer is turned into an sg list. I can change that.

>> +       if (max_buffer_size > SZ_64M)
>> +               max_buffer_size = SZ_64M; /* safeguard */
> 
> Explain this choice of max buffer in the commit message
> or as a comment please because I don't get why this size
> is the roof.
> 
>> +struct gud_usb_bulk_context {
>> +       struct timer_list timer;
>> +       struct usb_sg_request sgr;
>> +};
>> +
>> +static void gud_usb_bulk_timeout(struct timer_list *t)
>> +{
>> +       struct gud_usb_bulk_context *timer = from_timer(timer, t, timer);
>> +
>> +       usb_sg_cancel(&timer->sgr);
> 
> Error message here? "Timeout on sg bulk transfer".
> 

A timeout is detected in gud_usb_bulk() which will return -ETIMEDOUT if
the timer did fire. gud_flush_work() will print an error message.

>> +}
>> +
>> +static int gud_usb_bulk(struct gud_device *gdrm, size_t len)
>> +{
>> +       struct gud_usb_bulk_context ctx;
>> +       int ret;
>> +
>> +       ret = usb_sg_init(&ctx.sgr, gud_to_usb_device(gdrm), gdrm->bulk_pipe, 0,
>> +                         gdrm->bulk_sgt.sgl, gdrm->bulk_sgt.nents, len, GFP_KERNEL);
>> +       if (ret)
>> +               return ret;
>> +
>> +       timer_setup_on_stack(&ctx.timer, gud_usb_bulk_timeout, 0);
>> +       mod_timer(&ctx.timer, jiffies + msecs_to_jiffies(3000));
>> +
>> +       usb_sg_wait(&ctx.sgr);
>> +
>> +       if (!del_timer_sync(&ctx.timer))
>> +               ret = -ETIMEDOUT;
>> +       else if (ctx.sgr.status < 0)
>> +               ret = ctx.sgr.status;
>> +       else if (ctx.sgr.bytes != len)
>> +               ret = -EIO;
>> +
>> +       destroy_timer_on_stack(&ctx.timer);
>> +
>> +       return ret;
>> +}
> 
> Mention in the commit message that sending USB bulk transfers with
> an sglist could be unstable so you set up a timeout around
> usb_sg_wait() (did this happen to you? then write that)
> 
> The other users of usb_sg_wait() in the kernel do not have these
> timeout wrappers, I suspect the reasoning is something like
> "it's graphics, not storage, so if we timeout and lose an update,
> too bad but let's just continue hoping the lost graphics will be
> less than noticeable" so then we should write that as a comment
> in the code or something.
> 

There are 5 users of usb_sg_wait() in the kernel:
drivers/input/touchscreen/sur40.c
drivers/misc/cardreader/rtsx_usb.c
drivers/mmc/host/vub300.c
drivers/usb/misc/usbtest.c
drivers/usb/storage/transport.c

3 of those wrap it in a timer:
drivers/misc/cardreader/rtsx_usb.c: rtsx_usb_bulk_transfer_sglist()
drivers/mmc/host/vub300.c: __command_write_data()
drivers/usb/misc/usbtest.c: perform_sglist()

And it looks to me like usb/storage has some timeout handling through
the scsi layer:
/drivers/usb/storage/scsiglue.c: command_abort() ->
usb_stor_stop_transport() -> usb_sg_cancel()

This leaves 1 out of 5 users without timeout handling?

usb_bulk_msg() has builtin timeout handling and during development of a
microcontroller gadget implementation I've triggered this timeout
several times when the uC usb interrupts stopped firing.

I can add a comment in the commit message about the timer.

Noralf.

> With these comments fixed up:
> Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
> 
> Yours,
> Linus Walleij
>
Peter Stuge June 15, 2021, 9:17 a.m. UTC | #3
Hi Noralf,

Noralf Trønnes wrote:
> >> +static int gud_usb_bulk(struct gud_device *gdrm, size_t len)
..
> >> +       timer_setup_on_stack(&ctx.timer, gud_usb_bulk_timeout, 0);
> >> +       mod_timer(&ctx.timer, jiffies + msecs_to_jiffies(3000));
> >> +
> >> +       usb_sg_wait(&ctx.sgr);
> >> +
> >> +       if (!del_timer_sync(&ctx.timer))
> >> +               ret = -ETIMEDOUT;
..
> > Mention in the commit message that sending USB bulk transfers with
> > an sglist could be unstable

Can you explain a bit about /how/ it is unstable?

As you write, usb_bulk_msg() (as used before) has a timeout which is
passed to the host controller hardware and implemented there.

I haven't used SG with kernel USB but I would expect such a timeout
to still be available with SG?


> usb_bulk_msg() has builtin timeout handling and during development of
> a microcontroller gadget implementation I've triggered this timeout
> several times when the uC usb interrupts stopped firing.

The device not responding to bulk packets scheduled and sent by the host
is a real error /in the device/ and thus not necessarily something the
kernel must handle gracefully.. I think it's quite nice to do so, but
one can argue that it's not strictly required.

But more importantly: Remember that bulk transfer has no delivery time
guarantee. It can take indefinitely long until a bulk transfer is
scheduled by the host on a busy bus which is starved with more
important things (control, interrupt, iso transfers) - that's not
an error at all, and may be indistinguishable from the device not
responding to packets actually sent by the host.

Having a timeout is important, I just expect the USB SG interface to
support it since it is the hardware that times out in the non-SG case.


And since this is essentially real time data maybe a shorter timeout
is better? 3 seconds seems really long.

The timeout must include all latency for a frame, so e.g. 16ms (60 Hz)
is too short for sure. But maybe something like 500ms?


//Peter
Noralf Trønnes June 15, 2021, 12:19 p.m. UTC | #4
Den 15.06.2021 11.17, skrev Peter Stuge:
> Hi Noralf,
> 
> Noralf Trønnes wrote:
>>>> +static int gud_usb_bulk(struct gud_device *gdrm, size_t len)
> ..
>>>> +       timer_setup_on_stack(&ctx.timer, gud_usb_bulk_timeout, 0);
>>>> +       mod_timer(&ctx.timer, jiffies + msecs_to_jiffies(3000));
>>>> +
>>>> +       usb_sg_wait(&ctx.sgr);
>>>> +
>>>> +       if (!del_timer_sync(&ctx.timer))
>>>> +               ret = -ETIMEDOUT;
> ..
>>> Mention in the commit message that sending USB bulk transfers with
>>> an sglist could be unstable
> 
> Can you explain a bit about /how/ it is unstable?
> 
> As you write, usb_bulk_msg() (as used before) has a timeout which is
> passed to the host controller hardware and implemented there.
> 
> I haven't used SG with kernel USB but I would expect such a timeout
> to still be available with SG?
> 

I have taken a closer look and usb_bulk_msg() calls usb_start_wait_urb()
which uses wait_for_completion_timeout() so the timeout isn't handled by
the hardware.

usb_sg_wait() on the other hand uses plain wait_for_completion() without
the timeout. So ideally usb_sg_wait() should have had a timeout
parameter and used wait_for_completion_timeout().

> 
>> usb_bulk_msg() has builtin timeout handling and during development of
>> a microcontroller gadget implementation I've triggered this timeout
>> several times when the uC usb interrupts stopped firing.
> 
> The device not responding to bulk packets scheduled and sent by the host
> is a real error /in the device/ and thus not necessarily something the
> kernel must handle gracefully.. I think it's quite nice to do so, but
> one can argue that it's not strictly required.
> 
> But more importantly: Remember that bulk transfer has no delivery time
> guarantee. It can take indefinitely long until a bulk transfer is
> scheduled by the host on a busy bus which is starved with more
> important things (control, interrupt, iso transfers) - that's not
> an error at all, and may be indistinguishable from the device not
> responding to packets actually sent by the host.
> 
> Having a timeout is important, I just expect the USB SG interface to
> support it since it is the hardware that times out in the non-SG case.
> 
> 
> And since this is essentially real time data maybe a shorter timeout
> is better? 3 seconds seems really long.
> 

I have looked at what the others are using:
- rtsx_usb uses 10 seconds in one place
- vub300 uses 2 seconds plus a length based addition
- usbtest uses 10 seconds.

The other USB DRM driver gm12u320 uses a 1 second timeout per block, so
a worst case timeout of 20 seconds per frame.

3 seconds is a "long time", but compared to the default control request
timeout USB_CTRL_GET_TIMEOUT which is 5 seconds, and which gud uses,
it's not that long. I don't want to put too much limitation on the
device, but ofc can't allow it to hang the driver.

And a timeout is an exception so hitting that probably means something
is seriously wrong. I thought of adding some kind of usb bus reset
handling to the driver that kicks in after the device has been
unresponsive for some time, but dropped that since I have so limited
understanding of things USB.

Noralf.

> The timeout must include all latency for a frame, so e.g. 16ms (60 Hz)
> is too short for sure. But maybe something like 500ms?
> 
> 
> //Peter
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/gud/gud_drv.c b/drivers/gpu/drm/gud/gud_drv.c
index 820c7331b3b3..8f9bcf6561e8 100644
--- a/drivers/gpu/drm/gud/gud_drv.c
+++ b/drivers/gpu/drm/gud/gud_drv.c
@@ -394,13 +394,40 @@  static const struct drm_driver gud_drm_driver = {
 	.minor			= 0,
 };
 
+static int gud_alloc_bulk_buffer(struct gud_device *gdrm)
+{
+	unsigned int i, num_pages;
+	struct page **pages;
+	void *ptr;
+	int ret;
+
+	gdrm->bulk_buf = vmalloc_32(gdrm->bulk_len);
+	if (!gdrm->bulk_buf)
+		return -ENOMEM;
+
+	num_pages = PAGE_ALIGN(gdrm->bulk_len) >> PAGE_SHIFT;
+	pages = kmalloc_array(num_pages, sizeof(struct page *), GFP_KERNEL);
+	if (!pages)
+		return -ENOMEM;
+
+	for (i = 0, ptr = gdrm->bulk_buf; i < num_pages; i++, ptr += PAGE_SIZE)
+		pages[i] = vmalloc_to_page(ptr);
+
+	ret = sg_alloc_table_from_pages(&gdrm->bulk_sgt, pages, num_pages,
+					0, gdrm->bulk_len, GFP_KERNEL);
+	kfree(pages);
+
+	return ret;
+}
+
 static void gud_free_buffers_and_mutex(void *data)
 {
 	struct gud_device *gdrm = data;
 
 	vfree(gdrm->compress_buf);
 	gdrm->compress_buf = NULL;
-	kfree(gdrm->bulk_buf);
+	sg_free_table(&gdrm->bulk_sgt);
+	vfree(gdrm->bulk_buf);
 	gdrm->bulk_buf = NULL;
 	mutex_destroy(&gdrm->ctrl_lock);
 	mutex_destroy(&gdrm->damage_lock);
@@ -538,24 +565,16 @@  static int gud_probe(struct usb_interface *intf, const struct usb_device_id *id)
 
 	if (desc.max_buffer_size)
 		max_buffer_size = le32_to_cpu(desc.max_buffer_size);
-retry:
-	/*
-	 * Use plain kmalloc here since devm_kmalloc() places struct devres at the beginning
-	 * of the buffer it allocates. This wastes a lot of memory when allocating big buffers.
-	 * Asking for 2M would actually allocate 4M. This would also prevent getting the biggest
-	 * possible buffer potentially leading to split transfers.
-	 */
-	gdrm->bulk_buf = kmalloc(max_buffer_size, GFP_KERNEL | __GFP_NOWARN);
-	if (!gdrm->bulk_buf) {
-		max_buffer_size = roundup_pow_of_two(max_buffer_size) / 2;
-		if (max_buffer_size < SZ_512K)
-			return -ENOMEM;
-		goto retry;
-	}
+	if (max_buffer_size > SZ_64M)
+		max_buffer_size = SZ_64M; /* safeguard */
 
 	gdrm->bulk_pipe = usb_sndbulkpipe(interface_to_usbdev(intf), usb_endpoint_num(bulk_out));
 	gdrm->bulk_len = max_buffer_size;
 
+	ret = gud_alloc_bulk_buffer(gdrm);
+	if (ret)
+		return ret;
+
 	if (gdrm->compression & GUD_COMPRESSION_LZ4) {
 		gdrm->lz4_comp_mem = devm_kmalloc(dev, LZ4_MEM_COMPRESS, GFP_KERNEL);
 		if (!gdrm->lz4_comp_mem)
diff --git a/drivers/gpu/drm/gud/gud_internal.h b/drivers/gpu/drm/gud/gud_internal.h
index de2f2d2dbc60..1bb65a46c347 100644
--- a/drivers/gpu/drm/gud/gud_internal.h
+++ b/drivers/gpu/drm/gud/gud_internal.h
@@ -5,6 +5,7 @@ 
 
 #include <linux/list.h>
 #include <linux/mutex.h>
+#include <linux/scatterlist.h>
 #include <linux/usb.h>
 #include <linux/workqueue.h>
 #include <uapi/drm/drm_fourcc.h>
@@ -26,6 +27,7 @@  struct gud_device {
 	unsigned int bulk_pipe;
 	void *bulk_buf;
 	size_t bulk_len;
+	struct sg_table bulk_sgt;
 
 	u8 compression;
 	void *lz4_comp_mem;
diff --git a/drivers/gpu/drm/gud/gud_pipe.c b/drivers/gpu/drm/gud/gud_pipe.c
index 2f83ab6b8e61..7dd63a8c7c2d 100644
--- a/drivers/gpu/drm/gud/gud_pipe.c
+++ b/drivers/gpu/drm/gud/gud_pipe.c
@@ -220,13 +220,51 @@  static int gud_prep_flush(struct gud_device *gdrm, struct drm_framebuffer *fb,
 	return ret;
 }
 
+struct gud_usb_bulk_context {
+	struct timer_list timer;
+	struct usb_sg_request sgr;
+};
+
+static void gud_usb_bulk_timeout(struct timer_list *t)
+{
+	struct gud_usb_bulk_context *timer = from_timer(timer, t, timer);
+
+	usb_sg_cancel(&timer->sgr);
+}
+
+static int gud_usb_bulk(struct gud_device *gdrm, size_t len)
+{
+	struct gud_usb_bulk_context ctx;
+	int ret;
+
+	ret = usb_sg_init(&ctx.sgr, gud_to_usb_device(gdrm), gdrm->bulk_pipe, 0,
+			  gdrm->bulk_sgt.sgl, gdrm->bulk_sgt.nents, len, GFP_KERNEL);
+	if (ret)
+		return ret;
+
+	timer_setup_on_stack(&ctx.timer, gud_usb_bulk_timeout, 0);
+	mod_timer(&ctx.timer, jiffies + msecs_to_jiffies(3000));
+
+	usb_sg_wait(&ctx.sgr);
+
+	if (!del_timer_sync(&ctx.timer))
+		ret = -ETIMEDOUT;
+	else if (ctx.sgr.status < 0)
+		ret = ctx.sgr.status;
+	else if (ctx.sgr.bytes != len)
+		ret = -EIO;
+
+	destroy_timer_on_stack(&ctx.timer);
+
+	return ret;
+}
+
 static int gud_flush_rect(struct gud_device *gdrm, struct drm_framebuffer *fb,
 			  const struct drm_format_info *format, struct drm_rect *rect)
 {
-	struct usb_device *usb = gud_to_usb_device(gdrm);
 	struct gud_set_buffer_req req;
-	int ret, actual_length;
 	size_t len, trlen;
+	int ret;
 
 	drm_dbg(&gdrm->drm, "Flushing [FB:%d] " DRM_RECT_FMT "\n", fb->base.id, DRM_RECT_ARG(rect));
 
@@ -255,10 +293,7 @@  static int gud_flush_rect(struct gud_device *gdrm, struct drm_framebuffer *fb,
 			return ret;
 	}
 
-	ret = usb_bulk_msg(usb, gdrm->bulk_pipe, gdrm->bulk_buf, trlen,
-			   &actual_length, msecs_to_jiffies(3000));
-	if (!ret && trlen != actual_length)
-		ret = -EIO;
+	ret = gud_usb_bulk(gdrm, trlen);
 	if (ret)
 		gdrm->stats_num_errors++;