diff mbox series

[RFC,v4,22/42] drm/vkms: Use s32 for internal color pipeline precision

Message ID 20240226211100.100108-23-harry.wentland@amd.com (mailing list archive)
State New, archived
Headers show
Series Color Pipeline API w/ VKMS | expand

Commit Message

Harry Wentland Feb. 26, 2024, 9:10 p.m. UTC
Certain operations require us to preserve values below 0.0 and
above 1.0 (0x0 and 0xffff respectively in 16 bpc unorm). One
such operation is a BT709 encoding operation followed by its
decoding operation, or the reverse.

We'll use s32 values as intermediate in and outputs of our
color operations, for the operations where it matters.

For now this won't apply to LUT operations. We might want to
update those to work on s32 as well, but it's unclear how
that should work for unorm LUT definitions. We'll revisit
that once we add LUT + CTM tests.

In order to allow for this we'll also invert the nesting of our
colorop processing loops. We now use the pixel iteration loop
on the outside and the colorop iteration on the inside.

v4:
 - Clarify that we're packing 16-bit UNORM into s32, not
   converting values to a different representation (Pekka)

v3:
 - Use new colorop->next pointer

Signed-off-by: Harry Wentland <harry.wentland@amd.com>
---
 drivers/gpu/drm/vkms/vkms_composer.c | 57 +++++++++++++++++++++-------
 drivers/gpu/drm/vkms/vkms_drv.h      |  4 ++
 2 files changed, 48 insertions(+), 13 deletions(-)

Comments

Pekka Paalanen March 12, 2024, 3:50 p.m. UTC | #1
On Mon, 26 Feb 2024 16:10:36 -0500
Harry Wentland <harry.wentland@amd.com> wrote:

> Certain operations require us to preserve values below 0.0 and
> above 1.0 (0x0 and 0xffff respectively in 16 bpc unorm). One
> such operation is a BT709 encoding operation followed by its
> decoding operation, or the reverse.
> 
> We'll use s32 values as intermediate in and outputs of our
> color operations, for the operations where it matters.
> 
> For now this won't apply to LUT operations. We might want to
> update those to work on s32 as well, but it's unclear how
> that should work for unorm LUT definitions. We'll revisit
> that once we add LUT + CTM tests.
> 
> In order to allow for this we'll also invert the nesting of our
> colorop processing loops. We now use the pixel iteration loop
> on the outside and the colorop iteration on the inside.

I always go through the same thought process:

- putting the pixel iteration loop outermost is going to be horrible
  for performance

- maybe should turn the temporary line buffer to argb_s32 for all of
  VKMS

- that's going to be a lot of memory traffic... maybe your way is not
  horrible for performance

- maybe processing pixel by pixel is good, if you can prepare the
  operation in advance, so you don't need to analyze colorops again and
  again on each pixel

- eh, maybe it's not a gain after all, needs benchmarking

My comment on patch 17 was like the step 0 of this train of thought. I
just always get the same comments in my mind when seeing the same code
again.


Thanks,
pq

> 
> v4:
>  - Clarify that we're packing 16-bit UNORM into s32, not
>    converting values to a different representation (Pekka)
> 
> v3:
>  - Use new colorop->next pointer
> 
> Signed-off-by: Harry Wentland <harry.wentland@amd.com>
> ---
>  drivers/gpu/drm/vkms/vkms_composer.c | 57 +++++++++++++++++++++-------
>  drivers/gpu/drm/vkms/vkms_drv.h      |  4 ++
>  2 files changed, 48 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/gpu/drm/vkms/vkms_composer.c b/drivers/gpu/drm/vkms/vkms_composer.c
> index 25b786b49db0..d2101fa55aa3 100644
> --- a/drivers/gpu/drm/vkms/vkms_composer.c
> +++ b/drivers/gpu/drm/vkms/vkms_composer.c
> @@ -164,7 +164,7 @@ static void apply_lut(const struct vkms_crtc_state *crtc_state, struct line_buff
>  	}
>  }
>  
> -static void apply_colorop(struct pixel_argb_u16 *pixel, struct drm_colorop *colorop)
> +static void apply_colorop(struct pixel_argb_s32 *pixel, struct drm_colorop *colorop)
>  {
>  	/* TODO is this right? */
>  	struct drm_colorop_state *colorop_state = colorop->state;
> @@ -191,25 +191,56 @@ static void apply_colorop(struct pixel_argb_u16 *pixel, struct drm_colorop *colo
>  
>  static void pre_blend_color_transform(const struct vkms_plane_state *plane_state, struct line_buffer *output_buffer)
>  {
> -	struct drm_colorop *colorop = plane_state->base.base.color_pipeline;
> +	struct drm_colorop *colorop;
> +	struct pixel_argb_s32 pixel;
>  
> -	while (colorop) {
> -		struct drm_colorop_state *colorop_state;
> +	for (size_t x = 0; x < output_buffer->n_pixels; x++) {
>  
> -		if (!colorop)
> -			return;
> +		/*
> +		 * Some operations, such as applying a BT709 encoding matrix,
> +		 * followed by a decoding matrix, require that we preserve
> +		 * values above 1.0 and below 0.0 until the end of the pipeline.
> +		 *
> +		 * Pack the 16-bit UNORM values into s32 to give us head-room to
> +		 * avoid clipping until we're at the end of the pipeline. Clip
> +		 * intentionally at the end of the pipeline before packing
> +		 * UNORM values back into u16.
> +		 */
> +		pixel.a = output_buffer->pixels[x].a;
> +		pixel.r = output_buffer->pixels[x].r;
> +		pixel.g = output_buffer->pixels[x].g;
> +		pixel.b = output_buffer->pixels[x].b;
>  
> -		/* TODO this is probably wrong */
> -		colorop_state = colorop->state;
> +		colorop = plane_state->base.base.color_pipeline;
> +		while (colorop) {
> +			struct drm_colorop_state *colorop_state;
>  
> -		if (!colorop_state)
> -			return;
> +			if (!colorop)
> +				return;
> +
> +			/* TODO this is probably wrong */
> +			colorop_state = colorop->state;
> +
> +			if (!colorop_state)
> +				return;
>  
> -		for (size_t x = 0; x < output_buffer->n_pixels; x++)
>  			if (!colorop_state->bypass)
> -				apply_colorop(&output_buffer->pixels[x], colorop);
> +				apply_colorop(&pixel, colorop);
>  
> -		colorop = colorop->next;
> +			colorop = colorop->next;
> +		}
> +
> +		/* clamp pixel */
> +		pixel.a = max(min(pixel.a, 0xffff), 0x0);
> +		pixel.r = max(min(pixel.r, 0xffff), 0x0);
> +		pixel.g = max(min(pixel.g, 0xffff), 0x0);
> +		pixel.b = max(min(pixel.b, 0xffff), 0x0);
> +
> +		/* put back to output_buffer */
> +		output_buffer->pixels[x].a = pixel.a;
> +		output_buffer->pixels[x].r = pixel.r;
> +		output_buffer->pixels[x].g = pixel.g;
> +		output_buffer->pixels[x].b = pixel.b;
>  	}
>  }
>  
> diff --git a/drivers/gpu/drm/vkms/vkms_drv.h b/drivers/gpu/drm/vkms/vkms_drv.h
> index 2bcc24c196a2..fadb7685a360 100644
> --- a/drivers/gpu/drm/vkms/vkms_drv.h
> +++ b/drivers/gpu/drm/vkms/vkms_drv.h
> @@ -36,6 +36,10 @@ struct vkms_frame_info {
>  	unsigned int cpp;
>  };
>  
> +struct pixel_argb_s32 {
> +	s32 a, r, g, b;
> +};
> +
>  struct pixel_argb_u16 {
>  	u16 a, r, g, b;
>  };
diff mbox series

Patch

diff --git a/drivers/gpu/drm/vkms/vkms_composer.c b/drivers/gpu/drm/vkms/vkms_composer.c
index 25b786b49db0..d2101fa55aa3 100644
--- a/drivers/gpu/drm/vkms/vkms_composer.c
+++ b/drivers/gpu/drm/vkms/vkms_composer.c
@@ -164,7 +164,7 @@  static void apply_lut(const struct vkms_crtc_state *crtc_state, struct line_buff
 	}
 }
 
-static void apply_colorop(struct pixel_argb_u16 *pixel, struct drm_colorop *colorop)
+static void apply_colorop(struct pixel_argb_s32 *pixel, struct drm_colorop *colorop)
 {
 	/* TODO is this right? */
 	struct drm_colorop_state *colorop_state = colorop->state;
@@ -191,25 +191,56 @@  static void apply_colorop(struct pixel_argb_u16 *pixel, struct drm_colorop *colo
 
 static void pre_blend_color_transform(const struct vkms_plane_state *plane_state, struct line_buffer *output_buffer)
 {
-	struct drm_colorop *colorop = plane_state->base.base.color_pipeline;
+	struct drm_colorop *colorop;
+	struct pixel_argb_s32 pixel;
 
-	while (colorop) {
-		struct drm_colorop_state *colorop_state;
+	for (size_t x = 0; x < output_buffer->n_pixels; x++) {
 
-		if (!colorop)
-			return;
+		/*
+		 * Some operations, such as applying a BT709 encoding matrix,
+		 * followed by a decoding matrix, require that we preserve
+		 * values above 1.0 and below 0.0 until the end of the pipeline.
+		 *
+		 * Pack the 16-bit UNORM values into s32 to give us head-room to
+		 * avoid clipping until we're at the end of the pipeline. Clip
+		 * intentionally at the end of the pipeline before packing
+		 * UNORM values back into u16.
+		 */
+		pixel.a = output_buffer->pixels[x].a;
+		pixel.r = output_buffer->pixels[x].r;
+		pixel.g = output_buffer->pixels[x].g;
+		pixel.b = output_buffer->pixels[x].b;
 
-		/* TODO this is probably wrong */
-		colorop_state = colorop->state;
+		colorop = plane_state->base.base.color_pipeline;
+		while (colorop) {
+			struct drm_colorop_state *colorop_state;
 
-		if (!colorop_state)
-			return;
+			if (!colorop)
+				return;
+
+			/* TODO this is probably wrong */
+			colorop_state = colorop->state;
+
+			if (!colorop_state)
+				return;
 
-		for (size_t x = 0; x < output_buffer->n_pixels; x++)
 			if (!colorop_state->bypass)
-				apply_colorop(&output_buffer->pixels[x], colorop);
+				apply_colorop(&pixel, colorop);
 
-		colorop = colorop->next;
+			colorop = colorop->next;
+		}
+
+		/* clamp pixel */
+		pixel.a = max(min(pixel.a, 0xffff), 0x0);
+		pixel.r = max(min(pixel.r, 0xffff), 0x0);
+		pixel.g = max(min(pixel.g, 0xffff), 0x0);
+		pixel.b = max(min(pixel.b, 0xffff), 0x0);
+
+		/* put back to output_buffer */
+		output_buffer->pixels[x].a = pixel.a;
+		output_buffer->pixels[x].r = pixel.r;
+		output_buffer->pixels[x].g = pixel.g;
+		output_buffer->pixels[x].b = pixel.b;
 	}
 }
 
diff --git a/drivers/gpu/drm/vkms/vkms_drv.h b/drivers/gpu/drm/vkms/vkms_drv.h
index 2bcc24c196a2..fadb7685a360 100644
--- a/drivers/gpu/drm/vkms/vkms_drv.h
+++ b/drivers/gpu/drm/vkms/vkms_drv.h
@@ -36,6 +36,10 @@  struct vkms_frame_info {
 	unsigned int cpp;
 };
 
+struct pixel_argb_s32 {
+	s32 a, r, g, b;
+};
+
 struct pixel_argb_u16 {
 	u16 a, r, g, b;
 };