diff mbox

[v2,2/5] gpu: ipu-v3: Add mem2mem image conversion support to IC

Message ID 1426674173-17088-3-git-send-email-p.zabel@pengutronix.de (mailing list archive)
State New, archived
Headers show

Commit Message

Philipp Zabel March 18, 2015, 10:22 a.m. UTC
This patch adds support for mem2mem scaling and colorspace conversion
using the IC module's post-processing task.

Scaling images larger than 1024x1024 is supported by tiling over multiple
IC scaling runs. Since the IDMAC and IC units have interesting and different
alignment limitations for buffer base addresses (left edges) and burst size
(row lengths), depending on input and output pixel formats, the tile rectangles
and scaling coefficients are chosen to minimize distortion. Due to possible
overlap, the tiles have to be rendered right to left and bottom to top.
Up to 7 pixels (depending on frame sizes and scaling factor) have to be
available after the end of the frame if the width is not burst size aligned.
The tiling code has a parameter to optionally round frame sizes up or down
and avoid overdraw in compositing scenarios.

Signed-off-by: Sascha Hauer <s.hauer@pengutronix.de>
Signed-off-by: Lucas Stach <l.stach@pengutronix.de>
Signed-off-by: Philipp Zabel <p.zabel@pengutronix.de>
---
Changes since v1:
 - Removed deinterlacer support left-overs
---
 drivers/gpu/ipu-v3/ipu-ic.c | 787 +++++++++++++++++++++++++++++++++++++++++++-
 include/video/imx-ipu-v3.h  |  34 +-
 2 files changed, 804 insertions(+), 17 deletions(-)

Comments

Jean-Michel Hautbois May 27, 2015, 6:42 p.m. UTC | #1
Hi Philipp, Lucas and Sascha,

Thanks for that patch series.

2015-03-18 11:22 GMT+01:00 Philipp Zabel <p.zabel@pengutronix.de>:
>
> This patch adds support for mem2mem scaling and colorspace conversion
> using the IC module's post-processing task.
>
> Scaling images larger than 1024x1024 is supported by tiling over multiple
> IC scaling runs. Since the IDMAC and IC units have interesting and different
> alignment limitations for buffer base addresses (left edges) and burst size
> (row lengths), depending on input and output pixel formats, the tile rectangles
> and scaling coefficients are chosen to minimize distortion. Due to possible
> overlap, the tiles have to be rendered right to left and bottom to top.
> Up to 7 pixels (depending on frame sizes and scaling factor) have to be
> available after the end of the frame if the width is not burst size aligned.
> The tiling code has a parameter to optionally round frame sizes up or down
> and avoid overdraw in compositing scenarios.

Can you elaborate on what you mean by "compositing scenarios"?

>
> Signed-off-by: Sascha Hauer <s.hauer@pengutronix.de>
> Signed-off-by: Lucas Stach <l.stach@pengutronix.de>
> Signed-off-by: Philipp Zabel <p.zabel@pengutronix.de>
> ---
> Changes since v1:
>  - Removed deinterlacer support left-overs
> ---
>  drivers/gpu/ipu-v3/ipu-ic.c | 787 +++++++++++++++++++++++++++++++++++++++++++-
>  include/video/imx-ipu-v3.h  |  34 +-
>  2 files changed, 804 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/gpu/ipu-v3/ipu-ic.c b/drivers/gpu/ipu-v3/ipu-ic.c
> index ad75588..984f68f 100644
> --- a/drivers/gpu/ipu-v3/ipu-ic.c
> +++ b/drivers/gpu/ipu-v3/ipu-ic.c
> @@ -15,6 +15,7 @@
>  #include <linux/errno.h>
>  #include <linux/spinlock.h>
>  #include <linux/bitrev.h>
> +#include <linux/interrupt.h>
>  #include <linux/io.h>
>  #include <linux/err.h>
>  #include "ipu-prv.h"
> @@ -96,6 +97,15 @@ struct ic_task_bitfields {
>         u32 ic_cmb_galpha_bit;
>  };
>
> +struct ic_task_channels {
> +       u8 in;
> +       u8 out;
> +       u8 rot_in;
> +       u8 rot_out;
> +       u8 in_prev;
> +       u8 in_next;
> +};
> +
>  static const struct ic_task_regoffs ic_task_reg[IC_NUM_TASKS] = {
>         [IC_TASK_ENCODER] = {
>                 .rsc = IC_PRP_ENC_RSC,
> @@ -138,12 +148,53 @@ static const struct ic_task_bitfields ic_task_bit[IC_NUM_TASKS] = {
>         },
>  };
>
> +static const struct ic_task_channels ic_task_ch[IC_NUM_TASKS] = {
> +       [IC_TASK_ENCODER] = {
> +               .in = IPUV3_CHANNEL_MEM_IC_PRP_VF,
> +               .out = IPUV3_CHANNEL_IC_PRP_ENC_MEM,
> +               .rot_in = IPUV3_CHANNEL_MEM_ROT_ENC,
> +               .rot_out = IPUV3_CHANNEL_ROT_ENC_MEM,
> +       },
> +       [IC_TASK_VIEWFINDER] = {
> +               .in = IPUV3_CHANNEL_MEM_VDI_CUR,
> +               .out = IPUV3_CHANNEL_IC_PRP_VF_MEM,
> +               .rot_in = IPUV3_CHANNEL_MEM_ROT_VF,
> +               .rot_out = IPUV3_CHANNEL_ROT_VF_MEM,
> +               .in_prev = IPUV3_CHANNEL_MEM_VDI_PREV,
> +               .in_next = IPUV3_CHANNEL_MEM_VDI_NEXT,
> +       },
> +       [IC_TASK_POST_PROCESSOR] = {
> +               .in = IPUV3_CHANNEL_MEM_IC_PP,
> +               .out = IPUV3_CHANNEL_IC_PP_MEM,
> +               .rot_in = IPUV3_CHANNEL_MEM_ROT_PP,
> +               .rot_out = IPUV3_CHANNEL_ROT_PP_MEM,
> +       },
> +};
> +
> +struct image_convert_ctx {
> +       void (*complete)(void *ctx, int err);
> +       void *complete_context;
> +
> +       struct list_head list;
> +       struct ipu_image in;
> +       struct ipu_image in_n;
> +       struct ipu_image in_p;
> +       struct ipu_image out;
> +
> +       void *freep;
> +
> +       bool rotate:1;
> +
> +       u32 rsc;
> +};
> +
>  struct ipu_ic_priv;
>
>  struct ipu_ic {
>         enum ipu_ic_task task;
>         const struct ic_task_regoffs *reg;
>         const struct ic_task_bitfields *bit;
> +       const struct ic_task_channels *ch;
>
>         enum ipu_color_space in_cs, g_in_cs;
>         enum ipu_color_space out_cs;
> @@ -152,6 +203,19 @@ struct ipu_ic {
>         bool in_use;
>
>         struct ipu_ic_priv *priv;
> +
> +       struct ipuv3_channel *input_channel_p;
> +       struct ipuv3_channel *input_channel;
> +       struct ipuv3_channel *input_channel_n;
> +       struct ipuv3_channel *output_channel;
> +       struct ipuv3_channel *rotation_input_channel;
> +       struct ipuv3_channel *rotation_output_channel;
> +
> +       struct list_head image_list;
> +
> +       struct workqueue_struct *workqueue;
> +       struct work_struct work;
> +       struct completion complete;
>  };

Since this runs from a workqueue, the work can sleep, and you don't know
exactly when it will be called.
Can we be sure that it is "real-time" compatible? If this scaler sits
after a capture source and before the coda driver, could it be starved
of buffers?
And since there can even be multiple instances of the scaler, you could
probably get into trouble if there are not enough buffers on the
capture and output queues, right?
I have played with it a bit and have successfully run two instances
on IPU1 and two more on IPU2.
But I don't know whether there can be side effects...

JM

>
>  struct ipu_ic_priv {
> @@ -168,7 +232,8 @@ static inline u32 ipu_ic_read(struct ipu_ic *ic, unsigned offset)
>         return readl(ic->priv->base + offset);
>  }
>
> -static inline void ipu_ic_write(struct ipu_ic *ic, u32 value, unsigned offset)
> +static inline void ipu_ic_write(struct ipu_ic *ic, u32 value,
> +                               unsigned offset)
>  {
>         writel(value, ic->priv->base + offset);
>  }
> @@ -446,32 +511,35 @@ int ipu_ic_task_init(struct ipu_ic *ic,
>                      int in_width, int in_height,
>                      int out_width, int out_height,
>                      enum ipu_color_space in_cs,
> -                    enum ipu_color_space out_cs)
> +                    enum ipu_color_space out_cs,
> +                    u32 rsc)
>  {
>         struct ipu_ic_priv *priv = ic->priv;
> -       u32 reg, downsize_coeff, resize_coeff;
> +       u32 downsize_coeff, resize_coeff;
>         unsigned long flags;
>         int ret = 0;
>
> -       /* Setup vertical resizing */
> -       ret = calc_resize_coeffs(ic, in_height, out_height,
> -                                &resize_coeff, &downsize_coeff);
> -       if (ret)
> -               return ret;
> +       if (!rsc) {
> +               /* Setup vertical resizing */
> +               ret = calc_resize_coeffs(ic, in_height, out_height,
> +                                        &resize_coeff, &downsize_coeff);
> +               if (ret)
> +                       return ret;
>
> -       reg = (downsize_coeff << 30) | (resize_coeff << 16);
> +               rsc = (downsize_coeff << 30) | (resize_coeff << 16);
>
> -       /* Setup horizontal resizing */
> -       ret = calc_resize_coeffs(ic, in_width, out_width,
> -                                &resize_coeff, &downsize_coeff);
> -       if (ret)
> -               return ret;
> +               /* Setup horizontal resizing */
> +               ret = calc_resize_coeffs(ic, in_width, out_width,
> +                                        &resize_coeff, &downsize_coeff);
> +               if (ret)
> +                       return ret;
>
> -       reg |= (downsize_coeff << 14) | resize_coeff;
> +               rsc |= (downsize_coeff << 14) | resize_coeff;
> +       }
>
>         spin_lock_irqsave(&priv->lock, flags);
>
> -       ipu_ic_write(ic, reg, ic->reg->rsc);
> +       ipu_ic_write(ic, rsc, ic->reg->rsc);
>
>         /* Setup color space conversion */
>         ic->in_cs = in_cs;
> @@ -629,6 +697,675 @@ unlock:
>  }
>  EXPORT_SYMBOL_GPL(ipu_ic_task_idma_init);
>
> +static struct image_convert_ctx *ipu_image_convert_next(struct ipu_ic *ic)
> +{
> +       struct ipu_ic_priv *priv = ic->priv;
> +       struct ipuv3_channel *ch_in = ic->input_channel;
> +       struct ipuv3_channel *ch_out = ic->output_channel;
> +       struct image_convert_ctx *ctx;
> +       struct ipu_image *in_p, *in, *in_n;
> +       struct ipu_image *out;
> +       int ret;
> +       unsigned long flags;
> +       unsigned int inburst, outburst;
> +       unsigned int in_height;
> +
> +       spin_lock_irqsave(&priv->lock, flags);
> +
> +       if (list_empty(&ic->image_list)) {
> +               spin_unlock_irqrestore(&priv->lock, flags);
> +               return NULL;
> +       }
> +
> +       ctx = list_first_entry(&ic->image_list, struct image_convert_ctx, list);
> +
> +       list_del(&ctx->list);
> +
> +       spin_unlock_irqrestore(&priv->lock, flags);
> +
> +       in_p = &ctx->in_p;
> +       in = &ctx->in;
> +       in_n = &ctx->in_n;
> +       out = &ctx->out;
> +
> +       ipu_cpmem_zero(ch_in);
> +       ipu_cpmem_zero(ch_out);
> +
> +       inburst = in->rect.width & 0xf ? 8 : 16;
> +       outburst = out->rect.width & 0xf ? 8 : 16;
> +
> +       ipu_ic_enable(ic);
> +
> +       ipu_ic_task_idma_init(ic, ic->input_channel, in->rect.width,
> +                             in->rect.height, inburst, IPU_ROTATE_NONE);
> +       ipu_ic_task_idma_init(ic, ic->output_channel, out->rect.width,
> +                             out->rect.height, outburst, IPU_ROTATE_NONE);
> +
> +       ipu_cpmem_set_image(ch_in, &ctx->in);
> +       ipu_cpmem_set_image(ch_out, &ctx->out);
> +
> +       ipu_cpmem_set_burstsize(ch_in, inburst);
> +       ipu_cpmem_set_burstsize(ch_out, outburst);
> +
> +       in_height = in->rect.height;
> +
> +       dev_dbg(priv->ipu->dev, "%s: %dx%d(%dx%d@%d,%d) -> %dx%d(%dx%d@%d,%d)\n",
> +               __func__, in->pix.width, in->pix.height,
> +               in->rect.width, in->rect.height, in->rect.left, in->rect.top,
> +               out->pix.width, out->pix.height,
> +               out->rect.width, out->rect.height,
> +               out->rect.left, out->rect.top);
> +
> +       dev_dbg(priv->ipu->dev,
> +               "%s: hscale: >>%d, *8192/%d vscale: >>%d, *8192/%d\n",
> +               __func__, (ctx->rsc >> 14) & 0x3, (ctx->rsc & 0x3fff),
> +               ctx->rsc >> 30, (ctx->rsc >> 16) & 0x3fff);
> +
> +       ret = ipu_ic_task_init(ic, in->rect.width, in_height,
> +                       out->rect.width, out->rect.height,
> +                       ipu_pixelformat_to_colorspace(in->pix.pixelformat),
> +                       ipu_pixelformat_to_colorspace(out->pix.pixelformat),
> +                       ctx->rsc);
> +       if (ret) {
> +               ipu_ic_disable(ic);
> +               return ERR_PTR(ret);
> +       }
> +
> +       ipu_idmac_enable_channel(ic->input_channel);
> +       ipu_idmac_enable_channel(ic->output_channel);
> +
> +       ipu_ic_task_enable(ic);
> +
> +       ipu_idmac_select_buffer(ic->input_channel, 0);
> +       ipu_idmac_select_buffer(ic->output_channel, 0);
> +
> +       return ctx;
> +}
> +
> +static void ipu_image_convert_work(struct work_struct *work)
> +{
> +       struct ipu_ic *ic = container_of(work, struct ipu_ic, work);
> +       struct image_convert_ctx *ctx;
> +       int ret;
> +
> +       while (1) {
> +               int task_error = 0;
> +
> +               ctx = ipu_image_convert_next(ic);
> +               if (!ctx)
> +                       return;
> +
> +               if (IS_ERR(ctx)) {
> +                       task_error = PTR_ERR(ctx);
> +               } else {
> +                       ret = wait_for_completion_interruptible_timeout(
> +                                               &ic->complete, 100 * HZ);
> +                       if (!ret)
> +                               task_error = -ETIMEDOUT;
> +               }
> +
> +               ipu_ic_task_disable(ic);
> +               ipu_ic_disable(ic);
> +
> +               if (ctx->complete)
> +                       ctx->complete(ctx->complete_context, task_error);
> +               kfree(ctx->freep);
> +       }
> +}
> +
> +static irqreturn_t ipu_image_convert_handler(int irq, void *context)
> +{
> +       struct ipu_ic *ic = context;
> +
> +       complete(&ic->complete);
> +
> +       return IRQ_HANDLED;
> +}
> +
> +
> +/*
> + * IDMAC base addresses are 8-byte aligned
> + */
> +static int ipu_image_halign(u32 pixfmt)
> +{
> +       switch (pixfmt) {
> +       /* 2 RGB32 pixels correspond to 8 bytes */
> +       case V4L2_PIX_FMT_RGB32:
> +       case V4L2_PIX_FMT_BGR32:
> +               return 2;
> +       /* 4 RGB565 or YUYV pixels correspond to 8 bytes */
> +       case V4L2_PIX_FMT_RGB565:
> +       case V4L2_PIX_FMT_UYVY:
> +       case V4L2_PIX_FMT_YUYV:
> +               return 4;
> +       /*
> +        * 8 RGB24 pixels correspond to 24 bytes,
> +        * 8 NV12 pixels correspond to 8 bytes, both in luma and chroma
> +        */
> +       case V4L2_PIX_FMT_RGB24:
> +       case V4L2_PIX_FMT_BGR24:
> +       case V4L2_PIX_FMT_NV12:
> +               return 8;
> +       /* 16 YUV420 pixels correspond to 16 bytes in luma, 8 bytes in chroma */
> +       case V4L2_PIX_FMT_YUV420:
> +       case V4L2_PIX_FMT_YVU420:
> +       case V4L2_PIX_FMT_YUV422P:
> +               return 16;
> +       default:
> +               return -EINVAL;
> +       }
> +}
> +
> +/*
> + * Vertically chroma-subsampled formats are limited to even heights and vertical
> + * positions
> + */
> +static int ipu_image_valign(u32 pixfmt)
> +{
> +       switch (pixfmt) {
> +       case V4L2_PIX_FMT_RGB24:
> +       case V4L2_PIX_FMT_BGR24:
> +       case V4L2_PIX_FMT_RGB32:
> +       case V4L2_PIX_FMT_BGR32:
> +       case V4L2_PIX_FMT_RGB565:
> +       case V4L2_PIX_FMT_UYVY:
> +       case V4L2_PIX_FMT_YUYV:
> +       case V4L2_PIX_FMT_YUV422P:
> +               return 1;
> +       case V4L2_PIX_FMT_NV12:
> +       case V4L2_PIX_FMT_YUV420:
> +       case V4L2_PIX_FMT_YVU420:
> +               return 2;
> +       default:
> +               return -EINVAL;
> +       }
> +}
> +
> +#define round_closest(x, y) round_down((x) + (y)/2, (y))
> +
> +struct image_convert_ctx *ipu_image_convert_prepare(struct ipu_soc *ipu,
> +               struct ipu_image *in, struct ipu_image *out,
> +               enum ipu_image_scale_ctrl ctrl, int *num_tiles)
> +{
> +       struct image_convert_ctx *ctx, *c;
> +       int htiles, vtiles;
> +       int in_valign, in_halign, in_burst, out_valign, out_halign, out_burst;
> +       int left, top;
> +       int x, y;
> +       int h_resize_opt, v_resize_opt;
> +       u32 v_downsize_coeff = 0, h_downsize_coeff = 0;
> +       u32 v_resize_coeff, h_resize_coeff;
> +
> +       /* validate input */
> +       if (in->rect.width < 16 || out->rect.width < 16 ||
> +           (in->rect.width / 8) > out->rect.width)
> +               return ERR_PTR(-EINVAL);
> +
> +       /* tile setup */
> +       htiles = DIV_ROUND_UP(out->rect.width, 1024);
> +       vtiles = DIV_ROUND_UP(out->rect.height, 1024);
> +
> +       in_valign = ipu_image_valign(in->pix.pixelformat);
> +       in_halign = ipu_image_halign(in->pix.pixelformat);
> +       out_valign = ipu_image_valign(out->pix.pixelformat);
> +       out_halign = ipu_image_halign(out->pix.pixelformat);
> +
> +       /* IC bursts are limited to either 8 or 16 pixels */
> +       in_burst = 8;
> +       out_burst = 8;
> +
> +       if (in_valign < 0 || in_halign < 0 ||
> +           out_valign < 0 || out_halign < 0) {
> +               dev_err(ipu->dev, "unsupported in/out format\n");
> +               return ERR_PTR(-EINVAL);
> +       }
> +
> +       /* compute static decimator coefficients */
> +       while ((in->rect.width >> h_downsize_coeff) > out->rect.width)
> +               h_downsize_coeff++;
> +       while ((in->rect.height >> v_downsize_coeff) > out->rect.height)
> +               v_downsize_coeff++;
> +
> +       /* move and crop the output image according to IDMAC limitations */
> +       switch (ctrl) {
> +       case IPU_IMAGE_SCALE_ROUND_DOWN:
> +               left = round_up(in->rect.left, in_halign);
> +               top = round_up(in->rect.top, in_valign);
> +               in->rect.width = in->rect.width - (left - in->rect.left);
> +               in->rect.height = in->rect.height - (top - in->rect.top);
> +               in->rect.left = left;
> +               in->rect.top = top;
> +               left = round_up(out->rect.left, out_halign);
> +               top = round_up(out->rect.top, out_valign);
> +               out->rect.width = round_down(out->rect.width - (left -
> +                                            out->rect.left), out_burst);
> +               out->rect.height = round_down(out->rect.height - (top -
> +                                             out->rect.top), out_valign);
> +               break;
> +       case IPU_IMAGE_SCALE_ROUND_UP:
> +               left = round_down(in->rect.left, in_halign);
> +               top = round_down(in->rect.top, in_valign);
> +               in->rect.width = in->rect.width + in->rect.left - left;
> +               in->rect.height = in->rect.height + in->rect.top - top;
> +               in->rect.left = left;
> +               in->rect.top = top;
> +               left = round_down(out->rect.left, out_halign);
> +               top = round_down(out->rect.top, out_valign);
> +               out->rect.width = round_up(out->rect.width + out->rect.left -
> +                                          left, out_burst);
> +               out->rect.height = round_up(out->rect.height + out->rect.top -
> +                                           top, out_valign);
> +               break;
> +       case IPU_IMAGE_SCALE_PIXELPERFECT:
> +               left = round_down(in->rect.left, in_halign);
> +               top = round_down(in->rect.top, in_valign);
> +               in->rect.width = in->rect.width + in->rect.left - left;
> +               in->rect.height = in->rect.height + in->rect.top - top;
> +               in->rect.left = left;
> +               in->rect.top = top;
> +               left = round_down(out->rect.left + out_halign / 2, out_halign);
> +               top = round_down(out->rect.top + out_valign / 2, out_valign);
> +               /*
> +                * don't round width and height to burst size / pixel format
> +                * limitations yet, we do it after determining the scaling
> +                * coefficients
> +                */
> +               out->rect.width = out->rect.width + out->rect.left - left;
> +               out->rect.height = out->rect.height + out->rect.top - top;
> +               break;
> +       default:
> +               return ERR_PTR(-EINVAL);
> +       }
> +       out->rect.left = left;
> +       out->rect.top = top;
> +
> +       /* Round input width and height according to decimation */
> +       in->rect.width = round_down(in->rect.width, 1 << h_downsize_coeff);
> +       in->rect.height = round_down(in->rect.height, 1 << v_downsize_coeff);
> +
> +       dev_dbg(ipu->dev,
> +               "%s: in: %dx%d(%dx%d@%d,%d) -> out: %dx%d(%dx%d@%d,%d)\n",
> +               __func__, in->pix.width, in->pix.height, in->rect.width,
> +               in->rect.height, in->rect.left, in->rect.top, out->pix.width,
> +               out->pix.height, out->rect.width, out->rect.height,
> +               out->rect.left, out->rect.top);
> +
> +       /*
> +        * Compute the bilinear resizing coefficients that can/could be used if
> +        * scaling using a single tile. The bottom right pixel should sample the
> +        * input as close as possible to but not beyond the bottom right input
> +        * pixel out of the decimator:
> +        *
> +        * (out->rect.width - 1) * h_resize / 8192.0 <= (in->rect.width >>
> +        *                                               h_downsize_coeff) - 1
> +        * (out->rect.height - 1) * v_resize / 8192.0 <= (in->rect.height >>
> +        *                                                v_downsize_coeff) - 1
> +        */
> +       h_resize_opt = 8192 * ((in->rect.width >> h_downsize_coeff) - 1) /
> +                      (out->rect.width - 1);
> +       v_resize_opt = 8192 * ((in->rect.height >> v_downsize_coeff) - 1) /
> +                      (out->rect.height - 1);
> +
> +       dev_dbg(ipu->dev,
> +               "%s: hscale: >>%d, *8192/%d vscale: >>%d, *8192/%d, %dx%d tiles\n",
> +               __func__, h_downsize_coeff, h_resize_opt, v_downsize_coeff,
> +               v_resize_opt, htiles, vtiles);
> +
> +       ctx = kcalloc(htiles * vtiles, sizeof(*ctx), GFP_KERNEL);
> +       if (!ctx)
> +               return ERR_PTR(-ENOMEM);
> +
> +       c = ctx;
> +
> +       for (x = htiles - 1; x >= 0; x--) {
> +               int in_right, out_right;
> +
> +               /*
> +                * Since we render tiles right to left, the right edge
> +                * is already known. Depending on tile position and
> +                * scaling mode, we may overshoot it.
> +                */
> +               if (x == htiles - 1) {
> +                       out_right = out->rect.left + out->rect.width;
> +                       in_right = in->rect.left + in->rect.width;
> +               } else {
> +                       struct image_convert_ctx *c_right = c - vtiles;
> +
> +                       out_right = c_right->out.rect.left;
> +                       in_right = c_right->in.rect.left;
> +               }
> +
> +               /* Now determine the left edge of this tile column */
> +               if (x == 0) {
> +                       /* For the leftmost column this is trivial */
> +                       c->out.rect.left = out->rect.left;
> +                       c->in.rect.left = in->rect.left;
> +               } else {
> +                       int best_left, best_in_left;
> +                       int min_left, max_left;
> +                       int min_diff = INT_MAX;
> +
> +                       /*
> +                        * Find the best possible left edge. It must be adjusted
> +                        * according to IDMAC limitations, and should be
> +                        * chosen so that
> +                        * (in->rect.left + (c->out.rect.left - out->rect.left)
> +                        *  * h_resize_opt / (8192 >> h_downsize_coeff))
> +                        * is as close as possible to a valid left edge in the
> +                        * input.
> +                        */
> +                       min_left = max(0,
> +                                      round_up(out_right - 1024, out_halign));
> +                       max_left = min(round_down(out_right, out_halign),
> +                                      x * 1024);
> +                       best_left = min_left;
> +                       best_in_left = (best_left - out->rect.left) *
> +                                      h_resize_opt;
> +                       for (left = min_left; left < max_left;
> +                            left += out_halign) {
> +                               int diff, in_left;
> +
> +                               /*
> +                                * In ROUND_UP and ROUND_DOWN modes, for the
> +                                * rightmost column, only consider left edges
> +                                * that are a multiple of the burst size away
> +                                * from the right edge.
> +                                */
> +                               if ((ctrl != IPU_IMAGE_SCALE_PIXELPERFECT) &&
> +                                   (x == htiles - 1) &&
> +                                   ((out_right - left) % out_burst))
> +                                       continue;
> +                               in_left = in->rect.left +
> +                                         (((left - out->rect.left) *
> +                                           h_resize_opt) << h_downsize_coeff);
> +                               diff = abs(in_left -
> +                                          round_closest(in_left,
> +                                                        8192 * in_halign));
> +
> +                               if (diff < min_diff) {
> +                                       min_diff = diff;
> +                                       best_left = left;
> +                                       best_in_left = in_left;
> +                               }
> +                       }
> +
> +                       c->out.rect.left = best_left;
> +                       c->in.rect.left = DIV_ROUND_CLOSEST(best_in_left, 8192);
> +
> +                       dev_dbg(ipu->dev,
> +                               "%s: tile(%d,y):\tleft: %d -> %d (instead of %d.%04d -> %d)",
> +                               __func__, x, c->in.rect.left,
> +                               c->out.rect.left, best_in_left / 8192,
> +                               (best_in_left % 8192) * 10000 / 8192,
> +                               out->rect.left +
> +                               DIV_ROUND_CLOSEST((c->in.rect.left -
> +                                                  in->rect.left) *
> +                                                 (8192 >> h_downsize_coeff),
> +                                                 h_resize_opt));
> +               }
> +
> +               /* Determine tile width from left and right edges */
> +               c->out.rect.width = out_right - c->out.rect.left;
> +               c->in.rect.width = in_right - c->in.rect.left;
> +
> +               /* Now we can determine the actual per-tile scaling factor */
> +               if (x == htiles - 1) {
> +                       /*
> +                        * Round down for the right column, since we
> +                        * don't want to read beyond the right edge.
> +                        */
> +                       h_resize_coeff = 8192 * ((c->in.rect.width >>
> +                                                h_downsize_coeff) - 1) /
> +                                        (c->out.rect.width - 1);
> +               } else {
> +                       /*
> +                        * Round to closest for seams between tiles for
> +                        * minimal distortion.
> +                        */
> +                       h_resize_coeff = DIV_ROUND_CLOSEST(8192 *
> +                                                          (c->in.rect.width >>
> +                                                           h_downsize_coeff),
> +                                                          c->out.rect.width);
> +               }
> +
> +               /*
> +                * With the scaling factor known, round up output width
> +                * to burst size. In ROUND_UP and ROUND_DOWN scaling mode
> +                * this is a no-op for the right column.
> +                */
> +               c->out.rect.width = round_up(c->out.rect.width, out_burst);
> +
> +               /*
> +                * Calculate input width from the last accessed input pixel
> +                * given output width and scaling coefficients. Round to
> +                * burst size.
> +                */
> +               c->in.rect.width = (DIV_ROUND_UP((c->out.rect.width - 1) *
> +                                                h_resize_coeff, 8192) + 1)
> +                                  << h_downsize_coeff;
> +               c->in.rect.width = round_up(c->in.rect.width, in_burst);
> +
> +               for (y = vtiles - 1; y >= 0; y--) {
> +                       int in_bottom, out_bottom;
> +
> +                       memcpy(&c->in.pix, &in->pix,
> +                             sizeof(struct v4l2_pix_format));
> +
> +                       if (y == vtiles - 1) {
> +                               out_bottom = out->rect.top + out->rect.height;
> +                               in_bottom = in->rect.top + in->rect.height;
> +                       } else {
> +                               struct image_convert_ctx *c_below = c - 1;
> +
> +                               out_bottom = c_below->out.rect.top;
> +                               in_bottom = c_below->in.rect.top;
> +
> +                               /*
> +                                * Copy horizontal parameters from the tile
> +                                * below
> +                                */
> +                               c->out.rect.left = c_below->out.rect.left;
> +                               c->out.rect.width = c_below->out.rect.width;
> +                               c->in.rect.left = c_below->in.rect.left;
> +                               c->in.rect.width = c_below->in.rect.width;
> +                       }
> +
> +                       if (y == 0) {
> +                               c->out.rect.top = out->rect.top;
> +                               c->in.rect.top = in->rect.top;
> +                       } else {
> +                               int best_top, best_in_top;
> +                               int min_top, max_top;
> +                               int min_diff = INT_MAX;
> +
> +                               /*
> +                                * Find the best possible top edge. It must be
> +                                * adjusted according to IDMAC limitations, and
> +                                * should be chosen so that
> +                                * (in->rect.top + (c->out.rect.top -
> +                                *  out->rect.top) * v_resize_opt /
> +                                * (8192 >> v_downsize_coeff))
> +                                * is as close as possible to a valid top edge
> +                                * in the input.
> +                                */
> +                               min_top = max(0,
> +                                             round_up(out_bottom - 1024,
> +                                                      out_valign));
> +                               max_top = min(round_down(out_bottom,
> +                                                        out_halign), y * 1024);
> +                               best_top = min_top;
> +                               best_in_top = (best_top - out->rect.top) *
> +                                              v_resize_opt;
> +                               for (top = min_top; top < max_top;
> +                                    top += out_valign) {
> +                                       int diff, in_top;
> +
> +                                       in_top = in->rect.top +
> +                                                (((top - out->rect.top) *
> +                                                  v_resize_opt) <<
> +                                                 v_downsize_coeff);
> +                                       diff = abs(in_top -
> +                                                  round_closest(in_top, 8192 *
> +                                                                in_valign));
> +
> +                                       if (diff < min_diff) {
> +                                               min_diff = diff;
> +                                               best_top = top;
> +                                               best_in_top = in_top;
> +                                       }
> +                               }
> +
> +                               c->out.rect.top = best_top;
> +                               c->in.rect.top = DIV_ROUND_CLOSEST(best_in_top,
> +                                                                  8192);
> +
> +                               dev_dbg(ipu->dev,
> +                                       "%s: tile(%d,%d):\ttop: %d -> %d (instead of %d.%04d -> %d)",
> +                                       __func__, x, y, c->in.rect.top,
> +                                       c->out.rect.top, best_in_top / 8192,
> +                                       (best_in_top % 8192) * 10000 / 8192,
> +                                       out->rect.top +
> +                                       DIV_ROUND_CLOSEST((c->in.rect.top -
> +                                                          in->rect.top) * (8192
> +                                                         >> v_downsize_coeff),
> +                                                         v_resize_opt));
> +                       }
> +
> +                       /* Determine tile height from top and bottom edges */
> +                       c->out.rect.height = out_bottom - c->out.rect.top;
> +                       c->in.rect.height = in_bottom - c->in.rect.top;
> +
> +                       /*
> +                        * Now we can determine the actual vertical per-tile
> +                        * scaling factor
> +                        */
> +                       if (y == vtiles - 1) {
> +                               /*
> +                                * Round down for the bottom row, since we
> +                                * don't want to read beyond the lower border.
> +                                */
> +                               v_resize_coeff = 8192 * ((c->in.rect.height >>
> +                                                        v_downsize_coeff) - 1)
> +                                                / (c->out.rect.height - 1);
> +                       } else {
> +                               /*
> +                                * Round to closest for seams between tiles for
> +                                * minimal distortion.
> +                                */
> +                               v_resize_coeff = DIV_ROUND_CLOSEST(8192 *
> +                                                       (c->in.rect.height >>
> +                                                        v_downsize_coeff),
> +                                                       c->out.rect.height);
> +                       }
> +
> +                       /*
> +                        * With the scaling factor known, round up output height
> +                        * to IDMAC limitations
> +                        */
> +                       c->out.rect.height = round_up(c->out.rect.height,
> +                                                     out_valign);
> +
> +                       /*
> +                        * Calculate input height from the last accessed input
> +                        * line given output height and scaling coefficients.
> +                        */
> +                       c->in.rect.height = (DIV_ROUND_UP(
> +                                               (c->out.rect.height - 1) *
> +                                               v_resize_coeff, 8192) + 1)
> +                                           << v_downsize_coeff;
> +
> +                       /* align height according to IDMAC restrictions */
> +                       c->in.rect.height = round_up(c->in.rect.height,
> +                               in_valign);
> +
> +                       memcpy(&c->out.pix, &out->pix,
> +                              sizeof(struct v4l2_pix_format));
> +
> +                       dev_dbg(ipu->dev,
> +                               "%s: tile(%d,%d): %dx%d(%dx%d@%d,%d) -> %dx%d(%dx%d@%d,%d), resize: %dx%d\n",
> +                               __func__, x, y,
> +                               c->in.pix.width, c->in.pix.height,
> +                               c->in.rect.width, c->in.rect.height,
> +                               c->in.rect.left, c->in.rect.top,
> +                               c->out.pix.width, c->out.pix.height,
> +                               c->out.rect.width, c->out.rect.height,
> +                               c->out.rect.left, c->out.rect.top,
> +                               h_resize_coeff, v_resize_coeff);
> +
> +                       c->rsc = (v_downsize_coeff << 30) |
> +                                (v_resize_coeff << 16) |
> +                                (h_downsize_coeff << 14) |
> +                                h_resize_coeff;
> +
> +                       c++;
> +               }
> +       }
> +
> +       *num_tiles = htiles * vtiles;
> +
> +       return ctx;
> +}
> +EXPORT_SYMBOL_GPL(ipu_image_convert_prepare);
> +
> +int ipu_image_convert_run(struct ipu_soc *ipu, struct ipu_image *in,
> +                         struct ipu_image *out, struct image_convert_ctx *ctx,
> +                         int num_tiles, void (*complete)(void *ctx, int err),
> +                         void *complete_context, bool free_ctx)
> +{
> +       struct ipu_ic_priv *priv = ipu->ic_priv;
> +       struct ipu_ic *ic = &priv->task[IC_TASK_POST_PROCESSOR];
> +       unsigned long flags;
> +       int i;
> +
> +       for (i = 0; i < num_tiles; i++) {
> +               ctx[i].in.phys0 = in->phys0;
> +               ctx[i].out.phys0 = out->phys0;
> +       }
> +       ctx[num_tiles - 1].complete = complete;
> +       ctx[num_tiles - 1].complete_context = complete_context;
> +       if (free_ctx)
> +               ctx[num_tiles - 1].freep = ctx;
> +
> +       spin_lock_irqsave(&priv->lock, flags);
> +
> +       for (i = 0; i < num_tiles; i++)
> +               list_add_tail(&ctx[i].list, &ic->image_list);
> +
> +       queue_work(ic->workqueue, &ic->work);
> +
> +       spin_unlock_irqrestore(&priv->lock, flags);
> +
> +       return 0;
> +}
> +EXPORT_SYMBOL_GPL(ipu_image_convert_run);
> +
> +static int ipu_image_convert_init(struct device *dev, struct ipu_soc *ipu,
> +               struct ipu_ic_priv *priv)
> +{
> +       int ret;
> +       struct ipu_ic *ic = ipu_ic_get(ipu, IC_TASK_POST_PROCESSOR);
> +       int irq = ipu_idmac_channel_irq(ipu, ic->output_channel,
> +                                       IPU_IRQ_EOF);
> +
> +       ic->workqueue = create_singlethread_workqueue(dev_name(ipu->dev));
> +       if (!ic->workqueue)
> +               return -ENOMEM;
> +
> +       INIT_WORK(&ic->work, ipu_image_convert_work);
> +       init_completion(&ic->complete);
> +
> +       ret = devm_request_threaded_irq(dev, irq, NULL,
> +                               ipu_image_convert_handler,
> +                               IRQF_ONESHOT, "IC PP", ic);
> +       if (ret)
> +               goto err;
> +
> +       return 0;
> +err:
> +       destroy_workqueue(ic->workqueue);
> +       return ret;
> +}
> +
>  int ipu_ic_enable(struct ipu_ic *ic)
>  {
>         struct ipu_ic_priv *priv = ic->priv;
> @@ -736,12 +1473,30 @@ int ipu_ic_init(struct ipu_soc *ipu, struct device *dev,
>         priv->ipu = ipu;
>
>         for (i = 0; i < IC_NUM_TASKS; i++) {
> +               INIT_LIST_HEAD(&priv->task[i].image_list);
>                 priv->task[i].task = i;
>                 priv->task[i].priv = priv;
>                 priv->task[i].reg = &ic_task_reg[i];
>                 priv->task[i].bit = &ic_task_bit[i];
> +
> +               priv->task[i].input_channel = ipu_idmac_get(ipu,
> +                                                       ic_task_ch[i].in);
> +               priv->task[i].output_channel = ipu_idmac_get(ipu,
> +                                                       ic_task_ch[i].out);
> +               priv->task[i].rotation_input_channel = ipu_idmac_get(ipu,
> +                                                       ic_task_ch[i].rot_in);
> +               priv->task[i].rotation_output_channel = ipu_idmac_get(ipu,
> +                                                       ic_task_ch[i].rot_out);
> +               if (ic_task_ch[i].in_prev) {
> +                       priv->task[i].input_channel_p = ipu_idmac_get(ipu,
> +                                                       ic_task_ch[i].in_prev);
> +                       priv->task[i].input_channel_n = ipu_idmac_get(ipu,
> +                                                       ic_task_ch[i].in_next);
> +               }
>         }
>
> +       ipu_image_convert_init(dev, ipu, priv);
> +
>         return 0;
>  }
>
> diff --git a/include/video/imx-ipu-v3.h b/include/video/imx-ipu-v3.h
> index 459508e..6d98a38 100644
> --- a/include/video/imx-ipu-v3.h
> +++ b/include/video/imx-ipu-v3.h
> @@ -316,7 +316,8 @@ int ipu_ic_task_init(struct ipu_ic *ic,
>                      int in_width, int in_height,
>                      int out_width, int out_height,
>                      enum ipu_color_space in_cs,
> -                    enum ipu_color_space out_cs);
> +                    enum ipu_color_space out_cs,
> +                    u32 rsc);
>  int ipu_ic_task_graphics_init(struct ipu_ic *ic,
>                               enum ipu_color_space in_g_cs,
>                               bool galpha_en, u32 galpha,
> @@ -362,4 +363,35 @@ struct ipu_client_platformdata {
>         int dma[2];
>  };
>
> +enum ipu_image_scale_ctrl {
> +       IPU_IMAGE_SCALE_ROUND_DOWN,
> +       IPU_IMAGE_SCALE_PIXELPERFECT,
> +       IPU_IMAGE_SCALE_ROUND_UP,
> +};
> +
> +struct image_convert_ctx;
> +
> +struct image_convert_ctx *ipu_image_convert_prepare(struct ipu_soc *ipu,
> +               struct ipu_image *in, struct ipu_image *out,
> +               enum ipu_image_scale_ctrl ctrl, int *num_tiles);
> +int ipu_image_convert_run(struct ipu_soc *ipu, struct ipu_image *in,
> +               struct ipu_image *out, struct image_convert_ctx *ctx,
> +               int num_tiles, void (*complete)(void *ctx, int err),
> +               void *complete_context, bool free_ctx);
> +
> +static inline int ipu_image_convert(struct ipu_soc *ipu, struct ipu_image *in,
> +               struct ipu_image *out, void (*complete)(void *ctx, int err),
> +               void *complete_context, enum ipu_image_scale_ctrl ctrl)
> +{
> +       struct image_convert_ctx *ctx;
> +       int num_tiles;
> +
> +       ctx = ipu_image_convert_prepare(ipu, in, out, ctrl, &num_tiles);
> +       if (IS_ERR(ctx))
> +               return PTR_ERR(ctx);
> +
> +       return ipu_image_convert_run(ipu, in, out, ctx, num_tiles, complete,
> +                                    complete_context, true);
> +}
> +
>  #endif /* __DRM_IPU_H__ */
> --
> 2.1.4
>
Enrico Weigelt, metux IT consult May 28, 2015, 9 a.m. UTC | #2
Am 27.05.2015 um 20:42 schrieb Jean-Michel Hautbois:

<snip>

@Philipp,

I've missed the previous mails (just subscribed here yesterday) ...

Are these patches the same as the ones in your git branch tmp/imx-ipu-scaler?
I've got those running on 4.0.4 and am currently trying them on 4.1-rc*.

Yet another question:

when using it w/ gst for video playback, can we directly pass buffers
between VPU, IPU and FB (or let them write directly into shared
buffers), so that the CPU doesn't need to act on each frame for each step
in the decoding pipeline?
Playing an 800x400 mp4 still produces about 70..75% CPU load.


cu
--
Enrico Weigelt, metux IT consult
+49-151-27565287
MELAG Medizintechnik oHG Sitz Berlin Registergericht AG Charlottenburg HRA 21333 B

Wichtiger Hinweis: Diese Nachricht kann vertrauliche oder nur für einen begrenzten Personenkreis bestimmte Informationen enthalten. Sie ist ausschließlich für denjenigen bestimmt, an den sie gerichtet worden ist. Wenn Sie nicht der Adressat dieser E-Mail sind, dürfen Sie diese nicht kopieren, weiterleiten, weitergeben oder sie ganz oder teilweise in irgendeiner Weise nutzen. Sollten Sie diese E-Mail irrtümlich erhalten haben, so benachrichtigen Sie bitte den Absender, indem Sie auf diese Nachricht antworten. Bitte löschen Sie in diesem Fall diese Nachricht und alle Anhänge, ohne eine Kopie zu behalten.
Important Notice: This message may contain confidential or privileged information. It is intended only for the person it was addressed to. If you are not the intended recipient of this email you may not copy, forward, disclose or otherwise use it or any part of it in any form whatsoever. If you received this email in error please notify the sender by replying and delete this message and any attachments without retaining a copy.
Philipp Zabel May 28, 2015, 10:35 a.m. UTC | #3
Hi Jean-Michel,

Am Mittwoch, den 27.05.2015, 20:42 +0200 schrieb Jean-Michel Hautbois:
[...]
> > The tiling code has a parameter to optionally round frame sizes up or down
> > and avoid overdraw in compositing scenarios.
> 
> Can you detail what you call "compositing scenarios" ?

I meant using the v4l2 selection API to draw the scaled result into a
compose rectangle in a larger v4l2 capture buffer. To avoid overdrawing
pixels outside of the rectangle, its right edge must be aligned with the
burst size of the rightmost tiles.

[...]
> > @@ -152,6 +203,19 @@ struct ipu_ic {
> >         bool in_use;
> >
> >         struct ipu_ic_priv *priv;
> > +
> > +       struct ipuv3_channel *input_channel_p;
> > +       struct ipuv3_channel *input_channel;
> > +       struct ipuv3_channel *input_channel_n;
> > +       struct ipuv3_channel *output_channel;
> > +       struct ipuv3_channel *rotation_input_channel;
> > +       struct ipuv3_channel *rotation_output_channel;
> > +
> > +       struct list_head image_list;
> > +
> > +       struct workqueue_struct *workqueue;
> > +       struct work_struct work;
> > +       struct completion complete;
> >  };
> 
> As this is a workqueue, it can sleep, and you don't know when it is
> called exactly.
> Can we be sure that it is "real-time" compatible ? If you have this
> scaler after a capture source, and before the coda driver, you can be
> starved of buffers ?
> And you can even have multiple instances of the scaler, so you
> probably can get into troubles if there is not enough buffers on the
> capture and output queues, right ?

When there are no buffers available, the m2m scaler won't run. What do
you mean by "real-time" compatible? We can't make any guarantees that
scaling will be finished in a certain timeframe in general, as the
scaler competes with other hardware units for memory bandwidth, which
often is the limiting factor.
If multiple scaling instances are run on the same IPU, they will take
turns using the IC.

> I have played with it a bit and have been successful having two
> instances on IPU1 and two other on IPU2.
> But I don't know if there can be side effects...

regards
Philipp
Philipp Zabel May 28, 2015, 10:44 a.m. UTC | #4
Hi Enrico,

Am Donnerstag, den 28.05.2015, 11:00 +0200 schrieb Enrico Weigelt, metux
IT consult:
> Am 27.05.2015 um 20:42 schrieb Jean-Michel Hautbois:
> 
> <snip>
> 
> @Phillip,
> 
> I've missed the previous mails (just subscribed here yesterday) ...
>
> Are these patches same as in your git branch tmp/imx-ipu-scaler ?

No, that is an older version.

> I've got them running on 4.0.4 and currently trying on 4.1-rc*
> 
> Yet another question:
> 
> when using it w/ gst for video playback, can be directly pass buffers
> between VPU, IPU and FB (or let them directly write into shared
> buffers), so CPU doesn't need to act on each frame for each step
> in the decoding pipeline ?

Check out the (capture/output-)io-mode parameters, that's what the
dmabuf/dmabuf-import option pairs are for.

regards
Philipp
Enrico Weigelt, metux IT consult May 28, 2015, 11:31 a.m. UTC | #5
Am 28.05.2015 um 12:44 schrieb Philipp Zabel:

Hi,

 >> Are these patches same as in your git branch tmp/imx-ipu-scaler ?
>
> No, that is an older version.

Where can I get the recent ones ?
Could you push it to your public repo ?

>> when using it w/ gst for video playback, can be directly pass buffers
>> between VPU, IPU and FB (or let them directly write into shared
>> buffers), so CPU doesn't need to act on each frame for each step
>> in the decoding pipeline ?
>
> Check out the (capture/output-)io-mode parameters, that's what the
> dmabuf/dmabuf-import option pairs are for.

Tried dmabuf, but the load stays the same (77..80% CPU, 1.2 loadavg).
dmabuf-import doesn't run at all:

root@KoMo:/usr/share/videos/komo gst-launch-1.0 filesrc
location=montage.mp4 \! qtdemux \! h264parse \! v4l2video4dec
output-io-mode=5 \! v4l2video0convert capture-io-mode=5 output-io-mode=4
\! fbdevsink

Setting pipeline to PAUSED ...
Pipeline is PREROLLING ...
ERROR: from element
/GstPipeline:pipeline0/v4l2video0convert:v4l2video0convert0: No
downstream pool to import from.
Additional debug info:
gstv4l2object.c(3441): gst_v4l2_object_decide_allocation ():
/GstPipeline:pipeline0/v4l2video0convert:v4l2video0convert0:
When importing DMABUF or USERPTR, we need a pool to import from
ERROR: pipeline doesn't want to preroll.
Setting pipeline to NULL ...
Freeing pipeline ...


Perhaps not implemented yet in the old version of the patches ?

By the way: do you have any idea whether the proprietary driver
(or the GPUs themselves) might talk to the IPU and VPU?


cu
--
Enrico Weigelt, metux IT consult
+49-151-27565287
MELAG Medizintechnik oHG Sitz Berlin Registergericht AG Charlottenburg HRA 21333 B

Wichtiger Hinweis: Diese Nachricht kann vertrauliche oder nur für einen begrenzten Personenkreis bestimmte Informationen enthalten. Sie ist ausschließlich für denjenigen bestimmt, an den sie gerichtet worden ist. Wenn Sie nicht der Adressat dieser E-Mail sind, dürfen Sie diese nicht kopieren, weiterleiten, weitergeben oder sie ganz oder teilweise in irgendeiner Weise nutzen. Sollten Sie diese E-Mail irrtümlich erhalten haben, so benachrichtigen Sie bitte den Absender, indem Sie auf diese Nachricht antworten. Bitte löschen Sie in diesem Fall diese Nachricht und alle Anhänge, ohne eine Kopie zu behalten.
Important Notice: This message may contain confidential or privileged information. It is intended only for the person it was addressed to. If you are not the intended recipient of this email you may not copy, forward, disclose or otherwise use it or any part of it in any form whatsoever. If you received this email in error please notify the sender by replying and delete this message and any attachments without retaining a copy.
Philipp Zabel May 28, 2015, 11:59 a.m. UTC | #6
Am Donnerstag, den 28.05.2015, 13:31 +0200 schrieb Enrico Weigelt, metux
IT consult:
> Am 28.05.2015 um 12:44 schrieb Philipp Zabel:
> 
> Hi,
> 
>  >> Are these patches same as in your git branch tmp/imx-ipu-scaler ?
> >
> > No, that is an older version.
> 
> Where can I get the recent ones ?
> Could you push it to your public repo ?

I've updated the tmp/imx-ipu-scaler branch.

> >> when using it w/ gst for video playback, can be directly pass buffers
> >> between VPU, IPU and FB (or let them directly write into shared
> >> buffers), so CPU doesn't need to act on each frame for each step
> >> in the decoding pipeline ?
> >
> > Check out the (capture/output-)io-mode parameters, that's what the
> > dmabuf/dmabuf-import option pairs are for.
> 
> Tried dmabuf, but load stays at the same (77..80% CPU, 1.2 loadavg).
> dmabuf-import doesnt run at all:
> 
> root@KoMo:/usr/share/videos/komo gst-launch-1.0 filesrc
> location=montage.mp4 \! qtdemux \! h264parse \! v4l2video4dec
> output-io-mode=5 \! v4l2video0convert capture-io-mode=5 output-io-mode=4
> \! fbdevsink

That should be capture-io-mode=dmabuf for the decoder and
output-io-mode=dmabuf-import for the converter element. h264parse
doesn't provide dmabufs and fbdevsink can't handle them, so the decoder's
output-io-mode and the converter's capture-io-mode should be kept as
mmap.

[...]
> By the way: do you have any idea whether the proprietary driver
> (or the gpus itself) might talk to ipu and vpu ?

Not that I am aware of.

regards
Philipp
Enrico Weigelt, metux IT consult May 28, 2015, 5:38 p.m. UTC | #7
Am 28.05.2015 um 13:59 schrieb Philipp Zabel:

>> Where can I get the recent ones ?
>> Could you push it to your public repo ?
>
> I've updated the tmp/imx-ipu-scaler branch.

Thx. already integrated it into my tree - works fine :)

By the way: I still have some of your older patches (2012) in my tree,
e.g. some mediabus, camera, display timing stuff, etc. ... not sure
whether I really need them for my device.

Should I post them to linux-media list for review ?

Oh, and I also still have your famous DRM_IOCTL_MODE_MAP_DUMB hack
(the "Reluctantly-signed-off-by:" one ;-), meanwhile rebased / adapted
into 4.x. Do you have any idea, what the amd-gpu driver/library exactly
does with the retrieved address ? Send it directly to the gpu ?

> That should be capture-io-mode=dmabuf for the decoder and
> output-io-mode=dmabuf-import for the converter element. h264parse
> doesn't provide and fbdevsink can't handle dmabufs, so the decoder's
> output-io-mode and the converter's capture-io-mode should be kept as
> mmio.

I played around a little bit - this command line only takes 55% cpu:

gst-launch-1.0 filesrc location=montage.mp4 \! qtdemux \! h264parse \!
v4l2video4dec output-io-mode=4 capture-io-mode=4 \!
v4l2video0convert capture-io-mode=4 output-io-mode=5 \! fbdevsink

By the way: what's the exact difference between dmabuf and
dmabuf-import ?

 > > By the way: do you have any idea whether the proprietary driver
 > > (or the gpus itself) might talk to ipu and vpu ?
 >
 > Not that I am aware of.

Well, as you can perhaps imagine - I don't trust these guys ...



--mtx

ps: greetings from Bene ... you won't guess where I met him
last weekend ;-)
--
Enrico Weigelt, metux IT consult
+49-151-27565287
MELAG Medizintechnik oHG Sitz Berlin Registergericht AG Charlottenburg HRA 21333 B

Wichtiger Hinweis: Diese Nachricht kann vertrauliche oder nur für einen begrenzten Personenkreis bestimmte Informationen enthalten. Sie ist ausschließlich für denjenigen bestimmt, an den sie gerichtet worden ist. Wenn Sie nicht der Adressat dieser E-Mail sind, dürfen Sie diese nicht kopieren, weiterleiten, weitergeben oder sie ganz oder teilweise in irgendeiner Weise nutzen. Sollten Sie diese E-Mail irrtümlich erhalten haben, so benachrichtigen Sie bitte den Absender, indem Sie auf diese Nachricht antworten. Bitte löschen Sie in diesem Fall diese Nachricht und alle Anhänge, ohne eine Kopie zu behalten.
Important Notice: This message may contain confidential or privileged information. It is intended only for the person it was addressed to. If you are not the intended recipient of this email you may not copy, forward, disclose or otherwise use it or any part of it in any form whatsoever. If you received this email in error please notify the sender by replying and delete this message and any attachments without retaining a copy.
Robert Schwebel May 28, 2015, 5:54 p.m. UTC | #8
On Thu, May 28, 2015 at 07:38:20PM +0200, Enrico Weigelt, metux IT consult wrote:
> Thx. already integrated it into my tree - works fine :)
> 
> By the way: i still have some your older patches (2012) in my tree,
> eg. some mediabus, camara, display timing stuff, etc ... not sure
> whether I really need them for my device.
> 
> Should I post them to linux-media list for review?

No. That's all old stuff and has developed quite a lot since then. We'll
post new series here on the lists when they are ready for mainline.

rsc
Enrico Weigelt, metux IT consult May 29, 2015, 9:02 a.m. UTC | #9
Am 28.05.2015 um 19:54 schrieb Robert Schwebel:

>> By the way: i still have some your older patches (2012) in my tree,
>> eg. some mediabus, camara, display timing stuff, etc ... not sure
>> whether I really need them for my device.
>>
>> Should I post them to linux-media list for review?
>
> No. That's all old stuff and has developed quite a lot since then. We'll
> post new series here on the lists when they are ready for mainline.

Great :)

Do you have them on some public repo, so I can give 'em a try ?


--mtx

--
Enrico Weigelt, metux IT consult
+49-151-27565287
MELAG Medizintechnik oHG Sitz Berlin Registergericht AG Charlottenburg HRA 21333 B

Wichtiger Hinweis: Diese Nachricht kann vertrauliche oder nur für einen begrenzten Personenkreis bestimmte Informationen enthalten. Sie ist ausschließlich für denjenigen bestimmt, an den sie gerichtet worden ist. Wenn Sie nicht der Adressat dieser E-Mail sind, dürfen Sie diese nicht kopieren, weiterleiten, weitergeben oder sie ganz oder teilweise in irgendeiner Weise nutzen. Sollten Sie diese E-Mail irrtümlich erhalten haben, so benachrichtigen Sie bitte den Absender, indem Sie auf diese Nachricht antworten. Bitte löschen Sie in diesem Fall diese Nachricht und alle Anhänge, ohne eine Kopie zu behalten.
Important Notice: This message may contain confidential or privileged information. It is intended only for the person it was addressed to. If you are not the intended recipient of this email you may not copy, forward, disclose or otherwise use it or any part of it in any form whatsoever. If you received this email in error please notify the sender by replying and delete this message and any attachments without retaining a copy.
diff mbox

Patch

diff --git a/drivers/gpu/ipu-v3/ipu-ic.c b/drivers/gpu/ipu-v3/ipu-ic.c
index ad75588..984f68f 100644
--- a/drivers/gpu/ipu-v3/ipu-ic.c
+++ b/drivers/gpu/ipu-v3/ipu-ic.c
@@ -15,6 +15,7 @@ 
 #include <linux/errno.h>
 #include <linux/spinlock.h>
 #include <linux/bitrev.h>
+#include <linux/interrupt.h>
 #include <linux/io.h>
 #include <linux/err.h>
 #include "ipu-prv.h"
@@ -96,6 +97,15 @@  struct ic_task_bitfields {
 	u32 ic_cmb_galpha_bit;
 };
 
+struct ic_task_channels {
+	u8 in;
+	u8 out;
+	u8 rot_in;
+	u8 rot_out;
+	u8 in_prev;
+	u8 in_next;
+};
+
 static const struct ic_task_regoffs ic_task_reg[IC_NUM_TASKS] = {
 	[IC_TASK_ENCODER] = {
 		.rsc = IC_PRP_ENC_RSC,
@@ -138,12 +148,53 @@  static const struct ic_task_bitfields ic_task_bit[IC_NUM_TASKS] = {
 	},
 };
 
+static const struct ic_task_channels ic_task_ch[IC_NUM_TASKS] = {
+	[IC_TASK_ENCODER] = {
+		.in = IPUV3_CHANNEL_MEM_IC_PRP_VF,
+		.out = IPUV3_CHANNEL_IC_PRP_ENC_MEM,
+		.rot_in = IPUV3_CHANNEL_MEM_ROT_ENC,
+		.rot_out = IPUV3_CHANNEL_ROT_ENC_MEM,
+	},
+	[IC_TASK_VIEWFINDER] = {
+		.in = IPUV3_CHANNEL_MEM_VDI_CUR,
+		.out = IPUV3_CHANNEL_IC_PRP_VF_MEM,
+		.rot_in = IPUV3_CHANNEL_MEM_ROT_VF,
+		.rot_out = IPUV3_CHANNEL_ROT_VF_MEM,
+		.in_prev = IPUV3_CHANNEL_MEM_VDI_PREV,
+		.in_next = IPUV3_CHANNEL_MEM_VDI_NEXT,
+	},
+	[IC_TASK_POST_PROCESSOR] = {
+		.in = IPUV3_CHANNEL_MEM_IC_PP,
+		.out = IPUV3_CHANNEL_IC_PP_MEM,
+		.rot_in = IPUV3_CHANNEL_MEM_ROT_PP,
+		.rot_out = IPUV3_CHANNEL_ROT_PP_MEM,
+	},
+};
+
+struct image_convert_ctx {
+	void (*complete)(void *ctx, int err);
+	void *complete_context;
+
+	struct list_head list;
+	struct ipu_image in;
+	struct ipu_image in_n;
+	struct ipu_image in_p;
+	struct ipu_image out;
+
+	void *freep;
+
+	bool rotate:1;
+
+	u32 rsc;
+};
+
 struct ipu_ic_priv;
 
 struct ipu_ic {
 	enum ipu_ic_task task;
 	const struct ic_task_regoffs *reg;
 	const struct ic_task_bitfields *bit;
+	const struct ic_task_channels *ch;
 
 	enum ipu_color_space in_cs, g_in_cs;
 	enum ipu_color_space out_cs;
@@ -152,6 +203,19 @@  struct ipu_ic {
 	bool in_use;
 
 	struct ipu_ic_priv *priv;
+
+	struct ipuv3_channel *input_channel_p;
+	struct ipuv3_channel *input_channel;
+	struct ipuv3_channel *input_channel_n;
+	struct ipuv3_channel *output_channel;
+	struct ipuv3_channel *rotation_input_channel;
+	struct ipuv3_channel *rotation_output_channel;
+
+	struct list_head image_list;
+
+	struct workqueue_struct *workqueue;
+	struct work_struct work;
+	struct completion complete;
 };
 
 struct ipu_ic_priv {
@@ -168,7 +232,8 @@  static inline u32 ipu_ic_read(struct ipu_ic *ic, unsigned offset)
 	return readl(ic->priv->base + offset);
 }
 
-static inline void ipu_ic_write(struct ipu_ic *ic, u32 value, unsigned offset)
+static inline void ipu_ic_write(struct ipu_ic *ic, u32 value,
+				unsigned offset)
 {
 	writel(value, ic->priv->base + offset);
 }
@@ -446,32 +511,35 @@  int ipu_ic_task_init(struct ipu_ic *ic,
 		     int in_width, int in_height,
 		     int out_width, int out_height,
 		     enum ipu_color_space in_cs,
-		     enum ipu_color_space out_cs)
+		     enum ipu_color_space out_cs,
+		     u32 rsc)
 {
 	struct ipu_ic_priv *priv = ic->priv;
-	u32 reg, downsize_coeff, resize_coeff;
+	u32 downsize_coeff, resize_coeff;
 	unsigned long flags;
 	int ret = 0;
 
-	/* Setup vertical resizing */
-	ret = calc_resize_coeffs(ic, in_height, out_height,
-				 &resize_coeff, &downsize_coeff);
-	if (ret)
-		return ret;
+	if (!rsc) {
+		/* Setup vertical resizing */
+		ret = calc_resize_coeffs(ic, in_height, out_height,
+					 &resize_coeff, &downsize_coeff);
+		if (ret)
+			return ret;
 
-	reg = (downsize_coeff << 30) | (resize_coeff << 16);
+		rsc = (downsize_coeff << 30) | (resize_coeff << 16);
 
-	/* Setup horizontal resizing */
-	ret = calc_resize_coeffs(ic, in_width, out_width,
-				 &resize_coeff, &downsize_coeff);
-	if (ret)
-		return ret;
+		/* Setup horizontal resizing */
+		ret = calc_resize_coeffs(ic, in_width, out_width,
+					 &resize_coeff, &downsize_coeff);
+		if (ret)
+			return ret;
 
-	reg |= (downsize_coeff << 14) | resize_coeff;
+		rsc |= (downsize_coeff << 14) | resize_coeff;
+	}
 
 	spin_lock_irqsave(&priv->lock, flags);
 
-	ipu_ic_write(ic, reg, ic->reg->rsc);
+	ipu_ic_write(ic, rsc, ic->reg->rsc);
 
 	/* Setup color space conversion */
 	ic->in_cs = in_cs;
@@ -629,6 +697,675 @@  unlock:
 }
 EXPORT_SYMBOL_GPL(ipu_ic_task_idma_init);
 
+/*
+ * Dequeue the next tile from ic->image_list and program the IC task and its
+ * IDMAC channels for it.
+ *
+ * Returns NULL if the queue is empty. Otherwise the dequeued tile context is
+ * returned and *error is set to 0 if the conversion was started, or to the
+ * negative error code of ipu_ic_task_init() if it could not be started.
+ * Whenever a context is returned, ipu_ic_enable() has been called and the
+ * caller is responsible for the matching ipu_ic_disable().
+ */
+static struct image_convert_ctx *ipu_image_convert_next(struct ipu_ic *ic,
+							int *error)
+{
+	struct ipu_ic_priv *priv = ic->priv;
+	struct ipuv3_channel *ch_in = ic->input_channel;
+	struct ipuv3_channel *ch_out = ic->output_channel;
+	struct image_convert_ctx *ctx;
+	struct ipu_image *in, *out;
+	unsigned long flags;
+	unsigned int inburst, outburst;
+
+	spin_lock_irqsave(&priv->lock, flags);
+
+	if (list_empty(&ic->image_list)) {
+		spin_unlock_irqrestore(&priv->lock, flags);
+		return NULL;
+	}
+
+	ctx = list_first_entry(&ic->image_list, struct image_convert_ctx, list);
+
+	list_del(&ctx->list);
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+
+	in = &ctx->in;
+	out = &ctx->out;
+
+	ipu_cpmem_zero(ch_in);
+	ipu_cpmem_zero(ch_out);
+
+	/* 16 pixel bursts are only used for widths that are a multiple of 16 */
+	inburst = in->rect.width & 0xf ? 8 : 16;
+	outburst = out->rect.width & 0xf ? 8 : 16;
+
+	ipu_ic_enable(ic);
+
+	ipu_ic_task_idma_init(ic, ch_in, in->rect.width,
+			      in->rect.height, inburst, IPU_ROTATE_NONE);
+	ipu_ic_task_idma_init(ic, ch_out, out->rect.width,
+			      out->rect.height, outburst, IPU_ROTATE_NONE);
+
+	ipu_cpmem_set_image(ch_in, in);
+	ipu_cpmem_set_image(ch_out, out);
+
+	ipu_cpmem_set_burstsize(ch_in, inburst);
+	ipu_cpmem_set_burstsize(ch_out, outburst);
+
+	dev_dbg(priv->ipu->dev, "%s: %dx%d(%dx%d@%d,%d) -> %dx%d(%dx%d@%d,%d)\n",
+		__func__, in->pix.width, in->pix.height,
+		in->rect.width, in->rect.height, in->rect.left, in->rect.top,
+		out->pix.width, out->pix.height,
+		out->rect.width, out->rect.height,
+		out->rect.left, out->rect.top);
+
+	dev_dbg(priv->ipu->dev,
+		"%s: hscale: >>%d, *8192/%d vscale: >>%d, *8192/%d\n",
+		__func__, (ctx->rsc >> 14) & 0x3, (ctx->rsc & 0x3fff),
+		ctx->rsc >> 30, (ctx->rsc >> 16) & 0x3fff);
+
+	*error = ipu_ic_task_init(ic, in->rect.width, in->rect.height,
+			out->rect.width, out->rect.height,
+			ipu_pixelformat_to_colorspace(in->pix.pixelformat),
+			ipu_pixelformat_to_colorspace(out->pix.pixelformat),
+			ctx->rsc);
+	if (*error) {
+		/*
+		 * Hand the context back anyway so that the caller can still
+		 * report the failure and release the tile array.
+		 */
+		return ctx;
+	}
+
+	ipu_idmac_enable_channel(ch_in);
+	ipu_idmac_enable_channel(ch_out);
+
+	ipu_ic_task_enable(ic);
+
+	ipu_idmac_select_buffer(ch_in, 0);
+	ipu_idmac_select_buffer(ch_out, 0);
+
+	return ctx;
+}
+
+/*
+ * Worker: runs all queued tiles one after another. For each tile it waits for
+ * the completion signalled by the interrupt handler (or a 100 s timeout),
+ * shuts the task down again, and then invokes the tile's completion callback
+ * and frees the tile array if the tile carries them.
+ *
+ * NOTE(review): only a tile that has ->complete set reports its status, so an
+ * error on an earlier tile of the same job is not propagated.
+ */
+static void ipu_image_convert_work(struct work_struct *work)
+{
+	struct ipu_ic *ic = container_of(work, struct ipu_ic, work);
+	struct image_convert_ctx *ctx;
+	int task_error;
+	long ret;
+
+	while ((ctx = ipu_image_convert_next(ic, &task_error))) {
+		if (!task_error) {
+			ret = wait_for_completion_interruptible_timeout(
+						&ic->complete, 100 * HZ);
+			if (!ret)
+				task_error = -ETIMEDOUT;
+			else if (ret < 0)
+				task_error = ret; /* wait was interrupted */
+		}
+
+		/* balances the single ipu_ic_enable() done for this tile */
+		ipu_ic_task_disable(ic);
+		ipu_ic_disable(ic);
+
+		if (ctx->complete)
+			ctx->complete(ctx->complete_context, task_error);
+		/* may free the array that contains ctx; don't touch it after */
+		kfree(ctx->freep);
+	}
+}
+
+/*
+ * Interrupt handler for the conversion task: wakes up whoever is waiting on
+ * the task's completion. @context is the struct ipu_ic passed at request time.
+ */
+static irqreturn_t ipu_image_convert_handler(int irq, void *context)
+{
+	struct ipu_ic *pp_task = context;
+
+	complete(&pp_task->complete);
+
+	return IRQ_HANDLED;
+}
+
+
+/*
+ * IDMAC base addresses are 8-byte aligned.
+ *
+ * Returns the horizontal alignment, in pixels, that this imposes on the left
+ * edge of an image in the given pixel format, or -EINVAL if the format is not
+ * handled.
+ */
+static int ipu_image_halign(u32 pixfmt)
+{
+	int align;
+
+	switch (pixfmt) {
+	case V4L2_PIX_FMT_RGB32:
+	case V4L2_PIX_FMT_BGR32:
+		/* 2 RGB32 pixels correspond to 8 bytes */
+		align = 2;
+		break;
+	case V4L2_PIX_FMT_RGB565:
+	case V4L2_PIX_FMT_UYVY:
+	case V4L2_PIX_FMT_YUYV:
+		/* 4 RGB565 or YUYV pixels correspond to 8 bytes */
+		align = 4;
+		break;
+	case V4L2_PIX_FMT_RGB24:
+	case V4L2_PIX_FMT_BGR24:
+	case V4L2_PIX_FMT_NV12:
+		/*
+		 * 8 RGB24 pixels correspond to 24 bytes,
+		 * 8 NV12 pixels correspond to 8 bytes, both in luma and chroma
+		 */
+		align = 8;
+		break;
+	case V4L2_PIX_FMT_YUV420:
+	case V4L2_PIX_FMT_YVU420:
+	case V4L2_PIX_FMT_YUV422P:
+		/* 16 pixels correspond to 16 bytes in luma, 8 bytes in chroma */
+		align = 16;
+		break;
+	default:
+		align = -EINVAL;
+		break;
+	}
+
+	return align;
+}
+
+/*
+ * Vertically chroma-subsampled formats are limited to even heights and vertical
+ * positions.
+ *
+ * Returns the vertical alignment in lines for the given pixel format, or
+ * -EINVAL if the format is not handled.
+ */
+static int ipu_image_valign(u32 pixfmt)
+{
+	int align;
+
+	switch (pixfmt) {
+	case V4L2_PIX_FMT_NV12:
+	case V4L2_PIX_FMT_YUV420:
+	case V4L2_PIX_FMT_YVU420:
+		align = 2;
+		break;
+	case V4L2_PIX_FMT_RGB24:
+	case V4L2_PIX_FMT_BGR24:
+	case V4L2_PIX_FMT_RGB32:
+	case V4L2_PIX_FMT_BGR32:
+	case V4L2_PIX_FMT_RGB565:
+	case V4L2_PIX_FMT_UYVY:
+	case V4L2_PIX_FMT_YUYV:
+	case V4L2_PIX_FMT_YUV422P:
+		align = 1;
+		break;
+	default:
+		align = -EINVAL;
+		break;
+	}
+
+	return align;
+}
+
+/*
+ * Round x to the closest multiple of y by adding half of y and rounding down.
+ * y is evaluated more than once.
+ * NOTE(review): round_down() is defined elsewhere; presumably it requires y to
+ * be a power of two - confirm against its definition.
+ */
+#define round_closest(x, y) round_down((x) + (y)/2, (y))
+
+/*
+ * ipu_image_convert_prepare - split a scaling/conversion job into IC tiles
+ * @ipu: IPU the job is prepared for (only used for log messages here)
+ * @in: input image; in->rect is adjusted in place to the alignment limits
+ * @out: output image; out->rect is adjusted in place as well
+ * @ctrl: how the output rectangle is rounded to the alignment limits
+ * @num_tiles: returns the number of entries in the returned array
+ *
+ * One tile is created per started 1024 output pixels in each direction. Tiles
+ * are laid out column by column from right to left and, inside a column, from
+ * bottom to top. Each tile gets its own input/output rectangles and its own
+ * resizing coefficients, packed into ->rsc.
+ *
+ * Returns a kcalloc()ed array of *num_tiles tile contexts, ERR_PTR(-EINVAL)
+ * for unsupported pixel formats, sizes or @ctrl values, or ERR_PTR(-ENOMEM).
+ * Note that @in and @out may already have been modified when -EINVAL is
+ * returned for a rectangle that became degenerate after alignment.
+ */
+struct image_convert_ctx *ipu_image_convert_prepare(struct ipu_soc *ipu,
+		struct ipu_image *in, struct ipu_image *out,
+		enum ipu_image_scale_ctrl ctrl, int *num_tiles)
+{
+	struct image_convert_ctx *ctx, *c;
+	int htiles, vtiles;
+	int in_valign, in_halign, in_burst, out_valign, out_halign, out_burst;
+	int left, top;
+	int x, y;
+	int h_resize_opt, v_resize_opt;
+	u32 v_downsize_coeff = 0, h_downsize_coeff = 0;
+	u32 v_resize_coeff, h_resize_coeff;
+
+	/*
+	 * validate input: the downsize coefficients computed below only have
+	 * two bits each in ->rsc, so limit both directions to the same ratio
+	 */
+	if (in->rect.width < 16 || out->rect.width < 16 ||
+	    (in->rect.width / 8) > out->rect.width ||
+	    (in->rect.height / 8) > out->rect.height)
+		return ERR_PTR(-EINVAL);
+
+	/* tile setup */
+	htiles = DIV_ROUND_UP(out->rect.width, 1024);
+	vtiles = DIV_ROUND_UP(out->rect.height, 1024);
+
+	in_valign = ipu_image_valign(in->pix.pixelformat);
+	in_halign = ipu_image_halign(in->pix.pixelformat);
+	out_valign = ipu_image_valign(out->pix.pixelformat);
+	out_halign = ipu_image_halign(out->pix.pixelformat);
+
+	/* IC bursts are limited to either 8 or 16 pixels */
+	in_burst = 8;
+	out_burst = 8;
+
+	if (in_valign < 0 || in_halign < 0 ||
+	    out_valign < 0 || out_halign < 0) {
+		dev_err(ipu->dev, "unsupported in/out format\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	/* compute static decimator coefficients */
+	while ((in->rect.width >> h_downsize_coeff) > out->rect.width)
+		h_downsize_coeff++;
+	while ((in->rect.height >> v_downsize_coeff) > out->rect.height)
+		v_downsize_coeff++;
+
+	/* move and crop the output image according to IDMAC limitations */
+	switch (ctrl) {
+	case IPU_IMAGE_SCALE_ROUND_DOWN:
+		left = round_up(in->rect.left, in_halign);
+		top = round_up(in->rect.top, in_valign);
+		in->rect.width = in->rect.width - (left - in->rect.left);
+		in->rect.height = in->rect.height - (top - in->rect.top);
+		in->rect.left = left;
+		in->rect.top = top;
+		left = round_up(out->rect.left, out_halign);
+		top = round_up(out->rect.top, out_valign);
+		out->rect.width = round_down(out->rect.width - (left -
+					     out->rect.left), out_burst);
+		out->rect.height = round_down(out->rect.height - (top -
+					      out->rect.top), out_valign);
+		break;
+	case IPU_IMAGE_SCALE_ROUND_UP:
+		left = round_down(in->rect.left, in_halign);
+		top = round_down(in->rect.top, in_valign);
+		in->rect.width = in->rect.width + in->rect.left - left;
+		in->rect.height = in->rect.height + in->rect.top - top;
+		in->rect.left = left;
+		in->rect.top = top;
+		left = round_down(out->rect.left, out_halign);
+		top = round_down(out->rect.top, out_valign);
+		out->rect.width = round_up(out->rect.width + out->rect.left -
+					   left, out_burst);
+		out->rect.height = round_up(out->rect.height + out->rect.top -
+					    top, out_valign);
+		break;
+	case IPU_IMAGE_SCALE_PIXELPERFECT:
+		left = round_down(in->rect.left, in_halign);
+		top = round_down(in->rect.top, in_valign);
+		in->rect.width = in->rect.width + in->rect.left - left;
+		in->rect.height = in->rect.height + in->rect.top - top;
+		in->rect.left = left;
+		in->rect.top = top;
+		left = round_down(out->rect.left + out_halign / 2, out_halign);
+		top = round_down(out->rect.top + out_valign / 2, out_valign);
+		/*
+		 * don't round width and height to burst size / pixel format
+		 * limitations yet, we do it after determining the scaling
+		 * coefficients
+		 */
+		out->rect.width = out->rect.width + out->rect.left - left;
+		out->rect.height = out->rect.height + out->rect.top - top;
+		break;
+	default:
+		return ERR_PTR(-EINVAL);
+	}
+	out->rect.left = left;
+	out->rect.top = top;
+
+	/* Round input width and height according to decimation */
+	in->rect.width = round_down(in->rect.width, 1 << h_downsize_coeff);
+	in->rect.height = round_down(in->rect.height, 1 << v_downsize_coeff);
+
+	/*
+	 * The coefficient calculations below divide by (output size - 1), so
+	 * reject output rectangles that are or have become degenerate.
+	 */
+	if (out->rect.width < 2 || out->rect.height < 2)
+		return ERR_PTR(-EINVAL);
+
+	dev_dbg(ipu->dev,
+		"%s: in: %dx%d(%dx%d@%d,%d) -> out: %dx%d(%dx%d@%d,%d)\n",
+		__func__, in->pix.width, in->pix.height, in->rect.width,
+		in->rect.height, in->rect.left, in->rect.top, out->pix.width,
+		out->pix.height, out->rect.width, out->rect.height,
+		out->rect.left, out->rect.top);
+
+	/*
+	 * Compute the bilinear resizing coefficients that can/could be used if
+	 * scaling using a single tile. The bottom right pixel should sample the
+	 * input as close as possible to but not beyond the bottom right input
+	 * pixel out of the decimator:
+	 *
+	 * (out->rect.width - 1) * h_resize / 8192.0 <= (in->rect.width >>
+	 *						 h_downsize_coeff) - 1
+	 * (out->rect.height - 1) * v_resize / 8192.0 <= (in->rect.height >>
+	 *						  v_downsize_coeff) - 1
+	 */
+	h_resize_opt = 8192 * ((in->rect.width >> h_downsize_coeff) - 1) /
+		       (out->rect.width - 1);
+	v_resize_opt = 8192 * ((in->rect.height >> v_downsize_coeff) - 1) /
+		       (out->rect.height - 1);
+
+	dev_dbg(ipu->dev,
+		"%s: hscale: >>%d, *8192/%d vscale: >>%d, *8192/%d, %dx%d tiles\n",
+		__func__, h_downsize_coeff, h_resize_opt, v_downsize_coeff,
+		v_resize_opt, htiles, vtiles);
+
+	ctx = kcalloc(htiles * vtiles, sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return ERR_PTR(-ENOMEM);
+
+	c = ctx;
+
+	for (x = htiles - 1; x >= 0; x--) {
+		int in_right, out_right;
+
+		/*
+		 * Since we render tiles right to left, the right edge
+		 * is already known. Depending on tile position and
+		 * scaling mode, we may overshoot it.
+		 */
+		if (x == htiles - 1) {
+			out_right = out->rect.left + out->rect.width;
+			in_right = in->rect.left + in->rect.width;
+		} else {
+			struct image_convert_ctx *c_right = c - vtiles;
+
+			out_right = c_right->out.rect.left;
+			in_right = c_right->in.rect.left;
+		}
+
+		/* Now determine the left edge of this tile column */
+		if (x == 0) {
+			/* For the leftmost column this is trivial */
+			c->out.rect.left = out->rect.left;
+			c->in.rect.left = in->rect.left;
+		} else {
+			int best_left, best_in_left;
+			int min_left, max_left;
+			int min_diff = INT_MAX;
+
+			/*
+			 * Find the best possible left edge. It must be adjusted
+			 * according to IDMAC limitations, and should be
+			 * chosen so that
+			 * (in->rect.left + (c->out.rect.left - out->rect.left)
+			 *  * h_resize_opt / (8192 >> h_downsize_coeff))
+			 * is as close as possible to a valid left edge in the
+			 * input.
+			 */
+			min_left = max(0,
+				       round_up(out_right - 1024, out_halign));
+			max_left = min(round_down(out_right, out_halign),
+				       x * 1024);
+			best_left = min_left;
+			best_in_left = (best_left - out->rect.left) *
+				       h_resize_opt;
+			for (left = min_left; left < max_left;
+			     left += out_halign) {
+				int diff, in_left;
+
+				/*
+				 * In ROUND_UP and ROUND_DOWN modes, for the
+				 * rightmost column, only consider left edges
+				 * that are a multiple of the burst size away
+				 * from the right edge.
+				 */
+				if ((ctrl != IPU_IMAGE_SCALE_PIXELPERFECT) &&
+				    (x == htiles - 1) &&
+				    ((out_right - left) % out_burst))
+					continue;
+				in_left = in->rect.left +
+					  (((left - out->rect.left) *
+					    h_resize_opt) << h_downsize_coeff);
+				diff = abs(in_left -
+					   round_closest(in_left,
+							 8192 * in_halign));
+
+				if (diff < min_diff) {
+					min_diff = diff;
+					best_left = left;
+					best_in_left = in_left;
+				}
+			}
+
+			c->out.rect.left = best_left;
+			c->in.rect.left = DIV_ROUND_CLOSEST(best_in_left, 8192);
+
+			dev_dbg(ipu->dev,
+				"%s: tile(%d,y):\tleft: %d -> %d (instead of %d.%04d -> %d)",
+				__func__, x, c->in.rect.left,
+				c->out.rect.left, best_in_left / 8192,
+				(best_in_left % 8192) * 10000 / 8192,
+				out->rect.left +
+				DIV_ROUND_CLOSEST((c->in.rect.left -
+						   in->rect.left) *
+						  (8192 >> h_downsize_coeff),
+						  h_resize_opt));
+		}
+
+		/* Determine tile width from left and right edges */
+		c->out.rect.width = out_right - c->out.rect.left;
+		c->in.rect.width = in_right - c->in.rect.left;
+
+		/* Now we can determine the actual per-tile scaling factor */
+		if (x == htiles - 1) {
+			/*
+			 * Round down for the right column, since we
+			 * don't want to read beyond the right edge.
+			 */
+			h_resize_coeff = 8192 * ((c->in.rect.width >>
+						 h_downsize_coeff) - 1) /
+					 (c->out.rect.width - 1);
+		} else {
+			/*
+			 * Round to closest for seams between tiles for
+			 * minimal distortion.
+			 */
+			h_resize_coeff = DIV_ROUND_CLOSEST(8192 *
+							   (c->in.rect.width >>
+							    h_downsize_coeff),
+							   c->out.rect.width);
+		}
+
+		/*
+		 * With the scaling factor known, round up output width
+		 * to burst size. In ROUND_UP and ROUND_DOWN scaling mode
+		 * this is a no-op for the right column.
+		 */
+		c->out.rect.width = round_up(c->out.rect.width, out_burst);
+
+		/*
+		 * Calculate input width from the last accessed input pixel
+		 * given output width and scaling coefficients. Round to
+		 * burst size.
+		 */
+		c->in.rect.width = (DIV_ROUND_UP((c->out.rect.width - 1) *
+						 h_resize_coeff, 8192) + 1)
+				   << h_downsize_coeff;
+		c->in.rect.width = round_up(c->in.rect.width, in_burst);
+
+		for (y = vtiles - 1; y >= 0; y--) {
+			int in_bottom, out_bottom;
+
+			memcpy(&c->in.pix, &in->pix,
+			      sizeof(struct v4l2_pix_format));
+
+			if (y == vtiles - 1) {
+				out_bottom = out->rect.top + out->rect.height;
+				in_bottom = in->rect.top + in->rect.height;
+			} else {
+				struct image_convert_ctx *c_below = c - 1;
+
+				out_bottom = c_below->out.rect.top;
+				in_bottom = c_below->in.rect.top;
+
+				/*
+				 * Copy horizontal parameters from the tile
+				 * below
+				 */
+				c->out.rect.left = c_below->out.rect.left;
+				c->out.rect.width = c_below->out.rect.width;
+				c->in.rect.left = c_below->in.rect.left;
+				c->in.rect.width = c_below->in.rect.width;
+			}
+
+			if (y == 0) {
+				c->out.rect.top = out->rect.top;
+				c->in.rect.top = in->rect.top;
+			} else {
+				int best_top, best_in_top;
+				int min_top, max_top;
+				int min_diff = INT_MAX;
+
+				/*
+				 * Find the best possible top edge. It must be
+				 * adjusted according to IDMAC limitations, and
+				 * should be chosen so that
+				 * (in->rect.top + (c->out.rect.top -
+				 *  out->rect.top) * v_resize_opt /
+				 * (8192 >> v_downsize_coeff))
+				 * is as close as possible to a valid top edge
+				 * in the input.
+				 */
+				min_top = max(0,
+					      round_up(out_bottom - 1024,
+						       out_valign));
+				/* vertical limit: use the vertical alignment */
+				max_top = min(round_down(out_bottom,
+							 out_valign), y * 1024);
+				best_top = min_top;
+				best_in_top = (best_top - out->rect.top) *
+					       v_resize_opt;
+				for (top = min_top; top < max_top;
+				     top += out_valign) {
+					int diff, in_top;
+
+					in_top = in->rect.top +
+						 (((top - out->rect.top) *
+						   v_resize_opt) <<
+						  v_downsize_coeff);
+					diff = abs(in_top -
+						   round_closest(in_top, 8192 *
+								 in_valign));
+
+					if (diff < min_diff) {
+						min_diff = diff;
+						best_top = top;
+						best_in_top = in_top;
+					}
+				}
+
+				c->out.rect.top = best_top;
+				c->in.rect.top = DIV_ROUND_CLOSEST(best_in_top,
+								   8192);
+
+				dev_dbg(ipu->dev,
+					"%s: tile(%d,%d):\ttop: %d -> %d (instead of %d.%04d -> %d)",
+					__func__, x, y, c->in.rect.top,
+					c->out.rect.top, best_in_top / 8192,
+					(best_in_top % 8192) * 10000 / 8192,
+					out->rect.top +
+					DIV_ROUND_CLOSEST((c->in.rect.top -
+							   in->rect.top) * (8192
+							  >> v_downsize_coeff),
+							  v_resize_opt));
+			}
+
+			/* Determine tile height from top and bottom edges */
+			c->out.rect.height = out_bottom - c->out.rect.top;
+			c->in.rect.height = in_bottom - c->in.rect.top;
+
+			/*
+			 * Now we can determine the actual vertical per-tile
+			 * scaling factor
+			 */
+			if (y == vtiles - 1) {
+				/*
+				 * Round down for the bottom row, since we
+				 * don't want to read beyond the lower border.
+				 */
+				v_resize_coeff = 8192 * ((c->in.rect.height >>
+							 v_downsize_coeff) - 1)
+						 / (c->out.rect.height - 1);
+			} else {
+				/*
+				 * Round to closest for seams between tiles for
+				 * minimal distortion.
+				 */
+				v_resize_coeff = DIV_ROUND_CLOSEST(8192 *
+							(c->in.rect.height >>
+							 v_downsize_coeff),
+							c->out.rect.height);
+			}
+
+			/*
+			 * With the scaling factor known, round up output height
+			 * to IDMAC limitations
+			 */
+			c->out.rect.height = round_up(c->out.rect.height,
+						      out_valign);
+
+			/*
+			 * Calculate input height from the last accessed input
+			 * line given output height and scaling coefficients.
+			 */
+			c->in.rect.height = (DIV_ROUND_UP(
+						(c->out.rect.height - 1) *
+						v_resize_coeff, 8192) + 1)
+					    << v_downsize_coeff;
+
+			/* align height according to IDMAC restrictions */
+			c->in.rect.height = round_up(c->in.rect.height,
+				in_valign);
+
+			memcpy(&c->out.pix, &out->pix,
+			       sizeof(struct v4l2_pix_format));
+
+			dev_dbg(ipu->dev,
+				"%s: tile(%d,%d): %dx%d(%dx%d@%d,%d) -> %dx%d(%dx%d@%d,%d), resize: %dx%d\n",
+				__func__, x, y,
+				c->in.pix.width, c->in.pix.height,
+				c->in.rect.width, c->in.rect.height,
+				c->in.rect.left, c->in.rect.top,
+				c->out.pix.width, c->out.pix.height,
+				c->out.rect.width, c->out.rect.height,
+				c->out.rect.left, c->out.rect.top,
+				h_resize_coeff, v_resize_coeff);
+
+			c->rsc = (v_downsize_coeff << 30) |
+				 (v_resize_coeff << 16) |
+				 (h_downsize_coeff << 14) |
+				 h_resize_coeff;
+
+			c++;
+		}
+	}
+
+	*num_tiles = htiles * vtiles;
+
+	return ctx;
+}
+EXPORT_SYMBOL_GPL(ipu_image_convert_prepare);
+
+/*
+ * ipu_image_convert_run - queue a prepared set of tiles for conversion
+ * @ipu: IPU whose post-processing IC task runs the conversion
+ * @in: input image; only in->phys0 is used and copied into every tile
+ * @out: output image; only out->phys0 is used and copied into every tile
+ * @ctx: array of @num_tiles tile contexts
+ * @num_tiles: number of entries in @ctx, at least 1
+ * @complete: callback attached to the last tile, may be NULL
+ * @complete_context: first argument passed to @complete
+ * @free_ctx: if true, @ctx is recorded on the last tile so that the worker
+ *	      kfree()s it after that tile has been processed
+ *
+ * The tiles are appended to the task's queue in array order and the worker is
+ * scheduled. Returns 0 on success or -EINVAL if there is nothing to queue.
+ */
+int ipu_image_convert_run(struct ipu_soc *ipu, struct ipu_image *in,
+			  struct ipu_image *out, struct image_convert_ctx *ctx,
+			  int num_tiles, void (*complete)(void *ctx, int err),
+			  void *complete_context, bool free_ctx)
+{
+	struct ipu_ic_priv *priv = ipu->ic_priv;
+	struct ipu_ic *ic = &priv->task[IC_TASK_POST_PROCESSOR];
+	unsigned long flags;
+	int i;
+
+	/* ctx[num_tiles - 1] below must refer to a valid element */
+	if (!ctx || num_tiles < 1)
+		return -EINVAL;
+
+	for (i = 0; i < num_tiles; i++) {
+		ctx[i].in.phys0 = in->phys0;
+		ctx[i].out.phys0 = out->phys0;
+	}
+	ctx[num_tiles - 1].complete = complete;
+	ctx[num_tiles - 1].complete_context = complete_context;
+	if (free_ctx)
+		ctx[num_tiles - 1].freep = ctx;
+
+	spin_lock_irqsave(&priv->lock, flags);
+
+	for (i = 0; i < num_tiles; i++)
+		list_add_tail(&ctx[i].list, &ic->image_list);
+
+	queue_work(ic->workqueue, &ic->work);
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ipu_image_convert_run);
+
+/*
+ * Set up the conversion machinery of the post-processing task: a single
+ * threaded workqueue, the work item, the completion, and a threaded interrupt
+ * handler on the output channel's end-of-frame interrupt.
+ *
+ * Returns 0 on success or a negative error code; the workqueue is destroyed
+ * again if requesting the interrupt fails. @priv is currently unused.
+ */
+static int ipu_image_convert_init(struct device *dev, struct ipu_soc *ipu,
+		struct ipu_ic_priv *priv)
+{
+	struct ipu_ic *ic;
+	int irq;
+	int ret;
+
+	/*
+	 * NOTE(review): ipu_ic_get() is defined outside this hunk; it is
+	 * assumed to report failure with an ERR_PTR() value - confirm. Check
+	 * it before ic->output_channel is dereferenced.
+	 */
+	ic = ipu_ic_get(ipu, IC_TASK_POST_PROCESSOR);
+	if (IS_ERR(ic))
+		return PTR_ERR(ic);
+
+	irq = ipu_idmac_channel_irq(ipu, ic->output_channel, IPU_IRQ_EOF);
+
+	ic->workqueue = create_singlethread_workqueue(dev_name(ipu->dev));
+	if (!ic->workqueue)
+		return -ENOMEM;
+
+	INIT_WORK(&ic->work, ipu_image_convert_work);
+	init_completion(&ic->complete);
+
+	ret = devm_request_threaded_irq(dev, irq, NULL,
+				ipu_image_convert_handler,
+				IRQF_ONESHOT, "IC PP", ic);
+	if (ret)
+		goto err;
+
+	return 0;
+err:
+	destroy_workqueue(ic->workqueue);
+	return ret;
+}
+
 int ipu_ic_enable(struct ipu_ic *ic)
 {
 	struct ipu_ic_priv *priv = ic->priv;
@@ -736,12 +1473,30 @@  int ipu_ic_init(struct ipu_soc *ipu, struct device *dev,
 	priv->ipu = ipu;
 
 	for (i = 0; i < IC_NUM_TASKS; i++) {
+		INIT_LIST_HEAD(&priv->task[i].image_list);
 		priv->task[i].task = i;
 		priv->task[i].priv = priv;
 		priv->task[i].reg = &ic_task_reg[i];
 		priv->task[i].bit = &ic_task_bit[i];
+
+		priv->task[i].input_channel = ipu_idmac_get(ipu,
+							ic_task_ch[i].in);
+		priv->task[i].output_channel = ipu_idmac_get(ipu,
+							ic_task_ch[i].out);
+		priv->task[i].rotation_input_channel = ipu_idmac_get(ipu,
+							ic_task_ch[i].rot_in);
+		priv->task[i].rotation_output_channel = ipu_idmac_get(ipu,
+							ic_task_ch[i].rot_out);
+		if (ic_task_ch[i].in_prev) {
+			priv->task[i].input_channel_p = ipu_idmac_get(ipu,
+							ic_task_ch[i].in_prev);
+			priv->task[i].input_channel_n = ipu_idmac_get(ipu,
+							ic_task_ch[i].in_next);
+		}
 	}
 
+	ipu_image_convert_init(dev, ipu, priv);
+
 	return 0;
 }
 
diff --git a/include/video/imx-ipu-v3.h b/include/video/imx-ipu-v3.h
index 459508e..6d98a38 100644
--- a/include/video/imx-ipu-v3.h
+++ b/include/video/imx-ipu-v3.h
@@ -316,7 +316,8 @@  int ipu_ic_task_init(struct ipu_ic *ic,
 		     int in_width, int in_height,
 		     int out_width, int out_height,
 		     enum ipu_color_space in_cs,
-		     enum ipu_color_space out_cs);
+		     enum ipu_color_space out_cs,
+		     u32 rsc);
 int ipu_ic_task_graphics_init(struct ipu_ic *ic,
 			      enum ipu_color_space in_g_cs,
 			      bool galpha_en, u32 galpha,
@@ -362,4 +363,35 @@  struct ipu_client_platformdata {
 	int dma[2];
 };
 
+/*
+ * Selects how ipu_image_convert_prepare() fits the requested rectangles to
+ * the hardware alignment limits.
+ */
+enum ipu_image_scale_ctrl {
+	/* move left/top edges up to alignment, round output size down */
+	IPU_IMAGE_SCALE_ROUND_DOWN,
+	/*
+	 * round the output left/top edge to the closest aligned position and
+	 * leave the output size unrounded until the coefficients are known
+	 */
+	IPU_IMAGE_SCALE_PIXELPERFECT,
+	/* move left/top edges down to alignment, round output size up */
+	IPU_IMAGE_SCALE_ROUND_UP,
+};
+
+struct image_convert_ctx;
+
+/*
+ * Split a conversion from @in to @out into tiles. The rectangles in @in and
+ * @out are adjusted in place according to @ctrl, the number of tiles is stored
+ * in @num_tiles, and an allocated array of tile contexts or an ERR_PTR() value
+ * is returned.
+ */
+struct image_convert_ctx *ipu_image_convert_prepare(struct ipu_soc *ipu,
+		struct ipu_image *in, struct ipu_image *out,
+		enum ipu_image_scale_ctrl ctrl, int *num_tiles);
+/*
+ * Queue the @num_tiles tiles in @ctx for conversion, using the buffer
+ * addresses (phys0) of @in and @out. @complete, if set, is attached to the
+ * last tile and called with @complete_context and that tile's status. With
+ * @free_ctx set, @ctx is freed after the last tile has been processed.
+ */
+int ipu_image_convert_run(struct ipu_soc *ipu, struct ipu_image *in,
+		struct ipu_image *out, struct image_convert_ctx *ctx,
+		int num_tiles, void (*complete)(void *ctx, int err),
+		void *complete_context, bool free_ctx);
+
+/*
+ * Convenience wrapper: prepare the tiles for a conversion from @in to @out and
+ * queue them right away, asking for the tile array to be freed once the job
+ * is done. Returns the error from the prepare step, or otherwise the result
+ * of ipu_image_convert_run().
+ */
+static inline int ipu_image_convert(struct ipu_soc *ipu, struct ipu_image *in,
+		struct ipu_image *out, void (*complete)(void *ctx, int err),
+		void *complete_context, enum ipu_image_scale_ctrl ctrl)
+{
+	int tile_count;
+	struct image_convert_ctx *tiles =
+		ipu_image_convert_prepare(ipu, in, out, ctrl, &tile_count);
+
+	if (!IS_ERR(tiles))
+		return ipu_image_convert_run(ipu, in, out, tiles, tile_count,
+					     complete, complete_context, true);
+
+	return PTR_ERR(tiles);
+}
+
 #endif /* __DRM_IPU_H__ */