diff mbox series

[03/12] media: hantro: Fix H264 motion vector buffer offset

Message ID HE1PR06MB40115337CD86C429EF24430CACBF0@HE1PR06MB4011.eurprd06.prod.outlook.com (mailing list archive)
State New, archived
Headers show
Series media: hantro: H264 fixes and improvements | expand

Commit Message

Jonas Karlman Sept. 1, 2019, 12:45 p.m. UTC
A decoded 8-bit 4:2:0 frame need memory for up to 448 macroblocks
and is laid out in memory as follow:

+-------------------+
| Y-plane   256 MBs |
+-------------------+
| UV-plane  128 MBs |
+-------------------+
| MV buffer  64 MBs |
+-------------------+

The motion vector buffer offset is currently correct for 4:2:0 because
the extra space for motion vectors is overallocated with an extra 64 MBs.

Wrong offset for both destination and motion vector buffer are used
for the bottom field of field encoded content, wrong offset is
also used for 4:0:0 (monochrome) content.

Fix this by always setting the motion vector address to the expected
384 MBs offset for 4:2:0 and 256 MBs offset for 4:0:0 content.

Also use correct destination and motion vector buffer offset
for the bottom field of field encoded content.

While at it also extend the check for 4:0:0 (monochrome) to include an
additional check for High Profile (100).

Fixes: dea0a82f3d22 ("media: hantro: Add support for H264 decoding on G1")
Signed-off-by: Jonas Karlman <jonas@kwiboo.se>
---
 .../staging/media/hantro/hantro_g1_h264_dec.c | 33 +++++++++++--------
 1 file changed, 19 insertions(+), 14 deletions(-)

Comments

Philipp Zabel Sept. 3, 2019, 10:58 a.m. UTC | #1
Hi Jonas,

On Sun, 2019-09-01 at 12:45 +0000, Jonas Karlman wrote:
> A decoded 8-bit 4:2:0 frame need memory for up to 448 macroblocks
> and is laid out in memory as follow:

Do you mean "A decoded 8-bit 4:2:0 frame needs up to 448 bytes per
macroblock"?

A 1280x720 frame already consists of 3600 macroblocks (each 16x16 Y +
2x8x8 Cb,Cr).

> +-------------------+
> > Y-plane   256 MBs |

So that looks like it should be 256 bytes * number of macroblocks
instead, same for the following two.

> +-------------------+
> > UV-plane  128 MBs |
> +-------------------+
> > MV buffer  64 MBs |
> 
> +-------------------+
>
> The motion vector buffer offset is currently correct for 4:2:0 because
> the extra space for motion vectors is overallocated with an extra 64 MBs.
> 
> Wrong offset for both destination and motion vector buffer are used
> for the bottom field of field encoded content, wrong offset is
> also used for 4:0:0 (monochrome) content.
> 
> Fix this by always setting the motion vector address to the expected
> 384 MBs offset for 4:2:0 and 256 MBs offset for 4:0:0 content.

Expected by whom? For example, could these be placed in separate buffers
instead of appended to the VB2 allocated buffers?

> Also use correct destination and motion vector buffer offset
> for the bottom field of field encoded content.
> 
> While at it also extend the check for 4:0:0 (monochrome) to include an
> additional check for High Profile (100).
> 
> Fixes: dea0a82f3d22 ("media: hantro: Add support for H264 decoding on G1")
> Signed-off-by: Jonas Karlman <jonas@kwiboo.se>
> ---
>  .../staging/media/hantro/hantro_g1_h264_dec.c | 33 +++++++++++--------
>  1 file changed, 19 insertions(+), 14 deletions(-)
> 
> diff --git a/drivers/staging/media/hantro/hantro_g1_h264_dec.c b/drivers/staging/media/hantro/hantro_g1_h264_dec.c
> index 7ab534936843..159bd67e0a36 100644
> --- a/drivers/staging/media/hantro/hantro_g1_h264_dec.c
> +++ b/drivers/staging/media/hantro/hantro_g1_h264_dec.c
> @@ -19,6 +19,9 @@
>  #include "hantro_hw.h"
>  #include "hantro_v4l2.h"
>  
> +#define MV_OFFSET_420	384
> +#define MV_OFFSET_400	256
> +
>  static void set_params(struct hantro_ctx *ctx)
>  {
>  	const struct hantro_h264_dec_ctrls *ctrls = &ctx->h264_dec.ctrls;
> @@ -49,8 +52,8 @@ static void set_params(struct hantro_ctx *ctx)
>  	vdpu_write_relaxed(vpu, reg, G1_REG_DEC_CTRL0);
>  
>  	/* Decoder control register 1. */
> -	reg = G1_REG_DEC_CTRL1_PIC_MB_WIDTH(sps->pic_width_in_mbs_minus1 + 1) |
> -	      G1_REG_DEC_CTRL1_PIC_MB_HEIGHT_P(sps->pic_height_in_map_units_minus1 + 1) |
> +	reg = G1_REG_DEC_CTRL1_PIC_MB_WIDTH(H264_MB_WIDTH(ctx->dst_fmt.width)) |
> +	      G1_REG_DEC_CTRL1_PIC_MB_HEIGHT_P(H264_MB_HEIGHT(ctx->dst_fmt.height)) |
>  	      G1_REG_DEC_CTRL1_REF_FRAMES(sps->max_num_ref_frames);
>  	vdpu_write_relaxed(vpu, reg, G1_REG_DEC_CTRL1);
>  
> @@ -79,7 +82,7 @@ static void set_params(struct hantro_ctx *ctx)
>  		reg |= G1_REG_DEC_CTRL4_CABAC_E;
>  	if (sps->flags & V4L2_H264_SPS_FLAG_DIRECT_8X8_INFERENCE)
>  		reg |= G1_REG_DEC_CTRL4_DIR_8X8_INFER_E;
> -	if (sps->chroma_format_idc == 0)
> +	if (sps->profile_idc >= 100 && sps->chroma_format_idc == 0)
>  		reg |= G1_REG_DEC_CTRL4_BLACKWHITE_E;
>  	if (pps->flags & V4L2_H264_PPS_FLAG_WEIGHTED_PRED)
>  		reg |= G1_REG_DEC_CTRL4_WEIGHT_PRED_E;
> @@ -233,6 +236,7 @@ static void set_buffers(struct hantro_ctx *ctx)
>  	struct vb2_v4l2_buffer *src_buf, *dst_buf;
>  	struct hantro_dev *vpu = ctx->dev;
>  	dma_addr_t src_dma, dst_dma;
> +	unsigned int offset = MV_OFFSET_420;
>  
>  	src_buf = hantro_get_src_buf(ctx);
>  	dst_buf = hantro_get_dst_buf(ctx);
> @@ -243,19 +247,20 @@ static void set_buffers(struct hantro_ctx *ctx)
>  
>  	/* Destination (decoded frame) buffer. */
>  	dst_dma = vb2_dma_contig_plane_dma_addr(&dst_buf->vb2_buf, 0);
> +	if (ctrls->slices[0].flags & V4L2_H264_SLICE_FLAG_BOTTOM_FIELD)
> +		dst_dma += ALIGN(ctx->dst_fmt.width, H264_MB_DIM);

How does this work? Does userspace decode two fields into the same
capture buffer and the hardware writes each field with a stride of 2
lines? I suppose this corresponds to V4L2_FIELD_INTERLACED. Could this
also be made to support V4L2_FIELD_SEQ_TB output?

regards
Philipp
Jonas Karlman Sept. 3, 2019, 8:13 p.m. UTC | #2
On 2019-09-03 12:58, Philipp Zabel wrote:
> Hi Jonas,
>
> On Sun, 2019-09-01 at 12:45 +0000, Jonas Karlman wrote:
>> A decoded 8-bit 4:2:0 frame need memory for up to 448 macroblocks
>> and is laid out in memory as follow:
> Do you mean "A decoded 8-bit 4:2:0 frame needs up to 448 bytes per
> macroblock"?
>
> A 1280x720 frame already consists of 3600 macroblocks (each 16x16 Y +
> 2x8x8 Cb,Cr).

You are correct, thanks for pointing out, I will change in a v2.

>
>> +-------------------+
>>> Y-plane   256 MBs |
> So that looks like it should be 256 bytes * number of macroblocks
> instead, same for the following two.

Ack.

>
>> +-------------------+
>>> UV-plane  128 MBs |
>> +-------------------+
>>> MV buffer  64 MBs |
>> +-------------------+
>>
>> The motion vector buffer offset is currently correct for 4:2:0 because
>> the extra space for motion vectors is overallocated with an extra 64 MBs.
>>
>> Wrong offset for both destination and motion vector buffer are used
>> for the bottom field of field encoded content, wrong offset is
>> also used for 4:0:0 (monochrome) content.
>>
>> Fix this by always setting the motion vector address to the expected
>> 384 MBs offset for 4:2:0 and 256 MBs offset for 4:0:0 content.
> Expected by whom? For example, could these be placed in separate buffers
> instead of appended to the VB2 allocated buffers?

From what I understand main and high profile decoding have hw constraints in that
the direct mode motion vectors buffer must be located continuously after the YUV buffer.

I also observed instances where the current requirement for profile_idc > 66 caused issues
for some streams, e.g. big_buck_bunny_1080p_H264_AAC_25fps_7200K.mp4

Because of this it was just easier to always configure the motion vector buffer address.

>
>> Also use correct destination and motion vector buffer offset
>> for the bottom field of field encoded content.
>>
>> While at it also extend the check for 4:0:0 (monochrome) to include an
>> additional check for High Profile (100).
>>
>> Fixes: dea0a82f3d22 ("media: hantro: Add support for H264 decoding on G1")
>> Signed-off-by: Jonas Karlman <jonas@kwiboo.se>
>> ---
>>  .../staging/media/hantro/hantro_g1_h264_dec.c | 33 +++++++++++--------
>>  1 file changed, 19 insertions(+), 14 deletions(-)
>>
>> diff --git a/drivers/staging/media/hantro/hantro_g1_h264_dec.c b/drivers/staging/media/hantro/hantro_g1_h264_dec.c
>> index 7ab534936843..159bd67e0a36 100644
>> --- a/drivers/staging/media/hantro/hantro_g1_h264_dec.c
>> +++ b/drivers/staging/media/hantro/hantro_g1_h264_dec.c
>> @@ -19,6 +19,9 @@
>>  #include "hantro_hw.h"
>>  #include "hantro_v4l2.h"
>>  
>> +#define MV_OFFSET_420	384
>> +#define MV_OFFSET_400	256
>> +
>>  static void set_params(struct hantro_ctx *ctx)
>>  {
>>  	const struct hantro_h264_dec_ctrls *ctrls = &ctx->h264_dec.ctrls;
>> @@ -49,8 +52,8 @@ static void set_params(struct hantro_ctx *ctx)
>>  	vdpu_write_relaxed(vpu, reg, G1_REG_DEC_CTRL0);
>>  
>>  	/* Decoder control register 1. */
>> -	reg = G1_REG_DEC_CTRL1_PIC_MB_WIDTH(sps->pic_width_in_mbs_minus1 + 1) |
>> -	      G1_REG_DEC_CTRL1_PIC_MB_HEIGHT_P(sps->pic_height_in_map_units_minus1 + 1) |
>> +	reg = G1_REG_DEC_CTRL1_PIC_MB_WIDTH(H264_MB_WIDTH(ctx->dst_fmt.width)) |
>> +	      G1_REG_DEC_CTRL1_PIC_MB_HEIGHT_P(H264_MB_HEIGHT(ctx->dst_fmt.height)) |
>>  	      G1_REG_DEC_CTRL1_REF_FRAMES(sps->max_num_ref_frames);
>>  	vdpu_write_relaxed(vpu, reg, G1_REG_DEC_CTRL1);
>>  
>> @@ -79,7 +82,7 @@ static void set_params(struct hantro_ctx *ctx)
>>  		reg |= G1_REG_DEC_CTRL4_CABAC_E;
>>  	if (sps->flags & V4L2_H264_SPS_FLAG_DIRECT_8X8_INFERENCE)
>>  		reg |= G1_REG_DEC_CTRL4_DIR_8X8_INFER_E;
>> -	if (sps->chroma_format_idc == 0)
>> +	if (sps->profile_idc >= 100 && sps->chroma_format_idc == 0)
>>  		reg |= G1_REG_DEC_CTRL4_BLACKWHITE_E;
>>  	if (pps->flags & V4L2_H264_PPS_FLAG_WEIGHTED_PRED)
>>  		reg |= G1_REG_DEC_CTRL4_WEIGHT_PRED_E;
>> @@ -233,6 +236,7 @@ static void set_buffers(struct hantro_ctx *ctx)
>>  	struct vb2_v4l2_buffer *src_buf, *dst_buf;
>>  	struct hantro_dev *vpu = ctx->dev;
>>  	dma_addr_t src_dma, dst_dma;
>> +	unsigned int offset = MV_OFFSET_420;
>>  
>>  	src_buf = hantro_get_src_buf(ctx);
>>  	dst_buf = hantro_get_dst_buf(ctx);
>> @@ -243,19 +247,20 @@ static void set_buffers(struct hantro_ctx *ctx)
>>  
>>  	/* Destination (decoded frame) buffer. */
>>  	dst_dma = vb2_dma_contig_plane_dma_addr(&dst_buf->vb2_buf, 0);
>> +	if (ctrls->slices[0].flags & V4L2_H264_SLICE_FLAG_BOTTOM_FIELD)
>> +		dst_dma += ALIGN(ctx->dst_fmt.width, H264_MB_DIM);
> How does this work? Does userspace decode two fields into the same
> capture buffer and the hardware writes each field with a stride of 2
> lines? I suppose this corresponds to V4L2_FIELD_INTERLACED. Could this
> also be made to support V4L2_FIELD_SEQ_TB output?

Yes, both fields are decoded into the same capture buffer, top field to odd numbered lines
and bottom field to even numbered lines, so I guess this corresponds to V4L2_FIELD_INTERLACED.
This is also how the cedrus driver handles field decoding with Jernej's h264 patches.

I do not know if it is possible to configure the hw to decode into a V4L2_FIELD_SEQ_TB type output.

Regards,
Jonas

>
> regards
> Philipp
Ezequiel Garcia Sept. 10, 2019, 10:18 a.m. UTC | #3
Hi Jonas,

Thanks for fixing this, I'm happy we are reducing the amount
of black magic here.

On Sun, 2019-09-01 at 12:45 +0000, Jonas Karlman wrote:
> A decoded 8-bit 4:2:0 frame need memory for up to 448 macroblocks
> and is laid out in memory as follow:
> 
> +-------------------+
> > Y-plane   256 MBs |
> +-------------------+
> > UV-plane  128 MBs |
> +-------------------+
> > MV buffer  64 MBs |
> +-------------------+
> 
> The motion vector buffer offset is currently correct for 4:2:0 because
> the extra space for motion vectors is overallocated with an extra 64 MBs.
> 
> Wrong offset for both destination and motion vector buffer are used
> for the bottom field of field encoded content, wrong offset is
> also used for 4:0:0 (monochrome) content.
> 
> Fix this by always setting the motion vector address to the expected
> 384 MBs offset for 4:2:0 and 256 MBs offset for 4:0:0 content.
> 
> Also use correct destination and motion vector buffer offset
> for the bottom field of field encoded content.
> 
> While at it also extend the check for 4:0:0 (monochrome) to include an
> additional check for High Profile (100).
> 

As with the scaling list, I believe it would make a lot of sense
to document this in the driver itself.

Thanks,
Ezequiel
Ezequiel Garcia Sept. 10, 2019, 11:34 a.m. UTC | #4
A few more comments...

On Sun, 2019-09-01 at 12:45 +0000, Jonas Karlman wrote:
> A decoded 8-bit 4:2:0 frame need memory for up to 448 macroblocks
> and is laid out in memory as follow:
> 
> +-------------------+
> > Y-plane   256 MBs |
> +-------------------+
> > UV-plane  128 MBs |
> +-------------------+
> > MV buffer  64 MBs |
> +-------------------+
> 
> The motion vector buffer offset is currently correct for 4:2:0 because
> the extra space for motion vectors is overallocated with an extra 64 MBs.
> 
> Wrong offset for both destination and motion vector buffer are used
> for the bottom field of field encoded content, wrong offset is
> also used for 4:0:0 (monochrome) content.
> 
> Fix this by always setting the motion vector address to the expected
> 384 MBs offset for 4:2:0 and 256 MBs offset for 4:0:0 content.
> 
> Also use correct destination and motion vector buffer offset
> for the bottom field of field encoded content.
> 
> While at it also extend the check for 4:0:0 (monochrome) to include an
> additional check for High Profile (100).
> 
> Fixes: dea0a82f3d22 ("media: hantro: Add support for H264 decoding on G1")
> Signed-off-by: Jonas Karlman <jonas@kwiboo.se>
> ---
>  .../staging/media/hantro/hantro_g1_h264_dec.c | 33 +++++++++++--------
>  1 file changed, 19 insertions(+), 14 deletions(-)
> 
> diff --git a/drivers/staging/media/hantro/hantro_g1_h264_dec.c b/drivers/staging/media/hantro/hantro_g1_h264_dec.c
> index 7ab534936843..159bd67e0a36 100644
> --- a/drivers/staging/media/hantro/hantro_g1_h264_dec.c
> +++ b/drivers/staging/media/hantro/hantro_g1_h264_dec.c
> @@ -19,6 +19,9 @@
>  #include "hantro_hw.h"
>  #include "hantro_v4l2.h"
>  
> +#define MV_OFFSET_420	384
> +#define MV_OFFSET_400	256
> +

Instead of introducing these macros, I'd just use the macroblock width
and height ones explicitly. This way it's more clear where is
the code coming from.

>  static void set_params(struct hantro_ctx *ctx)
>  {
>  	const struct hantro_h264_dec_ctrls *ctrls = &ctx->h264_dec.ctrls;
> @@ -49,8 +52,8 @@ static void set_params(struct hantro_ctx *ctx)
>  	vdpu_write_relaxed(vpu, reg, G1_REG_DEC_CTRL0);
>  
>  	/* Decoder control register 1. */
> -	reg = G1_REG_DEC_CTRL1_PIC_MB_WIDTH(sps->pic_width_in_mbs_minus1 + 1) |
> -	      G1_REG_DEC_CTRL1_PIC_MB_HEIGHT_P(sps->pic_height_in_map_units_minus1 + 1) |
> +	reg = G1_REG_DEC_CTRL1_PIC_MB_WIDTH(H264_MB_WIDTH(ctx->dst_fmt.width)) |
> +	      G1_REG_DEC_CTRL1_PIC_MB_HEIGHT_P(H264_MB_HEIGHT(ctx->dst_fmt.height)) |

This is a nice fix, but unless I'm missing something it's unrelated to this patch.
 
>  	      G1_REG_DEC_CTRL1_REF_FRAMES(sps->max_num_ref_frames);
>  	vdpu_write_relaxed(vpu, reg, G1_REG_DEC_CTRL1);
>  
> @@ -79,7 +82,7 @@ static void set_params(struct hantro_ctx *ctx)
>  		reg |= G1_REG_DEC_CTRL4_CABAC_E;
>  	if (sps->flags & V4L2_H264_SPS_FLAG_DIRECT_8X8_INFERENCE)
>  		reg |= G1_REG_DEC_CTRL4_DIR_8X8_INFER_E;
> -	if (sps->chroma_format_idc == 0)
> +	if (sps->profile_idc >= 100 && sps->chroma_format_idc == 0)
>  		reg |= G1_REG_DEC_CTRL4_BLACKWHITE_E;
>  	if (pps->flags & V4L2_H264_PPS_FLAG_WEIGHTED_PRED)
>  		reg |= G1_REG_DEC_CTRL4_WEIGHT_PRED_E;
> @@ -233,6 +236,7 @@ static void set_buffers(struct hantro_ctx *ctx)
>  	struct vb2_v4l2_buffer *src_buf, *dst_buf;
>  	struct hantro_dev *vpu = ctx->dev;
>  	dma_addr_t src_dma, dst_dma;
> +	unsigned int offset = MV_OFFSET_420;
>  
>  	src_buf = hantro_get_src_buf(ctx);
>  	dst_buf = hantro_get_dst_buf(ctx);
> @@ -243,19 +247,20 @@ static void set_buffers(struct hantro_ctx *ctx)
>  
>  	/* Destination (decoded frame) buffer. */
>  	dst_dma = vb2_dma_contig_plane_dma_addr(&dst_buf->vb2_buf, 0);
> +	if (ctrls->slices[0].flags & V4L2_H264_SLICE_FLAG_BOTTOM_FIELD)
> +		dst_dma += ALIGN(ctx->dst_fmt.width, H264_MB_DIM);
>  	vdpu_write_relaxed(vpu, dst_dma, G1_REG_ADDR_DST);
>  
> -	/* Higher profiles require DMV buffer appended to reference frames. */
> -	if (ctrls->sps->profile_idc > 66) {
> -		size_t pic_size = ctx->h264_dec.pic_size;
> -		size_t mv_offset = round_up(pic_size, 8);
> -
> -		if (ctrls->slices[0].flags & V4L2_H264_SLICE_FLAG_BOTTOM_FIELD)
> -			mv_offset += 32 * H264_MB_WIDTH(ctx->dst_fmt.width);
> -
> -		vdpu_write_relaxed(vpu, dst_dma + mv_offset,
> -				   G1_REG_ADDR_DIR_MV);
> -	}
> +	/* Motion vector buffer is located after the decoded frame. */
> +	dst_dma = vb2_dma_contig_plane_dma_addr(&dst_buf->vb2_buf, 0);

I would try to rework the code to avoid calling
vb2_dma_contig_plane_dma_addr() again.

> +	if (ctrls->sps->profile_idc >= 100 && ctrls->sps->chroma_format_idc == 0)
> +		offset = MV_OFFSET_400;
> +	dst_dma += offset * H264_MB_WIDTH(ctx->dst_fmt.width) *
> +		   H264_MB_HEIGHT(ctx->dst_fmt.height);

Perhaps rename 'offset' to something different? Maybe bytes_per_mb
or similar.

> +	if (ctrls->slices[0].flags & V4L2_H264_SLICE_FLAG_BOTTOM_FIELD)
> +		dst_dma += 32 * H264_MB_WIDTH(ctx->dst_fmt.width) *
> +			   H264_MB_HEIGHT(ctx->dst_fmt.height);

While here, could you replace this 32 magic number with some
meaningful macro?

> +	vdpu_write_relaxed(vpu, dst_dma, G1_REG_ADDR_DIR_MV);
>  
>  	/* Auxiliary buffer prepared in hantro_g1_h264_dec_prepare_table(). */
>  	vdpu_write_relaxed(vpu, ctx->h264_dec.priv.dma, G1_REG_ADDR_QTABLE);

Thanks a lot,
Ezequiel
diff mbox series

Patch

diff --git a/drivers/staging/media/hantro/hantro_g1_h264_dec.c b/drivers/staging/media/hantro/hantro_g1_h264_dec.c
index 7ab534936843..159bd67e0a36 100644
--- a/drivers/staging/media/hantro/hantro_g1_h264_dec.c
+++ b/drivers/staging/media/hantro/hantro_g1_h264_dec.c
@@ -19,6 +19,9 @@ 
 #include "hantro_hw.h"
 #include "hantro_v4l2.h"
 
+#define MV_OFFSET_420	384
+#define MV_OFFSET_400	256
+
 static void set_params(struct hantro_ctx *ctx)
 {
 	const struct hantro_h264_dec_ctrls *ctrls = &ctx->h264_dec.ctrls;
@@ -49,8 +52,8 @@  static void set_params(struct hantro_ctx *ctx)
 	vdpu_write_relaxed(vpu, reg, G1_REG_DEC_CTRL0);
 
 	/* Decoder control register 1. */
-	reg = G1_REG_DEC_CTRL1_PIC_MB_WIDTH(sps->pic_width_in_mbs_minus1 + 1) |
-	      G1_REG_DEC_CTRL1_PIC_MB_HEIGHT_P(sps->pic_height_in_map_units_minus1 + 1) |
+	reg = G1_REG_DEC_CTRL1_PIC_MB_WIDTH(H264_MB_WIDTH(ctx->dst_fmt.width)) |
+	      G1_REG_DEC_CTRL1_PIC_MB_HEIGHT_P(H264_MB_HEIGHT(ctx->dst_fmt.height)) |
 	      G1_REG_DEC_CTRL1_REF_FRAMES(sps->max_num_ref_frames);
 	vdpu_write_relaxed(vpu, reg, G1_REG_DEC_CTRL1);
 
@@ -79,7 +82,7 @@  static void set_params(struct hantro_ctx *ctx)
 		reg |= G1_REG_DEC_CTRL4_CABAC_E;
 	if (sps->flags & V4L2_H264_SPS_FLAG_DIRECT_8X8_INFERENCE)
 		reg |= G1_REG_DEC_CTRL4_DIR_8X8_INFER_E;
-	if (sps->chroma_format_idc == 0)
+	if (sps->profile_idc >= 100 && sps->chroma_format_idc == 0)
 		reg |= G1_REG_DEC_CTRL4_BLACKWHITE_E;
 	if (pps->flags & V4L2_H264_PPS_FLAG_WEIGHTED_PRED)
 		reg |= G1_REG_DEC_CTRL4_WEIGHT_PRED_E;
@@ -233,6 +236,7 @@  static void set_buffers(struct hantro_ctx *ctx)
 	struct vb2_v4l2_buffer *src_buf, *dst_buf;
 	struct hantro_dev *vpu = ctx->dev;
 	dma_addr_t src_dma, dst_dma;
+	unsigned int offset = MV_OFFSET_420;
 
 	src_buf = hantro_get_src_buf(ctx);
 	dst_buf = hantro_get_dst_buf(ctx);
@@ -243,19 +247,20 @@  static void set_buffers(struct hantro_ctx *ctx)
 
 	/* Destination (decoded frame) buffer. */
 	dst_dma = vb2_dma_contig_plane_dma_addr(&dst_buf->vb2_buf, 0);
+	if (ctrls->slices[0].flags & V4L2_H264_SLICE_FLAG_BOTTOM_FIELD)
+		dst_dma += ALIGN(ctx->dst_fmt.width, H264_MB_DIM);
 	vdpu_write_relaxed(vpu, dst_dma, G1_REG_ADDR_DST);
 
-	/* Higher profiles require DMV buffer appended to reference frames. */
-	if (ctrls->sps->profile_idc > 66) {
-		size_t pic_size = ctx->h264_dec.pic_size;
-		size_t mv_offset = round_up(pic_size, 8);
-
-		if (ctrls->slices[0].flags & V4L2_H264_SLICE_FLAG_BOTTOM_FIELD)
-			mv_offset += 32 * H264_MB_WIDTH(ctx->dst_fmt.width);
-
-		vdpu_write_relaxed(vpu, dst_dma + mv_offset,
-				   G1_REG_ADDR_DIR_MV);
-	}
+	/* Motion vector buffer is located after the decoded frame. */
+	dst_dma = vb2_dma_contig_plane_dma_addr(&dst_buf->vb2_buf, 0);
+	if (ctrls->sps->profile_idc >= 100 && ctrls->sps->chroma_format_idc == 0)
+		offset = MV_OFFSET_400;
+	dst_dma += offset * H264_MB_WIDTH(ctx->dst_fmt.width) *
+		   H264_MB_HEIGHT(ctx->dst_fmt.height);
+	if (ctrls->slices[0].flags & V4L2_H264_SLICE_FLAG_BOTTOM_FIELD)
+		dst_dma += 32 * H264_MB_WIDTH(ctx->dst_fmt.width) *
+			   H264_MB_HEIGHT(ctx->dst_fmt.height);
+	vdpu_write_relaxed(vpu, dst_dma, G1_REG_ADDR_DIR_MV);
 
 	/* Auxiliary buffer prepared in hantro_g1_h264_dec_prepare_table(). */
 	vdpu_write_relaxed(vpu, ctx->h264_dec.priv.dma, G1_REG_ADDR_QTABLE);