diff mbox series

[RFC,2/6] media: uapi: h264: Add the concept of decoding mode

Message ID 20190603110946.4952-3-boris.brezillon@collabora.com (mailing list archive)
State New, archived
Headers show
Series media: uapi: h264: First batch of adjusments | expand

Commit Message

Boris Brezillon June 3, 2019, 11:09 a.m. UTC
Some stateless decoders don't support per-slice decoding (or at least
not in a way that would make them efficient or easy to use).
Let's expose a menu to control and expose the supported decoding modes.
Drivers are allowed to support only one decoding but they can support
both too.

Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
---
 .../media/uapi/v4l/ext-ctrls-codec.rst        | 42 ++++++++++++++++++-
 drivers/media/v4l2-core/v4l2-ctrls.c          |  9 ++++
 include/media/h264-ctrls.h                    | 13 ++++++
 3 files changed, 63 insertions(+), 1 deletion(-)

Comments

Thierry Reding June 3, 2019, 12:30 p.m. UTC | #1
On Mon, Jun 03, 2019 at 01:09:42PM +0200, Boris Brezillon wrote:
> Some stateless decoders don't support per-slice decoding (or at least
> not in a way that would make them efficient or easy to use).
> Let's expose a menu to control and expose the supported decoding modes.
> Drivers are allowed to support only one decoding but they can support
> both too.
> 
> Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
> ---
>  .../media/uapi/v4l/ext-ctrls-codec.rst        | 42 ++++++++++++++++++-
>  drivers/media/v4l2-core/v4l2-ctrls.c          |  9 ++++
>  include/media/h264-ctrls.h                    | 13 ++++++
>  3 files changed, 63 insertions(+), 1 deletion(-)
> 
> diff --git a/Documentation/media/uapi/v4l/ext-ctrls-codec.rst b/Documentation/media/uapi/v4l/ext-ctrls-codec.rst
> index 82547d5de250..188f625acb7c 100644
> --- a/Documentation/media/uapi/v4l/ext-ctrls-codec.rst
> +++ b/Documentation/media/uapi/v4l/ext-ctrls-codec.rst
> @@ -1748,6 +1748,14 @@ enum v4l2_mpeg_video_h264_hierarchical_coding_type -
>      * - __u32
>        - ``size``
>        -
> +    * - __u32
> +      - ``start_byte_offset``
> +      - Where the slice payload starts in the output buffer. Useful when
> +        operating in per frame decoding mode and decoding multi-slice content.
> +        In this case, the output buffer will contain more than one slice and
> +        some codecs need to know where each slice starts. Note that this
> +        offsets points to the beginning of the slice which is supposed to
> +        contain an ANNEX B start code
>      * - __u32
>        - ``header_bit_size``
>        -
> @@ -1931,7 +1939,10 @@ enum v4l2_mpeg_video_h264_hierarchical_coding_type -
>        -
>      * - __u16
>        - ``num_slices``
> -      - Number of slices needed to decode the current frame
> +      - Number of slices needed to decode the current frame/field. When
> +        operating in per-slice decoding mode (see
> +        :c:type:`v4l2_mpeg_video_h264_decoding_mode`), this field
> +        should always be set to one
>      * - __u16
>        - ``nal_ref_idc``
>        - NAL reference ID value coming from the NAL Unit header
> @@ -2022,6 +2033,35 @@ enum v4l2_mpeg_video_h264_hierarchical_coding_type -
>        - 0x00000004
>        - The DPB entry is a long term reference frame
>  
> +``V4L2_CID_MPEG_VIDEO_H264_DECODING_MODE (enum)``
> +    Specifies the decoding mode to use. Currently exposes per slice and per
> +    frame decoding but new modes might be added later on.
> +
> +    .. note::
> +
> +       This menu control is not yet part of the public kernel API and
> +       it is expected to change.
> +
> +.. c:type:: v4l2_mpeg_video_h264_decoding_mode
> +
> +.. cssclass:: longtable
> +
> +.. flat-table::
> +    :header-rows:  0
> +    :stub-columns: 0
> +    :widths:       1 1 2
> +
> +    * - ``V4L2_MPEG_VIDEO_H264_DECODING_PER_SLICE``
> +      - 0
> +      - The decoding is done per slice. v4l2_ctrl_h264_decode_params->num_slices
> +        must be set to 1 and the output buffer should contain only one slice.

I wonder if we need to be that strict. Wouldn't it be possible for
drivers to just iterate over a number of slices and decode each in turn
if userspace passed more than one?

Or perhaps a decoder can batch queue a couple of slices. I'm not sure
how frequent this is, but consider something like a spike in activity on
your system, causing some slices to get delayed so you actually get a
few buffered up before you get a chance to hand them to the V4L2 device.
Processing them all at once may help conceal the lag.

Thierry

> +    * - ``V4L2_MPEG_VIDEO_H264_DECODING_PER_FRAME``
> +      - 1
> +      - The decoding is done per frame. v4l2_ctrl_h264_decode_params->num_slices
> +        can be > 1. When that happens, the output buffer should contain all
> +        slices needed to decode a frame/field, each slice being prefixed by an
> +        Annex B NAL header/start-code.
> +
>  .. _v4l2-mpeg-mpeg2:
>  
>  ``V4L2_CID_MPEG_VIDEO_MPEG2_SLICE_PARAMS (struct)``
> diff --git a/drivers/media/v4l2-core/v4l2-ctrls.c b/drivers/media/v4l2-core/v4l2-ctrls.c
> index 1217d38ea394..72bb3c8882f5 100644
> --- a/drivers/media/v4l2-core/v4l2-ctrls.c
> +++ b/drivers/media/v4l2-core/v4l2-ctrls.c
> @@ -406,6 +406,11 @@ const char * const *v4l2_ctrl_get_menu(u32 id)
>  		"Explicit",
>  		NULL,
>  	};
> +	static const char * const h264_decoding_mode[] = {
> +		"Per Slice",
> +		"Per Frame",
> +		NULL,
> +	};
>  	static const char * const mpeg_mpeg2_level[] = {
>  		"Low",
>  		"Main",
> @@ -637,6 +642,8 @@ const char * const *v4l2_ctrl_get_menu(u32 id)
>  		return h264_fp_arrangement_type;
>  	case V4L2_CID_MPEG_VIDEO_H264_FMO_MAP_TYPE:
>  		return h264_fmo_map_type;
> +	case V4L2_CID_MPEG_VIDEO_H264_DECODING_MODE:
> +		return h264_decoding_mode;
>  	case V4L2_CID_MPEG_VIDEO_MPEG2_LEVEL:
>  		return mpeg_mpeg2_level;
>  	case V4L2_CID_MPEG_VIDEO_MPEG2_PROFILE:
> @@ -856,6 +863,7 @@ const char *v4l2_ctrl_get_name(u32 id)
>  	case V4L2_CID_MPEG_VIDEO_H264_SCALING_MATRIX:		return "H264 Scaling Matrix";
>  	case V4L2_CID_MPEG_VIDEO_H264_SLICE_PARAMS:		return "H264 Slice Parameters";
>  	case V4L2_CID_MPEG_VIDEO_H264_DECODE_PARAMS:		return "H264 Decode Parameters";
> +	case V4L2_CID_MPEG_VIDEO_H264_DECODING_MODE:		return "H264 Decoding Mode";
>  	case V4L2_CID_MPEG_VIDEO_MPEG2_LEVEL:			return "MPEG2 Level";
>  	case V4L2_CID_MPEG_VIDEO_MPEG2_PROFILE:			return "MPEG2 Profile";
>  	case V4L2_CID_MPEG_VIDEO_MPEG4_I_FRAME_QP:		return "MPEG4 I-Frame QP Value";
> @@ -1224,6 +1232,7 @@ void v4l2_ctrl_fill(u32 id, const char **name, enum v4l2_ctrl_type *type,
>  	case V4L2_CID_MPEG_VIDEO_H264_VUI_SAR_IDC:
>  	case V4L2_CID_MPEG_VIDEO_H264_SEI_FP_ARRANGEMENT_TYPE:
>  	case V4L2_CID_MPEG_VIDEO_H264_FMO_MAP_TYPE:
> +	case V4L2_CID_MPEG_VIDEO_H264_DECODING_MODE:
>  	case V4L2_CID_MPEG_VIDEO_MPEG2_LEVEL:
>  	case V4L2_CID_MPEG_VIDEO_MPEG2_PROFILE:
>  	case V4L2_CID_MPEG_VIDEO_MPEG4_LEVEL:
> diff --git a/include/media/h264-ctrls.h b/include/media/h264-ctrls.h
> index e1404d78d6ff..26de2243f6f5 100644
> --- a/include/media/h264-ctrls.h
> +++ b/include/media/h264-ctrls.h
> @@ -26,6 +26,7 @@
>  #define V4L2_CID_MPEG_VIDEO_H264_SCALING_MATRIX	(V4L2_CID_MPEG_BASE+1002)
>  #define V4L2_CID_MPEG_VIDEO_H264_SLICE_PARAMS	(V4L2_CID_MPEG_BASE+1003)
>  #define V4L2_CID_MPEG_VIDEO_H264_DECODE_PARAMS	(V4L2_CID_MPEG_BASE+1004)
> +#define V4L2_CID_MPEG_VIDEO_H264_DECODING_MODE	(V4L2_CID_MPEG_BASE+1005)
>  
>  /* enum v4l2_ctrl_type type values */
>  #define V4L2_CTRL_TYPE_H264_SPS			0x0110
> @@ -33,6 +34,12 @@
>  #define V4L2_CTRL_TYPE_H264_SCALING_MATRIX	0x0112
>  #define V4L2_CTRL_TYPE_H264_SLICE_PARAMS	0x0113
>  #define V4L2_CTRL_TYPE_H264_DECODE_PARAMS	0x0114
> +#define V4L2_CTRL_TYPE_H264_DECODING_MODE	0x0115
> +
> +enum v4l2_mpeg_video_h264_decoding_mode {
> +	V4L2_MPEG_VIDEO_H264_DECODING_PER_SLICE,
> +	V4L2_MPEG_VIDEO_H264_DECODING_PER_FRAME,
> +};
>  
>  #define V4L2_H264_SPS_CONSTRAINT_SET0_FLAG			0x01
>  #define V4L2_H264_SPS_CONSTRAINT_SET1_FLAG			0x02
> @@ -111,6 +118,8 @@ struct v4l2_h264_pred_weight_table {
>  	struct v4l2_h264_weight_factors weight_factors[2];
>  };
>  
> +#define V4L2_H264_MAX_SLICES_PER_FRAME			16
> +
>  #define V4L2_H264_SLICE_TYPE_P				0
>  #define V4L2_H264_SLICE_TYPE_B				1
>  #define V4L2_H264_SLICE_TYPE_I				2
> @@ -125,6 +134,10 @@ struct v4l2_h264_pred_weight_table {
>  struct v4l2_ctrl_h264_slice_params {
>  	/* Size in bytes, including header */
>  	__u32 size;
> +
> +	/* Where the slice starts in the output buffer (expressed in bytes). */
> +	__u32 start_byte_offset;
> +
>  	/* Offset in bits to slice_data() from the beginning of this slice. */
>  	__u32 header_bit_size;
>  
> -- 
> 2.20.1
>
Boris Brezillon June 3, 2019, 12:51 p.m. UTC | #2
+Maxime

Oops, just realized Maxime was not Cc-ed on this series.

On Mon, 3 Jun 2019 14:30:20 +0200
Thierry Reding <thierry.reding@gmail.com> wrote:

> On Mon, Jun 03, 2019 at 01:09:42PM +0200, Boris Brezillon wrote:
> > Some stateless decoders don't support per-slice decoding (or at least
> > not in a way that would make them efficient or easy to use).
> > Let's expose a menu to control and expose the supported decoding modes.
> > Drivers are allowed to support only one decoding but they can support
> > both too.
> > 
> > Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
> > ---
> >  .../media/uapi/v4l/ext-ctrls-codec.rst        | 42 ++++++++++++++++++-
> >  drivers/media/v4l2-core/v4l2-ctrls.c          |  9 ++++
> >  include/media/h264-ctrls.h                    | 13 ++++++
> >  3 files changed, 63 insertions(+), 1 deletion(-)
> > 
> > diff --git a/Documentation/media/uapi/v4l/ext-ctrls-codec.rst b/Documentation/media/uapi/v4l/ext-ctrls-codec.rst
> > index 82547d5de250..188f625acb7c 100644
> > --- a/Documentation/media/uapi/v4l/ext-ctrls-codec.rst
> > +++ b/Documentation/media/uapi/v4l/ext-ctrls-codec.rst
> > @@ -1748,6 +1748,14 @@ enum v4l2_mpeg_video_h264_hierarchical_coding_type -
> >      * - __u32
> >        - ``size``
> >        -
> > +    * - __u32
> > +      - ``start_byte_offset``
> > +      - Where the slice payload starts in the output buffer. Useful when
> > +        operating in per frame decoding mode and decoding multi-slice content.
> > +        In this case, the output buffer will contain more than one slice and
> > +        some codecs need to know where each slice starts. Note that this
> > +        offsets points to the beginning of the slice which is supposed to
> > +        contain an ANNEX B start code
> >      * - __u32
> >        - ``header_bit_size``
> >        -
> > @@ -1931,7 +1939,10 @@ enum v4l2_mpeg_video_h264_hierarchical_coding_type -
> >        -
> >      * - __u16
> >        - ``num_slices``
> > -      - Number of slices needed to decode the current frame
> > +      - Number of slices needed to decode the current frame/field. When
> > +        operating in per-slice decoding mode (see
> > +        :c:type:`v4l2_mpeg_video_h264_decoding_mode`), this field
> > +        should always be set to one
> >      * - __u16
> >        - ``nal_ref_idc``
> >        - NAL reference ID value coming from the NAL Unit header
> > @@ -2022,6 +2033,35 @@ enum v4l2_mpeg_video_h264_hierarchical_coding_type -
> >        - 0x00000004
> >        - The DPB entry is a long term reference frame
> >  
> > +``V4L2_CID_MPEG_VIDEO_H264_DECODING_MODE (enum)``
> > +    Specifies the decoding mode to use. Currently exposes per slice and per
> > +    frame decoding but new modes might be added later on.
> > +
> > +    .. note::
> > +
> > +       This menu control is not yet part of the public kernel API and
> > +       it is expected to change.
> > +
> > +.. c:type:: v4l2_mpeg_video_h264_decoding_mode
> > +
> > +.. cssclass:: longtable
> > +
> > +.. flat-table::
> > +    :header-rows:  0
> > +    :stub-columns: 0
> > +    :widths:       1 1 2
> > +
> > +    * - ``V4L2_MPEG_VIDEO_H264_DECODING_PER_SLICE``
> > +      - 0
> > +      - The decoding is done per slice. v4l2_ctrl_h264_decode_params->num_slices
> > +        must be set to 1 and the output buffer should contain only one slice.  
> 
> I wonder if we need to be that strict. Wouldn't it be possible for
> drivers to just iterate over a number of slices and decode each in turn
> if userspace passed more than one?
> 
> Or perhaps a decoder can batch queue a couple of slices. I'm not sure
> how frequent this is, but consider something like a spike in activity on
> your system, causing some slices to get delayed so you actually get a
> few buffered up before you get a chance to hand them to the V4L2 device.
> Processing them all at once may help conceal the lag.

Hm, so we'd be in some kind of slice-batch mode, which means we could
trigger a decode operation with more than one slice, but not
necessarily all the slices needed to decode a frame. TBH, supporting
per-frame (or the batch mode you suggest) on a HW that supports
per-slice decoding should be pretty simple and has not real impact on
perfs (as you said, it's just a matter of iterating over all the slices
attached to a decode operation), so I'm fine relaxing the rule here and
patching the cedrus driver accordingly (I can't really test the
changes though). Paul, Maxime, what's your opinion?
Thierry Reding June 3, 2019, 2:05 p.m. UTC | #3
On Mon, Jun 03, 2019 at 02:51:13PM +0200, Boris Brezillon wrote:
> +Maxime
> 
> Oops, just realized Maxime was not Cc-ed on this series.
> 
> On Mon, 3 Jun 2019 14:30:20 +0200
> Thierry Reding <thierry.reding@gmail.com> wrote:
> 
> > On Mon, Jun 03, 2019 at 01:09:42PM +0200, Boris Brezillon wrote:
> > > Some stateless decoders don't support per-slice decoding (or at least
> > > not in a way that would make them efficient or easy to use).
> > > Let's expose a menu to control and expose the supported decoding modes.
> > > Drivers are allowed to support only one decoding but they can support
> > > both too.
> > > 
> > > Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
> > > ---
> > >  .../media/uapi/v4l/ext-ctrls-codec.rst        | 42 ++++++++++++++++++-
> > >  drivers/media/v4l2-core/v4l2-ctrls.c          |  9 ++++
> > >  include/media/h264-ctrls.h                    | 13 ++++++
> > >  3 files changed, 63 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/Documentation/media/uapi/v4l/ext-ctrls-codec.rst b/Documentation/media/uapi/v4l/ext-ctrls-codec.rst
> > > index 82547d5de250..188f625acb7c 100644
> > > --- a/Documentation/media/uapi/v4l/ext-ctrls-codec.rst
> > > +++ b/Documentation/media/uapi/v4l/ext-ctrls-codec.rst
> > > @@ -1748,6 +1748,14 @@ enum v4l2_mpeg_video_h264_hierarchical_coding_type -
> > >      * - __u32
> > >        - ``size``
> > >        -
> > > +    * - __u32
> > > +      - ``start_byte_offset``
> > > +      - Where the slice payload starts in the output buffer. Useful when
> > > +        operating in per frame decoding mode and decoding multi-slice content.
> > > +        In this case, the output buffer will contain more than one slice and
> > > +        some codecs need to know where each slice starts. Note that this
> > > +        offsets points to the beginning of the slice which is supposed to
> > > +        contain an ANNEX B start code
> > >      * - __u32
> > >        - ``header_bit_size``
> > >        -
> > > @@ -1931,7 +1939,10 @@ enum v4l2_mpeg_video_h264_hierarchical_coding_type -
> > >        -
> > >      * - __u16
> > >        - ``num_slices``
> > > -      - Number of slices needed to decode the current frame
> > > +      - Number of slices needed to decode the current frame/field. When
> > > +        operating in per-slice decoding mode (see
> > > +        :c:type:`v4l2_mpeg_video_h264_decoding_mode`), this field
> > > +        should always be set to one
> > >      * - __u16
> > >        - ``nal_ref_idc``
> > >        - NAL reference ID value coming from the NAL Unit header
> > > @@ -2022,6 +2033,35 @@ enum v4l2_mpeg_video_h264_hierarchical_coding_type -
> > >        - 0x00000004
> > >        - The DPB entry is a long term reference frame
> > >  
> > > +``V4L2_CID_MPEG_VIDEO_H264_DECODING_MODE (enum)``
> > > +    Specifies the decoding mode to use. Currently exposes per slice and per
> > > +    frame decoding but new modes might be added later on.
> > > +
> > > +    .. note::
> > > +
> > > +       This menu control is not yet part of the public kernel API and
> > > +       it is expected to change.
> > > +
> > > +.. c:type:: v4l2_mpeg_video_h264_decoding_mode
> > > +
> > > +.. cssclass:: longtable
> > > +
> > > +.. flat-table::
> > > +    :header-rows:  0
> > > +    :stub-columns: 0
> > > +    :widths:       1 1 2
> > > +
> > > +    * - ``V4L2_MPEG_VIDEO_H264_DECODING_PER_SLICE``
> > > +      - 0
> > > +      - The decoding is done per slice. v4l2_ctrl_h264_decode_params->num_slices
> > > +        must be set to 1 and the output buffer should contain only one slice.  
> > 
> > I wonder if we need to be that strict. Wouldn't it be possible for
> > drivers to just iterate over a number of slices and decode each in turn
> > if userspace passed more than one?
> > 
> > Or perhaps a decoder can batch queue a couple of slices. I'm not sure
> > how frequent this is, but consider something like a spike in activity on
> > your system, causing some slices to get delayed so you actually get a
> > few buffered up before you get a chance to hand them to the V4L2 device.
> > Processing them all at once may help conceal the lag.
> 
> Hm, so we'd be in some kind of slice-batch mode, which means we could
> trigger a decode operation with more than one slice, but not
> necessarily all the slices needed to decode a frame. TBH, supporting
> per-frame (or the batch mode you suggest) on a HW that supports
> per-slice decoding should be pretty simple and has not real impact on
> perfs (as you said, it's just a matter of iterating over all the slices
> attached to a decode operation), so I'm fine relaxing the rule here and
> patching the cedrus driver accordingly (I can't really test the
> changes though). Paul, Maxime, what's your opinion?

We could perhaps have a test program to orchestrate such a scenario. I
think the assumption should obviously still be that we don't cross the
frame boundary using slices in one batch. Just that if a frame was made
up of, say, 4 slices and you first pass 3 slices, then 1, that it'd be
nice if the driver would be able to cope with that. It's something that
could probably even be implemented in the framework as a helper, though
I suspect it'd be just a couple of lines of extra code to wrap a loop
around everything.

Thierry
Boris Brezillon June 3, 2019, 3:37 p.m. UTC | #4
On Mon, 3 Jun 2019 16:05:26 +0200
Thierry Reding <thierry.reding@gmail.com> wrote:

> On Mon, Jun 03, 2019 at 02:51:13PM +0200, Boris Brezillon wrote:
> > +Maxime
> > 
> > Oops, just realized Maxime was not Cc-ed on this series.
> > 
> > On Mon, 3 Jun 2019 14:30:20 +0200
> > Thierry Reding <thierry.reding@gmail.com> wrote:
> >   
> > > On Mon, Jun 03, 2019 at 01:09:42PM +0200, Boris Brezillon wrote:  
> > > > Some stateless decoders don't support per-slice decoding (or at least
> > > > not in a way that would make them efficient or easy to use).
> > > > Let's expose a menu to control and expose the supported decoding modes.
> > > > Drivers are allowed to support only one decoding but they can support
> > > > both too.
> > > > 
> > > > Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
> > > > ---
> > > >  .../media/uapi/v4l/ext-ctrls-codec.rst        | 42 ++++++++++++++++++-
> > > >  drivers/media/v4l2-core/v4l2-ctrls.c          |  9 ++++
> > > >  include/media/h264-ctrls.h                    | 13 ++++++
> > > >  3 files changed, 63 insertions(+), 1 deletion(-)
> > > > 
> > > > diff --git a/Documentation/media/uapi/v4l/ext-ctrls-codec.rst b/Documentation/media/uapi/v4l/ext-ctrls-codec.rst
> > > > index 82547d5de250..188f625acb7c 100644
> > > > --- a/Documentation/media/uapi/v4l/ext-ctrls-codec.rst
> > > > +++ b/Documentation/media/uapi/v4l/ext-ctrls-codec.rst
> > > > @@ -1748,6 +1748,14 @@ enum v4l2_mpeg_video_h264_hierarchical_coding_type -
> > > >      * - __u32
> > > >        - ``size``
> > > >        -
> > > > +    * - __u32
> > > > +      - ``start_byte_offset``
> > > > +      - Where the slice payload starts in the output buffer. Useful when
> > > > +        operating in per frame decoding mode and decoding multi-slice content.
> > > > +        In this case, the output buffer will contain more than one slice and
> > > > +        some codecs need to know where each slice starts. Note that this
> > > > +        offsets points to the beginning of the slice which is supposed to
> > > > +        contain an ANNEX B start code
> > > >      * - __u32
> > > >        - ``header_bit_size``
> > > >        -
> > > > @@ -1931,7 +1939,10 @@ enum v4l2_mpeg_video_h264_hierarchical_coding_type -
> > > >        -
> > > >      * - __u16
> > > >        - ``num_slices``
> > > > -      - Number of slices needed to decode the current frame
> > > > +      - Number of slices needed to decode the current frame/field. When
> > > > +        operating in per-slice decoding mode (see
> > > > +        :c:type:`v4l2_mpeg_video_h264_decoding_mode`), this field
> > > > +        should always be set to one
> > > >      * - __u16
> > > >        - ``nal_ref_idc``
> > > >        - NAL reference ID value coming from the NAL Unit header
> > > > @@ -2022,6 +2033,35 @@ enum v4l2_mpeg_video_h264_hierarchical_coding_type -
> > > >        - 0x00000004
> > > >        - The DPB entry is a long term reference frame
> > > >  
> > > > +``V4L2_CID_MPEG_VIDEO_H264_DECODING_MODE (enum)``
> > > > +    Specifies the decoding mode to use. Currently exposes per slice and per
> > > > +    frame decoding but new modes might be added later on.
> > > > +
> > > > +    .. note::
> > > > +
> > > > +       This menu control is not yet part of the public kernel API and
> > > > +       it is expected to change.
> > > > +
> > > > +.. c:type:: v4l2_mpeg_video_h264_decoding_mode
> > > > +
> > > > +.. cssclass:: longtable
> > > > +
> > > > +.. flat-table::
> > > > +    :header-rows:  0
> > > > +    :stub-columns: 0
> > > > +    :widths:       1 1 2
> > > > +
> > > > +    * - ``V4L2_MPEG_VIDEO_H264_DECODING_PER_SLICE``
> > > > +      - 0
> > > > +      - The decoding is done per slice. v4l2_ctrl_h264_decode_params->num_slices
> > > > +        must be set to 1 and the output buffer should contain only one slice.    
> > > 
> > > I wonder if we need to be that strict. Wouldn't it be possible for
> > > drivers to just iterate over a number of slices and decode each in turn
> > > if userspace passed more than one?
> > > 
> > > Or perhaps a decoder can batch queue a couple of slices. I'm not sure
> > > how frequent this is, but consider something like a spike in activity on
> > > your system, causing some slices to get delayed so you actually get a
> > > few buffered up before you get a chance to hand them to the V4L2 device.
> > > Processing them all at once may help conceal the lag.  
> > 
> > Hm, so we'd be in some kind of slice-batch mode, which means we could
> > trigger a decode operation with more than one slice, but not
> > necessarily all the slices needed to decode a frame. TBH, supporting
> > per-frame (or the batch mode you suggest) on a HW that supports
> > per-slice decoding should be pretty simple and has not real impact on
> > perfs (as you said, it's just a matter of iterating over all the slices
> > attached to a decode operation), so I'm fine relaxing the rule here and
> > patching the cedrus driver accordingly (I can't really test the
> > changes though). Paul, Maxime, what's your opinion?  
> 
> We could perhaps have a test program to orchestrate such a scenario. I
> think the assumption should obviously still be that we don't cross the
> frame boundary using slices in one batch.

We should definitely forbid mixing slices of different frames in the
same decode operation, since each decode operation is targeting a
single capture buffer.

> Just that if a frame was made
> up of, say, 4 slices and you first pass 3 slices, then 1, that it'd be
> nice if the driver would be able to cope with that.

Yep, that makes sense.

> It's something that
> could probably even be implemented in the framework as a helper, though
> I suspect it'd be just a couple of lines of extra code to wrap a loop
> around everything.

I also thought about providing generic wrappers, both for this case and
the per-slice -> per-frame case (this one would be a bit more
complicated as it implies queuing slices in a bounce buffer and
triggering the decode operation only when we have all slices of a
frame).
Thierry Reding June 4, 2019, 8:16 a.m. UTC | #5
On Mon, Jun 03, 2019 at 05:37:11PM +0200, Boris Brezillon wrote:
> On Mon, 3 Jun 2019 16:05:26 +0200
> Thierry Reding <thierry.reding@gmail.com> wrote:
> 
> > On Mon, Jun 03, 2019 at 02:51:13PM +0200, Boris Brezillon wrote:
> > > +Maxime
> > > 
> > > Oops, just realized Maxime was not Cc-ed on this series.
> > > 
> > > On Mon, 3 Jun 2019 14:30:20 +0200
> > > Thierry Reding <thierry.reding@gmail.com> wrote:
> > >   
> > > > On Mon, Jun 03, 2019 at 01:09:42PM +0200, Boris Brezillon wrote:  
> > > > > Some stateless decoders don't support per-slice decoding (or at least
> > > > > not in a way that would make them efficient or easy to use).
> > > > > Let's expose a menu to control and expose the supported decoding modes.
> > > > > Drivers are allowed to support only one decoding but they can support
> > > > > both too.
> > > > > 
> > > > > Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
> > > > > ---
> > > > >  .../media/uapi/v4l/ext-ctrls-codec.rst        | 42 ++++++++++++++++++-
> > > > >  drivers/media/v4l2-core/v4l2-ctrls.c          |  9 ++++
> > > > >  include/media/h264-ctrls.h                    | 13 ++++++
> > > > >  3 files changed, 63 insertions(+), 1 deletion(-)
> > > > > 
> > > > > diff --git a/Documentation/media/uapi/v4l/ext-ctrls-codec.rst b/Documentation/media/uapi/v4l/ext-ctrls-codec.rst
> > > > > index 82547d5de250..188f625acb7c 100644
> > > > > --- a/Documentation/media/uapi/v4l/ext-ctrls-codec.rst
> > > > > +++ b/Documentation/media/uapi/v4l/ext-ctrls-codec.rst
> > > > > @@ -1748,6 +1748,14 @@ enum v4l2_mpeg_video_h264_hierarchical_coding_type -
> > > > >      * - __u32
> > > > >        - ``size``
> > > > >        -
> > > > > +    * - __u32
> > > > > +      - ``start_byte_offset``
> > > > > +      - Where the slice payload starts in the output buffer. Useful when
> > > > > +        operating in per frame decoding mode and decoding multi-slice content.
> > > > > +        In this case, the output buffer will contain more than one slice and
> > > > > +        some codecs need to know where each slice starts. Note that this
> > > > > +        offsets points to the beginning of the slice which is supposed to
> > > > > +        contain an ANNEX B start code
> > > > >      * - __u32
> > > > >        - ``header_bit_size``
> > > > >        -
> > > > > @@ -1931,7 +1939,10 @@ enum v4l2_mpeg_video_h264_hierarchical_coding_type -
> > > > >        -
> > > > >      * - __u16
> > > > >        - ``num_slices``
> > > > > -      - Number of slices needed to decode the current frame
> > > > > +      - Number of slices needed to decode the current frame/field. When
> > > > > +        operating in per-slice decoding mode (see
> > > > > +        :c:type:`v4l2_mpeg_video_h264_decoding_mode`), this field
> > > > > +        should always be set to one
> > > > >      * - __u16
> > > > >        - ``nal_ref_idc``
> > > > >        - NAL reference ID value coming from the NAL Unit header
> > > > > @@ -2022,6 +2033,35 @@ enum v4l2_mpeg_video_h264_hierarchical_coding_type -
> > > > >        - 0x00000004
> > > > >        - The DPB entry is a long term reference frame
> > > > >  
> > > > > +``V4L2_CID_MPEG_VIDEO_H264_DECODING_MODE (enum)``
> > > > > +    Specifies the decoding mode to use. Currently exposes per slice and per
> > > > > +    frame decoding but new modes might be added later on.
> > > > > +
> > > > > +    .. note::
> > > > > +
> > > > > +       This menu control is not yet part of the public kernel API and
> > > > > +       it is expected to change.
> > > > > +
> > > > > +.. c:type:: v4l2_mpeg_video_h264_decoding_mode
> > > > > +
> > > > > +.. cssclass:: longtable
> > > > > +
> > > > > +.. flat-table::
> > > > > +    :header-rows:  0
> > > > > +    :stub-columns: 0
> > > > > +    :widths:       1 1 2
> > > > > +
> > > > > +    * - ``V4L2_MPEG_VIDEO_H264_DECODING_PER_SLICE``
> > > > > +      - 0
> > > > > +      - The decoding is done per slice. v4l2_ctrl_h264_decode_params->num_slices
> > > > > +        must be set to 1 and the output buffer should contain only one slice.    
> > > > 
> > > > I wonder if we need to be that strict. Wouldn't it be possible for
> > > > drivers to just iterate over a number of slices and decode each in turn
> > > > if userspace passed more than one?
> > > > 
> > > > Or perhaps a decoder can batch queue a couple of slices. I'm not sure
> > > > how frequent this is, but consider something like a spike in activity on
> > > > your system, causing some slices to get delayed so you actually get a
> > > > few buffered up before you get a chance to hand them to the V4L2 device.
> > > > Processing them all at once may help conceal the lag.  
> > > 
> > > Hm, so we'd be in some kind of slice-batch mode, which means we could
> > > trigger a decode operation with more than one slice, but not
> > > necessarily all the slices needed to decode a frame. TBH, supporting
> > > per-frame (or the batch mode you suggest) on a HW that supports
> > > per-slice decoding should be pretty simple and has not real impact on
> > > perfs (as you said, it's just a matter of iterating over all the slices
> > > attached to a decode operation), so I'm fine relaxing the rule here and
> > > patching the cedrus driver accordingly (I can't really test the
> > > changes though). Paul, Maxime, what's your opinion?  
> > 
> > We could perhaps have a test program to orchestrate such a scenario. I
> > think the assumption should obviously still be that we don't cross the
> > frame boundary using slices in one batch.
> 
> We should definitely forbid mixing slices of different frames in the
> same decode operation, since each decode operation is targeting a
> single capture buffer.
> 
> > Just that if a frame was made
> > up of, say, 4 slices and you first pass 3 slices, then 1, that it'd be
> > nice if the driver would be able to cope with that.
> 
> Yep, that makes sense.
> 
> > It's something that
> > could probably even be implemented in the framework as a helper, though
> > I suspect it'd be just a couple of lines of extra code to wrap a loop
> > around everything.
> 
> I also thought about providing generic wrappers, both for this case and
> the per-slice -> per-frame case (this one would be a bit more
> complicated as it implies queuing slices in a bounce buffer and
> triggering the decode operation only when we have all slices of a
> frame).

I like deferring the addition of that kind of helper until a clear
pattern emerges out of the drivers that need this, just because that
gives us real examples on which to model those helpers. But yeah, I
think it should be possible to have helpers for these for most cases.

Thierry
Paul Kocialkowski June 5, 2019, 8:48 p.m. UTC | #6
Hi,

Le mardi 04 juin 2019 à 10:16 +0200, Thierry Reding a écrit :
> On Mon, Jun 03, 2019 at 05:37:11PM +0200, Boris Brezillon wrote:
> > On Mon, 3 Jun 2019 16:05:26 +0200
> > Thierry Reding <thierry.reding@gmail.com> wrote:
> > 
> > > On Mon, Jun 03, 2019 at 02:51:13PM +0200, Boris Brezillon wrote:
> > > > +Maxime
> > > > 
> > > > Oops, just realized Maxime was not Cc-ed on this series.
> > > > 
> > > > On Mon, 3 Jun 2019 14:30:20 +0200
> > > > Thierry Reding <thierry.reding@gmail.com> wrote:
> > > >   
> > > > > On Mon, Jun 03, 2019 at 01:09:42PM +0200, Boris Brezillon wrote:  
> > > > > > Some stateless decoders don't support per-slice decoding (or at least
> > > > > > not in a way that would make them efficient or easy to use).
> > > > > > Let's expose a menu to control and expose the supported decoding modes.
> > > > > > Drivers are allowed to support only one decoding but they can support
> > > > > > both too.
> > > > > > 
> > > > > > Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
> > > > > > ---
> > > > > >  .../media/uapi/v4l/ext-ctrls-codec.rst        | 42 ++++++++++++++++++-
> > > > > >  drivers/media/v4l2-core/v4l2-ctrls.c          |  9 ++++
> > > > > >  include/media/h264-ctrls.h                    | 13 ++++++
> > > > > >  3 files changed, 63 insertions(+), 1 deletion(-)
> > > > > > 
> > > > > > diff --git a/Documentation/media/uapi/v4l/ext-ctrls-codec.rst b/Documentation/media/uapi/v4l/ext-ctrls-codec.rst
> > > > > > index 82547d5de250..188f625acb7c 100644
> > > > > > --- a/Documentation/media/uapi/v4l/ext-ctrls-codec.rst
> > > > > > +++ b/Documentation/media/uapi/v4l/ext-ctrls-codec.rst
> > > > > > @@ -1748,6 +1748,14 @@ enum v4l2_mpeg_video_h264_hierarchical_coding_type -
> > > > > >      * - __u32
> > > > > >        - ``size``
> > > > > >        -
> > > > > > +    * - __u32
> > > > > > +      - ``start_byte_offset``
> > > > > > +      - Where the slice payload starts in the output buffer. Useful when
> > > > > > +        operating in per frame decoding mode and decoding multi-slice content.
> > > > > > +        In this case, the output buffer will contain more than one slice and
> > > > > > +        some codecs need to know where each slice starts. Note that this
> > > > > > +        offsets points to the beginning of the slice which is supposed to
> > > > > > +        contain an ANNEX B start code
> > > > > >      * - __u32
> > > > > >        - ``header_bit_size``
> > > > > >        -
> > > > > > @@ -1931,7 +1939,10 @@ enum v4l2_mpeg_video_h264_hierarchical_coding_type -
> > > > > >        -
> > > > > >      * - __u16
> > > > > >        - ``num_slices``
> > > > > > -      - Number of slices needed to decode the current frame
> > > > > > +      - Number of slices needed to decode the current frame/field. When
> > > > > > +        operating in per-slice decoding mode (see
> > > > > > +        :c:type:`v4l2_mpeg_video_h264_decoding_mode`), this field
> > > > > > +        should always be set to one
> > > > > >      * - __u16
> > > > > >        - ``nal_ref_idc``
> > > > > >        - NAL reference ID value coming from the NAL Unit header
> > > > > > @@ -2022,6 +2033,35 @@ enum v4l2_mpeg_video_h264_hierarchical_coding_type -
> > > > > >        - 0x00000004
> > > > > >        - The DPB entry is a long term reference frame
> > > > > >  
> > > > > > +``V4L2_CID_MPEG_VIDEO_H264_DECODING_MODE (enum)``
> > > > > > +    Specifies the decoding mode to use. Currently exposes per slice and per
> > > > > > +    frame decoding but new modes might be added later on.
> > > > > > +
> > > > > > +    .. note::
> > > > > > +
> > > > > > +       This menu control is not yet part of the public kernel API and
> > > > > > +       it is expected to change.
> > > > > > +
> > > > > > +.. c:type:: v4l2_mpeg_video_h264_decoding_mode
> > > > > > +
> > > > > > +.. cssclass:: longtable
> > > > > > +
> > > > > > +.. flat-table::
> > > > > > +    :header-rows:  0
> > > > > > +    :stub-columns: 0
> > > > > > +    :widths:       1 1 2
> > > > > > +
> > > > > > +    * - ``V4L2_MPEG_VIDEO_H264_DECODING_PER_SLICE``
> > > > > > +      - 0
> > > > > > +      - The decoding is done per slice. v4l2_ctrl_h264_decode_params->num_slices
> > > > > > +        must be set to 1 and the output buffer should contain only one slice.    
> > > > > 
> > > > > I wonder if we need to be that strict. Wouldn't it be possible for
> > > > > drivers to just iterate over a number of slices and decode each in turn
> > > > > if userspace passed more than one?
> > > > > 
> > > > > Or perhaps a decoder can batch queue a couple of slices. I'm not sure
> > > > > how frequent this is, but consider something like a spike in activity on
> > > > > your system, causing some slices to get delayed so you actually get a
> > > > > few buffered up before you get a chance to hand them to the V4L2 device.
> > > > > Processing them all at once may help conceal the lag.  
> > > > 
> > > > Hm, so we'd be in some kind of slice-batch mode, which means we could
> > > > trigger a decode operation with more than one slice, but not
> > > > necessarily all the slices needed to decode a frame. TBH, supporting
> > > > per-frame (or the batch mode you suggest) on a HW that supports
> > > > per-slice decoding should be pretty simple and has not real impact on
> > > > perfs (as you said, it's just a matter of iterating over all the slices
> > > > attached to a decode operation), so I'm fine relaxing the rule here and
> > > > patching the cedrus driver accordingly (I can't really test the
> > > > changes though). Paul, Maxime, what's your opinion?  

So perhaps we could just allow passing any number of slices with each
request within a frame boundary and have userspace set the
V4L2_BUF_FLAG_M2M_HOLD_CAPTURE_BUF flag accordingly.

I'm not totally sure we need a batching method beyond that, just
submitting requests with groups of slices and the same destination
buffer should work out fine. We can leave it up to the drivers to
handle multiple slices submitted at once.

> > > We could perhaps have a test program to orchestrate such a scenario. I
> > > think the assumption should obviously still be that we don't cross the
> > > frame boundary using slices in one batch.
> > 
> > We should definitely forbid mixing slices of different frames in the
> > same decode operation, since each decode operation is targeting a
> > single capture buffer.

Agreed.

> > > Just that if a frame was made
> > > up of, say, 4 slices and you first pass 3 slices, then 1, that it'd be
> > > nice if the driver would be able to cope with that.
> > 
> > Yep, that makes sense.
> > 
> > > It's something that
> > > could probably even be implemented in the framework as a helper, though
> > > I suspect it'd be just a couple of lines of extra code to wrap a loop
> > > around everything.
> > 
> > I also thought about providing generic wrappers, both for this case and
> > the per-slice -> per-frame case (this one would be a bit more
> > complicated as it implies queuing slices in a bounce buffer and
> > triggering the decode operation only when we have all slices of a
> > frame).
> 
> I like deferring the addition of that kind of helper until a clear
> pattern emerges out of the drivers that need this, just because that
> gives us real examples on which to model those helpers. But yeah, I
> think it should be possible to have helpers for these for most cases.

So it seems that we need some convenient way to iterate over each slice
and configure registers accordingly. Perhaps we could have some common
per-codec helpers with callbacks to set the common controls and iterate
over the per-slice controls.

That would help keep our drivers tidy and understandable, and maybe we
could have start/stop steps too, pretty much like we do in cedrus.

What do you think?

Cheers,

Paul
Paul Kocialkowski June 5, 2019, 8:55 p.m. UTC | #7
Hi,

Le lundi 03 juin 2019 à 13:09 +0200, Boris Brezillon a écrit :
> Some stateless decoders don't support per-slice decoding (or at least
> not in a way that would make them efficient or easy to use).
> Let's expose a menu to control and expose the supported decoding modes.
> Drivers are allowed to support only one decoding but they can support
> both too.
> 
> Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
> ---
>  .../media/uapi/v4l/ext-ctrls-codec.rst        | 42 ++++++++++++++++++-
>  drivers/media/v4l2-core/v4l2-ctrls.c          |  9 ++++
>  include/media/h264-ctrls.h                    | 13 ++++++
>  3 files changed, 63 insertions(+), 1 deletion(-)
> 
> diff --git a/Documentation/media/uapi/v4l/ext-ctrls-codec.rst b/Documentation/media/uapi/v4l/ext-ctrls-codec.rst
> index 82547d5de250..188f625acb7c 100644
> --- a/Documentation/media/uapi/v4l/ext-ctrls-codec.rst
> +++ b/Documentation/media/uapi/v4l/ext-ctrls-codec.rst
> @@ -1748,6 +1748,14 @@ enum v4l2_mpeg_video_h264_hierarchical_coding_type -
>      * - __u32
>        - ``size``
>        -
> +    * - __u32
> +      - ``start_byte_offset``
> +      - Where the slice payload starts in the output buffer. Useful when
> +        operating in per frame decoding mode and decoding multi-slice content.
> +        In this case, the output buffer will contain more than one slice and
> +        some codecs need to know where each slice starts. Note that this
> +        offsets points to the beginning of the slice which is supposed to
> +        contain an ANNEX B start code

Looks good, maybe add a terminating dot.

>      * - __u32
>        - ``header_bit_size``
>        -
> @@ -1931,7 +1939,10 @@ enum v4l2_mpeg_video_h264_hierarchical_coding_type -
>        -
>      * - __u16
>        - ``num_slices``
> -      - Number of slices needed to decode the current frame
> +      - Number of slices needed to decode the current frame/field. When
> +        operating in per-slice decoding mode (see
> +        :c:type:`v4l2_mpeg_video_h264_decoding_mode`), this field
> +        should always be set to one

So maybe we should allow an arbitrary number of slices unless only per-
frame decoding is selected and we need all the slices at once.

>      * - __u16
>        - ``nal_ref_idc``
>        - NAL reference ID value coming from the NAL Unit header
> @@ -2022,6 +2033,35 @@ enum v4l2_mpeg_video_h264_hierarchical_coding_type -
>        - 0x00000004
>        - The DPB entry is a long term reference frame
>  
> +``V4L2_CID_MPEG_VIDEO_H264_DECODING_MODE (enum)``
> +    Specifies the decoding mode to use. Currently exposes per slice and per
> +    frame decoding but new modes might be added later on.

I think it definitey makes sense to have this per-codec.

> +
> +    .. note::
> +
> +       This menu control is not yet part of the public kernel API and
> +       it is expected to change.
> +
> +.. c:type:: v4l2_mpeg_video_h264_decoding_mode
> +
> +.. cssclass:: longtable
> +
> +.. flat-table::
> +    :header-rows:  0
> +    :stub-columns: 0
> +    :widths:       1 1 2
> +
> +    * - ``V4L2_MPEG_VIDEO_H264_DECODING_PER_SLICE``
> +      - 0
> +      - The decoding is done per slice. v4l2_ctrl_h264_decode_params->num_slices
> +        must be set to 1 and the output buffer should contain only one slice.

See above about having arbitrary numbers of slices (within frame
boundary).

> +    * - ``V4L2_MPEG_VIDEO_H264_DECODING_PER_FRAME``
> +      - 1
> +      - The decoding is done per frame. v4l2_ctrl_h264_decode_params->num_slices
> +        can be > 1. When that happens, the output buffer should contain all
> +        slices needed to decode a frame/field, each slice being prefixed by an
> +        Annex B NAL header/start-code.

Looks good!

> +
>  .. _v4l2-mpeg-mpeg2:
>  
>  ``V4L2_CID_MPEG_VIDEO_MPEG2_SLICE_PARAMS (struct)``
> diff --git a/drivers/media/v4l2-core/v4l2-ctrls.c b/drivers/media/v4l2-core/v4l2-ctrls.c
> index 1217d38ea394..72bb3c8882f5 100644
> --- a/drivers/media/v4l2-core/v4l2-ctrls.c
> +++ b/drivers/media/v4l2-core/v4l2-ctrls.c
> @@ -406,6 +406,11 @@ const char * const *v4l2_ctrl_get_menu(u32 id)
>  		"Explicit",
>  		NULL,
>  	};
> +	static const char * const h264_decoding_mode[] = {
> +		"Per Slice",
> +		"Per Frame",
> +		NULL,
> +	};
>  	static const char * const mpeg_mpeg2_level[] = {
>  		"Low",
>  		"Main",
> @@ -637,6 +642,8 @@ const char * const *v4l2_ctrl_get_menu(u32 id)
>  		return h264_fp_arrangement_type;
>  	case V4L2_CID_MPEG_VIDEO_H264_FMO_MAP_TYPE:
>  		return h264_fmo_map_type;
> +	case V4L2_CID_MPEG_VIDEO_H264_DECODING_MODE:
> +		return h264_decoding_mode;
>  	case V4L2_CID_MPEG_VIDEO_MPEG2_LEVEL:
>  		return mpeg_mpeg2_level;
>  	case V4L2_CID_MPEG_VIDEO_MPEG2_PROFILE:
> @@ -856,6 +863,7 @@ const char *v4l2_ctrl_get_name(u32 id)
>  	case V4L2_CID_MPEG_VIDEO_H264_SCALING_MATRIX:		return "H264 Scaling Matrix";
>  	case V4L2_CID_MPEG_VIDEO_H264_SLICE_PARAMS:		return "H264 Slice Parameters";
>  	case V4L2_CID_MPEG_VIDEO_H264_DECODE_PARAMS:		return "H264 Decode Parameters";
> +	case V4L2_CID_MPEG_VIDEO_H264_DECODING_MODE:		return "H264 Decoding Mode";
>  	case V4L2_CID_MPEG_VIDEO_MPEG2_LEVEL:			return "MPEG2 Level";
>  	case V4L2_CID_MPEG_VIDEO_MPEG2_PROFILE:			return "MPEG2 Profile";
>  	case V4L2_CID_MPEG_VIDEO_MPEG4_I_FRAME_QP:		return "MPEG4 I-Frame QP Value";
> @@ -1224,6 +1232,7 @@ void v4l2_ctrl_fill(u32 id, const char **name, enum v4l2_ctrl_type *type,
>  	case V4L2_CID_MPEG_VIDEO_H264_VUI_SAR_IDC:
>  	case V4L2_CID_MPEG_VIDEO_H264_SEI_FP_ARRANGEMENT_TYPE:
>  	case V4L2_CID_MPEG_VIDEO_H264_FMO_MAP_TYPE:
> +	case V4L2_CID_MPEG_VIDEO_H264_DECODING_MODE:
>  	case V4L2_CID_MPEG_VIDEO_MPEG2_LEVEL:
>  	case V4L2_CID_MPEG_VIDEO_MPEG2_PROFILE:
>  	case V4L2_CID_MPEG_VIDEO_MPEG4_LEVEL:
> diff --git a/include/media/h264-ctrls.h b/include/media/h264-ctrls.h
> index e1404d78d6ff..26de2243f6f5 100644
> --- a/include/media/h264-ctrls.h
> +++ b/include/media/h264-ctrls.h
> @@ -26,6 +26,7 @@
>  #define V4L2_CID_MPEG_VIDEO_H264_SCALING_MATRIX	(V4L2_CID_MPEG_BASE+1002)
>  #define V4L2_CID_MPEG_VIDEO_H264_SLICE_PARAMS	(V4L2_CID_MPEG_BASE+1003)
>  #define V4L2_CID_MPEG_VIDEO_H264_DECODE_PARAMS	(V4L2_CID_MPEG_BASE+1004)
> +#define V4L2_CID_MPEG_VIDEO_H264_DECODING_MODE	(V4L2_CID_MPEG_BASE+1005)
>  
>  /* enum v4l2_ctrl_type type values */
>  #define V4L2_CTRL_TYPE_H264_SPS			0x0110
> @@ -33,6 +34,12 @@
>  #define V4L2_CTRL_TYPE_H264_SCALING_MATRIX	0x0112
>  #define V4L2_CTRL_TYPE_H264_SLICE_PARAMS	0x0113
>  #define V4L2_CTRL_TYPE_H264_DECODE_PARAMS	0x0114
> +#define V4L2_CTRL_TYPE_H264_DECODING_MODE	0x0115
> +
> +enum v4l2_mpeg_video_h264_decoding_mode {
> +	V4L2_MPEG_VIDEO_H264_DECODING_PER_SLICE,
> +	V4L2_MPEG_VIDEO_H264_DECODING_PER_FRAME,
> +};
>  
>  #define V4L2_H264_SPS_CONSTRAINT_SET0_FLAG			0x01
>  #define V4L2_H264_SPS_CONSTRAINT_SET1_FLAG			0x02
> @@ -111,6 +118,8 @@ struct v4l2_h264_pred_weight_table {
>  	struct v4l2_h264_weight_factors weight_factors[2];
>  };
>  
> +#define V4L2_H264_MAX_SLICES_PER_FRAME			16
> +
>  #define V4L2_H264_SLICE_TYPE_P				0
>  #define V4L2_H264_SLICE_TYPE_B				1
>  #define V4L2_H264_SLICE_TYPE_I				2
> @@ -125,6 +134,10 @@ struct v4l2_h264_pred_weight_table {
>  struct v4l2_ctrl_h264_slice_params {
>  	/* Size in bytes, including header */
>  	__u32 size;
> +
> +	/* Where the slice starts in the output buffer (expressed in bytes). */

Maybe call it "beginning of the slice" or adapt the following comment
to use the same terminology.

Looks good otherwise, thanks!

Cheers,

Paul

> +	__u32 start_byte_offset;
> +
>  	/* Offset in bits to slice_data() from the beginning of this slice. */
>  	__u32 header_bit_size;
>
Boris Brezillon June 6, 2019, 6:55 a.m. UTC | #8
On Wed, 05 Jun 2019 22:48:08 +0200
Paul Kocialkowski <paul.kocialkowski@bootlin.com> wrote:

> Hi,
> 
> Le mardi 04 juin 2019 à 10:16 +0200, Thierry Reding a écrit :
> > On Mon, Jun 03, 2019 at 05:37:11PM +0200, Boris Brezillon wrote:  
> > > On Mon, 3 Jun 2019 16:05:26 +0200
> > > Thierry Reding <thierry.reding@gmail.com> wrote:
> > >   
> > > > On Mon, Jun 03, 2019 at 02:51:13PM +0200, Boris Brezillon wrote:  
> > > > > +Maxime
> > > > > 
> > > > > Oops, just realized Maxime was not Cc-ed on this series.
> > > > > 
> > > > > On Mon, 3 Jun 2019 14:30:20 +0200
> > > > > Thierry Reding <thierry.reding@gmail.com> wrote:
> > > > >     
> > > > > > On Mon, Jun 03, 2019 at 01:09:42PM +0200, Boris Brezillon wrote:    
> > > > > > > Some stateless decoders don't support per-slice decoding (or at least
> > > > > > > not in a way that would make them efficient or easy to use).
> > > > > > > Let's expose a menu to control and expose the supported decoding modes.
> > > > > > > Drivers are allowed to support only one decoding but they can support
> > > > > > > both too.
> > > > > > > 
> > > > > > > Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
> > > > > > > ---
> > > > > > >  .../media/uapi/v4l/ext-ctrls-codec.rst        | 42 ++++++++++++++++++-
> > > > > > >  drivers/media/v4l2-core/v4l2-ctrls.c          |  9 ++++
> > > > > > >  include/media/h264-ctrls.h                    | 13 ++++++
> > > > > > >  3 files changed, 63 insertions(+), 1 deletion(-)
> > > > > > > 
> > > > > > > diff --git a/Documentation/media/uapi/v4l/ext-ctrls-codec.rst b/Documentation/media/uapi/v4l/ext-ctrls-codec.rst
> > > > > > > index 82547d5de250..188f625acb7c 100644
> > > > > > > --- a/Documentation/media/uapi/v4l/ext-ctrls-codec.rst
> > > > > > > +++ b/Documentation/media/uapi/v4l/ext-ctrls-codec.rst
> > > > > > > @@ -1748,6 +1748,14 @@ enum v4l2_mpeg_video_h264_hierarchical_coding_type -
> > > > > > >      * - __u32
> > > > > > >        - ``size``
> > > > > > >        -
> > > > > > > +    * - __u32
> > > > > > > +      - ``start_byte_offset``
> > > > > > > +      - Where the slice payload starts in the output buffer. Useful when
> > > > > > > +        operating in per frame decoding mode and decoding multi-slice content.
> > > > > > > +        In this case, the output buffer will contain more than one slice and
> > > > > > > +        some codecs need to know where each slice starts. Note that this
> > > > > > > +        offsets points to the beginning of the slice which is supposed to
> > > > > > > +        contain an ANNEX B start code
> > > > > > >      * - __u32
> > > > > > >        - ``header_bit_size``
> > > > > > >        -
> > > > > > > @@ -1931,7 +1939,10 @@ enum v4l2_mpeg_video_h264_hierarchical_coding_type -
> > > > > > >        -
> > > > > > >      * - __u16
> > > > > > >        - ``num_slices``
> > > > > > > -      - Number of slices needed to decode the current frame
> > > > > > > +      - Number of slices needed to decode the current frame/field. When
> > > > > > > +        operating in per-slice decoding mode (see
> > > > > > > +        :c:type:`v4l2_mpeg_video_h264_decoding_mode`), this field
> > > > > > > +        should always be set to one
> > > > > > >      * - __u16
> > > > > > >        - ``nal_ref_idc``
> > > > > > >        - NAL reference ID value coming from the NAL Unit header
> > > > > > > @@ -2022,6 +2033,35 @@ enum v4l2_mpeg_video_h264_hierarchical_coding_type -
> > > > > > >        - 0x00000004
> > > > > > >        - The DPB entry is a long term reference frame
> > > > > > >  
> > > > > > > +``V4L2_CID_MPEG_VIDEO_H264_DECODING_MODE (enum)``
> > > > > > > +    Specifies the decoding mode to use. Currently exposes per slice and per
> > > > > > > +    frame decoding but new modes might be added later on.
> > > > > > > +
> > > > > > > +    .. note::
> > > > > > > +
> > > > > > > +       This menu control is not yet part of the public kernel API and
> > > > > > > +       it is expected to change.
> > > > > > > +
> > > > > > > +.. c:type:: v4l2_mpeg_video_h264_decoding_mode
> > > > > > > +
> > > > > > > +.. cssclass:: longtable
> > > > > > > +
> > > > > > > +.. flat-table::
> > > > > > > +    :header-rows:  0
> > > > > > > +    :stub-columns: 0
> > > > > > > +    :widths:       1 1 2
> > > > > > > +
> > > > > > > +    * - ``V4L2_MPEG_VIDEO_H264_DECODING_PER_SLICE``
> > > > > > > +      - 0
> > > > > > > +      - The decoding is done per slice. v4l2_ctrl_h264_decode_params->num_slices
> > > > > > > +        must be set to 1 and the output buffer should contain only one slice.      
> > > > > > 
> > > > > > I wonder if we need to be that strict. Wouldn't it be possible for
> > > > > > drivers to just iterate over a number of slices and decode each in turn
> > > > > > if userspace passed more than one?
> > > > > > 
> > > > > > Or perhaps a decoder can batch queue a couple of slices. I'm not sure
> > > > > > how frequent this is, but consider something like a spike in activity on
> > > > > > your system, causing some slices to get delayed so you actually get a
> > > > > > few buffered up before you get a chance to hand them to the V4L2 device.
> > > > > > Processing them all at once may help conceal the lag.    
> > > > > 
> > > > > Hm, so we'd be in some kind of slice-batch mode, which means we could
> > > > > trigger a decode operation with more than one slice, but not
> > > > > necessarily all the slices needed to decode a frame. TBH, supporting
> > > > > per-frame (or the batch mode you suggest) on a HW that supports
> > > > > per-slice decoding should be pretty simple and has not real impact on
> > > > > perfs (as you said, it's just a matter of iterating over all the slices
> > > > > attached to a decode operation), so I'm fine relaxing the rule here and
> > > > > patching the cedrus driver accordingly (I can't really test the
> > > > > changes though). Paul, Maxime, what's your opinion?    
> 
> So perhaps we could just allow passing any number of slices with each
> request within a frame boundary and have userspace set the
> V4L2_BUF_FLAG_M2M_HOLD_CAPTURE_BUF flag accordingly.
> 
> I'm not totally sure we need a batching method beyond that, just
> submitting requests with groups of slices and the same destination
> buffer should work out fine. We can leave it up to the drivers to
> handle multiple slices submitted at once.

The batching mode we're talking about is exactly that: we let drivers
collect slices and send them through a single decode operation.

> 
> > > > We could perhaps have a test program to orchestrate such a scenario. I
> > > > think the assumption should obviously still be that we don't cross the
> > > > frame boundary using slices in one batch.  
> > > 
> > > We should definitely forbid mixing slices of different frames in the
> > > same decode operation, since each decode operation is targeting a
> > > single capture buffer.  
> 
> Agreed.
> 
> > > > Just that if a frame was made
> > > > up of, say, 4 slices and you first pass 3 slices, then 1, that it'd be
> > > > nice if the driver would be able to cope with that.  
> > > 
> > > Yep, that makes sense.
> > >   
> > > > It's something that
> > > > could probably even be implemented in the framework as a helper, though
> > > > I suspect it'd be just a couple of lines of extra code to wrap a loop
> > > > around everything.  
> > > 
> > > I also thought about providing generic wrappers, both for this case and
> > > the per-slice -> per-frame case (this one would be a bit more
> > > complicated as it implies queuing slices in a bounce buffer and
> > > triggering the decode operation only when we have all slices of a
> > > frame).  
> > 
> > I like deferring the addition of that kind of helper until a clear
> > pattern emerges out of the drivers that need this, just because that
> > gives us real examples on which to model those helpers. But yeah, I
> > think it should be possible to have helpers for these for most cases.  
> 
> So it seems that we need some convenient way to iterate over each slice
> and configure registers accordingly. Perhaps we could have some common
> per-codec helpers with callbacks to set the common controls and iterate
> over the per-slice controls.

Yep, I'm working on extracting common bits out of cedrus/hantro
drivers, that new helper (to iterate over all slices) can be part of
this series.

> 
> That would help keep our drivers tidy and understandable, and maybe we
> could have start/stop steps too, pretty much like we do in cedrus.
> 
> What do you think?

Makes total sense, and I'm all for having generic code provided as
vb2-m2m/codec helpers.
diff mbox series

Patch

diff --git a/Documentation/media/uapi/v4l/ext-ctrls-codec.rst b/Documentation/media/uapi/v4l/ext-ctrls-codec.rst
index 82547d5de250..188f625acb7c 100644
--- a/Documentation/media/uapi/v4l/ext-ctrls-codec.rst
+++ b/Documentation/media/uapi/v4l/ext-ctrls-codec.rst
@@ -1748,6 +1748,14 @@  enum v4l2_mpeg_video_h264_hierarchical_coding_type -
     * - __u32
       - ``size``
       -
+    * - __u32
+      - ``start_byte_offset``
+      - Where the slice payload starts in the output buffer. Useful when
+        operating in per frame decoding mode and decoding multi-slice content.
+        In this case, the output buffer will contain more than one slice and
+        some codecs need to know where each slice starts. Note that this
+        offsets points to the beginning of the slice which is supposed to
+        contain an ANNEX B start code
     * - __u32
       - ``header_bit_size``
       -
@@ -1931,7 +1939,10 @@  enum v4l2_mpeg_video_h264_hierarchical_coding_type -
       -
     * - __u16
       - ``num_slices``
-      - Number of slices needed to decode the current frame
+      - Number of slices needed to decode the current frame/field. When
+        operating in per-slice decoding mode (see
+        :c:type:`v4l2_mpeg_video_h264_decoding_mode`), this field
+        should always be set to one
     * - __u16
       - ``nal_ref_idc``
       - NAL reference ID value coming from the NAL Unit header
@@ -2022,6 +2033,35 @@  enum v4l2_mpeg_video_h264_hierarchical_coding_type -
       - 0x00000004
       - The DPB entry is a long term reference frame
 
+``V4L2_CID_MPEG_VIDEO_H264_DECODING_MODE (enum)``
+    Specifies the decoding mode to use. Currently exposes per slice and per
+    frame decoding but new modes might be added later on.
+
+    .. note::
+
+       This menu control is not yet part of the public kernel API and
+       it is expected to change.
+
+.. c:type:: v4l2_mpeg_video_h264_decoding_mode
+
+.. cssclass:: longtable
+
+.. flat-table::
+    :header-rows:  0
+    :stub-columns: 0
+    :widths:       1 1 2
+
+    * - ``V4L2_MPEG_VIDEO_H264_DECODING_PER_SLICE``
+      - 0
+      - The decoding is done per slice. v4l2_ctrl_h264_decode_params->num_slices
+        must be set to 1 and the output buffer should contain only one slice.
+    * - ``V4L2_MPEG_VIDEO_H264_DECODING_PER_FRAME``
+      - 1
+      - The decoding is done per frame. v4l2_ctrl_h264_decode_params->num_slices
+        can be > 1. When that happens, the output buffer should contain all
+        slices needed to decode a frame/field, each slice being prefixed by an
+        Annex B NAL header/start-code.
+
 .. _v4l2-mpeg-mpeg2:
 
 ``V4L2_CID_MPEG_VIDEO_MPEG2_SLICE_PARAMS (struct)``
diff --git a/drivers/media/v4l2-core/v4l2-ctrls.c b/drivers/media/v4l2-core/v4l2-ctrls.c
index 1217d38ea394..72bb3c8882f5 100644
--- a/drivers/media/v4l2-core/v4l2-ctrls.c
+++ b/drivers/media/v4l2-core/v4l2-ctrls.c
@@ -406,6 +406,11 @@  const char * const *v4l2_ctrl_get_menu(u32 id)
 		"Explicit",
 		NULL,
 	};
+	static const char * const h264_decoding_mode[] = {
+		"Per Slice",
+		"Per Frame",
+		NULL,
+	};
 	static const char * const mpeg_mpeg2_level[] = {
 		"Low",
 		"Main",
@@ -637,6 +642,8 @@  const char * const *v4l2_ctrl_get_menu(u32 id)
 		return h264_fp_arrangement_type;
 	case V4L2_CID_MPEG_VIDEO_H264_FMO_MAP_TYPE:
 		return h264_fmo_map_type;
+	case V4L2_CID_MPEG_VIDEO_H264_DECODING_MODE:
+		return h264_decoding_mode;
 	case V4L2_CID_MPEG_VIDEO_MPEG2_LEVEL:
 		return mpeg_mpeg2_level;
 	case V4L2_CID_MPEG_VIDEO_MPEG2_PROFILE:
@@ -856,6 +863,7 @@  const char *v4l2_ctrl_get_name(u32 id)
 	case V4L2_CID_MPEG_VIDEO_H264_SCALING_MATRIX:		return "H264 Scaling Matrix";
 	case V4L2_CID_MPEG_VIDEO_H264_SLICE_PARAMS:		return "H264 Slice Parameters";
 	case V4L2_CID_MPEG_VIDEO_H264_DECODE_PARAMS:		return "H264 Decode Parameters";
+	case V4L2_CID_MPEG_VIDEO_H264_DECODING_MODE:		return "H264 Decoding Mode";
 	case V4L2_CID_MPEG_VIDEO_MPEG2_LEVEL:			return "MPEG2 Level";
 	case V4L2_CID_MPEG_VIDEO_MPEG2_PROFILE:			return "MPEG2 Profile";
 	case V4L2_CID_MPEG_VIDEO_MPEG4_I_FRAME_QP:		return "MPEG4 I-Frame QP Value";
@@ -1224,6 +1232,7 @@  void v4l2_ctrl_fill(u32 id, const char **name, enum v4l2_ctrl_type *type,
 	case V4L2_CID_MPEG_VIDEO_H264_VUI_SAR_IDC:
 	case V4L2_CID_MPEG_VIDEO_H264_SEI_FP_ARRANGEMENT_TYPE:
 	case V4L2_CID_MPEG_VIDEO_H264_FMO_MAP_TYPE:
+	case V4L2_CID_MPEG_VIDEO_H264_DECODING_MODE:
 	case V4L2_CID_MPEG_VIDEO_MPEG2_LEVEL:
 	case V4L2_CID_MPEG_VIDEO_MPEG2_PROFILE:
 	case V4L2_CID_MPEG_VIDEO_MPEG4_LEVEL:
diff --git a/include/media/h264-ctrls.h b/include/media/h264-ctrls.h
index e1404d78d6ff..26de2243f6f5 100644
--- a/include/media/h264-ctrls.h
+++ b/include/media/h264-ctrls.h
@@ -26,6 +26,7 @@ 
 #define V4L2_CID_MPEG_VIDEO_H264_SCALING_MATRIX	(V4L2_CID_MPEG_BASE+1002)
 #define V4L2_CID_MPEG_VIDEO_H264_SLICE_PARAMS	(V4L2_CID_MPEG_BASE+1003)
 #define V4L2_CID_MPEG_VIDEO_H264_DECODE_PARAMS	(V4L2_CID_MPEG_BASE+1004)
+#define V4L2_CID_MPEG_VIDEO_H264_DECODING_MODE	(V4L2_CID_MPEG_BASE+1005)
 
 /* enum v4l2_ctrl_type type values */
 #define V4L2_CTRL_TYPE_H264_SPS			0x0110
@@ -33,6 +34,12 @@ 
 #define V4L2_CTRL_TYPE_H264_SCALING_MATRIX	0x0112
 #define V4L2_CTRL_TYPE_H264_SLICE_PARAMS	0x0113
 #define V4L2_CTRL_TYPE_H264_DECODE_PARAMS	0x0114
+#define V4L2_CTRL_TYPE_H264_DECODING_MODE	0x0115
+
+enum v4l2_mpeg_video_h264_decoding_mode {
+	V4L2_MPEG_VIDEO_H264_DECODING_PER_SLICE,
+	V4L2_MPEG_VIDEO_H264_DECODING_PER_FRAME,
+};
 
 #define V4L2_H264_SPS_CONSTRAINT_SET0_FLAG			0x01
 #define V4L2_H264_SPS_CONSTRAINT_SET1_FLAG			0x02
@@ -111,6 +118,8 @@  struct v4l2_h264_pred_weight_table {
 	struct v4l2_h264_weight_factors weight_factors[2];
 };
 
+#define V4L2_H264_MAX_SLICES_PER_FRAME			16
+
 #define V4L2_H264_SLICE_TYPE_P				0
 #define V4L2_H264_SLICE_TYPE_B				1
 #define V4L2_H264_SLICE_TYPE_I				2
@@ -125,6 +134,10 @@  struct v4l2_h264_pred_weight_table {
 struct v4l2_ctrl_h264_slice_params {
 	/* Size in bytes, including header */
 	__u32 size;
+
+	/* Where the slice starts in the output buffer (expressed in bytes). */
+	__u32 start_byte_offset;
+
 	/* Offset in bits to slice_data() from the beginning of this slice. */
 	__u32 header_bit_size;