diff mbox

[14/17] drm/msm: add atomic support

Message ID 1400956226-28053-15-git-send-email-robdclark@gmail.com (mailing list archive)
State Accepted
Headers show

Commit Message

Rob Clark May 24, 2014, 6:30 p.m. UTC
Signed-off-by: Rob Clark <robdclark@gmail.com>
---
 drivers/gpu/drm/msm/Makefile              |   1 +
 drivers/gpu/drm/msm/mdp/mdp4/mdp4_crtc.c  |  57 ++++++------
 drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.c   |   6 ++
 drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.h   |   1 +
 drivers/gpu/drm/msm/mdp/mdp4/mdp4_plane.c |   5 --
 drivers/gpu/drm/msm/mdp/mdp5/mdp5_crtc.c  |  56 ++++++------
 drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.c   |   6 ++
 drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.h   |   2 +-
 drivers/gpu/drm/msm/mdp/mdp5/mdp5_plane.c |   5 --
 drivers/gpu/drm/msm/msm_atomic.c          | 141 ++++++++++++++++++++++++++++++
 drivers/gpu/drm/msm/msm_drv.c             |  22 ++++-
 drivers/gpu/drm/msm/msm_drv.h             |   7 ++
 drivers/gpu/drm/msm/msm_gem.c             |  24 +----
 drivers/gpu/drm/msm/msm_gem.h             |  13 +++
 drivers/gpu/drm/msm/msm_kms.h             |   1 +
 15 files changed, 253 insertions(+), 94 deletions(-)
 create mode 100644 drivers/gpu/drm/msm/msm_atomic.c

Comments

Daniel Vetter May 26, 2014, 5:54 p.m. UTC | #1
On Sat, May 24, 2014 at 02:30:23PM -0400, Rob Clark wrote:
> Signed-off-by: Rob Clark <robdclark@gmail.com>
> ---
>  drivers/gpu/drm/msm/Makefile              |   1 +
>  drivers/gpu/drm/msm/mdp/mdp4/mdp4_crtc.c  |  57 ++++++------
>  drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.c   |   6 ++
>  drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.h   |   1 +
>  drivers/gpu/drm/msm/mdp/mdp4/mdp4_plane.c |   5 --
>  drivers/gpu/drm/msm/mdp/mdp5/mdp5_crtc.c  |  56 ++++++------
>  drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.c   |   6 ++
>  drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.h   |   2 +-
>  drivers/gpu/drm/msm/mdp/mdp5/mdp5_plane.c |   5 --
>  drivers/gpu/drm/msm/msm_atomic.c          | 141 ++++++++++++++++++++++++++++++
>  drivers/gpu/drm/msm/msm_drv.c             |  22 ++++-
>  drivers/gpu/drm/msm/msm_drv.h             |   7 ++
>  drivers/gpu/drm/msm/msm_gem.c             |  24 +----
>  drivers/gpu/drm/msm/msm_gem.h             |  13 +++
>  drivers/gpu/drm/msm/msm_kms.h             |   1 +
>  15 files changed, 253 insertions(+), 94 deletions(-)
>  create mode 100644 drivers/gpu/drm/msm/msm_atomic.c
> 
> diff --git a/drivers/gpu/drm/msm/Makefile b/drivers/gpu/drm/msm/Makefile
> index 5e1e6b0..dd12f56 100644
> --- a/drivers/gpu/drm/msm/Makefile
> +++ b/drivers/gpu/drm/msm/Makefile
> @@ -27,6 +27,7 @@ msm-y := \
>  	mdp/mdp5/mdp5_kms.o \
>  	mdp/mdp5/mdp5_plane.o \
>  	mdp/mdp5/mdp5_smp.o \
> +	msm_atomic.o \
>  	msm_drv.o \
>  	msm_fb.o \
>  	msm_gem.o \
> diff --git a/drivers/gpu/drm/msm/mdp/mdp4/mdp4_crtc.c b/drivers/gpu/drm/msm/mdp/mdp4/mdp4_crtc.c
> index d0d8befd..2fa6b75 100644
> --- a/drivers/gpu/drm/msm/mdp/mdp4/mdp4_crtc.c
> +++ b/drivers/gpu/drm/msm/mdp/mdp4/mdp4_crtc.c
> @@ -52,7 +52,6 @@ struct mdp4_crtc {
>  
>  	/* if there is a pending flip, these will be non-null: */
>  	struct drm_pending_vblank_event *event;
> -	struct msm_fence_cb pageflip_cb;
>  
>  #define PENDING_CURSOR 0x1
>  #define PENDING_FLIP   0x2
> @@ -93,12 +92,16 @@ static void request_pending(struct drm_crtc *crtc, uint32_t pending)
>  	mdp_irq_register(&get_kms(crtc)->base, &mdp4_crtc->vblank);
>  }
>  
> -static void crtc_flush(struct drm_crtc *crtc)
> +void mdp4_crtc_flush(struct drm_crtc *crtc)
>  {
> +	struct msm_drm_private *priv = crtc->dev->dev_private;
>  	struct mdp4_crtc *mdp4_crtc = to_mdp4_crtc(crtc);
>  	struct mdp4_kms *mdp4_kms = get_kms(crtc);
>  	uint32_t i, flush = 0;
>  
> +	if (priv->pending_crtcs & (1 << crtc->id))
> +		return;
> +
>  	for (i = 0; i < ARRAY_SIZE(mdp4_crtc->planes); i++) {
>  		struct drm_plane *plane = mdp4_crtc->planes[i];
>  		if (plane) {
> @@ -142,7 +145,7 @@ static void update_scanout(struct drm_crtc *crtc, struct drm_framebuffer *fb)
>  	 * so that we can safely queue unref to current fb (ie. next
>  	 * vblank we know hw is done w/ previous scanout_fb).
>  	 */
> -	crtc_flush(crtc);
> +	mdp4_crtc_flush(crtc);
>  
>  	if (mdp4_crtc->scanout_fb)
>  		drm_flip_work_queue(&mdp4_crtc->unref_fb_work,
> @@ -177,21 +180,6 @@ static void complete_flip(struct drm_crtc *crtc, struct drm_file *file)
>  	spin_unlock_irqrestore(&dev->event_lock, flags);
>  }
>  
> -static void pageflip_cb(struct msm_fence_cb *cb)
> -{
> -	struct mdp4_crtc *mdp4_crtc =
> -		container_of(cb, struct mdp4_crtc, pageflip_cb);
> -	struct drm_crtc *crtc = &mdp4_crtc->base;
> -	struct drm_framebuffer *fb = crtc->primary->fb;
> -
> -	if (!fb)
> -		return;
> -
> -	drm_framebuffer_reference(fb);
> -	mdp4_plane_set_scanout(mdp4_crtc->plane, fb);
> -	update_scanout(crtc, fb);
> -}
> -
>  static void unref_fb_worker(struct drm_flip_work *work, void *val)
>  {
>  	struct mdp4_crtc *mdp4_crtc =
> @@ -406,7 +394,7 @@ static void mdp4_crtc_prepare(struct drm_crtc *crtc)
>  static void mdp4_crtc_commit(struct drm_crtc *crtc)
>  {
>  	mdp4_crtc_dpms(crtc, DRM_MODE_DPMS_ON);
> -	crtc_flush(crtc);
> +	mdp4_crtc_flush(crtc);
>  	/* drop the ref to mdp clk's that we got in prepare: */
>  	mdp4_disable(get_kms(crtc));
>  }
> @@ -448,23 +436,27 @@ static int mdp4_crtc_page_flip(struct drm_crtc *crtc,
>  {
>  	struct mdp4_crtc *mdp4_crtc = to_mdp4_crtc(crtc);
>  	struct drm_device *dev = crtc->dev;
> -	struct drm_gem_object *obj;
>  	unsigned long flags;
>  
> +	spin_lock_irqsave(&dev->event_lock, flags);
>  	if (mdp4_crtc->event) {
>  		dev_err(dev->dev, "already pending flip!\n");
> +		spin_unlock_irqrestore(&dev->event_lock, flags);
>  		return -EBUSY;
>  	}
>  
> -	obj = msm_framebuffer_bo(new_fb, 0);
> -
> -	spin_lock_irqsave(&dev->event_lock, flags);
>  	mdp4_crtc->event = event;
>  	spin_unlock_irqrestore(&dev->event_lock, flags);
>  
>  	update_fb(crtc, new_fb);
>  
> -	return msm_gem_queue_inactive_cb(obj, &mdp4_crtc->pageflip_cb);
> +
> +	/* grab extra ref for update_scanout() */
> +	drm_framebuffer_reference(new_fb);
> +	mdp4_plane_set_scanout(mdp4_crtc->plane, new_fb);
> +	update_scanout(crtc, new_fb);
> +
> +	return 0;
>  }
>  
>  static int mdp4_crtc_set_property(struct drm_crtc *crtc,
> @@ -598,7 +590,7 @@ static int mdp4_crtc_cursor_move(struct drm_crtc *crtc, int x, int y)
>  	mdp4_crtc->cursor.y = y;
>  	spin_unlock_irqrestore(&mdp4_crtc->cursor.lock, flags);
>  
> -	crtc_flush(crtc);
> +	mdp4_crtc_flush(crtc);
>  	request_pending(crtc, PENDING_CURSOR);
>  
>  	return 0;
> @@ -635,8 +627,13 @@ static void mdp4_crtc_vblank_irq(struct mdp_irq *irq, uint32_t irqstatus)
>  	pending = atomic_xchg(&mdp4_crtc->pending, 0);
>  
>  	if (pending & PENDING_FLIP) {
> -		complete_flip(crtc, NULL);
> -		drm_flip_work_commit(&mdp4_crtc->unref_fb_work, priv->wq);
> +		if (priv->pending_crtcs & (1 << crtc->id)) {
> +			/* our update hasn't been flushed yet: */
> +			request_pending(crtc, PENDING_FLIP);
> +		} else {
> +			complete_flip(crtc, NULL);
> +			drm_flip_work_commit(&mdp4_crtc->unref_fb_work, priv->wq);
> +		}
>  	}
>  
>  	if (pending & PENDING_CURSOR) {
> @@ -650,7 +647,7 @@ static void mdp4_crtc_err_irq(struct mdp_irq *irq, uint32_t irqstatus)
>  	struct mdp4_crtc *mdp4_crtc = container_of(irq, struct mdp4_crtc, err);
>  	struct drm_crtc *crtc = &mdp4_crtc->base;
>  	DBG("%s: error: %08x", mdp4_crtc->name, irqstatus);
> -	crtc_flush(crtc);
> +	mdp4_crtc_flush(crtc);
>  }
>  
>  uint32_t mdp4_crtc_vblank(struct drm_crtc *crtc)
> @@ -730,7 +727,7 @@ static void set_attach(struct drm_crtc *crtc, enum mdp4_pipe pipe_id,
>  	mdp4_crtc->planes[pipe_id] = plane;
>  	blend_setup(crtc);
>  	if (mdp4_crtc->enabled && (plane != mdp4_crtc->plane))
> -		crtc_flush(crtc);
> +		mdp4_crtc_flush(crtc);
>  }
>  
>  void mdp4_crtc_attach(struct drm_crtc *crtc, struct drm_plane *plane)
> @@ -792,8 +789,6 @@ struct drm_crtc *mdp4_crtc_init(struct drm_device *dev,
>  	ret = drm_flip_work_init(&mdp4_crtc->unref_cursor_work, 64,
>  			"unref cursor", unref_cursor_worker);
>  
> -	INIT_FENCE_CB(&mdp4_crtc->pageflip_cb, pageflip_cb);
> -
>  	drm_crtc_init_with_planes(dev, crtc, plane, NULL, &mdp4_crtc_funcs);
>  	drm_crtc_helper_add(crtc, &mdp4_crtc_helper_funcs);
>  
> diff --git a/drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.c b/drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.c
> index 0bb4faa..af0bfb1 100644
> --- a/drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.c
> +++ b/drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.c
> @@ -131,6 +131,11 @@ static long mdp4_round_pixclk(struct msm_kms *kms, unsigned long rate,
>  	return mdp4_dtv_round_pixclk(encoder, rate);
>  }
>  
> +static void mdp4_flush(struct msm_kms *kms, struct drm_crtc *crtc)
> +{
> +	mdp4_crtc_flush(crtc);
> +}
> +
>  static void mdp4_preclose(struct msm_kms *kms, struct drm_file *file)
>  {
>  	struct mdp4_kms *mdp4_kms = to_mdp4_kms(to_mdp_kms(kms));
> @@ -162,6 +167,7 @@ static const struct mdp_kms_funcs kms_funcs = {
>  		.disable_vblank  = mdp4_disable_vblank,
>  		.get_format      = mdp_get_format,
>  		.round_pixclk    = mdp4_round_pixclk,
> +		.flush           = mdp4_flush,
>  		.preclose        = mdp4_preclose,
>  		.destroy         = mdp4_destroy,
>  	},
> diff --git a/drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.h b/drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.h
> index 715520c5..834454c 100644
> --- a/drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.h
> +++ b/drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.h
> @@ -184,6 +184,7 @@ enum mdp4_pipe mdp4_plane_pipe(struct drm_plane *plane);
>  struct drm_plane *mdp4_plane_init(struct drm_device *dev,
>  		enum mdp4_pipe pipe_id, bool private_plane);
>  
> +void mdp4_crtc_flush(struct drm_crtc *crtc);
>  uint32_t mdp4_crtc_vblank(struct drm_crtc *crtc);
>  void mdp4_crtc_cancel_pending_flip(struct drm_crtc *crtc, struct drm_file *file);
>  void mdp4_crtc_set_config(struct drm_crtc *crtc, uint32_t config);
> diff --git a/drivers/gpu/drm/msm/mdp/mdp4/mdp4_plane.c b/drivers/gpu/drm/msm/mdp/mdp4/mdp4_plane.c
> index 4c92985..f35848d 100644
> --- a/drivers/gpu/drm/msm/mdp/mdp4/mdp4_plane.c
> +++ b/drivers/gpu/drm/msm/mdp/mdp4/mdp4_plane.c
> @@ -48,11 +48,6 @@ static int mdp4_plane_update(struct drm_plane *plane,
>  
>  	mdp4_plane->enabled = true;
>  
> -	if (plane->fb)
> -		drm_framebuffer_unreference(plane->fb);
> -
> -	drm_framebuffer_reference(fb);
> -
>  	return mdp4_plane_mode_set(plane, crtc, fb,
>  			crtc_x, crtc_y, crtc_w, crtc_h,
>  			src_x, src_y, src_w, src_h);
> diff --git a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_crtc.c b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_crtc.c
> index 7f4ee99..0e74317 100644
> --- a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_crtc.c
> +++ b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_crtc.c
> @@ -35,7 +35,6 @@ struct mdp5_crtc {
>  
>  	/* if there is a pending flip, these will be non-null: */
>  	struct drm_pending_vblank_event *event;
> -	struct msm_fence_cb pageflip_cb;
>  
>  #define PENDING_CURSOR 0x1
>  #define PENDING_FLIP   0x2
> @@ -73,13 +72,17 @@ static void request_pending(struct drm_crtc *crtc, uint32_t pending)
>  	mdp_irq_register(&get_kms(crtc)->base, &mdp5_crtc->vblank);
>  }
>  
> -static void crtc_flush(struct drm_crtc *crtc)
> +void mdp5_crtc_flush(struct drm_crtc *crtc)
>  {
> +	struct msm_drm_private *priv = crtc->dev->dev_private;
>  	struct mdp5_crtc *mdp5_crtc = to_mdp5_crtc(crtc);
>  	struct mdp5_kms *mdp5_kms = get_kms(crtc);
>  	int id = mdp5_crtc->id;
>  	uint32_t i, flush = 0;
>  
> +	if (priv->pending_crtcs & (1 << crtc->id))
> +		return;
> +
>  	for (i = 0; i < ARRAY_SIZE(mdp5_crtc->planes); i++) {
>  		struct drm_plane *plane = mdp5_crtc->planes[i];
>  		if (plane) {
> @@ -124,7 +127,7 @@ static void update_scanout(struct drm_crtc *crtc, struct drm_framebuffer *fb)
>  	 * so that we can safely queue unref to current fb (ie. next
>  	 * vblank we know hw is done w/ previous scanout_fb).
>  	 */
> -	crtc_flush(crtc);
> +	mdp5_crtc_flush(crtc);
>  
>  	if (mdp5_crtc->scanout_fb)
>  		drm_flip_work_queue(&mdp5_crtc->unref_fb_work,
> @@ -165,21 +168,6 @@ static void complete_flip(struct drm_crtc *crtc, struct drm_file *file)
>  	}
>  }
>  
> -static void pageflip_cb(struct msm_fence_cb *cb)
> -{
> -	struct mdp5_crtc *mdp5_crtc =
> -		container_of(cb, struct mdp5_crtc, pageflip_cb);
> -	struct drm_crtc *crtc = &mdp5_crtc->base;
> -	struct drm_framebuffer *fb = mdp5_crtc->fb;
> -
> -	if (!fb)
> -		return;
> -
> -	drm_framebuffer_reference(fb);
> -	mdp5_plane_set_scanout(mdp5_crtc->plane, fb);
> -	update_scanout(crtc, fb);
> -}
> -
>  static void unref_fb_worker(struct drm_flip_work *work, void *val)
>  {
>  	struct mdp5_crtc *mdp5_crtc =
> @@ -324,7 +312,7 @@ static void mdp5_crtc_prepare(struct drm_crtc *crtc)
>  static void mdp5_crtc_commit(struct drm_crtc *crtc)
>  {
>  	mdp5_crtc_dpms(crtc, DRM_MODE_DPMS_ON);
> -	crtc_flush(crtc);
> +	mdp5_crtc_flush(crtc);
>  	/* drop the ref to mdp clk's that we got in prepare: */
>  	mdp5_disable(get_kms(crtc));
>  }
> @@ -366,23 +354,26 @@ static int mdp5_crtc_page_flip(struct drm_crtc *crtc,
>  {
>  	struct mdp5_crtc *mdp5_crtc = to_mdp5_crtc(crtc);
>  	struct drm_device *dev = crtc->dev;
> -	struct drm_gem_object *obj;
>  	unsigned long flags;
>  
> +	spin_lock_irqsave(&dev->event_lock, flags);
>  	if (mdp5_crtc->event) {
>  		dev_err(dev->dev, "already pending flip!\n");
> +		spin_unlock_irqrestore(&dev->event_lock, flags);
>  		return -EBUSY;
>  	}
>  
> -	obj = msm_framebuffer_bo(new_fb, 0);
> -
> -	spin_lock_irqsave(&dev->event_lock, flags);
>  	mdp5_crtc->event = event;
>  	spin_unlock_irqrestore(&dev->event_lock, flags);
>  
>  	update_fb(crtc, new_fb);
>  
> -	return msm_gem_queue_inactive_cb(obj, &mdp5_crtc->pageflip_cb);
> +	/* grab extra ref for update_scanout() */
> +	drm_framebuffer_reference(new_fb);
> +	mdp5_plane_set_scanout(mdp5_crtc->plane, new_fb);
> +	update_scanout(crtc, new_fb);
> +
> +	return 0;
>  }
>  
>  static int mdp5_crtc_set_property(struct drm_crtc *crtc,
> @@ -424,8 +415,13 @@ static void mdp5_crtc_vblank_irq(struct mdp_irq *irq, uint32_t irqstatus)
>  	pending = atomic_xchg(&mdp5_crtc->pending, 0);
>  
>  	if (pending & PENDING_FLIP) {
> -		complete_flip(crtc, NULL);
> -		drm_flip_work_commit(&mdp5_crtc->unref_fb_work, priv->wq);
> +		if (priv->pending_crtcs & (1 << crtc->id)) {
> +			/* our update hasn't been flushed yet: */
> +			request_pending(crtc, PENDING_FLIP);
> +		} else {
> +			complete_flip(crtc, NULL);
> +			drm_flip_work_commit(&mdp5_crtc->unref_fb_work, priv->wq);
> +		}
>  	}
>  }
>  
> @@ -434,7 +430,7 @@ static void mdp5_crtc_err_irq(struct mdp_irq *irq, uint32_t irqstatus)
>  	struct mdp5_crtc *mdp5_crtc = container_of(irq, struct mdp5_crtc, err);
>  	struct drm_crtc *crtc = &mdp5_crtc->base;
>  	DBG("%s: error: %08x", mdp5_crtc->name, irqstatus);
> -	crtc_flush(crtc);
> +	mdp5_crtc_flush(crtc);
>  }
>  
>  uint32_t mdp5_crtc_vblank(struct drm_crtc *crtc)
> @@ -501,7 +497,7 @@ void mdp5_crtc_set_intf(struct drm_crtc *crtc, int intf,
>  			MDP5_CTL_OP_MODE(MODE_NONE) |
>  			MDP5_CTL_OP_INTF_NUM(intfnum[intf]));
>  
> -	crtc_flush(crtc);
> +	mdp5_crtc_flush(crtc);
>  }
>  
>  static void set_attach(struct drm_crtc *crtc, enum mdp5_pipe pipe_id,
> @@ -517,7 +513,7 @@ static void set_attach(struct drm_crtc *crtc, enum mdp5_pipe pipe_id,
>  	mdp5_crtc->planes[pipe_id] = plane;
>  	blend_setup(crtc);
>  	if (mdp5_crtc->enabled && (plane != mdp5_crtc->plane))
> -		crtc_flush(crtc);
> +		mdp5_crtc_flush(crtc);
>  }
>  
>  void mdp5_crtc_attach(struct drm_crtc *crtc, struct drm_plane *plane)
> @@ -563,8 +559,6 @@ struct drm_crtc *mdp5_crtc_init(struct drm_device *dev,
>  	if (ret)
>  		goto fail;
>  
> -	INIT_FENCE_CB(&mdp5_crtc->pageflip_cb, pageflip_cb);
> -
>  	drm_crtc_init_with_planes(dev, crtc, plane, NULL, &mdp5_crtc_funcs);
>  	drm_crtc_helper_add(crtc, &mdp5_crtc_helper_funcs);
>  
> diff --git a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.c b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.c
> index ee8446c..01c3d70 100644
> --- a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.c
> +++ b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.c
> @@ -91,6 +91,11 @@ static long mdp5_round_pixclk(struct msm_kms *kms, unsigned long rate,
>  	return rate;
>  }
>  
> +static void mdp5_flush(struct msm_kms *kms, struct drm_crtc *crtc)
> +{
> +	mdp5_crtc_flush(crtc);
> +}
> +
>  static void mdp5_preclose(struct msm_kms *kms, struct drm_file *file)
>  {
>  	struct mdp5_kms *mdp5_kms = to_mdp5_kms(to_mdp_kms(kms));
> @@ -118,6 +123,7 @@ static const struct mdp_kms_funcs kms_funcs = {
>  		.disable_vblank  = mdp5_disable_vblank,
>  		.get_format      = mdp_get_format,
>  		.round_pixclk    = mdp5_round_pixclk,
> +		.flush           = mdp5_flush,
>  		.preclose        = mdp5_preclose,
>  		.destroy         = mdp5_destroy,
>  	},
> diff --git a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.h b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.h
> index c8b1a25..18b031b 100644
> --- a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.h
> +++ b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.h
> @@ -197,8 +197,8 @@ enum mdp5_pipe mdp5_plane_pipe(struct drm_plane *plane);
>  struct drm_plane *mdp5_plane_init(struct drm_device *dev,
>  		enum mdp5_pipe pipe, bool private_plane);
>  
> +void mdp5_crtc_flush(struct drm_crtc *crtc);
>  uint32_t mdp5_crtc_vblank(struct drm_crtc *crtc);
> -
>  void mdp5_crtc_cancel_pending_flip(struct drm_crtc *crtc, struct drm_file *file);
>  void mdp5_crtc_set_intf(struct drm_crtc *crtc, int intf,
>  		enum mdp5_intf intf_id);
> diff --git a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_plane.c b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_plane.c
> index 53cc8c6..f1bf3c2 100644
> --- a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_plane.c
> +++ b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_plane.c
> @@ -48,11 +48,6 @@ static int mdp5_plane_update(struct drm_plane *plane,
>  
>  	mdp5_plane->enabled = true;
>  
> -	if (plane->fb)
> -		drm_framebuffer_unreference(plane->fb);
> -
> -	drm_framebuffer_reference(fb);
> -
>  	return mdp5_plane_mode_set(plane, crtc, fb,
>  			crtc_x, crtc_y, crtc_w, crtc_h,
>  			src_x, src_y, src_w, src_h);
> diff --git a/drivers/gpu/drm/msm/msm_atomic.c b/drivers/gpu/drm/msm/msm_atomic.c
> new file mode 100644
> index 0000000..b231377
> --- /dev/null
> +++ b/drivers/gpu/drm/msm/msm_atomic.c
> @@ -0,0 +1,141 @@
> +/*
> + * Copyright (C) 2013 Red Hat
> + * Author: Rob Clark <robdclark@gmail.com>
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms of the GNU General Public License version 2 as published by
> + * the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> + * more details.
> + *
> + * You should have received a copy of the GNU General Public License along with
> + * this program.  If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "msm_drv.h"
> +#include "msm_kms.h"
> +#include "msm_gem.h"
> +
> +struct msm_async_commit {
> +	struct drm_atomic_state *state;
> +	uint32_t fence;
> +	struct msm_fence_cb fence_cb;
> +};
> +
> +static void fence_cb(struct msm_fence_cb *cb);
> +static int commit_sync(struct drm_device *dev, void *state, bool unlocked);
> +
> +static struct msm_async_commit *new_commit(struct drm_atomic_state *state)
> +{
> +	struct msm_async_commit *c = kzalloc(sizeof(*c), GFP_KERNEL);
> +
> +	if (!c)
> +		return NULL;
> +
> +	drm_atomic_state_reference(state);
> +	c->state = state;
> +	INIT_FENCE_CB(&c->fence_cb, fence_cb);
> +
> +	return c;
> +}
> +static void free_commit(struct msm_async_commit *c)
> +{
> +	drm_atomic_state_unreference(c->state);
> +	kfree(c);
> +}
> +
> +static void fence_cb(struct msm_fence_cb *cb)
> +{
> +	struct msm_async_commit *c =
> +			container_of(cb, struct msm_async_commit, fence_cb);
> +	commit_sync(c->state->dev, c->state, true);
> +	free_commit(c);
> +}
> +
> +static void add_fb(struct msm_async_commit *c, struct drm_crtc *crtc,
> +		struct drm_framebuffer *fb)
> +{
> +	struct drm_gem_object *obj = msm_framebuffer_bo(fb, 0);
> +	c->fence = max(c->fence, msm_gem_fence(to_msm_bo(obj), MSM_PREP_READ));
> +}
> +
> +static int wait_fb(struct drm_crtc *crtc, struct drm_framebuffer *fb)
> +{
> +	// XXX TODO wait..
> +	return 0;
> +}
> +
> +#define pending_fb(state) ((state) && (state)->fb && (state)->new_fb)
> +
> +static int commit_sync(struct drm_device *dev, void *state, bool unlocked)
> +{
> +	struct drm_atomic_state *a = state;
> +	struct msm_drm_private *priv = dev->dev_private;
> +	struct msm_kms *kms = priv->kms;
> +	int ncrtcs = dev->mode_config.num_crtc;
> +	uint32_t pending_crtcs = 0;
> +	int i, ret;
> +
> +	for (i = 0; i < ncrtcs; i++)
> +		if (a->crtcs[i])
> +			pending_crtcs |= (1 << a->crtcs[i]->id);
> +
> +	mutex_lock(&dev->struct_mutex);
> +	WARN_ON(priv->pending_crtcs & pending_crtcs);
> +	priv->pending_crtcs |= pending_crtcs;
> +	mutex_unlock(&dev->struct_mutex);
> +
> +	if (unlocked)
> +		ret = drm_atomic_commit_unlocked(dev, state);
> +	else
> +		ret = drm_atomic_commit(dev, state);
> +
> +	mutex_lock(&dev->struct_mutex);
> +	priv->pending_crtcs &= ~pending_crtcs;
> +	mutex_unlock(&dev->struct_mutex);
> +
> +	if (ret)
> +		return ret;
> +
> +	for (i = 0; i < ncrtcs; i++)
> +		if (a->crtcs[i])
> +			kms->funcs->flush(kms, a->crtcs[i]);
> +
> +	return 0;
> +}
> +
> +int msm_atomic_commit(struct drm_device *dev, struct drm_atomic_state *state)
> +{
> +	struct drm_atomic_state *a = state;
> +	int nplanes = dev->mode_config.num_total_plane;
> +	int i;
> +
> +	if (a->flags & DRM_MODE_ATOMIC_NONBLOCK) {
> +		/* non-block mode: defer commit until fb's are ready */
> +		struct msm_async_commit *c = new_commit(state);
> +
> +		if (!c)
> +			return -ENOMEM;
> +
> +		for (i = 0; i < nplanes; i++)
> +			if (pending_fb(a->pstates[i]))
> +				add_fb(c, a->pstates[i]->crtc, a->pstates[i]->fb);
> +
> +		return msm_queue_fence_cb(dev, &c->fence_cb, c->fence);
> +	} else {
> +		/* blocking mode: wait until fb's are ready */
> +		int ret = 0;
> +
> +		for (i = 0; i < nplanes && !ret; i++)
> +			if (pending_fb(a->pstates[i]))
> +				ret = wait_fb(a->pstates[i]->crtc, a->pstates[i]->fb);
> +
> +		if (ret)
> +			return ret;
> +
> +		return commit_sync(dev, state, false);
> +	}
> +}

Ok, I think I should have read your msm implementation a _lot_ earlier.
Explains your desing choices neatly.

Two observations:

- A GO bit makes nuclear pageflips ridiculously easy to implement,
  presuming the hardware actually works. And it's the sane model, so imo a
  good one to wrap the atomic helpers around.

  But reality is often a lot more ugly, especially if you're employed by
  Intel. Which is why I'm harping so much on this helpers-vs-core
  interface issues ... We really need the full state transition in one
  piece to do anything useful.

- msm doesn't have any resource sharing going on for modeset operations
  (which I mean lighting up crtcs to feed pixel streams to connectors).
  Which means the simplistic "loop over all crtcs and call the old
  ->setcrtc" approach actually works.

  The problem I see here is that if you have hardware with more elaborate
  needs (e.g. shared dplls), but otherwise sanity for plane updates (i.e.
  a GO bit) then your current atomic helpers will make it rather hard to
  mix this. So I think we should pimp the crtc helpers a bit to be atomic
  compliant (i.e. kill all outputs first before enabling anything new) and
  try to integrate this with the atomic helpers used for GO bit style
  updates.

  i915 has dpll sharing on ivb/hsw, but doesn't use the the crtc helpers
  anymore. But the radone eyefinity (or whatever the damn thing is called)
  I have lying around here fits the bill: It has 5 DP+ outputs but only 2
  dplls. So you can drive e.g. 3 DP screens and then switch to 2 HDMI
  screens atomically and it should all pan out.

  But since your current helpers just loop over all crtcs without any
  regard to ordering constraints this will fall over if the 2 HDMI outputs
  get enabled before the 3 DP outputs get disabled all disabled.

So with all that in mind I have 3 sanity checks for the atomic interfaces
and helpers:

1. The atomic helpers should make enabling sane hw with a GO bit easy for
nuclear pageflips. Benchmark would be sane hw like msm or omap.

2. It should cooperate with the crtc helpers such that hw with some shared
resources (like dplls) works for atomic modesets. That pretty much means
"disable everything (including crtc/connetor pipelines that only changed
some parts) before enabling anything". Benchmark would be a platform with
shared dplls.

3. The core->driver interface should be powerful enough to support
insanity like i915, but no more. Which means all the code that's share
(i.e. the set_prop code I've been harping all over the place about) should
be done in the core, without any means for drivers to override. Currently
the drm core also takes care of a bunch of things like basic locking and
refcounting, and that's kinda the spirit for this. i915 is the obvious
benchmark here.

I think we can roll this out piecemeal (and it's probably better to do it
that way), but I also think that until we've resolved the requirements of
2&3 we should try to minimize subsystem wide changes as much as possible
by making them opt-in and the vfuncs optional.

If you compare this approach with my review for Matt's universal plane
patches it's exactly the same song.

I hope this elaboration of my thinking clarifies all my review comments a
bit and explains what I'm aiming for.

Cheers, Daniel
Rob Clark May 27, 2014, 3:58 p.m. UTC | #2
On Mon, May 26, 2014 at 1:54 PM, Daniel Vetter <daniel@ffwll.ch> wrote:
> On Sat, May 24, 2014 at 02:30:23PM -0400, Rob Clark wrote:
>> Signed-off-by: Rob Clark <robdclark@gmail.com>
>> ---
>>  drivers/gpu/drm/msm/Makefile              |   1 +
>>  drivers/gpu/drm/msm/mdp/mdp4/mdp4_crtc.c  |  57 ++++++------
>>  drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.c   |   6 ++
>>  drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.h   |   1 +
>>  drivers/gpu/drm/msm/mdp/mdp4/mdp4_plane.c |   5 --
>>  drivers/gpu/drm/msm/mdp/mdp5/mdp5_crtc.c  |  56 ++++++------
>>  drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.c   |   6 ++
>>  drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.h   |   2 +-
>>  drivers/gpu/drm/msm/mdp/mdp5/mdp5_plane.c |   5 --
>>  drivers/gpu/drm/msm/msm_atomic.c          | 141 ++++++++++++++++++++++++++++++
>>  drivers/gpu/drm/msm/msm_drv.c             |  22 ++++-
>>  drivers/gpu/drm/msm/msm_drv.h             |   7 ++
>>  drivers/gpu/drm/msm/msm_gem.c             |  24 +----
>>  drivers/gpu/drm/msm/msm_gem.h             |  13 +++
>>  drivers/gpu/drm/msm/msm_kms.h             |   1 +
>>  15 files changed, 253 insertions(+), 94 deletions(-)
>>  create mode 100644 drivers/gpu/drm/msm/msm_atomic.c
>>
>> diff --git a/drivers/gpu/drm/msm/Makefile b/drivers/gpu/drm/msm/Makefile
>> index 5e1e6b0..dd12f56 100644
>> --- a/drivers/gpu/drm/msm/Makefile
>> +++ b/drivers/gpu/drm/msm/Makefile
>> @@ -27,6 +27,7 @@ msm-y := \
>>       mdp/mdp5/mdp5_kms.o \
>>       mdp/mdp5/mdp5_plane.o \
>>       mdp/mdp5/mdp5_smp.o \
>> +     msm_atomic.o \
>>       msm_drv.o \
>>       msm_fb.o \
>>       msm_gem.o \
>> diff --git a/drivers/gpu/drm/msm/mdp/mdp4/mdp4_crtc.c b/drivers/gpu/drm/msm/mdp/mdp4/mdp4_crtc.c
>> index d0d8befd..2fa6b75 100644
>> --- a/drivers/gpu/drm/msm/mdp/mdp4/mdp4_crtc.c
>> +++ b/drivers/gpu/drm/msm/mdp/mdp4/mdp4_crtc.c
>> @@ -52,7 +52,6 @@ struct mdp4_crtc {
>>
>>       /* if there is a pending flip, these will be non-null: */
>>       struct drm_pending_vblank_event *event;
>> -     struct msm_fence_cb pageflip_cb;
>>
>>  #define PENDING_CURSOR 0x1
>>  #define PENDING_FLIP   0x2
>> @@ -93,12 +92,16 @@ static void request_pending(struct drm_crtc *crtc, uint32_t pending)
>>       mdp_irq_register(&get_kms(crtc)->base, &mdp4_crtc->vblank);
>>  }
>>
>> -static void crtc_flush(struct drm_crtc *crtc)
>> +void mdp4_crtc_flush(struct drm_crtc *crtc)
>>  {
>> +     struct msm_drm_private *priv = crtc->dev->dev_private;
>>       struct mdp4_crtc *mdp4_crtc = to_mdp4_crtc(crtc);
>>       struct mdp4_kms *mdp4_kms = get_kms(crtc);
>>       uint32_t i, flush = 0;
>>
>> +     if (priv->pending_crtcs & (1 << crtc->id))
>> +             return;
>> +
>>       for (i = 0; i < ARRAY_SIZE(mdp4_crtc->planes); i++) {
>>               struct drm_plane *plane = mdp4_crtc->planes[i];
>>               if (plane) {
>> @@ -142,7 +145,7 @@ static void update_scanout(struct drm_crtc *crtc, struct drm_framebuffer *fb)
>>        * so that we can safely queue unref to current fb (ie. next
>>        * vblank we know hw is done w/ previous scanout_fb).
>>        */
>> -     crtc_flush(crtc);
>> +     mdp4_crtc_flush(crtc);
>>
>>       if (mdp4_crtc->scanout_fb)
>>               drm_flip_work_queue(&mdp4_crtc->unref_fb_work,
>> @@ -177,21 +180,6 @@ static void complete_flip(struct drm_crtc *crtc, struct drm_file *file)
>>       spin_unlock_irqrestore(&dev->event_lock, flags);
>>  }
>>
>> -static void pageflip_cb(struct msm_fence_cb *cb)
>> -{
>> -     struct mdp4_crtc *mdp4_crtc =
>> -             container_of(cb, struct mdp4_crtc, pageflip_cb);
>> -     struct drm_crtc *crtc = &mdp4_crtc->base;
>> -     struct drm_framebuffer *fb = crtc->primary->fb;
>> -
>> -     if (!fb)
>> -             return;
>> -
>> -     drm_framebuffer_reference(fb);
>> -     mdp4_plane_set_scanout(mdp4_crtc->plane, fb);
>> -     update_scanout(crtc, fb);
>> -}
>> -
>>  static void unref_fb_worker(struct drm_flip_work *work, void *val)
>>  {
>>       struct mdp4_crtc *mdp4_crtc =
>> @@ -406,7 +394,7 @@ static void mdp4_crtc_prepare(struct drm_crtc *crtc)
>>  static void mdp4_crtc_commit(struct drm_crtc *crtc)
>>  {
>>       mdp4_crtc_dpms(crtc, DRM_MODE_DPMS_ON);
>> -     crtc_flush(crtc);
>> +     mdp4_crtc_flush(crtc);
>>       /* drop the ref to mdp clk's that we got in prepare: */
>>       mdp4_disable(get_kms(crtc));
>>  }
>> @@ -448,23 +436,27 @@ static int mdp4_crtc_page_flip(struct drm_crtc *crtc,
>>  {
>>       struct mdp4_crtc *mdp4_crtc = to_mdp4_crtc(crtc);
>>       struct drm_device *dev = crtc->dev;
>> -     struct drm_gem_object *obj;
>>       unsigned long flags;
>>
>> +     spin_lock_irqsave(&dev->event_lock, flags);
>>       if (mdp4_crtc->event) {
>>               dev_err(dev->dev, "already pending flip!\n");
>> +             spin_unlock_irqrestore(&dev->event_lock, flags);
>>               return -EBUSY;
>>       }
>>
>> -     obj = msm_framebuffer_bo(new_fb, 0);
>> -
>> -     spin_lock_irqsave(&dev->event_lock, flags);
>>       mdp4_crtc->event = event;
>>       spin_unlock_irqrestore(&dev->event_lock, flags);
>>
>>       update_fb(crtc, new_fb);
>>
>> -     return msm_gem_queue_inactive_cb(obj, &mdp4_crtc->pageflip_cb);
>> +
>> +     /* grab extra ref for update_scanout() */
>> +     drm_framebuffer_reference(new_fb);
>> +     mdp4_plane_set_scanout(mdp4_crtc->plane, new_fb);
>> +     update_scanout(crtc, new_fb);
>> +
>> +     return 0;
>>  }
>>
>>  static int mdp4_crtc_set_property(struct drm_crtc *crtc,
>> @@ -598,7 +590,7 @@ static int mdp4_crtc_cursor_move(struct drm_crtc *crtc, int x, int y)
>>       mdp4_crtc->cursor.y = y;
>>       spin_unlock_irqrestore(&mdp4_crtc->cursor.lock, flags);
>>
>> -     crtc_flush(crtc);
>> +     mdp4_crtc_flush(crtc);
>>       request_pending(crtc, PENDING_CURSOR);
>>
>>       return 0;
>> @@ -635,8 +627,13 @@ static void mdp4_crtc_vblank_irq(struct mdp_irq *irq, uint32_t irqstatus)
>>       pending = atomic_xchg(&mdp4_crtc->pending, 0);
>>
>>       if (pending & PENDING_FLIP) {
>> -             complete_flip(crtc, NULL);
>> -             drm_flip_work_commit(&mdp4_crtc->unref_fb_work, priv->wq);
>> +             if (priv->pending_crtcs & (1 << crtc->id)) {
>> +                     /* our update hasn't been flushed yet: */
>> +                     request_pending(crtc, PENDING_FLIP);
>> +             } else {
>> +                     complete_flip(crtc, NULL);
>> +                     drm_flip_work_commit(&mdp4_crtc->unref_fb_work, priv->wq);
>> +             }
>>       }
>>
>>       if (pending & PENDING_CURSOR) {
>> @@ -650,7 +647,7 @@ static void mdp4_crtc_err_irq(struct mdp_irq *irq, uint32_t irqstatus)
>>       struct mdp4_crtc *mdp4_crtc = container_of(irq, struct mdp4_crtc, err);
>>       struct drm_crtc *crtc = &mdp4_crtc->base;
>>       DBG("%s: error: %08x", mdp4_crtc->name, irqstatus);
>> -     crtc_flush(crtc);
>> +     mdp4_crtc_flush(crtc);
>>  }
>>
>>  uint32_t mdp4_crtc_vblank(struct drm_crtc *crtc)
>> @@ -730,7 +727,7 @@ static void set_attach(struct drm_crtc *crtc, enum mdp4_pipe pipe_id,
>>       mdp4_crtc->planes[pipe_id] = plane;
>>       blend_setup(crtc);
>>       if (mdp4_crtc->enabled && (plane != mdp4_crtc->plane))
>> -             crtc_flush(crtc);
>> +             mdp4_crtc_flush(crtc);
>>  }
>>
>>  void mdp4_crtc_attach(struct drm_crtc *crtc, struct drm_plane *plane)
>> @@ -792,8 +789,6 @@ struct drm_crtc *mdp4_crtc_init(struct drm_device *dev,
>>       ret = drm_flip_work_init(&mdp4_crtc->unref_cursor_work, 64,
>>                       "unref cursor", unref_cursor_worker);
>>
>> -     INIT_FENCE_CB(&mdp4_crtc->pageflip_cb, pageflip_cb);
>> -
>>       drm_crtc_init_with_planes(dev, crtc, plane, NULL, &mdp4_crtc_funcs);
>>       drm_crtc_helper_add(crtc, &mdp4_crtc_helper_funcs);
>>
>> diff --git a/drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.c b/drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.c
>> index 0bb4faa..af0bfb1 100644
>> --- a/drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.c
>> +++ b/drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.c
>> @@ -131,6 +131,11 @@ static long mdp4_round_pixclk(struct msm_kms *kms, unsigned long rate,
>>       return mdp4_dtv_round_pixclk(encoder, rate);
>>  }
>>
>> +static void mdp4_flush(struct msm_kms *kms, struct drm_crtc *crtc)
>> +{
>> +     mdp4_crtc_flush(crtc);
>> +}
>> +
>>  static void mdp4_preclose(struct msm_kms *kms, struct drm_file *file)
>>  {
>>       struct mdp4_kms *mdp4_kms = to_mdp4_kms(to_mdp_kms(kms));
>> @@ -162,6 +167,7 @@ static const struct mdp_kms_funcs kms_funcs = {
>>               .disable_vblank  = mdp4_disable_vblank,
>>               .get_format      = mdp_get_format,
>>               .round_pixclk    = mdp4_round_pixclk,
>> +             .flush           = mdp4_flush,
>>               .preclose        = mdp4_preclose,
>>               .destroy         = mdp4_destroy,
>>       },
>> diff --git a/drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.h b/drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.h
>> index 715520c5..834454c 100644
>> --- a/drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.h
>> +++ b/drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.h
>> @@ -184,6 +184,7 @@ enum mdp4_pipe mdp4_plane_pipe(struct drm_plane *plane);
>>  struct drm_plane *mdp4_plane_init(struct drm_device *dev,
>>               enum mdp4_pipe pipe_id, bool private_plane);
>>
>> +void mdp4_crtc_flush(struct drm_crtc *crtc);
>>  uint32_t mdp4_crtc_vblank(struct drm_crtc *crtc);
>>  void mdp4_crtc_cancel_pending_flip(struct drm_crtc *crtc, struct drm_file *file);
>>  void mdp4_crtc_set_config(struct drm_crtc *crtc, uint32_t config);
>> diff --git a/drivers/gpu/drm/msm/mdp/mdp4/mdp4_plane.c b/drivers/gpu/drm/msm/mdp/mdp4/mdp4_plane.c
>> index 4c92985..f35848d 100644
>> --- a/drivers/gpu/drm/msm/mdp/mdp4/mdp4_plane.c
>> +++ b/drivers/gpu/drm/msm/mdp/mdp4/mdp4_plane.c
>> @@ -48,11 +48,6 @@ static int mdp4_plane_update(struct drm_plane *plane,
>>
>>       mdp4_plane->enabled = true;
>>
>> -     if (plane->fb)
>> -             drm_framebuffer_unreference(plane->fb);
>> -
>> -     drm_framebuffer_reference(fb);
>> -
>>       return mdp4_plane_mode_set(plane, crtc, fb,
>>                       crtc_x, crtc_y, crtc_w, crtc_h,
>>                       src_x, src_y, src_w, src_h);
>> diff --git a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_crtc.c b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_crtc.c
>> index 7f4ee99..0e74317 100644
>> --- a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_crtc.c
>> +++ b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_crtc.c
>> @@ -35,7 +35,6 @@ struct mdp5_crtc {
>>
>>       /* if there is a pending flip, these will be non-null: */
>>       struct drm_pending_vblank_event *event;
>> -     struct msm_fence_cb pageflip_cb;
>>
>>  #define PENDING_CURSOR 0x1
>>  #define PENDING_FLIP   0x2
>> @@ -73,13 +72,17 @@ static void request_pending(struct drm_crtc *crtc, uint32_t pending)
>>       mdp_irq_register(&get_kms(crtc)->base, &mdp5_crtc->vblank);
>>  }
>>
>> -static void crtc_flush(struct drm_crtc *crtc)
>> +void mdp5_crtc_flush(struct drm_crtc *crtc)
>>  {
>> +     struct msm_drm_private *priv = crtc->dev->dev_private;
>>       struct mdp5_crtc *mdp5_crtc = to_mdp5_crtc(crtc);
>>       struct mdp5_kms *mdp5_kms = get_kms(crtc);
>>       int id = mdp5_crtc->id;
>>       uint32_t i, flush = 0;
>>
>> +     if (priv->pending_crtcs & (1 << crtc->id))
>> +             return;
>> +
>>       for (i = 0; i < ARRAY_SIZE(mdp5_crtc->planes); i++) {
>>               struct drm_plane *plane = mdp5_crtc->planes[i];
>>               if (plane) {
>> @@ -124,7 +127,7 @@ static void update_scanout(struct drm_crtc *crtc, struct drm_framebuffer *fb)
>>        * so that we can safely queue unref to current fb (ie. next
>>        * vblank we know hw is done w/ previous scanout_fb).
>>        */
>> -     crtc_flush(crtc);
>> +     mdp5_crtc_flush(crtc);
>>
>>       if (mdp5_crtc->scanout_fb)
>>               drm_flip_work_queue(&mdp5_crtc->unref_fb_work,
>> @@ -165,21 +168,6 @@ static void complete_flip(struct drm_crtc *crtc, struct drm_file *file)
>>       }
>>  }
>>
>> -static void pageflip_cb(struct msm_fence_cb *cb)
>> -{
>> -     struct mdp5_crtc *mdp5_crtc =
>> -             container_of(cb, struct mdp5_crtc, pageflip_cb);
>> -     struct drm_crtc *crtc = &mdp5_crtc->base;
>> -     struct drm_framebuffer *fb = mdp5_crtc->fb;
>> -
>> -     if (!fb)
>> -             return;
>> -
>> -     drm_framebuffer_reference(fb);
>> -     mdp5_plane_set_scanout(mdp5_crtc->plane, fb);
>> -     update_scanout(crtc, fb);
>> -}
>> -
>>  static void unref_fb_worker(struct drm_flip_work *work, void *val)
>>  {
>>       struct mdp5_crtc *mdp5_crtc =
>> @@ -324,7 +312,7 @@ static void mdp5_crtc_prepare(struct drm_crtc *crtc)
>>  static void mdp5_crtc_commit(struct drm_crtc *crtc)
>>  {
>>       mdp5_crtc_dpms(crtc, DRM_MODE_DPMS_ON);
>> -     crtc_flush(crtc);
>> +     mdp5_crtc_flush(crtc);
>>       /* drop the ref to mdp clk's that we got in prepare: */
>>       mdp5_disable(get_kms(crtc));
>>  }
>> @@ -366,23 +354,26 @@ static int mdp5_crtc_page_flip(struct drm_crtc *crtc,
>>  {
>>       struct mdp5_crtc *mdp5_crtc = to_mdp5_crtc(crtc);
>>       struct drm_device *dev = crtc->dev;
>> -     struct drm_gem_object *obj;
>>       unsigned long flags;
>>
>> +     spin_lock_irqsave(&dev->event_lock, flags);
>>       if (mdp5_crtc->event) {
>>               dev_err(dev->dev, "already pending flip!\n");
>> +             spin_unlock_irqrestore(&dev->event_lock, flags);
>>               return -EBUSY;
>>       }
>>
>> -     obj = msm_framebuffer_bo(new_fb, 0);
>> -
>> -     spin_lock_irqsave(&dev->event_lock, flags);
>>       mdp5_crtc->event = event;
>>       spin_unlock_irqrestore(&dev->event_lock, flags);
>>
>>       update_fb(crtc, new_fb);
>>
>> -     return msm_gem_queue_inactive_cb(obj, &mdp5_crtc->pageflip_cb);
>> +     /* grab extra ref for update_scanout() */
>> +     drm_framebuffer_reference(new_fb);
>> +     mdp5_plane_set_scanout(mdp5_crtc->plane, new_fb);
>> +     update_scanout(crtc, new_fb);
>> +
>> +     return 0;
>>  }
>>
>>  static int mdp5_crtc_set_property(struct drm_crtc *crtc,
>> @@ -424,8 +415,13 @@ static void mdp5_crtc_vblank_irq(struct mdp_irq *irq, uint32_t irqstatus)
>>       pending = atomic_xchg(&mdp5_crtc->pending, 0);
>>
>>       if (pending & PENDING_FLIP) {
>> -             complete_flip(crtc, NULL);
>> -             drm_flip_work_commit(&mdp5_crtc->unref_fb_work, priv->wq);
>> +             if (priv->pending_crtcs & (1 << crtc->id)) {
>> +                     /* our update hasn't been flushed yet: */
>> +                     request_pending(crtc, PENDING_FLIP);
>> +             } else {
>> +                     complete_flip(crtc, NULL);
>> +                     drm_flip_work_commit(&mdp5_crtc->unref_fb_work, priv->wq);
>> +             }
>>       }
>>  }
>>
>> @@ -434,7 +430,7 @@ static void mdp5_crtc_err_irq(struct mdp_irq *irq, uint32_t irqstatus)
>>       struct mdp5_crtc *mdp5_crtc = container_of(irq, struct mdp5_crtc, err);
>>       struct drm_crtc *crtc = &mdp5_crtc->base;
>>       DBG("%s: error: %08x", mdp5_crtc->name, irqstatus);
>> -     crtc_flush(crtc);
>> +     mdp5_crtc_flush(crtc);
>>  }
>>
>>  uint32_t mdp5_crtc_vblank(struct drm_crtc *crtc)
>> @@ -501,7 +497,7 @@ void mdp5_crtc_set_intf(struct drm_crtc *crtc, int intf,
>>                       MDP5_CTL_OP_MODE(MODE_NONE) |
>>                       MDP5_CTL_OP_INTF_NUM(intfnum[intf]));
>>
>> -     crtc_flush(crtc);
>> +     mdp5_crtc_flush(crtc);
>>  }
>>
>>  static void set_attach(struct drm_crtc *crtc, enum mdp5_pipe pipe_id,
>> @@ -517,7 +513,7 @@ static void set_attach(struct drm_crtc *crtc, enum mdp5_pipe pipe_id,
>>       mdp5_crtc->planes[pipe_id] = plane;
>>       blend_setup(crtc);
>>       if (mdp5_crtc->enabled && (plane != mdp5_crtc->plane))
>> -             crtc_flush(crtc);
>> +             mdp5_crtc_flush(crtc);
>>  }
>>
>>  void mdp5_crtc_attach(struct drm_crtc *crtc, struct drm_plane *plane)
>> @@ -563,8 +559,6 @@ struct drm_crtc *mdp5_crtc_init(struct drm_device *dev,
>>       if (ret)
>>               goto fail;
>>
>> -     INIT_FENCE_CB(&mdp5_crtc->pageflip_cb, pageflip_cb);
>> -
>>       drm_crtc_init_with_planes(dev, crtc, plane, NULL, &mdp5_crtc_funcs);
>>       drm_crtc_helper_add(crtc, &mdp5_crtc_helper_funcs);
>>
>> diff --git a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.c b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.c
>> index ee8446c..01c3d70 100644
>> --- a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.c
>> +++ b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.c
>> @@ -91,6 +91,11 @@ static long mdp5_round_pixclk(struct msm_kms *kms, unsigned long rate,
>>       return rate;
>>  }
>>
>> +static void mdp5_flush(struct msm_kms *kms, struct drm_crtc *crtc)
>> +{
>> +     mdp5_crtc_flush(crtc);
>> +}
>> +
>>  static void mdp5_preclose(struct msm_kms *kms, struct drm_file *file)
>>  {
>>       struct mdp5_kms *mdp5_kms = to_mdp5_kms(to_mdp_kms(kms));
>> @@ -118,6 +123,7 @@ static const struct mdp_kms_funcs kms_funcs = {
>>               .disable_vblank  = mdp5_disable_vblank,
>>               .get_format      = mdp_get_format,
>>               .round_pixclk    = mdp5_round_pixclk,
>> +             .flush           = mdp5_flush,
>>               .preclose        = mdp5_preclose,
>>               .destroy         = mdp5_destroy,
>>       },
>> diff --git a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.h b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.h
>> index c8b1a25..18b031b 100644
>> --- a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.h
>> +++ b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.h
>> @@ -197,8 +197,8 @@ enum mdp5_pipe mdp5_plane_pipe(struct drm_plane *plane);
>>  struct drm_plane *mdp5_plane_init(struct drm_device *dev,
>>               enum mdp5_pipe pipe, bool private_plane);
>>
>> +void mdp5_crtc_flush(struct drm_crtc *crtc);
>>  uint32_t mdp5_crtc_vblank(struct drm_crtc *crtc);
>> -
>>  void mdp5_crtc_cancel_pending_flip(struct drm_crtc *crtc, struct drm_file *file);
>>  void mdp5_crtc_set_intf(struct drm_crtc *crtc, int intf,
>>               enum mdp5_intf intf_id);
>> diff --git a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_plane.c b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_plane.c
>> index 53cc8c6..f1bf3c2 100644
>> --- a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_plane.c
>> +++ b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_plane.c
>> @@ -48,11 +48,6 @@ static int mdp5_plane_update(struct drm_plane *plane,
>>
>>       mdp5_plane->enabled = true;
>>
>> -     if (plane->fb)
>> -             drm_framebuffer_unreference(plane->fb);
>> -
>> -     drm_framebuffer_reference(fb);
>> -
>>       return mdp5_plane_mode_set(plane, crtc, fb,
>>                       crtc_x, crtc_y, crtc_w, crtc_h,
>>                       src_x, src_y, src_w, src_h);
>> diff --git a/drivers/gpu/drm/msm/msm_atomic.c b/drivers/gpu/drm/msm/msm_atomic.c
>> new file mode 100644
>> index 0000000..b231377
>> --- /dev/null
>> +++ b/drivers/gpu/drm/msm/msm_atomic.c
>> @@ -0,0 +1,141 @@
>> +/*
>> + * Copyright (C) 2013 Red Hat
>> + * Author: Rob Clark <robdclark@gmail.com>
>> + *
>> + * This program is free software; you can redistribute it and/or modify it
>> + * under the terms of the GNU General Public License version 2 as published by
>> + * the Free Software Foundation.
>> + *
>> + * This program is distributed in the hope that it will be useful, but WITHOUT
>> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
>> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
>> + * more details.
>> + *
>> + * You should have received a copy of the GNU General Public License along with
>> + * this program.  If not, see <http://www.gnu.org/licenses/>.
>> + */
>> +
>> +#include "msm_drv.h"
>> +#include "msm_kms.h"
>> +#include "msm_gem.h"
>> +
>> +struct msm_async_commit {
>> +     struct drm_atomic_state *state;
>> +     uint32_t fence;
>> +     struct msm_fence_cb fence_cb;
>> +};
>> +
>> +static void fence_cb(struct msm_fence_cb *cb);
>> +static int commit_sync(struct drm_device *dev, void *state, bool unlocked);
>> +
>> +static struct msm_async_commit *new_commit(struct drm_atomic_state *state)
>> +{
>> +     struct msm_async_commit *c = kzalloc(sizeof(*c), GFP_KERNEL);
>> +
>> +     if (!c)
>> +             return NULL;
>> +
>> +     drm_atomic_state_reference(state);
>> +     c->state = state;
>> +     INIT_FENCE_CB(&c->fence_cb, fence_cb);
>> +
>> +     return c;
>> +}
>> +static void free_commit(struct msm_async_commit *c)
>> +{
>> +     drm_atomic_state_unreference(c->state);
>> +     kfree(c);
>> +}
>> +
>> +static void fence_cb(struct msm_fence_cb *cb)
>> +{
>> +     struct msm_async_commit *c =
>> +                     container_of(cb, struct msm_async_commit, fence_cb);
>> +     commit_sync(c->state->dev, c->state, true);
>> +     free_commit(c);
>> +}
>> +
>> +static void add_fb(struct msm_async_commit *c, struct drm_crtc *crtc,
>> +             struct drm_framebuffer *fb)
>> +{
>> +     struct drm_gem_object *obj = msm_framebuffer_bo(fb, 0);
>> +     c->fence = max(c->fence, msm_gem_fence(to_msm_bo(obj), MSM_PREP_READ));
>> +}
>> +
>> +static int wait_fb(struct drm_crtc *crtc, struct drm_framebuffer *fb)
>> +{
>> +     // XXX TODO wait..
>> +     return 0;
>> +}
>> +
>> +#define pending_fb(state) ((state) && (state)->fb && (state)->new_fb)
>> +
>> +static int commit_sync(struct drm_device *dev, void *state, bool unlocked)
>> +{
>> +     struct drm_atomic_state *a = state;
>> +     struct msm_drm_private *priv = dev->dev_private;
>> +     struct msm_kms *kms = priv->kms;
>> +     int ncrtcs = dev->mode_config.num_crtc;
>> +     uint32_t pending_crtcs = 0;
>> +     int i, ret;
>> +
>> +     for (i = 0; i < ncrtcs; i++)
>> +             if (a->crtcs[i])
>> +                     pending_crtcs |= (1 << a->crtcs[i]->id);
>> +
>> +     mutex_lock(&dev->struct_mutex);
>> +     WARN_ON(priv->pending_crtcs & pending_crtcs);
>> +     priv->pending_crtcs |= pending_crtcs;
>> +     mutex_unlock(&dev->struct_mutex);
>> +
>> +     if (unlocked)
>> +             ret = drm_atomic_commit_unlocked(dev, state);
>> +     else
>> +             ret = drm_atomic_commit(dev, state);
>> +
>> +     mutex_lock(&dev->struct_mutex);
>> +     priv->pending_crtcs &= ~pending_crtcs;
>> +     mutex_unlock(&dev->struct_mutex);
>> +
>> +     if (ret)
>> +             return ret;
>> +
>> +     for (i = 0; i < ncrtcs; i++)
>> +             if (a->crtcs[i])
>> +                     kms->funcs->flush(kms, a->crtcs[i]);
>> +
>> +     return 0;
>> +}
>> +
>> +int msm_atomic_commit(struct drm_device *dev, struct drm_atomic_state *state)
>> +{
>> +     struct drm_atomic_state *a = state;
>> +     int nplanes = dev->mode_config.num_total_plane;
>> +     int i;
>> +
>> +     if (a->flags & DRM_MODE_ATOMIC_NONBLOCK) {
>> +             /* non-block mode: defer commit until fb's are ready */
>> +             struct msm_async_commit *c = new_commit(state);
>> +
>> +             if (!c)
>> +                     return -ENOMEM;
>> +
>> +             for (i = 0; i < nplanes; i++)
>> +                     if (pending_fb(a->pstates[i]))
>> +                             add_fb(c, a->pstates[i]->crtc, a->pstates[i]->fb);
>> +
>> +             return msm_queue_fence_cb(dev, &c->fence_cb, c->fence);
>> +     } else {
>> +             /* blocking mode: wait until fb's are ready */
>> +             int ret = 0;
>> +
>> +             for (i = 0; i < nplanes && !ret; i++)
>> +                     if (pending_fb(a->pstates[i]))
>> +                             ret = wait_fb(a->pstates[i]->crtc, a->pstates[i]->fb);
>> +
>> +             if (ret)
>> +                     return ret;
>> +
>> +             return commit_sync(dev, state, false);
>> +     }
>> +}
>
> Ok, I think I should have read your msm implementation a _lot_ earlier.
> Explains your desing choices neatly.
>
> Two observations:
>
> - A GO bit makes nuclear pageflips ridiculously easy to implement,
>   presuming the hardware actually works. And it's the sane model, so imo a
>   good one to wrap the atomic helpers around.
>
>   But reality is often a lot more ugly, especially if you're employed by
>   Intel. Which is why I'm harping so much on this helpers-vs-core
>   interface issues ... We really need the full state transition in one
>   piece to do anything useful.
>
> - msm doesn't have any resource sharing going on for modeset operations
>   (which I mean lighting up crtcs to feed pixel streams to connectors).
>   Which means the simplistic "loop over all crtcs and call the old
>   ->setcrtc" approach actually works.

we do actually have some shared resources on mdp5 generation (the
"smp" blocks, basically a shared buffer which we need to allocate fifo
space from for each plane)..

I'm mostly ignoring this at the moment, because we don't support
enough crtc's yet to run out of smp blocks.  But hooking in at the
current ->atomic_commit() would be enough for me, I think.  Tbh, it is
something that should be handled at the ->atomic_check() stage so I
hadn't given much though to ->atomic_commit() stage.

That all said, I've no problem with adding one more hook, after
->atomic_commit() and lock magic, before loop over planes/crtcs, so
drivers that need to can replace the loops and do their own thing.
Current state should be reasonably good for sane hw.  I'm just waiting
for patches for i915 to add whatever it needs.

>   The problem I see here is that if you have hardware with more elaborate
>   needs (e.g. shared dplls), but otherwise sanity for plane updates (i.e.
>   a GO bit) then your current atomic helpers will make it rather hard to
>   mix this. So I think we should pimp the crtc helpers a bit to be atomic
>   compliant (i.e. kill all outputs first before enabling anything new) and
>   try to integrate this with the atomic helpers used for GO bit style
>   updates.

Not really, I don't think.  You can still do whatever shared resource
setup in ->atomic_commit().  For drivers which want to defer commit
(ie. doing commit after fb's ready from cpu, rather than from gpu), it
would probably be more convenient to split up atomic_commit() so
drivers have a post lock hook.  I think it is just a few line patch,
and I think that it probably isn't really needed until i915 is ready.
I'm pretty sure we can change this to do what i915 needs, while at the
same time keeping it simple for 'GO bit' drivers.

The bit about crtc helpers, I'll have to think about.  I'm not sure
that we want to require that 'atomic' (modeset or pageflip) completely
*replaces* current state.  So disabling unrelated crtcs doesn't seem
like the right thing.  But we have some time to decide about the
semantics of an atomic modeset before we expose to userspace.

>   i915 has dpll sharing on ivb/hsw, but doesn't use the the crtc helpers
>   anymore. But the radone eyefinity (or whatever the damn thing is called)
>   I have lying around here fits the bill: It has 5 DP+ outputs but only 2
>   dplls. So you can drive e.g. 3 DP screens and then switch to 2 HDMI
>   screens atomically and it should all pan out.
>
>   But since your current helpers just loop over all crtcs without any
>   regard to ordering constraints this will fall over if the 2 HDMI outputs
>   get enabled before the 3 DP outputs get disabled all disabled.

the driver should have rejected the config before it gets to commit
stage, if it were an impossible config.

> So with all that in mind I have 3 sanity checks for the atomic interfaces
> and helpers:
>
> 1. The atomic helpers should make enabling sane hw with a GO bit easy for
> nuclear pageflips. Benchmark would be sane hw like msm or omap.
>
> 2. It should cooperate with the crtc helpers such that hw with some shared
> resources (like dplls) works for atomic modesets. That pretty much means
> "disable everything (including crtc/connetor pipelines that only changed
> some parts) before enabling anything". Benchmark would be a platform with
> shared dplls.

btw, not something I'd thought about before, but shared dplls seem
common enough, that it is something core could help with.  (Assuming
they are all related to pixel clk and not some derived thing that core
wouldn't know about.)

I think we do need to decide what partial state updates me in the
context of modeset or pageflip.  I kinda think the right thing is
different for modeset vs pageflip.  Maybe we want to define an atomic
flag to mean "disable/discard everything else"..  at any rate, we only
need to sort that before exposing the API to userspace.

> 3. The core->driver interface should be powerful enough to support
> insanity like i915, but no more. Which means all the code that's share
> (i.e. the set_prop code I've been harping all over the place about) should
> be done in the core, without any means for drivers to override. Currently
> the drm core also takes care of a bunch of things like basic locking and
> refcounting, and that's kinda the spirit for this. i915 is the obvious
> benchmark here.

The more I think about it, the more I think we should leave set_prop
as it is (although possibly with drm_atomic_state ->
drm_{plane,crtc,etc}_state).

In the past, before primary plane, I really needed this.  And I expect
having a convenient way to 'sniff' changing properties as they go by
will be useful in some cases.

> I think we can roll this out piecemeal (and it's probably better to do it
> that way), but I also think that until we've resolved the requirements of
> 2&3 we should try to minimize subsystem wide changes as much as possible
> by making them opt-in and the vfuncs optional.

the new mandatory vfuncs don't amount to too much churn.. not as much
as the extra param for crtc lock/unlock fxns.  Maybe if I was planning
on keeping this alive on a branch for so long in the beginning I would
have arranged a few things slightly different, but meh.

A very early iteration of atomic preserved the old pageflip, setcrtc,
setplane, etc paths for drivers not implementing atomic fxns.  But
that was at least a bit ugly.  I like the current approach since the
code paths are very much the same for atomic and non-atomic.

BR,
-R

> If you compare this approach with my review for Matt's universal plane
> patches it's exactly the same song.
>
> I hope this elaboration of my thinking clarifies all my review comments a
> bit and explains what I'm aiming for.
>
> Cheers, Daniel
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> +41 (0) 79 365 57 48 - http://blog.ffwll.ch
Daniel Vetter May 27, 2014, 5:50 p.m. UTC | #3
On Tue, May 27, 2014 at 11:58:33AM -0400, Rob Clark wrote:
> On Mon, May 26, 2014 at 1:54 PM, Daniel Vetter <daniel@ffwll.ch> wrote:
> > Ok, I think I should have read your msm implementation a _lot_ earlier.
> > Explains your desing choices neatly.
> >
> > Two observations:
> >
> > - A GO bit makes nuclear pageflips ridiculously easy to implement,
> >   presuming the hardware actually works. And it's the sane model, so imo a
> >   good one to wrap the atomic helpers around.
> >
> >   But reality is often a lot more ugly, especially if you're employed by
> >   Intel. Which is why I'm harping so much on this helpers-vs-core
> >   interface issues ... We really need the full state transition in one
> >   piece to do anything useful.
> >
> > - msm doesn't have any resource sharing going on for modeset operations
> >   (which I mean lighting up crtcs to feed pixel streams to connectors).
> >   Which means the simplistic "loop over all crtcs and call the old
> >   ->setcrtc" approach actually works.
> 
> we do actually have some shared resources on mdp5 generation (the
> "smp" blocks, basically a shared buffer which we need to allocate fifo
> space from for each plane)..
> 
> I'm mostly ignoring this at the moment, because we don't support
> enough crtc's yet to run out of smp blocks.  But hooking in at the
> current ->atomic_commit() would be enough for me, I think.  Tbh, it is
> something that should be handled at the ->atomic_check() stage so I
> hadn't given much though to ->atomic_commit() stage.
> 
> That all said, I've no problem with adding one more hook, after
> ->atomic_commit() and lock magic, before loop over planes/crtcs, so
> drivers that need to can replace the loops and do their own thing.
> Current state should be reasonably good for sane hw.  I'm just waiting
> for patches for i915 to add whatever it needs.

I don't think that will be enough for you. Example, slightly hypothetical:

- Configuration A: crtc 0 displays a small screen, crtc 1 a giant screen.
  Together they just barely fit into your fifo space.

- Configuration B: crtc 0 drives a giant screen, crtc 1 a tiny one.
  Presume different outputs here so that no implicit output stealing
  happens upon mode switching.

Atomic switch should work, but don't since if you just loop over crtcs you
have the intermediate stage where you try to drive 2 giant screens, which
will run out of fifo resources. And I think it's really common to have
such implicit resource sharing, maybe except for the most beefy desktop
gpu which simply can't run out of memory bw with today's screen.

So afaics you really need to push a bit of smarts into the crtc helpers to
make atomic possible, which then leaves you with interaction issues
between the atomic stuff for GO bit capable hw for plane updates and the
modeset ordering hell.

> >   The problem I see here is that if you have hardware with more elaborate
> >   needs (e.g. shared dplls), but otherwise sanity for plane updates (i.e.
> >   a GO bit) then your current atomic helpers will make it rather hard to
> >   mix this. So I think we should pimp the crtc helpers a bit to be atomic
> >   compliant (i.e. kill all outputs first before enabling anything new) and
> >   try to integrate this with the atomic helpers used for GO bit style
> >   updates.
> 
> Not really, I don't think.  You can still do whatever shared resource
> setup in ->atomic_commit().  For drivers which want to defer commit
> (ie. doing commit after fb's ready from cpu, rather than from gpu), it
> would probably be more convenient to split up atomic_commit() so
> drivers have a post lock hook.  I think it is just a few line patch,
> and I think that it probably isn't really needed until i915 is ready.
> I'm pretty sure we can change this to do what i915 needs, while at the
> same time keeping it simple for 'GO bit' drivers.
>
> The bit about crtc helpers, I'll have to think about.  I'm not sure
> that we want to require that 'atomic' (modeset or pageflip) completely
> *replaces* current state.  So disabling unrelated crtcs doesn't seem
> like the right thing.  But we have some time to decide about the
> semantics of an atomic modeset before we expose to userspace.

I'm not talking about replacing unrelated crtcs. It's also not about
underspecified state or about enabling/changing global resources.

It is _only_ about ordering of the various operations: If both the
current and the desired new configuration are at the limit of what the hw
can do you can't switch to the new configuration by looping over all
crtcs. The fact that this doesn't work is the entire point of atomic
modesets. And if we have helpers which aren't cut out for the task at hand
for any hw where we need to have atomic modesets to make such changes
possible without userspace going nuts, then the helpers are imo simply not
useful

> >   i915 has dpll sharing on ivb/hsw, but doesn't use the the crtc helpers
> >   anymore. But the radone eyefinity (or whatever the damn thing is called)
> >   I have lying around here fits the bill: It has 5 DP+ outputs but only 2
> >   dplls. So you can drive e.g. 3 DP screens and then switch to 2 HDMI
> >   screens atomically and it should all pan out.
> >
> >   But since your current helpers just loop over all crtcs without any
> >   regard to ordering constraints this will fall over if the 2 HDMI outputs
> >   get enabled before the 3 DP outputs get disabled all disabled.
> 
> the driver should have rejected the config before it gets to commit
> stage, if it were an impossible config.

The configuration _is_ possible. We simply have to be somewhat careful
with transitioning to it, since not _all_ intermediate states are
possible. Your current helpers presume that's the case, which afaik isn't
the case on any hw where we have global limits. For modesets.

Nuclear pageflips are a completely different thing, as long as your hw has
a GO bit.

> > So with all that in mind I have 3 sanity checks for the atomic interfaces
> > and helpers:
> >
> > 1. The atomic helpers should make enabling sane hw with a GO bit easy for
> > nuclear pageflips. Benchmark would be sane hw like msm or omap.
> >
> > 2. It should cooperate with the crtc helpers such that hw with some shared
> > resources (like dplls) works for atomic modesets. That pretty much means
> > "disable everything (including crtc/connetor pipelines that only changed
> > some parts) before enabling anything". Benchmark would be a platform with
> > shared dplls.
> 
> btw, not something I'd thought about before, but shared dplls seem
> common enough, that it is something core could help with.  (Assuming
> they are all related to pixel clk and not some derived thing that core
> wouldn't know about.)

Yup, that's what I'm trying to say ;-) But it was just an example, I
believe that atm your helpers don't help for any shared modeset resources
at all.

> I think we do need to decide what partial state updates me in the
> context of modeset or pageflip.  I kinda think the right thing is
> different for modeset vs pageflip.  Maybe we want to define an atomic
> flag to mean "disable/discard everything else"..  at any rate, we only
> need to sort that before exposing the API to userspace.

Yeah, I still strongly support this split in the api itself. For i915 my
plan is to have separate configuration structures for modeset state and
pageflip/plane config state. When we do an atomic modeset we compute both
(maybe with some shortcut if we know that the pipe config didn't change at
all). Then comes the big switch:

- If we have the same pipe config, we simply to a vblank synce nuclear
  flip to the new config.

- If pipe configs change it will be much more elaborate:

  1. Do a vblank synced nuclear flip to black on all pipes that need to go
  off (whether they get disabled or reconfigured doesn't matter for now).

  2. Disable pipes.

  3. Commit new state on the sw side.

  4. Enable all pipes with the new config, but only scanning out black.

  5. Do a vblank-synced nuclear flip to the new config. This would also be
  the one that would signale the drm events that the atomic update
  completed.

  For fastboot we might need some hacks to fuse this all together, e.g for
  some panel fitter changes we don't need to disable the pipe completely.
  But that's the advanced stuff really.

I think modelling the crtc helpers after this model could work. But that
means that the crtc helpers and the nuclear flip atomic helpers for GO
bit capable hw need to be rather tightly integrated, while still allowing
drivers to override the nuclear flip parts.

> > 3. The core->driver interface should be powerful enough to support
> > insanity like i915, but no more. Which means all the code that's share
> > (i.e. the set_prop code I've been harping all over the place about) should
> > be done in the core, without any means for drivers to override. Currently
> > the drm core also takes care of a bunch of things like basic locking and
> > refcounting, and that's kinda the spirit for this. i915 is the obvious
> > benchmark here.
> 
> The more I think about it, the more I think we should leave set_prop
> as it is (although possibly with drm_atomic_state ->
> drm_{plane,crtc,etc}_state).
> 
> In the past, before primary plane, I really needed this.  And I expect
> having a convenient way to 'sniff' changing properties as they go by
> will be useful in some cases.

I actually really like the addition of the state object to ->set_prop. At
least for i915 (which already has a fair pile of additional properties)
that looks like the perfect way to prep the stage.

But at least for the transition we should keep the impact minimal. So no
manadatory callbacks and don't enforce the usage of the state object until
the drm core is fully converted to always follow a set_prop with a
->commit. Since currently we have internal mode_set calls in all i915
set_prop functions and we need to convert them all. But we can't do that
until all the core stuff uses the atomic interface for all legacy ioctl
ops.

> > I think we can roll this out piecemeal (and it's probably better to do it
> > that way), but I also think that until we've resolved the requirements of
> > 2&3 we should try to minimize subsystem wide changes as much as possible
> > by making them opt-in and the vfuncs optional.
> 
> the new mandatory vfuncs don't amount to too much churn.. not as much
> as the extra param for crtc lock/unlock fxns.  Maybe if I was planning
> on keeping this alive on a branch for so long in the beginning I would
> have arranged a few things slightly different, but meh.
> 
> A very early iteration of atomic preserved the old pageflip, setcrtc,
> setplane, etc paths for drivers not implementing atomic fxns.  But
> that was at least a bit ugly.  I like the current approach since the
> code paths are very much the same for atomic and non-atomic.

I'm not advertising for keeping the old driver interfaces, that would be
much worse. I'm only saying that we'll likely change this a few times, and
we will transition fairly slowly anyway. Reducing the overall churn and
especially reducing the amount of code we might need to fix up if we
change our minds seems like a good approach to me.
-Daniel
Rob Clark May 27, 2014, 6:48 p.m. UTC | #4
On Tue, May 27, 2014 at 1:50 PM, Daniel Vetter <daniel@ffwll.ch> wrote:
> On Tue, May 27, 2014 at 11:58:33AM -0400, Rob Clark wrote:
>> On Mon, May 26, 2014 at 1:54 PM, Daniel Vetter <daniel@ffwll.ch> wrote:
>> > Ok, I think I should have read your msm implementation a _lot_ earlier.
>> > Explains your desing choices neatly.
>> >
>> > Two observations:
>> >
>> > - A GO bit makes nuclear pageflips ridiculously easy to implement,
>> >   presuming the hardware actually works. And it's the sane model, so imo a
>> >   good one to wrap the atomic helpers around.
>> >
>> >   But reality is often a lot more ugly, especially if you're employed by
>> >   Intel. Which is why I'm harping so much on this helpers-vs-core
>> >   interface issues ... We really need the full state transition in one
>> >   piece to do anything useful.
>> >
>> > - msm doesn't have any resource sharing going on for modeset operations
>> >   (which I mean lighting up crtcs to feed pixel streams to connectors).
>> >   Which means the simplistic "loop over all crtcs and call the old
>> >   ->setcrtc" approach actually works.
>>
>> we do actually have some shared resources on mdp5 generation (the
>> "smp" blocks, basically a shared buffer which we need to allocate fifo
>> space from for each plane)..
>>
>> I'm mostly ignoring this at the moment, because we don't support
>> enough crtc's yet to run out of smp blocks.  But hooking in at the
>> current ->atomic_commit() would be enough for me, I think.  Tbh, it is
>> something that should be handled at the ->atomic_check() stage so I
>> hadn't given much though to ->atomic_commit() stage.
>>
>> That all said, I've no problem with adding one more hook, after
>> ->atomic_commit() and lock magic, before loop over planes/crtcs, so
>> drivers that need to can replace the loops and do their own thing.
>> Current state should be reasonably good for sane hw.  I'm just waiting
>> for patches for i915 to add whatever it needs.
>
> I don't think that will be enough for you. Example, slightly hypothetical:
>
> - Configuration A: crtc 0 displays a small screen, crtc 1 a giant screen.
>   Together they just barely fit into your fifo space.
>
> - Configuration B: crtc 0 drives a giant screen, crtc 1 a tiny one.
>   Presume different outputs here so that no implicit output stealing
>   happens upon mode switching.
>
> Atomic switch should work, but don't since if you just loop over crtcs you
> have the intermediate stage where you try to drive 2 giant screens, which
> will run out of fifo resources. And I think it's really common to have
> such implicit resource sharing, maybe except for the most beefy desktop
> gpu which simply can't run out of memory bw with today's screen.

Well, this situation is a bit problematic in other similar cases..
moving plane between crtc's which needs to wait on a vblank is another
vaguely similar case.  I was kinda thinking of punting on that one
(ie. -EBUSY and userspace tries again on next frame).  Maybe for
modeset that doesn't work out so well, since frame N+1 you'll still be
at configuration A and have the same problem.

Would be kinda nice if helpers could order things according to what
decreases resource requirements vs what increases.  Although we could
probably get a lot of mileage out of just making the 'atomic loop over
things helper' apply config for crtcs/planes which will be disabled
first, before the ones which will be enabled at the end of the commit.
 Hmm.

Either way, if you have to replace 'atomic loop over things helper'
with your own i915 specific thing, it shouldn't make much difference
to you.  And I don't really think we need to solve all the worlds
problems in the first version.  But seems like there could be some
value in making helpers a bit more aware of shared resource
constraints.

> So afaics you really need to push a bit of smarts into the crtc helpers to
> make atomic possible, which then leaves you with interaction issues
> between the atomic stuff for GO bit capable hw for plane updates and the
> modeset ordering hell.
>
>> >   The problem I see here is that if you have hardware with more elaborate
>> >   needs (e.g. shared dplls), but otherwise sanity for plane updates (i.e.
>> >   a GO bit) then your current atomic helpers will make it rather hard to
>> >   mix this. So I think we should pimp the crtc helpers a bit to be atomic
>> >   compliant (i.e. kill all outputs first before enabling anything new) and
>> >   try to integrate this with the atomic helpers used for GO bit style
>> >   updates.
>>
>> Not really, I don't think.  You can still do whatever shared resource
>> setup in ->atomic_commit().  For drivers which want to defer commit
>> (ie. doing commit after fb's ready from cpu, rather than from gpu), it
>> would probably be more convenient to split up atomic_commit() so
>> drivers have a post lock hook.  I think it is just a few line patch,
>> and I think that it probably isn't really needed until i915 is ready.
>> I'm pretty sure we can change this to do what i915 needs, while at the
>> same time keeping it simple for 'GO bit' drivers.
>>
>> The bit about crtc helpers, I'll have to think about.  I'm not sure
>> that we want to require that 'atomic' (modeset or pageflip) completely
>> *replaces* current state.  So disabling unrelated crtcs doesn't seem
>> like the right thing.  But we have some time to decide about the
>> semantics of an atomic modeset before we expose to userspace.
>
> I'm not talking about replacing unrelated crtcs. It's also not about
> underspecified state or about enabling/changing global resources.
>
> It is _only_ about ordering of the various operations: If both the
> current and the desired new configuration are at the limit of what the hw
> can do you can't switch to the new configuration by looping over all
> crtcs. The fact that this doesn't work is the entire point of atomic
> modesets. And if we have helpers which aren't cut out for the task at hand
> for any hw where we need to have atomic modesets to make such changes
> possible without userspace going nuts, then the helpers are imo simply not
> useful
>
>> >   i915 has dpll sharing on ivb/hsw, but doesn't use the the crtc helpers
>> >   anymore. But the radone eyefinity (or whatever the damn thing is called)
>> >   I have lying around here fits the bill: It has 5 DP+ outputs but only 2
>> >   dplls. So you can drive e.g. 3 DP screens and then switch to 2 HDMI
>> >   screens atomically and it should all pan out.
>> >
>> >   But since your current helpers just loop over all crtcs without any
>> >   regard to ordering constraints this will fall over if the 2 HDMI outputs
>> >   get enabled before the 3 DP outputs get disabled all disabled.
>>
>> the driver should have rejected the config before it gets to commit
>> stage, if it were an impossible config.
>
> The configuration _is_ possible. We simply have to be somewhat careful
> with transitioning to it, since not _all_ intermediate states are
> possible. Your current helpers presume that's the case, which afaik isn't
> the case on any hw where we have global limits. For modesets.
>
> Nuclear pageflips are a completely different thing, as long as your hw has
> a GO bit.
>
>> > So with all that in mind I have 3 sanity checks for the atomic interfaces
>> > and helpers:
>> >
>> > 1. The atomic helpers should make enabling sane hw with a GO bit easy for
>> > nuclear pageflips. Benchmark would be sane hw like msm or omap.
>> >
>> > 2. It should cooperate with the crtc helpers such that hw with some shared
>> > resources (like dplls) works for atomic modesets. That pretty much means
>> > "disable everything (including crtc/connetor pipelines that only changed
>> > some parts) before enabling anything". Benchmark would be a platform with
>> > shared dplls.
>>
>> btw, not something I'd thought about before, but shared dplls seem
>> common enough, that it is something core could help with.  (Assuming
>> they are all related to pixel clk and not some derived thing that core
>> wouldn't know about.)
>
> Yup, that's what I'm trying to say ;-) But it was just an example, I
> believe that atm your helpers don't help for any shared modeset resources
> at all.

no, not at all (other than the ww_mutex stuff which should be useful
for shared resources and more fine grained locking within driver).  It
wasn't really a goal.

But having some knowledge about shared resources seems like it could
make core helpers more useful when I get closer to exploiting the
limits of the hw I have..  I suspect i915 is just further down that
path than the other drivers.

>> I think we do need to decide what partial state updates me in the
>> context of modeset or pageflip.  I kinda think the right thing is
>> different for modeset vs pageflip.  Maybe we want to define an atomic
>> flag to mean "disable/discard everything else"..  at any rate, we only
>> need to sort that before exposing the API to userspace.
>
> Yeah, I still strongly support this split in the api itself. For i915 my
> plan is to have separate configuration structures for modeset state and
> pageflip/plane config state. When we do an atomic modeset we compute both
> (maybe with some shortcut if we know that the pipe config didn't change at
> all). Then comes the big switch:
>
> - If we have the same pipe config, we simply to a vblank synce nuclear
>   flip to the new config.
>
> - If pipe configs change it will be much more elaborate:
>
>   1. Do a vblank synced nuclear flip to black on all pipes that need to go
>   off (whether they get disabled or reconfigured doesn't matter for now).
>
>   2. Disable pipes.
>
>   3. Commit new state on the sw side.
>
>   4. Enable all pipes with the new config, but only scanning out black.
>
>   5. Do a vblank-synced nuclear flip to the new config. This would also be
>   the one that would signale the drm events that the atomic update
>   completed.
>
>   For fastboot we might need some hacks to fuse this all together, e.g for
>   some panel fitter changes we don't need to disable the pipe completely.
>   But that's the advanced stuff really.
>
> I think modelling the crtc helpers after this model could work. But that
> means that the crtc helpers and the nuclear flip atomic helpers for GO
> bit capable hw need to be rather tightly integrated, while still allowing
> drivers to override the nuclear flip parts.
>
>> > 3. The core->driver interface should be powerful enough to support
>> > insanity like i915, but no more. Which means all the code that's share
>> > (i.e. the set_prop code I've been harping all over the place about) should
>> > be done in the core, without any means for drivers to override. Currently
>> > the drm core also takes care of a bunch of things like basic locking and
>> > refcounting, and that's kinda the spirit for this. i915 is the obvious
>> > benchmark here.
>>
>> The more I think about it, the more I think we should leave set_prop
>> as it is (although possibly with drm_atomic_state ->
>> drm_{plane,crtc,etc}_state).
>>
>> In the past, before primary plane, I really needed this.  And I expect
>> having a convenient way to 'sniff' changing properties as they go by
>> will be useful in some cases.
>
> I actually really like the addition of the state object to ->set_prop. At
> least for i915 (which already has a fair pile of additional properties)
> that looks like the perfect way to prep the stage.
>
> But at least for the transition we should keep the impact minimal. So no
> manadatory callbacks and don't enforce the usage of the state object until
> the drm core is fully converted to always follow a set_prop with a
> ->commit. Since currently we have internal mode_set calls in all i915
> set_prop functions and we need to convert them all. But we can't do that
> until all the core stuff uses the atomic interface for all legacy ioctl
> ops.
>

fwiw, at least all set_prop ioctl stuff from core follows up with a
atomic_commit now.  There are one or two more places doing mode_set
un-atomically.  And I'm not sure if you call set_prop anywhere
internally from i915.

BR,
-R
Daniel Vetter May 27, 2014, 7:26 p.m. UTC | #5
On Tue, May 27, 2014 at 02:48:41PM -0400, Rob Clark wrote:
> On Tue, May 27, 2014 at 1:50 PM, Daniel Vetter <daniel@ffwll.ch> wrote:
> > On Tue, May 27, 2014 at 11:58:33AM -0400, Rob Clark wrote:
> >> On Mon, May 26, 2014 at 1:54 PM, Daniel Vetter <daniel@ffwll.ch> wrote:
> >> > Ok, I think I should have read your msm implementation a _lot_ earlier.
> >> > Explains your desing choices neatly.
> >> >
> >> > Two observations:
> >> >
> >> > - A GO bit makes nuclear pageflips ridiculously easy to implement,
> >> >   presuming the hardware actually works. And it's the sane model, so imo a
> >> >   good one to wrap the atomic helpers around.
> >> >
> >> >   But reality is often a lot more ugly, especially if you're employed by
> >> >   Intel. Which is why I'm harping so much on this helpers-vs-core
> >> >   interface issues ... We really need the full state transition in one
> >> >   piece to do anything useful.
> >> >
> >> > - msm doesn't have any resource sharing going on for modeset operations
> >> >   (which I mean lighting up crtcs to feed pixel streams to connectors).
> >> >   Which means the simplistic "loop over all crtcs and call the old
> >> >   ->setcrtc" approach actually works.
> >>
> >> we do actually have some shared resources on mdp5 generation (the
> >> "smp" blocks, basically a shared buffer which we need to allocate fifo
> >> space from for each plane)..
> >>
> >> I'm mostly ignoring this at the moment, because we don't support
> >> enough crtc's yet to run out of smp blocks.  But hooking in at the
> >> current ->atomic_commit() would be enough for me, I think.  Tbh, it is
> >> something that should be handled at the ->atomic_check() stage so I
> >> hadn't given much though to ->atomic_commit() stage.
> >>
> >> That all said, I've no problem with adding one more hook, after
> >> ->atomic_commit() and lock magic, before loop over planes/crtcs, so
> >> drivers that need to can replace the loops and do their own thing.
> >> Current state should be reasonably good for sane hw.  I'm just waiting
> >> for patches for i915 to add whatever it needs.
> >
> > I don't think that will be enough for you. Example, slightly hypothetical:
> >
> > - Configuration A: crtc 0 displays a small screen, crtc 1 a giant screen.
> >   Together they just barely fit into your fifo space.
> >
> > - Configuration B: crtc 0 drives a giant screen, crtc 1 a tiny one.
> >   Presume different outputs here so that no implicit output stealing
> >   happens upon mode switching.
> >
> > Atomic switch should work, but don't since if you just loop over crtcs you
> > have the intermediate stage where you try to drive 2 giant screens, which
> > will run out of fifo resources. And I think it's really common to have
> > such implicit resource sharing, maybe except for the most beefy desktop
> > gpu which simply can't run out of memory bw with today's screen.
> 
> Well, this situation is a bit problematic in other similar cases..
> moving plane between crtc's which needs to wait on a vblank is another
> vaguely similar case.  I was kinda thinking of punting on that one
> (ie. -EBUSY and userspace tries again on next frame).  Maybe for
> modeset that doesn't work out so well, since frame N+1 you'll still be
> at configuration A and have the same problem.
> 
> Would be kinda nice if helpers could order things according to what
> decreases resource requirements vs what increases.  Although we could
> probably get a lot of mileage out of just making the 'atomic loop over
> things helper' apply config for crtcs/planes which will be disabled
> first, before the ones which will be enabled at the end of the commit.
>  Hmm.

Yeah, that one should go a long way, but only for modeset changes. If we
do this for plane updates it will look _really_ bad ;-)

So for the "move plane from crtc to crtc" I think we need a separate step.
Presuming we don't have any modeset operations we should be able to group
all the plane updates per crtc. This ofc presumes a sane hw with GO bit.
Then the helper could figure out which crtc it needs to nuclear flip first
to be able to move the plane.

At least with the current atomic ioctl proposal we can't reject this with
-EBUSY since with a full modeset (which takes longer than one vblank
anyway) it's possible. Otoh we _should_ reject it when userspace expects a
vblank synced update.

Which is another reason for why I think we really should have a flag
somewhere for vblank synced atomic updates vs. atomic updates which take
longer than one vblank and might even involved a few msec of blank screens
for switching.

> Either way, if you have to replace 'atomic loop over things helper'
> with your own i915 specific thing, it shouldn't make much difference
> to you.  And I don't really think we need to solve all the worlds
> problems in the first version.  But seems like there could be some
> value in making helpers a bit more aware of shared resource
> constraints.

This isn't about i915, but about all the drivers using crtc helpers.
Correct ordering of the crtc helper hooks should pretty much solve this,
without the need to track global resources (i.e. disable everything before
we start enabling). At least for modeset-like atomic updates.

i915 will roll their own, but not because atomic updates aren't possible
with the crtc helpers, but because the crtc helpers are inadequate for our
hw. For modeset updates atomic or not doesn't factor in here.

And imo if we can make the crtc helpers work, we should do that. Otherwise
there won't be a whole lot of use behind the atomic modeset updates imo.

> > So afaics you really need to push a bit of smarts into the crtc helpers to
> > make atomic possible, which then leaves you with interaction issues
> > between the atomic stuff for GO bit capable hw for plane updates and the
> > modeset ordering hell.
> >
> >> >   The problem I see here is that if you have hardware with more elaborate
> >> >   needs (e.g. shared dplls), but otherwise sanity for plane updates (i.e.
> >> >   a GO bit) then your current atomic helpers will make it rather hard to
> >> >   mix this. So I think we should pimp the crtc helpers a bit to be atomic
> >> >   compliant (i.e. kill all outputs first before enabling anything new) and
> >> >   try to integrate this with the atomic helpers used for GO bit style
> >> >   updates.
> >>
> >> Not really, I don't think.  You can still do whatever shared resource
> >> setup in ->atomic_commit().  For drivers which want to defer commit
> >> (ie. doing commit after fb's ready from cpu, rather than from gpu), it
> >> would probably be more convenient to split up atomic_commit() so
> >> drivers have a post lock hook.  I think it is just a few line patch,
> >> and I think that it probably isn't really needed until i915 is ready.
> >> I'm pretty sure we can change this to do what i915 needs, while at the
> >> same time keeping it simple for 'GO bit' drivers.
> >>
> >> The bit about crtc helpers, I'll have to think about.  I'm not sure
> >> that we want to require that 'atomic' (modeset or pageflip) completely
> >> *replaces* current state.  So disabling unrelated crtcs doesn't seem
> >> like the right thing.  But we have some time to decide about the
> >> semantics of an atomic modeset before we expose to userspace.
> >
> > I'm not talking about replacing unrelated crtcs. It's also not about
> > underspecified state or about enabling/changing global resources.
> >
> > It is _only_ about ordering of the various operations: If both the
> > current and the desired new configuration are at the limit of what the hw
> > can do you can't switch to the new configuration by looping over all
> > crtcs. The fact that this doesn't work is the entire point of atomic
> > modesets. And if we have helpers which aren't cut out for the task at hand
> > for any hw where we need to have atomic modesets to make such changes
> > possible without userspace going nuts, then the helpers are imo simply not
> > useful
> >
> >> >   i915 has dpll sharing on ivb/hsw, but doesn't use the the crtc helpers
> >> >   anymore. But the radone eyefinity (or whatever the damn thing is called)
> >> >   I have lying around here fits the bill: It has 5 DP+ outputs but only 2
> >> >   dplls. So you can drive e.g. 3 DP screens and then switch to 2 HDMI
> >> >   screens atomically and it should all pan out.
> >> >
> >> >   But since your current helpers just loop over all crtcs without any
> >> >   regard to ordering constraints this will fall over if the 2 HDMI outputs
> >> >   get enabled before the 3 DP outputs get disabled all disabled.
> >>
> >> the driver should have rejected the config before it gets to commit
> >> stage, if it were an impossible config.
> >
> > The configuration _is_ possible. We simply have to be somewhat careful
> > with transitioning to it, since not _all_ intermediate states are
> > possible. Your current helpers presume that's the case, which afaik isn't
> > the case on any hw where we have global limits. For modesets.
> >
> > Nuclear pageflips are a completely different thing, as long as your hw has
> > a GO bit.
> >
> >> > So with all that in mind I have 3 sanity checks for the atomic interfaces
> >> > and helpers:
> >> >
> >> > 1. The atomic helpers should make enabling sane hw with a GO bit easy for
> >> > nuclear pageflips. Benchmark would be sane hw like msm or omap.
> >> >
> >> > 2. It should cooperate with the crtc helpers such that hw with some shared
> >> > resources (like dplls) works for atomic modesets. That pretty much means
> >> > "disable everything (including crtc/connetor pipelines that only changed
> >> > some parts) before enabling anything". Benchmark would be a platform with
> >> > shared dplls.
> >>
> >> btw, not something I'd thought about before, but shared dplls seem
> >> common enough, that it is something core could help with.  (Assuming
> >> they are all related to pixel clk and not some derived thing that core
> >> wouldn't know about.)
> >
> > Yup, that's what I'm trying to say ;-) But it was just an example, I
> > believe that atm your helpers don't help for any shared modeset resources
> > at all.
> 
> no, not at all (other than the ww_mutex stuff which should be useful
> for shared resources and more fine grained locking within driver).  It
> wasn't really a goal.
> 
> But having some knowledge about shared resources seems like it could
> make core helpers more useful when I get closer to exploiting the
> limits of the hw I have..  I suspect i915 is just further down that
> path than the other drivers.

I'm repeating myself, but simply ordering updates correctly should already
solve it. At least if the driver provides checks to make sure the new
config doesn't go over limits (e.g. by counting plls or required fifo
space). If we don't have that, the helpers are imo not sufficiently
validated as generally useful. And I have seen _way_ too much single use
code in the drm core from the old ums/dri1 days.

> >> I think we do need to decide what partial state updates me in the
> >> context of modeset or pageflip.  I kinda think the right thing is
> >> different for modeset vs pageflip.  Maybe we want to define an atomic
> >> flag to mean "disable/discard everything else"..  at any rate, we only
> >> need to sort that before exposing the API to userspace.
> >
> > Yeah, I still strongly support this split in the api itself. For i915 my
> > plan is to have separate configuration structures for modeset state and
> > pageflip/plane config state. When we do an atomic modeset we compute both
> > (maybe with some shortcut if we know that the pipe config didn't change at
> > all). Then comes the big switch:
> >
> > - If we have the same pipe config, we simply to a vblank synce nuclear
> >   flip to the new config.
> >
> > - If pipe configs change it will be much more elaborate:
> >
> >   1. Do a vblank synced nuclear flip to black on all pipes that need to go
> >   off (whether they get disabled or reconfigured doesn't matter for now).
> >
> >   2. Disable pipes.
> >
> >   3. Commit new state on the sw side.
> >
> >   4. Enable all pipes with the new config, but only scanning out black.
> >
> >   5. Do a vblank-synced nuclear flip to the new config. This would also be
> >   the one that would signale the drm events that the atomic update
> >   completed.
> >
> >   For fastboot we might need some hacks to fuse this all together, e.g for
> >   some panel fitter changes we don't need to disable the pipe completely.
> >   But that's the advanced stuff really.
> >
> > I think modelling the crtc helpers after this model could work. But that
> > means that the crtc helpers and the nuclear flip atomic helpers for GO
> > bit capable hw need to be rather tightly integrated, while still allowing
> > drivers to override the nuclear flip parts.
> >
> >> > 3. The core->driver interface should be powerful enough to support
> >> > insanity like i915, but no more. Which means all the code that's share
> >> > (i.e. the set_prop code I've been harping all over the place about) should
> >> > be done in the core, without any means for drivers to override. Currently
> >> > the drm core also takes care of a bunch of things like basic locking and
> >> > refcounting, and that's kinda the spirit for this. i915 is the obvious
> >> > benchmark here.
> >>
> >> The more I think about it, the more I think we should leave set_prop
> >> as it is (although possibly with drm_atomic_state ->
> >> drm_{plane,crtc,etc}_state).
> >>
> >> In the past, before primary plane, I really needed this.  And I expect
> >> having a convenient way to 'sniff' changing properties as they go by
> >> will be useful in some cases.
> >
> > I actually really like the addition of the state object to ->set_prop. At
> > least for i915 (which already has a fair pile of additional properties)
> > that looks like the perfect way to prep the stage.
> >
> > But at least for the transition we should keep the impact minimal. So no
> > manadatory callbacks and don't enforce the usage of the state object until
> > the drm core is fully converted to always follow a set_prop with a
> > ->commit. Since currently we have internal mode_set calls in all i915
> > set_prop functions and we need to convert them all. But we can't do that
> > until all the core stuff uses the atomic interface for all legacy ioctl
> > ops.
> >
> 
> fwiw, at least all set_prop ioctl stuff from core follows up with a
> atomic_commit now.  There are one or two more places doing mode_set
> un-atomically.  And I'm not sure if you call set_prop anywhere
> internally from i915.

We do modesets internally, but as long as those aren't externally visible
(which they shouldn't be if we grab locks before checking the state) it
should be all fine. Also from my pov the ->set_prop stuff is just
interface to construct the in-kernel representation of the desired config.
The real magic happens in the check/commit hooks (which is the same level
i915-internal modeset changes happens at).

I think one excellent use-case we get for free (almost) without the ioctl
would be fbcon. It very much wants to do an atomic update, so converting
that over to the atomic interface would be good imo.
-Daniel
Rob Clark May 27, 2014, 8:06 p.m. UTC | #6
On Tue, May 27, 2014 at 3:26 PM, Daniel Vetter <daniel@ffwll.ch> wrote:
> On Tue, May 27, 2014 at 02:48:41PM -0400, Rob Clark wrote:
>> On Tue, May 27, 2014 at 1:50 PM, Daniel Vetter <daniel@ffwll.ch> wrote:
>> > On Tue, May 27, 2014 at 11:58:33AM -0400, Rob Clark wrote:
>> >> On Mon, May 26, 2014 at 1:54 PM, Daniel Vetter <daniel@ffwll.ch> wrote:
>> >> > Ok, I think I should have read your msm implementation a _lot_ earlier.
>> >> > Explains your desing choices neatly.
>> >> >
>> >> > Two observations:
>> >> >
>> >> > - A GO bit makes nuclear pageflips ridiculously easy to implement,
>> >> >   presuming the hardware actually works. And it's the sane model, so imo a
>> >> >   good one to wrap the atomic helpers around.
>> >> >
>> >> >   But reality is often a lot more ugly, especially if you're employed by
>> >> >   Intel. Which is why I'm harping so much on this helpers-vs-core
>> >> >   interface issues ... We really need the full state transition in one
>> >> >   piece to do anything useful.
>> >> >
>> >> > - msm doesn't have any resource sharing going on for modeset operations
>> >> >   (which I mean lighting up crtcs to feed pixel streams to connectors).
>> >> >   Which means the simplistic "loop over all crtcs and call the old
>> >> >   ->setcrtc" approach actually works.
>> >>
>> >> we do actually have some shared resources on mdp5 generation (the
>> >> "smp" blocks, basically a shared buffer which we need to allocate fifo
>> >> space from for each plane)..
>> >>
>> >> I'm mostly ignoring this at the moment, because we don't support
>> >> enough crtc's yet to run out of smp blocks.  But hooking in at the
>> >> current ->atomic_commit() would be enough for me, I think.  Tbh, it is
>> >> something that should be handled at the ->atomic_check() stage so I
>> >> hadn't given much though to ->atomic_commit() stage.
>> >>
>> >> That all said, I've no problem with adding one more hook, after
>> >> ->atomic_commit() and lock magic, before loop over planes/crtcs, so
>> >> drivers that need to can replace the loops and do their own thing.
>> >> Current state should be reasonably good for sane hw.  I'm just waiting
>> >> for patches for i915 to add whatever it needs.
>> >
>> > I don't think that will be enough for you. Example, slightly hypothetical:
>> >
>> > - Configuration A: crtc 0 displays a small screen, crtc 1 a giant screen.
>> >   Together they just barely fit into your fifo space.
>> >
>> > - Configuration B: crtc 0 drives a giant screen, crtc 1 a tiny one.
>> >   Presume different outputs here so that no implicit output stealing
>> >   happens upon mode switching.
>> >
>> > Atomic switch should work, but don't since if you just loop over crtcs you
>> > have the intermediate stage where you try to drive 2 giant screens, which
>> > will run out of fifo resources. And I think it's really common to have
>> > such implicit resource sharing, maybe except for the most beefy desktop
>> > gpu which simply can't run out of memory bw with today's screen.
>>
>> Well, this situation is a bit problematic in other similar cases..
>> moving plane between crtc's which needs to wait on a vblank is another
>> vaguely similar case.  I was kinda thinking of punting on that one
>> (ie. -EBUSY and userspace tries again on next frame).  Maybe for
>> modeset that doesn't work out so well, since frame N+1 you'll still be
>> at configuration A and have the same problem.
>>
>> Would be kinda nice if helpers could order things according to what
>> decreases resource requirements vs what increases.  Although we could
>> probably get a lot of mileage out of just making the 'atomic loop over
>> things helper' apply config for crtcs/planes which will be disabled
>> first, before the ones which will be enabled at the end of the commit.
>>  Hmm.
>
> Yeah, that one should go a long way, but only for modeset changes. If we
> do this for plane updates it will look _really_ bad ;-)
>
> So for the "move plane from crtc to crtc" I think we need a separate step.
> Presuming we don't have any modeset operations we should be able to group
> all the plane updates per crtc. This ofc presumes a sane hw with GO bit.
> Then the helper could figure out which crtc it needs to nuclear flip first
> to be able to move the plane.
>
> At least with the current atomic ioctl proposal we can't reject this with
> -EBUSY since with a full modeset (which takes longer than one vblank
> anyway) it's possible. Otoh we _should_ reject it when userspace expects a
> vblank synced update.
>
> Which is another reason for why I think we really should have a flag
> somewhere for vblank synced atomic updates vs. atomic updates which take
> longer than one vblank and might even involved a few msec of blank screens
> for switching.

Well, there was the NONBLOCK atomic flag.. I'm not entirely sure if we
should hang so much off of that one flag.

>> Either way, if you have to replace 'atomic loop over things helper'
>> with your own i915 specific thing, it shouldn't make much difference
>> to you.  And I don't really think we need to solve all the worlds
>> problems in the first version.  But seems like there could be some
>> value in making helpers a bit more aware of shared resource
>> constraints.
>
> This isn't about i915, but about all the drivers using crtc helpers.
> Correct ordering of the crtc helper hooks should pretty much solve this,
> without the need to track global resources (i.e. disable everything before
> we start enabling). At least for modeset-like atomic updates.
>
> i915 will roll their own, but not because atomic updates aren't possible
> with the crtc helpers, but because the crtc helpers are inadequate for our
> hw. For modeset updates atomic or not doesn't factor in here.
>
> And imo if we can make the crtc helpers work, we should do that. Otherwise
> there won't be a whole lot of use behind the atomic modeset updates imo.
>
>> > So afaics you really need to push a bit of smarts into the crtc helpers to
>> > make atomic possible, which then leaves you with interaction issues
>> > between the atomic stuff for GO bit capable hw for plane updates and the
>> > modeset ordering hell.
>> >
>> >> >   The problem I see here is that if you have hardware with more elaborate
>> >> >   needs (e.g. shared dplls), but otherwise sanity for plane updates (i.e.
>> >> >   a GO bit) then your current atomic helpers will make it rather hard to
>> >> >   mix this. So I think we should pimp the crtc helpers a bit to be atomic
>> >> >   compliant (i.e. kill all outputs first before enabling anything new) and
>> >> >   try to integrate this with the atomic helpers used for GO bit style
>> >> >   updates.
>> >>
>> >> Not really, I don't think.  You can still do whatever shared resource
>> >> setup in ->atomic_commit().  For drivers which want to defer commit
>> >> (ie. doing commit after fb's ready from cpu, rather than from gpu), it
>> >> would probably be more convenient to split up atomic_commit() so
>> >> drivers have a post lock hook.  I think it is just a few line patch,
>> >> and I think that it probably isn't really needed until i915 is ready.
>> >> I'm pretty sure we can change this to do what i915 needs, while at the
>> >> same time keeping it simple for 'GO bit' drivers.
>> >>
>> >> The bit about crtc helpers, I'll have to think about.  I'm not sure
>> >> that we want to require that 'atomic' (modeset or pageflip) completely
>> >> *replaces* current state.  So disabling unrelated crtcs doesn't seem
>> >> like the right thing.  But we have some time to decide about the
>> >> semantics of an atomic modeset before we expose to userspace.
>> >
>> > I'm not talking about replacing unrelated crtcs. It's also not about
>> > underspecified state or about enabling/changing global resources.
>> >
>> > It is _only_ about ordering of the various operations: If both the
>> > current and the desired new configuration are at the limit of what the hw
>> > can do you can't switch to the new configuration by looping over all
>> > crtcs. The fact that this doesn't work is the entire point of atomic
>> > modesets. And if we have helpers which aren't cut out for the task at hand
>> > for any hw where we need to have atomic modesets to make such changes
>> > possible without userspace going nuts, then the helpers are imo simply not
>> > useful
>> >
>> >> >   i915 has dpll sharing on ivb/hsw, but doesn't use the the crtc helpers
>> >> >   anymore. But the radone eyefinity (or whatever the damn thing is called)
>> >> >   I have lying around here fits the bill: It has 5 DP+ outputs but only 2
>> >> >   dplls. So you can drive e.g. 3 DP screens and then switch to 2 HDMI
>> >> >   screens atomically and it should all pan out.
>> >> >
>> >> >   But since your current helpers just loop over all crtcs without any
>> >> >   regard to ordering constraints this will fall over if the 2 HDMI outputs
>> >> >   get enabled before the 3 DP outputs get disabled all disabled.
>> >>
>> >> the driver should have rejected the config before it gets to commit
>> >> stage, if it were an impossible config.
>> >
>> > The configuration _is_ possible. We simply have to be somewhat careful
>> > with transitioning to it, since not _all_ intermediate states are
>> > possible. Your current helpers presume that's the case, which afaik isn't
>> > the case on any hw where we have global limits. For modesets.
>> >
>> > Nuclear pageflips are a completely different thing, as long as your hw has
>> > a GO bit.
>> >
>> >> > So with all that in mind I have 3 sanity checks for the atomic interfaces
>> >> > and helpers:
>> >> >
>> >> > 1. The atomic helpers should make enabling sane hw with a GO bit easy for
>> >> > nuclear pageflips. Benchmark would be sane hw like msm or omap.
>> >> >
>> >> > 2. It should cooperate with the crtc helpers such that hw with some shared
>> >> > resources (like dplls) works for atomic modesets. That pretty much means
>> >> > "disable everything (including crtc/connetor pipelines that only changed
>> >> > some parts) before enabling anything". Benchmark would be a platform with
>> >> > shared dplls.
>> >>
>> >> btw, not something I'd thought about before, but shared dplls seem
>> >> common enough, that it is something core could help with.  (Assuming
>> >> they are all related to pixel clk and not some derived thing that core
>> >> wouldn't know about.)
>> >
>> > Yup, that's what I'm trying to say ;-) But it was just an example, I
>> > believe that atm your helpers don't help for any shared modeset resources
>> > at all.
>>
>> no, not at all (other than the ww_mutex stuff which should be useful
>> for shared resources and more fine grained locking within driver).  It
>> wasn't really a goal.
>>
>> But having some knowledge about shared resources seems like it could
>> make core helpers more useful when I get closer to exploiting the
>> limits of the hw I have..  I suspect i915 is just further down that
>> path than the other drivers.
>
> I'm repeating myself, but simply ordering updates correctly should already
> solve it. At least if the driver provides checks to make sure the new
> config doesn't go over limits (e.g. by counting plls or required fifo
> space). If we don't have that, the helpers are imo not sufficiently
> validated as generally useful. And I have seen _way_ too much single use
> code in the drm core from the old ums/dri1 days.
>
>> >> I think we do need to decide what partial state updates me in the
>> >> context of modeset or pageflip.  I kinda think the right thing is
>> >> different for modeset vs pageflip.  Maybe we want to define an atomic
>> >> flag to mean "disable/discard everything else"..  at any rate, we only
>> >> need to sort that before exposing the API to userspace.
>> >
>> > Yeah, I still strongly support this split in the api itself. For i915 my
>> > plan is to have separate configuration structures for modeset state and
>> > pageflip/plane config state. When we do an atomic modeset we compute both
>> > (maybe with some shortcut if we know that the pipe config didn't change at
>> > all). Then comes the big switch:
>> >
>> > - If we have the same pipe config, we simply to a vblank synce nuclear
>> >   flip to the new config.
>> >
>> > - If pipe configs change it will be much more elaborate:
>> >
>> >   1. Do a vblank synced nuclear flip to black on all pipes that need to go
>> >   off (whether they get disabled or reconfigured doesn't matter for now).
>> >
>> >   2. Disable pipes.
>> >
>> >   3. Commit new state on the sw side.
>> >
>> >   4. Enable all pipes with the new config, but only scanning out black.
>> >
>> >   5. Do a vblank-synced nuclear flip to the new config. This would also be
>> >   the one that would signale the drm events that the atomic update
>> >   completed.
>> >
>> >   For fastboot we might need some hacks to fuse this all together, e.g for
>> >   some panel fitter changes we don't need to disable the pipe completely.
>> >   But that's the advanced stuff really.
>> >
>> > I think modelling the crtc helpers after this model could work. But that
>> > means that the crtc helpers and the nuclear flip atomic helpers for GO
>> > bit capable hw need to be rather tightly integrated, while still allowing
>> > drivers to override the nuclear flip parts.
>> >
>> >> > 3. The core->driver interface should be powerful enough to support
>> >> > insanity like i915, but no more. Which means all the code that's share
>> >> > (i.e. the set_prop code I've been harping all over the place about) should
>> >> > be done in the core, without any means for drivers to override. Currently
>> >> > the drm core also takes care of a bunch of things like basic locking and
>> >> > refcounting, and that's kinda the spirit for this. i915 is the obvious
>> >> > benchmark here.
>> >>
>> >> The more I think about it, the more I think we should leave set_prop
>> >> as it is (although possibly with drm_atomic_state ->
>> >> drm_{plane,crtc,etc}_state).
>> >>
>> >> In the past, before primary plane, I really needed this.  And I expect
>> >> having a convenient way to 'sniff' changing properties as they go by
>> >> will be useful in some cases.
>> >
>> > I actually really like the addition of the state object to ->set_prop. At
>> > least for i915 (which already has a fair pile of additional properties)
>> > that looks like the perfect way to prep the stage.
>> >
>> > But at least for the transition we should keep the impact minimal. So no
>> > manadatory callbacks and don't enforce the usage of the state object until
>> > the drm core is fully converted to always follow a set_prop with a
>> > ->commit. Since currently we have internal mode_set calls in all i915
>> > set_prop functions and we need to convert them all. But we can't do that
>> > until all the core stuff uses the atomic interface for all legacy ioctl
>> > ops.
>> >
>>
>> fwiw, at least all set_prop ioctl stuff from core follows up with a
>> atomic_commit now.  There are one or two more places doing mode_set
>> un-atomically.  And I'm not sure if you call set_prop anywhere
>> internally from i915.
>
> We do modesets internally, but as long as those aren't externally visible
> (which they shouldn't be if we grab locks before checking the state) it
> should be all fine. Also from my pov the ->set_prop stuff is just
> interface to construct the in-kernel representation of the desired config.
> The real magic happens in the check/commit hooks (which is the same level
> i915-internal modeset changes happens at).
>
> I think one excellent use-case we get for free (almost) without the ioctl
> would be fbcon. It very much wants to do an atomic update, so converting
> that over to the atomic interface would be good imo.

Yes, iirc the remaining non-atomic paths are fbcon related.  In
principle it should be a simple matter to increment the refcnt on
fbcon state object and re-apply it.  Although at the moment we keep
track of *how* to apply the state (ie. page_flip vs set_config, etc)
as the state object is built up.. which isn't very conducive to
re-committing an existing state object.  Which is part of the reason I
wanted to deprecate the various existing
->page_flip/->update_plane/->set_config/etc and introduce per object
->commit()'s.  (Which could either be called by helpers, or called
internally by driver or completely ignored by driver)

I've been a bit reluctant so far to do too much additional refactoring
on top of atomic, since I'm about at the limit of what I have time to
repeatedly rebase each kernel version.  This is why I'm a bit anxious
to start merging some of atomic, even if it doesn't do absolutely
everything yet.

BR,
-R

> -Daniel
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> +41 (0) 79 365 57 48 - http://blog.ffwll.ch
Daniel Vetter May 27, 2014, 10:09 p.m. UTC | #7
On Tue, May 27, 2014 at 04:06:28PM -0400, Rob Clark wrote:
> On Tue, May 27, 2014 at 3:26 PM, Daniel Vetter <daniel@ffwll.ch> wrote:

[snip]

> Well, there was the NONBLOCK atomic flag.. I'm not entirely sure if we
> should hang so much off of that one flag.

Yeah, a separate VBLANK_SYNCED might be useful. Apparently people also
want non-blocking modesets.

[snip]

> > I think one excellent use-case we get for free (almost) without the ioctl
> > would be fbcon. It very much wants to do an atomic update, so converting
> > that over to the atomic interface would be good imo.
> 
> Yes, iirc the remaining non-atomic paths are fbcon related.  In
> principle it should be a simple matter to increment the refcnt on
> fbcon state object and re-apply it.  Although at the moment we keep
> track of *how* to apply the state (ie. page_flip vs set_config, etc)
> as the state object is built up.. which isn't very conducive to
> re-committing an existing state object.  Which is part of the reason I
> wanted to deprecate the various existing
> ->page_flip/->update_plane/->set_config/etc and introduce per object
> ->commit()'s.  (Which could either be called by helpers, or called
> internally by driver or completely ignored by driver)

Yeah, I think the approach in here with a few helpers to bend atomic
->commit to the old hooks (somewhat-ish) is good. And with the crtc
helpers we should be able to move most drivers away from the old hooks
quickly. The exception will be pageflips/cursors, but that requires a lot
of driver-specific work, and probably first a full conversion to universal
planes (which atm don't support everything even for the primary plane due
to the arbitrary restriction to rgbx8888).

> I've been a bit reluctant so far to do too much additional refactoring
> on top of atomic, since I'm about at the limit of what I have time to
> repeatedly rebase each kernel version.  This is why I'm a bit anxious
> to start merging some of atomic, even if it doesn't do absolutely
> everything yet.

I understand that, which is why I suggested a bunch of things to split out
already so we can get them in.

On top of that I think with the split-up mode_config.mutex like I've just
proposed in an RFC we have a clear path for the locking issues, too. So
could go ahead an merge the w/w conversion, too.

That leaves the set_prop refactoring which is still under discussion.
Those three pieces hopefully help a lot with reducing rebasing pain.

On top of that I think we should look at cutting away functional pieces of
the conversion, e.g. ignore planes at first and only look at atomic
modeset. Or ignore atomic modesets and only look at plane updates,
rejecting anything that changes connectors or crtcs. By cutting out slices
we should be able to get patches into shape step-by-step.
-Daniel
Rob Clark May 27, 2014, 11:32 p.m. UTC | #8
On Tue, May 27, 2014 at 6:09 PM, Daniel Vetter <daniel@ffwll.ch> wrote:
> On Tue, May 27, 2014 at 04:06:28PM -0400, Rob Clark wrote:
>> On Tue, May 27, 2014 at 3:26 PM, Daniel Vetter <daniel@ffwll.ch> wrote:
>
> [snip]
>
>> Well, there was the NONBLOCK atomic flag.. I'm not entirely sure if we
>> should hang so much off of that one flag.
>
> Yeah, a separate VBLANK_SYNCED might be useful. Apparently people also
> want non-blocking modesets.

random-diverging-from-original-topic-thought.. seems like userspace
just wants a deadline/timeout (hopefully in units of vblanks).. at
least to the level of "I want this to happen on the next vblank (after
rendering completes), or give me an error" vs "I want this to complete
atomically even if it takes a few extra vblanks for things to sort
out"..

I guess that amounts to what you mean by VBLANK_SYNCED flag, but
VBLANKED_SYNCED might not be a good name.. at least for some hw all
you can do is vblank sync'd..

BR,
-R
Rob Clark May 27, 2014, 11:47 p.m. UTC | #9
On Tue, May 27, 2014 at 6:09 PM, Daniel Vetter <daniel@ffwll.ch> wrote:
> On Tue, May 27, 2014 at 04:06:28PM -0400, Rob Clark wrote:
>> On Tue, May 27, 2014 at 3:26 PM, Daniel Vetter <daniel@ffwll.ch> wrote:
>
> [snip]
>
>> Well, there was the NONBLOCK atomic flag.. I'm not entirely sure if we
>> should hang so much off of that one flag.
>
> Yeah, a separate VBLANK_SYNCED might be useful. Apparently people also
> want non-blocking modesets.
>
> [snip]
>
>> > I think one excellent use-case we get for free (almost) without the ioctl
>> > would be fbcon. It very much wants to do an atomic update, so converting
>> > that over to the atomic interface would be good imo.
>>
>> Yes, iirc the remaining non-atomic paths are fbcon related.  In
>> principle it should be a simple matter to increment the refcnt on
>> fbcon state object and re-apply it.  Although at the moment we keep
>> track of *how* to apply the state (ie. page_flip vs set_config, etc)
>> as the state object is built up.. which isn't very conducive to
>> re-committing an existing state object.  Which is part of the reason I
>> wanted to deprecate the various existing
>> ->page_flip/->update_plane/->set_config/etc and introduce per object
>> ->commit()'s.  (Which could either be called by helpers, or called
>> internally by driver or completely ignored by driver)
>
> Yeah, I think the approach in here with a few helpers to bend atomic
> ->commit to the old hooks (somewhat-ish) is good. And with the crtc
> helpers we should be able to move most drivers away from the old hooks
> quickly. The exception will be pageflips/cursors, but that requires a lot
> of driver-specific work, and probably first a full conversion to universal
> planes (which atm don't support everything even for the primary plane due
> to the arbitrary restriction to rgbx8888).

btw, I guess/hope most SoC drivers (ie. the ones that want to do crazy
things with overlays) would be moving rapidly away from using primary
plane helpers.  I think native primary planes fits better most of the
non-trivial mobile display controller blocks..

>> I've been a bit reluctant so far to do too much additional refactoring
>> on top of atomic, since I'm about at the limit of what I have time to
>> repeatedly rebase each kernel version.  This is why I'm a bit anxious
>> to start merging some of atomic, even if it doesn't do absolutely
>> everything yet.
>
> I understand that, which is why I suggested a bunch of things to split out
> already so we can get them in.
>
> On top of that I think with the split-up mode_config.mutex like I've just
> proposed in an RFC we have a clear path for the locking issues, too. So
> could go ahead an merge the w/w conversion, too.

I suppose now that I split modeset acquire ctx out into it's own thing
(originally it was inline w/ drm_atomic_state), I could reshuffle the
patches and move ww_mutex stuff below atomic.. will take a bit of
patch surgery since some parts of that patch would have to move to
later patches, but I can sort it out

> That leaves the set_prop refactoring which is still under discussion.
> Those three pieces hopefully help a lot with reducing rebasing pain.
>
> On top of that I think we should look at cutting away functional pieces of
> the conversion, e.g. ignore planes at first and only look at atomic
> modeset. Or ignore atomic modesets and only look at plane updates,
> rejecting anything that changes connectors or crtcs. By cutting out slices
> we should be able to get patches into shape step-by-step.

hmm, well, not sure how much value it is splitting up like that.  I
suppose the only-plane approach would sidestep/punt some issues.  But
once locking and set_prop are in place / agreed, it isn't that much
more from a churn perspective.

tbh, I think the worse problems are when we actually expose atomic
ioctl to userspace.  Right now we are rather restricted in the updates
triggered via atomic in that there are fixed well defined entry points
to atomic.  Ie. only pageflip() sets _NONBLOCK flag, and it does not
set mode property, so you won't ever see a _NONBLOCK modeset.

BR,
-R

> -Daniel
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> +41 (0) 79 365 57 48 - http://blog.ffwll.ch
Daniel Vetter May 28, 2014, 1:21 p.m. UTC | #10
On Tue, May 27, 2014 at 07:32:46PM -0400, Rob Clark wrote:
> On Tue, May 27, 2014 at 6:09 PM, Daniel Vetter <daniel@ffwll.ch> wrote:
> > On Tue, May 27, 2014 at 04:06:28PM -0400, Rob Clark wrote:
> >> On Tue, May 27, 2014 at 3:26 PM, Daniel Vetter <daniel@ffwll.ch> wrote:
> >
> > [snip]
> >
> >> Well, there was the NONBLOCK atomic flag.. I'm not entirely sure if we
> >> should hang so much off of that one flag.
> >
> > Yeah, a separate VBLANK_SYNCED might be useful. Apparently people also
> > want non-blocking modesets.
> 
> random-diverging-from-original-topic-thought.. seems like userspace
> just wants a deadline/timeout (hopefully in units of vblanks).. at
> least to the level of "I want this to happen on the next vblank (after
> rendering completes), or give me an error" vs "I want this to complete
> atomically even if it takes a few extra vblanks for things to sort
> out"..
> 
> I guess that amounts to what you mean by VBLANK_SYNCED flag, but
> VBLANKED_SYNCED might not be a good name.. at least for some hw all
> you can do is vblank sync'd..

Hm, we might as well go full monty and allow userspace to request a
specific vblank counter. Would be useful for e.g. queuing up a bunch of
video frames, which some hw can do. But that would then require cancelling
of existing flips, so I guess for now a simple VBLANK_SYNCED flag which
emulates pageflip behaviour should be good enough.

That would also be useful if userspace attempts an atomic update on
drivers which only support atomic modeset (through the crtc helpers), but
can't do vblank synced updates. Then they could easily reject those. After
all current drivers also often lack a pageflip implementation ...
-Daniel
Daniel Vetter May 28, 2014, 1:32 p.m. UTC | #11
On Tue, May 27, 2014 at 07:47:42PM -0400, Rob Clark wrote:
> On Tue, May 27, 2014 at 6:09 PM, Daniel Vetter <daniel@ffwll.ch> wrote:
> > On Tue, May 27, 2014 at 04:06:28PM -0400, Rob Clark wrote:
> >> On Tue, May 27, 2014 at 3:26 PM, Daniel Vetter <daniel@ffwll.ch> wrote:
> >
> > [snip]
> >
> >> Well, there was the NONBLOCK atomic flag.. I'm not entirely sure if we
> >> should hang so much off of that one flag.
> >
> > Yeah, a separate VBLANK_SYNCED might be useful. Apparently people also
> > want non-blocking modesets.
> >
> > [snip]
> >
> >> > I think one excellent use-case we get for free (almost) without the ioctl
> >> > would be fbcon. It very much wants to do an atomic update, so converting
> >> > that over to the atomic interface would be good imo.
> >>
> >> Yes, iirc the remaining non-atomic paths are fbcon related.  In
> >> principle it should be a simple matter to increment the refcnt on
> >> fbcon state object and re-apply it.  Although at the moment we keep
> >> track of *how* to apply the state (ie. page_flip vs set_config, etc)
> >> as the state object is built up.. which isn't very conducive to
> >> re-committing an existing state object.  Which is part of the reason I
> >> wanted to deprecate the various existing
> >> ->page_flip/->update_plane/->set_config/etc and introduce per object
> >> ->commit()'s.  (Which could either be called by helpers, or called
> >> internally by driver or completely ignored by driver)
> >
> > Yeah, I think the approach in here with a few helpers to bend atomic
> > ->commit to the old hooks (somewhat-ish) is good. And with the crtc
> > helpers we should be able to move most drivers away from the old hooks
> > quickly. The exception will be pageflips/cursors, but that requires a lot
> > of driver-specific work, and probably first a full conversion to universal
> > planes (which atm don't support everything even for the primary plane due
> > to the arbitrary restriction to rgbx8888).
> 
> btw, I guess/hope most SoC drivers (ie. the ones that want to do crazy
> things with overlays) would be moving rapidly away from using primary
> plane helpers.  I think native primary planes fits better most of the
> non-trivial mobile display controller blocks..

Fully agreed. See my comment about the VBLANK_SYNCED flag, I think without
a driver implementation we should simply aggressively reject such updates.
Same with disabling the primary plane, at least until we've reworked the
interfaces a bit.

> >> I've been a bit reluctant so far to do too much additional refactoring
> >> on top of atomic, since I'm about at the limit of what I have time to
> >> repeatedly rebase each kernel version.  This is why I'm a bit anxious
> >> to start merging some of atomic, even if it doesn't do absolutely
> >> everything yet.
> >
> > I understand that, which is why I suggested a bunch of things to split out
> > already so we can get them in.
> >
> > On top of that I think with the split-up mode_config.mutex like I've just
> > proposed in an RFC we have a clear path for the locking issues, too. So
> > could go ahead an merge the w/w conversion, too.
> 
> I suppose now that I split modeset acquire ctx out into it's own thing
> (originally it was inline w/ drm_atomic_state), I could reshuffle the
> patches and move ww_mutex stuff below atomic.. will take a bit of
> patch surgery since some parts of that patch would have to move to
> later patches, but I can sort it out

below = earlier in the series or later? I'll assume earlier since that
what git log shows ;-)

As mentioned on irc I think we could also throw per-plane locks into the
mix to really validate the locking side of this. One thing we didn't
discuss that much in the last few emails is accessing the obj->state data
and other stuff which is autodetect (e.g. whether the sink is audio
capable or not). I still think we need one additional plain mutex to
protect that. It would sit between the mode_config.mutex and all the ww
mutexes in the locking hierarchy.

But we can look at that once the mode_config.mutex mess is untangled and
clear.

> > That leaves the set_prop refactoring which is still under discussion.
> > Those three pieces hopefully help a lot with reducing rebasing pain.
> >
> > On top of that I think we should look at cutting away functional pieces of
> > the conversion, e.g. ignore planes at first and only look at atomic
> > modeset. Or ignore atomic modesets and only look at plane updates,
> > rejecting anything that changes connectors or crtcs. By cutting out slices
> > we should be able to get patches into shape step-by-step.
> 
> hmm, well, not sure how much value it is splitting up like that.  I
> suppose the only-plane approach would sidestep/punt some issues.  But
> once locking and set_prop are in place / agreed, it isn't that much
> more from a churn perspective.

Hm, could be that it's not worth it, was just an idea.

> tbh, I think the worse problems are when we actually expose atomic
> ioctl to userspace.  Right now we are rather restricted in the updates
> triggered via atomic in that there are fixed well defined entry points
> to atomic.  Ie. only pageflip() sets _NONBLOCK flag, and it does not
> set mode property, so you won't ever see a _NONBLOCK modeset.

I guess we could lift limits step-by-step, e.g. reject any NONBLOCK
updates if they change crtc/connector properties. And reject any NONBLOCK
which isn't also vsynced.

Wrt earlier testing fbcon should give us basic in-kernel validation, since
it really wants to kill all the planes and everything and put a new config
into place in one go. Another good one would be suspend/resume. It needs
vt-switchless resume enabled (which is a per-driver knob and only i915 has
it for now, but it's easy to do). Then we could suspend/resume a full X
session with overlays, cursors and all and see whether the atomic engine
can cope and restore it all.

Once that's proven the ioctl shouldn't be that bad really any more. For
i915 I'll demand lots of tests for corner-cases, but the easiest to
implement workload might be the vt restore in the ddx. If we can do that
with atomic you can fire up a bunch of ridiculous (and conflicting) X
configs with cursors and overlays and the vt-switch between them like mad.
That should give us good testing I hope.
-Daniel
Ville Syrjälä May 28, 2014, 2:14 p.m. UTC | #12
On Wed, May 28, 2014 at 03:21:41PM +0200, Daniel Vetter wrote:
> On Tue, May 27, 2014 at 07:32:46PM -0400, Rob Clark wrote:
> > On Tue, May 27, 2014 at 6:09 PM, Daniel Vetter <daniel@ffwll.ch> wrote:
> > > On Tue, May 27, 2014 at 04:06:28PM -0400, Rob Clark wrote:
> > >> On Tue, May 27, 2014 at 3:26 PM, Daniel Vetter <daniel@ffwll.ch> wrote:
> > >
> > > [snip]
> > >
> > >> Well, there was the NONBLOCK atomic flag.. I'm not entirely sure if we
> > >> should hang so much off of that one flag.
> > >
> > > Yeah, a separate VBLANK_SYNCED might be useful. Apparently people also
> > > want non-blocking modesets.
> > 
> > random-diverging-from-original-topic-thought.. seems like userspace
> > just wants a deadline/timeout (hopefully in units of vblanks).. at
> > least to the level of "I want this to happen on the next vblank (after
> > rendering completes), or give me an error" vs "I want this to complete
> > atomically even if it takes a few extra vblanks for things to sort
> > out"..
> > 
> > I guess that amounts to what you mean by VBLANK_SYNCED flag, but
> > VBLANKED_SYNCED might not be a good name.. at least for some hw all
> > you can do is vblank sync'd..
> 
> Hm, we might as well go full monty and allow userspace to request a
> specific vblank counter. Would be useful for e.g. queuing up a bunch of
> video frames, which some hw can do. But that would then require cancelling
> of existing flips,

The hard part there would be rolling back the user visible state. But
if we were to disallow it for anything but simple page flips, we could
rather easily keep the fb pointers around in the structure that tracks
the progress of the operation.

Otherwise cancelling should be trivial when using mmio instead of the
cs.

But we could certainly do the target vblank thing without allowing
multiple queued flips initially, and without a cancel capability.

However using vblank counter is a bit problematic with multiple crtcs.
But I guess userspace can try to do the something_else->vbl conversion
itself if it wants to use some other units.

There's also the question whether we should allow a target vbl count for
each crtc, or just one of them.

We could make it so that you can specify the target vblank count only
for a single crtc, and the rest of the crtcs will flip soon before or
soon after. The reason for allowing the "soon before" case is because
it'll make the implementation much simpler. We just have to perform
all the register writes somewhere between 'target_vbl-1 - target_vbl'
of the specified crtc. The order in which the flips actually happen
then depends on the vblank period and phase of each crtc.

If user space wants target counts for all crtcs, it could issue separate
nuclear flips for each crtc, although that does raise the issue that we
can't check the entire target state, so we can't guarantee that all of
the nuclear flips will succeed. So that's a bit bad.

So I guess we could allow a target vbl count for all the crtcs, just
need to convey that information inside another array in the ioctl to
avoid imposing a limit in the number of crtcs. But then there's the
question what happens if only a subset of the involved crtcs have a
target count. Return an error? Or just pick one of the crtcs that
did get a target vblank and flip the rest at the same time? I guess
the latter option is better to allow one crtc to act as the master
clock for the whole thing, and the rest just hop along as best they
can.
Daniel Vetter May 28, 2014, 2:50 p.m. UTC | #13
On Wed, May 28, 2014 at 05:14:21PM +0300, Ville Syrjälä wrote:
> On Wed, May 28, 2014 at 03:21:41PM +0200, Daniel Vetter wrote:
> > On Tue, May 27, 2014 at 07:32:46PM -0400, Rob Clark wrote:
> > > On Tue, May 27, 2014 at 6:09 PM, Daniel Vetter <daniel@ffwll.ch> wrote:
> > > > On Tue, May 27, 2014 at 04:06:28PM -0400, Rob Clark wrote:
> > > >> On Tue, May 27, 2014 at 3:26 PM, Daniel Vetter <daniel@ffwll.ch> wrote:
> > > >
> > > > [snip]
> > > >
> > > >> Well, there was the NONBLOCK atomic flag.. I'm not entirely sure if we
> > > >> should hang so much off of that one flag.
> > > >
> > > > Yeah, a separate VBLANK_SYNCED might be useful. Apparently people also
> > > > want non-blocking modesets.
> > > 
> > > random-diverging-from-original-topic-thought.. seems like userspace
> > > just wants a deadline/timeout (hopefully in units of vblanks).. at
> > > least to the level of "I want this to happen on the next vblank (after
> > > rendering completes), or give me an error" vs "I want this to complete
> > > atomically even if it takes a few extra vblanks for things to sort
> > > out"..
> > > 
> > > I guess that amounts to what you mean by VBLANK_SYNCED flag, but
> > > VBLANKED_SYNCED might not be a good name.. at least for some hw all
> > > you can do is vblank sync'd..
> > 
> > Hm, we might as well go full monty and allow userspace to request a
> > specific vblank counter. Would be useful for e.g. queuing up a bunch of
> > video frames, which some hw can do. But that would then require cancelling
> > of existing flips,
> 
> The hard part there would be rolling back the user visible state. But
> if we were to disallow it for anything but simple page flips, we could
> rather easily keep the fb pointers around in the structure that tracks
> the progress of the operation.
> 
> Otherwise cancelling should be trivial when using mmio instead of the
> cs.
> 
> But we could certainly do the target vblank thing without allowing
> multiple queued flips initially, and without a cancel capability.
> 
> However using vblank counter is a bit problematic with multiple crtcs.
> But I guess userspace can try to do the something_else->vbl conversion
> itself if it wants to use some other units.
> 
> There's also the question whether we should allow a target vbl count for
> each crtc, or just one of them.
> 
> We could make it so that you can specify the target vblank count only
> for a single crtc, and the rest of the crtcs will flip soon before or
> soon after. The reason for allowing the "soon before" case is because
> it'll make the implementation much simpler. We just have to perform
> all the register writes somewhere between 'target_vbl-1 - target_vbl'
> of the specified crtc. The order in which the flips actually happen
> then depends on the vblank period and phase of each crtc.
> 
> If user space wants target counts for all crtcs, it could issue separate
> nuclear flips for each crtc, although that does raise the issue that we
> can't check the entire target state, so we can't guarantee that all of
> the nuclear flips will succeed. So that's a bit bad.
> 
> So I guess we could allow a target vbl count for all the crtcs, just
> need to convey that information inside another array in the ioctl to
> avoid imposing a limit in the number of crtcs. But then there's the
> question what happens if only a subset of the involved crtcs have a
> target count. Return an error? Or just pick one of the crtcs that
> did get a target vblank and flip the rest at the same time? I guess
> the latter option is better to allow one crtc to act as the master
> clock for the whole thing, and the rest just hop along as best they
> can.

I guess if userspace asks for a target count on more than one crtc we'd
reject the request. That leaves a bit a window unfortunately where
userspace partially submitted a request and then the kernel starts to tell
it that it's too late. But then I don't think we can really avoid this
situation without going completely nuts on the implementation side.

So a best effort (maybe with the guarantee that if we're too late it wont
happen so that userspace could try to render the next frame and display
that for smoothness) approach should be good enough here.

Anyway, let's not get too distracted with fancy ideas before we have the
basics ;-)

Cheers, Daniel
Rob Clark May 28, 2014, 3:19 p.m. UTC | #14
On Wed, May 28, 2014 at 9:21 AM, Daniel Vetter <daniel@ffwll.ch> wrote:
> On Tue, May 27, 2014 at 07:32:46PM -0400, Rob Clark wrote:
>> On Tue, May 27, 2014 at 6:09 PM, Daniel Vetter <daniel@ffwll.ch> wrote:
>> > On Tue, May 27, 2014 at 04:06:28PM -0400, Rob Clark wrote:
>> >> On Tue, May 27, 2014 at 3:26 PM, Daniel Vetter <daniel@ffwll.ch> wrote:
>> >
>> > [snip]
>> >
>> >> Well, there was the NONBLOCK atomic flag.. I'm not entirely sure if we
>> >> should hang so much off of that one flag.
>> >
>> > Yeah, a separate VBLANK_SYNCED might be useful. Apparently people also
>> > want non-blocking modesets.
>>
>> random-diverging-from-original-topic-thought.. seems like userspace
>> just wants a deadline/timeout (hopefully in units of vblanks).. at
>> least to the level of "I want this to happen on the next vblank (after
>> rendering completes), or give me an error" vs "I want this to complete
>> atomically even if it takes a few extra vblanks for things to sort
>> out"..
>>
>> I guess that amounts to what you mean by VBLANK_SYNCED flag, but
>> VBLANKED_SYNCED might not be a good name.. at least for some hw all
>> you can do is vblank sync'd..
>
> Hm, we might as well go full monty and allow userspace to request a
> specific vblank counter. Would be useful for e.g. queuing up a bunch of
> video frames, which some hw can do. But that would then require cancelling
> of existing flips, so I guess for now a simple VBLANK_SYNCED flag which
> emulates pageflip behaviour should be good enough.

The full on vblank counter and queue stuff up could be interesting..
not entirely sure how we'd manage ->atomic_check() against a not yet
applied base state.  I suppose it is just a matter of copying the
previous-state values for new state objects from the not-yet applied
state, rather than {plane,crtc,etc}->state..

And since state objects are refcnt'd I guess forming a linked list out
of 'em would not be hard.

Probably we should not allow queuing in the beginning.  But does kinda
seem like some magic is possible to handle this.

> That would also be useful if userspace attempts an atomic update on
> drivers which only support atomic modeset (through the crtc helpers), but
> can't do vblank synced updates. Then they could easily reject those. After
> all current drivers also often lack a pageflip implementation ...

yeah.. but I guess this mainly the server chips and old stuff?  I
wonder how many drivers support multiple crtcs but not pageflip?

BR,
-R

> -Daniel
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> +41 (0) 79 365 57 48 - http://blog.ffwll.ch
diff mbox

Patch

diff --git a/drivers/gpu/drm/msm/Makefile b/drivers/gpu/drm/msm/Makefile
index 5e1e6b0..dd12f56 100644
--- a/drivers/gpu/drm/msm/Makefile
+++ b/drivers/gpu/drm/msm/Makefile
@@ -27,6 +27,7 @@  msm-y := \
 	mdp/mdp5/mdp5_kms.o \
 	mdp/mdp5/mdp5_plane.o \
 	mdp/mdp5/mdp5_smp.o \
+	msm_atomic.o \
 	msm_drv.o \
 	msm_fb.o \
 	msm_gem.o \
diff --git a/drivers/gpu/drm/msm/mdp/mdp4/mdp4_crtc.c b/drivers/gpu/drm/msm/mdp/mdp4/mdp4_crtc.c
index d0d8befd..2fa6b75 100644
--- a/drivers/gpu/drm/msm/mdp/mdp4/mdp4_crtc.c
+++ b/drivers/gpu/drm/msm/mdp/mdp4/mdp4_crtc.c
@@ -52,7 +52,6 @@  struct mdp4_crtc {
 
 	/* if there is a pending flip, these will be non-null: */
 	struct drm_pending_vblank_event *event;
-	struct msm_fence_cb pageflip_cb;
 
 #define PENDING_CURSOR 0x1
 #define PENDING_FLIP   0x2
@@ -93,12 +92,16 @@  static void request_pending(struct drm_crtc *crtc, uint32_t pending)
 	mdp_irq_register(&get_kms(crtc)->base, &mdp4_crtc->vblank);
 }
 
-static void crtc_flush(struct drm_crtc *crtc)
+void mdp4_crtc_flush(struct drm_crtc *crtc)
 {
+	struct msm_drm_private *priv = crtc->dev->dev_private;
 	struct mdp4_crtc *mdp4_crtc = to_mdp4_crtc(crtc);
 	struct mdp4_kms *mdp4_kms = get_kms(crtc);
 	uint32_t i, flush = 0;
 
+	if (priv->pending_crtcs & (1 << crtc->id))
+		return;
+
 	for (i = 0; i < ARRAY_SIZE(mdp4_crtc->planes); i++) {
 		struct drm_plane *plane = mdp4_crtc->planes[i];
 		if (plane) {
@@ -142,7 +145,7 @@  static void update_scanout(struct drm_crtc *crtc, struct drm_framebuffer *fb)
 	 * so that we can safely queue unref to current fb (ie. next
 	 * vblank we know hw is done w/ previous scanout_fb).
 	 */
-	crtc_flush(crtc);
+	mdp4_crtc_flush(crtc);
 
 	if (mdp4_crtc->scanout_fb)
 		drm_flip_work_queue(&mdp4_crtc->unref_fb_work,
@@ -177,21 +180,6 @@  static void complete_flip(struct drm_crtc *crtc, struct drm_file *file)
 	spin_unlock_irqrestore(&dev->event_lock, flags);
 }
 
-static void pageflip_cb(struct msm_fence_cb *cb)
-{
-	struct mdp4_crtc *mdp4_crtc =
-		container_of(cb, struct mdp4_crtc, pageflip_cb);
-	struct drm_crtc *crtc = &mdp4_crtc->base;
-	struct drm_framebuffer *fb = crtc->primary->fb;
-
-	if (!fb)
-		return;
-
-	drm_framebuffer_reference(fb);
-	mdp4_plane_set_scanout(mdp4_crtc->plane, fb);
-	update_scanout(crtc, fb);
-}
-
 static void unref_fb_worker(struct drm_flip_work *work, void *val)
 {
 	struct mdp4_crtc *mdp4_crtc =
@@ -406,7 +394,7 @@  static void mdp4_crtc_prepare(struct drm_crtc *crtc)
 static void mdp4_crtc_commit(struct drm_crtc *crtc)
 {
 	mdp4_crtc_dpms(crtc, DRM_MODE_DPMS_ON);
-	crtc_flush(crtc);
+	mdp4_crtc_flush(crtc);
 	/* drop the ref to mdp clk's that we got in prepare: */
 	mdp4_disable(get_kms(crtc));
 }
@@ -448,23 +436,27 @@  static int mdp4_crtc_page_flip(struct drm_crtc *crtc,
 {
 	struct mdp4_crtc *mdp4_crtc = to_mdp4_crtc(crtc);
 	struct drm_device *dev = crtc->dev;
-	struct drm_gem_object *obj;
 	unsigned long flags;
 
+	spin_lock_irqsave(&dev->event_lock, flags);
 	if (mdp4_crtc->event) {
 		dev_err(dev->dev, "already pending flip!\n");
+		spin_unlock_irqrestore(&dev->event_lock, flags);
 		return -EBUSY;
 	}
 
-	obj = msm_framebuffer_bo(new_fb, 0);
-
-	spin_lock_irqsave(&dev->event_lock, flags);
 	mdp4_crtc->event = event;
 	spin_unlock_irqrestore(&dev->event_lock, flags);
 
 	update_fb(crtc, new_fb);
 
-	return msm_gem_queue_inactive_cb(obj, &mdp4_crtc->pageflip_cb);
+
+	/* grab extra ref for update_scanout() */
+	drm_framebuffer_reference(new_fb);
+	mdp4_plane_set_scanout(mdp4_crtc->plane, new_fb);
+	update_scanout(crtc, new_fb);
+
+	return 0;
 }
 
 static int mdp4_crtc_set_property(struct drm_crtc *crtc,
@@ -598,7 +590,7 @@  static int mdp4_crtc_cursor_move(struct drm_crtc *crtc, int x, int y)
 	mdp4_crtc->cursor.y = y;
 	spin_unlock_irqrestore(&mdp4_crtc->cursor.lock, flags);
 
-	crtc_flush(crtc);
+	mdp4_crtc_flush(crtc);
 	request_pending(crtc, PENDING_CURSOR);
 
 	return 0;
@@ -635,8 +627,13 @@  static void mdp4_crtc_vblank_irq(struct mdp_irq *irq, uint32_t irqstatus)
 	pending = atomic_xchg(&mdp4_crtc->pending, 0);
 
 	if (pending & PENDING_FLIP) {
-		complete_flip(crtc, NULL);
-		drm_flip_work_commit(&mdp4_crtc->unref_fb_work, priv->wq);
+		if (priv->pending_crtcs & (1 << crtc->id)) {
+			/* our update hasn't been flushed yet: */
+			request_pending(crtc, PENDING_FLIP);
+		} else {
+			complete_flip(crtc, NULL);
+			drm_flip_work_commit(&mdp4_crtc->unref_fb_work, priv->wq);
+		}
 	}
 
 	if (pending & PENDING_CURSOR) {
@@ -650,7 +647,7 @@  static void mdp4_crtc_err_irq(struct mdp_irq *irq, uint32_t irqstatus)
 	struct mdp4_crtc *mdp4_crtc = container_of(irq, struct mdp4_crtc, err);
 	struct drm_crtc *crtc = &mdp4_crtc->base;
 	DBG("%s: error: %08x", mdp4_crtc->name, irqstatus);
-	crtc_flush(crtc);
+	mdp4_crtc_flush(crtc);
 }
 
 uint32_t mdp4_crtc_vblank(struct drm_crtc *crtc)
@@ -730,7 +727,7 @@  static void set_attach(struct drm_crtc *crtc, enum mdp4_pipe pipe_id,
 	mdp4_crtc->planes[pipe_id] = plane;
 	blend_setup(crtc);
 	if (mdp4_crtc->enabled && (plane != mdp4_crtc->plane))
-		crtc_flush(crtc);
+		mdp4_crtc_flush(crtc);
 }
 
 void mdp4_crtc_attach(struct drm_crtc *crtc, struct drm_plane *plane)
@@ -792,8 +789,6 @@  struct drm_crtc *mdp4_crtc_init(struct drm_device *dev,
 	ret = drm_flip_work_init(&mdp4_crtc->unref_cursor_work, 64,
 			"unref cursor", unref_cursor_worker);
 
-	INIT_FENCE_CB(&mdp4_crtc->pageflip_cb, pageflip_cb);
-
 	drm_crtc_init_with_planes(dev, crtc, plane, NULL, &mdp4_crtc_funcs);
 	drm_crtc_helper_add(crtc, &mdp4_crtc_helper_funcs);
 
diff --git a/drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.c b/drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.c
index 0bb4faa..af0bfb1 100644
--- a/drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.c
+++ b/drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.c
@@ -131,6 +131,11 @@  static long mdp4_round_pixclk(struct msm_kms *kms, unsigned long rate,
 	return mdp4_dtv_round_pixclk(encoder, rate);
 }
 
+static void mdp4_flush(struct msm_kms *kms, struct drm_crtc *crtc)
+{
+	mdp4_crtc_flush(crtc);
+}
+
 static void mdp4_preclose(struct msm_kms *kms, struct drm_file *file)
 {
 	struct mdp4_kms *mdp4_kms = to_mdp4_kms(to_mdp_kms(kms));
@@ -162,6 +167,7 @@  static const struct mdp_kms_funcs kms_funcs = {
 		.disable_vblank  = mdp4_disable_vblank,
 		.get_format      = mdp_get_format,
 		.round_pixclk    = mdp4_round_pixclk,
+		.flush           = mdp4_flush,
 		.preclose        = mdp4_preclose,
 		.destroy         = mdp4_destroy,
 	},
diff --git a/drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.h b/drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.h
index 715520c5..834454c 100644
--- a/drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.h
+++ b/drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.h
@@ -184,6 +184,7 @@  enum mdp4_pipe mdp4_plane_pipe(struct drm_plane *plane);
 struct drm_plane *mdp4_plane_init(struct drm_device *dev,
 		enum mdp4_pipe pipe_id, bool private_plane);
 
+void mdp4_crtc_flush(struct drm_crtc *crtc);
 uint32_t mdp4_crtc_vblank(struct drm_crtc *crtc);
 void mdp4_crtc_cancel_pending_flip(struct drm_crtc *crtc, struct drm_file *file);
 void mdp4_crtc_set_config(struct drm_crtc *crtc, uint32_t config);
diff --git a/drivers/gpu/drm/msm/mdp/mdp4/mdp4_plane.c b/drivers/gpu/drm/msm/mdp/mdp4/mdp4_plane.c
index 4c92985..f35848d 100644
--- a/drivers/gpu/drm/msm/mdp/mdp4/mdp4_plane.c
+++ b/drivers/gpu/drm/msm/mdp/mdp4/mdp4_plane.c
@@ -48,11 +48,6 @@  static int mdp4_plane_update(struct drm_plane *plane,
 
 	mdp4_plane->enabled = true;
 
-	if (plane->fb)
-		drm_framebuffer_unreference(plane->fb);
-
-	drm_framebuffer_reference(fb);
-
 	return mdp4_plane_mode_set(plane, crtc, fb,
 			crtc_x, crtc_y, crtc_w, crtc_h,
 			src_x, src_y, src_w, src_h);
diff --git a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_crtc.c b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_crtc.c
index 7f4ee99..0e74317 100644
--- a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_crtc.c
+++ b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_crtc.c
@@ -35,7 +35,6 @@  struct mdp5_crtc {
 
 	/* if there is a pending flip, these will be non-null: */
 	struct drm_pending_vblank_event *event;
-	struct msm_fence_cb pageflip_cb;
 
 #define PENDING_CURSOR 0x1
 #define PENDING_FLIP   0x2
@@ -73,13 +72,17 @@  static void request_pending(struct drm_crtc *crtc, uint32_t pending)
 	mdp_irq_register(&get_kms(crtc)->base, &mdp5_crtc->vblank);
 }
 
-static void crtc_flush(struct drm_crtc *crtc)
+void mdp5_crtc_flush(struct drm_crtc *crtc)
 {
+	struct msm_drm_private *priv = crtc->dev->dev_private;
 	struct mdp5_crtc *mdp5_crtc = to_mdp5_crtc(crtc);
 	struct mdp5_kms *mdp5_kms = get_kms(crtc);
 	int id = mdp5_crtc->id;
 	uint32_t i, flush = 0;
 
+	if (priv->pending_crtcs & (1 << crtc->id))
+		return;
+
 	for (i = 0; i < ARRAY_SIZE(mdp5_crtc->planes); i++) {
 		struct drm_plane *plane = mdp5_crtc->planes[i];
 		if (plane) {
@@ -124,7 +127,7 @@  static void update_scanout(struct drm_crtc *crtc, struct drm_framebuffer *fb)
 	 * so that we can safely queue unref to current fb (ie. next
 	 * vblank we know hw is done w/ previous scanout_fb).
 	 */
-	crtc_flush(crtc);
+	mdp5_crtc_flush(crtc);
 
 	if (mdp5_crtc->scanout_fb)
 		drm_flip_work_queue(&mdp5_crtc->unref_fb_work,
@@ -165,21 +168,6 @@  static void complete_flip(struct drm_crtc *crtc, struct drm_file *file)
 	}
 }
 
-static void pageflip_cb(struct msm_fence_cb *cb)
-{
-	struct mdp5_crtc *mdp5_crtc =
-		container_of(cb, struct mdp5_crtc, pageflip_cb);
-	struct drm_crtc *crtc = &mdp5_crtc->base;
-	struct drm_framebuffer *fb = mdp5_crtc->fb;
-
-	if (!fb)
-		return;
-
-	drm_framebuffer_reference(fb);
-	mdp5_plane_set_scanout(mdp5_crtc->plane, fb);
-	update_scanout(crtc, fb);
-}
-
 static void unref_fb_worker(struct drm_flip_work *work, void *val)
 {
 	struct mdp5_crtc *mdp5_crtc =
@@ -324,7 +312,7 @@  static void mdp5_crtc_prepare(struct drm_crtc *crtc)
 static void mdp5_crtc_commit(struct drm_crtc *crtc)
 {
 	mdp5_crtc_dpms(crtc, DRM_MODE_DPMS_ON);
-	crtc_flush(crtc);
+	mdp5_crtc_flush(crtc);
 	/* drop the ref to mdp clk's that we got in prepare: */
 	mdp5_disable(get_kms(crtc));
 }
@@ -366,23 +354,26 @@  static int mdp5_crtc_page_flip(struct drm_crtc *crtc,
 {
 	struct mdp5_crtc *mdp5_crtc = to_mdp5_crtc(crtc);
 	struct drm_device *dev = crtc->dev;
-	struct drm_gem_object *obj;
 	unsigned long flags;
 
+	spin_lock_irqsave(&dev->event_lock, flags);
 	if (mdp5_crtc->event) {
 		dev_err(dev->dev, "already pending flip!\n");
+		spin_unlock_irqrestore(&dev->event_lock, flags);
 		return -EBUSY;
 	}
 
-	obj = msm_framebuffer_bo(new_fb, 0);
-
-	spin_lock_irqsave(&dev->event_lock, flags);
 	mdp5_crtc->event = event;
 	spin_unlock_irqrestore(&dev->event_lock, flags);
 
 	update_fb(crtc, new_fb);
 
-	return msm_gem_queue_inactive_cb(obj, &mdp5_crtc->pageflip_cb);
+	/* grab extra ref for update_scanout() */
+	drm_framebuffer_reference(new_fb);
+	mdp5_plane_set_scanout(mdp5_crtc->plane, new_fb);
+	update_scanout(crtc, new_fb);
+
+	return 0;
 }
 
 static int mdp5_crtc_set_property(struct drm_crtc *crtc,
@@ -424,8 +415,13 @@  static void mdp5_crtc_vblank_irq(struct mdp_irq *irq, uint32_t irqstatus)
 	pending = atomic_xchg(&mdp5_crtc->pending, 0);
 
 	if (pending & PENDING_FLIP) {
-		complete_flip(crtc, NULL);
-		drm_flip_work_commit(&mdp5_crtc->unref_fb_work, priv->wq);
+		if (priv->pending_crtcs & (1 << crtc->id)) {
+			/* our update hasn't been flushed yet: */
+			request_pending(crtc, PENDING_FLIP);
+		} else {
+			complete_flip(crtc, NULL);
+			drm_flip_work_commit(&mdp5_crtc->unref_fb_work, priv->wq);
+		}
 	}
 }
 
@@ -434,7 +430,7 @@  static void mdp5_crtc_err_irq(struct mdp_irq *irq, uint32_t irqstatus)
 	struct mdp5_crtc *mdp5_crtc = container_of(irq, struct mdp5_crtc, err);
 	struct drm_crtc *crtc = &mdp5_crtc->base;
 	DBG("%s: error: %08x", mdp5_crtc->name, irqstatus);
-	crtc_flush(crtc);
+	mdp5_crtc_flush(crtc);
 }
 
 uint32_t mdp5_crtc_vblank(struct drm_crtc *crtc)
@@ -501,7 +497,7 @@  void mdp5_crtc_set_intf(struct drm_crtc *crtc, int intf,
 			MDP5_CTL_OP_MODE(MODE_NONE) |
 			MDP5_CTL_OP_INTF_NUM(intfnum[intf]));
 
-	crtc_flush(crtc);
+	mdp5_crtc_flush(crtc);
 }
 
 static void set_attach(struct drm_crtc *crtc, enum mdp5_pipe pipe_id,
@@ -517,7 +513,7 @@  static void set_attach(struct drm_crtc *crtc, enum mdp5_pipe pipe_id,
 	mdp5_crtc->planes[pipe_id] = plane;
 	blend_setup(crtc);
 	if (mdp5_crtc->enabled && (plane != mdp5_crtc->plane))
-		crtc_flush(crtc);
+		mdp5_crtc_flush(crtc);
 }
 
 void mdp5_crtc_attach(struct drm_crtc *crtc, struct drm_plane *plane)
@@ -563,8 +559,6 @@  struct drm_crtc *mdp5_crtc_init(struct drm_device *dev,
 	if (ret)
 		goto fail;
 
-	INIT_FENCE_CB(&mdp5_crtc->pageflip_cb, pageflip_cb);
-
 	drm_crtc_init_with_planes(dev, crtc, plane, NULL, &mdp5_crtc_funcs);
 	drm_crtc_helper_add(crtc, &mdp5_crtc_helper_funcs);
 
diff --git a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.c b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.c
index ee8446c..01c3d70 100644
--- a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.c
+++ b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.c
@@ -91,6 +91,11 @@  static long mdp5_round_pixclk(struct msm_kms *kms, unsigned long rate,
 	return rate;
 }
 
+static void mdp5_flush(struct msm_kms *kms, struct drm_crtc *crtc)
+{
+	mdp5_crtc_flush(crtc);
+}
+
 static void mdp5_preclose(struct msm_kms *kms, struct drm_file *file)
 {
 	struct mdp5_kms *mdp5_kms = to_mdp5_kms(to_mdp_kms(kms));
@@ -118,6 +123,7 @@  static const struct mdp_kms_funcs kms_funcs = {
 		.disable_vblank  = mdp5_disable_vblank,
 		.get_format      = mdp_get_format,
 		.round_pixclk    = mdp5_round_pixclk,
+		.flush           = mdp5_flush,
 		.preclose        = mdp5_preclose,
 		.destroy         = mdp5_destroy,
 	},
diff --git a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.h b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.h
index c8b1a25..18b031b 100644
--- a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.h
+++ b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.h
@@ -197,8 +197,8 @@  enum mdp5_pipe mdp5_plane_pipe(struct drm_plane *plane);
 struct drm_plane *mdp5_plane_init(struct drm_device *dev,
 		enum mdp5_pipe pipe, bool private_plane);
 
+void mdp5_crtc_flush(struct drm_crtc *crtc);
 uint32_t mdp5_crtc_vblank(struct drm_crtc *crtc);
-
 void mdp5_crtc_cancel_pending_flip(struct drm_crtc *crtc, struct drm_file *file);
 void mdp5_crtc_set_intf(struct drm_crtc *crtc, int intf,
 		enum mdp5_intf intf_id);
diff --git a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_plane.c b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_plane.c
index 53cc8c6..f1bf3c2 100644
--- a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_plane.c
+++ b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_plane.c
@@ -48,11 +48,6 @@  static int mdp5_plane_update(struct drm_plane *plane,
 
 	mdp5_plane->enabled = true;
 
-	if (plane->fb)
-		drm_framebuffer_unreference(plane->fb);
-
-	drm_framebuffer_reference(fb);
-
 	return mdp5_plane_mode_set(plane, crtc, fb,
 			crtc_x, crtc_y, crtc_w, crtc_h,
 			src_x, src_y, src_w, src_h);
diff --git a/drivers/gpu/drm/msm/msm_atomic.c b/drivers/gpu/drm/msm/msm_atomic.c
new file mode 100644
index 0000000..b231377
--- /dev/null
+++ b/drivers/gpu/drm/msm/msm_atomic.c
@@ -0,0 +1,141 @@ 
+/*
+ * Copyright (C) 2013 Red Hat
+ * Author: Rob Clark <robdclark@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "msm_drv.h"
+#include "msm_kms.h"
+#include "msm_gem.h"
+
+struct msm_async_commit {
+	struct drm_atomic_state *state;
+	uint32_t fence;
+	struct msm_fence_cb fence_cb;
+};
+
+static void fence_cb(struct msm_fence_cb *cb);
+static int commit_sync(struct drm_device *dev, void *state, bool unlocked);
+
+static struct msm_async_commit *new_commit(struct drm_atomic_state *state)
+{
+	struct msm_async_commit *c = kzalloc(sizeof(*c), GFP_KERNEL);
+
+	if (!c)
+		return NULL;
+
+	drm_atomic_state_reference(state);
+	c->state = state;
+	INIT_FENCE_CB(&c->fence_cb, fence_cb);
+
+	return c;
+}
+static void free_commit(struct msm_async_commit *c)
+{
+	drm_atomic_state_unreference(c->state);
+	kfree(c);
+}
+
+static void fence_cb(struct msm_fence_cb *cb)
+{
+	struct msm_async_commit *c =
+			container_of(cb, struct msm_async_commit, fence_cb);
+	commit_sync(c->state->dev, c->state, true);
+	free_commit(c);
+}
+
+static void add_fb(struct msm_async_commit *c, struct drm_crtc *crtc,
+		struct drm_framebuffer *fb)
+{
+	struct drm_gem_object *obj = msm_framebuffer_bo(fb, 0);
+	c->fence = max(c->fence, msm_gem_fence(to_msm_bo(obj), MSM_PREP_READ));
+}
+
+static int wait_fb(struct drm_crtc *crtc, struct drm_framebuffer *fb)
+{
+	// XXX TODO wait..
+	return 0;
+}
+
+#define pending_fb(state) ((state) && (state)->fb && (state)->new_fb)
+
+static int commit_sync(struct drm_device *dev, void *state, bool unlocked)
+{
+	struct drm_atomic_state *a = state;
+	struct msm_drm_private *priv = dev->dev_private;
+	struct msm_kms *kms = priv->kms;
+	int ncrtcs = dev->mode_config.num_crtc;
+	uint32_t pending_crtcs = 0;
+	int i, ret;
+
+	for (i = 0; i < ncrtcs; i++)
+		if (a->crtcs[i])
+			pending_crtcs |= (1 << a->crtcs[i]->id);
+
+	mutex_lock(&dev->struct_mutex);
+	WARN_ON(priv->pending_crtcs & pending_crtcs);
+	priv->pending_crtcs |= pending_crtcs;
+	mutex_unlock(&dev->struct_mutex);
+
+	if (unlocked)
+		ret = drm_atomic_commit_unlocked(dev, state);
+	else
+		ret = drm_atomic_commit(dev, state);
+
+	mutex_lock(&dev->struct_mutex);
+	priv->pending_crtcs &= ~pending_crtcs;
+	mutex_unlock(&dev->struct_mutex);
+
+	if (ret)
+		return ret;
+
+	for (i = 0; i < ncrtcs; i++)
+		if (a->crtcs[i])
+			kms->funcs->flush(kms, a->crtcs[i]);
+
+	return 0;
+}
+
+int msm_atomic_commit(struct drm_device *dev, struct drm_atomic_state *state)
+{
+	struct drm_atomic_state *a = state;
+	int nplanes = dev->mode_config.num_total_plane;
+	int i;
+
+	if (a->flags & DRM_MODE_ATOMIC_NONBLOCK) {
+		/* non-block mode: defer commit until fb's are ready */
+		struct msm_async_commit *c = new_commit(state);
+
+		if (!c)
+			return -ENOMEM;
+
+		for (i = 0; i < nplanes; i++)
+			if (pending_fb(a->pstates[i]))
+				add_fb(c, a->pstates[i]->crtc, a->pstates[i]->fb);
+
+		return msm_queue_fence_cb(dev, &c->fence_cb, c->fence);
+	} else {
+		/* blocking mode: wait until fb's are ready */
+		int ret = 0;
+
+		for (i = 0; i < nplanes && !ret; i++)
+			if (pending_fb(a->pstates[i]))
+				ret = wait_fb(a->pstates[i]->crtc, a->pstates[i]->fb);
+
+		if (ret)
+			return ret;
+
+		return commit_sync(dev, state, false);
+	}
+}
diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c
index d24ca45..bcee218 100644
--- a/drivers/gpu/drm/msm/msm_drv.c
+++ b/drivers/gpu/drm/msm/msm_drv.c
@@ -600,6 +600,26 @@  int msm_wait_fence_interruptable(struct drm_device *dev, uint32_t fence,
 	return ret;
 }
 
+int msm_queue_fence_cb(struct drm_device *dev,
+		struct msm_fence_cb *cb, uint32_t fence)
+{
+	struct msm_drm_private *priv = dev->dev_private;
+	int ret = 0;
+
+	mutex_lock(&dev->struct_mutex);
+	if (!list_empty(&cb->work.entry)) {
+		ret = -EINVAL;
+	} else if (fence > priv->completed_fence) {
+		cb->fence = fence;
+		list_add_tail(&cb->work.entry, &priv->fence_cbs);
+	} else {
+		queue_work(priv->wq, &cb->work);
+	}
+	mutex_unlock(&dev->struct_mutex);
+
+	return ret;
+}
+
 /* called from workqueue */
 void msm_update_fence(struct drm_device *dev, uint32_t fence)
 {
@@ -815,7 +835,7 @@  static struct drm_driver msm_driver = {
 	.atomic_begin       = drm_atomic_begin,
 	.atomic_set_event   = drm_atomic_set_event,
 	.atomic_check       = drm_atomic_check,
-	.atomic_commit      = drm_atomic_commit,
+	.atomic_commit      = msm_atomic_commit,
 	.atomic_end         = drm_atomic_end,
 	.atomic_funcs       = &drm_atomic_funcs,
 #ifdef CONFIG_DEBUG_FS
diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h
index afc990e..5e1e7da 100644
--- a/drivers/gpu/drm/msm/msm_drv.h
+++ b/drivers/gpu/drm/msm/msm_drv.h
@@ -110,6 +110,9 @@  struct msm_drm_private {
 	unsigned int num_connectors;
 	struct drm_connector *connectors[8];
 
+	/* crtc's pending atomic update: */
+	uint32_t pending_crtcs;
+
 	/* VRAM carveout, used when no IOMMU: */
 	struct {
 		unsigned long size;
@@ -143,11 +146,15 @@  int msm_register_mmu(struct drm_device *dev, struct msm_mmu *mmu);
 
 int msm_wait_fence_interruptable(struct drm_device *dev, uint32_t fence,
 		struct timespec *timeout);
+int msm_queue_fence_cb(struct drm_device *dev,
+		struct msm_fence_cb *cb, uint32_t fence);
 void msm_update_fence(struct drm_device *dev, uint32_t fence);
 
 int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
 		struct drm_file *file);
 
+int msm_atomic_commit(struct drm_device *dev, struct drm_atomic_state *state);
+
 int msm_gem_mmap(struct file *filp, struct vm_area_struct *vma);
 int msm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
 uint64_t msm_gem_mmap_offset(struct drm_gem_object *obj);
diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c
index bb8026d..90cb138 100644
--- a/drivers/gpu/drm/msm/msm_gem.c
+++ b/drivers/gpu/drm/msm/msm_gem.c
@@ -392,23 +392,11 @@  void *msm_gem_vaddr(struct drm_gem_object *obj)
 int msm_gem_queue_inactive_cb(struct drm_gem_object *obj,
 		struct msm_fence_cb *cb)
 {
-	struct drm_device *dev = obj->dev;
-	struct msm_drm_private *priv = dev->dev_private;
 	struct msm_gem_object *msm_obj = to_msm_bo(obj);
-	int ret = 0;
+	uint32_t fence = msm_gem_fence(msm_obj,
+			MSM_PREP_READ | MSM_PREP_WRITE);
 
-	mutex_lock(&dev->struct_mutex);
-	if (!list_empty(&cb->work.entry)) {
-		ret = -EINVAL;
-	} else if (is_active(msm_obj)) {
-		cb->fence = max(msm_obj->read_fence, msm_obj->write_fence);
-		list_add_tail(&cb->work.entry, &priv->fence_cbs);
-	} else {
-		queue_work(priv->wq, &cb->work);
-	}
-	mutex_unlock(&dev->struct_mutex);
-
-	return ret;
+	return msm_queue_fence_cb(obj->dev, cb, fence);
 }
 
 void msm_gem_move_to_active(struct drm_gem_object *obj,
@@ -447,12 +435,8 @@  int msm_gem_cpu_prep(struct drm_gem_object *obj, uint32_t op,
 	int ret = 0;
 
 	if (is_active(msm_obj)) {
-		uint32_t fence = 0;
+		uint32_t fence = msm_gem_fence(msm_obj, op);
 
-		if (op & MSM_PREP_READ)
-			fence = msm_obj->write_fence;
-		if (op & MSM_PREP_WRITE)
-			fence = max(fence, msm_obj->read_fence);
 		if (op & MSM_PREP_NOSYNC)
 			timeout = NULL;
 
diff --git a/drivers/gpu/drm/msm/msm_gem.h b/drivers/gpu/drm/msm/msm_gem.h
index 3246bb4..41a60cb 100644
--- a/drivers/gpu/drm/msm/msm_gem.h
+++ b/drivers/gpu/drm/msm/msm_gem.h
@@ -70,6 +70,19 @@  static inline bool is_active(struct msm_gem_object *msm_obj)
 	return msm_obj->gpu != NULL;
 }
 
+static inline uint32_t msm_gem_fence(struct msm_gem_object *msm_obj,
+		uint32_t op)
+{
+	uint32_t fence = 0;
+
+	if (op & MSM_PREP_READ)
+		fence = msm_obj->write_fence;
+	if (op & MSM_PREP_WRITE)
+		fence = max(fence, msm_obj->read_fence);
+
+	return fence;
+}
+
 #define MAX_CMDS 4
 
 /* Created per submit-ioctl, to track bo's and cmdstream bufs, etc,
diff --git a/drivers/gpu/drm/msm/msm_kms.h b/drivers/gpu/drm/msm/msm_kms.h
index 0643774..91ddffc 100644
--- a/drivers/gpu/drm/msm/msm_kms.h
+++ b/drivers/gpu/drm/msm/msm_kms.h
@@ -42,6 +42,7 @@  struct msm_kms_funcs {
 	const struct msm_format *(*get_format)(struct msm_kms *kms, uint32_t format);
 	long (*round_pixclk)(struct msm_kms *kms, unsigned long rate,
 			struct drm_encoder *encoder);
+	void (*flush)(struct msm_kms *kms, struct drm_crtc *crtc);
 	/* cleanup: */
 	void (*preclose)(struct msm_kms *kms, struct drm_file *file);
 	void (*destroy)(struct msm_kms *kms);