diff mbox

[052/165] drm/radeon/cik: Add support for compute queues (v2)

Message ID 1372253045-17042-53-git-send-email-alexdeucher@gmail.com (mailing list archive)
State New, archived
Headers show

Commit Message

Alex Deucher June 26, 2013, 1:22 p.m. UTC
From: Alex Deucher <alexander.deucher@amd.com>

On CIK, the compute rings work slightly differently than
on previous ASICs; however, the basic concepts are the same.

The main differences:
- New MEC engines for compute queues
- Multiple queues per MEC:
  - CI/KB: 1 MEC, 4 pipes per MEC, 8 queues per pipe = 32 queues
  -    KV: 2 MEC, 4 pipes per MEC, 8 queues per pipe = 64 queues
- Queues can be allocated and scheduled by another queue
- New doorbell aperture lets you assign space in the aperture
  for the wptr, which allows userspace access to queues

v2: add wptr shadow, fix eop setup

Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/radeon/cik.c       |  528 +++++++++++++++++++++++++++++++++++-
 drivers/gpu/drm/radeon/cikd.h      |   62 +++++
 drivers/gpu/drm/radeon/radeon.h    |   19 ++
 drivers/gpu/drm/radeon/radeon_cs.c |    4 +-
 4 files changed, 601 insertions(+), 12 deletions(-)

Comments

Jerome Glisse June 26, 2013, 10:08 a.m. UTC | #1
On Wed, Jun 26, 2013 at 09:22:12AM -0400, alexdeucher@gmail.com wrote:
> From: Alex Deucher <alexander.deucher@amd.com>
> 
> On CIK, the compute rings work slightly differently than
> on previous asics, however the basic concepts are the same.
> 
> The main differences:
> - New MEC engines for compute queues
> - Multiple queues per MEC:
>   - CI/KB: 1 MEC, 4 pipes per MEC, 8 queues per pipe = 32 queues
>   -    KV: 2 MEC, 4 pipes per MEC, 8 queues per pipe = 64 queues
> - Queues can be allocated and scheduled by another queue
> - New doorbell aperture allows you to assign space in the aperture
>   for the wptr which allows for userspace access to queues
> 
> v2: add wptr shadow, fix eop setup
> 
> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

Minor nitpicks below; otherwise:

Reviewed-by: Jerome Glisse <jglisse@redhat.com>

> ---
>  drivers/gpu/drm/radeon/cik.c       |  528 +++++++++++++++++++++++++++++++++++-
>  drivers/gpu/drm/radeon/cikd.h      |   62 +++++
>  drivers/gpu/drm/radeon/radeon.h    |   19 ++
>  drivers/gpu/drm/radeon/radeon_cs.c |    4 +-
>  4 files changed, 601 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/gpu/drm/radeon/cik.c b/drivers/gpu/drm/radeon/cik.c
> index 5c28fa5..9d2d6bb 100644
> --- a/drivers/gpu/drm/radeon/cik.c
> +++ b/drivers/gpu/drm/radeon/cik.c
> @@ -1687,6 +1687,7 @@ int cik_ring_test(struct radeon_device *rdev, struct radeon_ring *ring)
>  	radeon_ring_write(ring, ((scratch - PACKET3_SET_UCONFIG_REG_START) >> 2));
>  	radeon_ring_write(ring, 0xDEADBEEF);
>  	radeon_ring_unlock_commit(rdev, ring);
> +
>  	for (i = 0; i < rdev->usec_timeout; i++) {
>  		tmp = RREG32(scratch);
>  		if (tmp == 0xDEADBEEF)
> @@ -2112,6 +2113,51 @@ static int cik_cp_gfx_resume(struct radeon_device *rdev)
>  	return 0;
>  }
>  
> +static u32 cik_compute_ring_get_rptr(struct radeon_device *rdev,
> +				     struct radeon_ring *ring)
> +{
> +	u32 rptr;
> +
> +
> +
> +	if (rdev->wb.enabled) {
> +		rptr = le32_to_cpu(rdev->wb.wb[ring->rptr_offs/4]);
> +	} else {
> +		cik_srbm_select(rdev, ring->me, ring->pipe, ring->queue, 0);
> +		rptr = RREG32(CP_HQD_PQ_RPTR);
> +		cik_srbm_select(rdev, 0, 0, 0, 0);
> +	}
> +	rptr = (rptr & ring->ptr_reg_mask) >> ring->ptr_reg_shift;
> +
> +	return rptr;
> +}
> +
> +static u32 cik_compute_ring_get_wptr(struct radeon_device *rdev,
> +				     struct radeon_ring *ring)
> +{
> +	u32 wptr;
> +
> +	if (rdev->wb.enabled) {
> +		wptr = le32_to_cpu(rdev->wb.wb[ring->wptr_offs/4]);
> +	} else {
> +		cik_srbm_select(rdev, ring->me, ring->pipe, ring->queue, 0);
> +		wptr = RREG32(CP_HQD_PQ_WPTR);
> +		cik_srbm_select(rdev, 0, 0, 0, 0);
> +	}
> +	wptr = (wptr & ring->ptr_reg_mask) >> ring->ptr_reg_shift;
> +
> +	return wptr;
> +}
> +
> +static void cik_compute_ring_set_wptr(struct radeon_device *rdev,
> +				      struct radeon_ring *ring)
> +{
> +	u32 wptr = (ring->wptr << ring->ptr_reg_shift) & ring->ptr_reg_mask;
> +
> +	rdev->wb.wb[ring->wptr_offs/4] = cpu_to_le32(wptr);
> +	WDOORBELL32(ring->doorbell_offset, wptr);
> +}
> +
>  /**
>   * cik_cp_compute_enable - enable/disable the compute CP MEs
>   *
> @@ -2176,7 +2222,8 @@ static int cik_cp_compute_load_microcode(struct radeon_device *rdev)
>   */
>  static int cik_cp_compute_start(struct radeon_device *rdev)
>  {
> -	//todo
> +	cik_cp_compute_enable(rdev, true);
> +
>  	return 0;
>  }
>  
> @@ -2190,10 +2237,171 @@ static int cik_cp_compute_start(struct radeon_device *rdev)
>   */
>  static void cik_cp_compute_fini(struct radeon_device *rdev)
>  {
> +	int i, idx, r;
> +
>  	cik_cp_compute_enable(rdev, false);
> -	//todo
> +
> +	for (i = 0; i < 2; i++) {
> +		if (i == 0)
> +			idx = CAYMAN_RING_TYPE_CP1_INDEX;
> +		else
> +			idx = CAYMAN_RING_TYPE_CP2_INDEX;
> +
> +		if (rdev->ring[idx].mqd_obj) {
> +			r = radeon_bo_reserve(rdev->ring[idx].mqd_obj, false);
> +			if (unlikely(r != 0))
> +				dev_warn(rdev->dev, "(%d) reserve MQD bo failed\n", r);
> +
> +			radeon_bo_unpin(rdev->ring[idx].mqd_obj);
> +			radeon_bo_unreserve(rdev->ring[idx].mqd_obj);
> +
> +			radeon_bo_unref(&rdev->ring[idx].mqd_obj);
> +			rdev->ring[idx].mqd_obj = NULL;
> +		}
> +	}
> +}
> +
> +static void cik_mec_fini(struct radeon_device *rdev)
> +{
> +	int r;
> +
> +	if (rdev->mec.hpd_eop_obj) {
> +		r = radeon_bo_reserve(rdev->mec.hpd_eop_obj, false);
> +		if (unlikely(r != 0))
> +			dev_warn(rdev->dev, "(%d) reserve HPD EOP bo failed\n", r);
> +		radeon_bo_unpin(rdev->mec.hpd_eop_obj);
> +		radeon_bo_unreserve(rdev->mec.hpd_eop_obj);
> +
> +		radeon_bo_unref(&rdev->mec.hpd_eop_obj);
> +		rdev->mec.hpd_eop_obj = NULL;
> +	}
> +}
> +
> +#define MEC_HPD_SIZE 2048
> +
> +static int cik_mec_init(struct radeon_device *rdev)
> +{
> +	int r;
> +	u32 *hpd;
> +
> +	/*
> +	 * KV:    2 MEC, 4 Pipes/MEC, 8 Queues/Pipe - 64 Queues total
> +	 * CI/KB: 1 MEC, 4 Pipes/MEC, 8 Queues/Pipe - 32 Queues total
> +	 */
> +	if (rdev->family == CHIP_KAVERI)
> +		rdev->mec.num_mec = 2;
> +	else
> +		rdev->mec.num_mec = 1;
> +	rdev->mec.num_pipe = 4;
> +	rdev->mec.num_queue = rdev->mec.num_mec * rdev->mec.num_pipe * 8;
> +
> +	if (rdev->mec.hpd_eop_obj == NULL) {
> +		r = radeon_bo_create(rdev,
> +				     rdev->mec.num_mec *rdev->mec.num_pipe * MEC_HPD_SIZE * 2,
> +				     PAGE_SIZE, true,
> +				     RADEON_GEM_DOMAIN_GTT, NULL,
> +				     &rdev->mec.hpd_eop_obj);
> +		if (r) {
> +			dev_warn(rdev->dev, "(%d) create HDP EOP bo failed\n", r);
> +			return r;
> +		}
> +	}
> +
> +	r = radeon_bo_reserve(rdev->mec.hpd_eop_obj, false);
> +	if (unlikely(r != 0)) {
> +		cik_mec_fini(rdev);
> +		return r;
> +	}
> +	r = radeon_bo_pin(rdev->mec.hpd_eop_obj, RADEON_GEM_DOMAIN_GTT,
> +			  &rdev->mec.hpd_eop_gpu_addr);
> +	if (r) {
> +		dev_warn(rdev->dev, "(%d) pin HDP EOP bo failed\n", r);
> +		cik_mec_fini(rdev);
> +		return r;
> +	}
> +	r = radeon_bo_kmap(rdev->mec.hpd_eop_obj, (void **)&hpd);
> +	if (r) {
> +		dev_warn(rdev->dev, "(%d) map HDP EOP bo failed\n", r);
> +		cik_mec_fini(rdev);
> +		return r;
> +	}
> +
> +	/* clear memory.  Not sure if this is required or not */
> +	memset(hpd, 0, rdev->mec.num_mec *rdev->mec.num_pipe * MEC_HPD_SIZE * 2);
> +
> +	radeon_bo_kunmap(rdev->mec.hpd_eop_obj);
> +	radeon_bo_unreserve(rdev->mec.hpd_eop_obj);
> +
> +	return 0;
>  }
>  
> +struct hqd_registers
> +{
> +	u32 cp_mqd_base_addr;
> +	u32 cp_mqd_base_addr_hi;
> +	u32 cp_hqd_active;
> +	u32 cp_hqd_vmid;
> +	u32 cp_hqd_persistent_state;
> +	u32 cp_hqd_pipe_priority;
> +	u32 cp_hqd_queue_priority;
> +	u32 cp_hqd_quantum;
> +	u32 cp_hqd_pq_base;
> +	u32 cp_hqd_pq_base_hi;
> +	u32 cp_hqd_pq_rptr;
> +	u32 cp_hqd_pq_rptr_report_addr;
> +	u32 cp_hqd_pq_rptr_report_addr_hi;
> +	u32 cp_hqd_pq_wptr_poll_addr;
> +	u32 cp_hqd_pq_wptr_poll_addr_hi;
> +	u32 cp_hqd_pq_doorbell_control;
> +	u32 cp_hqd_pq_wptr;
> +	u32 cp_hqd_pq_control;
> +	u32 cp_hqd_ib_base_addr;
> +	u32 cp_hqd_ib_base_addr_hi;
> +	u32 cp_hqd_ib_rptr;
> +	u32 cp_hqd_ib_control;
> +	u32 cp_hqd_iq_timer;
> +	u32 cp_hqd_iq_rptr;
> +	u32 cp_hqd_dequeue_request;
> +	u32 cp_hqd_dma_offload;
> +	u32 cp_hqd_sema_cmd;
> +	u32 cp_hqd_msg_type;
> +	u32 cp_hqd_atomic0_preop_lo;
> +	u32 cp_hqd_atomic0_preop_hi;
> +	u32 cp_hqd_atomic1_preop_lo;
> +	u32 cp_hqd_atomic1_preop_hi;
> +	u32 cp_hqd_hq_scheduler0;
> +	u32 cp_hqd_hq_scheduler1;
> +	u32 cp_mqd_control;
> +};
> +
> +struct bonaire_mqd
> +{
> +	u32 header;
> +	u32 dispatch_initiator;
> +	u32 dimensions[3];
> +	u32 start_idx[3];
> +	u32 num_threads[3];
> +	u32 pipeline_stat_enable;
> +	u32 perf_counter_enable;
> +	u32 pgm[2];
> +	u32 tba[2];
> +	u32 tma[2];
> +	u32 pgm_rsrc[2];
> +	u32 vmid;
> +	u32 resource_limits;
> +	u32 static_thread_mgmt01[2];
> +	u32 tmp_ring_size;
> +	u32 static_thread_mgmt23[2];
> +	u32 restart[3];
> +	u32 thread_trace_enable;
> +	u32 reserved1;
> +	u32 user_data[16];
> +	u32 vgtcs_invoke_count[2];
> +	struct hqd_registers queue_state;
> +	u32 dequeue_cntr;
> +	u32 interrupt_queue[64];
> +};
> +
>  /**
>   * cik_cp_compute_resume - setup the compute queue registers
>   *
> @@ -2205,24 +2413,247 @@ static void cik_cp_compute_fini(struct radeon_device *rdev)
>   */
>  static int cik_cp_compute_resume(struct radeon_device *rdev)
>  {
> -	int r;
> +	int r, i, idx;
> +	u32 tmp;
> +	bool use_doorbell = true;
> +	u64 hqd_gpu_addr;
> +	u64 mqd_gpu_addr;
> +	u64 eop_gpu_addr;
> +	u64 wb_gpu_addr;
> +	u32 *buf;
> +	struct bonaire_mqd *mqd;
>  
> -	//todo
>  	r = cik_cp_compute_start(rdev);
>  	if (r)
>  		return r;
> +
> +	/* fix up chicken bits */
> +	tmp = RREG32(CP_CPF_DEBUG);
> +	tmp |= (1 << 23);

What is this chicken bit? Is it tasty?

> +	WREG32(CP_CPF_DEBUG, tmp);
> +
> +	/* init the pipes */
> +	for (i = 0; i < (rdev->mec.num_pipe * rdev->mec.num_mec); i++) {
> +		int me = (i < 4) ? 1 : 2;
> +		int pipe = (i < 4) ? i : (i - 4);
> +
> +		eop_gpu_addr = rdev->mec.hpd_eop_gpu_addr + (i * MEC_HPD_SIZE * 2);
> +
> +		cik_srbm_select(rdev, me, pipe, 0, 0);
> +
> +		/* write the EOP addr */
> +		WREG32(CP_HPD_EOP_BASE_ADDR, eop_gpu_addr >> 8);
> +		WREG32(CP_HPD_EOP_BASE_ADDR_HI, upper_32_bits(eop_gpu_addr) >> 8);
> +
> +		/* set the VMID assigned */
> +		WREG32(CP_HPD_EOP_VMID, 0);
> +
> +		/* set the EOP size, register value is 2^(EOP_SIZE+1) dwords */
> +		tmp = RREG32(CP_HPD_EOP_CONTROL);
> +		tmp &= ~EOP_SIZE_MASK;
> +		tmp |= drm_order(MEC_HPD_SIZE / 8);
> +		WREG32(CP_HPD_EOP_CONTROL, tmp);
> +	}
> +	cik_srbm_select(rdev, 0, 0, 0, 0);
> +
> +	/* init the queues.  Just two for now. */
> +	for (i = 0; i < 2; i++) {
> +		if (i == 0)
> +			idx = CAYMAN_RING_TYPE_CP1_INDEX;
> +		else
> +			idx = CAYMAN_RING_TYPE_CP2_INDEX;
> +
> +		if (rdev->ring[idx].mqd_obj == NULL) {
> +			r = radeon_bo_create(rdev,
> +					     sizeof(struct bonaire_mqd),
> +					     PAGE_SIZE, true,
> +					     RADEON_GEM_DOMAIN_GTT, NULL,
> +					     &rdev->ring[idx].mqd_obj);
> +			if (r) {
> +				dev_warn(rdev->dev, "(%d) create MQD bo failed\n", r);
> +				return r;
> +			}
> +		}
> +
> +		r = radeon_bo_reserve(rdev->ring[idx].mqd_obj, false);
> +		if (unlikely(r != 0)) {
> +			cik_cp_compute_fini(rdev);
> +			return r;
> +		}
> +		r = radeon_bo_pin(rdev->ring[idx].mqd_obj, RADEON_GEM_DOMAIN_GTT,
> +				  &mqd_gpu_addr);
> +		if (r) {
> +			dev_warn(rdev->dev, "(%d) pin MQD bo failed\n", r);
> +			cik_cp_compute_fini(rdev);
> +			return r;
> +		}
> +		r = radeon_bo_kmap(rdev->ring[idx].mqd_obj, (void **)&buf);
> +		if (r) {
> +			dev_warn(rdev->dev, "(%d) map MQD bo failed\n", r);
> +			cik_cp_compute_fini(rdev);
> +			return r;
> +		}
> +
> +		/* doorbell offset */
> +		rdev->ring[idx].doorbell_offset =
> +			(rdev->ring[idx].doorbell_page_num * PAGE_SIZE) + 0;
> +
> +		/* init the mqd struct */
> +		memset(buf, 0, sizeof(struct bonaire_mqd));
> +
> +		mqd = (struct bonaire_mqd *)buf;
> +		mqd->header = 0xC0310800;
> +		mqd->static_thread_mgmt01[0] = 0xffffffff;
> +		mqd->static_thread_mgmt01[1] = 0xffffffff;
> +		mqd->static_thread_mgmt23[0] = 0xffffffff;
> +		mqd->static_thread_mgmt23[1] = 0xffffffff;
> +
> +		cik_srbm_select(rdev, rdev->ring[idx].me,
> +				rdev->ring[idx].pipe,
> +				rdev->ring[idx].queue, 0);
> +
> +		/* disable wptr polling */
> +		tmp = RREG32(CP_PQ_WPTR_POLL_CNTL);
> +		tmp &= ~WPTR_POLL_EN;
> +		WREG32(CP_PQ_WPTR_POLL_CNTL, tmp);
> +
> +		/* enable doorbell? */
> +		mqd->queue_state.cp_hqd_pq_doorbell_control =
> +			RREG32(CP_HQD_PQ_DOORBELL_CONTROL);
> +		if (use_doorbell)
> +			mqd->queue_state.cp_hqd_pq_doorbell_control |= DOORBELL_EN;
> +		else
> +			mqd->queue_state.cp_hqd_pq_doorbell_control &= ~DOORBELL_EN;
> +		WREG32(CP_HQD_PQ_DOORBELL_CONTROL,
> +		       mqd->queue_state.cp_hqd_pq_doorbell_control);
> +
> +		/* disable the queue if it's active */
> +		mqd->queue_state.cp_hqd_dequeue_request = 0;
> +		mqd->queue_state.cp_hqd_pq_rptr = 0;
> +		mqd->queue_state.cp_hqd_pq_wptr= 0;
> +		if (RREG32(CP_HQD_ACTIVE) & 1) {
> +			WREG32(CP_HQD_DEQUEUE_REQUEST, 1);
> +			for (i = 0; i < rdev->usec_timeout; i++) {
> +				if (!(RREG32(CP_HQD_ACTIVE) & 1))
> +					break;
> +				udelay(1);
> +			}
> +			WREG32(CP_HQD_DEQUEUE_REQUEST, mqd->queue_state.cp_hqd_dequeue_request);
> +			WREG32(CP_HQD_PQ_RPTR, mqd->queue_state.cp_hqd_pq_rptr);
> +			WREG32(CP_HQD_PQ_WPTR, mqd->queue_state.cp_hqd_pq_wptr);
> +		}
> +
> +		/* set the pointer to the MQD */
> +		mqd->queue_state.cp_mqd_base_addr = mqd_gpu_addr & 0xfffffffc;
> +		mqd->queue_state.cp_mqd_base_addr_hi = upper_32_bits(mqd_gpu_addr);
> +		WREG32(CP_MQD_BASE_ADDR, mqd->queue_state.cp_mqd_base_addr);
> +		WREG32(CP_MQD_BASE_ADDR_HI, mqd->queue_state.cp_mqd_base_addr_hi);
> +		/* set MQD vmid to 0 */
> +		mqd->queue_state.cp_mqd_control = RREG32(CP_MQD_CONTROL);
> +		mqd->queue_state.cp_mqd_control &= ~MQD_VMID_MASK;
> +		WREG32(CP_MQD_CONTROL, mqd->queue_state.cp_mqd_control);
> +
> +		/* set the pointer to the HQD, this is similar CP_RB0_BASE/_HI */
> +		hqd_gpu_addr = rdev->ring[idx].gpu_addr >> 8;
> +		mqd->queue_state.cp_hqd_pq_base = hqd_gpu_addr;
> +		mqd->queue_state.cp_hqd_pq_base_hi = upper_32_bits(hqd_gpu_addr);
> +		WREG32(CP_HQD_PQ_BASE, mqd->queue_state.cp_hqd_pq_base);
> +		WREG32(CP_HQD_PQ_BASE_HI, mqd->queue_state.cp_hqd_pq_base_hi);
> +
> +		/* set up the HQD, this is similar to CP_RB0_CNTL */
> +		mqd->queue_state.cp_hqd_pq_control = RREG32(CP_HQD_PQ_CONTROL);
> +		mqd->queue_state.cp_hqd_pq_control &=
> +			~(QUEUE_SIZE_MASK | RPTR_BLOCK_SIZE_MASK);
> +
> +		mqd->queue_state.cp_hqd_pq_control |=
> +			drm_order(rdev->ring[idx].ring_size / 8);
> +		mqd->queue_state.cp_hqd_pq_control |=
> +			(drm_order(RADEON_GPU_PAGE_SIZE/8) << 8);
> +#ifdef __BIG_ENDIAN
> +		mqd->queue_state.cp_hqd_pq_control |= BUF_SWAP_32BIT;
> +#endif
> +		mqd->queue_state.cp_hqd_pq_control &=
> +			~(UNORD_DISPATCH | ROQ_PQ_IB_FLIP | PQ_VOLATILE);
> +		mqd->queue_state.cp_hqd_pq_control |=
> +			PRIV_STATE | KMD_QUEUE; /* assuming kernel queue control */
> +		WREG32(CP_HQD_PQ_CONTROL, mqd->queue_state.cp_hqd_pq_control);
> +
> +		/* only used if CP_PQ_WPTR_POLL_CNTL.WPTR_POLL_EN=1 */
> +		if (i == 0)
> +			wb_gpu_addr = rdev->wb.gpu_addr + CIK_WB_CP1_WPTR_OFFSET;
> +		else
> +			wb_gpu_addr = rdev->wb.gpu_addr + CIK_WB_CP2_WPTR_OFFSET;
> +		mqd->queue_state.cp_hqd_pq_wptr_poll_addr = wb_gpu_addr & 0xfffffffc;
> +		mqd->queue_state.cp_hqd_pq_wptr_poll_addr_hi = upper_32_bits(wb_gpu_addr) & 0xffff;
> +		WREG32(CP_HQD_PQ_WPTR_POLL_ADDR, mqd->queue_state.cp_hqd_pq_wptr_poll_addr);
> +		WREG32(CP_HQD_PQ_WPTR_POLL_ADDR_HI,
> +		       mqd->queue_state.cp_hqd_pq_wptr_poll_addr_hi);
> +
> +		/* set the wb address wether it's enabled or not */
> +		if (i == 0)
> +			wb_gpu_addr = rdev->wb.gpu_addr + RADEON_WB_CP1_RPTR_OFFSET;
> +		else
> +			wb_gpu_addr = rdev->wb.gpu_addr + RADEON_WB_CP2_RPTR_OFFSET;
> +		mqd->queue_state.cp_hqd_pq_rptr_report_addr = wb_gpu_addr & 0xfffffffc;
> +		mqd->queue_state.cp_hqd_pq_rptr_report_addr_hi =
> +			upper_32_bits(wb_gpu_addr) & 0xffff;
> +		WREG32(CP_HQD_PQ_RPTR_REPORT_ADDR,
> +		       mqd->queue_state.cp_hqd_pq_rptr_report_addr);
> +		WREG32(CP_HQD_PQ_RPTR_REPORT_ADDR_HI,
> +		       mqd->queue_state.cp_hqd_pq_rptr_report_addr_hi);
> +
> +		/* enable the doorbell if requested */
> +		if (use_doorbell) {
> +			mqd->queue_state.cp_hqd_pq_doorbell_control =
> +				RREG32(CP_HQD_PQ_DOORBELL_CONTROL);
> +			mqd->queue_state.cp_hqd_pq_doorbell_control &= ~DOORBELL_OFFSET_MASK;
> +			mqd->queue_state.cp_hqd_pq_doorbell_control |=
> +				DOORBELL_OFFSET(rdev->ring[idx].doorbell_offset / 4);
> +			mqd->queue_state.cp_hqd_pq_doorbell_control |= DOORBELL_EN;
> +			mqd->queue_state.cp_hqd_pq_doorbell_control &=
> +				~(DOORBELL_SOURCE | DOORBELL_HIT);
> +
> +		} else {
> +			mqd->queue_state.cp_hqd_pq_doorbell_control = 0;
> +		}
> +		WREG32(CP_HQD_PQ_DOORBELL_CONTROL,
> +		       mqd->queue_state.cp_hqd_pq_doorbell_control);
> +
> +		/* read and write pointers, similar to CP_RB0_WPTR/_RPTR */
> +		rdev->ring[idx].wptr = 0;
> +		mqd->queue_state.cp_hqd_pq_wptr = rdev->ring[idx].wptr;
> +		WREG32(CP_HQD_PQ_WPTR, mqd->queue_state.cp_hqd_pq_wptr);
> +		rdev->ring[idx].rptr = RREG32(CP_HQD_PQ_RPTR);
> +		mqd->queue_state.cp_hqd_pq_rptr = rdev->ring[idx].rptr;
> +
> +		/* set the vmid for the queue */
> +		mqd->queue_state.cp_hqd_vmid = 0;
> +		WREG32(CP_HQD_VMID, mqd->queue_state.cp_hqd_vmid);
> +
> +		/* activate the queue */
> +		mqd->queue_state.cp_hqd_active = 1;
> +		WREG32(CP_HQD_ACTIVE, mqd->queue_state.cp_hqd_active);
> +
> +		cik_srbm_select(rdev, 0, 0, 0, 0);
> +
> +		radeon_bo_kunmap(rdev->ring[idx].mqd_obj);
> +		radeon_bo_unreserve(rdev->ring[idx].mqd_obj);
> +
> +		rdev->ring[idx].ready = true;
> +		r = radeon_ring_test(rdev, idx, &rdev->ring[idx]);
> +		if (r)
> +			rdev->ring[idx].ready = false;
> +	}
> +
>  	return 0;
>  }
>  
> -/* XXX temporary wrappers to handle both compute and gfx */
> -/* XXX */
>  static void cik_cp_enable(struct radeon_device *rdev, bool enable)
>  {
>  	cik_cp_gfx_enable(rdev, enable);
>  	cik_cp_compute_enable(rdev, enable);
>  }
>  
> -/* XXX */
>  static int cik_cp_load_microcode(struct radeon_device *rdev)
>  {
>  	int r;
> @@ -2237,14 +2668,12 @@ static int cik_cp_load_microcode(struct radeon_device *rdev)
>  	return 0;
>  }
>  
> -/* XXX */
>  static void cik_cp_fini(struct radeon_device *rdev)
>  {
>  	cik_cp_gfx_fini(rdev);
>  	cik_cp_compute_fini(rdev);
>  }
>  
> -/* XXX */
>  static int cik_cp_resume(struct radeon_device *rdev)
>  {
>  	int r;
> @@ -2865,6 +3294,22 @@ static void cik_print_gpu_status_regs(struct radeon_device *rdev)
>  		RREG32(SDMA0_STATUS_REG + SDMA0_REGISTER_OFFSET));
>  	dev_info(rdev->dev, "  SDMA1_STATUS_REG   = 0x%08X\n",
>  		 RREG32(SDMA0_STATUS_REG + SDMA1_REGISTER_OFFSET));
> +	dev_info(rdev->dev, "  CP_STAT = 0x%08x\n", RREG32(CP_STAT));
> +	dev_info(rdev->dev, "  CP_STALLED_STAT1 = 0x%08x\n",
> +		 RREG32(CP_STALLED_STAT1));
> +	dev_info(rdev->dev, "  CP_STALLED_STAT2 = 0x%08x\n",
> +		 RREG32(CP_STALLED_STAT2));
> +	dev_info(rdev->dev, "  CP_STALLED_STAT3 = 0x%08x\n",
> +		 RREG32(CP_STALLED_STAT3));
> +	dev_info(rdev->dev, "  CP_CPF_BUSY_STAT = 0x%08x\n",
> +		 RREG32(CP_CPF_BUSY_STAT));
> +	dev_info(rdev->dev, "  CP_CPF_STALLED_STAT1 = 0x%08x\n",
> +		 RREG32(CP_CPF_STALLED_STAT1));
> +	dev_info(rdev->dev, "  CP_CPF_STATUS = 0x%08x\n", RREG32(CP_CPF_STATUS));
> +	dev_info(rdev->dev, "  CP_CPC_BUSY_STAT = 0x%08x\n", RREG32(CP_CPC_BUSY_STAT));
> +	dev_info(rdev->dev, "  CP_CPC_STALLED_STAT1 = 0x%08x\n",
> +		 RREG32(CP_CPC_STALLED_STAT1));
> +	dev_info(rdev->dev, "  CP_CPC_STATUS = 0x%08x\n", RREG32(CP_CPC_STATUS));
>  }
>  
>  /**
> @@ -4952,12 +5397,31 @@ static int cik_startup(struct radeon_device *rdev)
>  	if (r)
>  		return r;
>  
> +	/* allocate rlc buffers */

The comment should say "init MEC", not "allocate rlc buffers".

> +	r = cik_mec_init(rdev);
> +	if (r) {
> +		DRM_ERROR("Failed to init MEC BOs!\n");
> +		return r;
> +	}
> +
>  	r = radeon_fence_driver_start_ring(rdev, RADEON_RING_TYPE_GFX_INDEX);
>  	if (r) {
>  		dev_err(rdev->dev, "failed initializing CP fences (%d).\n", r);
>  		return r;
>  	}
>  
> +	r = radeon_fence_driver_start_ring(rdev, CAYMAN_RING_TYPE_CP1_INDEX);
> +	if (r) {
> +		dev_err(rdev->dev, "failed initializing CP fences (%d).\n", r);
> +		return r;
> +	}
> +
> +	r = radeon_fence_driver_start_ring(rdev, CAYMAN_RING_TYPE_CP2_INDEX);
> +	if (r) {
> +		dev_err(rdev->dev, "failed initializing CP fences (%d).\n", r);
> +		return r;
> +	}
> +
>  	r = radeon_fence_driver_start_ring(rdev, R600_RING_TYPE_DMA_INDEX);
>  	if (r) {
>  		dev_err(rdev->dev, "failed initializing DMA fences (%d).\n", r);
> @@ -5002,6 +5466,36 @@ static int cik_startup(struct radeon_device *rdev)
>  	if (r)
>  		return r;
>  
> +	/* set up the compute queues */
> +	ring = &rdev->ring[CAYMAN_RING_TYPE_CP1_INDEX];
> +	r = radeon_ring_init(rdev, ring, ring->ring_size, RADEON_WB_CP1_RPTR_OFFSET,
> +			     CP_HQD_PQ_RPTR, CP_HQD_PQ_WPTR,
> +			     0, 0xfffff, RADEON_CP_PACKET2);
> +	if (r)
> +		return r;
> +	ring->me = 1; /* first MEC */
> +	ring->pipe = 0; /* first pipe */
> +	ring->queue = 0; /* first queue */
> +	ring->wptr_offs = CIK_WB_CP1_WPTR_OFFSET;
> +	ring->funcs.get_rptr = &cik_compute_ring_get_rptr;
> +	ring->funcs.get_wptr = &cik_compute_ring_get_wptr;
> +	ring->funcs.set_wptr = &cik_compute_ring_set_wptr;
> +
> +	ring = &rdev->ring[CAYMAN_RING_TYPE_CP2_INDEX];
> +	r = radeon_ring_init(rdev, ring, ring->ring_size, RADEON_WB_CP2_RPTR_OFFSET,
> +			     CP_HQD_PQ_RPTR, CP_HQD_PQ_WPTR,
> +			     0, 0xffffffff, RADEON_CP_PACKET2);
> +	if (r)
> +		return r;
> +	/* dGPU only have 1 MEC */
> +	ring->me = 1; /* first MEC */
> +	ring->pipe = 0; /* first pipe */
> +	ring->queue = 1; /* second queue */
> +	ring->wptr_offs = CIK_WB_CP2_WPTR_OFFSET;
> +	ring->funcs.get_rptr = &cik_compute_ring_get_rptr;
> +	ring->funcs.get_wptr = &cik_compute_ring_get_wptr;
> +	ring->funcs.set_wptr = &cik_compute_ring_set_wptr;
> +
>  	ring = &rdev->ring[R600_RING_TYPE_DMA_INDEX];
>  	r = radeon_ring_init(rdev, ring, ring->ring_size, R600_WB_DMA_RPTR_OFFSET,
>  			     SDMA0_GFX_RB_RPTR + SDMA0_REGISTER_OFFSET,
> @@ -5176,6 +5670,20 @@ int cik_init(struct radeon_device *rdev)
>  	ring->ring_obj = NULL;
>  	r600_ring_init(rdev, ring, 1024 * 1024);
>  
> +	ring = &rdev->ring[CAYMAN_RING_TYPE_CP1_INDEX];
> +	ring->ring_obj = NULL;
> +	r600_ring_init(rdev, ring, 1024 * 1024);
> +	r = radeon_doorbell_get(rdev, &ring->doorbell_page_num);
> +	if (r)
> +		return r;
> +
> +	ring = &rdev->ring[CAYMAN_RING_TYPE_CP2_INDEX];
> +	ring->ring_obj = NULL;
> +	r600_ring_init(rdev, ring, 1024 * 1024);
> +	r = radeon_doorbell_get(rdev, &ring->doorbell_page_num);
> +	if (r)
> +		return r;
> +
>  	ring = &rdev->ring[R600_RING_TYPE_DMA_INDEX];
>  	ring->ring_obj = NULL;
>  	r600_ring_init(rdev, ring, 256 * 1024);
> @@ -5206,6 +5714,7 @@ int cik_init(struct radeon_device *rdev)
>  		cik_sdma_fini(rdev);
>  		cik_irq_fini(rdev);
>  		si_rlc_fini(rdev);
> +		cik_mec_fini(rdev);
>  		radeon_wb_fini(rdev);
>  		radeon_ib_pool_fini(rdev);
>  		radeon_vm_manager_fini(rdev);
> @@ -5241,6 +5750,7 @@ void cik_fini(struct radeon_device *rdev)
>  	cik_sdma_fini(rdev);
>  	cik_irq_fini(rdev);
>  	si_rlc_fini(rdev);
> +	cik_mec_fini(rdev);
>  	radeon_wb_fini(rdev);
>  	radeon_vm_manager_fini(rdev);
>  	radeon_ib_pool_fini(rdev);
> diff --git a/drivers/gpu/drm/radeon/cikd.h b/drivers/gpu/drm/radeon/cikd.h
> index 79be39e..63514b9 100644
> --- a/drivers/gpu/drm/radeon/cikd.h
> +++ b/drivers/gpu/drm/radeon/cikd.h
> @@ -460,6 +460,13 @@
>  #       define RDERR_INT_ENABLE                         (1 << 0)
>  #       define GUI_IDLE_INT_ENABLE                      (1 << 19)
>  
> +#define CP_CPC_STATUS					0x8210
> +#define CP_CPC_BUSY_STAT				0x8214
> +#define CP_CPC_STALLED_STAT1				0x8218
> +#define CP_CPF_STATUS					0x821c
> +#define CP_CPF_BUSY_STAT				0x8220
> +#define CP_CPF_STALLED_STAT1				0x8224
> +
>  #define CP_MEC_CNTL					0x8234
>  #define		MEC_ME2_HALT					(1 << 28)
>  #define		MEC_ME1_HALT					(1 << 30)
> @@ -468,6 +475,12 @@
>  #define		MEC_ME2_HALT					(1 << 28)
>  #define		MEC_ME1_HALT					(1 << 30)
>  
> +#define CP_STALLED_STAT3				0x8670
> +#define CP_STALLED_STAT1				0x8674
> +#define CP_STALLED_STAT2				0x8678
> +
> +#define CP_STAT						0x8680
> +
>  #define CP_ME_CNTL					0x86D8
>  #define		CP_CE_HALT					(1 << 24)
>  #define		CP_PFP_HALT					(1 << 26)
> @@ -701,6 +714,11 @@
>  #       define CP_RINGID1_INT_STAT                      (1 << 30)
>  #       define CP_RINGID0_INT_STAT                      (1 << 31)
>  
> +#define CP_CPF_DEBUG                                    0xC200
> +
> +#define CP_PQ_WPTR_POLL_CNTL                            0xC20C
> +#define		WPTR_POLL_EN      			(1 << 31)
> +
>  #define CP_ME1_PIPE0_INT_CNTL                           0xC214
>  #define CP_ME1_PIPE1_INT_CNTL                           0xC218
>  #define CP_ME1_PIPE2_INT_CNTL                           0xC21C
> @@ -773,6 +791,50 @@
>  #define RLC_GPM_SCRATCH_ADDR                              0xC4B0
>  #define RLC_GPM_SCRATCH_DATA                              0xC4B4
>  
> +#define CP_HPD_EOP_BASE_ADDR                              0xC904
> +#define CP_HPD_EOP_BASE_ADDR_HI                           0xC908
> +#define CP_HPD_EOP_VMID                                   0xC90C
> +#define CP_HPD_EOP_CONTROL                                0xC910
> +#define		EOP_SIZE(x)				((x) << 0)
> +#define		EOP_SIZE_MASK      			(0x3f << 0)
> +#define CP_MQD_BASE_ADDR                                  0xC914
> +#define CP_MQD_BASE_ADDR_HI                               0xC918
> +#define CP_HQD_ACTIVE                                     0xC91C
> +#define CP_HQD_VMID                                       0xC920
> +
> +#define CP_HQD_PQ_BASE                                    0xC934
> +#define CP_HQD_PQ_BASE_HI                                 0xC938
> +#define CP_HQD_PQ_RPTR                                    0xC93C
> +#define CP_HQD_PQ_RPTR_REPORT_ADDR                        0xC940
> +#define CP_HQD_PQ_RPTR_REPORT_ADDR_HI                     0xC944
> +#define CP_HQD_PQ_WPTR_POLL_ADDR                          0xC948
> +#define CP_HQD_PQ_WPTR_POLL_ADDR_HI                       0xC94C
> +#define CP_HQD_PQ_DOORBELL_CONTROL                        0xC950
> +#define		DOORBELL_OFFSET(x)			((x) << 2)
> +#define		DOORBELL_OFFSET_MASK			(0x1fffff << 2)
> +#define		DOORBELL_SOURCE      			(1 << 28)
> +#define		DOORBELL_SCHD_HIT      			(1 << 29)
> +#define		DOORBELL_EN      			(1 << 30)
> +#define		DOORBELL_HIT      			(1 << 31)
> +#define CP_HQD_PQ_WPTR                                    0xC954
> +#define CP_HQD_PQ_CONTROL                                 0xC958
> +#define		QUEUE_SIZE(x)				((x) << 0)
> +#define		QUEUE_SIZE_MASK      			(0x3f << 0)
> +#define		RPTR_BLOCK_SIZE(x)			((x) << 8)
> +#define		RPTR_BLOCK_SIZE_MASK			(0x3f << 8)
> +#define		PQ_VOLATILE      			(1 << 26)
> +#define		NO_UPDATE_RPTR      			(1 << 27)
> +#define		UNORD_DISPATCH      			(1 << 28)
> +#define		ROQ_PQ_IB_FLIP      			(1 << 29)
> +#define		PRIV_STATE      			(1 << 30)
> +#define		KMD_QUEUE      				(1 << 31)
> +
> +#define CP_HQD_DEQUEUE_REQUEST                          0xC974
> +
> +#define CP_MQD_CONTROL                                  0xC99C
> +#define		MQD_VMID(x)				((x) << 0)
> +#define		MQD_VMID_MASK      			(0xf << 0)
> +
>  #define PA_SC_RASTER_CONFIG                             0x28350
>  #       define RASTER_CONFIG_RB_MAP_0                   0
>  #       define RASTER_CONFIG_RB_MAP_1                   1
> diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
> index a2a3430..d40d506 100644
> --- a/drivers/gpu/drm/radeon/radeon.h
> +++ b/drivers/gpu/drm/radeon/radeon.h
> @@ -714,6 +714,22 @@ struct radeon_ring {
>  		u32			(*get_wptr)(struct radeon_device *rdev, struct radeon_ring *ring);
>  		void			(*set_wptr)(struct radeon_device *rdev, struct radeon_ring *ring);
>  	} funcs;
> +	/* for CIK queues */
> +	u32 me;
> +	u32 pipe;
> +	u32 queue;
> +	struct radeon_bo	*mqd_obj;
> +	u32 doorbell_page_num;
> +	u32 doorbell_offset;
> +	unsigned		wptr_offs;
> +};
> +
> +struct radeon_mec {
> +	struct radeon_bo	*hpd_eop_obj;
> +	u64			hpd_eop_gpu_addr;
> +	u32 num_pipe;
> +	u32 num_mec;
> +	u32 num_queue;
>  };
>  
>  /*
> @@ -971,6 +987,8 @@ struct radeon_wb {
>  #define CAYMAN_WB_DMA1_RPTR_OFFSET   2304
>  #define R600_WB_UVD_RPTR_OFFSET  2560
>  #define R600_WB_EVENT_OFFSET     3072
> +#define CIK_WB_CP1_WPTR_OFFSET     3328
> +#define CIK_WB_CP2_WPTR_OFFSET     3584
>  
>  /**
>   * struct radeon_pm - power management datas
> @@ -1760,6 +1778,7 @@ struct radeon_device {
>  	int msi_enabled; /* msi enabled */
>  	struct r600_ih ih; /* r6/700 interrupt ring */
>  	struct si_rlc rlc;
> +	struct radeon_mec mec;
>  	struct work_struct hotplug_work;
>  	struct work_struct audio_work;
>  	struct work_struct reset_work;
> diff --git a/drivers/gpu/drm/radeon/radeon_cs.c b/drivers/gpu/drm/radeon/radeon_cs.c
> index cf71734..7e265a5 100644
> --- a/drivers/gpu/drm/radeon/radeon_cs.c
> +++ b/drivers/gpu/drm/radeon/radeon_cs.c
> @@ -121,9 +121,7 @@ static int radeon_cs_get_ring(struct radeon_cs_parser *p, u32 ring, s32 priority
>  		p->ring = RADEON_RING_TYPE_GFX_INDEX;
>  		break;
>  	case RADEON_CS_RING_COMPUTE:
> -		if (p->rdev->family >= CHIP_BONAIRE)
> -			p->ring = RADEON_RING_TYPE_GFX_INDEX;
> -		else if (p->rdev->family >= CHIP_TAHITI) {
> +		if (p->rdev->family >= CHIP_TAHITI) {
>  			if (p->priority > 0)
>  				p->ring = CAYMAN_RING_TYPE_CP1_INDEX;
>  			else
> -- 
> 1.7.7.5
> 
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
diff mbox

Patch

diff --git a/drivers/gpu/drm/radeon/cik.c b/drivers/gpu/drm/radeon/cik.c
index 5c28fa5..9d2d6bb 100644
--- a/drivers/gpu/drm/radeon/cik.c
+++ b/drivers/gpu/drm/radeon/cik.c
@@ -1687,6 +1687,7 @@  int cik_ring_test(struct radeon_device *rdev, struct radeon_ring *ring)
 	radeon_ring_write(ring, ((scratch - PACKET3_SET_UCONFIG_REG_START) >> 2));
 	radeon_ring_write(ring, 0xDEADBEEF);
 	radeon_ring_unlock_commit(rdev, ring);
+
 	for (i = 0; i < rdev->usec_timeout; i++) {
 		tmp = RREG32(scratch);
 		if (tmp == 0xDEADBEEF)
@@ -2112,6 +2113,51 @@  static int cik_cp_gfx_resume(struct radeon_device *rdev)
 	return 0;
 }
 
+static u32 cik_compute_ring_get_rptr(struct radeon_device *rdev,
+				     struct radeon_ring *ring)
+{
+	u32 rptr;
+
+
+
+	if (rdev->wb.enabled) {
+		rptr = le32_to_cpu(rdev->wb.wb[ring->rptr_offs/4]);
+	} else {
+		cik_srbm_select(rdev, ring->me, ring->pipe, ring->queue, 0);
+		rptr = RREG32(CP_HQD_PQ_RPTR);
+		cik_srbm_select(rdev, 0, 0, 0, 0);
+	}
+	rptr = (rptr & ring->ptr_reg_mask) >> ring->ptr_reg_shift;
+
+	return rptr;
+}
+
+static u32 cik_compute_ring_get_wptr(struct radeon_device *rdev,
+				     struct radeon_ring *ring)
+{
+	u32 wptr;
+
+	if (rdev->wb.enabled) {
+		wptr = le32_to_cpu(rdev->wb.wb[ring->wptr_offs/4]);
+	} else {
+		cik_srbm_select(rdev, ring->me, ring->pipe, ring->queue, 0);
+		wptr = RREG32(CP_HQD_PQ_WPTR);
+		cik_srbm_select(rdev, 0, 0, 0, 0);
+	}
+	wptr = (wptr & ring->ptr_reg_mask) >> ring->ptr_reg_shift;
+
+	return wptr;
+}
+
+static void cik_compute_ring_set_wptr(struct radeon_device *rdev,
+				      struct radeon_ring *ring)
+{
+	u32 wptr = (ring->wptr << ring->ptr_reg_shift) & ring->ptr_reg_mask;
+
+	rdev->wb.wb[ring->wptr_offs/4] = cpu_to_le32(wptr);
+	WDOORBELL32(ring->doorbell_offset, wptr);
+}
+
 /**
  * cik_cp_compute_enable - enable/disable the compute CP MEs
  *
@@ -2176,7 +2222,8 @@  static int cik_cp_compute_load_microcode(struct radeon_device *rdev)
  */
 static int cik_cp_compute_start(struct radeon_device *rdev)
 {
-	//todo
+	cik_cp_compute_enable(rdev, true);
+
 	return 0;
 }
 
@@ -2190,10 +2237,171 @@  static int cik_cp_compute_start(struct radeon_device *rdev)
  */
 static void cik_cp_compute_fini(struct radeon_device *rdev)
 {
+	int n, ret;
+
 	cik_cp_compute_enable(rdev, false);
-	//todo
+
+	/* release the MQD buffer of each of the two compute rings */
+	for (n = 0; n < 2; n++) {
+		struct radeon_ring *ring = &rdev->ring[n == 0 ?
+			CAYMAN_RING_TYPE_CP1_INDEX : CAYMAN_RING_TYPE_CP2_INDEX];
+
+		if (ring->mqd_obj == NULL)
+			continue;
+
+		ret = radeon_bo_reserve(ring->mqd_obj, false);
+		/* a failed reserve is only warned about; teardown continues */
+		if (unlikely(ret != 0))
+			dev_warn(rdev->dev, "(%d) reserve MQD bo failed\n", ret);
+		radeon_bo_unpin(ring->mqd_obj);
+		radeon_bo_unreserve(ring->mqd_obj);
+
+		radeon_bo_unref(&ring->mqd_obj);
+		ring->mqd_obj = NULL;
+	}
+}
+
+static void cik_mec_fini(struct radeon_device *rdev)
+{
+	int ret;
+
+	if (rdev->mec.hpd_eop_obj == NULL)
+		return;
+	/* a failed reserve is only warned about; teardown continues */
+	ret = radeon_bo_reserve(rdev->mec.hpd_eop_obj, false);
+	if (unlikely(ret != 0))
+		dev_warn(rdev->dev, "(%d) reserve HPD EOP bo failed\n", ret);
+	radeon_bo_unpin(rdev->mec.hpd_eop_obj);
+	radeon_bo_unreserve(rdev->mec.hpd_eop_obj);
+	radeon_bo_unref(&rdev->mec.hpd_eop_obj);
+	rdev->mec.hpd_eop_obj = NULL;
+}
+
+#define MEC_HPD_SIZE 2048
+
+static int cik_mec_init(struct radeon_device *rdev)
+{
+	int r;
+	u32 *hpd;
+	unsigned size;
+
+	/*
+	 * KV:    2 MEC, 4 Pipes/MEC, 8 Queues/Pipe - 64 Queues total
+	 * CI/KB: 1 MEC, 4 Pipes/MEC, 8 Queues/Pipe - 32 Queues total
+	 */
+	rdev->mec.num_mec = (rdev->family == CHIP_KAVERI) ? 2 : 1;
+	rdev->mec.num_pipe = 4;
+	rdev->mec.num_queue = rdev->mec.num_mec * rdev->mec.num_pipe * 8;
+	/* one EOP area of MEC_HPD_SIZE * 2 bytes per pipe of every MEC */
+	size = rdev->mec.num_mec * rdev->mec.num_pipe * MEC_HPD_SIZE * 2;
+
+	if (rdev->mec.hpd_eop_obj == NULL) {
+		r = radeon_bo_create(rdev, size, PAGE_SIZE, true,
+				     RADEON_GEM_DOMAIN_GTT, NULL,
+				     &rdev->mec.hpd_eop_obj);
+		if (r) {
+			dev_warn(rdev->dev, "(%d) create HPD EOP bo failed\n", r);
+			return r;
+		}
+	}
+
+	r = radeon_bo_reserve(rdev->mec.hpd_eop_obj, false);
+	if (unlikely(r != 0)) {
+		cik_mec_fini(rdev);
+		return r;
+	}
+	r = radeon_bo_pin(rdev->mec.hpd_eop_obj, RADEON_GEM_DOMAIN_GTT,
+			  &rdev->mec.hpd_eop_gpu_addr);
+	if (r) {
+		dev_warn(rdev->dev, "(%d) pin HPD EOP bo failed\n", r);
+		/* drop our reservation first: cik_mec_fini() reserves again */
+		radeon_bo_unreserve(rdev->mec.hpd_eop_obj);
+		cik_mec_fini(rdev);
+		return r;
+	}
+	r = radeon_bo_kmap(rdev->mec.hpd_eop_obj, (void **)&hpd);
+	if (r) {
+		dev_warn(rdev->dev, "(%d) map HPD EOP bo failed\n", r);
+		radeon_bo_unreserve(rdev->mec.hpd_eop_obj);
+		cik_mec_fini(rdev);
+		return r;
+	}
+
+	/* clear memory.  Not sure if this is required or not */
+	memset(hpd, 0, size);
+	radeon_bo_kunmap(rdev->mec.hpd_eop_obj);
+	radeon_bo_unreserve(rdev->mec.hpd_eop_obj);
+
+	return 0;
 }
 
+struct hqd_registers
+{
+	u32 cp_mqd_base_addr;	/* MQD GPU address, low 32 bits (& 0xfffffffc) */
+	u32 cp_mqd_base_addr_hi;	/* MQD GPU address, upper 32 bits */
+	u32 cp_hqd_active;	/* set to 1 when the queue is activated */
+	u32 cp_hqd_vmid;	/* cik_cp_compute_resume() sets this to 0 */
+	u32 cp_hqd_persistent_state;
+	u32 cp_hqd_pipe_priority;
+	u32 cp_hqd_queue_priority;
+	u32 cp_hqd_quantum;
+	u32 cp_hqd_pq_base;	/* ring buffer GPU address >> 8 */
+	u32 cp_hqd_pq_base_hi;
+	u32 cp_hqd_pq_rptr;
+	u32 cp_hqd_pq_rptr_report_addr;	/* writeback slot for the rptr */
+	u32 cp_hqd_pq_rptr_report_addr_hi;
+	u32 cp_hqd_pq_wptr_poll_addr;	/* writeback slot for the wptr */
+	u32 cp_hqd_pq_wptr_poll_addr_hi;
+	u32 cp_hqd_pq_doorbell_control;
+	u32 cp_hqd_pq_wptr;
+	u32 cp_hqd_pq_control;
+	u32 cp_hqd_ib_base_addr;
+	u32 cp_hqd_ib_base_addr_hi;
+	u32 cp_hqd_ib_rptr;
+	u32 cp_hqd_ib_control;
+	u32 cp_hqd_iq_timer;
+	u32 cp_hqd_iq_rptr;
+	u32 cp_hqd_dequeue_request;
+	u32 cp_hqd_dma_offload;
+	u32 cp_hqd_sema_cmd;
+	u32 cp_hqd_msg_type;
+	u32 cp_hqd_atomic0_preop_lo;
+	u32 cp_hqd_atomic0_preop_hi;
+	u32 cp_hqd_atomic1_preop_lo;
+	u32 cp_hqd_atomic1_preop_hi;
+	u32 cp_hqd_hq_scheduler0;
+	u32 cp_hqd_hq_scheduler1;
+	u32 cp_mqd_control;
+};
+
+struct bonaire_mqd
+{
+	u32 header;	/* cik_cp_compute_resume() writes 0xC0310800 here */
+	u32 dispatch_initiator;
+	u32 dimensions[3];
+	u32 start_idx[3];
+	u32 num_threads[3];
+	u32 pipeline_stat_enable;
+	u32 perf_counter_enable;
+	u32 pgm[2];
+	u32 tba[2];
+	u32 tma[2];
+	u32 pgm_rsrc[2];
+	u32 vmid;
+	u32 resource_limits;
+	u32 static_thread_mgmt01[2];	/* initialised to 0xffffffff */
+	u32 tmp_ring_size;
+	u32 static_thread_mgmt23[2];	/* initialised to 0xffffffff */
+	u32 restart[3];
+	u32 thread_trace_enable;
+	u32 reserved1;
+	u32 user_data[16];
+	u32 vgtcs_invoke_count[2];
+	struct hqd_registers queue_state;	/* copy of the values written to the queue's registers */
+	u32 dequeue_cntr;
+	u32 interrupt_queue[64];
+};
+
 /**
  * cik_cp_compute_resume - setup the compute queue registers
  *
@@ -2205,24 +2413,247 @@  static void cik_cp_compute_fini(struct radeon_device *rdev)
  */
 static int cik_cp_compute_resume(struct radeon_device *rdev)
 {
-	int r;
+	int r, i, j, idx;
+	u32 tmp;
+	bool use_doorbell = true;
+	u64 hqd_gpu_addr;
+	u64 mqd_gpu_addr;
+	u64 eop_gpu_addr;
+	u64 wb_gpu_addr;
+	u32 *buf;
+	struct bonaire_mqd *mqd;
 
-	//todo
 	r = cik_cp_compute_start(rdev);
 	if (r)
 		return r;
+
+	/* fix up chicken bits */
+	tmp = RREG32(CP_CPF_DEBUG);
+	tmp |= (1 << 23);
+	WREG32(CP_CPF_DEBUG, tmp);
+
+	/* init the pipes */
+	for (i = 0; i < (rdev->mec.num_pipe * rdev->mec.num_mec); i++) {
+		int me = (i < 4) ? 1 : 2;
+		int pipe = (i < 4) ? i : (i - 4);
+
+		eop_gpu_addr = rdev->mec.hpd_eop_gpu_addr + (i * MEC_HPD_SIZE * 2);
+
+		cik_srbm_select(rdev, me, pipe, 0, 0);
+
+		/* write the EOP addr */
+		WREG32(CP_HPD_EOP_BASE_ADDR, eop_gpu_addr >> 8);
+		WREG32(CP_HPD_EOP_BASE_ADDR_HI, upper_32_bits(eop_gpu_addr) >> 8);
+
+		/* set the VMID assigned */
+		WREG32(CP_HPD_EOP_VMID, 0);
+
+		/* set the EOP size, register value is 2^(EOP_SIZE+1) dwords */
+		tmp = RREG32(CP_HPD_EOP_CONTROL);
+		tmp &= ~EOP_SIZE_MASK;
+		tmp |= drm_order(MEC_HPD_SIZE / 8);
+		WREG32(CP_HPD_EOP_CONTROL, tmp);
+	}
+	cik_srbm_select(rdev, 0, 0, 0, 0);
+
+	/* init the queues.  Just two for now. */
+	for (i = 0; i < 2; i++) {
+		if (i == 0)
+			idx = CAYMAN_RING_TYPE_CP1_INDEX;
+		else
+			idx = CAYMAN_RING_TYPE_CP2_INDEX;
+
+		if (rdev->ring[idx].mqd_obj == NULL) {
+			r = radeon_bo_create(rdev,
+					     sizeof(struct bonaire_mqd),
+					     PAGE_SIZE, true,
+					     RADEON_GEM_DOMAIN_GTT, NULL,
+					     &rdev->ring[idx].mqd_obj);
+			if (r) {
+				dev_warn(rdev->dev, "(%d) create MQD bo failed\n", r);
+				return r;
+			}
+		}
+
+		r = radeon_bo_reserve(rdev->ring[idx].mqd_obj, false);
+		if (unlikely(r != 0)) {
+			cik_cp_compute_fini(rdev);
+			return r;
+		}
+		r = radeon_bo_pin(rdev->ring[idx].mqd_obj, RADEON_GEM_DOMAIN_GTT,
+				  &mqd_gpu_addr);
+		if (r) {
+			dev_warn(rdev->dev, "(%d) pin MQD bo failed\n", r);
+			cik_cp_compute_fini(rdev);
+			return r;
+		}
+		r = radeon_bo_kmap(rdev->ring[idx].mqd_obj, (void **)&buf);
+		if (r) {
+			dev_warn(rdev->dev, "(%d) map MQD bo failed\n", r);
+			cik_cp_compute_fini(rdev);
+			return r;
+		}
+
+		/* doorbell offset */
+		rdev->ring[idx].doorbell_offset =
+			(rdev->ring[idx].doorbell_page_num * PAGE_SIZE) + 0;
+
+		/* init the mqd struct */
+		memset(buf, 0, sizeof(struct bonaire_mqd));
+
+		mqd = (struct bonaire_mqd *)buf;
+		mqd->header = 0xC0310800;
+		mqd->static_thread_mgmt01[0] = 0xffffffff;
+		mqd->static_thread_mgmt01[1] = 0xffffffff;
+		mqd->static_thread_mgmt23[0] = 0xffffffff;
+		mqd->static_thread_mgmt23[1] = 0xffffffff;
+
+		cik_srbm_select(rdev, rdev->ring[idx].me,
+				rdev->ring[idx].pipe,
+				rdev->ring[idx].queue, 0);
+
+		/* disable wptr polling */
+		tmp = RREG32(CP_PQ_WPTR_POLL_CNTL);
+		tmp &= ~WPTR_POLL_EN;
+		WREG32(CP_PQ_WPTR_POLL_CNTL, tmp);
+
+		/* enable doorbell? */
+		mqd->queue_state.cp_hqd_pq_doorbell_control =
+			RREG32(CP_HQD_PQ_DOORBELL_CONTROL);
+		if (use_doorbell)
+			mqd->queue_state.cp_hqd_pq_doorbell_control |= DOORBELL_EN;
+		else
+			mqd->queue_state.cp_hqd_pq_doorbell_control &= ~DOORBELL_EN;
+		WREG32(CP_HQD_PQ_DOORBELL_CONTROL,
+		       mqd->queue_state.cp_hqd_pq_doorbell_control);
+
+		/* disable the queue if it's active */
+		mqd->queue_state.cp_hqd_dequeue_request = 0;
+		mqd->queue_state.cp_hqd_pq_rptr = 0;
+		mqd->queue_state.cp_hqd_pq_wptr = 0;
+		if (RREG32(CP_HQD_ACTIVE) & 1) {
+			WREG32(CP_HQD_DEQUEUE_REQUEST, 1);
+			/* poll with j so the outer queue index i stays intact */
+			for (j = 0; j < rdev->usec_timeout; j++) {
+				if (!(RREG32(CP_HQD_ACTIVE) & 1))
+					break;
+				udelay(1);
+			}
+			WREG32(CP_HQD_DEQUEUE_REQUEST, mqd->queue_state.cp_hqd_dequeue_request);
+			WREG32(CP_HQD_PQ_RPTR, mqd->queue_state.cp_hqd_pq_rptr);
+			WREG32(CP_HQD_PQ_WPTR, mqd->queue_state.cp_hqd_pq_wptr);
+		}
+
+		/* set the pointer to the MQD */
+		mqd->queue_state.cp_mqd_base_addr = mqd_gpu_addr & 0xfffffffc;
+		mqd->queue_state.cp_mqd_base_addr_hi = upper_32_bits(mqd_gpu_addr);
+		WREG32(CP_MQD_BASE_ADDR, mqd->queue_state.cp_mqd_base_addr);
+		WREG32(CP_MQD_BASE_ADDR_HI, mqd->queue_state.cp_mqd_base_addr_hi);
+		/* set MQD vmid to 0 */
+		mqd->queue_state.cp_mqd_control = RREG32(CP_MQD_CONTROL);
+		mqd->queue_state.cp_mqd_control &= ~MQD_VMID_MASK;
+		WREG32(CP_MQD_CONTROL, mqd->queue_state.cp_mqd_control);
+
+		/* set the pointer to the HQD, this is similar to CP_RB0_BASE/_HI */
+		hqd_gpu_addr = rdev->ring[idx].gpu_addr >> 8;
+		mqd->queue_state.cp_hqd_pq_base = hqd_gpu_addr;
+		mqd->queue_state.cp_hqd_pq_base_hi = upper_32_bits(hqd_gpu_addr);
+		WREG32(CP_HQD_PQ_BASE, mqd->queue_state.cp_hqd_pq_base);
+		WREG32(CP_HQD_PQ_BASE_HI, mqd->queue_state.cp_hqd_pq_base_hi);
+
+		/* set up the HQD, this is similar to CP_RB0_CNTL */
+		mqd->queue_state.cp_hqd_pq_control = RREG32(CP_HQD_PQ_CONTROL);
+		mqd->queue_state.cp_hqd_pq_control &=
+			~(QUEUE_SIZE_MASK | RPTR_BLOCK_SIZE_MASK);
+
+		mqd->queue_state.cp_hqd_pq_control |=
+			drm_order(rdev->ring[idx].ring_size / 8);
+		mqd->queue_state.cp_hqd_pq_control |=
+			(drm_order(RADEON_GPU_PAGE_SIZE/8) << 8);
+#ifdef __BIG_ENDIAN
+		mqd->queue_state.cp_hqd_pq_control |= BUF_SWAP_32BIT;
+#endif
+		mqd->queue_state.cp_hqd_pq_control &=
+			~(UNORD_DISPATCH | ROQ_PQ_IB_FLIP | PQ_VOLATILE);
+		mqd->queue_state.cp_hqd_pq_control |=
+			PRIV_STATE | KMD_QUEUE; /* assuming kernel queue control */
+		WREG32(CP_HQD_PQ_CONTROL, mqd->queue_state.cp_hqd_pq_control);
+
+		/* only used if CP_PQ_WPTR_POLL_CNTL.WPTR_POLL_EN=1 */
+		if (i == 0)
+			wb_gpu_addr = rdev->wb.gpu_addr + CIK_WB_CP1_WPTR_OFFSET;
+		else
+			wb_gpu_addr = rdev->wb.gpu_addr + CIK_WB_CP2_WPTR_OFFSET;
+		mqd->queue_state.cp_hqd_pq_wptr_poll_addr = wb_gpu_addr & 0xfffffffc;
+		mqd->queue_state.cp_hqd_pq_wptr_poll_addr_hi = upper_32_bits(wb_gpu_addr) & 0xffff;
+		WREG32(CP_HQD_PQ_WPTR_POLL_ADDR, mqd->queue_state.cp_hqd_pq_wptr_poll_addr);
+		WREG32(CP_HQD_PQ_WPTR_POLL_ADDR_HI,
+		       mqd->queue_state.cp_hqd_pq_wptr_poll_addr_hi);
+
+		/* set the wb address whether it's enabled or not */
+		if (i == 0)
+			wb_gpu_addr = rdev->wb.gpu_addr + RADEON_WB_CP1_RPTR_OFFSET;
+		else
+			wb_gpu_addr = rdev->wb.gpu_addr + RADEON_WB_CP2_RPTR_OFFSET;
+		mqd->queue_state.cp_hqd_pq_rptr_report_addr = wb_gpu_addr & 0xfffffffc;
+		mqd->queue_state.cp_hqd_pq_rptr_report_addr_hi =
+			upper_32_bits(wb_gpu_addr) & 0xffff;
+		WREG32(CP_HQD_PQ_RPTR_REPORT_ADDR,
+		       mqd->queue_state.cp_hqd_pq_rptr_report_addr);
+		WREG32(CP_HQD_PQ_RPTR_REPORT_ADDR_HI,
+		       mqd->queue_state.cp_hqd_pq_rptr_report_addr_hi);
+
+		/* enable the doorbell if requested */
+		if (use_doorbell) {
+			mqd->queue_state.cp_hqd_pq_doorbell_control =
+				RREG32(CP_HQD_PQ_DOORBELL_CONTROL);
+			mqd->queue_state.cp_hqd_pq_doorbell_control &= ~DOORBELL_OFFSET_MASK;
+			mqd->queue_state.cp_hqd_pq_doorbell_control |=
+				DOORBELL_OFFSET(rdev->ring[idx].doorbell_offset / 4);
+			mqd->queue_state.cp_hqd_pq_doorbell_control |= DOORBELL_EN;
+			mqd->queue_state.cp_hqd_pq_doorbell_control &=
+				~(DOORBELL_SOURCE | DOORBELL_HIT);
+		} else {
+			mqd->queue_state.cp_hqd_pq_doorbell_control = 0;
+		}
+		WREG32(CP_HQD_PQ_DOORBELL_CONTROL,
+		       mqd->queue_state.cp_hqd_pq_doorbell_control);
+
+		/* read and write pointers, similar to CP_RB0_WPTR/_RPTR */
+		rdev->ring[idx].wptr = 0;
+		mqd->queue_state.cp_hqd_pq_wptr = rdev->ring[idx].wptr;
+		WREG32(CP_HQD_PQ_WPTR, mqd->queue_state.cp_hqd_pq_wptr);
+		rdev->ring[idx].rptr = RREG32(CP_HQD_PQ_RPTR);
+		mqd->queue_state.cp_hqd_pq_rptr = rdev->ring[idx].rptr;
+
+		/* set the vmid for the queue */
+		mqd->queue_state.cp_hqd_vmid = 0;
+		WREG32(CP_HQD_VMID, mqd->queue_state.cp_hqd_vmid);
+
+		/* activate the queue */
+		mqd->queue_state.cp_hqd_active = 1;
+		WREG32(CP_HQD_ACTIVE, mqd->queue_state.cp_hqd_active);
+
+		cik_srbm_select(rdev, 0, 0, 0, 0);
+
+		radeon_bo_kunmap(rdev->ring[idx].mqd_obj);
+		radeon_bo_unreserve(rdev->ring[idx].mqd_obj);
+
+		rdev->ring[idx].ready = true;
+		r = radeon_ring_test(rdev, idx, &rdev->ring[idx]);
+		if (r)
+			rdev->ring[idx].ready = false;
+	}
+
 	return 0;
 }
 
-/* XXX temporary wrappers to handle both compute and gfx */
-/* XXX */
 static void cik_cp_enable(struct radeon_device *rdev, bool enable)
 {
 	cik_cp_gfx_enable(rdev, enable);
 	cik_cp_compute_enable(rdev, enable);
 }
 
-/* XXX */
 static int cik_cp_load_microcode(struct radeon_device *rdev)
 {
 	int r;
@@ -2237,14 +2668,12 @@  static int cik_cp_load_microcode(struct radeon_device *rdev)
 	return 0;
 }
 
-/* XXX */
 static void cik_cp_fini(struct radeon_device *rdev)
 {
 	cik_cp_gfx_fini(rdev);
 	cik_cp_compute_fini(rdev);
 }
 
-/* XXX */
 static int cik_cp_resume(struct radeon_device *rdev)
 {
 	int r;
@@ -2865,6 +3294,22 @@  static void cik_print_gpu_status_regs(struct radeon_device *rdev)
 		RREG32(SDMA0_STATUS_REG + SDMA0_REGISTER_OFFSET));
 	dev_info(rdev->dev, "  SDMA1_STATUS_REG   = 0x%08X\n",
 		 RREG32(SDMA0_STATUS_REG + SDMA1_REGISTER_OFFSET));
+	dev_info(rdev->dev, "  CP_STAT = 0x%08x\n", RREG32(CP_STAT));
+	dev_info(rdev->dev, "  CP_STALLED_STAT1 = 0x%08x\n",
+		 RREG32(CP_STALLED_STAT1));
+	dev_info(rdev->dev, "  CP_STALLED_STAT2 = 0x%08x\n",
+		 RREG32(CP_STALLED_STAT2));
+	dev_info(rdev->dev, "  CP_STALLED_STAT3 = 0x%08x\n",
+		 RREG32(CP_STALLED_STAT3));
+	dev_info(rdev->dev, "  CP_CPF_BUSY_STAT = 0x%08x\n",
+		 RREG32(CP_CPF_BUSY_STAT));
+	dev_info(rdev->dev, "  CP_CPF_STALLED_STAT1 = 0x%08x\n",
+		 RREG32(CP_CPF_STALLED_STAT1));
+	dev_info(rdev->dev, "  CP_CPF_STATUS = 0x%08x\n", RREG32(CP_CPF_STATUS));
+	dev_info(rdev->dev, "  CP_CPC_BUSY_STAT = 0x%08x\n", RREG32(CP_CPC_BUSY_STAT));
+	dev_info(rdev->dev, "  CP_CPC_STALLED_STAT1 = 0x%08x\n",
+		 RREG32(CP_CPC_STALLED_STAT1));
+	dev_info(rdev->dev, "  CP_CPC_STATUS = 0x%08x\n", RREG32(CP_CPC_STATUS));
 }
 
 /**
@@ -4952,12 +5397,31 @@  static int cik_startup(struct radeon_device *rdev)
 	if (r)
 		return r;
 
+	/* allocate mec buffers */
+	r = cik_mec_init(rdev);
+	if (r) {
+		DRM_ERROR("Failed to init MEC BOs!\n");
+		return r;
+	}
+
 	r = radeon_fence_driver_start_ring(rdev, RADEON_RING_TYPE_GFX_INDEX);
 	if (r) {
 		dev_err(rdev->dev, "failed initializing CP fences (%d).\n", r);
 		return r;
 	}
 
+	r = radeon_fence_driver_start_ring(rdev, CAYMAN_RING_TYPE_CP1_INDEX);
+	if (r) {
+		dev_err(rdev->dev, "failed initializing CP fences (%d).\n", r);
+		return r;
+	}
+
+	r = radeon_fence_driver_start_ring(rdev, CAYMAN_RING_TYPE_CP2_INDEX);
+	if (r) {
+		dev_err(rdev->dev, "failed initializing CP fences (%d).\n", r);
+		return r;
+	}
+
 	r = radeon_fence_driver_start_ring(rdev, R600_RING_TYPE_DMA_INDEX);
 	if (r) {
 		dev_err(rdev->dev, "failed initializing DMA fences (%d).\n", r);
@@ -5002,6 +5466,36 @@  static int cik_startup(struct radeon_device *rdev)
 	if (r)
 		return r;
 
+	/* set up the compute queues */
+	ring = &rdev->ring[CAYMAN_RING_TYPE_CP1_INDEX];
+	r = radeon_ring_init(rdev, ring, ring->ring_size, RADEON_WB_CP1_RPTR_OFFSET,
+			     CP_HQD_PQ_RPTR, CP_HQD_PQ_WPTR,
+			     0, 0xfffff, RADEON_CP_PACKET2);
+	if (r)
+		return r;
+	ring->me = 1; /* first MEC */
+	ring->pipe = 0; /* first pipe */
+	ring->queue = 0; /* first queue */
+	ring->wptr_offs = CIK_WB_CP1_WPTR_OFFSET;
+	ring->funcs.get_rptr = &cik_compute_ring_get_rptr;
+	ring->funcs.get_wptr = &cik_compute_ring_get_wptr;
+	ring->funcs.set_wptr = &cik_compute_ring_set_wptr;
+
+	ring = &rdev->ring[CAYMAN_RING_TYPE_CP2_INDEX];
+	r = radeon_ring_init(rdev, ring, ring->ring_size, RADEON_WB_CP2_RPTR_OFFSET,
+			     CP_HQD_PQ_RPTR, CP_HQD_PQ_WPTR,
+			     0, 0xffffffff, RADEON_CP_PACKET2);
+	if (r)
+		return r;
+	/* dGPU only have 1 MEC */
+	ring->me = 1; /* first MEC */
+	ring->pipe = 0; /* first pipe */
+	ring->queue = 1; /* second queue */
+	ring->wptr_offs = CIK_WB_CP2_WPTR_OFFSET;
+	ring->funcs.get_rptr = &cik_compute_ring_get_rptr;
+	ring->funcs.get_wptr = &cik_compute_ring_get_wptr;
+	ring->funcs.set_wptr = &cik_compute_ring_set_wptr;
+
 	ring = &rdev->ring[R600_RING_TYPE_DMA_INDEX];
 	r = radeon_ring_init(rdev, ring, ring->ring_size, R600_WB_DMA_RPTR_OFFSET,
 			     SDMA0_GFX_RB_RPTR + SDMA0_REGISTER_OFFSET,
@@ -5176,6 +5670,20 @@  int cik_init(struct radeon_device *rdev)
 	ring->ring_obj = NULL;
 	r600_ring_init(rdev, ring, 1024 * 1024);
 
+	ring = &rdev->ring[CAYMAN_RING_TYPE_CP1_INDEX];
+	ring->ring_obj = NULL;
+	r600_ring_init(rdev, ring, 1024 * 1024);
+	r = radeon_doorbell_get(rdev, &ring->doorbell_page_num);
+	if (r)
+		return r;
+
+	ring = &rdev->ring[CAYMAN_RING_TYPE_CP2_INDEX];
+	ring->ring_obj = NULL;
+	r600_ring_init(rdev, ring, 1024 * 1024);
+	r = radeon_doorbell_get(rdev, &ring->doorbell_page_num);
+	if (r)
+		return r;
+
 	ring = &rdev->ring[R600_RING_TYPE_DMA_INDEX];
 	ring->ring_obj = NULL;
 	r600_ring_init(rdev, ring, 256 * 1024);
@@ -5206,6 +5714,7 @@  int cik_init(struct radeon_device *rdev)
 		cik_sdma_fini(rdev);
 		cik_irq_fini(rdev);
 		si_rlc_fini(rdev);
+		cik_mec_fini(rdev);
 		radeon_wb_fini(rdev);
 		radeon_ib_pool_fini(rdev);
 		radeon_vm_manager_fini(rdev);
@@ -5241,6 +5750,7 @@  void cik_fini(struct radeon_device *rdev)
 	cik_sdma_fini(rdev);
 	cik_irq_fini(rdev);
 	si_rlc_fini(rdev);
+	cik_mec_fini(rdev);
 	radeon_wb_fini(rdev);
 	radeon_vm_manager_fini(rdev);
 	radeon_ib_pool_fini(rdev);
diff --git a/drivers/gpu/drm/radeon/cikd.h b/drivers/gpu/drm/radeon/cikd.h
index 79be39e..63514b9 100644
--- a/drivers/gpu/drm/radeon/cikd.h
+++ b/drivers/gpu/drm/radeon/cikd.h
@@ -460,6 +460,13 @@ 
 #       define RDERR_INT_ENABLE                         (1 << 0)
 #       define GUI_IDLE_INT_ENABLE                      (1 << 19)
 
+#define CP_CPC_STATUS					0x8210
+#define CP_CPC_BUSY_STAT				0x8214
+#define CP_CPC_STALLED_STAT1				0x8218
+#define CP_CPF_STATUS					0x821c
+#define CP_CPF_BUSY_STAT				0x8220
+#define CP_CPF_STALLED_STAT1				0x8224
+
 #define CP_MEC_CNTL					0x8234
 #define		MEC_ME2_HALT					(1 << 28)
 #define		MEC_ME1_HALT					(1 << 30)
@@ -468,6 +475,12 @@ 
 #define		MEC_ME2_HALT					(1 << 28)
 #define		MEC_ME1_HALT					(1 << 30)
 
+#define CP_STALLED_STAT3				0x8670
+#define CP_STALLED_STAT1				0x8674
+#define CP_STALLED_STAT2				0x8678
+
+#define CP_STAT						0x8680
+
 #define CP_ME_CNTL					0x86D8
 #define		CP_CE_HALT					(1 << 24)
 #define		CP_PFP_HALT					(1 << 26)
@@ -701,6 +714,11 @@ 
 #       define CP_RINGID1_INT_STAT                      (1 << 30)
 #       define CP_RINGID0_INT_STAT                      (1 << 31)
 
+#define CP_CPF_DEBUG                                    0xC200
+
+#define CP_PQ_WPTR_POLL_CNTL                            0xC20C
+#define		WPTR_POLL_EN      			(1 << 31)
+
 #define CP_ME1_PIPE0_INT_CNTL                           0xC214
 #define CP_ME1_PIPE1_INT_CNTL                           0xC218
 #define CP_ME1_PIPE2_INT_CNTL                           0xC21C
@@ -773,6 +791,50 @@ 
 #define RLC_GPM_SCRATCH_ADDR                              0xC4B0
 #define RLC_GPM_SCRATCH_DATA                              0xC4B4
 
+#define CP_HPD_EOP_BASE_ADDR                              0xC904
+#define CP_HPD_EOP_BASE_ADDR_HI                           0xC908
+#define CP_HPD_EOP_VMID                                   0xC90C
+#define CP_HPD_EOP_CONTROL                                0xC910
+#define		EOP_SIZE(x)				((x) << 0)
+#define		EOP_SIZE_MASK      			(0x3f << 0)
+#define CP_MQD_BASE_ADDR                                  0xC914
+#define CP_MQD_BASE_ADDR_HI                               0xC918
+#define CP_HQD_ACTIVE                                     0xC91C
+#define CP_HQD_VMID                                       0xC920
+
+#define CP_HQD_PQ_BASE                                    0xC934
+#define CP_HQD_PQ_BASE_HI                                 0xC938
+#define CP_HQD_PQ_RPTR                                    0xC93C
+#define CP_HQD_PQ_RPTR_REPORT_ADDR                        0xC940
+#define CP_HQD_PQ_RPTR_REPORT_ADDR_HI                     0xC944
+#define CP_HQD_PQ_WPTR_POLL_ADDR                          0xC948
+#define CP_HQD_PQ_WPTR_POLL_ADDR_HI                       0xC94C
+#define CP_HQD_PQ_DOORBELL_CONTROL                        0xC950
+#define		DOORBELL_OFFSET(x)			((x) << 2)
+#define		DOORBELL_OFFSET_MASK			(0x1fffff << 2)
+#define		DOORBELL_SOURCE      			(1 << 28)
+#define		DOORBELL_SCHD_HIT      			(1 << 29)
+#define		DOORBELL_EN      			(1 << 30)
+#define		DOORBELL_HIT      			(1 << 31)
+#define CP_HQD_PQ_WPTR                                    0xC954
+#define CP_HQD_PQ_CONTROL                                 0xC958
+#define		QUEUE_SIZE(x)				((x) << 0)
+#define		QUEUE_SIZE_MASK      			(0x3f << 0)
+#define		RPTR_BLOCK_SIZE(x)			((x) << 8)
+#define		RPTR_BLOCK_SIZE_MASK			(0x3f << 8)
+#define		PQ_VOLATILE      			(1 << 26)
+#define		NO_UPDATE_RPTR      			(1 << 27)
+#define		UNORD_DISPATCH      			(1 << 28)
+#define		ROQ_PQ_IB_FLIP      			(1 << 29)
+#define		PRIV_STATE      			(1 << 30)
+#define		KMD_QUEUE      				(1 << 31)
+
+#define CP_HQD_DEQUEUE_REQUEST                          0xC974
+
+#define CP_MQD_CONTROL                                  0xC99C
+#define		MQD_VMID(x)				((x) << 0)
+#define		MQD_VMID_MASK      			(0xf << 0)
+
 #define PA_SC_RASTER_CONFIG                             0x28350
 #       define RASTER_CONFIG_RB_MAP_0                   0
 #       define RASTER_CONFIG_RB_MAP_1                   1
diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index a2a3430..d40d506 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -714,6 +714,22 @@  struct radeon_ring {
 		u32			(*get_wptr)(struct radeon_device *rdev, struct radeon_ring *ring);
 		void			(*set_wptr)(struct radeon_device *rdev, struct radeon_ring *ring);
 	} funcs;
+	/* for CIK queues */
+	u32 me;
+	u32 pipe;
+	u32 queue;
+	struct radeon_bo	*mqd_obj;
+	u32 doorbell_page_num;
+	u32 doorbell_offset;
+	unsigned		wptr_offs;
+};
+
+struct radeon_mec {
+	struct radeon_bo	*hpd_eop_obj;	/* one EOP buffer covering every pipe */
+	u64			hpd_eop_gpu_addr;	/* GPU address returned when hpd_eop_obj is pinned */
+	u32 num_pipe;	/* pipes per MEC; cik_mec_init() sets 4 */
+	u32 num_mec;	/* 2 on CHIP_KAVERI, otherwise 1 */
+	u32 num_queue;	/* num_mec * num_pipe * 8 */
 };
 
 /*
@@ -971,6 +987,8 @@  struct radeon_wb {
 #define CAYMAN_WB_DMA1_RPTR_OFFSET   2304
 #define R600_WB_UVD_RPTR_OFFSET  2560
 #define R600_WB_EVENT_OFFSET     3072
+#define CIK_WB_CP1_WPTR_OFFSET     3328
+#define CIK_WB_CP2_WPTR_OFFSET     3584
 
 /**
  * struct radeon_pm - power management datas
@@ -1760,6 +1778,7 @@  struct radeon_device {
 	int msi_enabled; /* msi enabled */
 	struct r600_ih ih; /* r6/700 interrupt ring */
 	struct si_rlc rlc;
+	struct radeon_mec mec;
 	struct work_struct hotplug_work;
 	struct work_struct audio_work;
 	struct work_struct reset_work;
diff --git a/drivers/gpu/drm/radeon/radeon_cs.c b/drivers/gpu/drm/radeon/radeon_cs.c
index cf71734..7e265a5 100644
--- a/drivers/gpu/drm/radeon/radeon_cs.c
+++ b/drivers/gpu/drm/radeon/radeon_cs.c
@@ -121,9 +121,7 @@  static int radeon_cs_get_ring(struct radeon_cs_parser *p, u32 ring, s32 priority
 		p->ring = RADEON_RING_TYPE_GFX_INDEX;
 		break;
 	case RADEON_CS_RING_COMPUTE:
-		if (p->rdev->family >= CHIP_BONAIRE)
-			p->ring = RADEON_RING_TYPE_GFX_INDEX;
-		else if (p->rdev->family >= CHIP_TAHITI) {
+		if (p->rdev->family >= CHIP_TAHITI) {
 			if (p->priority > 0)
 				p->ring = CAYMAN_RING_TYPE_CP1_INDEX;
 			else