diff mbox

[2/2] radeon/kms: cleanup async dma packet checking

Message ID 1357767643-3538-2-git-send-email-j.glisse@gmail.com (mailing list archive)
State New, archived
Headers show

Commit Message

Jerome Glisse Jan. 9, 2013, 9:40 p.m. UTC
From: Jerome Glisse <jglisse@redhat.com>

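This simplifies and cleans up the async DMA packet checking. The separate
tiled/new_cmd/misc header fields are replaced by a single sub_cmd field, so
each packet type is validated with one flat switch, and DMA_PACKET() takes
one argument fewer.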

Signed-off-by: Jerome Glisse <jglisse@redhat.com>
---
 drivers/gpu/drm/radeon/evergreen.c    |  16 +-
 drivers/gpu/drm/radeon/evergreen_cs.c | 807 +++++++++++++++++-----------------
 drivers/gpu/drm/radeon/evergreend.h   |  29 +-
 3 files changed, 417 insertions(+), 435 deletions(-)
diff mbox

Patch

diff --git a/drivers/gpu/drm/radeon/evergreen.c b/drivers/gpu/drm/radeon/evergreen.c
index f92f6bb..28f8d4f 100644
--- a/drivers/gpu/drm/radeon/evergreen.c
+++ b/drivers/gpu/drm/radeon/evergreen.c
@@ -3223,14 +3223,14 @@  void evergreen_dma_fence_ring_emit(struct radeon_device *rdev,
 	struct radeon_ring *ring = &rdev->ring[fence->ring];
 	u64 addr = rdev->fence_drv[fence->ring].gpu_addr;
 	/* write the fence */
-	radeon_ring_write(ring, DMA_PACKET(DMA_PACKET_FENCE, 0, 0, 0));
+	radeon_ring_write(ring, DMA_PACKET(DMA_PACKET_FENCE, 0, 0));
 	radeon_ring_write(ring, addr & 0xfffffffc);
 	radeon_ring_write(ring, (upper_32_bits(addr) & 0xff));
 	radeon_ring_write(ring, fence->seq);
 	/* generate an interrupt */
-	radeon_ring_write(ring, DMA_PACKET(DMA_PACKET_TRAP, 0, 0, 0));
+	radeon_ring_write(ring, DMA_PACKET(DMA_PACKET_TRAP, 0, 0));
 	/* flush HDP */
-	radeon_ring_write(ring, DMA_PACKET(DMA_PACKET_SRBM_WRITE, 0, 0, 0));
+	radeon_ring_write(ring, DMA_PACKET(DMA_PACKET_SRBM_WRITE, 0, 0));
 	radeon_ring_write(ring, (0xf << 16) | HDP_MEM_COHERENCY_FLUSH_CNTL);
 	radeon_ring_write(ring, 1);
 }
@@ -3253,7 +3253,7 @@  void evergreen_dma_ring_ib_execute(struct radeon_device *rdev,
 		while ((next_rptr & 7) != 5)
 			next_rptr++;
 		next_rptr += 3;
-		radeon_ring_write(ring, DMA_PACKET(DMA_PACKET_WRITE, 0, 0, 1));
+		radeon_ring_write(ring, DMA_PACKET(DMA_PACKET_WRITE, 0, 1));
 		radeon_ring_write(ring, ring->next_rptr_gpu_addr & 0xfffffffc);
 		radeon_ring_write(ring, upper_32_bits(ring->next_rptr_gpu_addr) & 0xff);
 		radeon_ring_write(ring, next_rptr);
@@ -3263,8 +3263,8 @@  void evergreen_dma_ring_ib_execute(struct radeon_device *rdev,
 	 * Pad as necessary with NOPs.
 	 */
 	while ((ring->wptr & 7) != 5)
-		radeon_ring_write(ring, DMA_PACKET(DMA_PACKET_NOP, 0, 0, 0));
-	radeon_ring_write(ring, DMA_PACKET(DMA_PACKET_INDIRECT_BUFFER, 0, 0, 0));
+		radeon_ring_write(ring, DMA_PACKET(DMA_PACKET_NOP, 0, 0));
+	radeon_ring_write(ring, DMA_PACKET(DMA_PACKET_INDIRECT_BUFFER, 0, 0));
 	radeon_ring_write(ring, (ib->gpu_addr & 0xFFFFFFE0));
 	radeon_ring_write(ring, (ib->length_dw << 12) | (upper_32_bits(ib->gpu_addr) & 0xFF));
 
@@ -3323,7 +3323,7 @@  int evergreen_copy_dma(struct radeon_device *rdev,
 		if (cur_size_in_dw > 0xFFFFF)
 			cur_size_in_dw = 0xFFFFF;
 		size_in_dw -= cur_size_in_dw;
-		radeon_ring_write(ring, DMA_PACKET(DMA_PACKET_COPY, 0, 0, cur_size_in_dw));
+		radeon_ring_write(ring, DMA_PACKET(DMA_PACKET_COPY, 0, cur_size_in_dw));
 		radeon_ring_write(ring, dst_offset & 0xfffffffc);
 		radeon_ring_write(ring, src_offset & 0xfffffffc);
 		radeon_ring_write(ring, upper_32_bits(dst_offset) & 0xff);
@@ -3431,7 +3431,7 @@  static int evergreen_startup(struct radeon_device *rdev)
 	ring = &rdev->ring[R600_RING_TYPE_DMA_INDEX];
 	r = radeon_ring_init(rdev, ring, ring->ring_size, R600_WB_DMA_RPTR_OFFSET,
 			     DMA_RB_RPTR, DMA_RB_WPTR,
-			     2, 0x3fffc, DMA_PACKET(DMA_PACKET_NOP, 0, 0, 0));
+			     2, 0x3fffc, DMA_PACKET(DMA_PACKET_NOP, 0, 0));
 	if (r)
 		return r;
 
diff --git a/drivers/gpu/drm/radeon/evergreen_cs.c b/drivers/gpu/drm/radeon/evergreen_cs.c
index 7a44566..32c07bb 100644
--- a/drivers/gpu/drm/radeon/evergreen_cs.c
+++ b/drivers/gpu/drm/radeon/evergreen_cs.c
@@ -2858,16 +2858,6 @@  int evergreen_cs_parse(struct radeon_cs_parser *p)
 	return 0;
 }
 
-/*
- *  DMA
- */
-
-#define GET_DMA_CMD(h) (((h) & 0xf0000000) >> 28)
-#define GET_DMA_COUNT(h) ((h) & 0x000fffff)
-#define GET_DMA_T(h) (((h) & 0x00800000) >> 23)
-#define GET_DMA_NEW(h) (((h) & 0x04000000) >> 26)
-#define GET_DMA_MISC(h) (((h) & 0x0700000) >> 20)
-
 /**
  * evergreen_dma_cs_parse() - parse the DMA IB
  * @p:		parser structure holding parsing context.
@@ -2881,9 +2871,9 @@  int evergreen_dma_cs_parse(struct radeon_cs_parser *p)
 {
 	struct radeon_cs_chunk *ib_chunk = &p->chunks[p->chunk_ib_idx];
 	struct radeon_cs_reloc *src_reloc, *dst_reloc, *dst2_reloc;
-	u32 header, cmd, count, tiled, new_cmd, misc;
+	u32 header, cmd, count, sub_cmd;
 	volatile u32 *ib = p->ib.ptr;
-	u32 idx, idx_value;
+	u32 idx;
 	u64 src_offset, dst_offset, dst2_offset;
 	int r;
 
@@ -2897,9 +2887,7 @@  int evergreen_dma_cs_parse(struct radeon_cs_parser *p)
 		header = radeon_get_ib_value(p, idx);
 		cmd = GET_DMA_CMD(header);
 		count = GET_DMA_COUNT(header);
-		tiled = GET_DMA_T(header);
-		new_cmd = GET_DMA_NEW(header);
-		misc = GET_DMA_MISC(header);
+		sub_cmd = GET_DMA_SUB_CMD(header);
 
 		switch (cmd) {
 		case DMA_PACKET_WRITE:
@@ -2908,19 +2896,27 @@  int evergreen_dma_cs_parse(struct radeon_cs_parser *p)
 				DRM_ERROR("bad DMA_PACKET_WRITE\n");
 				return -EINVAL;
 			}
-			if (tiled) {
+			switch (sub_cmd) {
+			/* tiled */
+			case 8:
 				dst_offset = ib[idx+1];
 				dst_offset <<= 8;
 
 				ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
 				p->idx += count + 7;
-			} else {
+				break;
+			/* linear */
+			case 0:
 				dst_offset = ib[idx+1];
 				dst_offset |= ((u64)(ib[idx+2] & 0xff)) << 32;
 
 				ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc);
 				ib[idx+2] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
 				p->idx += count + 3;
+				break;
+			default:
+				DRM_ERROR("bad DMA_PACKET_WRITE [%6d] 0x%08x sub cmd is not 0 or 8\n", idx, ib[idx+0]);
+				return -EINVAL;
 			}
 			if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
 				dev_warn(p->dev, "DMA write buffer too small (%llu %lu)\n",
@@ -2939,338 +2935,330 @@  int evergreen_dma_cs_parse(struct radeon_cs_parser *p)
 				DRM_ERROR("bad DMA_PACKET_COPY\n");
 				return -EINVAL;
 			}
-			if (tiled) {
-				idx_value = radeon_get_ib_value(p, idx + 2);
-				if (new_cmd) {
-					switch (misc) {
-					case 0:
-						/* L2T, frame to fields */
-						if (idx_value & (1 << 31)) {
-							DRM_ERROR("bad L2T, frame to fields DMA_PACKET_COPY\n");
-							return -EINVAL;
-						}
-						r = r600_dma_cs_next_reloc(p, &dst2_reloc);
-						if (r) {
-							DRM_ERROR("bad L2T, frame to fields DMA_PACKET_COPY\n");
-							return -EINVAL;
-						}
-						dst_offset = ib[idx+1];
-						dst_offset <<= 8;
-						dst2_offset = ib[idx+2];
-						dst2_offset <<= 8;
-						src_offset = ib[idx+8];
-						src_offset |= ((u64)(ib[idx+9] & 0xff)) << 32;
-						if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) {
-							dev_warn(p->dev, "DMA L2T, frame to fields src buffer too small (%llu %lu)\n",
-								 src_offset + (count * 4), radeon_bo_size(src_reloc->robj));
-							return -EINVAL;
-						}
-						if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
-							dev_warn(p->dev, "DMA L2T, frame to fields buffer too small (%llu %lu)\n",
-								 dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj));
-							return -EINVAL;
-						}
-						if ((dst2_offset + (count * 4)) > radeon_bo_size(dst2_reloc->robj)) {
-							dev_warn(p->dev, "DMA L2T, frame to fields buffer too small (%llu %lu)\n",
-								 dst2_offset + (count * 4), radeon_bo_size(dst2_reloc->robj));
-							return -EINVAL;
-						}
-						ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
-						ib[idx+2] += (u32)(dst2_reloc->lobj.gpu_offset >> 8);
-						ib[idx+8] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
-						ib[idx+9] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
-						p->idx += 10;
-						break;
-					case 1:
-						/* L2T, T2L partial */
-						if (p->family < CHIP_CAYMAN) {
-							DRM_ERROR("L2T, T2L Partial is cayman only !\n");
-							return -EINVAL;
-						}
-						/* detile bit */
-						if (idx_value & (1 << 31)) {
-							/* tiled src, linear dst */
-							ib[idx+1] += (u32)(src_reloc->lobj.gpu_offset >> 8);
-
-							ib[idx+7] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc);
-							ib[idx+8] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
-						} else {
-							/* linear src, tiled dst */
-							ib[idx+7] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
-							ib[idx+8] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
-
-							ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
-						}
-						p->idx += 12;
-						break;
-					case 3:
-						/* L2T, broadcast */
-						if (idx_value & (1 << 31)) {
-							DRM_ERROR("bad L2T, broadcast DMA_PACKET_COPY\n");
-							return -EINVAL;
-						}
-						r = r600_dma_cs_next_reloc(p, &dst2_reloc);
-						if (r) {
-							DRM_ERROR("bad L2T, broadcast DMA_PACKET_COPY\n");
-							return -EINVAL;
-						}
-						dst_offset = ib[idx+1];
-						dst_offset <<= 8;
-						dst2_offset = ib[idx+2];
-						dst2_offset <<= 8;
-						src_offset = ib[idx+8];
-						src_offset |= ((u64)(ib[idx+9] & 0xff)) << 32;
-						if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) {
-							dev_warn(p->dev, "DMA L2T, broadcast src buffer too small (%llu %lu)\n",
-								 src_offset + (count * 4), radeon_bo_size(src_reloc->robj));
-							return -EINVAL;
-						}
-						if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
-							dev_warn(p->dev, "DMA L2T, broadcast dst buffer too small (%llu %lu)\n",
-								 dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj));
-							return -EINVAL;
-						}
-						if ((dst2_offset + (count * 4)) > radeon_bo_size(dst2_reloc->robj)) {
-							dev_warn(p->dev, "DMA L2T, broadcast dst2 buffer too small (%llu %lu)\n",
-								 dst2_offset + (count * 4), radeon_bo_size(dst2_reloc->robj));
-							return -EINVAL;
-						}
-						ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
-						ib[idx+2] += (u32)(dst2_reloc->lobj.gpu_offset >> 8);
-						ib[idx+8] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
-						ib[idx+9] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
-						p->idx += 10;
-						break;
-					case 4:
-						/* L2T, T2L */
-						/* detile bit */
-						if (idx_value & (1 << 31)) {
-							/* tiled src, linear dst */
-							src_offset = ib[idx+1];
-							src_offset <<= 8;
-							ib[idx+1] += (u32)(src_reloc->lobj.gpu_offset >> 8);
-
-							dst_offset = ib[idx+7];
-							dst_offset |= ((u64)(ib[idx+8] & 0xff)) << 32;
-							ib[idx+7] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc);
-							ib[idx+8] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
-						} else {
-							/* linear src, tiled dst */
-							src_offset = ib[idx+7];
-							src_offset |= ((u64)(ib[idx+8] & 0xff)) << 32;
-							ib[idx+7] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
-							ib[idx+8] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
-
-							dst_offset = ib[idx+1];
-							dst_offset <<= 8;
-							ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
-						}
-						if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) {
-							dev_warn(p->dev, "DMA L2T, T2L src buffer too small (%llu %lu)\n",
-								 src_offset + (count * 4), radeon_bo_size(src_reloc->robj));
-							return -EINVAL;
-						}
-						if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
-							dev_warn(p->dev, "DMA L2T, T2L dst buffer too small (%llu %lu)\n",
-								 dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj));
-							return -EINVAL;
-						}
-						p->idx += 9;
-						break;
-					case 5:
-						/* T2T partial */
-						if (p->family < CHIP_CAYMAN) {
-							DRM_ERROR("L2T, T2L Partial is cayman only !\n");
-							return -EINVAL;
-						}
-						ib[idx+1] += (u32)(src_reloc->lobj.gpu_offset >> 8);
-						ib[idx+4] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
-						p->idx += 13;
-						break;
-					case 7:
-						/* L2T, broadcast */
-						if (idx_value & (1 << 31)) {
-							DRM_ERROR("bad L2T, broadcast DMA_PACKET_COPY\n");
-							return -EINVAL;
-						}
-						r = r600_dma_cs_next_reloc(p, &dst2_reloc);
-						if (r) {
-							DRM_ERROR("bad L2T, broadcast DMA_PACKET_COPY\n");
-							return -EINVAL;
-						}
-						dst_offset = ib[idx+1];
-						dst_offset <<= 8;
-						dst2_offset = ib[idx+2];
-						dst2_offset <<= 8;
-						src_offset = ib[idx+8];
-						src_offset |= ((u64)(ib[idx+9] & 0xff)) << 32;
-						if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) {
-							dev_warn(p->dev, "DMA L2T, broadcast src buffer too small (%llu %lu)\n",
-								 src_offset + (count * 4), radeon_bo_size(src_reloc->robj));
-							return -EINVAL;
-						}
-						if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
-							dev_warn(p->dev, "DMA L2T, broadcast dst buffer too small (%llu %lu)\n",
-								 dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj));
-							return -EINVAL;
-						}
-						if ((dst2_offset + (count * 4)) > radeon_bo_size(dst2_reloc->robj)) {
-							dev_warn(p->dev, "DMA L2T, broadcast dst2 buffer too small (%llu %lu)\n",
-								 dst2_offset + (count * 4), radeon_bo_size(dst2_reloc->robj));
-							return -EINVAL;
-						}
-						ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
-						ib[idx+2] += (u32)(dst2_reloc->lobj.gpu_offset >> 8);
-						ib[idx+8] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
-						ib[idx+9] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
-						p->idx += 10;
-						break;
-					default:
-						DRM_ERROR("bad DMA_PACKET_COPY misc %u\n", misc);
-						return -EINVAL;
-					}
+			switch (sub_cmd) {
+			/* Copy L2L, DW aligned */
+			case 0x00:
+				/* L2L, dw */
+				src_offset = ib[idx+2];
+				src_offset |= ((u64)(ib[idx+4] & 0xff)) << 32;
+				dst_offset = ib[idx+1];
+				dst_offset |= ((u64)(ib[idx+3] & 0xff)) << 32;
+				if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) {
+					dev_warn(p->dev, "DMA L2L, dw src buffer too small (%llu %lu)\n",
+							src_offset + (count * 4), radeon_bo_size(src_reloc->robj));
+					return -EINVAL;
+				}
+				if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
+					dev_warn(p->dev, "DMA L2L, dw dst buffer too small (%llu %lu)\n",
+							dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj));
+					return -EINVAL;
+				}
+				ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc);
+				ib[idx+2] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
+				ib[idx+3] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
+				ib[idx+4] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
+				p->idx += 5;
+				break;
+			/* Copy L2T/T2L */
+			case 0x08:
+				/* detile bit */
+				if (ib[idx + 2] & (1 << 31)) {
+					/* tiled src, linear dst */
+					src_offset = ib[idx+1];
+					src_offset <<= 8;
+					ib[idx+1] += (u32)(src_reloc->lobj.gpu_offset >> 8);
+
+					dst_offset = radeon_get_ib_value(p, idx + 7);
+					dst_offset |= ((u64)(ib[idx+8] & 0xff)) << 32;
+					ib[idx+7] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc);
+					ib[idx+8] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
 				} else {
-					switch (misc) {
-					case 0:
-						/* detile bit */
-						if (idx_value & (1 << 31)) {
-							/* tiled src, linear dst */
-							src_offset = ib[idx+1];
-							src_offset <<= 8;
-							ib[idx+1] += (u32)(src_reloc->lobj.gpu_offset >> 8);
-
-							dst_offset = ib[idx+7];
-							dst_offset |= ((u64)(ib[idx+8] & 0xff)) << 32;
-							ib[idx+7] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc);
-							ib[idx+8] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
-						} else {
-							/* linear src, tiled dst */
-							src_offset = ib[idx+7];
-							src_offset |= ((u64)(ib[idx+8] & 0xff)) << 32;
-							ib[idx+7] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
-							ib[idx+8] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
-
-							dst_offset = ib[idx+1];
-							dst_offset <<= 8;
-							ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
-						}
-						if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) {
-							dev_warn(p->dev, "DMA L2T, broadcast src buffer too small (%llu %lu)\n",
-								 src_offset + (count * 4), radeon_bo_size(src_reloc->robj));
-							return -EINVAL;
-						}
-						if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
-							dev_warn(p->dev, "DMA L2T, broadcast dst buffer too small (%llu %lu)\n",
-								 dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj));
-							return -EINVAL;
-						}
-						p->idx += 9;
-						break;
-					default:
-						DRM_ERROR("bad DMA_PACKET_COPY misc %u\n", misc);
-						return -EINVAL;
-					}
+					/* linear src, tiled dst */
+					src_offset = ib[idx+7];
+					src_offset |= ((u64)(ib[idx+8] & 0xff)) << 32;
+					ib[idx+7] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
+					ib[idx+8] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
+
+					dst_offset = ib[idx+1];
+					dst_offset <<= 8;
+					ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
 				}
-			} else {
-				if (new_cmd) {
-					switch (misc) {
-					case 0:
-						/* L2L, byte */
-						src_offset = ib[idx+2];
-						src_offset |= ((u64)(ib[idx+4] & 0xff)) << 32;
-						dst_offset = ib[idx+1];
-						dst_offset |= ((u64)(ib[idx+3] & 0xff)) << 32;
-						if ((src_offset + count) > radeon_bo_size(src_reloc->robj)) {
-							dev_warn(p->dev, "DMA L2L, byte src buffer too small (%llu %lu)\n",
-								 src_offset + count, radeon_bo_size(src_reloc->robj));
-							return -EINVAL;
-						}
-						if ((dst_offset + count) > radeon_bo_size(dst_reloc->robj)) {
-							dev_warn(p->dev, "DMA L2L, byte dst buffer too small (%llu %lu)\n",
-								 dst_offset + count, radeon_bo_size(dst_reloc->robj));
-							return -EINVAL;
-						}
-						ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset & 0xffffffff);
-						ib[idx+2] += (u32)(src_reloc->lobj.gpu_offset & 0xffffffff);
-						ib[idx+3] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
-						ib[idx+4] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
-						p->idx += 5;
-						break;
-					case 1:
-						/* L2L, partial */
-						if (p->family < CHIP_CAYMAN) {
-							DRM_ERROR("L2L Partial is cayman only !\n");
-							return -EINVAL;
-						}
-						ib[idx+1] += (u32)(src_reloc->lobj.gpu_offset & 0xffffffff);
-						ib[idx+2] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
-						ib[idx+4] += (u32)(dst_reloc->lobj.gpu_offset & 0xffffffff);
-						ib[idx+5] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
-
-						p->idx += 9;
-						break;
-					case 4:
-						/* L2L, dw, broadcast */
-						r = r600_dma_cs_next_reloc(p, &dst2_reloc);
-						if (r) {
-							DRM_ERROR("bad L2L, dw, broadcast DMA_PACKET_COPY\n");
-							return -EINVAL;
-						}
-						dst_offset = ib[idx+1];
-						dst_offset |= ((u64)(ib[idx+4] & 0xff)) << 32;
-						dst2_offset = ib[idx+2];
-						dst2_offset |= ((u64)(ib[idx+5] & 0xff)) << 32;
-						src_offset = ib[idx+3];
-						src_offset |= ((u64)(ib[idx+6] & 0xff)) << 32;
-						if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) {
-							dev_warn(p->dev, "DMA L2L, dw, broadcast src buffer too small (%llu %lu)\n",
-								 src_offset + (count * 4), radeon_bo_size(src_reloc->robj));
-							return -EINVAL;
-						}
-						if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
-							dev_warn(p->dev, "DMA L2L, dw, broadcast dst buffer too small (%llu %lu)\n",
-								 dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj));
-							return -EINVAL;
-						}
-						if ((dst2_offset + (count * 4)) > radeon_bo_size(dst2_reloc->robj)) {
-							dev_warn(p->dev, "DMA L2L, dw, broadcast dst2 buffer too small (%llu %lu)\n",
-								 dst2_offset + (count * 4), radeon_bo_size(dst2_reloc->robj));
-							return -EINVAL;
-						}
-						ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc);
-						ib[idx+2] += (u32)(dst2_reloc->lobj.gpu_offset & 0xfffffffc);
-						ib[idx+3] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
-						ib[idx+4] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
-						ib[idx+5] += upper_32_bits(dst2_reloc->lobj.gpu_offset) & 0xff;
-						ib[idx+6] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
-						p->idx += 7;
-						break;
-					default:
-						DRM_ERROR("bad DMA_PACKET_COPY misc %u\n", misc);
-						return -EINVAL;
-					}
+				if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) {
+					dev_warn(p->dev, "DMA L2T, src buffer too small (%llu %lu)\n",
+							src_offset + (count * 4), radeon_bo_size(src_reloc->robj));
+					return -EINVAL;
+				}
+				if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
+					dev_warn(p->dev, "DMA L2T, dst buffer too small (%llu %lu)\n",
+							dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj));
+					return -EINVAL;
+				}
+				p->idx += 9;
+				break;
+			/* Copy L2L, byte aligned */
+			case 0x40:
+				/* L2L, byte */
+				src_offset = ib[idx+2];
+				src_offset |= ((u64)(ib[idx+4] & 0xff)) << 32;
+				dst_offset = ib[idx+1];
+				dst_offset |= ((u64)(ib[idx+3] & 0xff)) << 32;
+				if ((src_offset + count) > radeon_bo_size(src_reloc->robj)) {
+					dev_warn(p->dev, "DMA L2L, byte src buffer too small (%llu %lu)\n",
+							src_offset + count, radeon_bo_size(src_reloc->robj));
+					return -EINVAL;
+				}
+				if ((dst_offset + count) > radeon_bo_size(dst_reloc->robj)) {
+					dev_warn(p->dev, "DMA L2L, byte dst buffer too small (%llu %lu)\n",
+							dst_offset + count, radeon_bo_size(dst_reloc->robj));
+					return -EINVAL;
+				}
+				ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset & 0xffffffff);
+				ib[idx+2] += (u32)(src_reloc->lobj.gpu_offset & 0xffffffff);
+				ib[idx+3] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
+				ib[idx+4] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
+				p->idx += 5;
+				break;
+			/* Copy L2L, partial */
+			case 0x41:
+				/* L2L, partial */
+				if (p->family < CHIP_CAYMAN) {
+					DRM_ERROR("L2L Partial is cayman only !\n");
+					return -EINVAL;
+				}
+				ib[idx+1] += (u32)(src_reloc->lobj.gpu_offset & 0xffffffff);
+				ib[idx+2] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
+				ib[idx+4] += (u32)(dst_reloc->lobj.gpu_offset & 0xffffffff);
+				ib[idx+5] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
+
+				p->idx += 9;
+				break;
+			/* Copy L2L, DW aligned, broadcast */
+			case 0x44:
+				/* L2L, dw, broadcast */
+				r = r600_dma_cs_next_reloc(p, &dst2_reloc);
+				if (r) {
+					DRM_ERROR("bad L2L, dw, broadcast DMA_PACKET_COPY\n");
+					return -EINVAL;
+				}
+				dst_offset = ib[idx+1];
+				dst_offset |= ((u64)(ib[idx+4] & 0xff)) << 32;
+				dst2_offset = ib[idx+2];
+				dst2_offset |= ((u64)(ib[idx+5] & 0xff)) << 32;
+				src_offset = ib[idx+3];
+				src_offset |= ((u64)(ib[idx+6] & 0xff)) << 32;
+				if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) {
+					dev_warn(p->dev, "DMA L2L, dw, broadcast src buffer too small (%llu %lu)\n",
+							src_offset + (count * 4), radeon_bo_size(src_reloc->robj));
+					return -EINVAL;
+				}
+				if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
+					dev_warn(p->dev, "DMA L2L, dw, broadcast dst buffer too small (%llu %lu)\n",
+							dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj));
+					return -EINVAL;
+				}
+				if ((dst2_offset + (count * 4)) > radeon_bo_size(dst2_reloc->robj)) {
+					dev_warn(p->dev, "DMA L2L, dw, broadcast dst2 buffer too small (%llu %lu)\n",
+							dst2_offset + (count * 4), radeon_bo_size(dst2_reloc->robj));
+					return -EINVAL;
+				}
+				ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc);
+				ib[idx+2] += (u32)(dst2_reloc->lobj.gpu_offset & 0xfffffffc);
+				ib[idx+3] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
+				ib[idx+4] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
+				ib[idx+5] += upper_32_bits(dst2_reloc->lobj.gpu_offset) & 0xff;
+				ib[idx+6] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
+				p->idx += 7;
+				break;
+			/* Copy L2T Frame to Field */
+			case 0x48:
+				if (ib[idx + 2] & (1 << 31)) {
+					DRM_ERROR("bad L2T, frame to fields DMA_PACKET_COPY\n");
+					return -EINVAL;
+				}
+				r = r600_dma_cs_next_reloc(p, &dst2_reloc);
+				if (r) {
+					DRM_ERROR("bad L2T, frame to fields DMA_PACKET_COPY\n");
+					return -EINVAL;
+				}
+				dst_offset = ib[idx+1];
+				dst_offset <<= 8;
+				dst2_offset = ib[idx+2];
+				dst2_offset <<= 8;
+				src_offset = ib[idx+8];
+				src_offset |= ((u64)(ib[idx+9] & 0xff)) << 32;
+				if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) {
+					dev_warn(p->dev, "DMA L2T, frame to fields src buffer too small (%llu %lu)\n",
+							src_offset + (count * 4), radeon_bo_size(src_reloc->robj));
+					return -EINVAL;
+				}
+				if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
+					dev_warn(p->dev, "DMA L2T, frame to fields buffer too small (%llu %lu)\n",
+							dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj));
+					return -EINVAL;
+				}
+				if ((dst2_offset + (count * 4)) > radeon_bo_size(dst2_reloc->robj)) {
+					dev_warn(p->dev, "DMA L2T, frame to fields buffer too small (%llu %lu)\n",
+							dst2_offset + (count * 4), radeon_bo_size(dst2_reloc->robj));
+					return -EINVAL;
+				}
+				ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
+				ib[idx+2] += (u32)(dst2_reloc->lobj.gpu_offset >> 8);
+				ib[idx+8] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
+				ib[idx+9] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
+				p->idx += 10;
+				break;
+			/* Copy L2T/T2L, partial */
+			case 0x49:
+				/* L2T, T2L partial */
+				if (p->family < CHIP_CAYMAN) {
+					DRM_ERROR("L2T, T2L Partial is cayman only !\n");
+					return -EINVAL;
+				}
+				/* detile bit */
+				if (ib[idx + 2 ] & (1 << 31)) {
+					/* tiled src, linear dst */
+					ib[idx+1] += (u32)(src_reloc->lobj.gpu_offset >> 8);
+
+					ib[idx+7] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc);
+					ib[idx+8] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
+				} else {
+					/* linear src, tiled dst */
+					ib[idx+7] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
+					ib[idx+8] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
+
+					ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
+				}
+				p->idx += 12;
+				break;
+			/* Copy L2T broadcast */
+			case 0x4b:
+				/* L2T, broadcast */
+				if (ib[idx + 2] & (1 << 31)) {
+					DRM_ERROR("bad L2T, broadcast DMA_PACKET_COPY\n");
+					return -EINVAL;
+				}
+				r = r600_dma_cs_next_reloc(p, &dst2_reloc);
+				if (r) {
+					DRM_ERROR("bad L2T, broadcast DMA_PACKET_COPY\n");
+					return -EINVAL;
+				}
+				dst_offset = ib[idx+1];
+				dst_offset <<= 8;
+				dst2_offset = ib[idx+2];
+				dst2_offset <<= 8;
+				src_offset = ib[idx+8];
+				src_offset |= ((u64)(ib[idx+9] & 0xff)) << 32;
+				if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) {
+					dev_warn(p->dev, "DMA L2T, broadcast src buffer too small (%llu %lu)\n",
+							src_offset + (count * 4), radeon_bo_size(src_reloc->robj));
+					return -EINVAL;
+				}
+				if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
+					dev_warn(p->dev, "DMA L2T, broadcast dst buffer too small (%llu %lu)\n",
+							dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj));
+					return -EINVAL;
+				}
+				if ((dst2_offset + (count * 4)) > radeon_bo_size(dst2_reloc->robj)) {
+					dev_warn(p->dev, "DMA L2T, broadcast dst2 buffer too small (%llu %lu)\n",
+							dst2_offset + (count * 4), radeon_bo_size(dst2_reloc->robj));
+					return -EINVAL;
+				}
+				ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
+				ib[idx+2] += (u32)(dst2_reloc->lobj.gpu_offset >> 8);
+				ib[idx+8] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
+				ib[idx+9] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
+				p->idx += 10;
+				break;
+			/* Copy L2T/T2L (tile units) */
+			case 0x4c:
+				/* L2T, T2L */
+				/* detile bit */
+				if (ib[idx + 2] & (1 << 31)) {
+					/* tiled src, linear dst */
+					src_offset = ib[idx+1];
+					src_offset <<= 8;
+					ib[idx+1] += (u32)(src_reloc->lobj.gpu_offset >> 8);
+
+					dst_offset = ib[idx+7];
+					dst_offset |= ((u64)(ib[idx+8] & 0xff)) << 32;
+					ib[idx+7] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc);
+					ib[idx+8] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
 				} else {
-					/* L2L, dw */
-					src_offset = ib[idx+2];
-					src_offset |= ((u64)(ib[idx+4] & 0xff)) << 32;
+					/* linear src, tiled dst */
+					src_offset = ib[idx+7];
+					src_offset |= ((u64)(ib[idx+8] & 0xff)) << 32;
+					ib[idx+7] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
+					ib[idx+8] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
+
 					dst_offset = ib[idx+1];
-					dst_offset |= ((u64)(ib[idx+3] & 0xff)) << 32;
-					if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) {
-						dev_warn(p->dev, "DMA L2L, dw src buffer too small (%llu %lu)\n",
-							 src_offset + (count * 4), radeon_bo_size(src_reloc->robj));
-						return -EINVAL;
-					}
-					if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
-						dev_warn(p->dev, "DMA L2L, dw dst buffer too small (%llu %lu)\n",
-							 dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj));
-						return -EINVAL;
-					}
-					ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc);
-					ib[idx+2] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
-					ib[idx+3] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
-					ib[idx+4] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
-					p->idx += 5;
+					dst_offset <<= 8;
+					ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
 				}
+				if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) {
+					dev_warn(p->dev, "DMA L2T, T2L src buffer too small (%llu %lu)\n",
+							src_offset + (count * 4), radeon_bo_size(src_reloc->robj));
+					return -EINVAL;
+				}
+				if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
+					dev_warn(p->dev, "DMA L2T, T2L dst buffer too small (%llu %lu)\n",
+							dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj));
+					return -EINVAL;
+				}
+				p->idx += 9;
+				break;
+			/* Copy T2T, partial (tile units) */
+			case 0x4d:
+				/* T2T partial */
+				if (p->family < CHIP_CAYMAN) {
+					DRM_ERROR("L2T, T2L Partial is cayman only !\n");
+					return -EINVAL;
+				}
+				ib[idx+1] += (u32)(src_reloc->lobj.gpu_offset >> 8);
+				ib[idx+4] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
+				p->idx += 13;
+				break;
+			/* Copy L2T broadcast (tile units) */
+			case 0x4f:
+				/* L2T, broadcast */
+				if (ib[idx + 2] & (1 << 31)) {
+					DRM_ERROR("bad L2T, broadcast DMA_PACKET_COPY\n");
+					return -EINVAL;
+				}
+				r = r600_dma_cs_next_reloc(p, &dst2_reloc);
+				if (r) {
+					DRM_ERROR("bad L2T, broadcast DMA_PACKET_COPY\n");
+					return -EINVAL;
+				}
+				dst_offset = ib[idx+1];
+				dst_offset <<= 8;
+				dst2_offset = ib[idx+2];
+				dst2_offset <<= 8;
+				src_offset = ib[idx+8];
+				src_offset |= ((u64)(ib[idx+9] & 0xff)) << 32;
+				if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) {
+					dev_warn(p->dev, "DMA L2T, broadcast src buffer too small (%llu %lu)\n",
+							src_offset + (count * 4), radeon_bo_size(src_reloc->robj));
+					return -EINVAL;
+				}
+				if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
+					dev_warn(p->dev, "DMA L2T, broadcast dst buffer too small (%llu %lu)\n",
+							dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj));
+					return -EINVAL;
+				}
+				if ((dst2_offset + (count * 4)) > radeon_bo_size(dst2_reloc->robj)) {
+					dev_warn(p->dev, "DMA L2T, broadcast dst2 buffer too small (%llu %lu)\n",
+							dst2_offset + (count * 4), radeon_bo_size(dst2_reloc->robj));
+					return -EINVAL;
+				}
+				ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
+				ib[idx+2] += (u32)(dst2_reloc->lobj.gpu_offset >> 8);
+				ib[idx+8] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
+				ib[idx+9] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
+				p->idx += 10;
+				break;
+			default:
+				DRM_ERROR("bad DMA_PACKET_COPY [%6d] 0x%08x invalid sub cmd\n", idx, ib[idx+0]);
+				return -EINVAL;
 			}
 			break;
 		case DMA_PACKET_CONSTANT_FILL:
@@ -3623,88 +3611,79 @@  int evergreen_ib_parse(struct radeon_device *rdev, struct radeon_ib *ib)
 int evergreen_dma_ib_parse(struct radeon_device *rdev, struct radeon_ib *ib)
 {
 	u32 idx = 0;
-	u32 header, cmd, count, tiled, new_cmd, misc;
+	u32 header, cmd, count, sub_cmd;
 
 	do {
 		header = ib->ptr[idx];
 		cmd = GET_DMA_CMD(header);
 		count = GET_DMA_COUNT(header);
-		tiled = GET_DMA_T(header);
-		new_cmd = GET_DMA_NEW(header);
-		misc = GET_DMA_MISC(header);
+		sub_cmd = GET_DMA_SUB_CMD(header);
 
 		switch (cmd) {
 		case DMA_PACKET_WRITE:
-			if (tiled)
+			switch (sub_cmd) {
+			/* tiled */
+			case 8:
 				idx += count + 7;
-			else
+				break;
+			/* linear */
+			case 0:
 				idx += count + 3;
+				break;
+			default:
+				DRM_ERROR("bad DMA_PACKET_WRITE [%6d] 0x%08x sub cmd is not 0 or 8\n", idx, ib->ptr[idx]);
+				return -EINVAL;
+			}
 			break;
 		case DMA_PACKET_COPY:
-			if (tiled) {
-				if (new_cmd) {
-					switch (misc) {
-					case 0:
-						/* L2T, frame to fields */
-						idx += 10;
-						break;
-					case 1:
-						/* L2T, T2L partial */
-						idx += 12;
-						break;
-					case 3:
-						/* L2T, broadcast */
-						idx += 10;
-						break;
-					case 4:
-						/* L2T, T2L */
-						idx += 9;
-						break;
-					case 5:
-						/* T2T partial */
-						idx += 13;
-						break;
-					case 7:
-						/* L2T, broadcast */
-						idx += 10;
-						break;
-					default:
-						DRM_ERROR("bad DMA_PACKET_COPY misc %u\n", misc);
-						return -EINVAL;
-					}
-				} else {
-					switch (misc) {
-					case 0:
-						idx += 9;
-						break;
-					default:
-						DRM_ERROR("bad DMA_PACKET_COPY misc %u\n", misc);
-						return -EINVAL;
-					}
-				}
-			} else {
-				if (new_cmd) {
-					switch (misc) {
-					case 0:
-						/* L2L, byte */
-						idx += 5;
-						break;
-					case 1:
-						/* L2L, partial */
-						idx += 9;
-						break;
-					case 4:
-						/* L2L, dw, broadcast */
-						idx += 7;
-						break;
-					default:
-						DRM_ERROR("bad DMA_PACKET_COPY misc %u\n", misc);
-						return -EINVAL;
-					}
-				} else {
-					/* L2L, dw */
-					idx += 5;
-				}
+			switch (sub_cmd) {
+			/* Copy L2L, DW aligned */
+			case 0x00:
+				idx += 5;
+				break;
+			/* Copy L2T/T2L */
+			case 0x08:
+				idx += 9;
+				break;
+			/* Copy L2L, byte aligned */
+			case 0x40:
+				idx += 5;
+				break;
+			/* Copy L2L, partial */
+			case 0x41:
+				idx += 9;
+				break;
+			/* Copy L2L, DW aligned, broadcast */
+			case 0x44:
+				idx += 7;
+				break;
+			/* Copy L2T Frame to Field */
+			case 0x48:
+				idx += 10;
+				break;
+			/* Copy L2T/T2L, partial */
+			case 0x49:
+				idx += 12;
+				break;
+			/* Copy L2T broadcast */
+			case 0x4b:
+				idx += 10;
+				break;
+			/* Copy L2T/T2L (tile units) */
+			case 0x4c:
+				idx += 9;
+				break;
+			/* Copy T2T, partial (tile units) */
+			case 0x4d:
+				idx += 13;
+				break;
+			/* Copy L2T broadcast (tile units) */
+			case 0x4f:
+				idx += 10;
+				break;
+			default:
+				DRM_ERROR("bad DMA_PACKET_COPY [%6d] 0x%08x invalid sub cmd\n", idx, ib->ptr[idx]);
+				return -EINVAL;
 			}
 			break;
 		case DMA_PACKET_CONSTANT_FILL:
diff --git a/drivers/gpu/drm/radeon/evergreend.h b/drivers/gpu/drm/radeon/evergreend.h
index 5786a32..abb1571 100644
--- a/drivers/gpu/drm/radeon/evergreend.h
+++ b/drivers/gpu/drm/radeon/evergreend.h
@@ -924,20 +924,23 @@ 
 #define CAYMAN_DMA1_CNTL                                  0xd82c
 
 /* async DMA packets */
-#define DMA_PACKET(cmd, t, s, n)	((((cmd) & 0xF) << 28) |	\
-					 (((t) & 0x1) << 23) |		\
-					 (((s) & 0x1) << 22) |		\
-					 (((n) & 0xFFFFF) << 0))
+#define DMA_PACKET(cmd, sub_cmd, n) ((((cmd) & 0xF) << 28) |    \
+                                    (((sub_cmd) & 0xFF) << 20) |\
+                                    (((n) & 0xFFFFF) << 0)) /* header layout: cmd in bits 31:28, sub_cmd in bits 27:20, n in bits 19:0 */
+#define GET_DMA_CMD(h) (((h) & 0xf0000000) >> 28) /* extract bits 31:28 of header h (the cmd field of DMA_PACKET) */
+#define GET_DMA_COUNT(h) ((h) & 0x000fffff) /* extract bits 19:0 of header h (the n field of DMA_PACKET) */
+#define GET_DMA_SUB_CMD(h) (((h) & 0x0ff00000) >> 20) /* extract bits 27:20 of header h (the sub_cmd field of DMA_PACKET) */
+
 /* async DMA Packet types */
-#define	DMA_PACKET_WRITE				  0x2
-#define	DMA_PACKET_COPY					  0x3
-#define	DMA_PACKET_INDIRECT_BUFFER			  0x4
-#define	DMA_PACKET_SEMAPHORE				  0x5
-#define	DMA_PACKET_FENCE				  0x6
-#define	DMA_PACKET_TRAP					  0x7
-#define	DMA_PACKET_SRBM_WRITE				  0x9
-#define	DMA_PACKET_CONSTANT_FILL			  0xd
-#define	DMA_PACKET_NOP					  0xf
+#define	DMA_PACKET_WRITE                        0x2
+#define	DMA_PACKET_COPY                         0x3
+#define	DMA_PACKET_INDIRECT_BUFFER              0x4
+#define	DMA_PACKET_SEMAPHORE                    0x5
+#define	DMA_PACKET_FENCE                        0x6
+#define	DMA_PACKET_TRAP                         0x7
+#define	DMA_PACKET_SRBM_WRITE                   0x9
+#define	DMA_PACKET_CONSTANT_FILL                0xd
+#define	DMA_PACKET_NOP                          0xf
 
 /* PCIE link stuff */
 #define PCIE_LC_TRAINING_CNTL                             0xa1 /* PCIE_P */