diff mbox

[drm-next] drm/radeon: use IBs for VM page table updates

Message ID 1359668271-3697-1-git-send-email-alexdeucher@gmail.com (mailing list archive)
State New, archived
Headers show

Commit Message

Alex Deucher Jan. 31, 2013, 9:37 p.m. UTC
From: Alex Deucher <alexander.deucher@amd.com>

For very large page table updates, we can exceed the
size of the ring.  To avoid this, use an IB to perform
the page table update.

Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---

We may want to do something similar for the BO move code as we
could potentially run out of ring space on a small ring like the
DMA rings if we are moving very large buffers.

Alex

 drivers/gpu/drm/radeon/ni.c          |   30 +++++++++++--------
 drivers/gpu/drm/radeon/radeon.h      |    6 ++-
 drivers/gpu/drm/radeon/radeon_asic.h |    8 ++++-
 drivers/gpu/drm/radeon/radeon_gart.c |   45 +++++++++++++++++++++++------
 drivers/gpu/drm/radeon/si.c          |   52 ++++++++++++++++++---------------
 5 files changed, 90 insertions(+), 51 deletions(-)

Comments

Christian König Feb. 1, 2013, 10:25 a.m. UTC | #1
Am 31.01.2013 22:37, schrieb alexdeucher@gmail.com:
> From: Alex Deucher <alexander.deucher@amd.com>
>
> For very large page table updates, we can exceed the
> size of the ring.  To avoid this, use an IB to perform
> the page table update.
>
> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
> ---
>
> We may want to do something similar for the BO move code as we
> could potentially run out of ring space on a small ring like the
> DMA rings if we are moving very large buffers.
>
> Alex
>
>   drivers/gpu/drm/radeon/ni.c          |   30 +++++++++++--------
>   drivers/gpu/drm/radeon/radeon.h      |    6 ++-
>   drivers/gpu/drm/radeon/radeon_asic.h |    8 ++++-
>   drivers/gpu/drm/radeon/radeon_gart.c |   45 +++++++++++++++++++++++------
>   drivers/gpu/drm/radeon/si.c          |   52 ++++++++++++++++++---------------
>   5 files changed, 90 insertions(+), 51 deletions(-)
>
> diff --git a/drivers/gpu/drm/radeon/ni.c b/drivers/gpu/drm/radeon/ni.c
> index 170bd03..7cead76 100644
> --- a/drivers/gpu/drm/radeon/ni.c
> +++ b/drivers/gpu/drm/radeon/ni.c
> @@ -1946,19 +1946,21 @@ uint32_t cayman_vm_page_flags(struct radeon_device *rdev, uint32_t flags)
>    * cayman_vm_set_page - update the page tables using the CP
>    *
>    * @rdev: radeon_device pointer
> + * @ib: indirect buffer to fill with commands
>    * @pe: addr of the page entry
>    * @addr: dst addr to write into pe
>    * @count: number of page entries to update
>    * @incr: increase next addr by incr bytes
>    * @flags: access flags
>    *
> - * Update the page tables using the CP (cayman-si).
> + * Update the page tables using the CP (cayman/TN).
>    */
> -void cayman_vm_set_page(struct radeon_device *rdev, uint64_t pe,
> +void cayman_vm_set_page(struct radeon_device *rdev,
> +			struct radeon_ib *ib,
> +			uint64_t pe,
>   			uint64_t addr, unsigned count,
>   			uint32_t incr, uint32_t flags)
>   {
> -	struct radeon_ring *ring = &rdev->ring[rdev->asic->vm.pt_ring_index];
>   	uint32_t r600_flags = cayman_vm_page_flags(rdev, flags);
>   	uint64_t value;
>   	unsigned ndw;
> @@ -1969,9 +1971,9 @@ void cayman_vm_set_page(struct radeon_device *rdev, uint64_t pe,
>   			if (ndw > 0x3FFF)
>   				ndw = 0x3FFF;
>   
> -			radeon_ring_write(ring, PACKET3(PACKET3_ME_WRITE, ndw));
> -			radeon_ring_write(ring, pe);
> -			radeon_ring_write(ring, upper_32_bits(pe) & 0xff);
> +			ib->ptr[ib->length_dw++] = PACKET3(PACKET3_ME_WRITE, ndw);
> +			ib->ptr[ib->length_dw++] = pe;
> +			ib->ptr[ib->length_dw++] = upper_32_bits(pe) & 0xff;
>   			for (; ndw > 1; ndw -= 2, --count, pe += 8) {
>   				if (flags & RADEON_VM_PAGE_SYSTEM) {
>   					value = radeon_vm_map_gart(rdev, addr);
> @@ -1983,8 +1985,8 @@ void cayman_vm_set_page(struct radeon_device *rdev, uint64_t pe,
>   				}
>   				addr += incr;
>   				value |= r600_flags;
> -				radeon_ring_write(ring, value);
> -				radeon_ring_write(ring, upper_32_bits(value));
> +				ib->ptr[ib->length_dw++] = value;
> +				ib->ptr[ib->length_dw++] = upper_32_bits(value);
>   			}
>   		}
>   	} else {
> @@ -1994,9 +1996,9 @@ void cayman_vm_set_page(struct radeon_device *rdev, uint64_t pe,
>   				ndw = 0xFFFFE;
>   
>   			/* for non-physically contiguous pages (system) */
> -			radeon_ring_write(ring, DMA_PACKET(DMA_PACKET_WRITE, 0, 0, ndw));
> -			radeon_ring_write(ring, pe);
> -			radeon_ring_write(ring, upper_32_bits(pe) & 0xff);
> +			ib->ptr[ib->length_dw++] = DMA_PACKET(DMA_PACKET_WRITE, 0, 0, ndw);
> +			ib->ptr[ib->length_dw++] = pe;
> +			ib->ptr[ib->length_dw++] = upper_32_bits(pe) & 0xff;
>   			for (; ndw > 0; ndw -= 2, --count, pe += 8) {
>   				if (flags & RADEON_VM_PAGE_SYSTEM) {
>   					value = radeon_vm_map_gart(rdev, addr);
> @@ -2008,10 +2010,12 @@ void cayman_vm_set_page(struct radeon_device *rdev, uint64_t pe,
>   				}
>   				addr += incr;
>   				value |= r600_flags;
> -				radeon_ring_write(ring, value);
> -				radeon_ring_write(ring, upper_32_bits(value));
> +				ib->ptr[ib->length_dw++] = value;
> +				ib->ptr[ib->length_dw++] = upper_32_bits(value);
>   			}
>   		}
> +		while (ib->length_dw & 0x7)
> +			ib->ptr[ib->length_dw++] = DMA_PACKET(DMA_PACKET_NOP, 0, 0, 0);
>   	}
>   }
>   
> diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
> index 6539d6c..89b9645 100644
> --- a/drivers/gpu/drm/radeon/radeon.h
> +++ b/drivers/gpu/drm/radeon/radeon.h
> @@ -1188,7 +1188,9 @@ struct radeon_asic {
>   		void (*fini)(struct radeon_device *rdev);
>   
>   		u32 pt_ring_index;
> -		void (*set_page)(struct radeon_device *rdev, uint64_t pe,
> +		void (*set_page)(struct radeon_device *rdev,
> +				 struct radeon_ib *ib,
> +				 uint64_t pe,
>   				 uint64_t addr, unsigned count,
>   				 uint32_t incr, uint32_t flags);
>   	} vm;
> @@ -1810,7 +1812,7 @@ void radeon_ring_write(struct radeon_ring *ring, uint32_t v);
>   #define radeon_gart_set_page(rdev, i, p) (rdev)->asic->gart.set_page((rdev), (i), (p))
>   #define radeon_asic_vm_init(rdev) (rdev)->asic->vm.init((rdev))
>   #define radeon_asic_vm_fini(rdev) (rdev)->asic->vm.fini((rdev))
> -#define radeon_asic_vm_set_page(rdev, pe, addr, count, incr, flags) ((rdev)->asic->vm.set_page((rdev), (pe), (addr), (count), (incr), (flags)))
> +#define radeon_asic_vm_set_page(rdev, ib, pe, addr, count, incr, flags) ((rdev)->asic->vm.set_page((rdev), (ib), (pe), (addr), (count), (incr), (flags)))
>   #define radeon_ring_start(rdev, r, cp) (rdev)->asic->ring[(r)].ring_start((rdev), (cp))
>   #define radeon_ring_test(rdev, r, cp) (rdev)->asic->ring[(r)].ring_test((rdev), (cp))
>   #define radeon_ib_test(rdev, r, cp) (rdev)->asic->ring[(r)].ib_test((rdev), (cp))
> diff --git a/drivers/gpu/drm/radeon/radeon_asic.h b/drivers/gpu/drm/radeon/radeon_asic.h
> index e429e25..f4134a8 100644
> --- a/drivers/gpu/drm/radeon/radeon_asic.h
> +++ b/drivers/gpu/drm/radeon/radeon_asic.h
> @@ -474,7 +474,9 @@ int cayman_vm_init(struct radeon_device *rdev);
>   void cayman_vm_fini(struct radeon_device *rdev);
>   void cayman_vm_flush(struct radeon_device *rdev, int ridx, struct radeon_vm *vm);
>   uint32_t cayman_vm_page_flags(struct radeon_device *rdev, uint32_t flags);
> -void cayman_vm_set_page(struct radeon_device *rdev, uint64_t pe,
> +void cayman_vm_set_page(struct radeon_device *rdev,
> +			struct radeon_ib *ib,
> +			uint64_t pe,
>   			uint64_t addr, unsigned count,
>   			uint32_t incr, uint32_t flags);
>   int evergreen_ib_parse(struct radeon_device *rdev, struct radeon_ib *ib);
> @@ -506,7 +508,9 @@ int si_irq_set(struct radeon_device *rdev);
>   int si_irq_process(struct radeon_device *rdev);
>   int si_vm_init(struct radeon_device *rdev);
>   void si_vm_fini(struct radeon_device *rdev);
> -void si_vm_set_page(struct radeon_device *rdev, uint64_t pe,
> +void si_vm_set_page(struct radeon_device *rdev,
> +		    struct radeon_ib *ib,
> +		    uint64_t pe,
>   		    uint64_t addr, unsigned count,
>   		    uint32_t incr, uint32_t flags);
>   void si_vm_flush(struct radeon_device *rdev, int ridx, struct radeon_vm *vm);
> diff --git a/drivers/gpu/drm/radeon/radeon_gart.c b/drivers/gpu/drm/radeon/radeon_gart.c
> index 6e24f84..d95c61d 100644
> --- a/drivers/gpu/drm/radeon/radeon_gart.c
> +++ b/drivers/gpu/drm/radeon/radeon_gart.c
> @@ -929,6 +929,7 @@ uint64_t radeon_vm_map_gart(struct radeon_device *rdev, uint64_t addr)
>    */
>   static int radeon_vm_update_pdes(struct radeon_device *rdev,
>   				 struct radeon_vm *vm,
> +				 struct radeon_ib *ib,
>   				 uint64_t start, uint64_t end)
>   {
>   	static const uint32_t incr = RADEON_VM_PTE_COUNT * 8;
> @@ -971,7 +972,7 @@ retry:
>   		    ((last_pt + incr * count) != pt)) {
>   
>   			if (count) {
> -				radeon_asic_vm_set_page(rdev, last_pde,
> +				radeon_asic_vm_set_page(rdev, ib, last_pde,
>   							last_pt, count, incr,
>   							RADEON_VM_PAGE_VALID);
>   			}
> @@ -985,7 +986,7 @@ retry:
>   	}
>   
>   	if (count) {
> -		radeon_asic_vm_set_page(rdev, last_pde, last_pt, count,
> +		radeon_asic_vm_set_page(rdev, ib, last_pde, last_pt, count,
>   					incr, RADEON_VM_PAGE_VALID);
>   
>   	}
> @@ -1009,6 +1010,7 @@ retry:
>    */
>   static void radeon_vm_update_ptes(struct radeon_device *rdev,
>   				  struct radeon_vm *vm,
> +				  struct radeon_ib *ib,
>   				  uint64_t start, uint64_t end,
>   				  uint64_t dst, uint32_t flags)
>   {
> @@ -1038,7 +1040,7 @@ static void radeon_vm_update_ptes(struct radeon_device *rdev,
>   		if ((last_pte + 8 * count) != pte) {
>   
>   			if (count) {
> -				radeon_asic_vm_set_page(rdev, last_pte,
> +				radeon_asic_vm_set_page(rdev, ib, last_pte,
>   							last_dst, count,
>   							RADEON_GPU_PAGE_SIZE,
>   							flags);
> @@ -1056,7 +1058,8 @@ static void radeon_vm_update_ptes(struct radeon_device *rdev,
>   	}
>   
>   	if (count) {
> -		radeon_asic_vm_set_page(rdev, last_pte,	last_dst, count,
> +		radeon_asic_vm_set_page(rdev, ib, last_pte,
> +					last_dst, count,
>   					RADEON_GPU_PAGE_SIZE, flags);
>   	}
>   }
> @@ -1082,6 +1085,7 @@ int radeon_vm_bo_update_pte(struct radeon_device *rdev,
>   	unsigned ridx = rdev->asic->vm.pt_ring_index;
>   	struct radeon_ring *ring = &rdev->ring[ridx];
>   	struct radeon_semaphore *sem = NULL;
> +	struct radeon_ib ib;
>   	struct radeon_bo_va *bo_va;
>   	unsigned nptes, npdes, ndw;
>   	uint64_t addr;
> @@ -1140,9 +1144,8 @@ int radeon_vm_bo_update_pte(struct radeon_device *rdev,
>   	/* assume two extra pdes in case the mapping overlaps the borders */
>   	npdes = (nptes >> RADEON_VM_BLOCK_SIZE) + 2;
>   
> -	/* estimate number of dw needed */
> -	/* semaphore, fence and padding */
> -	ndw = 32;
> +	/* padding, etc. */
> +	ndw = 64;
>   
>   	if (RADEON_VM_BLOCK_SIZE > 11)
>   		/* reserve space for one header for every 2k dwords */
> @@ -1161,8 +1164,25 @@ int radeon_vm_bo_update_pte(struct radeon_device *rdev,
>   	/* reserve space for pde addresses */
>   	ndw += npdes * 2;
>   
> -	r = radeon_ring_lock(rdev, ring, ndw);
> +	/* update too big for an IB */
> +	if (ndw > 0xfffff)
> +		return -ENOMEM;
> +
> +	r = radeon_sa_bo_new(rdev, &rdev->ring_tmp_bo, &ib.sa_bo, ndw * 4, 256, true);
> +	if (r) {
> +		return r;
> +	}
> +	ib.ring = ridx;
> +	ib.fence = NULL;
> +	ib.ptr = radeon_sa_bo_cpu_addr(ib.sa_bo);
> +	ib.vm = NULL;
> +	ib.gpu_addr = radeon_sa_bo_gpu_addr(ib.sa_bo);
> +	ib.is_const_ib = false;
> +	ib.length_dw = 0;

Wouldn't it make sense to use the IB functions (ib_get, ib_schedule, 
ib_free) here instead of filling the IB structure manually?

Apart from that it looks quite good.

Christian.

> +
> +	r = radeon_ring_lock(rdev, ring, 64);
>   	if (r) {
> +		radeon_sa_bo_free(rdev, &ib.sa_bo, NULL);
>   		return r;
>   	}
>   
> @@ -1171,23 +1191,28 @@ int radeon_vm_bo_update_pte(struct radeon_device *rdev,
>   		radeon_fence_note_sync(vm->fence, ridx);
>   	}
>   
> -	r = radeon_vm_update_pdes(rdev, vm, bo_va->soffset, bo_va->eoffset);
> +	r = radeon_vm_update_pdes(rdev, vm, &ib, bo_va->soffset, bo_va->eoffset);
>   	if (r) {
> +		radeon_sa_bo_free(rdev, &ib.sa_bo, NULL);
>   		radeon_ring_unlock_undo(rdev, ring);
>   		return r;
>   	}
>   
> -	radeon_vm_update_ptes(rdev, vm, bo_va->soffset, bo_va->eoffset,
> +	radeon_vm_update_ptes(rdev, vm, &ib, bo_va->soffset, bo_va->eoffset,
>   			      addr, bo_va->flags);
>   
> +	radeon_ring_ib_execute(rdev, ib.ring, &ib);
> +
>   	radeon_fence_unref(&vm->fence);
>   	r = radeon_fence_emit(rdev, &vm->fence, ridx);
>   	if (r) {
> +		radeon_sa_bo_free(rdev, &ib.sa_bo, NULL);
>   		radeon_ring_unlock_undo(rdev, ring);
>   		return r;
>   	}
>   	radeon_ring_unlock_commit(rdev, ring);
>   	radeon_semaphore_free(rdev, &sem, vm->fence);
> +	radeon_sa_bo_free(rdev, &ib.sa_bo, vm->fence);
>   	radeon_fence_unref(&vm->last_flush);
>   
>   	return 0;
> diff --git a/drivers/gpu/drm/radeon/si.c b/drivers/gpu/drm/radeon/si.c
> index cd83bc5..a910cb9 100644
> --- a/drivers/gpu/drm/radeon/si.c
> +++ b/drivers/gpu/drm/radeon/si.c
> @@ -3043,19 +3043,21 @@ void si_vm_fini(struct radeon_device *rdev)
>    * si_vm_set_page - update the page tables using the CP
>    *
>    * @rdev: radeon_device pointer
> + * @ib: indirect buffer to fill with commands
>    * @pe: addr of the page entry
>    * @addr: dst addr to write into pe
>    * @count: number of page entries to update
>    * @incr: increase next addr by incr bytes
>    * @flags: access flags
>    *
> - * Update the page tables using the CP (cayman-si).
> + * Update the page tables using the CP (SI).
>    */
> -void si_vm_set_page(struct radeon_device *rdev, uint64_t pe,
> +void si_vm_set_page(struct radeon_device *rdev,
> +		    struct radeon_ib *ib,
> +		    uint64_t pe,
>   		    uint64_t addr, unsigned count,
>   		    uint32_t incr, uint32_t flags)
>   {
> -	struct radeon_ring *ring = &rdev->ring[rdev->asic->vm.pt_ring_index];
>   	uint32_t r600_flags = cayman_vm_page_flags(rdev, flags);
>   	uint64_t value;
>   	unsigned ndw;
> @@ -3066,11 +3068,11 @@ void si_vm_set_page(struct radeon_device *rdev, uint64_t pe,
>   			if (ndw > 0x3FFE)
>   				ndw = 0x3FFE;
>   
> -			radeon_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, ndw));
> -			radeon_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
> -						 WRITE_DATA_DST_SEL(1)));
> -			radeon_ring_write(ring, pe);
> -			radeon_ring_write(ring, upper_32_bits(pe));
> +			ib->ptr[ib->length_dw++] = PACKET3(PACKET3_WRITE_DATA, ndw);
> +			ib->ptr[ib->length_dw++] = (WRITE_DATA_ENGINE_SEL(0) |
> +					WRITE_DATA_DST_SEL(1));
> +			ib->ptr[ib->length_dw++] = pe;
> +			ib->ptr[ib->length_dw++] = upper_32_bits(pe);
>   			for (; ndw > 2; ndw -= 2, --count, pe += 8) {
>   				if (flags & RADEON_VM_PAGE_SYSTEM) {
>   					value = radeon_vm_map_gart(rdev, addr);
> @@ -3082,8 +3084,8 @@ void si_vm_set_page(struct radeon_device *rdev, uint64_t pe,
>   				}
>   				addr += incr;
>   				value |= r600_flags;
> -				radeon_ring_write(ring, value);
> -				radeon_ring_write(ring, upper_32_bits(value));
> +				ib->ptr[ib->length_dw++] = value;
> +				ib->ptr[ib->length_dw++] = upper_32_bits(value);
>   			}
>   		}
>   	} else {
> @@ -3095,9 +3097,9 @@ void si_vm_set_page(struct radeon_device *rdev, uint64_t pe,
>   					ndw = 0xFFFFE;
>   
>   				/* for non-physically contiguous pages (system) */
> -				radeon_ring_write(ring, DMA_PACKET(DMA_PACKET_WRITE, 0, 0, 0, ndw));
> -				radeon_ring_write(ring, pe);
> -				radeon_ring_write(ring, upper_32_bits(pe) & 0xff);
> +				ib->ptr[ib->length_dw++] = DMA_PACKET(DMA_PACKET_WRITE, 0, 0, 0, ndw);
> +				ib->ptr[ib->length_dw++] = pe;
> +				ib->ptr[ib->length_dw++] = upper_32_bits(pe) & 0xff;
>   				for (; ndw > 0; ndw -= 2, --count, pe += 8) {
>   					if (flags & RADEON_VM_PAGE_SYSTEM) {
>   						value = radeon_vm_map_gart(rdev, addr);
> @@ -3109,8 +3111,8 @@ void si_vm_set_page(struct radeon_device *rdev, uint64_t pe,
>   					}
>   					addr += incr;
>   					value |= r600_flags;
> -					radeon_ring_write(ring, value);
> -					radeon_ring_write(ring, upper_32_bits(value));
> +					ib->ptr[ib->length_dw++] = value;
> +					ib->ptr[ib->length_dw++] = upper_32_bits(value);
>   				}
>   			}
>   		} else {
> @@ -3124,20 +3126,22 @@ void si_vm_set_page(struct radeon_device *rdev, uint64_t pe,
>   				else
>   					value = 0;
>   				/* for physically contiguous pages (vram) */
> -				radeon_ring_write(ring, DMA_PTE_PDE_PACKET(ndw));
> -				radeon_ring_write(ring, pe); /* dst addr */
> -				radeon_ring_write(ring, upper_32_bits(pe) & 0xff);
> -				radeon_ring_write(ring, r600_flags); /* mask */
> -				radeon_ring_write(ring, 0);
> -				radeon_ring_write(ring, value); /* value */
> -				radeon_ring_write(ring, upper_32_bits(value));
> -				radeon_ring_write(ring, incr); /* increment size */
> -				radeon_ring_write(ring, 0);
> +				ib->ptr[ib->length_dw++] = DMA_PTE_PDE_PACKET(ndw);
> +				ib->ptr[ib->length_dw++] = pe; /* dst addr */
> +				ib->ptr[ib->length_dw++] = upper_32_bits(pe) & 0xff;
> +				ib->ptr[ib->length_dw++] = r600_flags; /* mask */
> +				ib->ptr[ib->length_dw++] = 0;
> +				ib->ptr[ib->length_dw++] = value; /* value */
> +				ib->ptr[ib->length_dw++] = upper_32_bits(value);
> +				ib->ptr[ib->length_dw++] = incr; /* increment size */
> +				ib->ptr[ib->length_dw++] = 0;
>   				pe += ndw * 4;
>   				addr += (ndw / 2) * incr;
>   				count -= ndw / 2;
>   			}
>   		}
> +		while (ib->length_dw & 0x7)
> +			ib->ptr[ib->length_dw++] = DMA_PACKET(DMA_PACKET_NOP, 0, 0, 0, 0);
>   	}
>   }
>
Alex Deucher Feb. 1, 2013, 2:04 p.m. UTC | #2
On Fri, Feb 1, 2013 at 5:25 AM, Christian König <deathsimple@vodafone.de> wrote:
> Am 31.01.2013 22:37, schrieb alexdeucher@gmail.com:
>
>> From: Alex Deucher <alexander.deucher@amd.com>
>>
>> For very large page table updates, we can exceed the
>> size of the ring.  To avoid this, use an IB to perform
>> the page table update.
>>
>> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
>> ---
>>
>> We may want to do something similar for the BO move code as we
>> could potentially run out of ring space on a small ring like the
>> DMA rings if we are moving very large buffers.
>>
>> Alex
>>
>>   drivers/gpu/drm/radeon/ni.c          |   30 +++++++++++--------
>>   drivers/gpu/drm/radeon/radeon.h      |    6 ++-
>>   drivers/gpu/drm/radeon/radeon_asic.h |    8 ++++-
>>   drivers/gpu/drm/radeon/radeon_gart.c |   45
>> +++++++++++++++++++++++------
>>   drivers/gpu/drm/radeon/si.c          |   52
>> ++++++++++++++++++---------------
>>   5 files changed, 90 insertions(+), 51 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/radeon/ni.c b/drivers/gpu/drm/radeon/ni.c
>> index 170bd03..7cead76 100644
>> --- a/drivers/gpu/drm/radeon/ni.c
>> +++ b/drivers/gpu/drm/radeon/ni.c
>> @@ -1946,19 +1946,21 @@ uint32_t cayman_vm_page_flags(struct radeon_device
>> *rdev, uint32_t flags)
>>    * cayman_vm_set_page - update the page tables using the CP
>>    *
>>    * @rdev: radeon_device pointer
>> + * @ib: indirect buffer to fill with commands
>>    * @pe: addr of the page entry
>>    * @addr: dst addr to write into pe
>>    * @count: number of page entries to update
>>    * @incr: increase next addr by incr bytes
>>    * @flags: access flags
>>    *
>> - * Update the page tables using the CP (cayman-si).
>> + * Update the page tables using the CP (cayman/TN).
>>    */
>> -void cayman_vm_set_page(struct radeon_device *rdev, uint64_t pe,
>> +void cayman_vm_set_page(struct radeon_device *rdev,
>> +                       struct radeon_ib *ib,
>> +                       uint64_t pe,
>>                         uint64_t addr, unsigned count,
>>                         uint32_t incr, uint32_t flags)
>>   {
>> -       struct radeon_ring *ring =
>> &rdev->ring[rdev->asic->vm.pt_ring_index];
>>         uint32_t r600_flags = cayman_vm_page_flags(rdev, flags);
>>         uint64_t value;
>>         unsigned ndw;
>> @@ -1969,9 +1971,9 @@ void cayman_vm_set_page(struct radeon_device *rdev,
>> uint64_t pe,
>>                         if (ndw > 0x3FFF)
>>                                 ndw = 0x3FFF;
>>   -                     radeon_ring_write(ring, PACKET3(PACKET3_ME_WRITE,
>> ndw));
>> -                       radeon_ring_write(ring, pe);
>> -                       radeon_ring_write(ring, upper_32_bits(pe) & 0xff);
>> +                       ib->ptr[ib->length_dw++] =
>> PACKET3(PACKET3_ME_WRITE, ndw);
>> +                       ib->ptr[ib->length_dw++] = pe;
>> +                       ib->ptr[ib->length_dw++] = upper_32_bits(pe) &
>> 0xff;
>>                         for (; ndw > 1; ndw -= 2, --count, pe += 8) {
>>                                 if (flags & RADEON_VM_PAGE_SYSTEM) {
>>                                         value = radeon_vm_map_gart(rdev,
>> addr);
>> @@ -1983,8 +1985,8 @@ void cayman_vm_set_page(struct radeon_device *rdev,
>> uint64_t pe,
>>                                 }
>>                                 addr += incr;
>>                                 value |= r600_flags;
>> -                               radeon_ring_write(ring, value);
>> -                               radeon_ring_write(ring,
>> upper_32_bits(value));
>> +                               ib->ptr[ib->length_dw++] = value;
>> +                               ib->ptr[ib->length_dw++] =
>> upper_32_bits(value);
>>                         }
>>                 }
>>         } else {
>> @@ -1994,9 +1996,9 @@ void cayman_vm_set_page(struct radeon_device *rdev,
>> uint64_t pe,
>>                                 ndw = 0xFFFFE;
>>                         /* for non-physically contiguous pages (system) */
>> -                       radeon_ring_write(ring,
>> DMA_PACKET(DMA_PACKET_WRITE, 0, 0, ndw));
>> -                       radeon_ring_write(ring, pe);
>> -                       radeon_ring_write(ring, upper_32_bits(pe) & 0xff);
>> +                       ib->ptr[ib->length_dw++] =
>> DMA_PACKET(DMA_PACKET_WRITE, 0, 0, ndw);
>> +                       ib->ptr[ib->length_dw++] = pe;
>> +                       ib->ptr[ib->length_dw++] = upper_32_bits(pe) &
>> 0xff;
>>                         for (; ndw > 0; ndw -= 2, --count, pe += 8) {
>>                                 if (flags & RADEON_VM_PAGE_SYSTEM) {
>>                                         value = radeon_vm_map_gart(rdev,
>> addr);
>> @@ -2008,10 +2010,12 @@ void cayman_vm_set_page(struct radeon_device
>> *rdev, uint64_t pe,
>>                                 }
>>                                 addr += incr;
>>                                 value |= r600_flags;
>> -                               radeon_ring_write(ring, value);
>> -                               radeon_ring_write(ring,
>> upper_32_bits(value));
>> +                               ib->ptr[ib->length_dw++] = value;
>> +                               ib->ptr[ib->length_dw++] =
>> upper_32_bits(value);
>>                         }
>>                 }
>> +               while (ib->length_dw & 0x7)
>> +                       ib->ptr[ib->length_dw++] =
>> DMA_PACKET(DMA_PACKET_NOP, 0, 0, 0);
>>         }
>>   }
>>   diff --git a/drivers/gpu/drm/radeon/radeon.h
>> b/drivers/gpu/drm/radeon/radeon.h
>> index 6539d6c..89b9645 100644
>> --- a/drivers/gpu/drm/radeon/radeon.h
>> +++ b/drivers/gpu/drm/radeon/radeon.h
>> @@ -1188,7 +1188,9 @@ struct radeon_asic {
>>                 void (*fini)(struct radeon_device *rdev);
>>                 u32 pt_ring_index;
>> -               void (*set_page)(struct radeon_device *rdev, uint64_t pe,
>> +               void (*set_page)(struct radeon_device *rdev,
>> +                                struct radeon_ib *ib,
>> +                                uint64_t pe,
>>                                  uint64_t addr, unsigned count,
>>                                  uint32_t incr, uint32_t flags);
>>         } vm;
>> @@ -1810,7 +1812,7 @@ void radeon_ring_write(struct radeon_ring *ring,
>> uint32_t v);
>>   #define radeon_gart_set_page(rdev, i, p)
>> (rdev)->asic->gart.set_page((rdev), (i), (p))
>>   #define radeon_asic_vm_init(rdev) (rdev)->asic->vm.init((rdev))
>>   #define radeon_asic_vm_fini(rdev) (rdev)->asic->vm.fini((rdev))
>> -#define radeon_asic_vm_set_page(rdev, pe, addr, count, incr, flags)
>> ((rdev)->asic->vm.set_page((rdev), (pe), (addr), (count), (incr), (flags)))
>> +#define radeon_asic_vm_set_page(rdev, ib, pe, addr, count, incr, flags)
>> ((rdev)->asic->vm.set_page((rdev), (ib), (pe), (addr), (count), (incr),
>> (flags)))
>>   #define radeon_ring_start(rdev, r, cp)
>> (rdev)->asic->ring[(r)].ring_start((rdev), (cp))
>>   #define radeon_ring_test(rdev, r, cp)
>> (rdev)->asic->ring[(r)].ring_test((rdev), (cp))
>>   #define radeon_ib_test(rdev, r, cp)
>> (rdev)->asic->ring[(r)].ib_test((rdev), (cp))
>> diff --git a/drivers/gpu/drm/radeon/radeon_asic.h
>> b/drivers/gpu/drm/radeon/radeon_asic.h
>> index e429e25..f4134a8 100644
>> --- a/drivers/gpu/drm/radeon/radeon_asic.h
>> +++ b/drivers/gpu/drm/radeon/radeon_asic.h
>> @@ -474,7 +474,9 @@ int cayman_vm_init(struct radeon_device *rdev);
>>   void cayman_vm_fini(struct radeon_device *rdev);
>>   void cayman_vm_flush(struct radeon_device *rdev, int ridx, struct
>> radeon_vm *vm);
>>   uint32_t cayman_vm_page_flags(struct radeon_device *rdev, uint32_t
>> flags);
>> -void cayman_vm_set_page(struct radeon_device *rdev, uint64_t pe,
>> +void cayman_vm_set_page(struct radeon_device *rdev,
>> +                       struct radeon_ib *ib,
>> +                       uint64_t pe,
>>                         uint64_t addr, unsigned count,
>>                         uint32_t incr, uint32_t flags);
>>   int evergreen_ib_parse(struct radeon_device *rdev, struct radeon_ib
>> *ib);
>> @@ -506,7 +508,9 @@ int si_irq_set(struct radeon_device *rdev);
>>   int si_irq_process(struct radeon_device *rdev);
>>   int si_vm_init(struct radeon_device *rdev);
>>   void si_vm_fini(struct radeon_device *rdev);
>> -void si_vm_set_page(struct radeon_device *rdev, uint64_t pe,
>> +void si_vm_set_page(struct radeon_device *rdev,
>> +                   struct radeon_ib *ib,
>> +                   uint64_t pe,
>>                     uint64_t addr, unsigned count,
>>                     uint32_t incr, uint32_t flags);
>>   void si_vm_flush(struct radeon_device *rdev, int ridx, struct radeon_vm
>> *vm);
>> diff --git a/drivers/gpu/drm/radeon/radeon_gart.c
>> b/drivers/gpu/drm/radeon/radeon_gart.c
>> index 6e24f84..d95c61d 100644
>> --- a/drivers/gpu/drm/radeon/radeon_gart.c
>> +++ b/drivers/gpu/drm/radeon/radeon_gart.c
>> @@ -929,6 +929,7 @@ uint64_t radeon_vm_map_gart(struct radeon_device
>> *rdev, uint64_t addr)
>>    */
>>   static int radeon_vm_update_pdes(struct radeon_device *rdev,
>>                                  struct radeon_vm *vm,
>> +                                struct radeon_ib *ib,
>>                                  uint64_t start, uint64_t end)
>>   {
>>         static const uint32_t incr = RADEON_VM_PTE_COUNT * 8;
>> @@ -971,7 +972,7 @@ retry:
>>                     ((last_pt + incr * count) != pt)) {
>>                         if (count) {
>> -                               radeon_asic_vm_set_page(rdev, last_pde,
>> +                               radeon_asic_vm_set_page(rdev, ib,
>> last_pde,
>>                                                         last_pt, count,
>> incr,
>>
>> RADEON_VM_PAGE_VALID);
>>                         }
>> @@ -985,7 +986,7 @@ retry:
>>         }
>>         if (count) {
>> -               radeon_asic_vm_set_page(rdev, last_pde, last_pt, count,
>> +               radeon_asic_vm_set_page(rdev, ib, last_pde, last_pt,
>> count,
>>                                         incr, RADEON_VM_PAGE_VALID);
>>         }
>> @@ -1009,6 +1010,7 @@ retry:
>>    */
>>   static void radeon_vm_update_ptes(struct radeon_device *rdev,
>>                                   struct radeon_vm *vm,
>> +                                 struct radeon_ib *ib,
>>                                   uint64_t start, uint64_t end,
>>                                   uint64_t dst, uint32_t flags)
>>   {
>> @@ -1038,7 +1040,7 @@ static void radeon_vm_update_ptes(struct
>> radeon_device *rdev,
>>                 if ((last_pte + 8 * count) != pte) {
>>                         if (count) {
>> -                               radeon_asic_vm_set_page(rdev, last_pte,
>> +                               radeon_asic_vm_set_page(rdev, ib,
>> last_pte,
>>                                                         last_dst, count,
>>
>> RADEON_GPU_PAGE_SIZE,
>>                                                         flags);
>> @@ -1056,7 +1058,8 @@ static void radeon_vm_update_ptes(struct
>> radeon_device *rdev,
>>         }
>>         if (count) {
>> -               radeon_asic_vm_set_page(rdev, last_pte, last_dst, count,
>> +               radeon_asic_vm_set_page(rdev, ib, last_pte,
>> +                                       last_dst, count,
>>                                         RADEON_GPU_PAGE_SIZE, flags);
>>         }
>>   }
>> @@ -1082,6 +1085,7 @@ int radeon_vm_bo_update_pte(struct radeon_device
>> *rdev,
>>         unsigned ridx = rdev->asic->vm.pt_ring_index;
>>         struct radeon_ring *ring = &rdev->ring[ridx];
>>         struct radeon_semaphore *sem = NULL;
>> +       struct radeon_ib ib;
>>         struct radeon_bo_va *bo_va;
>>         unsigned nptes, npdes, ndw;
>>         uint64_t addr;
>> @@ -1140,9 +1144,8 @@ int radeon_vm_bo_update_pte(struct radeon_device
>> *rdev,
>>         /* assume two extra pdes in case the mapping overlaps the borders
>> */
>>         npdes = (nptes >> RADEON_VM_BLOCK_SIZE) + 2;
>>   -     /* estimate number of dw needed */
>> -       /* semaphore, fence and padding */
>> -       ndw = 32;
>> +       /* padding, etc. */
>> +       ndw = 64;
>>         if (RADEON_VM_BLOCK_SIZE > 11)
>>                 /* reserve space for one header for every 2k dwords */
>> @@ -1161,8 +1164,25 @@ int radeon_vm_bo_update_pte(struct radeon_device
>> *rdev,
>>         /* reserve space for pde addresses */
>>         ndw += npdes * 2;
>>   -     r = radeon_ring_lock(rdev, ring, ndw);
>> +       /* update too big for an IB */
>> +       if (ndw > 0xfffff)
>> +               return -ENOMEM;
>> +
>> +       r = radeon_sa_bo_new(rdev, &rdev->ring_tmp_bo, &ib.sa_bo, ndw * 4,
>> 256, true);
>> +       if (r) {
>> +               return r;
>> +       }
>> +       ib.ring = ridx;
>> +       ib.fence = NULL;
>> +       ib.ptr = radeon_sa_bo_cpu_addr(ib.sa_bo);
>> +       ib.vm = NULL;
>> +       ib.gpu_addr = radeon_sa_bo_gpu_addr(ib.sa_bo);
>> +       ib.is_const_ib = false;
>> +       ib.length_dw = 0;
>
>
> Wouldn't it make sense to use the IB functions (ib_get, ib_schedule,
> ib_free) here instead of filling the IB structure manually?

I tried that initially, but ran into several problems and it ended up
being easier to just allocate and schedule the IB directly.  The IB
functions have self contained code to deal with semaphores and fences
that I couldn't figure out how to make work cleanly with the vm
semaphore and fence.  radeon_ib_schedule() takes the ring lock so if I
needed to do any direct ring manipulation prior, I'd need to drop the
lock before calling ib_schedule().  Let me know if you have any
suggestions.

Alex


>
> Apart from that it looks quite good.
>
> Christian.
>
Christian König Feb. 1, 2013, 4:35 p.m. UTC | #3
Am 01.02.2013 15:04, schrieb Alex Deucher:
> On Fri, Feb 1, 2013 at 5:25 AM, Christian König <deathsimple@vodafone.de> wrote:
>> [SNAP]
>> Wouldn't it make sense to use the IB functions (ib_get, ib_schedule,
>> ib_free) here instead of filling the IB structure manually?
> I tried that initially, but ran into several problems and it ended up
> being easier to just allocate and schedule the IB directly.  The IB
> functions have self contained code to deal with semaphores and fences
> that I couldn't figure out how to make work cleanly with the vm
> semaphore and fence.  radeon_ib_schedule() takes the ring lock so if I
> needed to do any direct ring manipulation prior, I'd need to drop the
> lock before calling ib_schedule().  Let me know if you have any
> suggestions.

Just sent out an updated version of the patch, which seems to look quite 
a bit cleaner and still works fine (at least on my SI board).

Christian.

>
> Alex
>
>
>> Apart from that it looks quite good.
>>
>> Christian.
>>
diff mbox

Patch

diff --git a/drivers/gpu/drm/radeon/ni.c b/drivers/gpu/drm/radeon/ni.c
index 170bd03..7cead76 100644
--- a/drivers/gpu/drm/radeon/ni.c
+++ b/drivers/gpu/drm/radeon/ni.c
@@ -1946,19 +1946,21 @@  uint32_t cayman_vm_page_flags(struct radeon_device *rdev, uint32_t flags)
  * cayman_vm_set_page - update the page tables using the CP
  *
  * @rdev: radeon_device pointer
+ * @ib: indirect buffer to fill with commands
  * @pe: addr of the page entry
  * @addr: dst addr to write into pe
  * @count: number of page entries to update
  * @incr: increase next addr by incr bytes
  * @flags: access flags
  *
- * Update the page tables using the CP (cayman-si).
+ * Update the page tables using the CP (cayman/TN).
  */
-void cayman_vm_set_page(struct radeon_device *rdev, uint64_t pe,
+void cayman_vm_set_page(struct radeon_device *rdev,
+			struct radeon_ib *ib,
+			uint64_t pe,
 			uint64_t addr, unsigned count,
 			uint32_t incr, uint32_t flags)
 {
-	struct radeon_ring *ring = &rdev->ring[rdev->asic->vm.pt_ring_index];
 	uint32_t r600_flags = cayman_vm_page_flags(rdev, flags);
 	uint64_t value;
 	unsigned ndw;
@@ -1969,9 +1971,9 @@  void cayman_vm_set_page(struct radeon_device *rdev, uint64_t pe,
 			if (ndw > 0x3FFF)
 				ndw = 0x3FFF;
 
-			radeon_ring_write(ring, PACKET3(PACKET3_ME_WRITE, ndw));
-			radeon_ring_write(ring, pe);
-			radeon_ring_write(ring, upper_32_bits(pe) & 0xff);
+			ib->ptr[ib->length_dw++] = PACKET3(PACKET3_ME_WRITE, ndw);
+			ib->ptr[ib->length_dw++] = pe;
+			ib->ptr[ib->length_dw++] = upper_32_bits(pe) & 0xff;
 			for (; ndw > 1; ndw -= 2, --count, pe += 8) {
 				if (flags & RADEON_VM_PAGE_SYSTEM) {
 					value = radeon_vm_map_gart(rdev, addr);
@@ -1983,8 +1985,8 @@  void cayman_vm_set_page(struct radeon_device *rdev, uint64_t pe,
 				}
 				addr += incr;
 				value |= r600_flags;
-				radeon_ring_write(ring, value);
-				radeon_ring_write(ring, upper_32_bits(value));
+				ib->ptr[ib->length_dw++] = value;
+				ib->ptr[ib->length_dw++] = upper_32_bits(value);
 			}
 		}
 	} else {
@@ -1994,9 +1996,9 @@  void cayman_vm_set_page(struct radeon_device *rdev, uint64_t pe,
 				ndw = 0xFFFFE;
 
 			/* for non-physically contiguous pages (system) */
-			radeon_ring_write(ring, DMA_PACKET(DMA_PACKET_WRITE, 0, 0, ndw));
-			radeon_ring_write(ring, pe);
-			radeon_ring_write(ring, upper_32_bits(pe) & 0xff);
+			ib->ptr[ib->length_dw++] = DMA_PACKET(DMA_PACKET_WRITE, 0, 0, ndw);
+			ib->ptr[ib->length_dw++] = pe;
+			ib->ptr[ib->length_dw++] = upper_32_bits(pe) & 0xff;
 			for (; ndw > 0; ndw -= 2, --count, pe += 8) {
 				if (flags & RADEON_VM_PAGE_SYSTEM) {
 					value = radeon_vm_map_gart(rdev, addr);
@@ -2008,10 +2010,12 @@  void cayman_vm_set_page(struct radeon_device *rdev, uint64_t pe,
 				}
 				addr += incr;
 				value |= r600_flags;
-				radeon_ring_write(ring, value);
-				radeon_ring_write(ring, upper_32_bits(value));
+				ib->ptr[ib->length_dw++] = value;
+				ib->ptr[ib->length_dw++] = upper_32_bits(value);
 			}
 		}
+		while (ib->length_dw & 0x7)
+			ib->ptr[ib->length_dw++] = DMA_PACKET(DMA_PACKET_NOP, 0, 0, 0);
 	}
 }
 
diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index 6539d6c..89b9645 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -1188,7 +1188,9 @@  struct radeon_asic {
 		void (*fini)(struct radeon_device *rdev);
 
 		u32 pt_ring_index;
-		void (*set_page)(struct radeon_device *rdev, uint64_t pe,
+		void (*set_page)(struct radeon_device *rdev,
+				 struct radeon_ib *ib,
+				 uint64_t pe,
 				 uint64_t addr, unsigned count,
 				 uint32_t incr, uint32_t flags);
 	} vm;
@@ -1810,7 +1812,7 @@  void radeon_ring_write(struct radeon_ring *ring, uint32_t v);
 #define radeon_gart_set_page(rdev, i, p) (rdev)->asic->gart.set_page((rdev), (i), (p))
 #define radeon_asic_vm_init(rdev) (rdev)->asic->vm.init((rdev))
 #define radeon_asic_vm_fini(rdev) (rdev)->asic->vm.fini((rdev))
-#define radeon_asic_vm_set_page(rdev, pe, addr, count, incr, flags) ((rdev)->asic->vm.set_page((rdev), (pe), (addr), (count), (incr), (flags)))
+#define radeon_asic_vm_set_page(rdev, ib, pe, addr, count, incr, flags) ((rdev)->asic->vm.set_page((rdev), (ib), (pe), (addr), (count), (incr), (flags)))
 #define radeon_ring_start(rdev, r, cp) (rdev)->asic->ring[(r)].ring_start((rdev), (cp))
 #define radeon_ring_test(rdev, r, cp) (rdev)->asic->ring[(r)].ring_test((rdev), (cp))
 #define radeon_ib_test(rdev, r, cp) (rdev)->asic->ring[(r)].ib_test((rdev), (cp))
diff --git a/drivers/gpu/drm/radeon/radeon_asic.h b/drivers/gpu/drm/radeon/radeon_asic.h
index e429e25..f4134a8 100644
--- a/drivers/gpu/drm/radeon/radeon_asic.h
+++ b/drivers/gpu/drm/radeon/radeon_asic.h
@@ -474,7 +474,9 @@  int cayman_vm_init(struct radeon_device *rdev);
 void cayman_vm_fini(struct radeon_device *rdev);
 void cayman_vm_flush(struct radeon_device *rdev, int ridx, struct radeon_vm *vm);
 uint32_t cayman_vm_page_flags(struct radeon_device *rdev, uint32_t flags);
-void cayman_vm_set_page(struct radeon_device *rdev, uint64_t pe,
+void cayman_vm_set_page(struct radeon_device *rdev,
+			struct radeon_ib *ib,
+			uint64_t pe,
 			uint64_t addr, unsigned count,
 			uint32_t incr, uint32_t flags);
 int evergreen_ib_parse(struct radeon_device *rdev, struct radeon_ib *ib);
@@ -506,7 +508,9 @@  int si_irq_set(struct radeon_device *rdev);
 int si_irq_process(struct radeon_device *rdev);
 int si_vm_init(struct radeon_device *rdev);
 void si_vm_fini(struct radeon_device *rdev);
-void si_vm_set_page(struct radeon_device *rdev, uint64_t pe,
+void si_vm_set_page(struct radeon_device *rdev,
+		    struct radeon_ib *ib,
+		    uint64_t pe,
 		    uint64_t addr, unsigned count,
 		    uint32_t incr, uint32_t flags);
 void si_vm_flush(struct radeon_device *rdev, int ridx, struct radeon_vm *vm);
diff --git a/drivers/gpu/drm/radeon/radeon_gart.c b/drivers/gpu/drm/radeon/radeon_gart.c
index 6e24f84..d95c61d 100644
--- a/drivers/gpu/drm/radeon/radeon_gart.c
+++ b/drivers/gpu/drm/radeon/radeon_gart.c
@@ -929,6 +929,7 @@  uint64_t radeon_vm_map_gart(struct radeon_device *rdev, uint64_t addr)
  */
 static int radeon_vm_update_pdes(struct radeon_device *rdev,
 				 struct radeon_vm *vm,
+				 struct radeon_ib *ib,
 				 uint64_t start, uint64_t end)
 {
 	static const uint32_t incr = RADEON_VM_PTE_COUNT * 8;
@@ -971,7 +972,7 @@  retry:
 		    ((last_pt + incr * count) != pt)) {
 
 			if (count) {
-				radeon_asic_vm_set_page(rdev, last_pde,
+				radeon_asic_vm_set_page(rdev, ib, last_pde,
 							last_pt, count, incr,
 							RADEON_VM_PAGE_VALID);
 			}
@@ -985,7 +986,7 @@  retry:
 	}
 
 	if (count) {
-		radeon_asic_vm_set_page(rdev, last_pde, last_pt, count,
+		radeon_asic_vm_set_page(rdev, ib, last_pde, last_pt, count,
 					incr, RADEON_VM_PAGE_VALID);
 
 	}
@@ -1009,6 +1010,7 @@  retry:
  */
 static void radeon_vm_update_ptes(struct radeon_device *rdev,
 				  struct radeon_vm *vm,
+				  struct radeon_ib *ib,
 				  uint64_t start, uint64_t end,
 				  uint64_t dst, uint32_t flags)
 {
@@ -1038,7 +1040,7 @@  static void radeon_vm_update_ptes(struct radeon_device *rdev,
 		if ((last_pte + 8 * count) != pte) {
 
 			if (count) {
-				radeon_asic_vm_set_page(rdev, last_pte,
+				radeon_asic_vm_set_page(rdev, ib, last_pte,
 							last_dst, count,
 							RADEON_GPU_PAGE_SIZE,
 							flags);
@@ -1056,7 +1058,8 @@  static void radeon_vm_update_ptes(struct radeon_device *rdev,
 	}
 
 	if (count) {
-		radeon_asic_vm_set_page(rdev, last_pte,	last_dst, count,
+		radeon_asic_vm_set_page(rdev, ib, last_pte,
+					last_dst, count,
 					RADEON_GPU_PAGE_SIZE, flags);
 	}
 }
@@ -1082,6 +1085,7 @@  int radeon_vm_bo_update_pte(struct radeon_device *rdev,
 	unsigned ridx = rdev->asic->vm.pt_ring_index;
 	struct radeon_ring *ring = &rdev->ring[ridx];
 	struct radeon_semaphore *sem = NULL;
+	struct radeon_ib ib;
 	struct radeon_bo_va *bo_va;
 	unsigned nptes, npdes, ndw;
 	uint64_t addr;
@@ -1140,9 +1144,8 @@  int radeon_vm_bo_update_pte(struct radeon_device *rdev,
 	/* assume two extra pdes in case the mapping overlaps the borders */
 	npdes = (nptes >> RADEON_VM_BLOCK_SIZE) + 2;
 
-	/* estimate number of dw needed */
-	/* semaphore, fence and padding */
-	ndw = 32;
+	/* padding, etc. */
+	ndw = 64;
 
 	if (RADEON_VM_BLOCK_SIZE > 11)
 		/* reserve space for one header for every 2k dwords */
@@ -1161,8 +1164,25 @@  int radeon_vm_bo_update_pte(struct radeon_device *rdev,
 	/* reserve space for pde addresses */
 	ndw += npdes * 2;
 
-	r = radeon_ring_lock(rdev, ring, ndw);
+	/* update too big for an IB */
+	if (ndw > 0xfffff)
+		return -ENOMEM;
+
+	r = radeon_sa_bo_new(rdev, &rdev->ring_tmp_bo, &ib.sa_bo, ndw * 4, 256, true);
+	if (r) {
+		return r;
+	}
+	ib.ring = ridx;
+	ib.fence = NULL;
+	ib.ptr = radeon_sa_bo_cpu_addr(ib.sa_bo);
+	ib.vm = NULL;
+	ib.gpu_addr = radeon_sa_bo_gpu_addr(ib.sa_bo);
+	ib.is_const_ib = false;
+	ib.length_dw = 0;
+
+	r = radeon_ring_lock(rdev, ring, 64);
 	if (r) {
+		radeon_sa_bo_free(rdev, &ib.sa_bo, NULL);
 		return r;
 	}
 
@@ -1171,23 +1191,28 @@  int radeon_vm_bo_update_pte(struct radeon_device *rdev,
 		radeon_fence_note_sync(vm->fence, ridx);
 	}
 
-	r = radeon_vm_update_pdes(rdev, vm, bo_va->soffset, bo_va->eoffset);
+	r = radeon_vm_update_pdes(rdev, vm, &ib, bo_va->soffset, bo_va->eoffset);
 	if (r) {
+		radeon_sa_bo_free(rdev, &ib.sa_bo, NULL);
 		radeon_ring_unlock_undo(rdev, ring);
 		return r;
 	}
 
-	radeon_vm_update_ptes(rdev, vm, bo_va->soffset, bo_va->eoffset,
+	radeon_vm_update_ptes(rdev, vm, &ib, bo_va->soffset, bo_va->eoffset,
 			      addr, bo_va->flags);
 
+	radeon_ring_ib_execute(rdev, ib.ring, &ib);
+
 	radeon_fence_unref(&vm->fence);
 	r = radeon_fence_emit(rdev, &vm->fence, ridx);
 	if (r) {
+		radeon_sa_bo_free(rdev, &ib.sa_bo, NULL);
 		radeon_ring_unlock_undo(rdev, ring);
 		return r;
 	}
 	radeon_ring_unlock_commit(rdev, ring);
 	radeon_semaphore_free(rdev, &sem, vm->fence);
+	radeon_sa_bo_free(rdev, &ib.sa_bo, vm->fence);
 	radeon_fence_unref(&vm->last_flush);
 
 	return 0;
diff --git a/drivers/gpu/drm/radeon/si.c b/drivers/gpu/drm/radeon/si.c
index cd83bc5..a910cb9 100644
--- a/drivers/gpu/drm/radeon/si.c
+++ b/drivers/gpu/drm/radeon/si.c
@@ -3043,19 +3043,21 @@  void si_vm_fini(struct radeon_device *rdev)
  * si_vm_set_page - update the page tables using the CP
  *
  * @rdev: radeon_device pointer
+ * @ib: indirect buffer to fill with commands
  * @pe: addr of the page entry
  * @addr: dst addr to write into pe
  * @count: number of page entries to update
  * @incr: increase next addr by incr bytes
  * @flags: access flags
  *
- * Update the page tables using the CP (cayman-si).
+ * Update the page tables using the CP (SI).
  */
-void si_vm_set_page(struct radeon_device *rdev, uint64_t pe,
+void si_vm_set_page(struct radeon_device *rdev,
+		    struct radeon_ib *ib,
+		    uint64_t pe,
 		    uint64_t addr, unsigned count,
 		    uint32_t incr, uint32_t flags)
 {
-	struct radeon_ring *ring = &rdev->ring[rdev->asic->vm.pt_ring_index];
 	uint32_t r600_flags = cayman_vm_page_flags(rdev, flags);
 	uint64_t value;
 	unsigned ndw;
@@ -3066,11 +3068,11 @@  void si_vm_set_page(struct radeon_device *rdev, uint64_t pe,
 			if (ndw > 0x3FFE)
 				ndw = 0x3FFE;
 
-			radeon_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, ndw));
-			radeon_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
-						 WRITE_DATA_DST_SEL(1)));
-			radeon_ring_write(ring, pe);
-			radeon_ring_write(ring, upper_32_bits(pe));
+			ib->ptr[ib->length_dw++] = PACKET3(PACKET3_WRITE_DATA, ndw);
+			ib->ptr[ib->length_dw++] = (WRITE_DATA_ENGINE_SEL(0) |
+					WRITE_DATA_DST_SEL(1));
+			ib->ptr[ib->length_dw++] = pe;
+			ib->ptr[ib->length_dw++] = upper_32_bits(pe);
 			for (; ndw > 2; ndw -= 2, --count, pe += 8) {
 				if (flags & RADEON_VM_PAGE_SYSTEM) {
 					value = radeon_vm_map_gart(rdev, addr);
@@ -3082,8 +3084,8 @@  void si_vm_set_page(struct radeon_device *rdev, uint64_t pe,
 				}
 				addr += incr;
 				value |= r600_flags;
-				radeon_ring_write(ring, value);
-				radeon_ring_write(ring, upper_32_bits(value));
+				ib->ptr[ib->length_dw++] = value;
+				ib->ptr[ib->length_dw++] = upper_32_bits(value);
 			}
 		}
 	} else {
@@ -3095,9 +3097,9 @@  void si_vm_set_page(struct radeon_device *rdev, uint64_t pe,
 					ndw = 0xFFFFE;
 
 				/* for non-physically contiguous pages (system) */
-				radeon_ring_write(ring, DMA_PACKET(DMA_PACKET_WRITE, 0, 0, 0, ndw));
-				radeon_ring_write(ring, pe);
-				radeon_ring_write(ring, upper_32_bits(pe) & 0xff);
+				ib->ptr[ib->length_dw++] = DMA_PACKET(DMA_PACKET_WRITE, 0, 0, 0, ndw);
+				ib->ptr[ib->length_dw++] = pe;
+				ib->ptr[ib->length_dw++] = upper_32_bits(pe) & 0xff;
 				for (; ndw > 0; ndw -= 2, --count, pe += 8) {
 					if (flags & RADEON_VM_PAGE_SYSTEM) {
 						value = radeon_vm_map_gart(rdev, addr);
@@ -3109,8 +3111,8 @@  void si_vm_set_page(struct radeon_device *rdev, uint64_t pe,
 					}
 					addr += incr;
 					value |= r600_flags;
-					radeon_ring_write(ring, value);
-					radeon_ring_write(ring, upper_32_bits(value));
+					ib->ptr[ib->length_dw++] = value;
+					ib->ptr[ib->length_dw++] = upper_32_bits(value);
 				}
 			}
 		} else {
@@ -3124,20 +3126,22 @@  void si_vm_set_page(struct radeon_device *rdev, uint64_t pe,
 				else
 					value = 0;
 				/* for physically contiguous pages (vram) */
-				radeon_ring_write(ring, DMA_PTE_PDE_PACKET(ndw));
-				radeon_ring_write(ring, pe); /* dst addr */
-				radeon_ring_write(ring, upper_32_bits(pe) & 0xff);
-				radeon_ring_write(ring, r600_flags); /* mask */
-				radeon_ring_write(ring, 0);
-				radeon_ring_write(ring, value); /* value */
-				radeon_ring_write(ring, upper_32_bits(value));
-				radeon_ring_write(ring, incr); /* increment size */
-				radeon_ring_write(ring, 0);
+				ib->ptr[ib->length_dw++] = DMA_PTE_PDE_PACKET(ndw);
+				ib->ptr[ib->length_dw++] = pe; /* dst addr */
+				ib->ptr[ib->length_dw++] = upper_32_bits(pe) & 0xff;
+				ib->ptr[ib->length_dw++] = r600_flags; /* mask */
+				ib->ptr[ib->length_dw++] = 0;
+				ib->ptr[ib->length_dw++] = value; /* value */
+				ib->ptr[ib->length_dw++] = upper_32_bits(value);
+				ib->ptr[ib->length_dw++] = incr; /* increment size */
+				ib->ptr[ib->length_dw++] = 0;
 				pe += ndw * 4;
 				addr += (ndw / 2) * incr;
 				count -= ndw / 2;
 			}
 		}
+		while (ib->length_dw & 0x7)
+			ib->ptr[ib->length_dw++] = DMA_PACKET(DMA_PACKET_NOP, 0, 0, 0, 0);
 	}
 }