[RFC] drm/radeon: make dynamic allocation of page tables on demand in radeon_vm_update_pte v2

Message ID 1349361147-12712-1-git-send-email-Dmitrii.Cherkasov@amd.com (mailing list archive)
State New, archived

Commit Message

Dmitry Cherkasov Oct. 4, 2012, 2:32 p.m. UTC
v2: set up and allocate a number of contiguous PTs if possible

Warning: the Heaven benchmark /sometimes/ fails with this patch after
10 or 15 minutes of running, so any insight is greatly appreciated.

The code is a bit bloated because it's an open question how a decent
optimization should be done: via macros? using structs? etc.

The rationale for struct radeon_pt is that a BO may contain several contiguous
PTs, so we need the u64 gpu_addr to point to the actual beginning of each PT.
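
To illustrate that (a minimal sketch only; radeon_pt_fill_gpu_addrs is a made-up
helper name, but the arithmetic mirrors what radeon_suballoc_pts() in this patch
does when one SA BO backs @count contiguous page tables):

	/* Only the first PT owns the suballocated BO; the following PTs carry
	 * no BO of their own and just point one full page table
	 * (RADEON_VM_PTE_COUNT * 8 bytes) further into the same BO. */
	static void radeon_pt_fill_gpu_addrs(struct radeon_pt *pts, unsigned start,
					     unsigned count, u64 bo_gpu_addr)
	{
		unsigned i;

		pts[start].gpu_addr = bo_gpu_addr;
		for (i = 1; i < count; i++) {
			pts[start + i].bo = NULL;
			pts[start + i].gpu_addr = bo_gpu_addr +
						  i * RADEON_VM_PTE_COUNT * 8;
		}
	}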

I've only tested it on a Cayman card; it should work on SI, but who knows? ;)

Please share your ideas.
---
 drivers/gpu/drm/radeon/radeon.h      |   12 ++
 drivers/gpu/drm/radeon/radeon_gart.c |  263 ++++++++++++++++++++++++++++++++--
 2 files changed, 260 insertions(+), 15 deletions(-)
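
For reference, the reworked radeon_vm_bo_update_pte() walks the mapped pfn range
and splits it at page-table boundaries, allocating the covering tables on demand.
A rough sketch of just that split, using the same arithmetic as the patch
(radeon_vm_split_range is an illustrative name, not part of the patch):

	static void radeon_vm_split_range(uint64_t pfn, unsigned nptes)
	{
		uint64_t pfn_idx = pfn, last_pfn = pfn + nptes;

		while (pfn_idx < last_pfn) {
			uint64_t pde_num = pfn_idx / RADEON_VM_PTE_COUNT;
			uint64_t pte_num = pfn_idx % RADEON_VM_PTE_COUNT;
			uint64_t to_edge = RADEON_VM_PTE_COUNT - pte_num;
			uint64_t to_end = last_pfn - pfn_idx;
			uint64_t count = to_edge < to_end ? to_edge : to_end;

			/* PTEs [pte_num, pte_num + count) of page table pde_num
			 * get written, so PDE pde_num must point at an
			 * allocated page table before the write. */
			pfn_idx += count;
		}
	}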

Comments

Christian König Oct. 5, 2012, 10:27 a.m. UTC | #1
Trying to resolve the remaining bugs today. Expect a v3 of the patch
this evening or Monday morning.

Cheers,
Christian.

On 04.10.2012 16:32, Dmitry Cherkasov wrote:
> v2: set up and allocate a number of contiguous PTs if possible
>
> Warning: the Heaven benchmark /sometimes/ fails with this patch after
> 10 or 15 minutes of running, so any insight is greatly appreciated.
>
> The code is a bit bloated because it's an open question how a decent
> optimization should be done: via macros? using structs? etc.
>
> The rationale for struct radeon_pt is that a BO may contain several contiguous
> PTs, so we need the u64 gpu_addr to point to the actual beginning of each PT.
>
> I've only tested it on a Cayman card; it should work on SI, but who knows? ;)
>
> Please share your ideas.
> ---
>   drivers/gpu/drm/radeon/radeon.h      |   12 ++
>   drivers/gpu/drm/radeon/radeon_gart.c |  263 ++++++++++++++++++++++++++++++++--
>   2 files changed, 260 insertions(+), 15 deletions(-)

Patch

diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index b04c064..38d4eda 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -659,6 +659,15 @@  struct radeon_ring {
 /* number of entries in page table */
 #define RADEON_VM_PTE_COUNT (1 << RADEON_VM_BLOCK_SIZE)
 
+struct radeon_pt {
+	/* BO containing the page table */
+	/* radeon_sa_bo_gpu_addr(sa_bo); */
+	struct radeon_sa_bo *bo;
+
+	/* GPU address of page table */
+	u64 gpu_addr;
+};
+
 struct radeon_vm {
 	struct list_head		list;
 	struct list_head		va;
@@ -671,6 +680,9 @@  struct radeon_vm {
 	struct radeon_fence		*fence;
 	/* last flush or NULL if we still need to flush */
 	struct radeon_fence		*last_flush;
+
+	/* page tables list */
+	struct radeon_pt *vm_pts;
 };
 
 struct radeon_vm_manager {
diff --git a/drivers/gpu/drm/radeon/radeon_gart.c b/drivers/gpu/drm/radeon/radeon_gart.c
index 753b7ca..cea918d 100644
--- a/drivers/gpu/drm/radeon/radeon_gart.c
+++ b/drivers/gpu/drm/radeon/radeon_gart.c
@@ -500,6 +500,10 @@  static void radeon_vm_free_pt(struct radeon_device *rdev,
 				    struct radeon_vm *vm)
 {
 	struct radeon_bo_va *bo_va;
+	int i;
+
+	int driver_table_entries = (rdev->vm_manager.max_pfn >>
+				    RADEON_VM_BLOCK_SIZE);
 
 	if (!vm->sa_bo)
 		return;
@@ -510,6 +514,14 @@  static void radeon_vm_free_pt(struct radeon_device *rdev,
 	list_for_each_entry(bo_va, &vm->va, vm_list) {
 		bo_va->valid = false;
 	}
+
+	if (vm->vm_pts == NULL)
+		return;
+
+	for (i = 0; i < driver_table_entries; i++)
+		radeon_sa_bo_free(rdev, &vm->vm_pts[i].bo, vm->fence);
+
+	kfree(vm->vm_pts);
 }
 
 /**
@@ -563,6 +575,9 @@  int radeon_vm_alloc_pt(struct radeon_device *rdev, struct radeon_vm *vm)
 	int r;
 	u64 *pd_addr;
 	int tables_size;
+	int driver_table_size = (rdev->vm_manager.max_pfn >>
+				 RADEON_VM_BLOCK_SIZE) *
+		sizeof(struct radeon_pt);
 
 	if (vm == NULL) {
 		return -EINVAL;
@@ -570,7 +585,6 @@  int radeon_vm_alloc_pt(struct radeon_device *rdev, struct radeon_vm *vm)
 
 	/* allocate enough to cover the current VM size */
 	tables_size = RADEON_GPU_PAGE_ALIGN(radeon_vm_directory_size(rdev));
-	tables_size += RADEON_GPU_PAGE_ALIGN(vm->last_pfn * 8);
 
 	if (vm->sa_bo != NULL) {
 		/* update lru */
@@ -600,6 +614,16 @@  retry:
 	vm->pd_gpu_addr = radeon_sa_bo_gpu_addr(vm->sa_bo);
 	memset(pd_addr, 0, tables_size);
 
+	vm->vm_pts = kmalloc(driver_table_size, GFP_KERNEL);
+
+	if (vm->vm_pts == NULL) {
+		DRM_ERROR("Cannot allocate space for driver page table\n");
+		radeon_sa_bo_free(rdev, &vm->sa_bo, vm->fence);
+		return -ENOMEM;
+	}
+
+	memset(vm->vm_pts, 0, driver_table_size);
+
 	list_add_tail(&vm->list, &rdev->vm_manager.lru_vm);
 	return radeon_vm_bo_update_pte(rdev, vm, rdev->ring_tmp_bo.bo,
 				       &rdev->ring_tmp_bo.bo->tbo.mem);
@@ -864,6 +888,69 @@  uint64_t radeon_vm_map_gart(struct radeon_device *rdev, uint64_t addr)
 	return result;
 }
 
+/* map @pte_count pfns starting at @addr to the PTEs at @pt_addr + @pt_offset,
+ * and set up @pde_count PDEs starting at @pd_start */
+
+static void radeon_vm_map_pfns(struct radeon_device *rdev,
+				uint64_t pt_addr, uint64_t pt_offset,
+				uint64_t addr, uint64_t pte_count,
+				uint64_t pd_start, uint32_t pde_count, uint32_t flags)
+{
+	if (pde_count == 0 && pte_count == 0)
+		return;
+
+	radeon_asic_vm_set_page(rdev, pt_addr + pt_offset, addr,
+				pte_count,
+				RADEON_GPU_PAGE_SIZE, flags);
+
+	radeon_asic_vm_set_page(rdev, pd_start, pt_addr,
+				pde_count,
+				RADEON_VM_PTE_COUNT * 8, RADEON_VM_PAGE_VALID);
+}
+
+int radeon_suballoc_pts(struct radeon_device *rdev, struct radeon_vm *vm, uint64_t start_pt, uint32_t count)
+{
+	uint32_t i;
+	int r;
+	struct radeon_vm *vm_evict;
+	struct radeon_pt *pt = &vm->vm_pts[start_pt], *pti;
+retry:
+	r = radeon_sa_bo_new(rdev, &rdev->vm_manager.sa_manager,
+			     &pt->bo,
+			     RADEON_VM_PTE_COUNT * 8 * count,
+			     RADEON_GPU_PAGE_SIZE, false);
+
+	if (r == -ENOMEM) {
+		if (list_empty(&rdev->vm_manager.lru_vm)) {
+			DRM_ERROR("cannot allocate driver page table "
+				  "for vmid = %d\n", vm->id);
+			return r;
+		}
+
+		vm_evict = list_first_entry(&rdev->vm_manager.lru_vm,
+					    struct radeon_vm, list);
+
+		mutex_lock(&vm_evict->mutex);
+		radeon_vm_free_pt(rdev, vm_evict);
+		mutex_unlock(&vm_evict->mutex);
+
+		DRM_INFO("ran out of SA memory for PT, trying to free LRU vm id = %d\n", vm_evict->id);
+
+		goto retry;
+	}
+
+	pt->gpu_addr = radeon_sa_bo_gpu_addr(pt->bo);
+
+	for (i = 1; i < count; i++) {
+		pti = &vm->vm_pts[start_pt + i];
+		pti->bo = NULL;
+		pti->gpu_addr = pt->gpu_addr + i * RADEON_VM_PTE_COUNT * 8;
+	}
+
+	return 0;
+}
+
+
 /**
  * radeon_vm_bo_update_pte - map a bo into the vm page table
  *
@@ -886,10 +973,18 @@  int radeon_vm_bo_update_pte(struct radeon_device *rdev,
 	struct radeon_ring *ring = &rdev->ring[ridx];
 	struct radeon_semaphore *sem = NULL;
 	struct radeon_bo_va *bo_va;
-	unsigned nptes, npdes, ndw;
-	uint64_t pe, addr;
+	struct radeon_pt *pt;
+	unsigned nptes, npdes, ndw, count;
+	uint64_t addr;
 	uint64_t pfn;
+	uint32_t pfns_to_pt_edge, pfns_to_end;
 	int r;
+	uint64_t mem_pfn_offset;
+	uint64_t pfn_idx, last_pfn, pde_num, pte_num;
+	uint64_t pfn_map_start, pde_map_start, pte_map_start, pde_map_count, pte_map_count;
+	uint64_t prev_gpu_addr;
+	char need_alloc, need_map;
+
 
 	/* nothing to do if vm isn't bound */
 	if (vm->sa_bo == NULL)
@@ -971,22 +1066,159 @@  int radeon_vm_bo_update_pte(struct radeon_device *rdev,
 		radeon_fence_note_sync(vm->fence, ridx);
 	}
 
-	/* update page table entries */
-	pe = vm->pd_gpu_addr;
-	pe += radeon_vm_directory_size(rdev);
-	pe += (bo_va->soffset / RADEON_GPU_PAGE_SIZE) * 8;
+	pfn_idx = pfn;
+	last_pfn = pfn_idx + nptes;
+
+	pfn_map_start = 0;
+	pde_map_start = pfn_idx / RADEON_VM_PTE_COUNT;
+	pte_map_start = pfn_idx % RADEON_VM_PTE_COUNT;
+	pde_map_count = 0;
+	pte_map_count = 0;
+	pte_num = pfn_idx % RADEON_VM_PTE_COUNT;
+	pde_num = pfn_idx / RADEON_VM_PTE_COUNT;
+	pt = &vm->vm_pts[pde_num];
+	prev_gpu_addr = 0;
+
+	need_alloc = 0;
+	need_map = 0;
+
+	for (mem_pfn_offset = 0; mem_pfn_offset < nptes;) {
+		pfns_to_end = last_pfn - pfn_idx;
+		pfns_to_pt_edge = RADEON_VM_PTE_COUNT -
+			(pfn_idx % RADEON_VM_PTE_COUNT);
+
+		count = pfns_to_pt_edge < pfns_to_end ?
+			pfns_to_pt_edge : pfns_to_end;
+
+		pde_num = pfn_idx / RADEON_VM_PTE_COUNT;
+		pte_num = pfn_idx % RADEON_VM_PTE_COUNT;
+
+		pt = &vm->vm_pts[pde_num];
+
+		if (pt->gpu_addr == 0 && prev_gpu_addr == -1) {
+			/* case 1 */
+			/* pt for current pfn_idx is unmapped */
+			/* previous ptes are unmapped */
+			need_alloc = 1;
+			need_map = 1;
+		}
+		else if (pt->gpu_addr != 0 && prev_gpu_addr != -1) {
+			/* case 4 */
+			/* pt for current pfn_idx is mapped */
+			/* previous ptes are mapped */
+
+			if (pt->gpu_addr != prev_gpu_addr + RADEON_VM_PTE_COUNT * 8) {
+				/* current pt is not contiguous with previous
+				   one */
+				/* flush prior pts */
+
+				radeon_vm_map_pfns(
+					rdev, vm->vm_pts[pde_map_start].gpu_addr,
+					pte_map_start * 8,
+					addr + pfn_map_start * RADEON_GPU_PAGE_SIZE,
+					pte_map_count,
+					vm->pd_gpu_addr + pde_map_start * 8,
+					pde_map_count,
+					bo_va->flags);
+
+				pfn_map_start = mem_pfn_offset;
+				pde_map_start = pfn_idx / RADEON_VM_PTE_COUNT;
+				pte_map_start = pfn_idx % RADEON_VM_PTE_COUNT;
+
+				pde_map_count = 0;
+				pte_map_count = 0;
+			}
+
+			prev_gpu_addr = pt->gpu_addr;
+
+			need_alloc = 0;
+			need_map = 1;
+		}
+		else if (pt->gpu_addr == 0 && prev_gpu_addr != -1) {
+			/* case 2 */
+			/* pt for current pfn_idx is unmapped */
+			/* previous ptes are mapped */
 
-	radeon_asic_vm_set_page(rdev, pe, addr, nptes,
-				RADEON_GPU_PAGE_SIZE, bo_va->flags);
+			/* setup prior pdes & ptes here */
 
-	/* update page directory entries */
-	addr = pe;
+			radeon_vm_map_pfns(
+				rdev, vm->vm_pts[pde_map_start].gpu_addr,
+				pte_map_start * 8,
+				addr + pfn_map_start * RADEON_GPU_PAGE_SIZE,
+				pte_map_count,
+				vm->pd_gpu_addr + pde_map_start * 8,
+				pde_map_count,
+				bo_va->flags);
 
-	pe = vm->pd_gpu_addr;
-	pe += ((bo_va->soffset / RADEON_GPU_PAGE_SIZE) >> RADEON_VM_BLOCK_SIZE) * 8;
+			pfn_map_start = mem_pfn_offset;
+			pde_map_start = pfn_idx / RADEON_VM_PTE_COUNT;
+			pte_map_start = pfn_idx % RADEON_VM_PTE_COUNT;
 
-	radeon_asic_vm_set_page(rdev, pe, addr, npdes,
-				RADEON_VM_PTE_COUNT * 8, RADEON_VM_PAGE_VALID);
+			pde_map_count = 0;
+			pte_map_count = 0;
+
+			prev_gpu_addr = -1;
+
+			need_alloc = 1;
+			need_map = 1;
+		}
+		else if (pt->gpu_addr != 0 && prev_gpu_addr == -1) {
+			/* case 3 */
+			/* pt for current pfn_idx is mapped */
+			/* previous ptes are unmapped */
+
+			/* map prior pfns if there are any */
+			if (pfn_map_start < mem_pfn_offset) {
+				radeon_suballoc_pts(rdev, vm,
+						    pde_map_start, pde_map_count);
+				radeon_vm_map_pfns(
+					rdev, vm->vm_pts[pde_map_start].gpu_addr,
+					pte_map_start * 8,
+					addr + pfn_map_start * RADEON_GPU_PAGE_SIZE,
+					pte_map_count,
+					vm->pd_gpu_addr + pde_map_start * 8,
+					pde_map_count,
+					bo_va->flags);
+			}
+
+			pfn_map_start = mem_pfn_offset;
+			pde_map_start = pfn_idx / RADEON_VM_PTE_COUNT;
+			pte_map_start = pfn_idx % RADEON_VM_PTE_COUNT;
+
+			pde_map_count = 0;
+			pte_map_count = 0;
+
+			prev_gpu_addr = pt->gpu_addr;
+
+			need_alloc = 0;
+			need_map = 1;
+
+		}
+
+		pde_map_count++;
+		pte_map_count += count;
+
+		pfn_idx += count;
+		mem_pfn_offset += count;
+	}
+
+	if (need_alloc) {
+		radeon_suballoc_pts(rdev, vm, pde_map_start, pde_map_count);
+	}
+
+	if (need_map) {
+		if (vm->vm_pts[pde_map_start].gpu_addr == 0)
+			DRM_ERROR("gpu_addr == 0, something is wrong\n");
+
+		radeon_vm_map_pfns(
+			rdev, vm->vm_pts[pde_map_start].gpu_addr,
+			pte_map_start * 8,
+			addr + pfn_map_start * RADEON_GPU_PAGE_SIZE,
+			pte_map_count,
+			vm->pd_gpu_addr + pde_map_start * 8,
+			pde_map_count,
+			bo_va->flags);
+	}
 
 	radeon_fence_unref(&vm->fence);
 	r = radeon_fence_emit(rdev, &vm->fence, ridx);
@@ -997,6 +1229,7 @@  int radeon_vm_bo_update_pte(struct radeon_device *rdev,
 	radeon_ring_unlock_commit(rdev, ring);
 	radeon_semaphore_free(rdev, &sem, vm->fence);
 	radeon_fence_unref(&vm->last_flush);
+
 	return 0;
 }