diff mbox

[v6,05/32] drm/i915: Track GEN6 page table usage

Message ID 87bnkg8y11.fsf@gaia.fi.intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Mika Kuoppala Feb. 26, 2015, 3:58 p.m. UTC
Michel Thierry <michel.thierry@intel.com> writes:

> From: Ben Widawsky <benjamin.widawsky@intel.com>
>
> Instead of implementing the full tracking + dynamic allocation, this
> patch does a bit less than half of the work, by tracking and warning on
> unexpected conditions. The tracking itself follows which PTEs within a
> page table are currently being used for objects. The next patch will
> modify this to actually allocate the page tables only when necessary.
>
> With the current patch there isn't much in the way of making a gen
> agnostic range allocation function. However, in the next patch we'll add
> more specificity which makes having separate functions a bit easier to
> manage.
>
> One important change introduced here is that DMA mappings are
> created/destroyed at the same page directories/tables are
> allocated/deallocated.
>
> Notice that aliasing PPGTT is not managed here. The patch which actually
> begins dynamic allocation/teardown explains the reasoning for this.
>
> v2: s/pdp.page_directory/pdp.page_directorys
> Make a scratch page allocation helper
>
> v3: Rebase and expand commit message.
>
> v4: Allocate required pagetables only when it is needed, _bind_to_vm
> instead of bind_vma (Daniel).
>
> v5: Rebased to remove the unnecessary noise in the diff, also:
>  - PDE mask is GEN agnostic, renamed GEN6_PDE_MASK to I915_PDE_MASK.
>  - Removed unnecessary checks in gen6_alloc_va_range.
>  - Changed map/unmap_px_single macros to use dma functions directly and
>    be part of a static inline function instead.
>  - Moved drm_device plumbing through page tables operation to its own
>    patch.
>  - Moved allocate/teardown_va_range calls until they are fully
>    implemented (in subsequent patch).
>  - Merged pt and scratch_pt unmap_and_free path.
>  - Moved scratch page allocator helper to the patch that will use it.
>
> v6: Reduce complexity by not tearing down pagetables dynamically, the
> same can be achieved while freeing empty vms. (Daniel)
>
> v7: s/i915_dma_map_px_single/i915_dma_map_single
> s/gen6_write_pdes/gen6_write_pde
> Prevent a NULL case when only GGTT is available. (Mika)
>
> v8: Rebased after s/page_tables/page_table/.
>
> Cc: Daniel Vetter <daniel@ffwll.ch>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
> Signed-off-by: Michel Thierry <michel.thierry@intel.com> (v3+)
> ---
>  drivers/gpu/drm/i915/i915_gem_gtt.c | 198 +++++++++++++++++++++++++-----------
>  drivers/gpu/drm/i915/i915_gem_gtt.h |  75 ++++++++++++++
>  2 files changed, 211 insertions(+), 62 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index e05488e..f9354c7 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -278,29 +278,88 @@ static gen6_gtt_pte_t iris_pte_encode(dma_addr_t addr,
>  	return pte;
>  }
>  
> -static void unmap_and_free_pt(struct i915_page_table_entry *pt, struct drm_device *dev)
> +#define i915_dma_unmap_single(px, dev) \
> +	__i915_dma_unmap_single((px)->daddr, dev)
> +
> +static inline void __i915_dma_unmap_single(dma_addr_t daddr,
> +					struct drm_device *dev)
> +{
> +	struct device *device = &dev->pdev->dev;
> +
> +	dma_unmap_page(device, daddr, 4096, PCI_DMA_BIDIRECTIONAL);
> +}
> +
> +/**
> + * i915_dma_map_single() - Create a dma mapping for a page table/dir/etc.
> + * @px:	Page table/dir/etc to get a DMA map for
> + * @dev:	drm device
> + *
> + * Page table allocations are unified across all gens. They always require a
> + * single 4k allocation, as well as a DMA mapping. If we keep the structs
> + * symmetric here, the simple macro covers us for every page table type.
> + *
> + * Return: 0 if success.
> + */
> +#define i915_dma_map_single(px, dev) \
> +	i915_dma_map_page_single((px)->page, (dev), &(px)->daddr)
> +
> +static inline int i915_dma_map_page_single(struct page *page,
> +					   struct drm_device *dev,
> +					   dma_addr_t *daddr)
> +{
> +	struct device *device = &dev->pdev->dev;
> +
> +	*daddr = dma_map_page(device, page, 0, 4096, PCI_DMA_BIDIRECTIONAL);
> +	return dma_mapping_error(device, *daddr);
> +}
> +
> +static void unmap_and_free_pt(struct i915_page_table_entry *pt,
> +			       struct drm_device *dev)
>  {
>  	if (WARN_ON(!pt->page))
>  		return;
> +
> +	i915_dma_unmap_single(pt, dev);
>  	__free_page(pt->page);
> +	kfree(pt->used_ptes);
>  	kfree(pt);
>  }
>  
>  static struct i915_page_table_entry *alloc_pt_single(struct drm_device *dev)
>  {
>  	struct i915_page_table_entry *pt;
> +	const size_t count = INTEL_INFO(dev)->gen >= 8 ?
> +		GEN8_PTES_PER_PAGE : I915_PPGTT_PT_ENTRIES;
> +	int ret = -ENOMEM;
>  
>  	pt = kzalloc(sizeof(*pt), GFP_KERNEL);
>  	if (!pt)
>  		return ERR_PTR(-ENOMEM);
>  
> +	pt->used_ptes = kcalloc(BITS_TO_LONGS(count), sizeof(*pt->used_ptes),
> +				GFP_KERNEL);
> +
> +	if (!pt->used_ptes)
> +		goto fail_bitmap;
> +
>  	pt->page = alloc_page(GFP_KERNEL | __GFP_ZERO);
> -	if (!pt->page) {
> -		kfree(pt);
> -		return ERR_PTR(-ENOMEM);
> -	}
> +	if (!pt->page)
> +		goto fail_page;
> +
> +	ret = i915_dma_map_single(pt, dev);
> +	if (ret)
> +		goto fail_dma;
>  
>  	return pt;
> +
> +fail_dma:
> +	__free_page(pt->page);
> +fail_page:
> +	kfree(pt->used_ptes);
> +fail_bitmap:
> +	kfree(pt);
> +
> +	return ERR_PTR(ret);
>  }
>  
>  /**
> @@ -838,26 +897,35 @@ static void gen6_dump_ppgtt(struct i915_hw_ppgtt *ppgtt, struct seq_file *m)
>  	}
>  }
>  
> -static void gen6_write_pdes(struct i915_hw_ppgtt *ppgtt)
> +/* Write pde (index) from the page directory @pd to the page table @pt */
> +static void gen6_write_pde(struct i915_page_directory_entry *pd,
> +			    const int pde, struct i915_page_table_entry *pt)
>  {
> -	struct drm_i915_private *dev_priv = ppgtt->base.dev->dev_private;
> -	gen6_gtt_pte_t __iomem *pd_addr;
> -	uint32_t pd_entry;
> -	int i;
> +	/* Caller needs to make sure the write completes if necessary */
> +	struct i915_hw_ppgtt *ppgtt =
> +		container_of(pd, struct i915_hw_ppgtt, pd);
> +	u32 pd_entry;
>  
> -	WARN_ON(ppgtt->pd.pd_offset & 0x3f);
> -	pd_addr = (gen6_gtt_pte_t __iomem*)dev_priv->gtt.gsm +
> -		ppgtt->pd.pd_offset / sizeof(gen6_gtt_pte_t);
> -	for (i = 0; i < ppgtt->num_pd_entries; i++) {
> -		dma_addr_t pt_addr;
> +	pd_entry = GEN6_PDE_ADDR_ENCODE(pt->daddr);
> +	pd_entry |= GEN6_PDE_VALID;
>  
> -		pt_addr = ppgtt->pd.page_table[i]->daddr;
> -		pd_entry = GEN6_PDE_ADDR_ENCODE(pt_addr);
> -		pd_entry |= GEN6_PDE_VALID;
> +	writel(pd_entry, ppgtt->pd_addr + pde);
> +}
>  
> -		writel(pd_entry, pd_addr + i);
> -	}
> -	readl(pd_addr);
> +/* Write all the page tables found in the ppgtt structure to incrementing page
> + * directories. */
> +static void gen6_write_page_range(struct drm_i915_private *dev_priv,
> +				struct i915_page_directory_entry *pd, uint32_t start, uint32_t length)
> +{
> +	struct i915_page_table_entry *pt;
> +	uint32_t pde, temp;
> +
> +	gen6_for_each_pde(pt, pd, start, length, temp, pde)
> +		gen6_write_pde(pd, pde, pt);
> +
> +	/* Make sure write is complete before other code can use this page
> +	 * table. Also require for WC mapped PTEs */
> +	readl(dev_priv->gtt.gsm);
>  }
>  
>  static uint32_t get_pd_offset(struct i915_hw_ppgtt *ppgtt)
> @@ -1083,6 +1151,28 @@ static void gen6_ppgtt_unmap_pages(struct i915_hw_ppgtt *ppgtt)
>  			       4096, PCI_DMA_BIDIRECTIONAL);
>  }
>  
> +static int gen6_alloc_va_range(struct i915_address_space *vm,
> +			       uint64_t start, uint64_t length)
> +{
> +	struct i915_hw_ppgtt *ppgtt =
> +				container_of(vm, struct i915_hw_ppgtt, base);
> +	struct i915_page_table_entry *pt;
> +	uint32_t pde, temp;
> +
> +	gen6_for_each_pde(pt, &ppgtt->pd, start, length, temp, pde) {
> +		DECLARE_BITMAP(tmp_bitmap, I915_PPGTT_PT_ENTRIES);
> +
> +		bitmap_zero(tmp_bitmap, I915_PPGTT_PT_ENTRIES);
> +		bitmap_set(tmp_bitmap, gen6_pte_index(start),
> +			   gen6_pte_count(start, length));
> +
> +		bitmap_or(pt->used_ptes, pt->used_ptes, tmp_bitmap,
> +				I915_PPGTT_PT_ENTRIES);
> +	}
> +
> +	return 0;
> +}
> +
>  static void gen6_ppgtt_free(struct i915_hw_ppgtt *ppgtt)
>  {
>  	int i;
> @@ -1129,20 +1219,24 @@ alloc:
>  					       0, dev_priv->gtt.base.total,
>  					       0);
>  		if (ret)
> -			return ret;
> +			goto err_out;
>  
>  		retried = true;
>  		goto alloc;
>  	}
>  
>  	if (ret)
> -		return ret;
> +		goto err_out;
> +
>  
>  	if (ppgtt->node.start < dev_priv->gtt.mappable_end)
>  		DRM_DEBUG("Forced to use aperture for PDEs\n");
>  
>  	ppgtt->num_pd_entries = GEN6_PPGTT_PD_ENTRIES;
>  	return 0;
> +
> +err_out:
> +	return ret;
>  }
>  
>  static int gen6_ppgtt_alloc(struct i915_hw_ppgtt *ppgtt)
> @@ -1164,30 +1258,6 @@ static int gen6_ppgtt_alloc(struct i915_hw_ppgtt *ppgtt)
>  	return 0;
>  }
>  
> -static int gen6_ppgtt_setup_page_tables(struct i915_hw_ppgtt *ppgtt)
> -{
> -	struct drm_device *dev = ppgtt->base.dev;
> -	int i;
> -
> -	for (i = 0; i < ppgtt->num_pd_entries; i++) {
> -		struct page *page;
> -		dma_addr_t pt_addr;
> -
> -		page = ppgtt->pd.page_table[i]->page;
> -		pt_addr = pci_map_page(dev->pdev, page, 0, 4096,
> -				       PCI_DMA_BIDIRECTIONAL);
> -
> -		if (pci_dma_mapping_error(dev->pdev, pt_addr)) {
> -			gen6_ppgtt_unmap_pages(ppgtt);
> -			return -EIO;
> -		}
> -
> -		ppgtt->pd.page_table[i]->daddr = pt_addr;
> -	}
> -
> -	return 0;
> -}
> -
>  static int gen6_ppgtt_init(struct i915_hw_ppgtt *ppgtt)
>  {
>  	struct drm_device *dev = ppgtt->base.dev;
> @@ -1211,12 +1281,7 @@ static int gen6_ppgtt_init(struct i915_hw_ppgtt *ppgtt)
>  	if (ret)
>  		return ret;
>  
> -	ret = gen6_ppgtt_setup_page_tables(ppgtt);
> -	if (ret) {
> -		gen6_ppgtt_free(ppgtt);
> -		return ret;
> -	}
> -
> +	ppgtt->base.allocate_va_range = gen6_alloc_va_range;
>  	ppgtt->base.clear_range = gen6_ppgtt_clear_range;
>  	ppgtt->base.insert_entries = gen6_ppgtt_insert_entries;
>  	ppgtt->base.cleanup = gen6_ppgtt_cleanup;
> @@ -1227,13 +1292,17 @@ static int gen6_ppgtt_init(struct i915_hw_ppgtt *ppgtt)
>  	ppgtt->pd.pd_offset =
>  		ppgtt->node.start / PAGE_SIZE * sizeof(gen6_gtt_pte_t);
>  
> +	ppgtt->pd_addr = (gen6_gtt_pte_t __iomem *)dev_priv->gtt.gsm +
> +		ppgtt->pd.pd_offset / sizeof(gen6_gtt_pte_t);
> +
>  	ppgtt->base.clear_range(&ppgtt->base, 0, ppgtt->base.total, true);
>  
> +	gen6_write_page_range(dev_priv, &ppgtt->pd, 0, ppgtt->base.total);
> +
>  	DRM_DEBUG_DRIVER("Allocated pde space (%ldM) at GTT entry: %lx\n",
>  			 ppgtt->node.size >> 20,
>  			 ppgtt->node.start / PAGE_SIZE);
>  
> -	gen6_write_pdes(ppgtt);
>  	DRM_DEBUG("Adding PPGTT at offset %x\n",
>  		  ppgtt->pd.pd_offset << 10);
>  
> @@ -1504,15 +1573,20 @@ void i915_gem_restore_gtt_mappings(struct drm_device *dev)
>  		return;
>  	}
>  
> -	list_for_each_entry(vm, &dev_priv->vm_list, global_link) {
> -		/* TODO: Perhaps it shouldn't be gen6 specific */
> -		if (i915_is_ggtt(vm)) {
> -			if (dev_priv->mm.aliasing_ppgtt)
> -				gen6_write_pdes(dev_priv->mm.aliasing_ppgtt);
> -			continue;
> -		}
> +	if (USES_PPGTT(dev)) {
> +		list_for_each_entry(vm, &dev_priv->vm_list, global_link) {
> +			/* TODO: Perhaps it shouldn't be gen6 specific */
> +
> +			struct i915_hw_ppgtt *ppgtt =
> +					container_of(vm, struct i915_hw_ppgtt,
> +						     base);
>  
> -		gen6_write_pdes(container_of(vm, struct i915_hw_ppgtt, base));
> +			if (i915_is_ggtt(vm))
> +				ppgtt = dev_priv->mm.aliasing_ppgtt;
> +
> +			gen6_write_page_range(dev_priv, &ppgtt->pd, 0,
> +					      ppgtt->num_pd_entries);
> +		}
>  	}
>  
>  	i915_ggtt_flush(dev_priv);
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
> index c9e93f5..bf0e380 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
> @@ -54,7 +54,10 @@ typedef gen8_gtt_pte_t gen8_ppgtt_pde_t;
>  #define GEN6_PPGTT_PD_ENTRIES		512
>  #define GEN6_PD_SIZE			(GEN6_PPGTT_PD_ENTRIES * PAGE_SIZE)
>  #define GEN6_PD_ALIGN			(PAGE_SIZE * 16)
> +#define GEN6_PDE_SHIFT			22
>  #define GEN6_PDE_VALID			(1 << 0)
> +#define I915_PDE_MASK			(GEN6_PPGTT_PD_ENTRIES-1)
> +#define NUM_PTE(pde_shift)		(1 << (pde_shift - PAGE_SHIFT))
>  
>  #define GEN7_PTE_CACHE_L3_LLC		(3 << 1)
>  
> @@ -190,6 +193,8 @@ struct i915_vma {
>  struct i915_page_table_entry {
>  	struct page *page;
>  	dma_addr_t daddr;
> +
> +	unsigned long *used_ptes;
>  };
>  
>  struct i915_page_directory_entry {
> @@ -246,6 +251,9 @@ struct i915_address_space {
>  	gen6_gtt_pte_t (*pte_encode)(dma_addr_t addr,
>  				     enum i915_cache_level level,
>  				     bool valid, u32 flags); /* Create a valid PTE */
> +	int (*allocate_va_range)(struct i915_address_space *vm,
> +				 uint64_t start,
> +				 uint64_t length);
>  	void (*clear_range)(struct i915_address_space *vm,
>  			    uint64_t start,
>  			    uint64_t length,
> @@ -298,12 +306,79 @@ struct i915_hw_ppgtt {
>  
>  	struct drm_i915_file_private *file_priv;
>  
> +	gen6_gtt_pte_t __iomem *pd_addr;
> +
>  	int (*enable)(struct i915_hw_ppgtt *ppgtt);
>  	int (*switch_mm)(struct i915_hw_ppgtt *ppgtt,
>  			 struct intel_engine_cs *ring);
>  	void (*debug_dump)(struct i915_hw_ppgtt *ppgtt, struct seq_file *m);
>  };
>  
> +/* For each pde iterates over every pde between from start until start + length.
> + * If start, and start+length are not perfectly divisible, the macro will round
> + * down, and up as needed. The macro modifies pde, start, and length. Dev is
> + * only used to differentiate shift values. Temp is temp.  On gen6/7, start = 0,
> + * and length = 2G effectively iterates over every PDE in the system. On gen8+
> + * it simply iterates over every page directory entry in a page directory.
> + *

There is nothing for gen8 in the macro yet, so the comment is a bit misleading.

> + * XXX: temp is not actually needed, but it saves doing the ALIGN operation.
> + */
> +#define gen6_for_each_pde(pt, pd, start, length, temp, iter) \
> +	for (iter = gen6_pde_index(start), pt = (pd)->page_table[iter]; \
> +	     length > 0 && iter < GEN6_PPGTT_PD_ENTRIES; \
> +	     pt = (pd)->page_table[++iter], \
> +	     temp = ALIGN(start+1, 1 << GEN6_PDE_SHIFT) - start, \
> +	     temp = min_t(unsigned, temp, length), \
> +	     start += temp, length -= temp)
> +
> +static inline uint32_t i915_pte_index(uint64_t address, uint32_t pde_shift)
> +{
> +	const uint32_t mask = NUM_PTE(pde_shift) - 1;
> +
> +	return (address >> PAGE_SHIFT) & mask;
> +}
> +
> +/* Helper to counts the number of PTEs within the given length. This count does
> +* not cross a page table boundary, so the max value would be
> +* I915_PPGTT_PT_ENTRIES for GEN6, and GEN8_PTES_PER_PAGE for GEN8.
> +*/
> +static inline size_t i915_pte_count(uint64_t addr, size_t length,
> +					uint32_t pde_shift)
> +{
> +	const uint64_t mask = ~((1 << pde_shift) - 1);
> +	uint64_t end;
> +
> +	BUG_ON(length == 0);
> +	BUG_ON(offset_in_page(addr|length));
> +
> +	end = addr + length;
> +
> +	if ((addr & mask) != (end & mask))
> +		return NUM_PTE(pde_shift) - i915_pte_index(addr, pde_shift);
> +
> +	return i915_pte_index(end, pde_shift) - i915_pte_index(addr, pde_shift);
> +}

After trying to figure out the reasoning for this i915_pte_count
and its role in the used-bitmap setup, I started to wonder why all
the complexity is here.

BUG_ON(offset_in_page(addr|length)) reveals that we can't be called
with anything but a page boundary, so the address parameter is
irrelevant here?

Then there is trickery with the pde_shifting. I tried to find some
generalization down the series to take advantage of this. Perhaps
I missed it?

For me it seems that we can replace this with simple:

static inline uint32_t i915_pte_count(uint64_t length, uint32_t
pte_len)
{
       return min_t(uint32_t, length / PAGE_SIZE, PAGE_SIZE / pte_len);
}

...and not lose anything.


On top of that, when trying to wrap my brain around the differences
between GEN6/7 and GEN8+, the following patch before this patch would
make things much easier to understand, at least for me:


With that, I think you could write generic gen_for_each_pde
more easily and just parametrize the gen6 variant and gen8 one
further in the series.

--Mika

> +static inline uint32_t i915_pde_index(uint64_t addr, uint32_t shift)
> +{
> +	return (addr >> shift) & I915_PDE_MASK;
> +}
> +
> +static inline uint32_t gen6_pte_index(uint32_t addr)
> +{
> +	return i915_pte_index(addr, GEN6_PDE_SHIFT);
> +}
> +
> +static inline size_t gen6_pte_count(uint32_t addr, uint32_t length)
> +{
> +	return i915_pte_count(addr, length, GEN6_PDE_SHIFT);
> +}
> +
> +static inline uint32_t gen6_pde_index(uint32_t addr)
> +{
> +	return i915_pde_index(addr, GEN6_PDE_SHIFT);
> +}
> +
>  int i915_gem_gtt_init(struct drm_device *dev);
>  void i915_gem_init_global_gtt(struct drm_device *dev);
>  void i915_global_gtt_cleanup(struct drm_device *dev);
> -- 
> 2.1.1
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

Comments

Mika Kuoppala March 10, 2015, 11:19 a.m. UTC | #1
Mika Kuoppala <mika.kuoppala@linux.intel.com> writes:

> Michel Thierry <michel.thierry@intel.com> writes:
>
>> From: Ben Widawsky <benjamin.widawsky@intel.com>
>>
>> Instead of implementing the full tracking + dynamic allocation, this
>> patch does a bit less than half of the work, by tracking and warning on
>> unexpected conditions. The tracking itself follows which PTEs within a
>> page table are currently being used for objects. The next patch will
>> modify this to actually allocate the page tables only when necessary.
>>
>> With the current patch there isn't much in the way of making a gen
>> agnostic range allocation function. However, in the next patch we'll add
>> more specificity which makes having separate functions a bit easier to
>> manage.
>>
>> One important change introduced here is that DMA mappings are
>> created/destroyed at the same page directories/tables are
>> allocated/deallocated.
>>
>> Notice that aliasing PPGTT is not managed here. The patch which actually
>> begins dynamic allocation/teardown explains the reasoning for this.
>>
>> v2: s/pdp.page_directory/pdp.page_directorys
>> Make a scratch page allocation helper
>>
>> v3: Rebase and expand commit message.
>>
>> v4: Allocate required pagetables only when it is needed, _bind_to_vm
>> instead of bind_vma (Daniel).
>>
>> v5: Rebased to remove the unnecessary noise in the diff, also:
>>  - PDE mask is GEN agnostic, renamed GEN6_PDE_MASK to I915_PDE_MASK.
>>  - Removed unnecessary checks in gen6_alloc_va_range.
>>  - Changed map/unmap_px_single macros to use dma functions directly and
>>    be part of a static inline function instead.
>>  - Moved drm_device plumbing through page tables operation to its own
>>    patch.
>>  - Moved allocate/teardown_va_range calls until they are fully
>>    implemented (in subsequent patch).
>>  - Merged pt and scratch_pt unmap_and_free path.
>>  - Moved scratch page allocator helper to the patch that will use it.
>>
>> v6: Reduce complexity by not tearing down pagetables dynamically, the
>> same can be achieved while freeing empty vms. (Daniel)
>>
>> v7: s/i915_dma_map_px_single/i915_dma_map_single
>> s/gen6_write_pdes/gen6_write_pde
>> Prevent a NULL case when only GGTT is available. (Mika)
>>
>> v8: Rebased after s/page_tables/page_table/.
>>
>> Cc: Daniel Vetter <daniel@ffwll.ch>
>> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
>> Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
>> Signed-off-by: Michel Thierry <michel.thierry@intel.com> (v3+)
>> ---
>>  drivers/gpu/drm/i915/i915_gem_gtt.c | 198 +++++++++++++++++++++++++-----------
>>  drivers/gpu/drm/i915/i915_gem_gtt.h |  75 ++++++++++++++
>>  2 files changed, 211 insertions(+), 62 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
>> index e05488e..f9354c7 100644
>> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
>> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
>> @@ -278,29 +278,88 @@ static gen6_gtt_pte_t iris_pte_encode(dma_addr_t addr,
>>  	return pte;
>>  }
>>  
>> -static void unmap_and_free_pt(struct i915_page_table_entry *pt, struct drm_device *dev)
>> +#define i915_dma_unmap_single(px, dev) \
>> +	__i915_dma_unmap_single((px)->daddr, dev)
>> +
>> +static inline void __i915_dma_unmap_single(dma_addr_t daddr,
>> +					struct drm_device *dev)
>> +{
>> +	struct device *device = &dev->pdev->dev;
>> +
>> +	dma_unmap_page(device, daddr, 4096, PCI_DMA_BIDIRECTIONAL);
>> +}
>> +
>> +/**
>> + * i915_dma_map_single() - Create a dma mapping for a page table/dir/etc.
>> + * @px:	Page table/dir/etc to get a DMA map for
>> + * @dev:	drm device
>> + *
>> + * Page table allocations are unified across all gens. They always require a
>> + * single 4k allocation, as well as a DMA mapping. If we keep the structs
>> + * symmetric here, the simple macro covers us for every page table type.
>> + *
>> + * Return: 0 if success.
>> + */
>> +#define i915_dma_map_single(px, dev) \
>> +	i915_dma_map_page_single((px)->page, (dev), &(px)->daddr)
>> +
>> +static inline int i915_dma_map_page_single(struct page *page,
>> +					   struct drm_device *dev,
>> +					   dma_addr_t *daddr)
>> +{
>> +	struct device *device = &dev->pdev->dev;
>> +
>> +	*daddr = dma_map_page(device, page, 0, 4096, PCI_DMA_BIDIRECTIONAL);
>> +	return dma_mapping_error(device, *daddr);
>> +}
>> +
>> +static void unmap_and_free_pt(struct i915_page_table_entry *pt,
>> +			       struct drm_device *dev)
>>  {
>>  	if (WARN_ON(!pt->page))
>>  		return;
>> +
>> +	i915_dma_unmap_single(pt, dev);
>>  	__free_page(pt->page);
>> +	kfree(pt->used_ptes);
>>  	kfree(pt);
>>  }
>>  
>>  static struct i915_page_table_entry *alloc_pt_single(struct drm_device *dev)
>>  {
>>  	struct i915_page_table_entry *pt;
>> +	const size_t count = INTEL_INFO(dev)->gen >= 8 ?
>> +		GEN8_PTES_PER_PAGE : I915_PPGTT_PT_ENTRIES;
>> +	int ret = -ENOMEM;
>>  
>>  	pt = kzalloc(sizeof(*pt), GFP_KERNEL);
>>  	if (!pt)
>>  		return ERR_PTR(-ENOMEM);
>>  
>> +	pt->used_ptes = kcalloc(BITS_TO_LONGS(count), sizeof(*pt->used_ptes),
>> +				GFP_KERNEL);
>> +
>> +	if (!pt->used_ptes)
>> +		goto fail_bitmap;
>> +
>>  	pt->page = alloc_page(GFP_KERNEL | __GFP_ZERO);
>> -	if (!pt->page) {
>> -		kfree(pt);
>> -		return ERR_PTR(-ENOMEM);
>> -	}
>> +	if (!pt->page)
>> +		goto fail_page;
>> +
>> +	ret = i915_dma_map_single(pt, dev);
>> +	if (ret)
>> +		goto fail_dma;
>>  
>>  	return pt;
>> +
>> +fail_dma:
>> +	__free_page(pt->page);
>> +fail_page:
>> +	kfree(pt->used_ptes);
>> +fail_bitmap:
>> +	kfree(pt);
>> +
>> +	return ERR_PTR(ret);
>>  }
>>  
>>  /**
>> @@ -838,26 +897,35 @@ static void gen6_dump_ppgtt(struct i915_hw_ppgtt *ppgtt, struct seq_file *m)
>>  	}
>>  }
>>  
>> -static void gen6_write_pdes(struct i915_hw_ppgtt *ppgtt)
>> +/* Write pde (index) from the page directory @pd to the page table @pt */
>> +static void gen6_write_pde(struct i915_page_directory_entry *pd,
>> +			    const int pde, struct i915_page_table_entry *pt)
>>  {
>> -	struct drm_i915_private *dev_priv = ppgtt->base.dev->dev_private;
>> -	gen6_gtt_pte_t __iomem *pd_addr;
>> -	uint32_t pd_entry;
>> -	int i;
>> +	/* Caller needs to make sure the write completes if necessary */
>> +	struct i915_hw_ppgtt *ppgtt =
>> +		container_of(pd, struct i915_hw_ppgtt, pd);
>> +	u32 pd_entry;
>>  
>> -	WARN_ON(ppgtt->pd.pd_offset & 0x3f);
>> -	pd_addr = (gen6_gtt_pte_t __iomem*)dev_priv->gtt.gsm +
>> -		ppgtt->pd.pd_offset / sizeof(gen6_gtt_pte_t);
>> -	for (i = 0; i < ppgtt->num_pd_entries; i++) {
>> -		dma_addr_t pt_addr;
>> +	pd_entry = GEN6_PDE_ADDR_ENCODE(pt->daddr);
>> +	pd_entry |= GEN6_PDE_VALID;
>>  
>> -		pt_addr = ppgtt->pd.page_table[i]->daddr;
>> -		pd_entry = GEN6_PDE_ADDR_ENCODE(pt_addr);
>> -		pd_entry |= GEN6_PDE_VALID;
>> +	writel(pd_entry, ppgtt->pd_addr + pde);
>> +}
>>  
>> -		writel(pd_entry, pd_addr + i);
>> -	}
>> -	readl(pd_addr);
>> +/* Write all the page tables found in the ppgtt structure to incrementing page
>> + * directories. */
>> +static void gen6_write_page_range(struct drm_i915_private *dev_priv,
>> +				struct i915_page_directory_entry *pd, uint32_t start, uint32_t length)
>> +{
>> +	struct i915_page_table_entry *pt;
>> +	uint32_t pde, temp;
>> +
>> +	gen6_for_each_pde(pt, pd, start, length, temp, pde)
>> +		gen6_write_pde(pd, pde, pt);
>> +
>> +	/* Make sure write is complete before other code can use this page
>> +	 * table. Also require for WC mapped PTEs */
>> +	readl(dev_priv->gtt.gsm);
>>  }
>>  
>>  static uint32_t get_pd_offset(struct i915_hw_ppgtt *ppgtt)
>> @@ -1083,6 +1151,28 @@ static void gen6_ppgtt_unmap_pages(struct i915_hw_ppgtt *ppgtt)
>>  			       4096, PCI_DMA_BIDIRECTIONAL);
>>  }
>>  
>> +static int gen6_alloc_va_range(struct i915_address_space *vm,
>> +			       uint64_t start, uint64_t length)
>> +{
>> +	struct i915_hw_ppgtt *ppgtt =
>> +				container_of(vm, struct i915_hw_ppgtt, base);
>> +	struct i915_page_table_entry *pt;
>> +	uint32_t pde, temp;
>> +
>> +	gen6_for_each_pde(pt, &ppgtt->pd, start, length, temp, pde) {
>> +		DECLARE_BITMAP(tmp_bitmap, I915_PPGTT_PT_ENTRIES);
>> +
>> +		bitmap_zero(tmp_bitmap, I915_PPGTT_PT_ENTRIES);
>> +		bitmap_set(tmp_bitmap, gen6_pte_index(start),
>> +			   gen6_pte_count(start, length));
>> +
>> +		bitmap_or(pt->used_ptes, pt->used_ptes, tmp_bitmap,
>> +				I915_PPGTT_PT_ENTRIES);
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>>  static void gen6_ppgtt_free(struct i915_hw_ppgtt *ppgtt)
>>  {
>>  	int i;
>> @@ -1129,20 +1219,24 @@ alloc:
>>  					       0, dev_priv->gtt.base.total,
>>  					       0);
>>  		if (ret)
>> -			return ret;
>> +			goto err_out;
>>  
>>  		retried = true;
>>  		goto alloc;
>>  	}
>>  
>>  	if (ret)
>> -		return ret;
>> +		goto err_out;
>> +
>>  
>>  	if (ppgtt->node.start < dev_priv->gtt.mappable_end)
>>  		DRM_DEBUG("Forced to use aperture for PDEs\n");
>>  
>>  	ppgtt->num_pd_entries = GEN6_PPGTT_PD_ENTRIES;
>>  	return 0;
>> +
>> +err_out:
>> +	return ret;
>>  }
>>  
>>  static int gen6_ppgtt_alloc(struct i915_hw_ppgtt *ppgtt)
>> @@ -1164,30 +1258,6 @@ static int gen6_ppgtt_alloc(struct i915_hw_ppgtt *ppgtt)
>>  	return 0;
>>  }
>>  
>> -static int gen6_ppgtt_setup_page_tables(struct i915_hw_ppgtt *ppgtt)
>> -{
>> -	struct drm_device *dev = ppgtt->base.dev;
>> -	int i;
>> -
>> -	for (i = 0; i < ppgtt->num_pd_entries; i++) {
>> -		struct page *page;
>> -		dma_addr_t pt_addr;
>> -
>> -		page = ppgtt->pd.page_table[i]->page;
>> -		pt_addr = pci_map_page(dev->pdev, page, 0, 4096,
>> -				       PCI_DMA_BIDIRECTIONAL);
>> -
>> -		if (pci_dma_mapping_error(dev->pdev, pt_addr)) {
>> -			gen6_ppgtt_unmap_pages(ppgtt);
>> -			return -EIO;
>> -		}
>> -
>> -		ppgtt->pd.page_table[i]->daddr = pt_addr;
>> -	}
>> -
>> -	return 0;
>> -}
>> -
>>  static int gen6_ppgtt_init(struct i915_hw_ppgtt *ppgtt)
>>  {
>>  	struct drm_device *dev = ppgtt->base.dev;
>> @@ -1211,12 +1281,7 @@ static int gen6_ppgtt_init(struct i915_hw_ppgtt *ppgtt)
>>  	if (ret)
>>  		return ret;
>>  
>> -	ret = gen6_ppgtt_setup_page_tables(ppgtt);
>> -	if (ret) {
>> -		gen6_ppgtt_free(ppgtt);
>> -		return ret;
>> -	}
>> -
>> +	ppgtt->base.allocate_va_range = gen6_alloc_va_range;
>>  	ppgtt->base.clear_range = gen6_ppgtt_clear_range;
>>  	ppgtt->base.insert_entries = gen6_ppgtt_insert_entries;
>>  	ppgtt->base.cleanup = gen6_ppgtt_cleanup;
>> @@ -1227,13 +1292,17 @@ static int gen6_ppgtt_init(struct i915_hw_ppgtt *ppgtt)
>>  	ppgtt->pd.pd_offset =
>>  		ppgtt->node.start / PAGE_SIZE * sizeof(gen6_gtt_pte_t);
>>  
>> +	ppgtt->pd_addr = (gen6_gtt_pte_t __iomem *)dev_priv->gtt.gsm +
>> +		ppgtt->pd.pd_offset / sizeof(gen6_gtt_pte_t);
>> +
>>  	ppgtt->base.clear_range(&ppgtt->base, 0, ppgtt->base.total, true);
>>  
>> +	gen6_write_page_range(dev_priv, &ppgtt->pd, 0, ppgtt->base.total);
>> +
>>  	DRM_DEBUG_DRIVER("Allocated pde space (%ldM) at GTT entry: %lx\n",
>>  			 ppgtt->node.size >> 20,
>>  			 ppgtt->node.start / PAGE_SIZE);
>>  
>> -	gen6_write_pdes(ppgtt);
>>  	DRM_DEBUG("Adding PPGTT at offset %x\n",
>>  		  ppgtt->pd.pd_offset << 10);
>>  
>> @@ -1504,15 +1573,20 @@ void i915_gem_restore_gtt_mappings(struct drm_device *dev)
>>  		return;
>>  	}
>>  
>> -	list_for_each_entry(vm, &dev_priv->vm_list, global_link) {
>> -		/* TODO: Perhaps it shouldn't be gen6 specific */
>> -		if (i915_is_ggtt(vm)) {
>> -			if (dev_priv->mm.aliasing_ppgtt)
>> -				gen6_write_pdes(dev_priv->mm.aliasing_ppgtt);
>> -			continue;
>> -		}
>> +	if (USES_PPGTT(dev)) {
>> +		list_for_each_entry(vm, &dev_priv->vm_list, global_link) {
>> +			/* TODO: Perhaps it shouldn't be gen6 specific */
>> +
>> +			struct i915_hw_ppgtt *ppgtt =
>> +					container_of(vm, struct i915_hw_ppgtt,
>> +						     base);
>>  
>> -		gen6_write_pdes(container_of(vm, struct i915_hw_ppgtt, base));
>> +			if (i915_is_ggtt(vm))
>> +				ppgtt = dev_priv->mm.aliasing_ppgtt;
>> +
>> +			gen6_write_page_range(dev_priv, &ppgtt->pd, 0,
>> +					      ppgtt->num_pd_entries);
>> +		}
>>  	}
>>  
>>  	i915_ggtt_flush(dev_priv);
>> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
>> index c9e93f5..bf0e380 100644
>> --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
>> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
>> @@ -54,7 +54,10 @@ typedef gen8_gtt_pte_t gen8_ppgtt_pde_t;
>>  #define GEN6_PPGTT_PD_ENTRIES		512
>>  #define GEN6_PD_SIZE			(GEN6_PPGTT_PD_ENTRIES * PAGE_SIZE)
>>  #define GEN6_PD_ALIGN			(PAGE_SIZE * 16)
>> +#define GEN6_PDE_SHIFT			22
>>  #define GEN6_PDE_VALID			(1 << 0)
>> +#define I915_PDE_MASK			(GEN6_PPGTT_PD_ENTRIES-1)
>> +#define NUM_PTE(pde_shift)		(1 << (pde_shift - PAGE_SHIFT))
>>  
>>  #define GEN7_PTE_CACHE_L3_LLC		(3 << 1)
>>  
>> @@ -190,6 +193,8 @@ struct i915_vma {
>>  struct i915_page_table_entry {
>>  	struct page *page;
>>  	dma_addr_t daddr;
>> +
>> +	unsigned long *used_ptes;
>>  };
>>  
>>  struct i915_page_directory_entry {
>> @@ -246,6 +251,9 @@ struct i915_address_space {
>>  	gen6_gtt_pte_t (*pte_encode)(dma_addr_t addr,
>>  				     enum i915_cache_level level,
>>  				     bool valid, u32 flags); /* Create a valid PTE */
>> +	int (*allocate_va_range)(struct i915_address_space *vm,
>> +				 uint64_t start,
>> +				 uint64_t length);
>>  	void (*clear_range)(struct i915_address_space *vm,
>>  			    uint64_t start,
>>  			    uint64_t length,
>> @@ -298,12 +306,79 @@ struct i915_hw_ppgtt {
>>  
>>  	struct drm_i915_file_private *file_priv;
>>  
>> +	gen6_gtt_pte_t __iomem *pd_addr;
>> +
>>  	int (*enable)(struct i915_hw_ppgtt *ppgtt);
>>  	int (*switch_mm)(struct i915_hw_ppgtt *ppgtt,
>>  			 struct intel_engine_cs *ring);
>>  	void (*debug_dump)(struct i915_hw_ppgtt *ppgtt, struct seq_file *m);
>>  };
>>  
>> +/* For each pde iterates over every pde between from start until start + length.
>> + * If start, and start+length are not perfectly divisible, the macro will round
>> + * down, and up as needed. The macro modifies pde, start, and length. Dev is
>> + * only used to differentiate shift values. Temp is temp.  On gen6/7, start = 0,
>> + * and length = 2G effectively iterates over every PDE in the system. On gen8+
>> + * it simply iterates over every page directory entry in a page directory.
>> + *
>
> There is nothing for gen8 in the macro yet, so comment is a bit misleading.
>
>> + * XXX: temp is not actually needed, but it saves doing the ALIGN operation.
>> + */
>> +#define gen6_for_each_pde(pt, pd, start, length, temp, iter) \
>> +	for (iter = gen6_pde_index(start), pt = (pd)->page_table[iter]; \
>> +	     length > 0 && iter < GEN6_PPGTT_PD_ENTRIES; \
>> +	     pt = (pd)->page_table[++iter], \
>> +	     temp = ALIGN(start+1, 1 << GEN6_PDE_SHIFT) - start, \
>> +	     temp = min_t(unsigned, temp, length), \
>> +	     start += temp, length -= temp)
>> +
>> +static inline uint32_t i915_pte_index(uint64_t address, uint32_t pde_shift)
>> +{
>> +	const uint32_t mask = NUM_PTE(pde_shift) - 1;
>> +
>> +	return (address >> PAGE_SHIFT) & mask;
>> +}
>> +
>> +/* Helper to counts the number of PTEs within the given length. This count does
>> +* not cross a page table boundary, so the max value would be
>> +* I915_PPGTT_PT_ENTRIES for GEN6, and GEN8_PTES_PER_PAGE for GEN8.
>> +*/
>> +static inline size_t i915_pte_count(uint64_t addr, size_t length,
>> +					uint32_t pde_shift)
>> +{
>> +	const uint64_t mask = ~((1 << pde_shift) - 1);
>> +	uint64_t end;
>> +
>> +	BUG_ON(length == 0);
>> +	BUG_ON(offset_in_page(addr|length));
>> +
>> +	end = addr + length;
>> +
>> +	if ((addr & mask) != (end & mask))
>> +		return NUM_PTE(pde_shift) - i915_pte_index(addr, pde_shift);
>> +
>> +	return i915_pte_index(end, pde_shift) - i915_pte_index(addr, pde_shift);
>> +}
>
> After trying to figure out the reasoning for this i915_pte_count
> and it's role in the used bitmap setup, I started to wonder why all
> the complexity here.
>
> BUG_ON(offset_in_page(addr|length)) reveals that we can't be called
> with anything but a page boundary so the address parameter is
> irrelevant in here?
>
> Then there is trickery with the pde_shifting. I tried to find some
> generalization down the series to take advantage of this. Perhaps
> I missed it?
>
> For me it seems that we can replace this with simple:
>
> static inline uint32_t i915_pte_count(uint64_t length, uint32_t
> pte_len)
> {
>        return min_t(uint32_t, length / PAGE_SIZE, PAGE_SIZE / pte_len);
> }
>
> ...and not lose anything.

Oh, we lose a lot. The simplified version might work in the context of
this patch, but the macro would be broken. Michel explained it all to
me on IRC.

I apologize for this. Please ignore and keep the original i915_pte_count.

-Mika

> On top of that when trying to wrap my brain around the differences
> between GEN6/7 and GEN8+ the following patch before this patch would
> make things much easier to understand, atleast for me:
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
> index c9e93f5..c13f32f 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
> @@ -36,13 +36,13 @@
>  
>  struct drm_i915_file_private;
>  
> -typedef uint32_t gen6_gtt_pte_t;
> -typedef uint64_t gen8_gtt_pte_t;
> -typedef gen8_gtt_pte_t gen8_ppgtt_pde_t;
> +typedef uint32_t gen6_pte_t;
> +typedef uint64_t gen8_pte_t;
> +typedef uint64_t gen8_pde_t;
>  
>  #define gtt_total_entries(gtt) ((gtt).base.total >> PAGE_SHIFT)
>  
> -#define I915_PPGTT_PT_ENTRIES		(PAGE_SIZE / sizeof(gen6_gtt_pte_t))
> +
>  /* gen6-hsw has bit 11-4 for physical addr bit 39-32 */
>  #define GEN6_GTT_ADDR_ENCODE(addr)	((addr) | (((addr) >> 28) & 0xff0))
>  #define GEN6_PTE_ADDR_ENCODE(addr)	GEN6_GTT_ADDR_ENCODE(addr)
> @@ -51,8 +51,13 @@ typedef gen8_gtt_pte_t gen8_ppgtt_pde_t;
>  #define GEN6_PTE_UNCACHED		(1 << 1)
>  #define GEN6_PTE_VALID			(1 << 0)
>  
> -#define GEN6_PPGTT_PD_ENTRIES		512
> -#define GEN6_PD_SIZE			(GEN6_PPGTT_PD_ENTRIES * PAGE_SIZE)
> +#define I915_PTES(pte_len)		(PAGE_SIZE / (pte_len))
> +#define I915_PTE_MASK(pte_len)		(I915_PTES(pte_len) - 1)
> +#define I915_PDES			512
> +#define I915_PDE_MASK			(I915_PDES - 1)
> +
> +#define GEN6_PTES			I915_PTES(sizeof(gen6_pte_t))
> +#define GEN6_PD_SIZE		        (I915_PDES * PAGE_SIZE)
>  #define GEN6_PD_ALIGN			(PAGE_SIZE * 16)
>  #define GEN6_PDE_VALID			(1 << 0)
>  
> @@ -89,8 +94,7 @@ typedef gen8_gtt_pte_t gen8_ppgtt_pde_t;
>  #define GEN8_PTE_SHIFT			12
>  #define GEN8_PTE_MASK			0x1ff
>  #define GEN8_LEGACY_PDPES		4
> -#define GEN8_PTES_PER_PAGE		(PAGE_SIZE / sizeof(gen8_gtt_pte_t))
> -#define GEN8_PDES_PER_PAGE		(PAGE_SIZE / sizeof(gen8_ppgtt_pde_t))
> +#define GEN8_PTES			I915_PTES(sizeof(gen8_pte_t))
>  
>  #define PPAT_UNCACHED_INDEX		(_PAGE_PWT | _PAGE_PCD)
>  #define PPAT_CACHED_PDE_INDEX		0 /* WB LLC */
> @@ -199,7 +203,7 @@ struct i915_page_directory_entry {
>  		dma_addr_t daddr;
>  	};
>  
> -	struct i915_page_table_entry *page_table[GEN6_PPGTT_PD_ENTRIES]; /* PDEs */
> +	struct i915_page_table_entry *page_table[I915_PDES]; /* PDEs */
>  };
>  
>  struct i915_page_directory_pointer_entry {
> @@ -243,9 +247,9 @@ struct i915_address_space {
>  	struct list_head inactive_list;
>  
>  	/* FIXME: Need a more generic return type */
> -	gen6_gtt_pte_t (*pte_encode)(dma_addr_t addr,
> -				     enum i915_cache_level level,
> -				     bool valid, u32 flags); /* Create a valid PTE */
> +	gen6_pte_t (*pte_encode)(dma_addr_t addr,
> +				 enum i915_cache_level level,
> +				 bool valid, u32 flags); /* Create a valid PTE */
>  	void (*clear_range)(struct i915_address_space *vm,
>  			    uint64_t start,
>  			    uint64_t length,
> @@ -304,6 +308,21 @@ struct i915_hw_ppgtt {
>  	void (*debug_dump)(struct i915_hw_ppgtt *ppgtt, struct seq_file *m);
>  };
>  
> +static inline uint32_t i915_pte_index(uint64_t address, uint32_t pte_len)
> +{
> +	return (address >> PAGE_SHIFT) & I915_PTE_MASK(pte_len);
> +}
> +
> +static inline uint32_t i915_pte_count(uint64_t length, uint32_t pte_len)
> +{
> +	return min_t(uint32_t, length / PAGE_SIZE, I915_PTES(pte_len));
> +}
> +
> +static inline uint32_t i915_pde_index(uint64_t addr, uint32_t pde_shift)
> +{
> +	return (addr >> pde_shift) & I915_PDE_MASK;
> +}
> +
>  int i915_gem_gtt_init(struct drm_device *dev);
>  void i915_gem_init_global_gtt(struct drm_device *dev);
>  void i915_global_gtt_cleanup(struct drm_device *dev);
>
> With that, I think you could write generic gen_for_each_pde
> more easily and just parametrize the gen6 variant and gen8 one
> further in the series.
>
> --Mika
>
>> +static inline uint32_t i915_pde_index(uint64_t addr, uint32_t shift)
>> +{
>> +	return (addr >> shift) & I915_PDE_MASK;
>> +}
>> +
>> +static inline uint32_t gen6_pte_index(uint32_t addr)
>> +{
>> +	return i915_pte_index(addr, GEN6_PDE_SHIFT);
>> +}
>> +
>> +static inline size_t gen6_pte_count(uint32_t addr, uint32_t length)
>> +{
>> +	return i915_pte_count(addr, length, GEN6_PDE_SHIFT);
>> +}
>> +
>> +static inline uint32_t gen6_pde_index(uint32_t addr)
>> +{
>> +	return i915_pde_index(addr, GEN6_PDE_SHIFT);
>> +}
>> +
>>  int i915_gem_gtt_init(struct drm_device *dev);
>>  void i915_gem_init_global_gtt(struct drm_device *dev);
>>  void i915_global_gtt_cleanup(struct drm_device *dev);
>> -- 
>> 2.1.1
>>
>> _______________________________________________
>> Intel-gfx mailing list
>> Intel-gfx@lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
index c9e93f5..c13f32f 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.h
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
@@ -36,13 +36,13 @@ 
 
 struct drm_i915_file_private;
 
-typedef uint32_t gen6_gtt_pte_t;
-typedef uint64_t gen8_gtt_pte_t;
-typedef gen8_gtt_pte_t gen8_ppgtt_pde_t;
+typedef uint32_t gen6_pte_t;
+typedef uint64_t gen8_pte_t;
+typedef uint64_t gen8_pde_t;
 
 #define gtt_total_entries(gtt) ((gtt).base.total >> PAGE_SHIFT)
 
-#define I915_PPGTT_PT_ENTRIES		(PAGE_SIZE / sizeof(gen6_gtt_pte_t))
+
 /* gen6-hsw has bit 11-4 for physical addr bit 39-32 */
 #define GEN6_GTT_ADDR_ENCODE(addr)	((addr) | (((addr) >> 28) & 0xff0))
 #define GEN6_PTE_ADDR_ENCODE(addr)	GEN6_GTT_ADDR_ENCODE(addr)
@@ -51,8 +51,13 @@  typedef gen8_gtt_pte_t gen8_ppgtt_pde_t;
 #define GEN6_PTE_UNCACHED		(1 << 1)
 #define GEN6_PTE_VALID			(1 << 0)
 
-#define GEN6_PPGTT_PD_ENTRIES		512
-#define GEN6_PD_SIZE			(GEN6_PPGTT_PD_ENTRIES * PAGE_SIZE)
+#define I915_PTES(pte_len)		(PAGE_SIZE / (pte_len))
+#define I915_PTE_MASK(pte_len)		(I915_PTES(pte_len) - 1)
+#define I915_PDES			512
+#define I915_PDE_MASK			(I915_PDES - 1)
+
+#define GEN6_PTES			I915_PTES(sizeof(gen6_pte_t))
+#define GEN6_PD_SIZE		        (I915_PDES * PAGE_SIZE)
 #define GEN6_PD_ALIGN			(PAGE_SIZE * 16)
 #define GEN6_PDE_VALID			(1 << 0)
 
@@ -89,8 +94,7 @@  typedef gen8_gtt_pte_t gen8_ppgtt_pde_t;
 #define GEN8_PTE_SHIFT			12
 #define GEN8_PTE_MASK			0x1ff
 #define GEN8_LEGACY_PDPES		4
-#define GEN8_PTES_PER_PAGE		(PAGE_SIZE / sizeof(gen8_gtt_pte_t))
-#define GEN8_PDES_PER_PAGE		(PAGE_SIZE / sizeof(gen8_ppgtt_pde_t))
+#define GEN8_PTES			I915_PTES(sizeof(gen8_pte_t))
 
 #define PPAT_UNCACHED_INDEX		(_PAGE_PWT | _PAGE_PCD)
 #define PPAT_CACHED_PDE_INDEX		0 /* WB LLC */
@@ -199,7 +203,7 @@  struct i915_page_directory_entry {
 		dma_addr_t daddr;
 	};
 
-	struct i915_page_table_entry *page_table[GEN6_PPGTT_PD_ENTRIES]; /* PDEs */
+	struct i915_page_table_entry *page_table[I915_PDES]; /* PDEs */
 };
 
 struct i915_page_directory_pointer_entry {
@@ -243,9 +247,9 @@  struct i915_address_space {
 	struct list_head inactive_list;
 
 	/* FIXME: Need a more generic return type */
-	gen6_gtt_pte_t (*pte_encode)(dma_addr_t addr,
-				     enum i915_cache_level level,
-				     bool valid, u32 flags); /* Create a valid PTE */
+	gen6_pte_t (*pte_encode)(dma_addr_t addr,
+				 enum i915_cache_level level,
+				 bool valid, u32 flags); /* Create a valid PTE */
 	void (*clear_range)(struct i915_address_space *vm,
 			    uint64_t start,
 			    uint64_t length,
@@ -304,6 +308,21 @@  struct i915_hw_ppgtt {
 	void (*debug_dump)(struct i915_hw_ppgtt *ppgtt, struct seq_file *m);
 };
 
+static inline uint32_t i915_pte_index(uint64_t address, uint32_t pte_len)
+{
+	return (address >> PAGE_SHIFT) & I915_PTE_MASK(pte_len);
+}
+
+static inline uint32_t i915_pte_count(uint64_t length, uint32_t pte_len)
+{
+	return min_t(uint32_t, length / PAGE_SIZE, I915_PTES(pte_len));
+}
+
+static inline uint32_t i915_pde_index(uint64_t addr, uint32_t pde_shift)
+{
+	return (addr >> pde_shift) & I915_PDE_MASK;
+}
+
 int i915_gem_gtt_init(struct drm_device *dev);
 void i915_gem_init_global_gtt(struct drm_device *dev);
 void i915_global_gtt_cleanup(struct drm_device *dev);