diff mbox

[v2] drm/i915/gtt: Avoid calling kcalloc in a loop when allocating temp bitmaps

Message ID 1441040380-15684-1-git-send-email-michal.winiarski@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Michał Winiarski Aug. 31, 2015, 4:59 p.m. UTC
On each call to gen8_alloc_va_range_3lvl we're allocating temporary
bitmaps needed for error handling. Unfortunately, when we increase
address space size (48b ppgtt) we do additional (512 - 4) calls to
kcalloc, increasing latency between exec and actual start of execution
on the GPU. Let's just do a single kcalloc and setup proper offsets in
an array, we can also drop the size from free_gen8_temp_bitmaps since
it's no longer needed.

v2: Use GFP_TEMPORARY to make the allocations reclaimable.

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Michel Thierry <michel.thierry@intel.com>
Signed-off-by: Micha? Winiarski <michal.winiarski@intel.com>
---
 drivers/gpu/drm/i915/i915_gem_gtt.c | 39 ++++++++++++++++---------------------
 1 file changed, 17 insertions(+), 22 deletions(-)

Comments

Chris Wilson Aug. 31, 2015, 6:42 p.m. UTC | #1
On Mon, Aug 31, 2015 at 06:59:40PM +0200, Micha? Winiarski wrote:
> On each call to gen8_alloc_va_range_3lvl we're allocating temporary
> bitmaps needed for error handling. Unfortunately, when we increase
> address space size (48b ppgtt) we do additional (512 - 4) calls to
> kcalloc, increasing latency between exec and actual start of execution
> on the GPU. Let's just do a single kcalloc and setup proper offsets in
> an array, we can also drop the size from free_gen8_temp_bitmaps since
> it's no longer needed.
> 
> v2: Use GFP_TEMPORARY to make the allocations reclaimable.
> 
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Michel Thierry <michel.thierry@intel.com>
> Signed-off-by: Micha? Winiarski <michal.winiarski@intel.com>

Is there any reason why it remains a 2D array though? Looks like it can
be reduced to a single block. And why do we allocate a bitmap for the
whole address space rather than just the range being allocated?

While you may just answer the questions posed, this looks clumsy:

> +	*pts = kcalloc(pdpes * BITS_TO_LONGS(I915_PDES),
> +			sizeof(unsigned long), GFP_TEMPORARY);
> +	if (!*pts)
> +		goto err_out;
> +
> +	for (i = 0; i < pdpes; i++)
> +		pts[i] = *pts + i * BITS_TO_LONGS(I915_PDES);

i..e

for (i = 1; i < pdpes; i++)
	pts[i] = pts[0] + i* BITS_TO_LONGS(I915_PDES);

raises fewer questions.
-Chris
Michał Winiarski Sept. 1, 2015, 9:03 a.m. UTC | #2
On Mon, Aug 31, 2015 at 07:42:34PM +0100, Chris Wilson wrote:
> On Mon, Aug 31, 2015 at 06:59:40PM +0200, Micha? Winiarski wrote:
> > On each call to gen8_alloc_va_range_3lvl we're allocating temporary
> > bitmaps needed for error handling. Unfortunately, when we increase
> > address space size (48b ppgtt) we do additional (512 - 4) calls to
> > kcalloc, increasing latency between exec and actual start of execution
> > on the GPU. Let's just do a single kcalloc and setup proper offsets in
> > an array, we can also drop the size from free_gen8_temp_bitmaps since
> > it's no longer needed.
> > 
> > v2: Use GFP_TEMPORARY to make the allocations reclaimable.
> > 
> > Cc: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> > Cc: Michel Thierry <michel.thierry@intel.com>
> > Signed-off-by: Micha? Winiarski <michal.winiarski@intel.com>
> 
> Is there any reason why it remains a 2D array though? Looks like it can
> be reduced to a single block. And why do we allocate a bitmap for the
> whole address space rather than just the range being allocated?

No particular reason, let's try that.
It's not the whole address space... Quite possibly just 1/512 of the whole
address space, as the PUD is handled in gen8_alloc_va_range_4lvl. I guess it was
done to simplify... something, although I can't tell what at this point.
 
> While you may just answer the questions posed, this looks clumsy:

Yup - missed that, I was using temp variable before sending the patch.
-Micha?

> 
> > +	*pts = kcalloc(pdpes * BITS_TO_LONGS(I915_PDES),
> > +			sizeof(unsigned long), GFP_TEMPORARY);
> > +	if (!*pts)
> > +		goto err_out;
> > +
> > +	for (i = 0; i < pdpes; i++)
> > +		pts[i] = *pts + i * BITS_TO_LONGS(I915_PDES);
> 
> i..e
> 
> for (i = 1; i < pdpes; i++)
> 	pts[i] = pts[0] + i* BITS_TO_LONGS(I915_PDES);
> 
> raises fewer questions.
> -Chris
> 
> -- 
> Chris Wilson, Intel Open Source Technology Centre
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 4a76807..9270c47 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -1126,13 +1126,9 @@  unwind_out:
 }
 
 static void
-free_gen8_temp_bitmaps(unsigned long *new_pds, unsigned long **new_pts,
-		       uint32_t pdpes)
+free_gen8_temp_bitmaps(unsigned long *new_pds, unsigned long **new_pts)
 {
-	int i;
-
-	for (i = 0; i < pdpes; i++)
-		kfree(new_pts[i]);
+	kfree(*new_pts);
 	kfree(new_pts);
 	kfree(new_pds);
 }
@@ -1149,22 +1145,21 @@  int __must_check alloc_gen8_temp_bitmaps(unsigned long **new_pds,
 	unsigned long *pds;
 	unsigned long **pts;
 
-	pds = kcalloc(BITS_TO_LONGS(pdpes), sizeof(unsigned long), GFP_KERNEL);
+	pds = kcalloc(BITS_TO_LONGS(pdpes), sizeof(unsigned long), GFP_TEMPORARY);
 	if (!pds)
 		return -ENOMEM;
 
-	pts = kcalloc(pdpes, sizeof(unsigned long *), GFP_KERNEL);
-	if (!pts) {
-		kfree(pds);
-		return -ENOMEM;
-	}
+	pts = kcalloc(pdpes, sizeof(unsigned long *), GFP_TEMPORARY);
+	if (!pts)
+		goto err_out;
 
-	for (i = 0; i < pdpes; i++) {
-		pts[i] = kcalloc(BITS_TO_LONGS(I915_PDES),
-				 sizeof(unsigned long), GFP_KERNEL);
-		if (!pts[i])
-			goto err_out;
-	}
+	*pts = kcalloc(pdpes * BITS_TO_LONGS(I915_PDES),
+			sizeof(unsigned long), GFP_TEMPORARY);
+	if (!*pts)
+		goto err_out;
+
+	for (i = 0; i < pdpes; i++)
+		pts[i] = *pts + i * BITS_TO_LONGS(I915_PDES);
 
 	*new_pds = pds;
 	*new_pts = pts;
@@ -1172,7 +1167,7 @@  int __must_check alloc_gen8_temp_bitmaps(unsigned long **new_pds,
 	return 0;
 
 err_out:
-	free_gen8_temp_bitmaps(pds, pts, pdpes);
+	free_gen8_temp_bitmaps(pds, pts);
 	return -ENOMEM;
 }
 
@@ -1220,7 +1215,7 @@  static int gen8_alloc_va_range_3lvl(struct i915_address_space *vm,
 	ret = gen8_ppgtt_alloc_page_directories(vm, pdp, start, length,
 						new_page_dirs);
 	if (ret) {
-		free_gen8_temp_bitmaps(new_page_dirs, new_page_tables, pdpes);
+		free_gen8_temp_bitmaps(new_page_dirs, new_page_tables);
 		return ret;
 	}
 
@@ -1278,7 +1273,7 @@  static int gen8_alloc_va_range_3lvl(struct i915_address_space *vm,
 		gen8_setup_page_directory(ppgtt, pdp, pd, pdpe);
 	}
 
-	free_gen8_temp_bitmaps(new_page_dirs, new_page_tables, pdpes);
+	free_gen8_temp_bitmaps(new_page_dirs, new_page_tables);
 	mark_tlbs_dirty(ppgtt);
 	return 0;
 
@@ -1291,7 +1286,7 @@  err_out:
 	for_each_set_bit(pdpe, new_page_dirs, pdpes)
 		free_pd(dev, pdp->page_directory[pdpe]);
 
-	free_gen8_temp_bitmaps(new_page_dirs, new_page_tables, pdpes);
+	free_gen8_temp_bitmaps(new_page_dirs, new_page_tables);
 	mark_tlbs_dirty(ppgtt);
 	return ret;
 }