diff mbox

[06/10] drm/i915: Stop using AGP layer for GEN6+

Message ID 1350956055-3224-7-git-send-email-ben@bwidawsk.net (mailing list archive)
State New, archived
Headers show

Commit Message

Ben Widawsky Oct. 23, 2012, 1:34 a.m. UTC
As a quick hack we make the old intel_gtt structure mutable so we can
fool a bunch of the existing code which depends on elements in that data
structure. We can/should try to remove this in a subsequent patch.

This should preserve the old gtt init behavior which upon writing these
patches seems incorrect. The next patch will fix these things.

The one exception is VLV which doesn't have the preserved flush control
write behavior. Since we want to do that for all GEN6+ stuff, we'll
handle that in a later patch. Mainstream VLV support doesn't actually
exist yet anyway.

Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
---
 drivers/char/agp/intel-gtt.c               |   2 +-
 drivers/gpu/drm/i915/i915_dma.c            |  16 +--
 drivers/gpu/drm/i915/i915_drv.h            |  10 +-
 drivers/gpu/drm/i915/i915_gem.c            |  12 +-
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |   2 +-
 drivers/gpu/drm/i915/i915_gem_gtt.c        | 208 +++++++++++++++++++++++++++--
 drivers/gpu/drm/i915/i915_reg.h            |   6 +
 include/drm/intel-gtt.h                    |   3 +-
 8 files changed, 228 insertions(+), 31 deletions(-)

Comments

Chris Wilson Oct. 23, 2012, 9:59 a.m. UTC | #1
On Mon, 22 Oct 2012 18:34:11 -0700, Ben Widawsky <ben@bwidawsk.net> wrote:
> +/*
> + * Binds an object into the global gtt with the specified cache level. The object
> + * will be accessible to the GPU via commands whose operands reference offsets
> + * within the global GTT as well as accessible by the GPU through the GMADR
> + * mapped BAR (dev_priv->mm.gtt->gtt).
> + */
> +static void gen6_ggtt_bind_object(struct drm_i915_gem_object *obj,
> +				  enum i915_cache_level level)
> +{
> +	struct drm_device *dev = obj->base.dev;
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +	struct sg_table *st = obj->pages;
> +	struct scatterlist *sg = st->sgl;
> +	const int first_entry = obj->gtt_space->start >> PAGE_SHIFT;
> +	const int max_entries = dev_priv->mm.gtt->gtt_total_entries - first_entry;
> +	gtt_pte_t __iomem *gtt_entries = dev_priv->mm.gtt->gtt + first_entry;
> +	int unused, i = 0;
> +	unsigned int len, m = 0;
> +
> +	for_each_sg(st->sgl, sg, st->nents, unused) {
> +		len = sg_dma_len(sg) >> PAGE_SHIFT;
> +		for (m = 0; m < len; m++) {
> +			dma_addr_t addr = sg_dma_address(sg) + (m << PAGE_SHIFT);
> +			gtt_entries[i] = pte_encode(dev, addr, level);
> +			i++;
> +			if (WARN_ON(i > max_entries))
> +				goto out;
> +		}
> +	}
> +
> +out:
> +	/* XXX: This serves as a posting read preserving the way the old code
> +	 * works. It's not clear if this is strictly necessary or just voodoo
> +	 * based on what I've tried to gather from the docs.
> +	 */
> +	readl(&gtt_entries[i-1]);

It will be required until we replace the voodoo with more explicit mb().
-Chris
Ben Widawsky Oct. 23, 2012, 2:57 p.m. UTC | #2
On 2012-10-23 02:59, Chris Wilson wrote:
> On Mon, 22 Oct 2012 18:34:11 -0700, Ben Widawsky <ben@bwidawsk.net> 
> wrote:
>> +/*
>> + * Binds an object into the global gtt with the specified cache 
>> level. The object
>> + * will be accessible to the GPU via commands whose operands 
>> reference offsets
>> + * within the global GTT as well as accessible by the GPU through 
>> the GMADR
>> + * mapped BAR (dev_priv->mm.gtt->gtt).
>> + */
>> +static void gen6_ggtt_bind_object(struct drm_i915_gem_object *obj,
>> +				  enum i915_cache_level level)
>> +{
>> +	struct drm_device *dev = obj->base.dev;
>> +	struct drm_i915_private *dev_priv = dev->dev_private;
>> +	struct sg_table *st = obj->pages;
>> +	struct scatterlist *sg = st->sgl;
>> +	const int first_entry = obj->gtt_space->start >> PAGE_SHIFT;
>> +	const int max_entries = dev_priv->mm.gtt->gtt_total_entries - 
>> first_entry;
>> +	gtt_pte_t __iomem *gtt_entries = dev_priv->mm.gtt->gtt + 
>> first_entry;
>> +	int unused, i = 0;
>> +	unsigned int len, m = 0;
>> +
>> +	for_each_sg(st->sgl, sg, st->nents, unused) {
>> +		len = sg_dma_len(sg) >> PAGE_SHIFT;
>> +		for (m = 0; m < len; m++) {
>> +			dma_addr_t addr = sg_dma_address(sg) + (m << PAGE_SHIFT);
>> +			gtt_entries[i] = pte_encode(dev, addr, level);
>> +			i++;
>> +			if (WARN_ON(i > max_entries))
>> +				goto out;
>> +		}
>> +	}
>> +
>> +out:
>> +	/* XXX: This serves as a posting read preserving the way the old 
>> code
>> +	 * works. It's not clear if this is strictly necessary or just 
>> voodoo
>> +	 * based on what I've tried to gather from the docs.
>> +	 */
>> +	readl(&gtt_entries[i-1]);
>
> It will be required until we replace the voodoo with more explicit 
> mb().
> -Chris

Actually, after we introduce the FLSH_CNTL patch from Jesse/me later in 
the series, I think we just want a POSTING_READ on that register. It is 
technically "required" by our desire to some day WC the registers, and 
should synchronize everything else for us.

After a quick read of memory_barriers.txt (again), I think mmiowb is 
actually what we might want in addition to the POSTING_READ I'd add.
Daniel Vetter Oct. 23, 2012, 2:58 p.m. UTC | #3
On Tue, Oct 23, 2012 at 4:57 PM, Ben Widawsky <ben@bwidawsk.net> wrote:
> Actually, after we introduce the FLSH_CNTL patch from Jesse/me later in the
> series, I think we just want a POSTING_READ on that register. It is
> technically "required" by our desire to some day WC the registers, and
> should synchronize everything else for us.
>
> After a quick read of memory_barriers.txt (again), I think mmiowb is
> actually what we might want in addition to the POSTING_READ I'd add.

Imo we have special rules for the igd, since clearly not all registers
in our 4mb mmio window are equal. So I'd prefer to keep the readback
of the last pte write (to ensure those are flushed out) and maybe also
add a readback of the gfx_flsh_cntl reg (like I've seen in some
internal vlv tree). Just to be paranoid.
-Daniel
Ben Widawsky Oct. 23, 2012, 7:57 p.m. UTC | #4
On Tue, 23 Oct 2012 16:58:50 +0200
Daniel Vetter <daniel@ffwll.ch> wrote:

> On Tue, Oct 23, 2012 at 4:57 PM, Ben Widawsky <ben@bwidawsk.net> wrote:
> > Actually, after we introduce the FLSH_CNTL patch from Jesse/me later in the
> > series, I think we just want a POSTING_READ on that register. It is
> > technically "required" by our desire to some day WC the registers, and
> > should synchronize everything else for us.
> >
> > After a quick read of memory_barriers.txt (again), I think mmiowb is
> > actually what we might want in addition to the POSTING_READ I'd add.
> 
> Imo we have special rules for the igd, since clearly not all registers
> in our 4mb mmio window are equal. So I'd prefer to keep the readback
> of the last pte write (to ensure those are flushed out) and maybe also
> add a readback of the gfx_flsh_cntl reg (like I've seen in some
> internal vlv tree). Just to be paranoid.
> -Daniel

What's your definition of flush? I think we just need one read to
satisfy the device I/O flush, and I think the write to the register
should satisfy the TLB flush. That leads me to the conclusion that just
the POSTING_READ of the FLSH_CNTL register is sufficient.

If you want me to do both, I have no problem with that either, and I'll
just update the comment to say that we believe it is unnecessary.

I don't really care either way.
Daniel Vetter Oct. 23, 2012, 8:03 p.m. UTC | #5
On Tue, Oct 23, 2012 at 9:57 PM, Ben Widawsky <ben@bwidawsk.net> wrote:
>> On Tue, Oct 23, 2012 at 4:57 PM, Ben Widawsky <ben@bwidawsk.net> wrote:
>> > Actually, after we introduce the FLSH_CNTL patch from Jesse/me later in the
>> > series, I think we just want a POSTING_READ on that register. It is
>> > technically "required" by our desire to some day WC the registers, and
>> > should synchronize everything else for us.
>> >
>> > After a quick read of memory_barriers.txt (again), I think mmiowb is
>> > actually what we might want in addition to the POSTING_READ I'd add.
>>
>> Imo we have special rules for the igd, since clearly not all registers
>> in our 4mb mmio window are equal. So I'd prefer to keep the readback
>> of the last pte write (to ensure those are flushed out) and maybe also
>> add a readback of the gfx_flsh_cntl reg (like I've seen in some
>> internal vlv tree). Just to be paranoid.
>> -Daniel
>
> What's your definition of flush? I think we just need one read to
> satisfy the device I/O flush, and I think the write to the register
> should satisfy the TLB flush. That leads me to the conclusion that just
> the POSTING_READ of the FLSH_CNTL register is sufficient.
>
> If you want me to do both, I have no problem with that either, and I'll
> just update the comment to say that we believe it is unnecessary.
>
> I don't really care either way.

I think we need both, otherwise one of the pte writes might not have
arrived at the endpoint when the chip is purging the tlb already,
which might give us a tiny window where we load an invalid entry into
the tlb. Tiny window, but I'm paranoid ;-) And yes, I know that's more
strict than what docs say, but our igd also seems to be less coherent
than what docs claim.
-Daniel
Ben Widawsky Oct. 23, 2012, 8:27 p.m. UTC | #6
On Tue, 23 Oct 2012 22:03:21 +0200
Daniel Vetter <daniel@ffwll.ch> wrote:

> On Tue, Oct 23, 2012 at 9:57 PM, Ben Widawsky <ben@bwidawsk.net> wrote:
> >> On Tue, Oct 23, 2012 at 4:57 PM, Ben Widawsky <ben@bwidawsk.net> wrote:
> >> > Actually, after we introduce the FLSH_CNTL patch from Jesse/me later in the
> >> > series, I think we just want a POSTING_READ on that register. It is
> >> > technically "required" by our desire to some day WC the registers, and
> >> > should synchronize everything else for us.
> >> >
> >> > After a quick read of memory_barriers.txt (again), I think mmiowb is
> >> > actually what we might want in addition to the POSTING_READ I'd add.
> >>
> >> Imo we have special rules for the igd, since clearly not all registers
> >> in our 4mb mmio window are equal. So I'd prefer to keep the readback
> >> of the last pte write (to ensure those are flushed out) and maybe also
> >> add a readback of the gfx_flsh_cntl reg (like I've seen in some
> >> internal vlv tree). Just to be paranoid.
> >> -Daniel
> >
> > What's your definition of flush? I think we just need one read to
> > satisfy the device I/O flush, and I think the write to the register
> > should satisfy the TLB flush. That leads me to the conclusion that just
> > the POSTING_READ of the FLSH_CNTL register is sufficient.
> >
> > If you want me to do both, I have no problem with that either, and I'll
> > just update the comment to say that we believe it is unnecessary.
> >
> > I don't really care either way.
> 
> I think we need both, otherwise one of the pte writes might not have
> arrived at the endpoint when the chip is purging the tlb already,
> which might give us a tiny window where we load an invalid entry into
> the tlb. Tiny window, but I'm paranoid ;-) And yes, I know that's more
> strict than what docs say, but our igd also seems to be less coherent
> than what docs claim.
> -Daniel

Hmm, that's a good point. If we write the last pte, and then flsh_cntl,
and then posting_read on flsh_cntl; I think it is also fine. (Unless
writes can be reordered)
Ben Widawsky Oct. 24, 2012, 5:56 p.m. UTC | #7
On Tue, 23 Oct 2012 13:35:18 +0300
Mika Kuoppala <mika.kuoppala@linux.intel.com> wrote:

> On Mon, 22 Oct 2012 18:34:11 -0700, Ben Widawsky <ben@bwidawsk.net> wrote:

> > +static void teardown_scratch_page(struct drm_device *dev)
> > +{
> > +	struct drm_i915_private *dev_priv = dev->dev_private;
> > +	set_pages_wb(dev_priv->mm.gtt->scratch_page, 1);
> > +	pci_unmap_page(dev->pdev, dev_priv->mm.gtt->scratch_page_dma,
> > +		       PAGE_SIZE, PCI_DMA_BIDIRECTIONAL);
> 
> #ifdef CONFIG_INTEL_IOMMU needed around pci_unmap_page?

It preserves the original behavior, which I believe is fine as is. The
iommu code should just do nothing. I am partial though, since I am the
original reviewer on the patch that introduced that.

> 
> > +	put_page(dev_priv->mm.gtt->scratch_page);
> > +	__free_page(dev_priv->mm.gtt->scratch_page);
> > +}
> > +
Jesse Barnes Oct. 25, 2012, 8:54 p.m. UTC | #8
On Tue, 23 Oct 2012 07:57:12 -0700
Ben Widawsky <ben@bwidawsk.net> wrote:

> On 2012-10-23 02:59, Chris Wilson wrote:
> > On Mon, 22 Oct 2012 18:34:11 -0700, Ben Widawsky <ben@bwidawsk.net> 
> > wrote:
> >> +/*
> >> + * Binds an object into the global gtt with the specified cache 
> >> level. The object
> >> + * will be accessible to the GPU via commands whose operands 
> >> reference offsets
> >> + * within the global GTT as well as accessible by the GPU through 
> >> the GMADR
> >> + * mapped BAR (dev_priv->mm.gtt->gtt).
> >> + */
> >> +static void gen6_ggtt_bind_object(struct drm_i915_gem_object *obj,
> >> +				  enum i915_cache_level level)
> >> +{
> >> +	struct drm_device *dev = obj->base.dev;
> >> +	struct drm_i915_private *dev_priv = dev->dev_private;
> >> +	struct sg_table *st = obj->pages;
> >> +	struct scatterlist *sg = st->sgl;
> >> +	const int first_entry = obj->gtt_space->start >> PAGE_SHIFT;
> >> +	const int max_entries = dev_priv->mm.gtt->gtt_total_entries - 
> >> first_entry;
> >> +	gtt_pte_t __iomem *gtt_entries = dev_priv->mm.gtt->gtt + 
> >> first_entry;
> >> +	int unused, i = 0;
> >> +	unsigned int len, m = 0;
> >> +
> >> +	for_each_sg(st->sgl, sg, st->nents, unused) {
> >> +		len = sg_dma_len(sg) >> PAGE_SHIFT;
> >> +		for (m = 0; m < len; m++) {
> >> +			dma_addr_t addr = sg_dma_address(sg) + (m << PAGE_SHIFT);
> >> +			gtt_entries[i] = pte_encode(dev, addr, level);
> >> +			i++;
> >> +			if (WARN_ON(i > max_entries))
> >> +				goto out;
> >> +		}
> >> +	}
> >> +
> >> +out:
> >> +	/* XXX: This serves as a posting read preserving the way the old 
> >> code
> >> +	 * works. It's not clear if this is strictly necessary or just 
> >> voodoo
> >> +	 * based on what I've tried to gather from the docs.
> >> +	 */
> >> +	readl(&gtt_entries[i-1]);
> >
> > It will be required until we replace the voodoo with more explicit 
> > mb().
> > -Chris
> 
> Actually, after we introduce the FLSH_CNTL patch from Jesse/me later in 
> the series, I think we just want a POSTING_READ on that register. It is 
> technically "required" by our desire to some day WC the registers, and 
> should synchronize everything else for us.
> 
> After a quick read of memory_barriers.txt (again), I think mmiowb is 
> actually what we might want in addition to the POSTING_READ I'd add.

On a big NUMA system maybe (i.e. on nothing we run on yet), but on x86
mmiowb doesn't do anything other than act as a compiler optimization
barrier.
Jesse Barnes Oct. 25, 2012, 8:55 p.m. UTC | #9
On Tue, 23 Oct 2012 12:57:24 -0700
Ben Widawsky <ben@bwidawsk.net> wrote:

> On Tue, 23 Oct 2012 16:58:50 +0200
> Daniel Vetter <daniel@ffwll.ch> wrote:
> 
> > On Tue, Oct 23, 2012 at 4:57 PM, Ben Widawsky <ben@bwidawsk.net> wrote:
> > > Actually, after we introduce the FLSH_CNTL patch from Jesse/me later in the
> > > series, I think we just want a POSTING_READ on that register. It is
> > > technically "required" by our desire to some day WC the registers, and
> > > should synchronize everything else for us.
> > >
> > > After a quick read of memory_barriers.txt (again), I think mmiowb is
> > > actually what we might want in addition to the POSTING_READ I'd add.
> > 
> > Imo we have special rules for the igd, since clearly not all registers
> > in our 4mb mmio window are equal. So I'd prefer to keep the readback
> > of the last pte write (to ensure those are flushed out) and maybe also
> > add a readback of the gfx_flsh_cntl reg (like I've seen in some
> > internal vlv tree). Just to be paranoid.
> > -Daniel
> 
> What's your definition of flush? I think we just need one read to
> satisfy the device I/O flush, and I think the write to the register
> should satisfy the TLB flush. That leads me to the conclusion that just
> the POSTING_READ of the FLSH_CNTL register is sufficient.
> 
> If you want me to do both, I have no problem with that either, and I'll
> just update the comment to say that we believe it is unnecessary.
> 
> I don't really care either way.

Yeah we just need one or the other, not both.  Technically the UC write
of the flush control reg should flush the WC buffer, then a posting
read of the flush control reg should make sure it gets to device 2
before the read returns.
diff mbox

Patch

diff --git a/drivers/char/agp/intel-gtt.c b/drivers/char/agp/intel-gtt.c
index 38390f7..4dfbb80 100644
--- a/drivers/char/agp/intel-gtt.c
+++ b/drivers/char/agp/intel-gtt.c
@@ -1686,7 +1686,7 @@  int intel_gmch_probe(struct pci_dev *bridge_pdev, struct pci_dev *gpu_pdev,
 }
 EXPORT_SYMBOL(intel_gmch_probe);
 
-const struct intel_gtt *intel_gtt_get(void)
+struct intel_gtt *intel_gtt_get(void)
 {
 	return &intel_private.base;
 }
diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index ffbc915..eda983b 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -1492,19 +1492,9 @@  int i915_driver_load(struct drm_device *dev, unsigned long flags)
 		goto free_priv;
 	}
 
-	ret = intel_gmch_probe(dev_priv->bridge_dev, dev->pdev, NULL);
-	if (!ret) {
-		DRM_ERROR("failed to set up gmch\n");
-		ret = -EIO;
+	ret = i915_gem_gtt_init(dev);
+	if (ret)
 		goto put_bridge;
-	}
-
-	dev_priv->mm.gtt = intel_gtt_get();
-	if (!dev_priv->mm.gtt) {
-		DRM_ERROR("Failed to initialize GTT\n");
-		ret = -ENODEV;
-		goto put_gmch;
-	}
 
 	i915_kick_out_firmware_fb(dev_priv);
 
@@ -1678,7 +1668,7 @@  out_mtrrfree:
 out_rmmap:
 	pci_iounmap(dev->pdev, dev_priv->regs);
 put_gmch:
-	intel_gmch_remove();
+	i915_gem_gtt_fini(dev);
 put_bridge:
 	pci_dev_put(dev_priv->bridge_dev);
 free_priv:
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index bf628c4..534d282 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -687,7 +687,7 @@  typedef struct drm_i915_private {
 
 	struct {
 		/** Bridge to intel-gtt-ko */
-		const struct intel_gtt *gtt;
+		struct intel_gtt *gtt;
 		/** Memory allocator for GTT stolen memory */
 		struct drm_mm stolen;
 		/** Memory allocator for GTT */
@@ -1507,6 +1507,14 @@  void i915_gem_init_global_gtt(struct drm_device *dev,
 			      unsigned long start,
 			      unsigned long mappable_end,
 			      unsigned long end);
+int i915_gem_gtt_init(struct drm_device *dev);
+void i915_gem_gtt_fini(struct drm_device *dev);
+extern inline void i915_gem_chipset_flush(struct drm_device *dev)
+{
+	if (INTEL_INFO(dev)->gen < 6)
+		intel_gtt_chipset_flush();
+}
+
 
 /* i915_gem_evict.c */
 int __must_check i915_gem_evict_something(struct drm_device *dev, int min_size,
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index e04820c..ae3d4c1 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -846,12 +846,12 @@  out:
 		 * domain anymore. */
 		if (obj->base.write_domain != I915_GEM_DOMAIN_CPU) {
 			i915_gem_clflush_object(obj);
-			intel_gtt_chipset_flush();
+			i915_gem_chipset_flush(dev);
 		}
 	}
 
 	if (needs_clflush_after)
-		intel_gtt_chipset_flush();
+		i915_gem_chipset_flush(dev);
 
 	return ret;
 }
@@ -3059,7 +3059,7 @@  i915_gem_object_flush_cpu_write_domain(struct drm_i915_gem_object *obj)
 		return;
 
 	i915_gem_clflush_object(obj);
-	intel_gtt_chipset_flush();
+	i915_gem_chipset_flush(obj->base.dev);
 	old_write_domain = obj->base.write_domain;
 	obj->base.write_domain = 0;
 
@@ -3960,7 +3960,7 @@  i915_gem_init_hw(struct drm_device *dev)
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	int ret;
 
-	if (!intel_enable_gtt())
+	if (INTEL_INFO(dev)->gen < 6 && !intel_enable_gtt())
 		return -EIO;
 
 	if (IS_HASWELL(dev) && (I915_READ(0x120010) == 1))
@@ -4295,7 +4295,7 @@  void i915_gem_detach_phys_object(struct drm_device *dev,
 			page_cache_release(page);
 		}
 	}
-	intel_gtt_chipset_flush();
+	i915_gem_chipset_flush(dev);
 
 	obj->phys_obj->cur_obj = NULL;
 	obj->phys_obj = NULL;
@@ -4382,7 +4382,7 @@  i915_gem_phys_pwrite(struct drm_device *dev,
 			return -EFAULT;
 	}
 
-	intel_gtt_chipset_flush();
+	i915_gem_chipset_flush(dev);
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 6a2f3e5..1af00b5 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -673,7 +673,7 @@  i915_gem_execbuffer_move_to_gpu(struct intel_ring_buffer *ring,
 	}
 
 	if (flush_domains & I915_GEM_DOMAIN_CPU)
-		intel_gtt_chipset_flush();
+		i915_gem_chipset_flush(ring->dev);
 
 	if (flush_domains & I915_GEM_DOMAIN_GTT)
 		wmb();
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 89c273f..7d3ec42 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -305,13 +305,38 @@  static void undo_idling(struct drm_i915_private *dev_priv, bool interruptible)
 		dev_priv->mm.interruptible = interruptible;
 }
 
+
+static void i915_ggtt_clear_range(struct drm_device *dev,
+				 unsigned first_entry,
+				 unsigned num_entries)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	gtt_pte_t scratch_pte;
+	volatile void __iomem *gtt_base = dev_priv->mm.gtt->gtt + first_entry;
+	const int max_entries = dev_priv->mm.gtt->gtt_total_entries - first_entry;
+
+	if (INTEL_INFO(dev)->gen < 6) {
+		intel_gtt_clear_range(first_entry, num_entries);
+		return;
+	}
+
+	if (WARN(num_entries > max_entries,
+		 "First entry = %d; Num entries = %d (max=%d)\n",
+		 first_entry, num_entries, max_entries))
+		num_entries = max_entries;
+
+	scratch_pte = pte_encode(dev, dev_priv->mm.gtt->scratch_page_dma, I915_CACHE_LLC);
+	memset_io(gtt_base, scratch_pte, num_entries * sizeof(scratch_pte));
+	readl(gtt_base);
+}
+
 void i915_gem_restore_gtt_mappings(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct drm_i915_gem_object *obj;
 
 	/* First fill our portion of the GTT with scratch pages */
-	intel_gtt_clear_range(dev_priv->mm.gtt_start / PAGE_SIZE,
+	i915_ggtt_clear_range(dev, dev_priv->mm.gtt_start / PAGE_SIZE,
 			      (dev_priv->mm.gtt_end - dev_priv->mm.gtt_start) / PAGE_SIZE);
 
 	list_for_each_entry(obj, &dev_priv->mm.bound_list, gtt_list) {
@@ -319,7 +344,7 @@  void i915_gem_restore_gtt_mappings(struct drm_device *dev)
 		i915_gem_gtt_bind_object(obj, obj->cache_level);
 	}
 
-	intel_gtt_chipset_flush();
+	i915_gem_chipset_flush(dev);
 }
 
 int i915_gem_gtt_prepare_object(struct drm_i915_gem_object *obj)
@@ -335,21 +360,64 @@  int i915_gem_gtt_prepare_object(struct drm_i915_gem_object *obj)
 	return 0;
 }
 
+/*
+ * Binds an object into the global gtt with the specified cache level. The object
+ * will be accessible to the GPU via commands whose operands reference offsets
+ * within the global GTT as well as accessible by the GPU through the GMADR
+ * mapped BAR (dev_priv->mm.gtt->gtt).
+ */
+static void gen6_ggtt_bind_object(struct drm_i915_gem_object *obj,
+				  enum i915_cache_level level)
+{
+	struct drm_device *dev = obj->base.dev;
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct sg_table *st = obj->pages;
+	struct scatterlist *sg = st->sgl;
+	const int first_entry = obj->gtt_space->start >> PAGE_SHIFT;
+	const int max_entries = dev_priv->mm.gtt->gtt_total_entries - first_entry;
+	gtt_pte_t __iomem *gtt_entries = dev_priv->mm.gtt->gtt + first_entry;
+	int unused, i = 0;
+	unsigned int len, m = 0;
+
+	for_each_sg(st->sgl, sg, st->nents, unused) {
+		len = sg_dma_len(sg) >> PAGE_SHIFT;
+		for (m = 0; m < len; m++) {
+			dma_addr_t addr = sg_dma_address(sg) + (m << PAGE_SHIFT);
+			gtt_entries[i] = pte_encode(dev, addr, level);
+			i++;
+			if (WARN_ON(i > max_entries))
+				goto out;
+		}
+	}
+
+out:
+	/* XXX: This serves as a posting read preserving the way the old code
+	 * works. It's not clear if this is strictly necessary or just voodoo
+	 * based on what I've tried to gather from the docs.
+	 */
+	readl(&gtt_entries[i-1]);
+}
+
 void i915_gem_gtt_bind_object(struct drm_i915_gem_object *obj,
 			      enum i915_cache_level cache_level)
 {
 	struct drm_device *dev = obj->base.dev;
-	unsigned int agp_type = cache_level_to_agp_type(dev, cache_level);
+	if (INTEL_INFO(dev)->gen < 6) {
+		unsigned int agp_type = cache_level_to_agp_type(dev, cache_level);
+		intel_gtt_insert_sg_entries(obj->pages,
+					    obj->gtt_space->start >> PAGE_SHIFT,
+					    agp_type);
+	} else {
+		gen6_ggtt_bind_object(obj, cache_level);
+	}
 
-	intel_gtt_insert_sg_entries(obj->pages,
-				    obj->gtt_space->start >> PAGE_SHIFT,
-				    agp_type);
 	obj->has_global_gtt_mapping = 1;
 }
 
 void i915_gem_gtt_unbind_object(struct drm_i915_gem_object *obj)
 {
-	intel_gtt_clear_range(obj->gtt_space->start >> PAGE_SHIFT,
+	i915_ggtt_clear_range(obj->base.dev,
+			      obj->gtt_space->start >> PAGE_SHIFT,
 			      obj->base.size >> PAGE_SHIFT);
 
 	obj->has_global_gtt_mapping = 0;
@@ -407,5 +475,129 @@  void i915_gem_init_global_gtt(struct drm_device *dev,
 	dev_priv->mm.mappable_gtt_total = min(end, mappable_end) - start;
 
 	/* ... but ensure that we clear the entire range. */
-	intel_gtt_clear_range(start / PAGE_SIZE, (end-start) / PAGE_SIZE);
+	i915_ggtt_clear_range(dev, start / PAGE_SIZE, (end-start) / PAGE_SIZE);
+}
+
+static int setup_scratch_page(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct page *page;
+	dma_addr_t dma_addr;
+
+	page = alloc_page(GFP_KERNEL | GFP_DMA32 | __GFP_ZERO);
+	if (page == NULL)
+		return -ENOMEM;
+	get_page(page);
+	set_pages_uc(page, 1);
+
+#ifdef CONFIG_INTEL_IOMMU
+	dma_addr = pci_map_page(dev->pdev, page, 0, PAGE_SIZE,
+				PCI_DMA_BIDIRECTIONAL);
+	if (pci_dma_mapping_error(dev->pdev, dma_addr))
+		return -EINVAL;
+#else
+	dma_addr = page_to_phys(page);
+#endif
+	dev_priv->mm.gtt->scratch_page = page;
+	dev_priv->mm.gtt->scratch_page_dma = dma_addr;
+
+	return 0;
+}
+
+static void teardown_scratch_page(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	set_pages_wb(dev_priv->mm.gtt->scratch_page, 1);
+	pci_unmap_page(dev->pdev, dev_priv->mm.gtt->scratch_page_dma,
+		       PAGE_SIZE, PCI_DMA_BIDIRECTIONAL);
+	put_page(dev_priv->mm.gtt->scratch_page);
+	__free_page(dev_priv->mm.gtt->scratch_page);
+}
+
+static inline unsigned int gen6_get_total_gtt_size(u16 snb_gmch_ctl)
+{
+	snb_gmch_ctl >>= SNB_GMCH_GGMS_SHIFT;
+	snb_gmch_ctl &= SNB_GMCH_GGMS_MASK;
+	return snb_gmch_ctl << 20;
+}
+
+static inline unsigned int gen6_get_stolen_size(u16 snb_gmch_ctl)
+{
+	snb_gmch_ctl >>= SNB_GMCH_GMS_SHIFT;
+	snb_gmch_ctl &= SNB_GMCH_GMS_MASK;
+	return snb_gmch_ctl << 25; /* 32 MB units */
+}
+
+int i915_gem_gtt_init(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	phys_addr_t gtt_bus_addr;
+	u16 snb_gmch_ctl;
+	u32 tmp;
+	int ret;
+
+	/* On modern platforms we need not worry ourself with the legacy
+	 * hostbridge query stuff. Skip it entirely
+	 */
+	if (INTEL_INFO(dev)->gen < 6) {
+		ret = intel_gmch_probe(dev_priv->bridge_dev, dev->pdev, NULL);
+		if (!ret) {
+			DRM_ERROR("failed to set up gmch\n");
+			return -EIO;
+		}
+
+		dev_priv->mm.gtt = intel_gtt_get();
+		if (!dev_priv->mm.gtt) {
+			DRM_ERROR("Failed to initialize GTT\n");
+			intel_gmch_remove();
+			return -ENODEV;
+		}
+		return 0;
+	}
+
+
+	dev_priv->mm.gtt = kzalloc(sizeof(*dev_priv->mm.gtt), GFP_KERNEL);
+	if (!dev_priv->mm.gtt)
+		return -ENOMEM;
+
+	if (!pci_set_dma_mask(dev->pdev, DMA_BIT_MASK(40)))
+		pci_set_consistent_dma_mask(dev->pdev, DMA_BIT_MASK(40));
+
+	pci_read_config_dword(dev->pdev, PCI_BASE_ADDRESS_0, &tmp);
+	/* For GEN6+ the PTEs for the ggtt live at 2MB + BAR0 */
+	gtt_bus_addr = (tmp & PCI_BASE_ADDRESS_MEM_MASK) + (2<<20);
+
+	dev_priv->mm.gtt->gtt_mappable_entries = pci_resource_len(dev->pdev, 2) >> PAGE_SHIFT;
+	pci_read_config_dword(dev->pdev, PCI_BASE_ADDRESS_2, &tmp);
+	dev_priv->mm.gtt->gma_bus_addr = tmp & PCI_BASE_ADDRESS_MEM_MASK;
+
+	/* i9xx_setup */
+	pci_read_config_word(dev->pdev, SNB_GMCH_CTRL, &snb_gmch_ctl);
+	dev_priv->mm.gtt->gtt_total_entries =
+		gen6_get_total_gtt_size(snb_gmch_ctl) / sizeof(gtt_pte_t);
+	dev_priv->mm.gtt->stolen_size = gen6_get_stolen_size(snb_gmch_ctl);
+
+	ret = setup_scratch_page(dev);
+	if (ret)
+		return ret;
+
+	dev_priv->mm.gtt->gtt = ioremap(gtt_bus_addr,
+					dev_priv->mm.gtt->gtt_total_entries * sizeof(gtt_pte_t));
+
+	/* GMADR is the PCI aperture used by SW to access tiled GFX surfaces in a linear fashion. */
+	DRM_DEBUG_DRIVER("GMADR size = %dM\n", dev_priv->mm.gtt->gtt_mappable_entries >> 8);
+	DRM_DEBUG_DRIVER("GTT total entries = %dK\n", dev_priv->mm.gtt->gtt_total_entries >> 10);
+	DRM_DEBUG_DRIVER("GTT stolen size = %dM\n", dev_priv->mm.gtt->stolen_size >> 20);
+
+	return 0;
+}
+
+void i915_gem_gtt_fini(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	iounmap(dev_priv->mm.gtt->gtt);
+	teardown_scratch_page(dev);
+	if (INTEL_INFO(dev)->gen < 6)
+		intel_gmch_remove();
+	kfree(dev_priv->mm.gtt);
 }
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index f4d70b0..10a6e9b 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -40,6 +40,12 @@ 
  */
 #define INTEL_GMCH_CTRL		0x52
 #define INTEL_GMCH_VGA_DISABLE  (1 << 1)
+#define SNB_GMCH_CTRL		0x50
+#define    SNB_GMCH_GGMS_SHIFT	8 /* GTT Graphics Memory Size */
+#define    SNB_GMCH_GGMS_MASK	0x3
+#define    SNB_GMCH_GMS_SHIFT   3 /* Graphics Mode Select */
+#define    SNB_GMCH_GMS_MASK    0x1f
+
 
 /* PCI config space */
 
diff --git a/include/drm/intel-gtt.h b/include/drm/intel-gtt.h
index 2e37e9f..94e8f2c 100644
--- a/include/drm/intel-gtt.h
+++ b/include/drm/intel-gtt.h
@@ -3,7 +3,7 @@ 
 #ifndef _DRM_INTEL_GTT_H
 #define	_DRM_INTEL_GTT_H
 
-const struct intel_gtt {
+struct intel_gtt {
 	/* Size of memory reserved for graphics by the BIOS */
 	unsigned int stolen_size;
 	/* Total number of gtt entries. */
@@ -17,6 +17,7 @@  const struct intel_gtt {
 	unsigned int do_idle_maps : 1;
 	/* Share the scratch page dma with ppgtts. */
 	dma_addr_t scratch_page_dma;
+	struct page *scratch_page;
 	/* for ppgtt PDE access */
 	u32 __iomem *gtt;
 	/* needed for ioremap in drm/i915 */