[4/4] i965: Introduce a context-local batch manager
diff mbox

Message ID 1430492023-20296-5-git-send-email-chris@chris-wilson.co.uk
State New
Headers show

Commit Message

Chris Wilson May 1, 2015, 2:53 p.m. UTC
When submitting commands to the GPU every cycle of latency counts;
mutexes, spinlocks, even atomics quickly add to substantial overhead.

This "batch manager" acts as thread-local shim over the buffer manager
(drm_intel_bufmgr_gem). As we are only ever used from within a single
context, we can rely on the upper layers providing thread safety.
This allows us to import buffers from the shared screen (sharing buffers
between multiple contexts, threads and users) and wrap that handle in
our own. Similarly, we want to share the buffer cache between all
users on the file and so allocate from the global threadsafe buffer
manager, with a very small and transient local cache of active buffers.

The batch manager provides a cheap way of busyness tracking and very
efficient batch construction and kernel submission.

The restrictions over and above the generic submission engine in
intel_bufmgr_gem are:
     - not thread-safe
     - flat relocations, only the batch buffer itself carries
       relocations. Relocations relative to auxiliary buffers
       must be performed via STATE_BASE
     - direct mapping of the batch for writes, expect reads
       from the batch to be slow
     - the batch is a fixed 64k in size
     - access to the batch must be wrapped by brw_batch_begin/_end
     - all relocations must be immediately written into the batch

The importance of the flat relocation tree with local offset handling is
that it allows us to use the "relocation-less" execbuffer interfaces,
dramatically reducing the overhead of batch submission.

ivb/bdw OglBatch7 improves by ~20% above and beyond the relocation
speedups in http://cgit.freedesktop.org/~ickle/linux-2.6/commit/?h=nightly&id=d664f006b340ad9b2b68c8b661aa396c0a72d0fb

ISSUES:
* add real kernel feature detection
* aub-annotation support (needs to expose the libdrm_intel aub writers)
* where is the doubly-linked circular list?
* shared mipmap trees between contexts - are they even legal?
* OglMultithread is nevertheless unhappy

==14653== Possible data race during read of size 4 at 0xBF930B4 by thread #2
==14653== Locks held: none
==14653==    at 0x9FD5FC0: brw_bo_put (brw_batch.h:254)
==14653==    by 0x9FD5FC0: brw_merge_inputs (brw_draw.c:293)
==14653==    by 0x9FD635F: brw_try_draw_prims (brw_draw.c:426)
==14653==    by 0x9FD635F: brw_draw_prims (brw_draw.c:577)
==14653==    by 0x9DF748B: vbo_draw_arrays (vbo_exec_array.c:645)
==14653==    by 0x9E9B3D3: _mesa_meta_GenerateMipmap (meta_generate_mipmap.c:347)
==14653==    by 0x9D1F002: _mesa_generate_texture_mipmap (genmipmap.c:124)
==14653==    by 0x9D1F002: _mesa_GenerateMipmap (genmipmap.c:149)
==14653==    by 0x481FC1: TestHdrMultithread::WorkerThread::_RenderSingleLevelReflectionMaps(TestHdrMultithread::ReflTextureLevel, TestHdrMultithread::ReflTextureLevel, TestHdrMultithread::SphereLod) (in /usr/src/SynMark2_6/synmark2)
==14653==    by 0x4820B6: TestHdrMultithread::WorkerThread::Run() (in /usr/src/SynMark2_6/synmark2)
==14653==    by 0x49BDFC: (anonymous namespace)::_ThreadProc(void*) (in /usr/src/SynMark2_6/synmark2)
==14653==    by 0x4C2F056: mythread_wrapper (hg_intercepts.c:234)
==14653==    by 0x66C70A3: start_thread (pthread_create.c:309)
==14653==    by 0x69C204C: clone (clone.S:111)
==14653==
==14653== This conflicts with a previous write of size 4 by thread #3
==14653== Locks held: none
==14653==    at 0x9FD5FCE: brw_bo_put (brw_batch.h:255)
==14653==    by 0x9FD5FCE: brw_merge_inputs (brw_draw.c:293)
==14653==    by 0x9FD635F: brw_try_draw_prims (brw_draw.c:426)
==14653==    by 0x9FD635F: brw_draw_prims (brw_draw.c:577)
==14653==    by 0x9DF748B: vbo_draw_arrays (vbo_exec_array.c:645)
==14653==    by 0x445D53: OglHdrMultithread::_ThreadCommon::RenderSceneBackgroundFromView(OglHdrMultithread::_Effect&, OglHdrMultithread::_Vaos&, prmath::Matrix4x4<float> const&) const (in /usr/src/SynMark2_6/synmark2)
==14653==    by 0x446BE9: OglHdrMultithread::_WorkerThread::RenderSceneFromView(unsigned int, unsigned int, std::vector<ApiHdrMultithread::RenderObject, std::allocator<ApiHdrMultithread::RenderObject> > const&, std::vector<ApiHdrMultithread::RenderObject, std::allocator<ApiHdrMultithread::RenderObject> > const&, prmath::Vector3<float> const&, prmath::Matrix4x4<float> const&, prmath::Matrix4x4<float> const&, unsigned int, unsigned int, unsigned int) (in /usr/src/SynMark2_6/synmark2)
==14653==    by 0x481E6C: TestHdrMultithread::WorkerThread::_RenderSingleLevelReflectionMaps(TestHdrMultithread::ReflTextureLevel, TestHdrMultithread::ReflTextureLevel, TestHdrMultithread::SphereLod) (in /usr/src/SynMark2_6/synmark2)
==14653==    by 0x4820B6: TestHdrMultithread::WorkerThread::Run() (in /usr/src/SynMark2_6/synmark2)
==14653==    by 0x49BDFC: (anonymous namespace)::_ThreadProc(void*) (in /usr/src/SynMark2_6/synmark2)
==14653==  Address 0xbf930b4 is 68 bytes inside a block of size 144 alloc'd
==14653==    at 0x4C2A040: malloc (vg_replace_malloc.c:296)
==14653==    by 0x9FB2249: brw_bo_import (brw_batch.c:668)
==14653==    by 0x9FB2249: brw_bo_create (brw_batch.c:712)
==14653==    by 0xA0A744B: alloc_buffer_object (intel_buffer_objects.c:67)
==14653==    by 0xA0A744B: brw_buffer_data (intel_buffer_objects.c:173)
==14653==    by 0x9C2BF20: _mesa_buffer_data (bufferobj.c:1564)
==14653==    by 0x9C2BF20: _mesa_BufferData (bufferobj.c:1595)
==14653==    by 0x9E9B375: _mesa_meta_GenerateMipmap (meta_generate_mipmap.c:327)
==14653==    by 0x9D1F002: _mesa_generate_texture_mipmap (genmipmap.c:124)
==14653==    by 0x9D1F002: _mesa_GenerateMipmap (genmipmap.c:149)
==14653==    by 0x461648: (anonymous namespace)::_Gl30CreateTextureFromImage(unsigned int, Image const&, bool) (in /usr/src/SynMark2_6/synmark2)
==14653==    by 0x461824: OglTex::CreateTextureCubeMapFromFile(OglExt::VersionId, char const*, bool) (in /usr/src/SynMark2_6/synmark2)
==14653==    by 0x443759: OglHdrMultithread::LoadTexture(char const*) (in /usr/src/SynMark2_6/synmark2)
==14653==    by 0x46BB17: Test::InitializeAll(WndHandleSys const&, unsigned int, unsigned int, ApiTypes::ColorFormat, ApiTypes::DepthFormat, bool, bool) (in /usr/src/SynMark2_6/synmark2)
==14653==    by 0x416FF6: TestFramework::OnWindowCreate(WndHandleSys const&) (in /usr/src/SynMark2_6/synmark2)
==14653==    by 0x49985B: AppWindowSys::AppWindowSys(AppWindow::Parameters const&, AppWindow::Events&) (in /usr/src/SynMark2_6/synmark2)
==14653==  Block was alloc'd by thread #1
==14653==

which implies that treating the entire context as atomic is an incorrect
assumption

* Add full-ppgtt softpinning support (no more relocations, at least for
  the first 4G)
* polish and move to libdrm; though at the cost of sealing the structs?

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Kristian Høgsberg <krh@bitplanet.net>
Cc: Kenneth Graunke <kenneth@whitecape.org>
Cc: Jesse Barnes <jbarnes@virtuousgeek.org>
Cc: Ian Romanick <ian.d.romanick@intel.com>
Cc: Abdiel Janulgue <abdiel.janulgue@linux.intel.com>
Cc: Eero Tamminen <eero.t.tamminen@intel.com>
---
 src/mesa/drivers/dri/i965/Makefile.sources         |    4 +-
 src/mesa/drivers/dri/i965/brw_batch.c              | 1079 ++++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_batch.h              |  309 ++++++
 src/mesa/drivers/dri/i965/brw_binding_tables.c     |    1 -
 src/mesa/drivers/dri/i965/brw_blorp.cpp            |   45 +-
 src/mesa/drivers/dri/i965/brw_cc.c                 |   16 +-
 src/mesa/drivers/dri/i965/brw_clear.c              |    1 -
 src/mesa/drivers/dri/i965/brw_clip.c               |    2 -
 src/mesa/drivers/dri/i965/brw_clip_line.c          |    2 -
 src/mesa/drivers/dri/i965/brw_clip_point.c         |    2 -
 src/mesa/drivers/dri/i965/brw_clip_state.c         |   14 +-
 src/mesa/drivers/dri/i965/brw_clip_tri.c           |    2 -
 src/mesa/drivers/dri/i965/brw_clip_unfilled.c      |    2 -
 src/mesa/drivers/dri/i965/brw_clip_util.c          |    2 -
 src/mesa/drivers/dri/i965/brw_context.c            |  196 ++--
 src/mesa/drivers/dri/i965/brw_context.h            |  138 +--
 src/mesa/drivers/dri/i965/brw_curbe.c              |    1 -
 src/mesa/drivers/dri/i965/brw_draw.c               |   66 +-
 src/mesa/drivers/dri/i965/brw_draw_upload.c        |   23 +-
 src/mesa/drivers/dri/i965/brw_ff_gs.c              |    2 -
 src/mesa/drivers/dri/i965/brw_ff_gs_emit.c         |    1 -
 src/mesa/drivers/dri/i965/brw_fs.cpp               |    5 +-
 src/mesa/drivers/dri/i965/brw_list.h               |  353 +++++++
 src/mesa/drivers/dri/i965/brw_meta_fast_clear.c    |    1 -
 src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c  |    1 -
 src/mesa/drivers/dri/i965/brw_meta_updownsample.c  |    1 -
 src/mesa/drivers/dri/i965/brw_misc_state.c         |   10 +-
 src/mesa/drivers/dri/i965/brw_object_purgeable.c   |    8 +-
 .../drivers/dri/i965/brw_performance_monitor.c     |   74 +-
 src/mesa/drivers/dri/i965/brw_pipe_control.c       |   44 +-
 src/mesa/drivers/dri/i965/brw_primitive_restart.c  |    2 -
 src/mesa/drivers/dri/i965/brw_program.c            |   26 +-
 src/mesa/drivers/dri/i965/brw_queryobj.c           |   47 +-
 src/mesa/drivers/dri/i965/brw_reset.c              |   13 +-
 src/mesa/drivers/dri/i965/brw_sampler_state.c      |    8 +-
 src/mesa/drivers/dri/i965/brw_sf.c                 |    2 -
 src/mesa/drivers/dri/i965/brw_sf_emit.c            |    2 -
 src/mesa/drivers/dri/i965/brw_sf_state.c           |   21 +-
 src/mesa/drivers/dri/i965/brw_state.h              |    2 +-
 src/mesa/drivers/dri/i965/brw_state_batch.c        |   41 +-
 src/mesa/drivers/dri/i965/brw_state_cache.c        |   49 +-
 src/mesa/drivers/dri/i965/brw_state_dump.c         |   67 +-
 src/mesa/drivers/dri/i965/brw_state_upload.c       |   16 +-
 src/mesa/drivers/dri/i965/brw_structs.h            |   30 +-
 src/mesa/drivers/dri/i965/brw_urb.c                |    9 +-
 src/mesa/drivers/dri/i965/brw_vec4.cpp             |    5 +-
 src/mesa/drivers/dri/i965/brw_vs_state.c           |   29 +-
 src/mesa/drivers/dri/i965/brw_vs_surface_state.c   |    4 +-
 src/mesa/drivers/dri/i965/brw_wm_state.c           |   38 +-
 src/mesa/drivers/dri/i965/brw_wm_surface_state.c   |   84 +-
 src/mesa/drivers/dri/i965/gen6_blorp.cpp           |   17 +-
 src/mesa/drivers/dri/i965/gen6_cc.c                |    1 -
 src/mesa/drivers/dri/i965/gen6_clip_state.c        |    1 -
 src/mesa/drivers/dri/i965/gen6_depth_state.c       |    1 -
 src/mesa/drivers/dri/i965/gen6_depthstencil.c      |    1 -
 src/mesa/drivers/dri/i965/gen6_gs_state.c          |    1 -
 src/mesa/drivers/dri/i965/gen6_multisample_state.c |    1 -
 src/mesa/drivers/dri/i965/gen6_queryobj.c          |   48 +-
 src/mesa/drivers/dri/i965/gen6_sampler_state.c     |    1 -
 src/mesa/drivers/dri/i965/gen6_scissor_state.c     |    1 -
 src/mesa/drivers/dri/i965/gen6_sf_state.c          |    1 -
 src/mesa/drivers/dri/i965/gen6_sol.c               |    9 +-
 src/mesa/drivers/dri/i965/gen6_surface_state.c     |   15 +-
 src/mesa/drivers/dri/i965/gen6_urb.c               |    1 -
 src/mesa/drivers/dri/i965/gen6_viewport_state.c    |    1 -
 src/mesa/drivers/dri/i965/gen6_vs_state.c          |    2 +-
 src/mesa/drivers/dri/i965/gen6_wm_state.c          |    1 -
 src/mesa/drivers/dri/i965/gen7_blorp.cpp           |   16 +-
 src/mesa/drivers/dri/i965/gen7_disable.c           |    1 -
 src/mesa/drivers/dri/i965/gen7_gs_state.c          |    1 -
 src/mesa/drivers/dri/i965/gen7_misc_state.c        |    3 +-
 src/mesa/drivers/dri/i965/gen7_sf_state.c          |    1 -
 src/mesa/drivers/dri/i965/gen7_sol_state.c         |   19 +-
 src/mesa/drivers/dri/i965/gen7_urb.c               |    1 -
 src/mesa/drivers/dri/i965/gen7_viewport_state.c    |    1 -
 src/mesa/drivers/dri/i965/gen7_vs_state.c          |    1 -
 src/mesa/drivers/dri/i965/gen7_wm_state.c          |    1 -
 src/mesa/drivers/dri/i965/gen7_wm_surface_state.c  |   59 +-
 src/mesa/drivers/dri/i965/gen8_blend_state.c       |    1 -
 src/mesa/drivers/dri/i965/gen8_depth_state.c       |    7 +-
 src/mesa/drivers/dri/i965/gen8_disable.c           |    1 -
 src/mesa/drivers/dri/i965/gen8_draw_upload.c       |    1 -
 src/mesa/drivers/dri/i965/gen8_gs_state.c          |    1 -
 src/mesa/drivers/dri/i965/gen8_misc_state.c        |    1 -
 src/mesa/drivers/dri/i965/gen8_multisample_state.c |    1 -
 src/mesa/drivers/dri/i965/gen8_ps_state.c          |    1 -
 src/mesa/drivers/dri/i965/gen8_sf_state.c          |    1 -
 src/mesa/drivers/dri/i965/gen8_sol_state.c         |    3 +-
 src/mesa/drivers/dri/i965/gen8_surface_state.c     |   78 +-
 src/mesa/drivers/dri/i965/gen8_viewport_state.c    |    1 -
 src/mesa/drivers/dri/i965/gen8_vs_state.c          |    1 -
 src/mesa/drivers/dri/i965/gen8_wm_depth_stencil.c  |    1 -
 src/mesa/drivers/dri/i965/intel_batchbuffer.c      |  453 --------
 src/mesa/drivers/dri/i965/intel_batchbuffer.h      |  179 ----
 src/mesa/drivers/dri/i965/intel_blit.c             |   58 +-
 src/mesa/drivers/dri/i965/intel_blit.h             |   10 +-
 src/mesa/drivers/dri/i965/intel_buffer_objects.c   |  185 ++--
 src/mesa/drivers/dri/i965/intel_buffer_objects.h   |   18 +-
 src/mesa/drivers/dri/i965/intel_debug.c            |    4 +-
 src/mesa/drivers/dri/i965/intel_extensions.c       |   43 +-
 src/mesa/drivers/dri/i965/intel_fbo.c              |   46 +-
 src/mesa/drivers/dri/i965/intel_fbo.h              |    4 -
 src/mesa/drivers/dri/i965/intel_image.h            |    6 +-
 src/mesa/drivers/dri/i965/intel_mipmap_tree.c      |   81 +-
 src/mesa/drivers/dri/i965/intel_mipmap_tree.h      |   11 +-
 src/mesa/drivers/dri/i965/intel_pixel_bitmap.c     |    3 +-
 src/mesa/drivers/dri/i965/intel_pixel_copy.c       |    3 -
 src/mesa/drivers/dri/i965/intel_pixel_draw.c       |    2 +-
 src/mesa/drivers/dri/i965/intel_pixel_read.c       |   22 +-
 src/mesa/drivers/dri/i965/intel_screen.c           |   44 +-
 src/mesa/drivers/dri/i965/intel_screen.h           |   13 +-
 src/mesa/drivers/dri/i965/intel_syncobj.c          |   17 +-
 src/mesa/drivers/dri/i965/intel_tex.c              |    6 +-
 src/mesa/drivers/dri/i965/intel_tex_image.c        |   30 +-
 src/mesa/drivers/dri/i965/intel_tex_subimage.c     |   25 +-
 src/mesa/drivers/dri/i965/intel_upload.c           |   33 +-
 116 files changed, 2541 insertions(+), 1961 deletions(-)
 create mode 100644 src/mesa/drivers/dri/i965/brw_batch.c
 create mode 100644 src/mesa/drivers/dri/i965/brw_batch.h
 create mode 100644 src/mesa/drivers/dri/i965/brw_list.h
 delete mode 100644 src/mesa/drivers/dri/i965/intel_batchbuffer.c
 delete mode 100644 src/mesa/drivers/dri/i965/intel_batchbuffer.h

Comments

Emil Velikov May 5, 2015, 4:16 p.m. UTC | #1
On 1 May 2015 at 15:53, Chris Wilson <chris@chris-wilson.co.uk> wrote:

> * where is the doubly-linked circular list?

IIRC there were some patches from Jason that move the gallium one
to src/util/list.h [1]. Not sure of their status though. On a
related note - it would be great if one day we nuke the glsl one
(exec_node) in favour of it :-)

Cheers
Emil

[1] http://patchwork.freedesktop.org/patch/48051/

Patch
diff mbox

diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index a9f9129..32bd552 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -1,4 +1,6 @@ 
 i965_FILES = \
+	brw_batch.c \
+	brw_batch.h \
 	brw_binding_tables.c \
 	brw_blorp_blit.cpp \
 	brw_blorp_blit_eu.cpp \
@@ -180,8 +182,6 @@  i965_FILES = \
 	gen8_wm_depth_stencil.c \
 	intel_asm_annotation.c \
 	intel_asm_annotation.h \
-	intel_batchbuffer.c \
-	intel_batchbuffer.h \
 	intel_blit.c \
 	intel_blit.h \
 	intel_buffer_objects.c \
diff --git a/src/mesa/drivers/dri/i965/brw_batch.c b/src/mesa/drivers/dri/i965/brw_batch.c
new file mode 100644
index 0000000..33ca414
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_batch.c
@@ -0,0 +1,1079 @@ 
+/*
+ * Copyright (c) 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Chris Wilson <chris@chris-wilson.co.uk>
+ *
+ */
+#include "brw_batch.h"
+#include "brw_context.h" /* XXX brw_finish_batch() */
+
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <stdlib.h>
+#include <setjmp.h>
+
+#include <intel_bufmgr.h>
+#include <i915_drm.h>
+#include <xf86drm.h>
+#include <errno.h>
+
+#include "intel_screen.h"
+
+/*
+ * When submitting commands to the GPU every cycle of latency counts;
+ * mutexes, spinlocks, even atomics quickly add to substantial overhead.
+ *
+ * This "batch manager" acts as thread-local shim over the buffer manager
+ * (drm_intel_bufmgr_gem). As we are only ever used from within a single
+ * context, we can rely on the upper layers providing thread safety. This
+ * allows us to import buffers from the shared screen (sharing buffers
+ * between multiple contexts, threads and users) and wrap that handle in
+ * our own. Similarly, we want to share the buffer cache between all users
+ * on the file and so allocate from the global threadsafe buffer manager,
+ * with a very small and transient local cache of active buffers.
+ *
+ * The batch manager provides a cheap way of busyness tracking and very
+ * efficient batch construction and kernel submission.
+ *
+ * The restrictions over and above the generic submission engine in
+ * intel_bufmgr_gem are:
+ * 	- not thread-safe
+ * 	- flat relocations, only the batch buffer itself carries
+ * 	  relocations. Relocations relative to auxiliary buffers
+ * 	  must be performed via STATE_BASE
+ * 	- direct mapping of the batch for writes, expect reads
+ * 	  from the batch to be slow
+ * 	- the batch is a fixed 64k in size
+ * 	- access to the batch must be wrapped by brw_batch_begin/_end
+ * 	- all relocations must be immediately written into the batch
+ */
+
+/**
+ * Number of bytes to reserve for commands necessary to complete a batch.
+ *
+ * This includes:
+ * - MI_BATCHBUFFER_END (4 bytes)
+ * - Optional MI_NOOP for ensuring the batch length is qword aligned (4 bytes)
+ * - Any state emitted by vtbl->finish_batch():
+ *   - Gen4-5 record ending occlusion query values (4 * 4 = 16 bytes)
+ *   - Disabling OA counters on Gen6+ (3 DWords = 12 bytes)
+ *   - Ending MI_REPORT_PERF_COUNT on Gen5+, plus associated PIPE_CONTROLs:
+ *     - Two sets of PIPE_CONTROLs, which become 3 PIPE_CONTROLs each on SNB,
+ *       which are 4 DWords each ==> 2 * 3 * 4 * 4 = 96 bytes
+ *     - 3 DWords for MI_REPORT_PERF_COUNT itself on Gen6+.  ==> 12 bytes.
+ *       On Ironlake, it's 6 DWords, but we have some slack due to the lack of
+ *       Sandybridge PIPE_CONTROL madness.
+ */
+#define BATCH_RESERVED 146
+
+/* Surface offsets are limited to a maximum of 64k from the surface base */
+#define BATCH_SIZE (64 << 10)
+
+/* Execobject flag set on the exec entry of any bo marked as pinned
+ * (see brw_batch_reloc()).
+ * NOTE(review): defined locally, presumably because the i915_drm.h being
+ * built against does not provide it yet - confirm against the kernel uapi.
+ */
+#define EXEC_OBJECT_PINNED	(1<<4)
+
+/*
+ * One batch submission, used to track which buffers the GPU may still be
+ * using.
+ */
+struct brw_request {
+   /* link:  node in batch->requests[ring]; once retired, link.next is reused
+    *        as the "next" pointer of the batch->freed_rq chain.
+    * write: list of bos whose write.rq is this request.
+    * read:  list of bos whose read.rq is this request.
+    */
+   struct list link, write, read;
+   /* The bo holding the batch itself; its busyness stands for the request. */
+   struct brw_bo *bo;
+   /* Index into batch->requests[] that this request is queued on. */
+   int ring;
+};
+
+/**
+ * Ask the kernel whether @bo is still busy.
+ *
+ * Issues DRM_IOCTL_I915_GEM_BUSY on the fd of the batch that owns the bo.
+ * The ioctl's return value is not inspected, so if the query fails the
+ * zero-initialised reply is left in place and the bo is reported as idle.
+ */
+static bool __brw_bo_busy(struct brw_bo *bo)
+{
+   struct drm_i915_gem_busy query;
+
+   /* Clearing the whole struct also presets the busy field to 0. */
+   memset(&query, 0, sizeof(query));
+   query.handle = bo->handle;
+
+   drmIoctl(bo->batch->fd, DRM_IOCTL_I915_GEM_BUSY, &query);
+
+   return query.busy;
+}
+
+/**
+ * Retire @rq and every request queued before it on the same ring.
+ *
+ * Requests are popped from the tail of batch->requests[rq->ring] one at a
+ * time until @rq itself has been processed. For each retired request:
+ *  - every bo on its write list has its write tracking cleared;
+ *  - every bo on its read list has its read tracking cleared and is freed
+ *    if it no longer holds any references;
+ *  - the request is pushed onto the batch->freed_rq chain for reuse.
+ *
+ * The caller is expected to have established that @rq is complete; nothing
+ * here queries the kernel.
+ */
+static void __brw_request_retire(struct brw_batch *batch,
+                                 struct brw_request *rq)
+{
+   struct brw_request *tmp;
+
+   do {
+      /* Take the request at the tail of the ring's list. */
+      tmp = list_last_entry(&batch->requests[rq->ring],
+                            struct brw_request, link);
+      list_del(&tmp->link);
+
+      /* Drop the write tracking of every bo written by this request. */
+      while (!list_is_empty(&tmp->write)) {
+         struct brw_bo *bo;
+
+         bo = list_first_entry(&tmp->write,
+                               struct brw_bo,
+                               write.link);
+
+         assert(bo->write.rq == tmp);
+         assert(bo->read.rq);
+
+         list_del(&bo->write.link);
+         bo->write.rq = 0;
+      }
+
+      while (!list_is_empty(&tmp->read)) {
+         struct brw_bo *bo;
+
+         bo = list_first_entry(&tmp->read,
+                               struct brw_bo,
+                               read.link);
+         assert(bo->read.rq == tmp);
+         /* A write still outstanding here was not on tmp's write list (that
+          * list was emptied above) and is asserted to belong to another
+          * ring; retire that request too before dropping the read tracking.
+          */
+         if (bo->write.rq) {
+            assert(bo->write.rq->ring != rq->ring);
+            __brw_request_retire(batch, bo->write.rq);
+         }
+
+         list_del(&bo->read.link);
+         bo->read.rq = 0;
+
+         /* No request tracks the bo any more; free it if unreferenced. */
+         if (!bo->refcnt)
+            __brw_bo_free(bo);
+      }
+
+      /* Do not leave the throttle slot pointing at a recycled request. */
+      if (tmp == batch->throttle[1])
+         batch->throttle[1] = NULL;
+
+      /* Push onto the freed chain, reusing link.next as the next pointer. */
+      tmp->link.next = (struct list *)batch->freed_rq;
+      batch->freed_rq = tmp;
+   } while (tmp != rq);
+}
+
+/**
+ * Report whether @rq is still busy on the GPU.
+ *
+ * Busyness is determined by querying the request's batch bo. When the
+ * request turns out to be idle it is retired (together with all earlier
+ * requests on its ring) before returning false.
+ */
+bool brw_request_busy(struct brw_request *rq)
+{
+   bool busy = __brw_bo_busy(rq->bo);
+
+   if (!busy)
+      __brw_request_retire(rq->bo->batch, rq);
+
+   return busy;
+}
+
+/**
+ * Prepare the batch for building a new request.
+ *
+ * A request (and its batch bo) is obtained, in order of preference, from:
+ *  1. the chain of previously retired requests (batch->freed_rq);
+ *  2. retiring the last entry of the current ring's request list, if its
+ *     bo is no longer busy;
+ *  3. a fresh allocation with a new BATCH_SIZE bo.
+ * If the chosen bo cannot be mapped, it is released and the last entry of
+ * the ring's request list is mapped instead.
+ *
+ * On success the emit counters, reserved space, state offset, aperture
+ * estimate and flags are reset and the batch bo is placed on the new
+ * request's read list.
+ *
+ * Returns 0 on success or -ENOMEM if no request or mapping can be obtained.
+ */
+static int brw_batch_reset(struct brw_batch *batch)
+{
+   struct brw_request *rq;
+
+   rq = batch->freed_rq;
+   if (rq == NULL && !list_is_empty(&batch->requests[batch->ring])) {
+      rq = list_last_entry(&batch->requests[batch->ring],
+                           struct brw_request, link);
+      if (!__brw_bo_busy(rq->bo))
+         __brw_request_retire(batch, rq);
+      /* Retiring (if it happened) refilled the freed chain. */
+      rq = batch->freed_rq;
+   }
+   if (rq == NULL) {
+      rq = malloc(sizeof(*rq));
+      if (rq == NULL)
+         return -ENOMEM;
+
+      rq->bo = brw_bo_create(batch, "batch", BATCH_SIZE, 0);
+      if (rq->bo == NULL) {
+         /* Do not carry a NULL bo into brw_bo_map() below. */
+         free(rq);
+         return -ENOMEM;
+      }
+   } else
+      batch->freed_rq = (struct brw_request *)rq->link.next;
+
+   batch->map = brw_bo_map(rq->bo, MAP_WRITE);
+   if (batch->map == NULL) {
+      brw_bo_put(rq->bo);
+      free(rq);
+
+      if (list_is_empty(&batch->requests[batch->ring]))
+         return -ENOMEM;
+
+      /* NOTE(review): this request is still linked on the ring's request
+       * list and its read/write lists are reinitialised below without
+       * being retired first - confirm that is intended.
+       */
+      rq = list_last_entry(&batch->requests[batch->ring],
+                           struct brw_request, link);
+
+      /* forces the synchronization */
+      batch->map = brw_bo_map(rq->bo, MAP_WRITE);
+      if (batch->map == NULL)
+         return -ENOMEM;
+   }
+
+   batch->bo = rq->bo;
+   batch->emit.nbatch = 0;
+   batch->emit.nexec = 0;
+   batch->emit.nself = 0;
+   batch->emit.nreloc = 0;
+   batch->next_request = rq;
+
+   batch->reserved = BATCH_RESERVED;
+   /* State offset, in dwords, starts at the very end of the batch. */
+   batch->state = BATCH_SIZE / 4;
+   batch->aperture = 0;
+   batch->batch_flags = batch->batch_base_flags;
+
+   list_init(&rq->read);
+   list_init(&rq->write);
+
+   /* The batch bo itself is always read by the request it carries. */
+   list_add(&rq->bo->read.link, &rq->read);
+   rq->bo->read.rq = rq;
+   return 0;
+}
+
+/**
+ * Initialise @batch for use with @screen.
+ *
+ * Allocates the relocation and execobject arrays, initialises the tracking
+ * lists, records the ring mapping and feature flags, creates a hardware
+ * context on gen6+ and finally prepares the first request via
+ * brw_batch_reset().
+ *
+ * Returns 0 on success, -ENOMEM if an allocation fails, -ENODEV if a
+ * hardware context is required but could not be created, or the error
+ * returned by brw_batch_reset(). On failure the relocation and execobject
+ * arrays are freed and their pointers set to NULL.
+ */
+int brw_batch_init(struct brw_batch *batch,
+		   struct intel_screen *screen)
+{
+   const struct brw_device_info *devinfo;
+   int n, ret;
+
+   batch->fd = intel_screen_to_fd(screen);
+   batch->bufmgr = screen->bufmgr;
+   batch->screen = screen;
+
+   devinfo = screen->devinfo;
+
+   batch->needs_pipecontrol_ggtt_wa = devinfo->gen == 6;
+   batch->reloc_size = 512;
+   batch->exec_size = 256;
+   batch->reloc = malloc(sizeof(batch->reloc[0])*batch->reloc_size);
+   batch->exec = malloc(sizeof(batch->exec[0])*batch->exec_size);
+   if (batch->reloc == NULL || batch->exec == NULL) {
+      ret = -ENOMEM;
+      goto err;
+   }
+
+   list_init(&batch->borrowed);
+   list_init(&batch->active);
+   for (n = 0; n < __BRW_NUM_RINGS; n++)
+      list_init(&batch->requests[n]);
+
+   batch->hw_ring[RENDER_RING] = I915_EXEC_RENDER;
+   batch->hw_ring[BLT_RING] = devinfo->gen >= 6 ? I915_EXEC_BLT :I915_EXEC_RENDER;
+
+   /* XXX hard-coded until real kernel feature detection is added */
+   batch->has_wc = true;
+   batch->has_llc = devinfo->has_llc;
+   batch->has_softpin = true;
+   batch->has_lut = true;
+   /* 2GiB; the unsigned literal keeps the shift from overflowing an int. */
+   batch->max_aperture = 2u<<30;
+
+   batch->batch_base_flags = I915_EXEC_NO_RELOC;
+   if (batch->has_lut)
+      batch->batch_base_flags |= I915_EXEC_HANDLE_LUT;
+
+   if (devinfo->gen >= 6) {
+      /* Create a new hardware context.  Using a hardware context means that
+       * our GPU state will be saved/restored on context switch, allowing us
+       * to assume that the GPU is in the same state we left it in.
+       *
+       * This is required for transform feedback buffer offsets, query objects,
+       * and also allows us to reduce how much state we have to emit.
+       */
+      struct drm_i915_gem_context_create create;
+
+      memset(&create, 0, sizeof(create));
+      drmIoctl(batch->fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE, &create);
+      /* A failed ioctl leaves ctx_id at 0, which is caught just below. */
+      batch->hw_ctx = create.ctx_id;
+      if (!batch->hw_ctx) {
+         fprintf(stderr, "Gen6+ requires Kernel 3.6 or later.\n");
+         ret = -ENODEV;
+         goto err;
+      }
+   }
+
+   ret = brw_batch_reset(batch);
+   if (ret == 0)
+      return 0;
+
+   /* NOTE(review): a hardware context created above is not destroyed on
+    * this path.
+    */
+err:
+   /* free(NULL) is a no-op, so this covers a partial allocation too. */
+   free(batch->reloc);
+   batch->reloc = NULL;
+   free(batch->exec);
+   batch->exec = NULL;
+   return ret;
+}
+
+/**
+ * Read hardware register @reg through DRM_IOCTL_I915_REG_READ.
+ *
+ * The ioctl's return value is not checked; if the read fails the value
+ * returned is the zero the request was initialised with.
+ */
+uint64_t brw_batch_read_register(struct brw_batch *batch,
+				 uint32_t reg)
+{
+   struct drm_i915_reg_read args;
+
+   memset(&args, 0, sizeof(args));
+   args.offset = reg;
+
+   drmIoctl(batch->fd, DRM_IOCTL_I915_REG_READ, &args);
+
+   return args.val;
+}
+
+/** Set the batch's debug flag; nothing in this function ever clears it. */
+void brw_batch_enable_debug(struct brw_batch *batch)
+{
+   batch->debug = true;
+}
+
+/**
+ * Clear the dirty flag on the bos at the front of the pending request's
+ * write list.
+ *
+ * The walk stops at the first bo that is not dirty.
+ * NOTE(review): this relies on newly dirtied bos being moved to the head of
+ * the write list (brw_batch_reloc() uses list_move) - confirm list_move
+ * inserts at the head.
+ */
+void brw_batch_clear_dirty(struct brw_batch *batch)
+{
+   struct brw_request_node *node;
+
+   list_for_each_entry(node, &batch->next_request->write, link) {
+      struct brw_bo *bo = container_of(node, struct brw_bo, write);
+
+      if (bo->dirty)
+         bo->dirty = false;
+      else
+         break;
+   }
+}
+
+/**
+ * Grow batch->exec, the array of execobjects, to (up to) twice its size.
+ *
+ * Does not return on failure: longjmps to batch->jmpbuf with -ENOMEM if the
+ * array is already at its maximum size or if realloc() fails.
+ *
+ * When realloc() moves the array, the cached bo->exec pointer of every bo
+ * on the pending request's read list is rebased onto the new allocation.
+ */
+static void __brw_batch_grow_exec(struct brw_batch *batch)
+{
+   struct drm_i915_gem_exec_object2 *new_exec;
+   uint16_t new_size;
+
+   if (batch->exec_size == UINT16_MAX)
+      longjmp(batch->jmpbuf, -ENOMEM);
+
+   /* new_size is only 16 bits wide: doubling 32768 would wrap to 0 and turn
+    * the realloc() below into a zero-sized request, so saturate instead.
+    */
+   if (batch->exec_size > UINT16_MAX / 2)
+      new_size = UINT16_MAX;
+   else
+      new_size = batch->exec_size * 2;
+
+   new_exec = realloc(batch->exec, new_size*sizeof(new_exec[0]));
+   if (new_exec == NULL)
+      longjmp(batch->jmpbuf, -ENOMEM);
+
+   if (new_exec != batch->exec) {
+      struct brw_bo *bo;
+
+      /* Keep each bo pointing at the same slot index in the new array.
+       * A bo without a slot (exec == NULL) must stay that way rather than
+       * being turned into a bogus non-NULL pointer.
+       */
+      list_for_each_entry(bo, &batch->next_request->read, read.link) {
+         if (bo->exec)
+            bo->exec = new_exec + (bo->exec - batch->exec);
+      }
+
+      batch->exec = new_exec;
+   }
+
+   batch->exec_size = new_size;
+}
+
+/**
+ * Grow batch->reloc, the array of relocation entries, to (up to) twice its
+ * size.
+ *
+ * Does not return on failure: longjmps to batch->jmpbuf with -ENOMEM if the
+ * array is already at its maximum size or if realloc() fails.
+ */
+static void __brw_batch_grow_reloc(struct brw_batch *batch)
+{
+   struct drm_i915_gem_relocation_entry *new_reloc;
+   uint16_t new_size;
+
+   if (batch->reloc_size == UINT16_MAX)
+      longjmp(batch->jmpbuf, -ENOMEM);
+
+   /* new_size is only 16 bits wide: doubling 32768 would wrap to 0 and turn
+    * the realloc() below into a zero-sized request, so saturate instead.
+    */
+   if (batch->reloc_size > UINT16_MAX / 2)
+      new_size = UINT16_MAX;
+   else
+      new_size = batch->reloc_size * 2;
+
+   new_reloc = realloc(batch->reloc, new_size*sizeof(new_reloc[0]));
+   if (new_reloc == NULL)
+      longjmp(batch->jmpbuf, -ENOMEM);
+
+   batch->reloc = new_reloc;
+   batch->reloc_size = new_size;
+}
+
+/**
+ * Record that the batch refers to @target_bo at @batch_offset and return
+ * the address to write into the batch.
+ *
+ * @batch:         batch being constructed
+ * @batch_offset:  offset stored in the relocation entry
+ * @target_bo:     buffer being referenced, may be NULL
+ * @target_offset: offset into @target_bo
+ * @read_domains:  read domains stored in the relocation entry
+ * @write_domain:  write domain stored in the relocation entry; non-zero
+ *                 marks the bo as written by the pending request
+ *
+ * Returns @target_offset alone when @target_bo is NULL, otherwise the bo's
+ * presumed offset plus @target_offset.
+ *
+ * On allocation failure this does not return; it longjmps to batch->jmpbuf
+ * with -ENOMEM.
+ */
+uint64_t brw_batch_reloc(struct brw_batch *batch,
+			 uint32_t batch_offset,
+			 struct brw_bo *target_bo,
+			 uint32_t target_offset,
+			 unsigned read_domains,
+			 unsigned write_domain)
+{
+   if (!target_bo)
+      return target_offset;
+
+   assert(target_bo->refcnt);
+   if (target_bo->batch != batch) {
+      /* XXX legal sharing between contexts/threads? */
+      target_bo = brw_bo_import(batch, target_bo->base, true);
+      if (target_bo == NULL)
+         longjmp(batch->jmpbuf, -ENOMEM);
+      /* Give back the reference taken by the import. */
+      target_bo->refcnt--;
+   }
+   assert(target_bo->batch == batch);
+
+   /* First use of this bo in the pending request: give it an exec slot. */
+   if (!target_bo->exec && target_bo != batch->bo) {
+      int n;
+
+      n = batch->emit.nexec++;
+      /* Reserve one entry for the batch: brw_batch_flush() stores the
+       * batch's own execobject at exec[nexec] without growing the array, so
+       * slot n + 1 has to exist as well as slot n.
+       */
+      if (n + 1 >= batch->exec_size)
+         __brw_batch_grow_exec(batch);
+
+      /* With HANDLE_LUT, relocations name the slot index, not the handle. */
+      target_bo->target_handle = batch->has_lut ? n : target_bo->handle;
+      target_bo->exec = memset(batch->exec + n, 0, sizeof(*target_bo->exec));
+      target_bo->exec->handle = target_bo->handle;
+      target_bo->exec->offset = target_bo->offset;
+      if (target_bo->pinned)
+         target_bo->exec->flags = EXEC_OBJECT_PINNED;
+
+      target_bo->read.rq = batch->next_request;
+      list_move_tail(&target_bo->read.link, &batch->next_request->read);
+
+      batch->aperture += target_bo->base->size;
+   }
+
+   /* Pinned bos get no relocation entry. */
+   if (!target_bo->pinned) {
+      int n;
+
+      if (batch->emit.nreloc == batch->reloc_size)
+         __brw_batch_grow_reloc(batch);
+
+      n = batch->emit.nreloc++;
+      batch->reloc[n].offset = batch_offset;
+      batch->reloc[n].delta = target_offset;
+      batch->reloc[n].target_handle = target_bo->target_handle;
+      batch->reloc[n].presumed_offset = target_bo->offset;
+      batch->reloc[n].read_domains = read_domains;
+      batch->reloc[n].write_domain = write_domain;
+
+      /* Remember relocations that point back into the batch itself; their
+       * target handle is filled in later. Only the first 256 indices are
+       * stored, but all of them are counted.
+       */
+      if (target_bo == batch->bo) {
+         int m = batch->emit.nself++;
+         if (m < 256)
+            batch->self_reloc[m] = n;
+      }
+   }
+
+   /* First write to this bo since its dirty flag was last cleared. */
+   if (write_domain && !target_bo->dirty) {
+      target_bo->write.rq = batch->next_request;
+      list_move(&target_bo->write.link, &batch->next_request->write);
+      assert(target_bo->write.rq == target_bo->read.rq);
+      target_bo->dirty = true;
+      target_bo->domain = DOMAIN_GPU;
+      if (batch->has_lut) {
+         target_bo->exec->flags |= EXEC_OBJECT_WRITE;
+         if (write_domain == I915_GEM_DOMAIN_INSTRUCTION &&
+             batch->needs_pipecontrol_ggtt_wa)
+            target_bo->exec->flags |= EXEC_OBJECT_NEEDS_GTT;
+      }
+   }
+
+   return target_bo->offset + target_offset;
+}
+
+/**
+ * Close the batch for submission and return its length in bytes.
+ *
+ * Releases the reserved space, lets brw_finish_batch() emit its closing
+ * commands, optionally logs usage statistics (DEBUG_BATCH) and writes the
+ * batch terminator at the current emit position.
+ *
+ * The returned length covers the terminator and is rounded up to an even
+ * number of dwords.
+ */
+static uint32_t __brw_batch_finish(struct brw_batch *batch)
+{
+   batch->reserved = 0;
+
+   brw_finish_batch(batch);
+
+   if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
+      int cmd_bytes = 4 * batch->emit.nbatch;
+      int state_bytes = batch->bo->size - 4*batch->state;
+      int used_bytes = cmd_bytes + state_bytes;
+
+      fprintf(stderr, "Batchbuffer flush on ring %d with %4db (pkt) + "
+              "%4db (state) = %4db (%0.1f%%), with %d buffers and %d relocations\n",
+              batch->ring, cmd_bytes, state_bytes,
+              used_bytes, 100.0f * used_bytes / BATCH_SIZE,
+              batch->emit.nexec, batch->emit.nreloc);
+   }
+
+   /* 0xa << 23 is the MI_BATCH_BUFFER_END command dword. */
+   batch->map[batch->emit.nbatch] = 0xa << 23;
+
+   /* (nbatch + 1) dwords, rounded up to a multiple of two, times 4 bytes. */
+   return 4*((batch->emit.nbatch + 2) & ~1);
+}
+
+/**
+ * Apply the frame and flush throttling policies for request @rq.
+ *
+ * batch->throttle[0] records the first request seen since the last swap
+ * throttle and batch->throttle[1] the one recorded before that. With
+ * DEBUG_SYNC set, additionally waits for @rq's bo.
+ */
+static void
+throttle(struct brw_batch *batch, struct brw_request *rq)
+{
+   /* Wait for the swapbuffers before the one we just emitted, so we
+    * don't get too many swaps outstanding for apps that are GPU-heavy
+    * but not CPU-heavy.
+    *
+    * We're using intelDRI2Flush (called from the loader before
+    * swapbuffer) and glFlush (for front buffer rendering) as the
+    * indicator that a frame is done and then throttle when we get
+    * here as we prepare to render the next frame.  At this point for
+    * round trips for swap/copy and getting new buffers are done and
+    * we'll spend less time waiting on the GPU.
+    *
+    * Unfortunately, we don't have a handle to the batch containing
+    * the swap, and getting our hands on that doesn't seem worth it,
+    * so we just use the first batch we emitted after the last swap.
+    */
+   if (batch->need_swap_throttle && batch->throttle[0]) {
+      if (batch->throttle[1] && !batch->disable_throttling)
+         brw_bo_wait(batch->throttle[1]->bo, -1);
+      /* Age the recorded request by one frame. */
+      batch->throttle[1] = batch->throttle[0];
+      batch->throttle[0] = NULL;
+
+      /* A swap throttle also satisfies any pending flush throttle. */
+      batch->need_swap_throttle = false;
+      batch->need_flush_throttle = false;
+   }
+   if (batch->throttle[0] == NULL)
+      batch->throttle[0] = rq;
+
+   /* Otherwise defer to the kernel's own throttling. */
+   if (batch->need_flush_throttle) {
+      drmCommandNone(batch->fd, DRM_I915_GEM_THROTTLE);
+      batch->need_flush_throttle = false;
+   }
+
+   if (unlikely(INTEL_DEBUG & DEBUG_SYNC)) {
+      fprintf(stderr, "waiting for idle\n");
+      brw_bo_wait(rq->bo, -1);
+   }
+}
+
+static void __brw_batch_fixup_self_relocations(struct brw_batch *batch)
+{
+   uint32_t target = batch->bo->target_handle;
+   int n, count;
+   /* Point batch-to-batch relocations at target; the first 256 are indexed via self_reloc[]. */
+   count = MIN2(batch->emit.nself, 256);
+   for (n = 0; n < count; n++)
+      batch->reloc[batch->self_reloc[n]].target_handle = target;
+   if (n == 256) { /* table full: later self-relocations are expected to carry the -1 marker */
+      for (n = batch->self_reloc[255] + 1; n < batch->emit.nreloc; n++) { /* n indexes reloc[]: bound by nreloc */
+         if (batch->reloc[n].target_handle == -1)
+            batch->reloc[n].target_handle = target;
+      }
+   }
+}
+
+static void
+__brw_batch_dump(struct brw_batch *batch)
+{
+   struct drm_intel_decode *decode;
+
+   decode = drm_intel_decode_context_alloc(batch->screen->deviceID);
+   if (!decode)
+      return;
+
+   drm_intel_decode_set_batch_pointer(decode,
+                                      batch->map, batch->bo->offset,
+                                      batch->emit.nbatch + 1);
+
+   drm_intel_decode_set_output_file(decode, stderr);
+   drm_intel_decode(decode);
+
+   drm_intel_decode_context_free(decode);
+
+   brw_debug_batch(batch);
+}
+
+int brw_batch_flush(struct brw_batch *batch)
+{
+   struct drm_i915_gem_execbuffer2 execbuf;
+   struct drm_i915_gem_exec_object2 *exec;
+   struct brw_request *rq = batch->next_request;
+   struct brw_bo *bo;
+   /* Returns 0 if nothing was emitted, a negative errno on ioctl failure, else brw_batch_reset(). */
+   if (batch->emit.nbatch == 0)
+      return 0;
+
+   if (unlikely(INTEL_DEBUG & DEBUG_AUB))
+      brw_annotate_batch(batch);
+   /* The request's bo takes the final exec slot and carries every relocation. */
+   rq->bo->target_handle = batch->has_lut ? batch->emit.nexec : rq->bo->handle;
+   exec = memset(&batch->exec[batch->emit.nexec], 0, sizeof(*exec));
+   exec->handle = rq->bo->handle;
+   exec->relocation_count = batch->emit.nreloc;
+   exec->relocs_ptr = (uintptr_t)batch->reloc;
+   exec->offset = rq->bo->offset;
+   if (rq->bo->pinned)
+      exec->flags = EXEC_OBJECT_PINNED;
+
+   __brw_batch_fixup_self_relocations(batch);
+
+   rq->bo->exec = exec;
+   rq->ring = batch->ring;
+
+   memset(&execbuf, 0, sizeof(execbuf));
+   execbuf.buffers_ptr = (uintptr_t)batch->exec;
+   execbuf.buffer_count = batch->emit.nexec + 1;
+   execbuf.batch_len = __brw_batch_finish(batch);
+   if (batch->ring == RENDER_RING || batch->has_softpin)
+      execbuf.rsvd1 = batch->hw_ctx;
+   execbuf.flags = batch->hw_ring[batch->ring] | batch->batch_flags;
+   if (drmIoctl(batch->fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf)) {
+      if (errno == ENOSPC)
+         return -ENOSPC; /* emit state is left untouched; brw_batch_end() reacts to this code */
+
+      fprintf(stderr,
+              "Failed to submit batch buffer, rendering will be incorrect: %s [%d]\n",
+              strerror(errno), errno);
+
+      /* submit a dummy execbuf to keep the fences accurate */
+      batch->map[0] = 0xa << 23;
+      execbuf.batch_len = 8;
+
+      if (drmIoctl(batch->fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf))
+         return -errno;
+   }
+   /* Submitted: copy back the offsets left in the exec objects and clear per-batch bo state. */
+   list_for_each_entry(bo, &rq->read, read.link) {
+      bo->offset = bo->exec->offset;
+      bo->exec = 0;
+      bo->dirty = false;
+      bo->target_handle = -1;
+   }
+   rq->bo->pinned = batch->has_softpin;
+   /* Queue the request on its ring's list. */
+   list_add(&rq->link, &batch->requests[batch->ring]);
+
+   if (unlikely(INTEL_DEBUG & DEBUG_BATCH))
+      __brw_batch_dump(batch);
+
+   throttle(batch, rq);
+
+   return brw_batch_reset(batch);
+}
+
+bool brw_batch_busy(struct brw_batch *batch)
+{
+   struct brw_request *head;
+   int ring = 0;
+
+   /* Probe the first request on each ring, retiring those whose bo is not busy. */
+   while (ring < __BRW_NUM_RINGS) {
+      struct list *queue = &batch->requests[ring++];
+
+      if (!list_is_empty(queue)) {
+         head = list_first_entry(queue, struct brw_request, link);
+         if (__brw_bo_busy(head->bo))
+            return true; /* this ring's first request is reported busy */
+
+         __brw_request_retire(batch, head);
+      }
+   }
+   return false;
+}
+
+void brw_batch_wait(struct brw_batch *batch)
+{
+   int ring;
+
+   /* Wait on the first request of every ring that has requests queued. */
+   for (ring = 0; ring < __BRW_NUM_RINGS; ring++) {
+      struct list *queue = &batch->requests[ring];
+      struct brw_request *head;
+
+      if (!list_is_empty(queue)) {
+         head = list_first_entry(queue, struct brw_request, link);
+
+         brw_bo_wait(head->bo, -1);
+      }
+   }
+}
+
+struct brw_bo *brw_bo_import(struct brw_batch *batch,
+			     drm_intel_bo *base,
+			     bool borrow)
+{
+   struct brw_bo *bo;
+   uint32_t tiling, swizzling;
+   /* Wrap a libdrm bo in a batch-local brw_bo; NULL if base is NULL or malloc fails. */
+   if (base == NULL)
+      return NULL;
+
+   assert(base->handle);
+   assert(base->size);
+   /* A borrowed handle already on batch->borrowed just gains a reference. */
+   if (borrow) {
+      /* XXX may need a ht? */
+      list_for_each_entry(bo, &batch->borrowed, link)
+	 if (bo->handle == base->handle) {
+            bo->refcnt++;
+            return bo;
+         }
+   }
+   /* Reuse a struct from the freed_bo chain (linked through ->base) before calling malloc. */
+   if (batch->freed_bo) {
+      bo = batch->freed_bo;
+      batch->freed_bo = (struct brw_bo *)bo->base;
+   } else {
+      bo = malloc(sizeof(*bo));
+      if (bo == NULL)
+	 return NULL; /* NOTE(review): base is not unreferenced on this path */
+   }
+
+   memset(bo, 0, sizeof(*bo));
+
+   bo->handle = base->handle;
+   bo->batch = batch;
+   bo->refcnt = 1;
+   bo->offset = base->offset64;
+   bo->size = base->size;
+
+   drm_intel_bo_get_tiling(base, &tiling, &swizzling);
+   bo->tiling = tiling;
+   bo->swizzling = swizzling;
+   bo->reusable = !borrow;
+
+   list_init(&bo->read.link);
+   list_init(&bo->write.link);
+
+   list_init(&bo->link);
+   /* Borrowed: take our own reference on base. Otherwise no extra reference is taken here. */
+   bo->base = base;
+   if (borrow) {
+      drm_intel_bo_reference(base);
+      list_add(&bo->link, &batch->borrowed);
+   }
+
+   return bo;
+}
+
+struct brw_bo *brw_bo_create(struct brw_batch *batch,
+			     const char *name,
+			     uint64_t size,
+			     uint64_t alignment)
+{
+   drm_intel_bo *base;
+
+   /* XXX for-render? */
+   base = drm_intel_bo_alloc(batch->bufmgr, name, size, alignment);
+   if (!base)
+      return 0;
+
+   return brw_bo_import(batch, base, false);
+}
+
+static uint64_t brw_surface_size(int cpp,
+				 uint32_t width,
+				 uint32_t height,
+				 uint32_t tiling,
+				 uint32_t *pitch)
+{
+   uint32_t tile_width, tile_height;
+   /* Row stride is aligned to tile_width bytes, the row count to tile_height. */
+   switch (tiling) {
+   default:
+   case I915_TILING_NONE:
+      tile_width = 64;
+      tile_height = 2;
+      break;
+   case I915_TILING_X:
+      tile_width = 512;
+      tile_height = 8;
+      break;
+   case I915_TILING_Y:
+      tile_width = 128;
+      tile_height = 32;
+      break;
+   }
+   /* Stores the aligned stride in *pitch and returns stride * aligned rows in bytes. */
+   *pitch = ALIGN(width * cpp, tile_width);
+   height = ALIGN(height, tile_height);
+   return (uint64_t)*pitch * height; /* widen first: the 32-bit product can wrap */
+}
+
+struct brw_bo *
+brw_bo_create_tiled(struct brw_batch *batch,
+		    const char *name,
+		    uint32_t width,
+		    uint32_t height,
+		    int cpp,
+		    uint32_t *tiling,
+		    uint32_t *pitch,
+		    unsigned flags)
+{
+   unsigned long __pitch;
+   drm_intel_bo *base;
+   struct brw_bo *bo;
+   /* For render targets, first try to recycle a bo from batch->active. */
+   if (flags & BO_ALLOC_FOR_RENDER) {
+      uint64_t size = brw_surface_size(cpp, width, height,
+				       *tiling, pitch);
+      list_for_each_entry(bo, &batch->active, link) {
+	 if (bo->size < size || bo->size > 2*size)
+	    continue;
+	 /* Large enough and at most twice the request: take it off the list. */
+	 list_del(&bo->link);
+	 drm_intel_bo_set_tiling(bo->base, tiling, *pitch); /* NOTE(review): bo->tiling is not refreshed */
+	 bo->refcnt++;
+	 return bo;
+      }
+   }
+   /* Otherwise allocate a new tiled bo from the buffer manager and wrap it. */
+   base = drm_intel_bo_alloc_tiled(batch->bufmgr, name,
+				   width, height, cpp,
+				   tiling, &__pitch, flags);
+   if (!base)
+      return NULL;
+
+   *pitch = __pitch;
+   return brw_bo_import(batch, base, false);
+}
+
+struct brw_bo *brw_bo_create_from_name(struct brw_batch *batch,
+				       const char *name,
+				       uint32_t global_name)
+{
+   drm_intel_bo *base;
+   struct brw_bo *bo;
+
+   base = drm_intel_bo_gem_create_from_name(batch->bufmgr,
+					    name, global_name);
+   if (!base)
+      return 0;
+
+   bo = brw_bo_import(batch, base, true);
+   drm_intel_bo_unreference(base);
+
+   return bo;
+}
+
+int brw_bo_madvise(struct brw_bo *bo, int state)
+{
+   return drm_intel_bo_madvise(bo->base, state);
+}
+
+int brw_bo_wait(struct brw_bo *bo, int64_t timeout)
+{
+   struct drm_i915_gem_wait wait;
+
+   memset(&wait, 0, sizeof(wait));
+   wait.bo_handle = bo->handle;
+   wait.timeout_ns = timeout;
+   wait.flags = 0;
+
+   if (drmIoctl(bo->batch->fd, DRM_IOCTL_I915_GEM_WAIT, &wait))
+      return -errno;
+
+   if (bo->read.rq)
+      __brw_request_retire(bo->batch, bo->read.rq);
+
+   return 0;
+}
+
+void brw_bo_write(struct brw_bo *bo,
+		  uint64_t offset,
+		  const void *data,
+		  uint64_t length,
+		  unsigned flags)
+{
+   struct drm_i915_gem_pwrite pwrite;
+   void *map;
+
+   assert(offset < bo->size);
+   assert(length <= bo->size - offset);
+
+   map = brw_bo_map(bo, MAP_WRITE | flags);
+   if (map) {
+      memcpy(map + offset, data, length);
+      return;
+   }
+
+   memset(&pwrite, 0, sizeof(pwrite));
+   pwrite.handle = bo->handle;
+   pwrite.offset = offset;
+   pwrite.size = length;
+   pwrite.data_ptr = (uint64_t)(uintptr_t)data;
+   if (drmIoctl(bo->batch->fd, DRM_IOCTL_I915_GEM_PWRITE, &pwrite))
+      return;
+
+   if (bo->read.rq)
+      __brw_request_retire(bo->batch, bo->read.rq);
+}
+
+void brw_bo_read(struct brw_bo *bo,
+		 uint64_t offset,
+		 void *data,
+		 uint64_t length,
+		 unsigned flags)
+{
+   struct drm_i915_gem_pread pread;
+   void *map;
+
+   map = brw_bo_map(bo, MAP_READ | flags);
+   if (map) {
+      memcpy(data, map + offset, length);
+      return;
+   }
+
+   memset(&pread, 0, sizeof(pread));
+   pread.handle = bo->handle;
+   pread.offset = offset;
+   pread.size = length;
+   pread.data_ptr = (uint64_t)(uintptr_t)data;
+   if (drmIoctl(bo->batch->fd, DRM_IOCTL_I915_GEM_PREAD, &pread))
+      return;
+
+   if (bo->write.rq)
+      __brw_request_retire(bo->batch, bo->write.rq);
+}
+
+static void brw_bo_set_domain(struct brw_bo *bo, unsigned domain, bool write)
+{
+   struct drm_i915_gem_set_domain set_domain;
+   struct brw_request *rq;
+
+   if (bo->exec) /* flush failed, pretend we are ASYNC | INCOHERENT */
+      return;
+
+   memset(&set_domain, 0, sizeof(set_domain));
+   set_domain.handle = bo->handle;
+   set_domain.read_domains = domain;
+   if (write)
+      set_domain.write_domain = domain;
+
+   if (drmIoctl(bo->batch->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &set_domain))
+      return;
+
+   rq = write ? bo->read.rq : bo->write.rq;
+   if (rq)
+      __brw_request_retire(bo->batch, rq);
+}
+
+static void *brw_bo_map__gtt(struct brw_bo *bo, unsigned flags)
+{
+   if (bo->map__gtt == NULL)
+      bo->map__gtt = drm_intel_gem_bo_map__gtt(bo->base);
+
+   if ((flags & MAP_ASYNC) == 0)
+      brw_bo_set_domain(bo, I915_GEM_DOMAIN_GTT, flags & MAP_WRITE);
+
+   return bo->map__gtt;
+}
+
+static void *brw_bo_map__wc(struct brw_bo *bo, unsigned flags)
+{
+   if (!bo->batch->has_wc)
+      return brw_bo_map__gtt(bo, flags);
+
+   if (bo->map__wc == NULL) {
+      bo->map__wc = drm_intel_gem_bo_map__wc(bo->base);
+      if (bo->map__wc == NULL) {
+	 bo->batch->has_wc = false;
+	 return brw_bo_map__gtt(bo, flags);
+      }
+   }
+   assert(bo->map__wc);
+
+   if ((flags & MAP_ASYNC) == 0)
+      brw_bo_set_domain(bo, I915_GEM_DOMAIN_GTT, flags & MAP_WRITE);
+
+   return bo->map__wc;
+}
+
+static void *brw_bo_map__cpu(struct brw_bo *bo, unsigned flags)
+{
+   if (bo->map__cpu == NULL)
+      bo->map__cpu = drm_intel_gem_bo_map__cpu(bo->base);
+   assert(bo->map__cpu);
+
+   if ((flags & MAP_ASYNC) == 0)
+      brw_bo_set_domain(bo, I915_GEM_DOMAIN_CPU, flags & MAP_WRITE);
+
+   return bo->map__cpu;
+}
+
+void *brw_bo_map(struct brw_bo *bo, unsigned flags)
+{
+   if ((flags & MAP_ASYNC) == 0)
+      brw_bo_flush(bo);
+
+   if (bo->tiling && (flags & MAP_DETILED) == 0)
+      return brw_bo_map__gtt(bo, flags);
+   else if (bo->batch->has_llc || (flags & (MAP_WRITE | MAP_COHERENT)) == 0)
+      return brw_bo_map__cpu(bo, flags);
+   else
+      return brw_bo_map__wc(bo, flags);
+}
+
+uint32_t brw_bo_flink(struct brw_bo *bo)
+{
+   uint32_t name = 0;
+   drm_intel_bo_flink(bo->base, &name);
+   return name;
+}
+
+void  __brw_bo_free(struct brw_bo *bo)
+{
+   assert(bo->refcnt == 0);
+   /* Still attached to a request: park reusable bos on batch->active and free nothing yet. */
+   if (bo->read.rq) {
+      if (bo->reusable)
+	 list_add(&bo->link, &bo->batch->active);
+      return;
+   }
+
+   assert(!bo->write.rq);
+   list_del(&bo->link);
+   /* Hand the offset back to the libdrm bo, drop it, and chain this struct onto freed_bo. */
+   bo->base->offset64 = bo->offset;
+   drm_intel_bo_unreference(bo->base);
+   bo->base = (drm_intel_bo *)bo->batch->freed_bo;
+   bo->batch->freed_bo = bo;
+}
+
+int brw_batch_begin(struct brw_batch *batch, uint32_t count, int ring)
+{
+   if (batch->next_request == NULL)
+      return -ENOMEM;
+   /* Moving to a ring backed by a different hw ring flushes first (result ignored). */
+   if (ring != batch->ring &&
+       batch->hw_ring[ring] != batch->hw_ring[batch->ring])
+      brw_batch_flush(batch);
+   /* Flush when count more dwords would pass batch->state - batch->reserved. */
+   if (batch->emit.nbatch + count > batch->state - batch->reserved) {
+      int ret = brw_batch_flush(batch);
+      if (ret)
+         return ret;
+   }
+
+   batch->ring = ring;
+   /* Returns 0 on the direct path, 1 when re-entered through brw_batch_end()'s longjmp(). */
+   if (batch->emit.nbatch == 0)
+      brw_start_batch(batch);
+   /* Snapshot the emit counters; brw_batch_end() restores them before retrying. */
+   assert(batch->ring == ring);
+   batch->saved = batch->emit;
+   return setjmp(batch->jmpbuf); /* NOTE(review): C11 7.13 -- longjmp() after this frame returns is undefined */
+}
+
+int brw_batch_end(struct brw_batch *batch)
+{
+   int ret;
+   /* No flush is attempted while batch->aperture is below max_aperture. */
+   if (batch->aperture < batch->max_aperture)
+      return 0;
+
+   ret = brw_batch_flush(batch);
+   if (ret == 0)
+      return 0;
+   /* Flush failed and the snapshot taken in brw_batch_begin() is empty: report the error. */
+   if (batch->saved.nbatch == 0)
+      return ret;
+   /* Roll back to the brw_batch_begin() snapshot and submit only that part. */
+   batch->emit = batch->saved;
+
+   ret = brw_batch_flush(batch);
+   if (ret != -ENOSPC)
+      return ret;
+   /* NOTE(review): ret == 0 returns above without replaying the rolled-back commands -- confirm. */
+   longjmp(batch->jmpbuf, 1);
+}
+
+static void __brw_request_free(struct brw_request *rq)
+{
+   brw_bo_put(rq->bo);
+   free(rq);
+}
+
+void brw_batch_fini(struct brw_batch *batch)
+{
+   int n;
+
+   for (n = 0; n < __BRW_NUM_RINGS; n++) { /* retire via the first request on each ring */
+      struct brw_request *rq;
+
+      if (list_is_empty(&batch->requests[n]))
+         continue;
+
+      rq = list_first_entry(&batch->requests[n], struct brw_request, link);
+      __brw_request_retire(batch, rq);
+   }
+
+   while (batch->freed_rq) { /* recycled requests, chained through link.next */
+      struct brw_request *rq = batch->freed_rq;
+
+      batch->freed_rq = (struct brw_request *)rq->link.next;
+      __brw_request_free(rq);
+   }
+   /* next_request may be NULL: brw_batch_begin() reports that state as -ENOMEM. */
+   if (batch->next_request)
+      __brw_request_free(batch->next_request);
+
+   while (batch->freed_bo) { /* recycled brw_bo structs, chained through ->base */
+      struct brw_bo *bo = batch->freed_bo;
+
+      batch->freed_bo = (struct brw_bo *)bo->base;
+      free(bo);
+   }
+
+   free(batch->exec);
+   free(batch->reloc);
+
+   if (batch->hw_ctx) {
+      struct drm_i915_gem_context_destroy destroy;
+
+      memset(&destroy, 0, sizeof(destroy));
+      destroy.ctx_id = batch->hw_ctx;
+      drmIoctl(batch->fd, DRM_IOCTL_I915_GEM_CONTEXT_DESTROY, &destroy);
+   }
+}
diff --git a/src/mesa/drivers/dri/i965/brw_batch.h b/src/mesa/drivers/dri/i965/brw_batch.h
new file mode 100644
index 0000000..bece59f
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_batch.h
@@ -0,0 +1,309 @@ 
+#ifndef BRW_BATCH_H
+#define BRW_BATCH_H
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <string.h>
+#include <setjmp.h>
+#include <assert.h>
+
+#include "brw_list.h"
+
+struct _drm_intel_bufmgr;
+struct _drm_intel_bo;
+
+struct intel_screen;
+
+enum brw_gpu_ring {
+   RENDER_RING = 0,
+   BLT_RING,
+   __BRW_NUM_RINGS,
+};
+
+struct brw_batch;
+struct brw_bo;
+struct brw_request;
+
+enum brw_bo_domain { DOMAIN_NONE, DOMAIN_CPU, DOMAIN_GTT, DOMAIN_GPU };
+
+struct brw_bo {
+   struct brw_batch *batch;
+   struct drm_i915_gem_exec_object2 *exec;
+   struct brw_request_node {
+      struct brw_request *rq;
+      struct list link;
+   } read, write;
+
+   bool dirty : 1;
+   unsigned domain : 2;
+   unsigned tiling : 4;
+   unsigned swizzling : 4;
+   unsigned pinned : 1;
+   unsigned reusable : 1;
+
+   unsigned refcnt;
+   uint32_t handle;
+   uint32_t target_handle;
+   uint64_t size;
+   uint64_t offset;
+
+   struct _drm_intel_bo *base;
+   struct list link;
+
+   void *map__cpu;
+   void *map__gtt;
+   void *map__wc;
+};
+
+struct brw_batch {
+   int fd;
+
+   struct brw_bo *bo;
+   uint32_t *map;
+
+   uint32_t batch_flags;
+   uint32_t batch_base_flags;
+
+   int ring;
+   uint32_t hw_ctx;
+
+   uint16_t reserved;
+   uint16_t state;
+
+   struct brw_batch_state {
+      uint16_t nbatch;
+      uint16_t nexec;
+      uint16_t nreloc;
+      uint16_t nself;
+   } emit, saved;
+
+   uint64_t aperture;
+   uint64_t max_aperture;
+
+   bool has_softpin : 1;
+   bool has_lut : 1;
+   bool has_llc : 1;
+   bool has_wc : 1;
+   bool needs_pipecontrol_ggtt_wa : 1;
+
+   /** Framerate throttling: @{ */
+   /* Limit the number of outstanding SwapBuffers by waiting for an earlier
+    * frame of rendering to complete. This gives a very precise cap to the
+    * latency between input and output such that rendering never gets more
+    * than a frame behind the user. (With the caveat that we technically are
+    * not using the SwapBuffers itself as a barrier but the first batch
+    * submitted afterwards, which may be immediately prior to the next
+    * SwapBuffers.)
+    */
+   bool need_swap_throttle : 1;
+
+   /** General throttling, not caught by throttling between SwapBuffers */
+   bool need_flush_throttle : 1;
+   bool disable_throttling : 1;
+   /** @} */
+
+   bool debug : 1;
+
+   jmp_buf jmpbuf;
+
+   uint16_t exec_size;
+   uint16_t reloc_size;
+
+   struct drm_i915_gem_exec_object2 *exec;
+   struct drm_i915_gem_relocation_entry *reloc;
+   uint16_t self_reloc[256];
+
+   int hw_ring[__BRW_NUM_RINGS];
+   struct list requests[__BRW_NUM_RINGS];
+   struct brw_request *throttle[2];
+   struct brw_request *next_request;
+   struct brw_request *freed_rq;
+
+   struct intel_screen *screen;
+   struct _drm_intel_bufmgr *bufmgr;
+   struct list active;
+   struct list borrowed;
+   struct list alive;
+
+   struct brw_bo *freed_bo;
+};
+
+uint64_t brw_batch_read_register(struct brw_batch *batch, uint32_t reg);
+
+int brw_batch_init(struct brw_batch *batch,
+		   struct intel_screen *screen);
+
+void brw_batch_enable_debug(struct brw_batch *batch);
+void brw_batch_clear_dirty(struct brw_batch *batch);
+
+/** Add a relocation entry to the current batch */
+uint64_t brw_batch_reloc(struct brw_batch *batch,
+			 uint32_t batch_offset,
+			 struct brw_bo *target_bo,
+			 uint32_t target_offset,
+			 unsigned read_domains,
+			 unsigned write_domain);
+
+int brw_batch_flush(struct brw_batch *batch);
+
+/** Wait for the last submitted rendering to complete */
+void brw_batch_wait(struct brw_batch *batch);
+
+void brw_batch_fini(struct brw_batch *batch);
+
+struct brw_bo *
+brw_bo_import(struct brw_batch *batch,
+	      struct _drm_intel_bo *base,
+	      bool borrow);
+
+struct brw_bo *
+brw_bo_create(struct brw_batch *batch,
+	      const char *name,
+	      uint64_t size,
+	      uint64_t alignment);
+
+struct brw_bo *
+brw_bo_create_tiled(struct brw_batch *batch,
+		    const char *name,
+		    uint32_t width,
+		    uint32_t height,
+		    int cpp,
+		    uint32_t *tiling,
+		    uint32_t *pitch,
+		    unsigned flags);
+
+struct brw_bo *brw_bo_create_from_name(struct brw_batch *batch,
+				       const char *name,
+				       uint32_t global_name);
+
+int brw_bo_madvise(struct brw_bo *bo, int madv);
+int brw_bo_wait(struct brw_bo *bo, int64_t timeout);
+uint32_t brw_bo_flink(struct brw_bo *bo);
+
+void brw_bo_write(struct brw_bo *bo, uint64_t offset,
+		  const void *data, uint64_t length,
+		  unsigned flags);
+void brw_bo_read(struct brw_bo *bo, uint64_t offset,
+		 void *data, uint64_t length,
+		 unsigned flags);
+
+bool brw_request_busy(struct brw_request *rq);
+
+static inline void brw_bo_flush(struct brw_bo *bo)
+{
+   assert(bo->refcnt);
+   if (bo->exec)
+      brw_batch_flush(bo->batch);
+}
+
+static inline bool brw_bo_busy(struct brw_bo *bo, unsigned flags)
+#define BUSY_READ 0
+#define BUSY_WRITE 1
+#define BUSY_FLUSH 2
+{
+   struct brw_request *rq;
+
+   if (!bo)
+      return false;
+
+   assert(bo->refcnt);
+   rq = flags & BUSY_WRITE ? bo->read.rq : bo->write.rq;
+   if (!rq)
+      return false;
+
+   if (flags & BUSY_FLUSH) {
+      brw_bo_flush(bo);
+      return brw_request_busy(rq);
+   }
+
+   return true;
+}
+
+void *brw_bo_map(struct brw_bo *bo, unsigned flags);
+#define MAP_READ 0x0
+#define MAP_WRITE 0x1
+#define MAP_ASYNC 0x2
+#define MAP_COHERENT 0x4
+#define MAP_DETILED 0x8
+
+static inline struct brw_bo *brw_bo_get(struct brw_bo *bo)
+{
+   assert(bo == NULL || bo->refcnt > 0);
+   if (bo)
+      bo->refcnt++;
+   return bo;
+}
+
+void  __brw_bo_free(struct brw_bo *bo);
+static inline void brw_bo_put(struct brw_bo *bo)
+{
+   assert(bo == NULL || bo->refcnt > 0);
+   if (bo && --bo->refcnt == 0)
+      __brw_bo_free(bo);
+}
+
+int brw_batch_begin(struct brw_batch *batch, uint32_t count, int ring);
+int brw_batch_end(struct brw_batch *batch);
+bool brw_batch_busy(struct brw_batch *batch);
+
+static inline void __brw_batch_check(struct brw_batch *batch, int count, int ring)
+{
+   assert(batch->emit.nbatch + count < batch->state - batch->reserved);
+   assert(batch->ring == ring);
+}
+
+static inline void brw_batch_emit(struct brw_batch *batch, uint32_t dw)
+{
+   batch->map[batch->emit.nbatch++] = dw;
+}
+
+static inline void brw_batch_data(struct brw_batch *batch,
+				  const void *data,
+				  int bytes)
+{
+   assert(batch->emit.nbatch + bytes/4 < batch->state - batch->reserved);
+   assert((bytes & 3) == 0);
+   memcpy(batch->map + batch->emit.nbatch, data, bytes);
+   batch->emit.nbatch += bytes / 4;
+}
+
+static inline uint32_t float_as_int(float f)
+{
+   uint32_t bits;
+
+   /* Copy the float's object representation into a 32-bit integer;
+    * the value is reinterpreted, not converted numerically.
+    */
+   memcpy(&bits, &f, sizeof(bits));
+   return bits;
+}
+
+static inline void brw_batch_emit_f(struct brw_batch *batch, float f)
+{
+   brw_batch_emit(batch, float_as_int(f));
+}
+
+static inline void brw_batch_emit64(struct brw_batch *batch, uint64_t qw)
+{
+   memcpy(batch->map + batch->emit.nbatch, &qw, sizeof(qw)); /* nbatch may be odd: no aligned uint64_t store assumed */
+   batch->emit.nbatch += 2;
+}
+
+#define BEGIN_BATCH(n) __brw_batch_check(&brw->batch, n, RENDER_RING)
+#define BEGIN_BATCH_BLT(n) __brw_batch_check(&brw->batch, n, BLT_RING)
+#define OUT_BATCH(dw) brw_batch_emit(&brw->batch, dw)
+#define OUT_BATCH_F(f) brw_batch_emit_f(&brw->batch, f)
+#define OUT_RELOC(bo, read_domains, write_domain, delta) \
+	brw_batch_emit(&brw->batch, \
+		       brw_batch_reloc(&brw->batch, brw->batch.emit.nbatch*4, \
+				       bo, delta, \
+				       read_domains, write_domain))
+#define OUT_BATCH64(qw) brw_batch_emit64(&brw->batch, qw)
+#define OUT_RELOC64(bo, read_domains, write_domain, delta) \
+	brw_batch_emit64(&brw->batch, \
+			 brw_batch_reloc(&brw->batch, brw->batch.emit.nbatch*4,\
+					 bo, delta, \
+					 read_domains, write_domain))
+#define ADVANCE_BATCH()
+
+#endif /* BRW_BATCH_H */
diff --git a/src/mesa/drivers/dri/i965/brw_binding_tables.c b/src/mesa/drivers/dri/i965/brw_binding_tables.c
index 98ff0dd..697b4c7 100644
--- a/src/mesa/drivers/dri/i965/brw_binding_tables.c
+++ b/src/mesa/drivers/dri/i965/brw_binding_tables.c
@@ -42,7 +42,6 @@ 
 #include "brw_context.h"
 #include "brw_defines.h"
 #include "brw_state.h"
-#include "intel_batchbuffer.h"
 
 /**
  * Upload a shader stage's binding table as indirect state.
diff --git a/src/mesa/drivers/dri/i965/brw_blorp.cpp b/src/mesa/drivers/dri/i965/brw_blorp.cpp
index 2ccfae1..d08edbf 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp.cpp
+++ b/src/mesa/drivers/dri/i965/brw_blorp.cpp
@@ -22,7 +22,6 @@ 
  */
 
 #include <errno.h>
-#include "intel_batchbuffer.h"
 #include "intel_fbo.h"
 
 #include "brw_blorp.h"
@@ -211,7 +210,9 @@  brw_blorp_exec(struct brw_context *brw, const brw_blorp_params *params)
 {
    struct gl_context *ctx = &brw->ctx;
    uint32_t estimated_max_batch_usage = 1500;
-   bool check_aperture_failed_once = false;
+
+   if (brw_batch_begin(&brw->batch, estimated_max_batch_usage, RENDER_RING) < 0)
+      return;
 
    /* Flush the sampler and render caches.  We definitely need to flush the
     * sampler cache so that we get updated contents from the render cache for
@@ -222,13 +223,6 @@  brw_blorp_exec(struct brw_context *brw, const brw_blorp_params *params)
     */
    brw_emit_mi_flush(brw);
 
-retry:
-   intel_batchbuffer_require_space(brw, estimated_max_batch_usage, RENDER_RING);
-   intel_batchbuffer_save_state(brw);
-   drm_intel_bo *saved_bo = brw->batch.bo;
-   uint32_t saved_used = brw->batch.used;
-   uint32_t saved_state_batch_offset = brw->batch.state_batch_offset;
-
    switch (brw->gen) {
    case 6:
       gen6_blorp_exec(brw, params);
@@ -241,37 +235,17 @@  retry:
       unreachable("not reached");
    }
 
-   /* Make sure we didn't wrap the batch unintentionally, and make sure we
-    * reserved enough space that a wrap will never happen.
-    */
-   assert(brw->batch.bo == saved_bo);
-   assert((brw->batch.used - saved_used) * 4 +
-          (saved_state_batch_offset - brw->batch.state_batch_offset) <
-          estimated_max_batch_usage);
-   /* Shut up compiler warnings on release build */
-   (void)saved_bo;
-   (void)saved_used;
-   (void)saved_state_batch_offset;
-
    /* Check if the blorp op we just did would make our batch likely to fail to
     * map all the BOs into the GPU at batch exec time later.  If so, flush the
     * batch and try again with nothing else in the batch.
     */
-   if (dri_bufmgr_check_aperture_space(&brw->batch.bo, 1)) {
-      if (!check_aperture_failed_once) {
-         check_aperture_failed_once = true;
-         intel_batchbuffer_reset_to_saved(brw);
-         intel_batchbuffer_flush(brw);
-         goto retry;
-      } else {
-         int ret = intel_batchbuffer_flush(brw);
-         WARN_ONCE(ret == -ENOSPC,
-                   "i965: blorp emit exceeded available aperture space\n");
-      }
+   if (brw_batch_end(&brw->batch)) {
+      WARN_ONCE(1, "i965: blorp emit exceeded available aperture space\n");
+      return;
    }
 
    if (unlikely(brw->always_flush_batch))
-      intel_batchbuffer_flush(brw);
+      brw_batch_flush(&brw->batch);
 
    /* We've smashed all state compared to what the normal 3D pipeline
     * rendering tracks for GL.
@@ -279,11 +253,6 @@  retry:
    brw->ctx.NewDriverState = ~0ull;
    brw->no_depth_or_stencil = false;
    brw->ib.type = -1;
-
-   /* Flush the sampler cache so any texturing from the destination is
-    * coherent.
-    */
-   brw_emit_mi_flush(brw);
 }
 
 brw_hiz_op_params::brw_hiz_op_params(struct intel_mipmap_tree *mt,
diff --git a/src/mesa/drivers/dri/i965/brw_cc.c b/src/mesa/drivers/dri/i965/brw_cc.c
index 354c733..4f62b29 100644
--- a/src/mesa/drivers/dri/i965/brw_cc.c
+++ b/src/mesa/drivers/dri/i965/brw_cc.c
@@ -36,7 +36,6 @@ 
 #include "brw_util.h"
 #include "main/macros.h"
 #include "main/stencil.h"
-#include "intel_batchbuffer.h"
 
 static void
 brw_upload_cc_vp(struct brw_context *brw)
@@ -227,17 +226,14 @@  static void upload_cc_unit(struct brw_context *brw)
       cc->cc5.statistics_enable = 1;
 
    /* BRW_NEW_CC_VP */
-   cc->cc4.cc_viewport_state_offset = (brw->batch.bo->offset64 +
-				       brw->cc.vp_offset) >> 5; /* reloc */
+   cc->cc4.cc_viewport_state_offset =
+      brw_batch_reloc(&brw->batch,
+		      (brw->cc.state_offset +
+		       offsetof(struct brw_cc_unit_state, cc4)),
+		      brw->batch.bo, brw->cc.vp_offset,
+		      I915_GEM_DOMAIN_INSTRUCTION, 0) >> 5;
 
    brw->ctx.NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
-
-   /* Emit CC viewport relocation */
-   drm_intel_bo_emit_reloc(brw->batch.bo,
-			   (brw->cc.state_offset +
-			    offsetof(struct brw_cc_unit_state, cc4)),
-			   brw->batch.bo, brw->cc.vp_offset,
-			   I915_GEM_DOMAIN_INSTRUCTION, 0);
 }
 
 const struct brw_tracked_state brw_cc_unit = {
diff --git a/src/mesa/drivers/dri/i965/brw_clear.c b/src/mesa/drivers/dri/i965/brw_clear.c
index a6524aa..ef146a7 100644
--- a/src/mesa/drivers/dri/i965/brw_clear.c
+++ b/src/mesa/drivers/dri/i965/brw_clear.c
@@ -32,7 +32,6 @@ 
 #include "swrast/swrast.h"
 #include "drivers/common/meta.h"
 
-#include "intel_batchbuffer.h"
 #include "intel_blit.h"
 #include "intel_fbo.h"
 #include "intel_mipmap_tree.h"
diff --git a/src/mesa/drivers/dri/i965/brw_clip.c b/src/mesa/drivers/dri/i965/brw_clip.c
index 3a73c64..e044375 100644
--- a/src/mesa/drivers/dri/i965/brw_clip.c
+++ b/src/mesa/drivers/dri/i965/brw_clip.c
@@ -33,8 +33,6 @@ 
 #include "main/macros.h"
 #include "main/enums.h"
 
-#include "intel_batchbuffer.h"
-
 #include "brw_defines.h"
 #include "brw_context.h"
 #include "brw_eu.h"
diff --git a/src/mesa/drivers/dri/i965/brw_clip_line.c b/src/mesa/drivers/dri/i965/brw_clip_line.c
index 8e34f7c..65db789 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_line.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_line.c
@@ -34,8 +34,6 @@ 
 #include "main/enums.h"
 #include "program/program.h"
 
-#include "intel_batchbuffer.h"
-
 #include "brw_defines.h"
 #include "brw_context.h"
 #include "brw_eu.h"
diff --git a/src/mesa/drivers/dri/i965/brw_clip_point.c b/src/mesa/drivers/dri/i965/brw_clip_point.c
index 81487d3..9c886ff 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_point.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_point.c
@@ -34,8 +34,6 @@ 
 #include "main/enums.h"
 #include "program/program.h"
 
-#include "intel_batchbuffer.h"
-
 #include "brw_defines.h"
 #include "brw_context.h"
 #include "brw_eu.h"
diff --git a/src/mesa/drivers/dri/i965/brw_clip_state.c b/src/mesa/drivers/dri/i965/brw_clip_state.c
index 3223834..3770213 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_state.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_state.c
@@ -130,16 +130,14 @@  brw_upload_clip_unit(struct brw_context *brw)
        ctx->ViewportArray[0].Width == (float) fb->Width &&
        ctx->ViewportArray[0].Height == (float) fb->Height)
    {
+      /* emit clip viewport relocation */
       clip->clip5.guard_band_enable = 1;
       clip->clip6.clipper_viewport_state_ptr =
-         (brw->batch.bo->offset64 + brw->clip.vp_offset) >> 5;
-
-      /* emit clip viewport relocation */
-      drm_intel_bo_emit_reloc(brw->batch.bo,
-                              (brw->clip.state_offset +
-                               offsetof(struct brw_clip_unit_state, clip6)),
-                              brw->batch.bo, brw->clip.vp_offset,
-                              I915_GEM_DOMAIN_INSTRUCTION, 0);
+	 brw_batch_reloc(&brw->batch,
+			 (brw->clip.state_offset +
+			  offsetof(struct brw_clip_unit_state, clip6)),
+			 brw->batch.bo, brw->clip.vp_offset,
+			 I915_GEM_DOMAIN_INSTRUCTION, 0) >> 5;
    }
 
    /* _NEW_TRANSFORM */
diff --git a/src/mesa/drivers/dri/i965/brw_clip_tri.c b/src/mesa/drivers/dri/i965/brw_clip_tri.c
index cca7eb1..64db7e4 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_tri.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_tri.c
@@ -34,8 +34,6 @@ 
 #include "main/enums.h"
 #include "program/program.h"
 
-#include "intel_batchbuffer.h"
-
 #include "brw_defines.h"
 #include "brw_context.h"
 #include "brw_eu.h"
diff --git a/src/mesa/drivers/dri/i965/brw_clip_unfilled.c b/src/mesa/drivers/dri/i965/brw_clip_unfilled.c
index 6baf620..48c2648 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_unfilled.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_unfilled.c
@@ -34,8 +34,6 @@ 
 #include "main/enums.h"
 #include "program/program.h"
 
-#include "intel_batchbuffer.h"
-
 #include "brw_defines.h"
 #include "brw_context.h"
 #include "brw_eu.h"
diff --git a/src/mesa/drivers/dri/i965/brw_clip_util.c b/src/mesa/drivers/dri/i965/brw_clip_util.c
index 40ad144..7b953b2 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_util.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_util.c
@@ -35,8 +35,6 @@ 
 #include "main/enums.h"
 #include "program/program.h"
 
-#include "intel_batchbuffer.h"
-
 #include "brw_defines.h"
 #include "brw_context.h"
 #include "brw_eu.h"
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index 7be3f5c..6bfaa48 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -53,13 +53,13 @@ 
 #include "brw_draw.h"
 #include "brw_state.h"
 
-#include "intel_batchbuffer.h"
 #include "intel_buffer_objects.h"
 #include "intel_buffers.h"
 #include "intel_fbo.h"
 #include "intel_mipmap_tree.h"
 #include "intel_pixel.h"
 #include "intel_image.h"
+#include "intel_reg.h"
 #include "intel_tex.h"
 #include "intel_tex_obj.h"
 
@@ -187,7 +187,8 @@  intel_update_state(struct gl_context * ctx, GLuint new_state)
 	 continue;
       intel_miptree_all_slices_resolve_depth(brw, tex_obj->mt);
       intel_miptree_resolve_color(brw, tex_obj->mt);
-      brw_render_cache_set_check_flush(brw, tex_obj->mt->bo);
+      if (tex_obj->mt->bo->dirty)
+	 brw_emit_mi_flush(brw);
    }
 
    _mesa_lock_context_textures(ctx);
@@ -215,7 +216,7 @@  intel_flush_front(struct gl_context *ctx)
           * performance.
           */
          intel_resolve_for_dri2_flush(brw, driDrawable);
-         intel_batchbuffer_flush(brw);
+         brw_batch_flush(&brw->batch);
 
          flushFront(screen)(driDrawable, driDrawable->loaderPrivate);
 
@@ -232,10 +233,10 @@  intel_glFlush(struct gl_context *ctx)
 {
    struct brw_context *brw = brw_context(ctx);
 
-   intel_batchbuffer_flush(brw);
+   brw_batch_flush(&brw->batch);
    intel_flush_front(ctx);
 
-   brw->need_flush_throttle = true;
+   brw->batch.need_flush_throttle = true;
 }
 
 static void
@@ -245,8 +246,7 @@  intel_finish(struct gl_context * ctx)
 
    intel_glFlush(ctx);
 
-   if (brw->batch.last_bo)
-      drm_intel_bo_wait_rendering(brw->batch.last_bo);
+   brw_batch_wait(&brw->batch);
 }
 
 static void
@@ -626,15 +626,6 @@  brw_process_driconf_options(struct brw_context *brw)
    driParseConfigFiles(options, &brw->intelScreen->optionCache,
                        brw->driContext->driScreenPriv->myNum, "i965");
 
-   int bo_reuse_mode = driQueryOptioni(options, "bo_reuse");
-   switch (bo_reuse_mode) {
-   case DRI_CONF_BO_REUSE_DISABLED:
-      break;
-   case DRI_CONF_BO_REUSE_ALL:
-      intel_bufmgr_gem_enable_reuse(brw->bufmgr);
-      break;
-   }
-
    if (!driQueryOptionb(options, "hiz")) {
        brw->has_hiz = false;
        /* On gen6, you can only do separate stencil with HIZ. */
@@ -654,7 +645,7 @@  brw_process_driconf_options(struct brw_context *brw)
 
    if (driQueryOptionb(options, "disable_throttling")) {
       fprintf(stderr, "disabling flush throttling\n");
-      brw->disable_throttling = true;
+      brw->batch.disable_throttling = true;
    }
 
    brw->precompile = driQueryOptionb(&brw->optionCache, "shader_precompile");
@@ -710,7 +701,18 @@  brwCreateContext(gl_api api,
    driContextPriv->driverPrivate = brw;
    brw->driContext = driContextPriv;
    brw->intelScreen = screen;
-   brw->bufmgr = screen->bufmgr;
+
+   if (brw_batch_init(&brw->batch, screen)) {
+      fprintf(stderr, "%s: failed to alloc batch\n", __func__);
+      *dri_ctx_error = __DRI_CTX_ERROR_NO_MEMORY;
+      return false;
+   }
+
+   if (brw_init_pipe_control(brw, devinfo)) {
+      fprintf(stderr, "%s: failed to alloc workarounds\n", __func__);
+      *dri_ctx_error = __DRI_CTX_ERROR_NO_MEMORY;
+      return false;
+   }
 
    brw->gen = devinfo->gen;
    brw->gt = devinfo->gt;
@@ -803,31 +805,6 @@  brwCreateContext(gl_api api,
 
    intel_fbo_init(brw);
 
-   intel_batchbuffer_init(brw);
-
-   if (brw->gen >= 6) {
-      /* Create a new hardware context.  Using a hardware context means that
-       * our GPU state will be saved/restored on context switch, allowing us
-       * to assume that the GPU is in the same state we left it in.
-       *
-       * This is required for transform feedback buffer offsets, query objects,
-       * and also allows us to reduce how much state we have to emit.
-       */
-      brw->hw_ctx = drm_intel_gem_context_create(brw->bufmgr);
-
-      if (!brw->hw_ctx) {
-         fprintf(stderr, "Gen6+ requires Kernel 3.6 or later.\n");
-         intelDestroyContext(driContextPriv);
-         return false;
-      }
-   }
-
-   if (brw_init_pipe_control(brw, devinfo)) {
-      *dri_ctx_error = __DRI_CTX_ERROR_NO_MEMORY;
-      intelDestroyContext(driContextPriv);
-      return false;
-   }
-
    brw_init_state(brw);
 
    intelInitExtensions(ctx);
@@ -909,7 +886,7 @@  intelDestroyContext(__DRIcontext * driContextPriv)
 
    /* Dump a final BMP in case the application doesn't call SwapBuffers */
    if (INTEL_DEBUG & DEBUG_AUB) {
-      intel_batchbuffer_flush(brw);
+      brw_batch_flush(&brw->batch);
       aub_dump_bmp(&brw->ctx);
    }
 
@@ -927,15 +904,10 @@  intelDestroyContext(__DRIcontext * driContextPriv)
    brw_destroy_state(brw);
    brw_draw_destroy(brw);
 
-   drm_intel_bo_unreference(brw->curbe.curbe_bo);
-   if (brw->vs.base.scratch_bo)
-      drm_intel_bo_unreference(brw->vs.base.scratch_bo);
-   if (brw->gs.base.scratch_bo)
-      drm_intel_bo_unreference(brw->gs.base.scratch_bo);
-   if (brw->wm.base.scratch_bo)
-      drm_intel_bo_unreference(brw->wm.base.scratch_bo);
-
-   drm_intel_gem_context_destroy(brw->hw_ctx);
+   brw_bo_put(brw->curbe.curbe_bo);
+   brw_bo_put(brw->vs.base.scratch_bo);
+   brw_bo_put(brw->gs.base.scratch_bo);
+   brw_bo_put(brw->wm.base.scratch_bo);
 
    if (ctx->swrast_context) {
       _swsetup_DestroyContext(&brw->ctx);
@@ -947,12 +919,7 @@  intelDestroyContext(__DRIcontext * driContextPriv)
       _swrast_DestroyContext(&brw->ctx);
 
    brw_fini_pipe_control(brw);
-   intel_batchbuffer_free(brw);
-
-   drm_intel_bo_unreference(brw->throttle_batch[1]);
-   drm_intel_bo_unreference(brw->throttle_batch[0]);
-   brw->throttle_batch[1] = NULL;
-   brw->throttle_batch[0] = NULL;
+   brw_batch_fini(&brw->batch);
 
    driDestroyOptionCache(&brw->optionCache);
 
@@ -1291,7 +1258,7 @@  intel_query_dri2_buffers(struct brw_context *brw,
        * query, we need to make sure all the pending drawing has landed in the
        * real front buffer.
        */
-      intel_batchbuffer_flush(brw);
+      brw_batch_flush(&brw->batch);
       intel_flush_front(&brw->ctx);
 
       attachments[i++] = __DRI_BUFFER_FRONT_LEFT;
@@ -1303,7 +1270,7 @@  intel_query_dri2_buffers(struct brw_context *brw,
        * So before doing the query, make sure all the pending drawing has
        * landed in the real front buffer.
        */
-      intel_batchbuffer_flush(brw);
+      brw_batch_flush(&brw->batch);
       intel_flush_front(&brw->ctx);
    }
 
@@ -1344,7 +1311,7 @@  intel_process_dri2_buffer(struct brw_context *brw,
                           const char *buffer_name)
 {
    struct gl_framebuffer *fb = drawable->driverPrivate;
-   drm_intel_bo *bo;
+   struct brw_bo *bo;
 
    if (!rb)
       return;
@@ -1368,7 +1335,7 @@  intel_process_dri2_buffer(struct brw_context *brw,
 	* name, then drm_intel_bo_flink() is a low-cost getter.  It does not
 	* create a new name.
 	*/
-      drm_intel_bo_flink(last_mt->bo, &old_name);
+      old_name = brw_bo_flink(last_mt->bo);
    }
 
    if (old_name == buffer->name)
@@ -1381,9 +1348,7 @@  intel_process_dri2_buffer(struct brw_context *brw,
               buffer->cpp, buffer->pitch);
    }
 
-   intel_miptree_release(&rb->mt);
-   bo = drm_intel_bo_gem_create_from_name(brw->bufmgr, buffer_name,
-                                          buffer->name);
+   bo = brw_bo_create_from_name(&brw->batch, buffer_name, buffer->name);
    if (!bo) {
       fprintf(stderr,
               "Failed to open BO for returned DRI2 buffer "
@@ -1394,9 +1359,11 @@  intel_process_dri2_buffer(struct brw_context *brw,
       return;
    }
 
+   /* NOTE(review): intel_miptree_release(&rb->mt) left disabled here; confirm it is unneeded and drop. */
    intel_update_winsys_renderbuffer_miptree(brw, rb, bo,
                                             drawable->w, drawable->h,
                                             buffer->pitch);
+   brw_bo_put(bo);
 
    if (brw_is_front_buffer_drawing(fb) &&
        (buffer->attachment == __DRI_BUFFER_FRONT_LEFT ||
@@ -1406,8 +1373,6 @@  intel_process_dri2_buffer(struct brw_context *brw,
    }
 
    assert(rb->mt);
-
-   drm_intel_bo_unreference(bo);
 }
 
 /**
@@ -1449,12 +1414,15 @@  intel_update_image_buffer(struct brw_context *intel,
    else
       last_mt = rb->singlesample_mt;
 
-   if (last_mt && last_mt->bo == buffer->bo)
+   if (last_mt && last_mt->bo->handle == buffer->bo->handle)
       return;
 
-   intel_update_winsys_renderbuffer_miptree(intel, rb, buffer->bo,
+   struct brw_bo *bo = brw_bo_import(&intel->batch, buffer->bo, true);
+
+   intel_update_winsys_renderbuffer_miptree(intel, rb, bo,
                                             buffer->width, buffer->height,
                                             buffer->pitch);
+   brw_bo_put(bo);
 
    if (brw_is_front_buffer_drawing(fb) &&
        buffer_type == __DRI_IMAGE_BUFFER_FRONT &&
@@ -1518,3 +1486,91 @@  intel_update_image_buffers(struct brw_context *brw, __DRIdrawable *drawable)
                                 __DRI_IMAGE_BUFFER_BACK);
    }
 }
+
+/**
+ * Called when starting a new batch buffer; returns early unless RENDER_RING.
+ */
+void
+brw_start_batch(struct brw_batch *batch)
+{
+   struct brw_context *brw = container_of(batch, struct brw_context, batch);
+
+   if (batch->ring != RENDER_RING)
+      return;
+
+   /* If the kernel supports hardware contexts, then most hardware state is
+    * preserved between batches; we only need to re-emit state that is required
+    * to be in every batch.  Otherwise we need to re-emit all the state that
+    * would otherwise be stored in the context (which for all intents and
+    * purposes means everything).
+    */
+   if (!batch->hw_ctx)
+      brw->ctx.NewDriverState |= BRW_NEW_CONTEXT;
+
+   brw->ctx.NewDriverState |= BRW_NEW_BATCH;
+
+   brw->state_batch_count = 0;
+
+   brw->ib.type = -1; /* presumably an invalid type, forcing re-emit -- confirm */
+
+   /* We need to periodically reap the shader time results, because rollover
+    * happens every few seconds.  We also want to see results every once in a
+    * while, because many programs won't cleanly destroy our context, so the
+    * end-of-run printout may not happen.
+    */
+   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+      brw_collect_and_report_shader_time(brw);
+
+   if (INTEL_DEBUG & DEBUG_PERFMON)
+      brw_dump_perf_monitors(brw);
+
+   brw_perf_monitor_new_batch(brw);
+}
+
+/**
+ * Called before MI_BATCHBUFFER_END is emitted and the batch is sent off.
+ * NOTE(review): formerly from intel_batchbuffer_flush; confirm new caller.
+ *
+ * This function can emit state (say, to preserve registers that aren't saved
+ * between batches).  All of this state MUST fit in the reserved space at the
+ * end of the batchbuffer.  If you add more GPU state, increase the reserved
+ * space by updating the BATCH_RESERVED macro.
+ */
+void brw_finish_batch(struct brw_batch *batch)
+{
+   struct brw_context *brw = container_of(batch, struct brw_context, batch);
+
+   /* Capture the closing pipeline statistics register values necessary to
+    * support query objects (in the non-hardware context world).
+    */
+   brw_emit_query_end(brw); /* unconditional: not gated on batch->ring */
+
+   /* We may also need to snapshot and disable OA counters. */
+   if (batch->ring == RENDER_RING)
+      brw_perf_monitor_finish_batch(brw);
+}
+
+void
+brw_load_register_mem(struct brw_context *brw,
+		      uint32_t reg,
+		      struct brw_bo *bo,
+		      uint32_t read_domains, uint32_t write_domain,
+		      uint32_t offset)
+{
+   /* Emit MI_LOAD_REGISTER_MEM for reg from bo+offset; only exists on Gen7+. */
+   assert(brw->gen >= 7);
+
+   if (brw->gen >= 8) { /* Gen8+: BEGIN_BATCH(4) form, address via OUT_RELOC64 */
+      BEGIN_BATCH(4);
+      OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (4 - 2));
+      OUT_BATCH(reg);
+      OUT_RELOC64(bo, read_domains, write_domain, offset);
+      ADVANCE_BATCH();
+   } else { /* Gen7: BEGIN_BATCH(3) form, address via OUT_RELOC */
+      BEGIN_BATCH(3);
+      OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (3 - 2));
+      OUT_BATCH(reg);
+      OUT_RELOC(bo, read_domains, write_domain, offset);
+      ADVANCE_BATCH();
+   }
+}
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 54f0dee..c48a51b 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -50,7 +50,6 @@  extern "C" {
 #endif
 
 #include <drm.h>
-#include <intel_bufmgr.h>
 #include <i915_drm.h>
 #ifdef __cplusplus
 	#undef virtual
@@ -65,6 +64,8 @@  extern "C" {
 #include "intel_tex_obj.h"
 #include "intel_resolve_map.h"
 
+#include "brw_batch.h"
+
 /* Glossary:
  *
  * URB - uniform resource buffer.  A mid-sized buffer which is
@@ -775,7 +776,7 @@  struct brw_cache {
    struct brw_context *brw;
 
    struct brw_cache_item **items;
-   drm_intel_bo *bo;
+   struct brw_bo *bo;
    GLuint size, n_items;
 
    uint32_t next_offset;
@@ -821,7 +822,7 @@  enum shader_time_shader_type {
 
 struct brw_vertex_buffer {
    /** Buffer object containing the uploaded vertex data */
-   drm_intel_bo *bo;
+   struct brw_bo *bo;
    uint32_t offset;
    /** Byte stride between elements in the uploaded array */
    GLuint stride;
@@ -840,7 +841,7 @@  struct brw_query_object {
    struct gl_query_object Base;
 
    /** Last query BO associated with this query. */
-   drm_intel_bo *bo;
+   struct brw_bo *bo;
 
    /** Last index in bo with query data for this object. */
    int last_index;
@@ -853,35 +854,7 @@  struct intel_sync_object {
    struct gl_sync_object Base;
 
    /** Batch associated with this sync object */
-   drm_intel_bo *bo;
-};
-
-enum brw_gpu_ring {
-   UNKNOWN_RING,
-   RENDER_RING,
-   BLT_RING,
-};
-
-struct intel_batchbuffer {
-   /** Current batchbuffer being queued up. */
-   drm_intel_bo *bo;
-   /** Last BO submitted to the hardware.  Used for glFinish(). */
-   drm_intel_bo *last_bo;
-
-   uint16_t emit, total;
-   uint16_t used, reserved_space;
-   uint32_t *map;
-   uint32_t *cpu_map;
-#define BATCH_SZ (8192*sizeof(uint32_t))
-
-   uint32_t state_batch_offset;
-   enum brw_gpu_ring ring;
-   bool needs_sol_reset;
-
-   struct {
-      uint16_t used;
-      int reloc_count;
-   } saved;
+   struct brw_bo *bo;
 };
 
 #define BRW_MAX_XFB_STREAMS 4
@@ -890,7 +863,7 @@  struct brw_transform_feedback_object {
    struct gl_transform_feedback_object base;
 
    /** A buffer to hold SO_WRITE_OFFSET(n) values while paused. */
-   drm_intel_bo *offset_bo;
+   struct brw_bo *offset_bo;
 
    /** If true, SO_WRITE_OFFSET(n) should be reset to zero at next use. */
    bool zero_offsets;
@@ -903,7 +876,7 @@  struct brw_transform_feedback_object {
     *  @{
     */
    uint64_t prims_generated[BRW_MAX_XFB_STREAMS];
-   drm_intel_bo *prim_count_bo;
+   struct brw_bo *prim_count_bo;
    unsigned prim_count_buffer_index; /**< in number of uint64_t units */
    /** @} */
 
@@ -929,7 +902,7 @@  struct brw_stage_state
     * Optional scratch buffer used to store spilled register values and
     * variably-indexed GRF arrays.
     */
-   drm_intel_bo *scratch_bo;
+   struct brw_bo *scratch_bo;
 
    /** Offset in the program cache to the program */
    uint32_t prog_offset;
@@ -949,7 +922,6 @@  struct brw_stage_state
    uint32_t sampler_offset;
 };
 
-
 /**
  * brw_context is derived from gl_context.
  */
@@ -981,7 +953,7 @@  struct brw_context
                                          bool rw, bool for_gather);
       void (*emit_buffer_surface_state)(struct brw_context *brw,
                                         uint32_t *out_offset,
-                                        drm_intel_bo *bo,
+                                        struct brw_bo *bo,
                                         unsigned buffer_offset,
                                         unsigned surface_format,
                                         unsigned buffer_size,
@@ -1009,22 +981,13 @@  struct brw_context
 
    } vtbl;
 
-   dri_bufmgr *bufmgr;
-
-   drm_intel_context *hw_ctx;
+   struct brw_batch batch;
 
    /** BO for post-sync nonzero writes for gen6 workaround. */
-   drm_intel_bo *workaround_bo;
+   struct brw_bo *workaround_bo;
    uint8_t pipe_controls_since_last_cs_stall;
 
    /**
-    * Set of drm_intel_bo * that have been rendered to within this batchbuffer
-    * and would need flushing before being used from another cache domain that
-    * isn't coherent with it (i.e. the sampler).
-    */
-   struct set *render_cache;
-
-   /**
     * Number of resets observed in the system at context creation.
     *
     * This is tracked in the context so that we can determine that another
@@ -1032,11 +995,8 @@  struct brw_context
     */
    uint32_t reset_count;
 
-   struct intel_batchbuffer batch;
-   bool no_batch_wrap;
-
    struct {
-      drm_intel_bo *bo;
+      struct brw_bo *bo;
       uint32_t next_offset;
    } upload;
 
@@ -1048,23 +1008,6 @@  struct brw_context
     */
    bool front_buffer_dirty;
 
-   /** Framerate throttling: @{ */
-   drm_intel_bo *throttle_batch[2];
-
-   /* Limit the number of outstanding SwapBuffers by waiting for an earlier
-    * frame of rendering to complete. This gives a very precise cap to the
-    * latency between input and output such that rendering never gets more
-    * than a frame behind the user. (With the caveat that we technically are
-    * not using the SwapBuffers itself as a barrier but the first batch
-    * submitted afterwards, which may be immediately prior to the next
-    * SwapBuffers.)
-    */
-   bool need_swap_throttle;
-
-   /** General throttling, not caught by throttling between SwapBuffers */
-   bool need_flush_throttle;
-   /** @} */
-
    GLuint stats_wm;
 
    /**
@@ -1074,7 +1017,6 @@  struct brw_context
    bool no_rast;
    bool always_flush_batch;
    bool always_flush_cache;
-   bool disable_throttling;
    bool precompile;
 
    driOptionCache optionCache;
@@ -1150,7 +1092,7 @@  struct brw_context
        * Buffer and offset used for GL_ARB_shader_draw_parameters
        * (for now, only gl_BaseVertex).
        */
-      drm_intel_bo *draw_params_bo;
+      struct brw_bo *draw_params_bo;
       uint32_t draw_params_offset;
    } draw;
 
@@ -1190,7 +1132,7 @@  struct brw_context
       const struct _mesa_index_buffer *ib;
 
       /* Updates are signaled by BRW_NEW_INDEX_BUFFER. */
-      drm_intel_bo *bo;
+      struct brw_bo *bo;
       GLuint type;
 
       /* Offset to index buffer index to use in CMD_3D_PRIM so that we can
@@ -1273,7 +1215,7 @@  struct brw_context
        * Pointer to the (intel_upload.c-generated) BO containing the uniforms
        * for upload to the CURBE.
        */
-      drm_intel_bo *curbe_bo;
+      struct brw_bo *curbe_bo;
       /** Offset within curbe_bo of space for current curbe entry */
       GLuint curbe_offset;
    } curbe;
@@ -1363,7 +1305,7 @@  struct brw_context
        * Buffer object used in place of multisampled null render targets on
        * Gen6.  See brw_emit_null_surface_state().
        */
-      drm_intel_bo *multisampled_null_render_target_bo;
+      struct brw_bo *multisampled_null_render_target_bo;
       uint32_t fast_clear_op;
    } wm;
 
@@ -1391,7 +1333,7 @@  struct brw_context
        * A buffer object storing OA counter snapshots taken at the start and
        * end of each batch (creating "bookends" around the batch).
        */
-      drm_intel_bo *bookend_bo;
+      struct brw_bo *bookend_bo;
 
       /** The number of snapshots written to bookend_bo. */
       int bookend_snapshots;
@@ -1461,7 +1403,7 @@  struct brw_context
    int basevertex;
 
    struct {
-      drm_intel_bo *bo;
+      struct brw_bo *bo;
       const char **names;
       int *ids;
       enum shader_time_shader_type *types;
@@ -1477,6 +1419,14 @@  struct brw_context
    struct intel_screen *intelScreen;
 };
 
+static inline int brw_to_fd(struct brw_context *brw)
+{
+   return intel_screen_to_fd(brw->intelScreen); /* fd is looked up via the context's intelScreen */
+}
+
+void brw_start_batch(struct brw_batch *batch);
+void brw_finish_batch(struct brw_batch *batch);
+
 /*======================================================================
  * brw_vtbl.c
  */
@@ -1574,23 +1524,23 @@  void brw_emit_query_end(struct brw_context *brw);
 
 /** gen6_queryobj.c */
 void gen6_init_queryobj_functions(struct dd_function_table *functions);
-void brw_write_timestamp(struct brw_context *brw, drm_intel_bo *bo, int idx);
-void brw_write_depth_count(struct brw_context *brw, drm_intel_bo *bo, int idx);
+void brw_write_timestamp(struct brw_context *brw, struct brw_bo *bo, int idx);
+void brw_write_depth_count(struct brw_context *brw, struct brw_bo *bo, int idx);
 void brw_store_register_mem64(struct brw_context *brw,
-                              drm_intel_bo *bo, uint32_t reg, int idx);
+                              struct brw_bo *bo, uint32_t reg, int idx);
 
-/** intel_batchbuffer.c */
+/** brw_context.c */
 void brw_load_register_mem(struct brw_context *brw,
                            uint32_t reg,
-                           drm_intel_bo *bo,
+                           struct brw_bo *bo,
                            uint32_t read_domains, uint32_t write_domain,
                            uint32_t offset);
 
 /*======================================================================
  * brw_state_dump.c
  */
-void brw_debug_batch(struct brw_context *brw);
-void brw_annotate_aub(struct brw_context *brw);
+void brw_debug_batch(struct brw_batch *batch);
+void brw_annotate_batch(struct brw_batch *batch);
 
 /*======================================================================
  * brw_tex.c
@@ -1605,7 +1555,7 @@  void brwInitFragProgFuncs( struct dd_function_table *functions );
 
 int brw_get_scratch_size(int size);
 void brw_get_scratch_bo(struct brw_context *brw,
-			drm_intel_bo **scratch_bo, int size);
+			struct brw_bo **scratch_bo, int size);
 void brw_init_shader_time(struct brw_context *brw);
 int brw_get_shader_time_index(struct brw_context *brw,
                               struct gl_shader_program *shader_prog,
@@ -1665,7 +1615,7 @@  void brw_prepare_vertices(struct brw_context *brw);
 /* brw_wm_surface_state.c */
 void brw_init_surface_formats(struct brw_context *brw);
 void brw_create_constant_surface(struct brw_context *brw,
-                                 drm_intel_bo *bo,
+                                 struct brw_bo *bo,
                                  uint32_t offset,
                                  uint32_t size,
                                  uint32_t *out_offset,
@@ -1699,12 +1649,6 @@  void brw_dump_perf_monitors(struct brw_context *brw);
 void brw_perf_monitor_new_batch(struct brw_context *brw);
 void brw_perf_monitor_finish_batch(struct brw_context *brw);
 
-/* intel_buffer_objects.c */
-int brw_bo_map(struct brw_context *brw, drm_intel_bo *bo, int write_enable,
-               const char *bo_name);
-int brw_bo_map_gtt(struct brw_context *brw, drm_intel_bo *bo,
-                   const char *bo_name);
-
 /* intel_extensions.c */
 extern void intelInitExtensions(struct gl_context *ctx);
 
@@ -1863,13 +1807,9 @@  brw_program_reloc(struct brw_context *brw, uint32_t state_offset,
       return prog_offset;
    }
 
-   drm_intel_bo_emit_reloc(brw->batch.bo,
-			   state_offset,
-			   brw->cache.bo,
-			   prog_offset,
-			   I915_GEM_DOMAIN_INSTRUCTION, 0);
-
-   return brw->cache.bo->offset64 + prog_offset;
+   return brw_batch_reloc(&brw->batch, state_offset,
+			  brw->cache.bo, prog_offset,
+			  I915_GEM_DOMAIN_INSTRUCTION, 0);
 }
 
 bool brw_do_cubemap_normalize(struct exec_list *instructions);
@@ -1952,7 +1892,7 @@  void brw_fini_pipe_control(struct brw_context *brw);
 
 void brw_emit_pipe_control_flush(struct brw_context *brw, uint32_t flags);
 void brw_emit_pipe_control_write(struct brw_context *brw, uint32_t flags,
-                                 drm_intel_bo *bo, uint32_t offset,
+                                 struct brw_bo *bo, uint32_t offset,
                                  uint32_t imm_lower, uint32_t imm_upper);
 void brw_emit_mi_flush(struct brw_context *brw);
 void brw_emit_post_sync_nonzero_flush(struct brw_context *brw);
diff --git a/src/mesa/drivers/dri/i965/brw_curbe.c b/src/mesa/drivers/dri/i965/brw_curbe.c
index befd7a9..29b75bc 100644
--- a/src/mesa/drivers/dri/i965/brw_curbe.c
+++ b/src/mesa/drivers/dri/i965/brw_curbe.c
@@ -57,7 +57,6 @@ 
 #include "program/prog_parameter.h"
 #include "program/prog_print.h"
 #include "program/prog_statevars.h"
-#include "intel_batchbuffer.h"
 #include "intel_buffer_objects.h"
 #include "brw_context.h"
 #include "brw_defines.h"
diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c
index ac8daaf..29fbf50 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.c
+++ b/src/mesa/drivers/dri/i965/brw_draw.c
@@ -48,11 +48,11 @@ 
 #include "brw_state.h"
 #include "brw_vs.h"
 
-#include "intel_batchbuffer.h"
 #include "intel_buffers.h"
 #include "intel_fbo.h"
 #include "intel_mipmap_tree.h"
 #include "intel_buffer_objects.h"
+#include "intel_reg.h"
 
 #define FILE_DEBUG_FLAG DEBUG_PRIMS
 
@@ -220,9 +220,10 @@  static void brw_emit_prim(struct brw_context *brw,
    /* If indirect, emit a bunch of loads from the indirect BO. */
    if (prim->is_indirect) {
       struct gl_buffer_object *indirect_buffer = brw->ctx.DrawIndirectBuffer;
-      drm_intel_bo *bo = intel_bufferobj_buffer(brw,
-            intel_buffer_object(indirect_buffer),
-            prim->indirect_offset, 5 * sizeof(GLuint));
+      struct brw_bo *bo =
+	 intel_bufferobj_buffer(brw,
+				intel_buffer_object(indirect_buffer),
+				prim->indirect_offset, 5 * sizeof(GLuint));
 
       indirect_flag = GEN7_3DPRIM_INDIRECT_PARAMETER_ENABLE;
 
@@ -289,7 +290,7 @@  static void brw_merge_inputs( struct brw_context *brw,
    GLuint i;
 
    for (i = 0; i < brw->vb.nr_buffers; i++) {
-      drm_intel_bo_unreference(brw->vb.buffers[i].bo);
+      brw_bo_put(brw->vb.buffers[i].bo);
       brw->vb.buffers[i].bo = NULL;
    }
    brw->vb.nr_buffers = 0;
@@ -360,7 +361,6 @@  static void brw_postdraw_set_buffers_need_resolve(struct brw_context *brw)
    struct intel_renderbuffer *front_irb = NULL;
    struct intel_renderbuffer *back_irb = intel_get_renderbuffer(fb, BUFFER_BACK_LEFT);
    struct intel_renderbuffer *depth_irb = intel_get_renderbuffer(fb, BUFFER_DEPTH);
-   struct intel_renderbuffer *stencil_irb = intel_get_renderbuffer(fb, BUFFER_STENCIL);
    struct gl_renderbuffer_attachment *depth_att = &fb->Attachment[BUFFER_DEPTH];
 
    if (brw_is_front_buffer_drawing(fb))
@@ -372,20 +372,6 @@  static void brw_postdraw_set_buffers_need_resolve(struct brw_context *brw)
       back_irb->need_downsample = true;
    if (depth_irb && ctx->Depth.Mask) {
       intel_renderbuffer_att_set_needs_depth_resolve(depth_att);
-      brw_render_cache_set_add_bo(brw, depth_irb->mt->bo);
-   }
-
-   if (ctx->Extensions.ARB_stencil_texturing &&
-       stencil_irb && ctx->Stencil._WriteEnabled) {
-      brw_render_cache_set_add_bo(brw, stencil_irb->mt->bo);
-   }
-
-   for (int i = 0; i < fb->_NumColorDrawBuffers; i++) {
-      struct intel_renderbuffer *irb =
-         intel_renderbuffer(fb->_ColorDrawBuffers[i]);
-
-      if (irb)
-         brw_render_cache_set_add_bo(brw, irb->mt->bo);
    }
 }
 
@@ -403,7 +389,6 @@  static void brw_try_draw_prims( struct gl_context *ctx,
 {
    struct brw_context *brw = brw_context(ctx);
    GLuint i;
-   bool fail_next = false;
 
    if (ctx->NewState)
       _mesa_update_state( ctx );
@@ -450,6 +435,7 @@  static void brw_try_draw_prims( struct gl_context *ctx,
    for (i = 0; i < nr_prims; i++) {
       int estimated_max_prim_size;
       const int sampler_state_size = 16;
+      int ret;
 
       estimated_max_prim_size = 512; /* batchbuffer commands */
       estimated_max_prim_size += BRW_MAX_TEX_UNIT *
@@ -462,8 +448,9 @@  static void brw_try_draw_prims( struct gl_context *ctx,
        * we've got validated state that needs to be in the same batch as the
        * primitives.
        */
-      intel_batchbuffer_require_space(brw, estimated_max_prim_size, RENDER_RING);
-      intel_batchbuffer_save_state(brw);
+      ret = brw_batch_begin(&brw->batch, estimated_max_prim_size, RENDER_RING);
+      if (ret < 0)
+	 break;
 
       if (brw->num_instances != prims[i].num_instances ||
           brw->basevertex != prims[i].basevertex) {
@@ -478,13 +465,12 @@  static void brw_try_draw_prims( struct gl_context *ctx,
       brw->draw.gl_basevertex =
          prims[i].indexed ? prims[i].basevertex : prims[i].start;
 
-      drm_intel_bo_unreference(brw->draw.draw_params_bo);
+      brw_bo_put(brw->draw.draw_params_bo);
 
       if (prims[i].is_indirect) {
          /* Point draw_params_bo at the indirect buffer. */
          brw->draw.draw_params_bo =
-            intel_buffer_object(ctx->DrawIndirectBuffer)->buffer;
-         drm_intel_bo_reference(brw->draw.draw_params_bo);
+            brw_bo_get(intel_buffer_object(ctx->DrawIndirectBuffer)->buffer);
          brw->draw.draw_params_offset =
             prims[i].indirect_offset + (prims[i].indexed ? 12 : 8);
       } else {
@@ -500,35 +486,21 @@  static void brw_try_draw_prims( struct gl_context *ctx,
       else
 	 gen6_set_prim(brw, &prims[i]);
 
-retry:
-
       /* Note that before the loop, brw->ctx.NewDriverState was set to != 0, and
        * that the state updated in the loop outside of this block is that in
        * *_set_prim or intel_batchbuffer_flush(), which only impacts
        * brw->ctx.NewDriverState.
        */
       if (brw->ctx.NewDriverState) {
-	 brw->no_batch_wrap = true;
 	 brw_upload_render_state(brw);
       }
 
       brw_emit_prim(brw, &prims[i], brw->primitive);
 
-      brw->no_batch_wrap = false;
-
-      if (dri_bufmgr_check_aperture_space(&brw->batch.bo, 1)) {
-	 if (!fail_next) {
-	    intel_batchbuffer_reset_to_saved(brw);
-	    intel_batchbuffer_flush(brw);
-	    fail_next = true;
-	    goto retry;
-	 } else {
-            int ret = intel_batchbuffer_flush(brw);
-            WARN_ONCE(ret == -ENOSPC,
-                      "i965: Single primitive emit exceeded "
-                      "available aperture space\n");
-	 }
-      }
+      ret = brw_batch_end(&brw->batch);
+      WARN_ONCE(ret == -ENOSPC,
+		"i965: Single primitive emit exceeded "
+		"available aperture space\n");
 
       /* Now that we know we haven't run out of aperture space, we can safely
        * reset the dirty bits.
@@ -538,7 +510,7 @@  retry:
    }
 
    if (brw->always_flush_batch)
-      intel_batchbuffer_flush(brw);
+      brw_batch_flush(&brw->batch);
 
    brw_state_cache_check_size(brw);
    brw_postdraw_set_buffers_need_resolve(brw);
@@ -626,7 +598,7 @@  void brw_draw_destroy( struct brw_context *brw )
    int i;
 
    for (i = 0; i < brw->vb.nr_buffers; i++) {
-      drm_intel_bo_unreference(brw->vb.buffers[i].bo);
+      brw_bo_put(brw->vb.buffers[i].bo);
       brw->vb.buffers[i].bo = NULL;
    }
    brw->vb.nr_buffers = 0;
@@ -636,6 +608,6 @@  void brw_draw_destroy( struct brw_context *brw )
    }
    brw->vb.nr_enabled = 0;
 
-   drm_intel_bo_unreference(brw->ib.bo);
+   brw_bo_put(brw->ib.bo);
    brw->ib.bo = NULL;
 }
diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c
index 320e40e..bc2e8fa 100644
--- a/src/mesa/drivers/dri/i965/brw_draw_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c
@@ -37,7 +37,6 @@ 
 #include "brw_context.h"
 #include "brw_state.h"
 
-#include "intel_batchbuffer.h"
 #include "intel_buffer_objects.h"
 
 static GLuint double_types[5] = {
@@ -361,7 +360,7 @@  copy_array_to_vbo_array(struct brw_context *brw,
       intel_upload_data(brw, element->glarray->Ptr,
                         element->glarray->_ElementSize,
                         element->glarray->_ElementSize,
-			&buffer->bo, &buffer->offset);
+                        &buffer->bo, &buffer->offset);
 
       buffer->stride = 0;
       return;
@@ -480,9 +479,8 @@  brw_prepare_vertices(struct brw_context *brw)
                           glarray->_ElementSize);
                }
             }
-            buffer->bo = intel_bufferobj_buffer(brw, intel_buffer,
-                                                offset, size);
-            drm_intel_bo_reference(buffer->bo);
+            buffer->bo = brw_bo_get(intel_bufferobj_buffer(brw, intel_buffer,
+							   offset, size));
 
 	    input->buffer = j++;
 	    input->offset = 0;
@@ -596,7 +594,7 @@  brw_prepare_shader_draw_parameters(struct brw_context *brw)
    /* For non-indirect draws, upload gl_BaseVertex. */
    if (brw->vs.prog_data->uses_vertexid && brw->draw.draw_params_bo == NULL) {
       intel_upload_data(brw, &brw->draw.gl_basevertex, 4, 4,
-			&brw->draw.draw_params_bo,
+                        &brw->draw.draw_params_bo,
                         &brw->draw.draw_params_offset);
    }
 }
@@ -607,7 +605,7 @@  brw_prepare_shader_draw_parameters(struct brw_context *brw)
 static void
 emit_vertex_buffer_state(struct brw_context *brw,
                          unsigned buffer_nr,
-                         drm_intel_bo *bo,
+                         struct brw_bo *bo,
                          unsigned bo_ending_address,
                          unsigned bo_offset,
                          unsigned stride,
@@ -860,7 +858,7 @@  static void brw_upload_indices(struct brw_context *brw)
    struct gl_context *ctx = &brw->ctx;
    const struct _mesa_index_buffer *index_buffer = brw->ib.ib;
    GLuint ib_size;
-   drm_intel_bo *old_bo = brw->ib.bo;
+   struct brw_bo *old_bo = brw->ib.bo;
    struct gl_buffer_object *bufferobj;
    GLuint offset;
    GLuint ib_type_size;
@@ -878,7 +876,7 @@  static void brw_upload_indices(struct brw_context *brw)
       /* Get new bufferobj, offset:
        */
       intel_upload_data(brw, index_buffer->ptr, ib_size, ib_type_size,
-			&brw->ib.bo, &offset);
+                        &brw->ib.bo, &offset);
    } else {
       offset = (GLuint) (unsigned long) index_buffer->ptr;
 
@@ -901,13 +899,12 @@  static void brw_upload_indices(struct brw_context *brw)
 
          ctx->Driver.UnmapBuffer(ctx, bufferobj, MAP_INTERNAL);
       } else {
-         drm_intel_bo *bo =
+         struct brw_bo *bo =
             intel_bufferobj_buffer(brw, intel_buffer_object(bufferobj),
                                    offset, ib_size);
          if (bo != brw->ib.bo) {
-            drm_intel_bo_unreference(brw->ib.bo);
-            brw->ib.bo = bo;
-            drm_intel_bo_reference(bo);
+            brw_bo_put(brw->ib.bo);
+            brw->ib.bo = brw_bo_get(bo);
          }
       }
    }
diff --git a/src/mesa/drivers/dri/i965/brw_ff_gs.c b/src/mesa/drivers/dri/i965/brw_ff_gs.c
index f72f37f..bd452c3 100644
--- a/src/mesa/drivers/dri/i965/brw_ff_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_ff_gs.c
@@ -34,8 +34,6 @@ 
 #include "main/enums.h"
 #include "main/transformfeedback.h"
 
-#include "intel_batchbuffer.h"
-
 #include "brw_defines.h"
 #include "brw_context.h"
 #include "brw_eu.h"
diff --git a/src/mesa/drivers/dri/i965/brw_ff_gs_emit.c b/src/mesa/drivers/dri/i965/brw_ff_gs_emit.c
index 50bda61..56a29b4 100644
--- a/src/mesa/drivers/dri/i965/brw_ff_gs_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_ff_gs_emit.c
@@ -35,7 +35,6 @@ 
 #include "main/enums.h"
 
 #include "program/program.h"
-#include "intel_batchbuffer.h"
 
 #include "brw_defines.h"
 #include "brw_context.h"
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index ed86e0b..9c3a265 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -4126,8 +4126,7 @@  brw_wm_fs_emit(struct brw_context *brw,
    double start_time = 0;
 
    if (unlikely(brw->perf_debug)) {
-      start_busy = (brw->batch.last_bo &&
-                    drm_intel_bo_busy(brw->batch.last_bo));
+      start_busy = brw_batch_busy(&brw->batch);
       start_time = get_time();
    }
 
@@ -4206,7 +4205,7 @@  brw_wm_fs_emit(struct brw_context *brw,
          brw_wm_debug_recompile(brw, prog, key);
       shader->compiled_once = true;
 
-      if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
+      if (start_busy && !brw_batch_busy(&brw->batch)) {
          perf_debug("FS compile took %.03f ms and stalled the GPU\n",
                     (get_time() - start_time) * 1000);
       }
diff --git a/src/mesa/drivers/dri/i965/brw_list.h b/src/mesa/drivers/dri/i965/brw_list.h
new file mode 100644
index 0000000..26cb718
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_list.h
@@ -0,0 +1,353 @@ 
+/*
+ * Copyright © 2010-2012 Intel Corporation
+ * Copyright © 2010 Francisco Jerez <currojerez@riseup.net>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#ifndef _BRW_LIST_H_
+#define _BRW_LIST_H_
+
+#include <stdbool.h>
+
+/**
+ * @file Classic doubly-linked circular list implementation.
+ * For real usage examples of the linked list, see the file test/list.c
+ *
+ * Example:
+ * We need to keep a list of struct foo in the parent struct bar, i.e. what
+ * we want is something like this.
+ *
+ *     struct bar {
+ *          ...
+ *          struct foo *list_of_foos; -----> struct foo {}, struct foo {}, struct foo{}
+ *          ...
+ *     }
+ *
+ * We need one list head in bar and a list element in each struct foo (both are
+ * of data type 'struct list').
+ *
+ *     struct bar {
+ *          ...
+ *          struct list list_of_foos;
+ *          ...
+ *     }
+ *
+ *     struct foo {
+ *          ...
+ *          struct list entry;
+ *          ...
+ *     }
+ *
+ * Now we initialize the list head:
+ *
+ *     struct bar bar;
+ *     ...
+ *     list_init(&bar.list_of_foos);
+ *
+ * Then we create the first element and add it to this list:
+ *
+ *     struct foo *foo = malloc(...);
+ *     ....
+ *     list_add(&foo->entry, &bar.list_of_foos);
+ *
+ * Repeat the above for each element you want to add to the list. Deleting
+ * works with the element itself.
+ *      list_del(&foo->entry);
+ *      free(foo);
+ *
+ * Note: calling list_del(&bar.list_of_foos) will set bar.list_of_foos to an empty
+ * list again.
+ *
+ * Looping through the list requires a 'struct foo' as iterator and the
+ * name of the field the subnodes use.
+ *
+ * struct foo *iterator;
+ * list_for_each_entry(iterator, &bar.list_of_foos, entry) {
+ *      if (iterator->something == ...)
+ *             ...
+ * }
+ *
+ * Note: You must not call list_del() on the iterator if you continue the
+ * loop. You need to run the safe for-each loop instead:
+ *
+ * struct foo *iterator, *next;
+ * list_for_each_entry_safe(iterator, next, &bar.list_of_foos, entry) {
+ *      if (...)
+ *              list_del(&iterator->entry);
+ * }
+ *
+ */
+
+/**
+ * The linkage struct for list nodes. This struct must be part of your
+ * to-be-linked struct. struct list is required for both the head of the
+ * list and for each list node.
+ *
+ * Position and name of the struct list field is irrelevant.
+ * There are no requirements that elements of a list are of the same type.
+ * There are no requirements for a list head, any struct list can be a list
+ * head.
+ */
+struct list {
+    struct list *next, *prev; /* both point back at the node itself when empty */
+};
+
+/**
+ * Initialize the list as an empty list.
+ *
+ * Example:
+ * list_init(&bar->list_of_foos);
+ *
+ * @param list The list to initialize.
+ */
+static inline void
+list_init(struct list *list)
+{
+    list->next = list->prev = list;
+}
+
+static inline void
+__list_add(struct list *entry,
+	    struct list *prev,
+	    struct list *next)
+{
+    next->prev = entry; /* splice entry in between prev and next */
+    entry->next = next;
+    entry->prev = prev;
+    prev->next = entry;
+}
+
+/**
+ * Insert a new element after the given list head. The new element does not
+ * need to be initialised as empty list.
+ * The list changes from:
+ *      head -> some element -> ...
+ * to
+ *      head -> new element -> older element -> ...
+ *
+ * Example:
+ * struct foo *newfoo = malloc(...);
+ * list_add(&newfoo->entry, &bar->list_of_foos);
+ *
+ * @param entry The new element to prepend to the list.
+ * @param head The existing list.
+ */
+static inline void
+list_add(struct list *entry, struct list *head)
+{
+    __list_add(entry, head, head->next);
+}
+
+static inline void
+list_add_tail(struct list *entry, struct list *head)
+{
+    __list_add(entry, head->prev, head); /* insert just before head, i.e. as the last element */
+}
+
+static inline void list_replace(struct list *__old__,
+				struct list *__new__)
+{
+	__new__->next = __old__->next; /* __new__ adopts both neighbours of __old__ */
+	__new__->next->prev = __new__;
+	__new__->prev = __old__->prev;
+	__new__->prev->next = __new__; /* __old__'s own next/prev are not modified here */
+}
+
+#define list_last_entry(ptr, type, member) \
+    list_entry((ptr)->prev, type, member) /* same definition is repeated, with documentation, later in this header */
+
+#define list_for_each(pos, head)				\
+    for (pos = (head)->next; pos != (head); pos = pos->next) /* pos walks the raw struct list nodes, not their containers */
+
+/**
+ * Append a new element to the end of the list given with this list head.
+ *
+ * The list changes from:
+ *      head -> some element -> ... -> lastelement
+ * to
+ *      head -> some element -> ... -> lastelement -> new element
+ *
+ * Example:
+ * struct foo *newfoo = malloc(...);
+ * list_append(&newfoo->entry, &bar->list_of_foos);
+ *
+ * @param entry The new element to append to the list.
+ * @param head The existing list.
+ */
+static inline void
+list_append(struct list *entry, struct list *head)
+{
+    __list_add(entry, head->prev, head);
+}
+
+
+static inline void
+__list_del(struct list *prev, struct list *next)
+{
+	next->prev = prev; /* link prev and next directly to each other */
+	prev->next = next;
+}
+
+static inline void
+_list_del(struct list *entry)
+{
+    __list_del(entry->prev, entry->next); /* unlink only; entry's own next/prev are left untouched */
+}
+
+/**
+ * Remove the element from the list it is in. Using this function will reset
+ * the pointers to/from this element so it is removed from the list. It does
+ * NOT free the element itself or manipulate it otherwise.
+ *
+ * Using list_del on a pure list head (like in the example at the top of
+ * this file) will NOT remove the first element from
+ * the list but rather reset the list as empty list.
+ *
+ * Example:
+ * list_del(&foo->entry);
+ *
+ * @param entry The element to remove.
+ */
+static inline void
+list_del(struct list *entry)
+{
+    _list_del(entry);
+    list_init(entry); /* leave entry as a valid empty list pointing at itself */
+}
+
+static inline void list_move(struct list *list, struct list *head)
+{
+	if (list->prev != head) { /* skip when list is already the first element after head */
+		_list_del(list);
+		list_add(list, head);
+	}
+}
+
+static inline void list_move_tail(struct list *list, struct list *head)
+{
+	_list_del(list); /* unlink, then re-insert just before head */
+	list_add_tail(list, head);
+}
+
+/**
+ * Check if the list is empty.
+ *
+ * Example:
+ * list_is_empty(&bar->list_of_foos);
+ *
+ * @return True if the list contains no elements or False otherwise.
+ */
+static inline bool
+list_is_empty(const struct list *head)
+{
+    return head->next == head;
+}
+
+/**
+ * Alias of container_of (which this header defines further down).
+ */
+#define list_entry(ptr, type, member) \
+    container_of(ptr, type, member)
+
+/**
+ * Retrieve the first list entry for the given list pointer.
+ *
+ * Example:
+ * struct foo *first;
+ * first = list_first_entry(&bar->list_of_foos, struct foo, entry);
+ *
+ * @param ptr The list head
+ * @param type Data type of the list element to retrieve
+ * @param member Member name of the struct list field in the list element.
+ * @return A pointer to the first list element.
+ */
+#define list_first_entry(ptr, type, member) \
+    list_entry((ptr)->next, type, member)
+
+/**
+ * Retrieve the last list entry for the given list pointer.
+ *
+ * Example:
+ * struct foo *last;
+ * last = list_last_entry(&bar->list_of_foos, struct foo, entry);
+ *
+ * @param ptr The list head
+ * @param type Data type of the list element to retrieve
+ * @param member Member name of the struct list field in the list element.
+ * @return A pointer to the last list element.
+ */
+#define list_last_entry(ptr, type, member) \
+    list_entry((ptr)->prev, type, member)
+
+#define __container_of(ptr, sample, member)				\
+    (void *)((char *)(ptr) - ((char *)&(sample)->member - (char *)(sample))) /* member offset is derived from the 'sample' pointer; result is void * */
+/**
+ * Loop through the list given by head and set pos to struct in the list.
+ *
+ * Example:
+ * struct foo *iterator;
+ * list_for_each_entry(iterator, &bar->list_of_foos, entry) {
+ *      [modify iterator]
+ * }
+ *
+ * This macro is not safe for node deletion. Use list_for_each_entry_safe
+ * instead.
+ *
+ * @param pos Iterator variable of the type of the list elements.
+ * @param head List head
+ * @param member Member name of the struct list in the list elements.
+ * Iteration stops once &pos->member is the list head again.
+ */
+#define list_for_each_entry(pos, head, member)				\
+    for (pos = __container_of((head)->next, pos, member);		\
+	 &pos->member != (head);					\
+	 pos = __container_of(pos->member.next, pos, member))
+
+#define list_for_each_entry_reverse(pos, head, member)				\
+    for (pos = __container_of((head)->prev, pos, member);		\
+	 &pos->member != (head);					\
+	 pos = __container_of(pos->member.prev, pos, member)) /* follows ->prev, i.e. from the last element back to the first */
+
+/**
+ * Loop through the list, keeping a backup pointer to the element. This
+ * macro allows for the deletion of a list element while looping through the
+ * list.
+ *
+ * See list_for_each_entry for more details.
+ */
+#define list_for_each_entry_safe(pos, tmp, head, member)		\
+    for (pos = __container_of((head)->next, pos, member),		\
+	 tmp = __container_of(pos->member.next, pos, member);		\
+	 &pos->member != (head);					\
+	 pos = tmp, tmp = __container_of(pos->member.next, tmp, member)) /* tmp already holds the successor before the body runs */
+
+#undef container_of
+#define container_of(ptr, type, member) \
+	((type *)((char *)(ptr) - (char *) &((type *)0)->member)) /* member offset computed from a null 'type' pointer; overrides any earlier container_of */
+
+static inline int list_is_singular(const struct list *list)
+{
+	return list->next == list->prev; /* NOTE: also true for an empty list, where next == prev == list */
+}
+
+#endif /* _BRW_LIST_H_ */
+
diff --git a/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c b/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c
index d5cec0d..eb57481 100644
--- a/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c
+++ b/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c
@@ -49,7 +49,6 @@ 
 #include "brw_context.h"
 #include "brw_draw.h"
 #include "intel_fbo.h"
-#include "intel_batchbuffer.h"
 
 #include "brw_blorp.h"
 
diff --git a/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c b/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c
index 84cfc05..c2114b9 100644
--- a/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c
+++ b/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c
@@ -43,7 +43,6 @@ 
  */
 
 #include "brw_context.h"
-#include "intel_batchbuffer.h"
 #include "intel_fbo.h"
 
 #include "main/blit.h"
diff --git a/src/mesa/drivers/dri/i965/brw_meta_updownsample.c b/src/mesa/drivers/dri/i965/brw_meta_updownsample.c
index f39d50a..37a0968 100644
--- a/src/mesa/drivers/dri/i965/brw_meta_updownsample.c
+++ b/src/mesa/drivers/dri/i965/brw_meta_updownsample.c
@@ -22,7 +22,6 @@ 
  */
 
 #include "brw_context.h"
-#include "intel_batchbuffer.h"
 #include "intel_fbo.h"
 
 #include "main/blit.h"
diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
index 7357155..0fa4407 100644
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -31,9 +31,9 @@ 
 
 
 
-#include "intel_batchbuffer.h"
 #include "intel_fbo.h"
 #include "intel_mipmap_tree.h"
+#include "intel_reg.h"
 
 #include "brw_context.h"
 #include "brw_state.h"
@@ -540,10 +540,10 @@  brw_emit_depthbuffer(struct brw_context *brw)
       height = stencil_irb->Base.Base.Height;
    }
 
-   if (depth_mt)
-      brw_render_cache_set_check_flush(brw, depth_mt->bo);
-   if (stencil_mt)
-      brw_render_cache_set_check_flush(brw, stencil_mt->bo);
+   if (depth_mt && depth_mt->bo->dirty)
+      brw_emit_mi_flush(brw);
+   if (stencil_mt && stencil_mt->bo->dirty)
+      brw_emit_mi_flush(brw);
 
    brw->vtbl.emit_depth_stencil_hiz(brw, depth_mt, depth_offset,
                                     depthbuffer_format, depth_surface_type,
diff --git a/src/mesa/drivers/dri/i965/brw_object_purgeable.c b/src/mesa/drivers/dri/i965/brw_object_purgeable.c
index 20f66f2..3240ee3 100644
--- a/src/mesa/drivers/dri/i965/brw_object_purgeable.c
+++ b/src/mesa/drivers/dri/i965/brw_object_purgeable.c
@@ -38,12 +38,12 @@ 
 #include "intel_mipmap_tree.h"
 
 static GLenum
-intel_buffer_purgeable(drm_intel_bo *buffer)
+intel_buffer_purgeable(struct brw_bo *buffer)
 {
    int retained = 0;
 
    if (buffer != NULL)
-      retained = drm_intel_bo_madvise(buffer, I915_MADV_DONTNEED);
+      retained = brw_bo_madvise(buffer, I915_MADV_DONTNEED);
 
    return retained ? GL_VOLATILE_APPLE : GL_RELEASED_APPLE;
 }
@@ -101,13 +101,13 @@  intel_render_object_purgeable(struct gl_context * ctx,
 }
 
 static GLenum
-intel_buffer_unpurgeable(drm_intel_bo *buffer)
+intel_buffer_unpurgeable(struct brw_bo *buffer)
 {
    int retained;
 
    retained = 0;
    if (buffer != NULL)
-      retained = drm_intel_bo_madvise(buffer, I915_MADV_WILLNEED);
+      retained = brw_bo_madvise(buffer, I915_MADV_WILLNEED);
 
    return retained ? GL_RETAINED_APPLE : GL_UNDEFINED_APPLE;
 }
diff --git a/src/mesa/drivers/dri/i965/brw_performance_monitor.c b/src/mesa/drivers/dri/i965/brw_performance_monitor.c
index 0a12375..91f8e63 100644
--- a/src/mesa/drivers/dri/i965/brw_performance_monitor.c
+++ b/src/mesa/drivers/dri/i965/brw_performance_monitor.c
@@ -54,7 +54,8 @@ 
 
 #include "brw_context.h"
 #include "brw_defines.h"
-#include "intel_batchbuffer.h"
+
+#include "intel_reg.h"
 
 #define FILE_DEBUG_FLAG DEBUG_PERFMON
 
@@ -69,7 +70,7 @@  struct brw_perf_monitor_object
    /**
     * BO containing OA counter snapshots at monitor Begin/End time.
     */
-   drm_intel_bo *oa_bo;
+   struct brw_bo *oa_bo;
 
    /** Indexes into bookend_bo (snapshot numbers) for various segments. */
    int oa_head_end;
@@ -90,7 +91,7 @@  struct brw_perf_monitor_object
     * BO containing starting and ending snapshots for any active pipeline
     * statistics counters.
     */
-   drm_intel_bo *pipeline_stats_bo;
+   struct brw_bo *pipeline_stats_bo;
 
    /**
     * Storage for final pipeline statistics counter results.
@@ -615,15 +616,13 @@  gather_statistics_results(struct brw_context *brw,
       return;
    }
 
-   drm_intel_bo_map(monitor->pipeline_stats_bo, false);
-   uint64_t *start = monitor->pipeline_stats_bo->virtual;
+   uint64_t *start = brw_bo_map(monitor->pipeline_stats_bo, MAP_READ);
    uint64_t *end = start + (SECOND_SNAPSHOT_OFFSET_IN_BYTES / sizeof(uint64_t));
 
    for (int i = 0; i < num_counters; i++) {
       monitor->pipeline_stats_results[i] = end[i] - start[i];
    }
-   drm_intel_bo_unmap(monitor->pipeline_stats_bo);
-   drm_intel_bo_unreference(monitor->pipeline_stats_bo);
+   brw_bo_put(monitor->pipeline_stats_bo);
    monitor->pipeline_stats_bo = NULL;
 }
 
@@ -701,16 +700,16 @@  stop_oa_counters(struct brw_context *brw)
  */
 static void
 emit_mi_report_perf_count(struct brw_context *brw,
-                          drm_intel_bo *bo,
+                          struct brw_bo *bo,
                           uint32_t offset_in_bytes,
                           uint32_t report_id)
 {
    assert(offset_in_bytes % 64 == 0);
 
    /* Make sure the commands to take a snapshot fits in a single batch. */
-   intel_batchbuffer_require_space(brw, MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4,
-                                   RENDER_RING);
-   int batch_used = brw->batch.used;
+   brw_batch_begin(&brw->batch,
+		   MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4,
+		   RENDER_RING);
 
    /* Reports apparently don't always get written unless we flush first. */
    brw_emit_mi_flush(brw);
@@ -752,9 +751,7 @@  emit_mi_report_perf_count(struct brw_context *brw,
 
    /* Reports apparently don't always get written unless we flush after. */
    brw_emit_mi_flush(brw);
-
-   (void) batch_used;
-   assert(brw->batch.used - batch_used <= MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4);
+   brw_batch_end(&brw->batch);
 }
 
 /**
@@ -892,8 +889,7 @@  gather_oa_results(struct brw_context *brw,
    struct gl_perf_monitor_object *m = &monitor->base;
    assert(monitor->oa_bo != NULL);
 
-   drm_intel_bo_map(monitor->oa_bo, false);
-   uint32_t *monitor_buffer = monitor->oa_bo->virtual;
+   uint32_t *monitor_buffer = brw_bo_map(monitor->oa_bo, MAP_READ);
 
    /* If monitoring was entirely contained within a single batch, then the
     * bookend BO is irrelevant.  Just subtract monitor->bo's two snapshots.
@@ -903,7 +899,6 @@  gather_oa_results(struct brw_context *brw,
                  monitor_buffer,
                  monitor_buffer + (SECOND_SNAPSHOT_OFFSET_IN_BYTES /
                                    sizeof(uint32_t)));
-      drm_intel_bo_unmap(monitor->oa_bo);
       return;
    }
 
@@ -950,13 +945,11 @@  gather_oa_results(struct brw_context *brw,
                                    sizeof(uint32_t)));
    }
 
-   drm_intel_bo_unmap(monitor->oa_bo);
-
    /* If the monitor has ended, then we've gathered all the results, and
     * can free the monitor's OA BO.
     */
    if (m->Ended) {
-      drm_intel_bo_unreference(monitor->oa_bo);
+      brw_bo_put(monitor->oa_bo);
       monitor->oa_bo = NULL;
 
       /* The monitor's OA result is now resolved. */
@@ -989,8 +982,7 @@  wrap_bookend_bo(struct brw_context *brw)
     */
    assert(brw->perfmon.oa_users > 0);
 
-   drm_intel_bo_map(brw->perfmon.bookend_bo, false);
-   uint32_t *bookend_buffer = brw->perfmon.bookend_bo->virtual;
+   uint32_t *bookend_buffer = brw_bo_map(brw->perfmon.bookend_bo, MAP_READ);
    for (int i = 0; i < brw->perfmon.unresolved_elements; i++) {
       struct brw_perf_monitor_object *monitor = brw->perfmon.unresolved[i];
       struct gl_perf_monitor_object *m = &monitor->base;
@@ -1011,7 +1003,6 @@  wrap_bookend_bo(struct brw_context *brw)
          assert(monitor->oa_tail_start == -1);
       }
    }
-   drm_intel_bo_unmap(brw->perfmon.bookend_bo);
 
    brw->perfmon.bookend_snapshots = 0;
 }
@@ -1060,7 +1051,7 @@  reinitialize_perf_monitor(struct brw_context *brw,
                           struct brw_perf_monitor_object *monitor)
 {
    if (monitor->oa_bo) {
-      drm_intel_bo_unreference(monitor->oa_bo);
+      brw_bo_put(monitor->oa_bo);
       monitor->oa_bo = NULL;
    }
 
@@ -1077,7 +1068,7 @@  reinitialize_perf_monitor(struct brw_context *brw,
    monitor->oa_results = NULL;
 
    if (monitor->pipeline_stats_bo) {
-      drm_intel_bo_unreference(monitor->pipeline_stats_bo);
+      brw_bo_put(monitor->pipeline_stats_bo);
       monitor->pipeline_stats_bo = NULL;
    }
 
@@ -1105,18 +1096,16 @@  brw_begin_perf_monitor(struct gl_context *ctx,
        * wasting memory for contexts that don't use performance monitors.
        */
       if (!brw->perfmon.bookend_bo) {
-         brw->perfmon.bookend_bo = drm_intel_bo_alloc(brw->bufmgr,
-                                                      "OA bookend BO",
-                                                      BOOKEND_BO_SIZE_BYTES, 64);
+	 brw->perfmon.bookend_bo = brw_bo_create(&brw->batch,
+						 "OA bookend BO",
+						 BOOKEND_BO_SIZE_BYTES, 0);
       }
 
       monitor->oa_bo =
-         drm_intel_bo_alloc(brw->bufmgr, "perf. monitor OA bo", 4096, 64);
+         brw_bo_create(&brw->batch, "perf. monitor OA bo", 4096, 0);
 #ifdef DEBUG
       /* Pre-filling the BO helps debug whether writes landed. */
-      drm_intel_bo_map(monitor->oa_bo, true);
-      memset((char *) monitor->oa_bo->virtual, 0xff, 4096);
-      drm_intel_bo_unmap(monitor->oa_bo);
+      memset(brw_bo_map(monitor->oa_bo, MAP_WRITE), 0xff, 4096);
 #endif
 
       /* Allocate storage for accumulated OA counter values. */
@@ -1126,8 +1115,6 @@  brw_begin_perf_monitor(struct gl_context *ctx,
       /* If the OA counters aren't already on, enable them. */
       if (brw->perfmon.oa_users == 0) {
          /* Ensure the OACONTROL enable and snapshot land in the same batch. */
-         int space = (MI_REPORT_PERF_COUNT_BATCH_DWORDS + 3) * 4;
-         intel_batchbuffer_require_space(brw, space, RENDER_RING);
          start_oa_counters(brw);
       }
 
@@ -1146,7 +1133,7 @@  brw_begin_perf_monitor(struct gl_context *ctx,
 
    if (monitor_needs_statistics_registers(brw, m)) {
       monitor->pipeline_stats_bo =
-         drm_intel_bo_alloc(brw->bufmgr, "perf. monitor stats bo", 4096, 64);
+         brw_bo_create(&brw->batch, "perf. monitor stats bo", 4096, 0);
 
       /* Take starting snapshots. */
       snapshot_statistics_registers(brw, monitor, 0);
@@ -1238,15 +1225,11 @@  brw_is_perf_monitor_result_available(struct gl_context *ctx,
    bool stats_available = true;
 
    if (monitor_needs_oa(brw, m)) {
-      oa_available = !monitor->oa_bo ||
-         (!drm_intel_bo_references(brw->batch.bo, monitor->oa_bo) &&
-          !drm_intel_bo_busy(monitor->oa_bo));
+      oa_available = !brw_bo_busy(monitor->oa_bo, BUSY_READ);
    }
 
    if (monitor_needs_statistics_registers(brw, m)) {
-      stats_available = !monitor->pipeline_stats_bo ||
-         (!drm_intel_bo_references(brw->batch.bo, monitor->pipeline_stats_bo) &&
-          !drm_intel_bo_busy(monitor->pipeline_stats_bo));
+      stats_available = !brw_bo_busy(monitor->pipeline_stats_bo, BUSY_READ);
    }
 
    return oa_available && stats_available;
@@ -1293,11 +1276,9 @@  brw_get_perf_monitor_result(struct gl_context *ctx,
           * Using an unsynchronized mapping avoids stalling for an
           * indeterminate amount of time.
           */
-         drm_intel_gem_bo_map_unsynchronized(brw->perfmon.bookend_bo);
-
-         gather_oa_results(brw, monitor, brw->perfmon.bookend_bo->virtual);
-
-         drm_intel_bo_unmap(brw->perfmon.bookend_bo);
+         gather_oa_results(brw, monitor,
+			   brw_bo_map(brw->perfmon.bookend_bo,
+				      MAP_READ | MAP_ASYNC));
       }
 
       for (int i = 0; i < brw->perfmon.entries_per_oa_snapshot; i++) {
@@ -1386,7 +1367,6 @@  void
 brw_perf_monitor_new_batch(struct brw_context *brw)
 {
    assert(brw->batch.ring == RENDER_RING);
-   assert(brw->gen < 6 || brw->batch.used == 0);
 
    if (brw->perfmon.oa_users == 0)
       return;
diff --git a/src/mesa/drivers/dri/i965/brw_pipe_control.c b/src/mesa/drivers/dri/i965/brw_pipe_control.c
index 37080bf..0aa6f2e 100644
--- a/src/mesa/drivers/dri/i965/brw_pipe_control.c
+++ b/src/mesa/drivers/dri/i965/brw_pipe_control.c
@@ -1,32 +1,5 @@ 
-/**************************************************************************
- *
- * Copyright 2006 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
 #include "brw_context.h"
-#include "intel_batchbuffer.h"
+
 #include "intel_fbo.h"
 #include "intel_reg.h"
 
@@ -139,7 +112,7 @@  brw_emit_pipe_control_flush(struct brw_context *brw, uint32_t flags)
  */
 void
 brw_emit_pipe_control_write(struct brw_context *brw, uint32_t flags,
-                            drm_intel_bo *bo, uint32_t offset,
+                            struct brw_bo *bo, uint32_t offset,
                             uint32_t imm_lower, uint32_t imm_upper)
 {
    if (brw->gen >= 8) {
@@ -293,6 +266,9 @@  brw_emit_post_sync_nonzero_flush(struct brw_context *brw)
 void
 brw_emit_mi_flush(struct brw_context *brw)
 {
+   if (brw->batch.emit.nbatch == 0)
+      return;
+
    if (brw->batch.ring == BLT_RING && brw->gen >= 6) {
       BEGIN_BATCH_BLT(4);
       OUT_BATCH(MI_FLUSH_DW);
@@ -331,7 +307,7 @@  brw_emit_mi_flush(struct brw_context *brw)
       brw_emit_pipe_control_flush(brw, flags);
    }
 
-   brw_render_cache_set_clear(brw);
+   brw_batch_clear_dirty(&brw->batch);
 }
 
 int
@@ -345,9 +321,9 @@  brw_init_pipe_control(struct brw_context *brw,
     * the gen6 workaround because it involves actually writing to
     * the buffer, and the kernel doesn't let us write to the batch.
     */
-   brw->workaround_bo = drm_intel_bo_alloc(brw->bufmgr,
-                                           "pipe_control workaround",
-                                           4096, 4096);
+   brw->workaround_bo = brw_bo_create(&brw->batch,
+				      "pipe_control workaround",
+				      4096, 4096);
    if (brw->workaround_bo == NULL)
       return -ENOMEM;
 
@@ -359,5 +335,5 @@  brw_init_pipe_control(struct brw_context *brw,
 void
 brw_fini_pipe_control(struct brw_context *brw)
 {
-   drm_intel_bo_unreference(brw->workaround_bo);
+   brw_bo_put(brw->workaround_bo);
 }
diff --git a/src/mesa/drivers/dri/i965/brw_primitive_restart.c b/src/mesa/drivers/dri/i965/brw_primitive_restart.c
index 2c7a7e8..163d8a2 100644
--- a/src/mesa/drivers/dri/i965/brw_primitive_restart.c
+++ b/src/mesa/drivers/dri/i965/brw_primitive_restart.c
@@ -33,8 +33,6 @@ 
 #include "brw_defines.h"
 #include "brw_draw.h"
 
-#include "intel_batchbuffer.h"
-
 /**
  * Check if the hardware's cut index support can handle the primitive
  * restart index value (pre-Haswell only).
diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c
index 81a0c19..5c1bd8a 100644
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -45,7 +45,8 @@ 
 #include "brw_shader.h"
 #include "brw_nir.h"
 #include "brw_wm.h"
-#include "intel_batchbuffer.h"
+
+#include "intel_reg.h"
 
 static unsigned
 get_new_program_id(struct intel_screen *screen)
@@ -259,17 +260,17 @@  brw_get_scratch_size(int size)
 
 void
 brw_get_scratch_bo(struct brw_context *brw,
-		   drm_intel_bo **scratch_bo, int size)
+		   struct brw_bo **scratch_bo, int size)
 {
-   drm_intel_bo *old_bo = *scratch_bo;
+   struct brw_bo *old_bo = *scratch_bo;
 
    if (old_bo && old_bo->size < size) {
-      drm_intel_bo_unreference(old_bo);
+      brw_bo_put(old_bo);
       old_bo = NULL;
    }
 
    if (!old_bo) {
-      *scratch_bo = drm_intel_bo_alloc(brw->bufmgr, "scratch bo", size, 4096);
+      *scratch_bo = brw_bo_create(&brw->batch, "scratch bo", size, 4096);
    }
 }
 
@@ -291,9 +292,9 @@  void
 brw_init_shader_time(struct brw_context *brw)
 {
    const int max_entries = 4096;
-   brw->shader_time.bo = drm_intel_bo_alloc(brw->bufmgr, "shader time",
-                                            max_entries * SHADER_TIME_STRIDE,
-                                            4096);
+   brw->shader_time.bo = brw_bo_create(&brw->batch, "shader time",
+				       max_entries * SHADER_TIME_STRIDE,
+				       4096);
    brw->shader_time.names = rzalloc_array(brw, const char *, max_entries);
    brw->shader_time.ids = rzalloc_array(brw, int, max_entries);
    brw->shader_time.types = rzalloc_array(brw, enum shader_time_shader_type,
@@ -481,9 +482,7 @@  brw_collect_shader_time(struct brw_context *brw)
     * delaying reading the reports, but it doesn't look like it's a big
     * overhead compared to the cost of tracking the time in the first place.
     */
-   drm_intel_bo_map(brw->shader_time.bo, true);
-
-   uint32_t *times = brw->shader_time.bo->virtual;
+   uint32_t *times = brw_bo_map(brw->shader_time.bo, MAP_WRITE);
 
    for (int i = 0; i < brw->shader_time.num_entries; i++) {
       brw->shader_time.cumulative[i] += times[i * SHADER_TIME_STRIDE / 4];
@@ -491,8 +490,7 @@  brw_collect_shader_time(struct brw_context *brw)
 
    /* Zero the BO out to clear it out for our next collection.
     */
-   memset(times, 0, brw->shader_time.bo->size);
-   drm_intel_bo_unmap(brw->shader_time.bo);
+   memset(times, 0, brw->shader_time.bo->size); /* entries are strided */
 }
 
 void
@@ -545,7 +543,7 @@  brw_get_shader_time_index(struct brw_context *brw,
 void
 brw_destroy_shader_time(struct brw_context *brw)
 {
-   drm_intel_bo_unreference(brw->shader_time.bo);
+   brw_bo_put(brw->shader_time.bo);
    brw->shader_time.bo = NULL;
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_queryobj.c b/src/mesa/drivers/dri/i965/brw_queryobj.c
index 667c900..ec6067f 100644
--- a/src/mesa/drivers/dri/i965/brw_queryobj.c
+++ b/src/mesa/drivers/dri/i965/brw_queryobj.c
@@ -40,14 +40,13 @@ 
 #include "brw_context.h"
 #include "brw_defines.h"
 #include "brw_state.h"
-#include "intel_batchbuffer.h"
 #include "intel_reg.h"
 
 /**
  * Emit PIPE_CONTROLs to write the current GPU timestamp into a buffer.
  */
 void
-brw_write_timestamp(struct brw_context *brw, drm_intel_bo *query_bo, int idx)
+brw_write_timestamp(struct brw_context *brw, struct brw_bo *query_bo, int idx)
 {
    if (brw->gen == 6) {
       /* Emit Sandybridge workaround flush: */
@@ -64,7 +63,7 @@  brw_write_timestamp(struct brw_context *brw, drm_intel_bo *query_bo, int idx)
  * Emit PIPE_CONTROLs to write the PS_DEPTH_COUNT register into a buffer.
  */
 void
-brw_write_depth_count(struct brw_context *brw, drm_intel_bo *query_bo, int idx)
+brw_write_depth_count(struct brw_context *brw, struct brw_bo *query_bo, int idx)
 {
    brw_emit_pipe_control_write(brw,
                                PIPE_CONTROL_WRITE_DEPTH_COUNT
@@ -93,17 +92,9 @@  brw_queryobj_get_results(struct gl_context *ctx,
     * still contributing to it, flush it now so the results will be present
     * when mapped.
     */
-   if (drm_intel_bo_references(brw->batch.bo, query->bo))
-      intel_batchbuffer_flush(brw);
+   brw_bo_flush(query->bo);
 
-   if (unlikely(brw->perf_debug)) {
-      if (drm_intel_bo_busy(query->bo)) {
-         perf_debug("Stalling on the GPU waiting for a query object.\n");
-      }
-   }
-
-   drm_intel_bo_map(query->bo, false);
-   results = query->bo->virtual;
+   results = brw_bo_map(query->bo, MAP_READ);
    switch (query->Base.Target) {
    case GL_TIME_ELAPSED_EXT:
       /* The query BO contains the starting and ending timestamps.
@@ -149,12 +140,11 @@  brw_queryobj_get_results(struct gl_context *ctx,
    default:
       unreachable("Unrecognized query target in brw_queryobj_get_results()");
    }
-   drm_intel_bo_unmap(query->bo);
 
    /* Now that we've processed the data stored in the query's buffer object,
     * we can release it.
     */
-   drm_intel_bo_unreference(query->bo);
+   brw_bo_put(query->bo);
    query->bo = NULL;
 }
 
@@ -186,7 +176,7 @@  brw_delete_query(struct gl_context *ctx, struct gl_query_object *q)
 {
    struct brw_query_object *query = (struct brw_query_object *)q;
 
-   drm_intel_bo_unreference(query->bo);
+   brw_bo_put(query->bo);
    free(query);
 }
 
@@ -225,8 +215,8 @@  brw_begin_query(struct gl_context *ctx, struct gl_query_object *q)
        * obtain the time elapsed.  Notably, this includes time elapsed while
        * the system was doing other work, such as running other applications.
        */
-      drm_intel_bo_unreference(query->bo);
-      query->bo = drm_intel_bo_alloc(brw->bufmgr, "timer query", 4096, 4096);
+      brw_bo_put(query->bo);
+      query->bo = brw_bo_create(&brw->batch, "timer query", 4096, 4096);
       brw_write_timestamp(brw, query->bo, 0);
       break;
 
@@ -240,7 +230,7 @@  brw_begin_query(struct gl_context *ctx, struct gl_query_object *q)
        * Since we're starting a new query, we need to be sure to throw away
        * any previous occlusion query results.
        */
-      drm_intel_bo_unreference(query->bo);
+      brw_bo_put(query->bo);
       query->bo = NULL;
       query->last_index = -1;
 
@@ -352,10 +342,7 @@  static void brw_check_query(struct gl_context *ctx, struct gl_query_object *q)
     *      not ready yet on the first time it is queried.  This ensures that
     *      the async query will return true in finite time.
     */
-   if (query->bo && drm_intel_bo_references(brw->batch.bo, query->bo))
-      intel_batchbuffer_flush(brw);
-
-   if (query->bo == NULL || !drm_intel_bo_busy(query->bo)) {
+   if (!query->bo || !brw_bo_busy(query->bo, BUSY_READ | BUSY_FLUSH)) {
       brw_queryobj_get_results(ctx, query);
       query->Base.Ready = true;
    }
@@ -384,7 +371,7 @@  ensure_bo_has_space(struct gl_context *ctx, struct brw_query_object *query)
          brw_queryobj_get_results(ctx, query);
       }
 
-      query->bo = drm_intel_bo_alloc(brw->bufmgr, "query", 4096, 1);
+      query->bo = brw_bo_create(&brw->batch, "query", 4096, 0);
       query->last_index = 0;
    }
 }
@@ -415,7 +402,7 @@  brw_emit_query_begin(struct brw_context *brw)
    struct gl_context *ctx = &brw->ctx;
    struct brw_query_object *query = brw->query.obj;
 
-   if (brw->hw_ctx)
+   if (brw->batch.hw_ctx)
       return;
 
    /* Skip if we're not doing any queries, or we've already recorded the
@@ -442,7 +429,7 @@  brw_emit_query_end(struct brw_context *brw)
 {
    struct brw_query_object *query = brw->query.obj;
 
-   if (brw->hw_ctx)
+   if (brw->batch.hw_ctx)
       return;
 
    if (!brw->query.begin_emitted)
@@ -469,8 +456,8 @@  brw_query_counter(struct gl_context *ctx, struct gl_query_object *q)
 
    assert(q->Target == GL_TIMESTAMP);
 
-   drm_intel_bo_unreference(query->bo);
-   query->bo = drm_intel_bo_alloc(brw->bufmgr, "timestamp query", 4096, 4096);
+   brw_bo_put(query->bo);
+   query->bo = brw_bo_create(&brw->batch, "timestamp query", 4096, 0);
    brw_write_timestamp(brw, query->bo, 0);
 
    query->flushed = false;
@@ -485,9 +472,9 @@  static uint64_t
 brw_get_timestamp(struct gl_context *ctx)
 {
    struct brw_context *brw = brw_context(ctx);
-   uint64_t result = 0;
+   uint64_t result;
 
-   drm_intel_reg_read(brw->bufmgr, TIMESTAMP, &result);
+   result = brw_batch_read_register(&brw->batch, TIMESTAMP);
 
    /* See logic in brw_queryobj_get_results() */
    result = result >> 32;
diff --git a/src/mesa/drivers/dri/i965/brw_reset.c b/src/mesa/drivers/dri/i965/brw_reset.c
index e3182b1..5a38af1 100644
--- a/src/mesa/drivers/dri/i965/brw_reset.c
+++ b/src/mesa/drivers/dri/i965/brw_reset.c
@@ -22,6 +22,14 @@ 
  */
 #include "brw_context.h"
 
+static int brw_get_reset_stats(struct brw_context *brw,
+			       uint32_t *reset_count,
+			       uint32_t *active,
+			       uint32_t *pending)
+{
+	return -1; /* stub: always fails; out-params are left untouched */
+}
+
 /**
  * Query information about GPU resets observed by this context
  *
@@ -40,7 +48,7 @@  brw_get_graphics_reset_status(struct gl_context *ctx)
     * DRM_IOCTL_I915_GET_RESET_STATS is not supported), this function should
     * not be accessible.
     */
-   assert(brw->hw_ctx != NULL);
+   assert(brw->batch.hw_ctx);
 
    /* A reset status other than NO_ERROR was returned last time. I915 returns
     * nonzero active/pending only if reset has been encountered and completed.
@@ -49,8 +57,7 @@  brw_get_graphics_reset_status(struct gl_context *ctx)
    if (brw->reset_count != 0)
       return GL_NO_ERROR;
 
-   err = drm_intel_get_reset_stats(brw->hw_ctx, &reset_count, &active,
-                                   &pending);
+   err = brw_get_reset_stats(brw, &reset_count, &active, &pending);
    if (err)
       return GL_NO_ERROR;
 
diff --git a/src/mesa/drivers/dri/i965/brw_sampler_state.c b/src/mesa/drivers/dri/i965/brw_sampler_state.c
index 22ccbfe..a56356b 100644
--- a/src/mesa/drivers/dri/i965/brw_sampler_state.c
+++ b/src/mesa/drivers/dri/i965/brw_sampler_state.c
@@ -39,7 +39,6 @@ 
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
-#include "intel_batchbuffer.h"
 #include "intel_mipmap_tree.h"
 
 #include "main/macros.h"
@@ -99,14 +98,13 @@  brw_emit_sampler_state(struct brw_context *brw,
            SET_FIELD(mag_filter, BRW_SAMPLER_MAG_FILTER) |
            SET_FIELD(min_filter, BRW_SAMPLER_MIN_FILTER);
 
-   ss[2] = border_color_offset;
    if (brw->gen < 6) {
-      ss[2] += brw->batch.bo->offset64; /* reloc */
-      drm_intel_bo_emit_reloc(brw->batch.bo,
+      ss[2] = brw_batch_reloc(&brw->batch,
                               batch_offset_for_sampler_state + 8,
                               brw->batch.bo, border_color_offset,
                               I915_GEM_DOMAIN_SAMPLER, 0);
-   }
+   } else
+      ss[2] = border_color_offset;
 
    ss[3] = SET_FIELD(max_anisotropy, BRW_SAMPLER_MAX_ANISOTROPY) |
            SET_FIELD(address_rounding, BRW_SAMPLER_ADDRESS_ROUNDING);
diff --git a/src/mesa/drivers/dri/i965/brw_sf.c b/src/mesa/drivers/dri/i965/brw_sf.c
index 872464c..52deb57 100644
--- a/src/mesa/drivers/dri/i965/brw_sf.c
+++ b/src/mesa/drivers/dri/i965/brw_sf.c
@@ -36,8 +36,6 @@ 
 #include "main/enums.h"
 #include "main/fbobject.h"
 
-#include "intel_batchbuffer.h"
-
 #include "brw_defines.h"
 #include "brw_context.h"
 #include "brw_eu.h"
diff --git a/src/mesa/drivers/dri/i965/brw_sf_emit.c b/src/mesa/drivers/dri/i965/brw_sf_emit.c
index b3ee5c1..28e2e56 100644
--- a/src/mesa/drivers/dri/i965/brw_sf_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_sf_emit.c
@@ -34,8 +34,6 @@ 
 #include "main/macros.h"
 #include "main/enums.h"
 
-#include "intel_batchbuffer.h"
-
 #include "brw_defines.h"
 #include "brw_context.h"
 #include "brw_eu.h"
diff --git a/src/mesa/drivers/dri/i965/brw_sf_state.c b/src/mesa/drivers/dri/i965/brw_sf_state.c
index 014b434..d9d8abc 100644
--- a/src/mesa/drivers/dri/i965/brw_sf_state.c
+++ b/src/mesa/drivers/dri/i965/brw_sf_state.c
@@ -127,7 +127,6 @@  static void upload_sf_unit( struct brw_context *brw )
 {
    struct gl_context *ctx = &brw->ctx;
    struct brw_sf_unit_state *sf;
-   drm_intel_bo *bo = brw->batch.bo;
    int chipset_max_threads;
    bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
 
@@ -173,9 +172,6 @@  static void upload_sf_unit( struct brw_context *brw )
       sf->thread4.stats_enable = 1;
 
    /* BRW_NEW_SF_VP */
-   sf->sf5.sf_viewport_state_offset = (brw->batch.bo->offset64 +
-				       brw->sf.vp_offset) >> 5; /* reloc */
-
    sf->sf5.viewport_transform = 1;
 
    /* _NEW_SCISSOR */
@@ -194,6 +190,15 @@  static void upload_sf_unit( struct brw_context *brw )
     */
    sf->sf5.front_winding ^= render_to_fbo;
 
+   sf->sf5.sf_viewport_state_offset = 
+      brw_batch_reloc(&brw->batch,
+		      (brw->sf.state_offset + offsetof(struct brw_sf_unit_state, sf5)),
+		      brw->batch.bo,
+		      brw->sf.vp_offset | sf->dw5,
+		      I915_GEM_DOMAIN_INSTRUCTION, 0) >> 5;
+
+
+
    /* _NEW_POLYGON */
    switch (ctx->Polygon.CullFlag ? ctx->Polygon.CullFaceMode : GL_NONE) {
    case GL_FRONT:
@@ -284,14 +289,6 @@  static void upload_sf_unit( struct brw_context *brw )
     * something loaded through the GPE (L2 ISC), so it's INSTRUCTION domain.
     */
 
-   /* Emit SF viewport relocation */
-   drm_intel_bo_emit_reloc(bo, (brw->sf.state_offset +
-				offsetof(struct brw_sf_unit_state, sf5)),
-			   brw->batch.bo, (brw->sf.vp_offset |
-					     sf->sf5.front_winding |
-					     (sf->sf5.viewport_transform << 1)),
-			   I915_GEM_DOMAIN_INSTRUCTION, 0);
-
    brw->ctx.NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index a2127d1..7ca4d0c 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -213,7 +213,7 @@  void brw_destroy_caches( struct brw_context *brw );
  * brw_state_batch.c
  */
 #define BRW_BATCH_STRUCT(brw, s) \
-   intel_batchbuffer_data(brw, (s), sizeof(*(s)), RENDER_RING)
+   brw_batch_data(&brw->batch, (s), sizeof(*(s)))
 
 void *brw_state_batch(struct brw_context *brw,
 		      enum aub_state_struct_type type,
diff --git a/src/mesa/drivers/dri/i965/brw_state_batch.c b/src/mesa/drivers/dri/i965/brw_state_batch.c
index 45dca69..a51df69 100644
--- a/src/mesa/drivers/dri/i965/brw_state_batch.c
+++ b/src/mesa/drivers/dri/i965/brw_state_batch.c
@@ -30,7 +30,7 @@ 
   */
 
 #include "brw_state.h"
-#include "intel_batchbuffer.h"
+
 #include "main/imports.h"
 #include "util/ralloc.h"
 
@@ -40,14 +40,12 @@  brw_track_state_batch(struct brw_context *brw,
 		      uint32_t offset,
 		      int size)
 {
-   struct intel_batchbuffer *batch = &brw->batch;
-
    if (!brw->state_batch_list) {
       /* Our structs are always aligned to at least 32 bytes, so
        * our array doesn't need to be any larger
        */
       brw->state_batch_list = ralloc_size(brw, sizeof(*brw->state_batch_list) *
-					  batch->bo->size / 32);
+					  brw->batch.bo->size / 32);
    }
 
    brw->state_batch_list[brw->state_batch_count].offset = offset;
@@ -79,13 +77,14 @@  make_annotation(drm_intel_aub_annotation *annotation, uint32_t type,
  * is annotated according to the type of each data structure.
  */
 void
-brw_annotate_aub(struct brw_context *brw)
+brw_annotate_batch(struct brw_batch *batch)
 {
+   struct brw_context *brw = container_of(batch, struct brw_context, batch);
    unsigned annotation_count = 2 * brw->state_batch_count + 1;
    drm_intel_aub_annotation annotations[annotation_count];
    int a = 0;
    make_annotation(&annotations[a++], AUB_TRACE_TYPE_BATCH, 0,
-                   4*brw->batch.used);
+                   4*brw->batch.emit.nbatch);
    for (int i = brw->state_batch_count; i-- > 0; ) {
       uint32_t type = brw->state_batch_list[i].type;
       uint32_t start_offset = brw->state_batch_list[i].offset;
@@ -96,8 +95,8 @@  brw_annotate_aub(struct brw_context *brw)
                       AUB_TRACE_SUBTYPE(type), end_offset);
    }
    assert(a == annotation_count);
-   drm_intel_bufmgr_gem_set_aub_annotations(brw->batch.bo, annotations,
-                                            annotation_count);
+   drm_intel_bufmgr_gem_set_aub_annotations(brw->batch.bo->base,
+					    annotations, annotation_count);
 }
 
 /**
@@ -121,27 +120,13 @@  brw_state_batch(struct brw_context *brw,
 		int alignment,
 		uint32_t *out_offset)
 {
-   struct intel_batchbuffer *batch = &brw->batch;
-   uint32_t offset;
-
-   assert(size < batch->bo->size);
-   offset = ROUND_DOWN_TO(batch->state_batch_offset - size, alignment);
-
-   /* If allocating from the top would wrap below the batchbuffer, or
-    * if the batch's used space (plus the reserved pad) collides with our
-    * space, then flush and try again.
-    */
-   if (batch->state_batch_offset < size ||
-       offset < 4*batch->used + batch->reserved_space) {
-      intel_batchbuffer_flush(brw);
-      offset = ROUND_DOWN_TO(batch->state_batch_offset - size, alignment);
-   }
-
-   batch->state_batch_offset = offset;
+   assert(size < brw->batch.bo->size);
+   brw->batch.state = ROUND_DOWN_TO(4*brw->batch.state - size, alignment)/4;
+   assert(brw->batch.state > brw->batch.emit.nbatch);
 
    if (unlikely(INTEL_DEBUG & (DEBUG_BATCH | DEBUG_AUB)))
-      brw_track_state_batch(brw, type, offset, size);
+      brw_track_state_batch(brw, type, 4*brw->batch.state, size);
 
-   *out_offset = offset;
-   return batch->map + (offset>>2);
+   *out_offset = 4*brw->batch.state;
+   return brw->batch.map + brw->batch.state;
 }
diff --git a/src/mesa/drivers/dri/i965/brw_state_cache.c b/src/mesa/drivers/dri/i965/brw_state_cache.c
index 606740e..4c92a80 100644
--- a/src/mesa/drivers/dri/i965/brw_state_cache.c
+++ b/src/mesa/drivers/dri/i965/brw_state_cache.c
@@ -45,7 +45,6 @@ 
  */
 
 #include "main/imports.h"
-#include "intel_batchbuffer.h"
 #include "brw_state.h"
 #include "brw_vs.h"
 #include "brw_wm.h"
@@ -168,27 +167,18 @@  static void
 brw_cache_new_bo(struct brw_cache *cache, uint32_t new_size)
 {
    struct brw_context *brw = cache->brw;
-   drm_intel_bo *new_bo;
+   struct brw_bo *new_bo;
 
-   new_bo = drm_intel_bo_alloc(brw->bufmgr, "program cache", new_size, 64);
-   if (brw->has_llc)
-      drm_intel_gem_bo_map_unsynchronized(new_bo);
+   new_bo = brw_bo_create(&brw->batch, "program cache", new_size, 64);
 
    /* Copy any existing data that needs to be saved. */
    if (cache->next_offset != 0) {
-      if (brw->has_llc) {
-         memcpy(new_bo->virtual, cache->bo->virtual, cache->next_offset);
-      } else {
-         drm_intel_bo_map(cache->bo, false);
-         drm_intel_bo_subdata(new_bo, 0, cache->next_offset,
-                              cache->bo->virtual);
-         drm_intel_bo_unmap(cache->bo);
-      }
+	   memcpy(brw_bo_map(new_bo, MAP_WRITE),
+		  brw_bo_map(cache->bo, MAP_READ),
+		  cache->next_offset);
    }
 
-   if (brw->has_llc)
-      drm_intel_bo_unmap(cache->bo);
-   drm_intel_bo_unreference(cache->bo);
+   brw_bo_put(cache->bo);
    cache->bo = new_bo;
    cache->bo_used_by_gpu = false;
 
@@ -208,7 +198,6 @@  brw_try_upload_using_copy(struct brw_cache *cache,
 			  const void *data,
 			  const void *aux)
 {
-   struct brw_context *brw = cache->brw;
    int i;
    struct brw_cache_item *item;
 
@@ -230,11 +219,9 @@  brw_try_upload_using_copy(struct brw_cache *cache,
 	    continue;
 	 }
 
-         if (!brw->has_llc)
-            drm_intel_bo_map(cache->bo, false);
-	 ret = memcmp(cache->bo->virtual + item->offset, data, item->size);
-         if (!brw->has_llc)
-            drm_intel_bo_unmap(cache->bo);
+	 ret = memcmp(brw_bo_map(cache->bo, MAP_READ | MAP_ASYNC) + item->offset,
+		      data,
+		      item->size);
 	 if (ret)
 	    continue;
 
@@ -290,7 +277,6 @@  brw_upload_cache(struct brw_cache *cache,
 		 uint32_t *out_offset,
 		 void *out_aux)
 {
-   struct brw_context *brw = cache->brw;
    struct brw_cache_item *item = CALLOC_STRUCT(brw_cache_item);
    GLuint hash;
    void *tmp;
@@ -331,11 +317,7 @@  brw_upload_cache(struct brw_cache *cache,
    cache->n_items++;
 
    /* Copy data to the buffer */
-   if (brw->has_llc) {
-      memcpy((char *) cache->bo->virtual + item->offset, data, data_size);
-   } else {
-      drm_intel_bo_subdata(cache->bo, item->offset, data_size, data);
-   }
+   memcpy(brw_bo_map(cache->bo, MAP_WRITE | MAP_ASYNC) + item->offset, data, data_size);
 
    *out_offset = item->offset;
    *(void **)out_aux = (void *)((char *)item->key + item->key_size);
@@ -354,11 +336,7 @@  brw_init_caches(struct brw_context *brw)
    cache->items =
       calloc(cache->size, sizeof(struct brw_cache_item *));
 
-   cache->bo = drm_intel_bo_alloc(brw->bufmgr,
-				  "program cache",
-				  4096, 64);
-   if (brw->has_llc)
-      drm_intel_gem_bo_map_unsynchronized(cache->bo);
+   cache->bo = brw_bo_create(&brw->batch, "program cache", 4096, 64);
 
    cache->aux_compare[BRW_CACHE_VS_PROG] = brw_vs_prog_data_compare;
    cache->aux_compare[BRW_CACHE_GS_PROG] = brw_gs_prog_data_compare;
@@ -401,7 +379,6 @@  brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
     */
    brw->NewGLState |= ~0;
    brw->ctx.NewDriverState |= ~0ull;
-   intel_batchbuffer_flush(brw);
 }
 
 void
@@ -424,9 +401,7 @@  brw_destroy_cache(struct brw_context *brw, struct brw_cache *cache)
 
    DBG("%s\n", __func__);
 
-   if (brw->has_llc)
-      drm_intel_bo_unmap(cache->bo);
-   drm_intel_bo_unreference(cache->bo);
+   brw_bo_put(cache->bo);
    cache->bo = NULL;
    brw_clear_cache(brw, cache);
    free(cache->items);
diff --git a/src/mesa/drivers/dri/i965/brw_state_dump.c b/src/mesa/drivers/dri/i965/brw_state_dump.c
index 5cf70eb..20a8333 100644
--- a/src/mesa/drivers/dri/i965/brw_state_dump.c
+++ b/src/mesa/drivers/dri/i965/brw_state_dump.c
@@ -26,12 +26,16 @@ 
  */
 
 #include "main/mtypes.h"
-#include "intel_batchbuffer.h"
 
 #include "brw_context.h"
 #include "brw_defines.h"
 #include "brw_eu.h"
 
+static void *batch_in(struct brw_context *brw, unsigned offset)
+{
+   return (char *)brw->batch.map + offset; /* offset is in bytes */
+}
+
 static void
 batch_out(struct brw_context *brw, const char *name, uint32_t offset,
 	  int index, char *fmt, ...) PRINTFLIKE(5, 6);
@@ -40,7 +44,7 @@  static void
 batch_out(struct brw_context *brw, const char *name, uint32_t offset,
 	  int index, char *fmt, ...)
 {
-   uint32_t *data = brw->batch.bo->virtual + offset;
+   uint32_t *data = batch_in(brw, offset);
    va_list va;
 
    fprintf(stderr, "0x%08x:      0x%08x: %8s: ",
@@ -80,7 +84,7 @@  get_965_surface_format(unsigned int surface_format)
 static void dump_vs_state(struct brw_context *brw, uint32_t offset)
 {
    const char *name = "VS_STATE";
-   struct brw_vs_unit_state *vs = brw->batch.bo->virtual + offset;
+   struct brw_vs_unit_state *vs = batch_in(brw, offset);
 
    batch_out(brw, name, offset, 0, "thread0\n");
    batch_out(brw, name, offset, 1, "thread1\n");
@@ -95,7 +99,7 @@  static void dump_vs_state(struct brw_context *brw, uint32_t offset)
 static void dump_gs_state(struct brw_context *brw, uint32_t offset)
 {
    const char *name = "GS_STATE";
-   struct brw_gs_unit_state *gs = brw->batch.bo->virtual + offset;
+   struct brw_gs_unit_state *gs = batch_in(brw, offset);
 
    batch_out(brw, name, offset, 0, "thread0\n");
    batch_out(brw, name, offset, 1, "thread1\n");
@@ -110,7 +114,7 @@  static void dump_gs_state(struct brw_context *brw, uint32_t offset)
 static void dump_clip_state(struct brw_context *brw, uint32_t offset)
 {
    const char *name = "CLIP_STATE";
-   struct brw_clip_unit_state *clip = brw->batch.bo->virtual + offset;
+   struct brw_clip_unit_state *clip = batch_in(brw, offset);
 
    batch_out(brw, name, offset, 0, "thread0\n");
    batch_out(brw, name, offset, 1, "thread1\n");
@@ -129,7 +133,7 @@  static void dump_clip_state(struct brw_context *brw, uint32_t offset)
 static void dump_sf_state(struct brw_context *brw, uint32_t offset)
 {
    const char *name = "SF_STATE";
-   struct brw_sf_unit_state *sf = brw->batch.bo->virtual + offset;
+   struct brw_sf_unit_state *sf = batch_in(brw, offset);
 
    batch_out(brw, name, offset, 0, "thread0\n");
    batch_out(brw, name, offset, 1, "thread1\n");
@@ -145,7 +149,7 @@  static void dump_sf_state(struct brw_context *brw, uint32_t offset)
 static void dump_wm_state(struct brw_context *brw, uint32_t offset)
 {
    const char *name = "WM_STATE";
-   struct brw_wm_unit_state *wm = brw->batch.bo->virtual + offset;
+   struct brw_wm_unit_state *wm = batch_in(brw, offset);
 
    batch_out(brw, name, offset, 0, "thread0\n");
    batch_out(brw, name, offset, 1, "thread1\n");
@@ -172,7 +176,7 @@  static void dump_wm_state(struct brw_context *brw, uint32_t offset)
 static void dump_surface_state(struct brw_context *brw, uint32_t offset)
 {
    const char *name = "SURF";
-   uint32_t *surf = brw->batch.bo->virtual + offset;
+   uint32_t *surf = batch_in(brw, offset);
 
    batch_out(brw, name, offset, 0, "%s %s\n",
 	     get_965_surfacetype(GET_FIELD(surf[0], BRW_SURFACE_TYPE)),
@@ -196,7 +200,7 @@  static void dump_surface_state(struct brw_context *brw, uint32_t offset)
 static void dump_gen7_surface_state(struct brw_context *brw, uint32_t offset)
 {
    const char *name = "SURF";
-   uint32_t *surf = brw->batch.bo->virtual + offset;
+   uint32_t *surf = batch_in(brw, offset);
 
    batch_out(brw, name, offset, 0, "%s %s %s\n",
              get_965_surfacetype(GET_FIELD(surf[0], BRW_SURFACE_TYPE)),
@@ -228,8 +232,7 @@  dump_sdc(struct brw_context *brw, uint32_t offset)
    const char *name = "SDC";
 
    if (brw->gen >= 5 && brw->gen <= 6) {
-      struct gen5_sampler_default_color *sdc = (brw->batch.bo->virtual +
-						offset);
+      struct gen5_sampler_default_color *sdc = batch_in(brw, offset);
       batch_out(brw, name, offset, 0, "unorm rgba\n");
       batch_out(brw, name, offset, 1, "r %f\n", sdc->f[0]);
       batch_out(brw, name, offset, 2, "b %f\n", sdc->f[1]);
@@ -243,7 +246,7 @@  dump_sdc(struct brw_context *brw, uint32_t offset)
       batch_out(brw, name, offset, 10, "s16 ba\n");
       batch_out(brw, name, offset, 11, "s8 rgba\n");
    } else {
-      float *sdc = brw->batch.bo->virtual + offset;
+      float *sdc = batch_in(brw, offset);
       batch_out(brw, name, offset, 0, "r %f\n", sdc[0]);
       batch_out(brw, name, offset, 1, "g %f\n", sdc[1]);
       batch_out(brw, name, offset, 2, "b %f\n", sdc[2]);
@@ -255,7 +258,7 @@  static void dump_sampler_state(struct brw_context *brw,
 			       uint32_t offset, uint32_t size)
 {
    int i;
-   uint32_t *samp = brw->batch.bo->virtual + offset;
+   uint32_t *samp = batch_in(brw, offset);
 
    for (i = 0; i < size / 16; i++) {
       char name[20];
@@ -275,7 +278,7 @@  static void dump_sf_viewport_state(struct brw_context *brw,
 				   uint32_t offset)
 {
    const char *name = "SF VP";
-   struct brw_sf_viewport *vp = brw->batch.bo->virtual + offset;
+   struct brw_sf_viewport *vp = batch_in(brw, offset);
 
    assert(brw->gen < 7);
 
@@ -296,7 +299,7 @@  static void dump_clip_viewport_state(struct brw_context *brw,
 				     uint32_t offset)
 {
    const char *name = "CLIP VP";
-   struct brw_clipper_viewport *vp = brw->batch.bo->virtual + offset;
+   struct brw_clipper_viewport *vp = batch_in(brw, offset);
 
    assert(brw->gen < 7);
 
@@ -310,7 +313,7 @@  static void dump_sf_clip_viewport_state(struct brw_context *brw,
 					uint32_t offset)
 {
    const char *name = "SF_CLIP VP";
-   struct gen7_sf_clip_viewport *vp = brw->batch.bo->virtual + offset;
+   struct gen7_sf_clip_viewport *vp = batch_in(brw, offset);
 
    assert(brw->gen >= 7);
 
@@ -330,7 +333,7 @@  static void dump_sf_clip_viewport_state(struct brw_context *brw,
 static void dump_cc_viewport_state(struct brw_context *brw, uint32_t offset)
 {
    const char *name = "CC VP";
-   struct brw_cc_viewport *vp = brw->batch.bo->virtual + offset;
+   struct brw_cc_viewport *vp = batch_in(brw, offset);
 
    batch_out(brw, name, offset, 0, "min_depth = %f\n", vp->min_depth);
    batch_out(brw, name, offset, 1, "max_depth = %f\n", vp->max_depth);
@@ -339,7 +342,7 @@  static void dump_cc_viewport_state(struct brw_context *brw, uint32_t offset)
 static void dump_depth_stencil_state(struct brw_context *brw, uint32_t offset)
 {
    const char *name = "D_S";
-   struct gen6_depth_stencil_state *ds = brw->batch.bo->virtual + offset;
+   struct gen6_depth_stencil_state *ds = batch_in(brw, offset);
 
    batch_out(brw, name, offset, 0,
 	     "stencil %sable, func %d, write %sable\n",
@@ -373,7 +376,7 @@  static void dump_cc_state_gen4(struct brw_context *brw, uint32_t offset)
 static void dump_cc_state_gen6(struct brw_context *brw, uint32_t offset)
 {
    const char *name = "CC";
-   struct gen6_color_calc_state *cc = brw->batch.bo->virtual + offset;
+   struct gen6_color_calc_state *cc = batch_in(brw, offset);
 
    batch_out(brw, name, offset, 0,
 	     "alpha test format %s, round disable %d, stencil ref %d, "
@@ -401,7 +404,7 @@  static void
 dump_scissor(struct brw_context *brw, uint32_t offset)
 {
    const char *name = "SCISSOR";
-   struct gen6_scissor_rect *scissor = brw->batch.bo->virtual + offset;
+   struct gen6_scissor_rect *scissor = batch_in(brw, offset);
 
    batch_out(brw, name, offset, 0, "xmin %d, ymin %d\n",
 	     scissor->xmin, scissor->ymin);
@@ -413,8 +416,8 @@  static void
 dump_vs_constants(struct brw_context *brw, uint32_t offset, uint32_t size)
 {
    const char *name = "VS_CONST";
-   uint32_t *as_uint = brw->batch.bo->virtual + offset;
-   float *as_float = brw->batch.bo->virtual + offset;
+   uint32_t *as_uint = batch_in(brw, offset);
+   float *as_float = batch_in(brw, offset);
    int i;
 
    for (i = 0; i < size / 4; i += 4) {
@@ -429,8 +432,8 @@  static void
 dump_wm_constants(struct brw_context *brw, uint32_t offset, uint32_t size)
 {
    const char *name = "WM_CONST";
-   uint32_t *as_uint = brw->batch.bo->virtual + offset;
-   float *as_float = brw->batch.bo->virtual + offset;
+   uint32_t *as_uint = batch_in(brw, offset);
+   float *as_float = batch_in(brw, offset);
    int i;
 
    for (i = 0; i < size / 4; i += 4) {
@@ -446,7 +449,7 @@  static void dump_binding_table(struct brw_context *brw, uint32_t offset,
 {
    char name[20];
    int i;
-   uint32_t *data = brw->batch.bo->virtual + offset;
+   uint32_t *data = batch_in(brw, offset);
 
    for (i = 0; i < size / 4; i++) {
       if (data[i] == 0)
@@ -463,8 +466,6 @@  dump_prog_cache(struct brw_context *brw)
    struct brw_cache *cache = &brw->cache;
    unsigned int b;
 
-   drm_intel_bo_map(brw->cache.bo, false);
-
    for (b = 0; b < cache->size; b++) {
       struct brw_cache_item *item;
 
@@ -496,12 +497,11 @@  dump_prog_cache(struct brw_context *brw)
 	 }
 
          fprintf(stderr, "%s:\n", name);
-         brw_disassemble(brw->intelScreen->devinfo, brw->cache.bo->virtual,
+         brw_disassemble(brw->intelScreen->devinfo,
+			 brw_bo_map(brw->cache.bo, MAP_READ | MAP_ASYNC),
                          item->offset, item->size, stderr);
       }
    }
-
-   drm_intel_bo_unmap(brw->cache.bo);
 }
 
 static void
@@ -595,12 +595,11 @@  dump_state_batch(struct brw_context *brw)
  * The buffer offsets printed rely on the buffer containing the last offset
  * it was validated at.
  */
-void brw_debug_batch(struct brw_context *brw)
+void brw_debug_batch(struct brw_batch *batch)
 {
-   drm_intel_bo_map(brw->batch.bo, false);
-   dump_state_batch(brw);
-   drm_intel_bo_unmap(brw->batch.bo);
+   struct brw_context *brw = container_of(batch, struct brw_context, batch);
 
+   dump_state_batch(brw);
    if (0)
       dump_prog_cache(brw);
 }
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index e7ef41c..1ab7cc9 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -34,8 +34,8 @@ 
 #include "brw_context.h"
 #include "brw_state.h"
 #include "drivers/common/meta.h"
-#include "intel_batchbuffer.h"
 #include "intel_buffers.h"
+#include "intel_reg.h"
 #include "brw_vs.h"
 #include "brw_ff_gs.h"
 #include "brw_gs.h"
@@ -330,15 +330,21 @@  static const struct brw_tracked_state *gen8_compute_atoms[] =
 {
 };
 
-static void
+static int
 brw_upload_initial_gpu_state(struct brw_context *brw)
 {
+   int ret;
+
    /* On platforms with hardware contexts, we can set our initial GPU state
     * right away rather than doing it via state atoms.  This saves a small
     * amount of overhead on every draw call.
     */
-   if (!brw->hw_ctx)
-      return;
+   if (!brw->batch.hw_ctx)
+      return 0;
+
+   ret = brw_batch_begin(&brw->batch, 200, RENDER_RING);
+   if (ret < 0)
+      return ret;
 
    if (brw->gen == 6)
       brw_emit_post_sync_nonzero_flush(brw);
@@ -358,6 +364,8 @@  brw_upload_initial_gpu_state(struct brw_context *brw)
    if (brw->gen >= 8) {
       gen8_emit_3dstate_sample_pattern(brw);
    }
+
+   return brw_batch_end(&brw->batch);
 }
 
 static inline const struct brw_tracked_state *
diff --git a/src/mesa/drivers/dri/i965/brw_structs.h b/src/mesa/drivers/dri/i965/brw_structs.h
index 7c97a95..8b4ca20 100644
--- a/src/mesa/drivers/dri/i965/brw_structs.h
+++ b/src/mesa/drivers/dri/i965/brw_structs.h
@@ -391,13 +391,16 @@  struct brw_sf_unit_state
       unsigned pad3:1;
    } thread4;
 
-   struct
+   union
    {
-      unsigned front_winding:1;
-      unsigned viewport_transform:1;
-      unsigned pad0:3;
-      unsigned sf_viewport_state_offset:27; /* Offset from GENERAL_STATE_BASE */
-   } sf5;
+      struct {
+	 unsigned front_winding:1;
+	 unsigned viewport_transform:1;
+	 unsigned pad0:3;
+	 unsigned sf_viewport_state_offset:27; /* Offset from GENERAL_STATE_BASE */
+      } sf5;
+      uint32_t dw5;
+   };
 
    struct
    {
@@ -525,12 +528,15 @@  struct brw_wm_unit_state
    struct thread2 thread2;
    struct thread3 thread3;
 
-   struct {
-      unsigned stats_enable:1;
-      unsigned depth_buffer_clear:1;
-      unsigned sampler_count:3;
-      unsigned sampler_state_pointer:27;
-   } wm4;
+   union {
+	   struct {
+	      unsigned stats_enable:1;
+	      unsigned depth_buffer_clear:1;
+	      unsigned sampler_count:3;
+	      unsigned sampler_state_pointer:27;
+	   } wm4;
+	   uint32_t dw4;
+   };
 
    struct
    {
diff --git a/src/mesa/drivers/dri/i965/brw_urb.c b/src/mesa/drivers/dri/i965/brw_urb.c
index 6fcf1b0..fa79fba 100644
--- a/src/mesa/drivers/dri/i965/brw_urb.c
+++ b/src/mesa/drivers/dri/i965/brw_urb.c
@@ -31,11 +31,12 @@ 
 
 
 
-#include "intel_batchbuffer.h"
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
 
+#include "intel_reg.h"
+
 #define VS 0
 #define GS 1
 #define CLP 2
@@ -249,10 +250,10 @@  void brw_upload_urb_fence(struct brw_context *brw)
    uf.bits1.cs_fence  = brw->urb.size;
 
    /* erratum: URB_FENCE must not cross a 64byte cacheline */
-   if ((brw->batch.used & 15) > 12) {
-      int pad = 16 - (brw->batch.used & 15);
+   if ((brw->batch.emit.nbatch & 15) > 12) {
+      int pad = 16 - (brw->batch.emit.nbatch & 15);
       do
-	 brw->batch.map[brw->batch.used++] = MI_NOOP;
+	 brw_batch_emit(&brw->batch, MI_NOOP);
       while (--pad);
    }
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index 57b507d..9a7f1e6 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -1812,8 +1812,7 @@  brw_vs_emit(struct brw_context *brw,
       brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].NirOptions != NULL;
 
    if (unlikely(brw->perf_debug)) {
-      start_busy = (brw->batch.last_bo &&
-                    drm_intel_bo_busy(brw->batch.last_bo));
+      start_busy = brw_batch_busy(&brw->batch);
       start_time = get_time();
    }
 
@@ -1893,7 +1892,7 @@  brw_vs_emit(struct brw_context *brw,
       if (shader->compiled_once) {
          brw_vs_debug_recompile(brw, prog, &c->key);
       }
-      if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
+      if (start_busy && !brw_batch_busy(&brw->batch)) {
          perf_debug("VS compile took %.03f ms and stalled the GPU\n",
                     (get_time() - start_time) * 1000);
       }
diff --git a/src/mesa/drivers/dri/i965/brw_vs_state.c b/src/mesa/drivers/dri/i965/brw_vs_state.c
index b9b97a7..6ad4153 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_state.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_state.c
@@ -80,10 +80,16 @@  brw_upload_vs_unit(struct brw_context *brw)
       brw->vs.prog_data->base.base.binding_table.size_bytes / 4;
 
    if (brw->vs.prog_data->base.base.total_scratch != 0) {
-      vs->thread2.scratch_space_base_pointer =
-	 stage_state->scratch_bo->offset64 >> 10; /* reloc */
       vs->thread2.per_thread_scratch_space =
 	 ffs(brw->vs.prog_data->base.base.total_scratch) - 11;
+
+      vs->thread2.scratch_space_base_pointer =
+	      brw_batch_reloc(&brw->batch,
+			      stage_state->state_offset + offsetof(struct brw_vs_unit_state, thread2),
+			      stage_state->scratch_bo,
+			      vs->thread2.per_thread_scratch_space,
+			      I915_GEM_DOMAIN_RENDER,
+			      I915_GEM_DOMAIN_RENDER) >> 10;
    } else {
       vs->thread2.scratch_space_base_pointer = 0;
       vs->thread2.per_thread_scratch_space = 0;
@@ -158,24 +164,15 @@  brw_upload_vs_unit(struct brw_context *brw)
    if (stage_state->sampler_count) {
       /* BRW_NEW_SAMPLER_STATE_TABLE - reloc */
       vs->vs5.sampler_state_pointer =
-         (brw->batch.bo->offset64 + stage_state->sampler_offset) >> 5;
-      drm_intel_bo_emit_reloc(brw->batch.bo,
-                              stage_state->state_offset +
-                              offsetof(struct brw_vs_unit_state, vs5),
-                              brw->batch.bo,
-                              (stage_state->sampler_offset |
-                               vs->vs5.sampler_count),
-                              I915_GEM_DOMAIN_INSTRUCTION, 0);
+	      brw_batch_reloc(&brw->batch,
+			      stage_state->state_offset + offsetof(struct brw_vs_unit_state, vs5),
+			      brw->batch.bo,
+			      (stage_state->sampler_offset | vs->vs5.sampler_count),
+			      I915_GEM_DOMAIN_INSTRUCTION, 0) >> 5;
    }
 
    /* Emit scratch space relocation */
    if (brw->vs.prog_data->base.base.total_scratch != 0) {
-      drm_intel_bo_emit_reloc(brw->batch.bo,
-			      stage_state->state_offset +
-			      offsetof(struct brw_vs_unit_state, thread2),
-			      stage_state->scratch_bo,
-			      vs->thread2.per_thread_scratch_space,
-			      I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER);
    }
 
    brw->ctx.NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
diff --git a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
index f82a62b..c649b1a 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
@@ -74,7 +74,7 @@  brw_upload_pull_constants(struct brw_context *brw,
 
    /* BRW_NEW_*_PROG_DATA | _NEW_PROGRAM_CONSTANTS */
    uint32_t size = prog_data->nr_pull_params * 4;
-   drm_intel_bo *const_bo = NULL;
+   struct brw_bo *const_bo = NULL;
    uint32_t const_offset;
    gl_constant_value *constants = intel_upload_space(brw, size, 64,
                                                      &const_bo, &const_offset);
@@ -96,7 +96,7 @@  brw_upload_pull_constants(struct brw_context *brw,
    brw_create_constant_surface(brw, const_bo, const_offset, size,
                                &stage_state->surf_offset[surf_index],
                                dword_pitch);
-   drm_intel_bo_unreference(const_bo);
+   brw_bo_put(const_bo);
 
    brw->ctx.NewDriverState |= brw_new_constbuf;
 }
diff --git a/src/mesa/drivers/dri/i965/brw_wm_state.c b/src/mesa/drivers/dri/i965/brw_wm_state.c
index 0cd4390..1195f55 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_state.c
@@ -140,10 +140,15 @@  brw_upload_wm_unit(struct brw_context *brw)
       prog_data->base.binding_table.size_bytes / 4;
 
    if (prog_data->base.total_scratch != 0) {
-      wm->thread2.scratch_space_base_pointer =
-	 brw->wm.base.scratch_bo->offset64 >> 10; /* reloc */
       wm->thread2.per_thread_scratch_space =
 	 ffs(prog_data->base.total_scratch) - 11;
+
+      wm->thread2.scratch_space_base_pointer =
+	 brw_batch_reloc(&brw->batch,
+			 brw->wm.base.state_offset + offsetof(struct brw_wm_unit_state, thread2),
+			 brw->wm.base.scratch_bo,
+			 wm->thread2.per_thread_scratch_space,
+			 I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER) >> 10;
    } else {
       wm->thread2.scratch_space_base_pointer = 0;
       wm->thread2.per_thread_scratch_space = 0;
@@ -167,8 +172,12 @@  brw_upload_wm_unit(struct brw_context *brw)
 
    if (brw->wm.base.sampler_count) {
       /* BRW_NEW_SAMPLER_STATE_TABLE - reloc */
-      wm->wm4.sampler_state_pointer = (brw->batch.bo->offset64 +
-				       brw->wm.base.sampler_offset) >> 5;
+      wm->wm4.sampler_state_pointer =
+	 brw_batch_reloc(&brw->batch,
+			 brw->wm.base.state_offset + offsetof(struct brw_wm_unit_state, wm4),
+			 brw->batch.bo,
+			 brw->wm.base.sampler_offset | wm->dw4,
+			 I915_GEM_DOMAIN_INSTRUCTION, 0) >> 5;
    } else {
       wm->wm4.sampler_state_pointer = 0;
    }
@@ -229,27 +238,6 @@  brw_upload_wm_unit(struct brw_context *brw)
    if (unlikely(INTEL_DEBUG & DEBUG_STATS) || brw->stats_wm)
       wm->wm4.stats_enable = 1;
 
-   /* Emit scratch space relocation */
-   if (prog_data->base.total_scratch != 0) {
-      drm_intel_bo_emit_reloc(brw->batch.bo,
-			      brw->wm.base.state_offset +
-			      offsetof(struct brw_wm_unit_state, thread2),
-			      brw->wm.base.scratch_bo,
-			      wm->thread2.per_thread_scratch_space,
-			      I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER);
-   }
-
-   /* Emit sampler state relocation */
-   if (brw->wm.base.sampler_count != 0) {
-      drm_intel_bo_emit_reloc(brw->batch.bo,
-			      brw->wm.base.state_offset +
-			      offsetof(struct brw_wm_unit_state, wm4),
-			      brw->batch.bo, (brw->wm.base.sampler_offset |
-                                              wm->wm4.stats_enable |
-                                              (wm->wm4.sampler_count << 2)),
-			      I915_GEM_DOMAIN_INSTRUCTION, 0);
-   }
-
    brw->ctx.NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index 25fb543..aeded95 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -37,7 +37,6 @@ 
 #include "program/prog_parameter.h"
 
 #include "intel_mipmap_tree.h"
-#include "intel_batchbuffer.h"
 #include "intel_tex.h"
 #include "intel_fbo.h"
 #include "intel_buffer_objects.h"
@@ -242,7 +241,7 @@  brw_get_texture_swizzle(const struct gl_context *ctx,
 static void
 gen4_emit_buffer_surface_state(struct brw_context *brw,
                                uint32_t *out_offset,
-                               drm_intel_bo *bo,
+                               struct brw_bo *bo,
                                unsigned buffer_offset,
                                unsigned surface_format,
                                unsigned buffer_size,
@@ -256,7 +255,13 @@  gen4_emit_buffer_surface_state(struct brw_context *brw,
    surf[0] = BRW_SURFACE_BUFFER << BRW_SURFACE_TYPE_SHIFT |
              surface_format << BRW_SURFACE_FORMAT_SHIFT |
              (brw->gen >= 6 ? BRW_SURFACE_RC_READ_WRITE : 0);
-   surf[1] = (bo ? bo->offset64 : 0) + buffer_offset; /* reloc */
+   if (bo)
+      surf[1] = brw_batch_reloc(&brw->batch, *out_offset + 4,
+				bo, buffer_offset,
+				I915_GEM_DOMAIN_SAMPLER,
+				(rw ? I915_GEM_DOMAIN_SAMPLER : 0));
+   else
+      surf[1] = buffer_offset;
    surf[2] = (buffer_size & 0x7f) << BRW_SURFACE_WIDTH_SHIFT |
              ((buffer_size >> 7) & 0x1fff) << BRW_SURFACE_HEIGHT_SHIFT;
    surf[3] = ((buffer_size >> 20) & 0x7f) << BRW_SURFACE_DEPTH_SHIFT |
@@ -267,10 +272,6 @@  gen4_emit_buffer_surface_state(struct brw_context *brw,
     * physical cache.  It is mapped in hardware to the sampler cache."
     */
    if (bo) {
-      drm_intel_bo_emit_reloc(brw->batch.bo, *out_offset + 4,
-                              bo, buffer_offset,
-                              I915_GEM_DOMAIN_SAMPLER,
-                              (rw ? I915_GEM_DOMAIN_SAMPLER : 0));
    }
 }
 
@@ -284,7 +285,7 @@  brw_update_buffer_texture_surface(struct gl_context *ctx,
    struct intel_buffer_object *intel_obj =
       intel_buffer_object(tObj->BufferObject);
    uint32_t size = tObj->BufferSize;
-   drm_intel_bo *bo = NULL;
+   struct brw_bo *bo = NULL;
    mesa_format format = tObj->_BufferObjectFormat;
    uint32_t brw_format = brw_format_for_mesa_format(format);
    int texel_size = _mesa_get_format_bytes(format);
@@ -366,7 +367,11 @@  brw_update_texture_surface(struct gl_context *ctx,
 	      BRW_SURFACE_CUBEFACE_ENABLES |
 	      tex_format << BRW_SURFACE_FORMAT_SHIFT);
 
-   surf[1] = mt->bo->offset64 + mt->offset; /* reloc */
+   surf[1] = brw_batch_reloc(&brw->batch,
+			     *surf_offset + 4,
+			     mt->bo,
+			     mt->offset,
+			     I915_GEM_DOMAIN_SAMPLER, 0);
 
    surf[2] = ((intelObj->_MaxLevel - tObj->BaseLevel) << BRW_SURFACE_LOD_SHIFT |
 	      (mt->logical_width0 - 1) << BRW_SURFACE_WIDTH_SHIFT |
@@ -380,13 +385,6 @@  brw_update_texture_surface(struct gl_context *ctx,
               SET_FIELD(tObj->BaseLevel - mt->first_level, BRW_SURFACE_MIN_LOD));
 
    surf[5] = mt->align_h == 4 ? BRW_SURFACE_VERTICAL_ALIGN_ENABLE : 0;
-
-   /* Emit relocation to surface contents */
-   drm_intel_bo_emit_reloc(brw->batch.bo,
-                           *surf_offset + 4,
-                           mt->bo,
-                           surf[1] - mt->bo->offset64,
-                           I915_GEM_DOMAIN_SAMPLER, 0);
 }
 
 /**
@@ -395,7 +393,7 @@  brw_update_texture_surface(struct gl_context *ctx,
  */
 void
 brw_create_constant_surface(struct brw_context *brw,
-			    drm_intel_bo *bo,
+			    struct brw_bo *bo,
 			    uint32_t offset,
 			    uint32_t size,
 			    uint32_t *out_offset,
@@ -423,7 +421,7 @@  brw_update_sol_surface(struct brw_context *brw,
 {
    struct intel_buffer_object *intel_bo = intel_buffer_object(buffer_obj);
    uint32_t offset_bytes = 4 * offset_dwords;
-   drm_intel_bo *bo = intel_bufferobj_buffer(brw, intel_bo,
+   struct brw_bo *bo = intel_bufferobj_buffer(brw, intel_bo,
                                              offset_bytes,
                                              buffer_obj->Size - offset_bytes);
    uint32_t *surf = brw_state_batch(brw, AUB_TRACE_SURFACE_STATE, 6 * 4, 32,
@@ -479,19 +477,17 @@  brw_update_sol_surface(struct brw_context *brw,
       BRW_SURFACE_MIPMAPLAYOUT_BELOW << BRW_SURFACE_MIPLAYOUT_SHIFT |
       surface_format << BRW_SURFACE_FORMAT_SHIFT |
       BRW_SURFACE_RC_READ_WRITE;
-   surf[1] = bo->offset64 + offset_bytes; /* reloc */
+   surf[1] = brw_batch_reloc(&brw->batch,
+			     *out_offset + 4,
+			     bo, offset_bytes,
+			     I915_GEM_DOMAIN_RENDER,
+			     I915_GEM_DOMAIN_RENDER);
    surf[2] = (width << BRW_SURFACE_WIDTH_SHIFT |
 	      height << BRW_SURFACE_HEIGHT_SHIFT);
    surf[3] = (depth << BRW_SURFACE_DEPTH_SHIFT |
               pitch_minus_1 << BRW_SURFACE_PITCH_SHIFT);
    surf[4] = 0;
    surf[5] = 0;
-
-   /* Emit relocation to surface contents. */
-   drm_intel_bo_emit_reloc(brw->batch.bo,
-			   *out_offset + 4,
-			   bo, offset_bytes,
-			   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER);
 }
 
 /* Creates a new WM constant buffer reflecting the current fragment program's
@@ -559,7 +555,7 @@  brw_emit_null_surface_state(struct brw_context *brw,
     *     - Surface Format must be R8G8B8A8_UNORM.
     */
    unsigned surface_type = BRW_SURFACE_NULL;
-   drm_intel_bo *bo = NULL;
+   struct brw_bo *bo = NULL;
    unsigned pitch_minus_1 = 0;
    uint32_t multisampling_state = 0;
    uint32_t *surf = brw_state_batch(brw, AUB_TRACE_SURFACE_STATE, 6 * 4, 32,
@@ -599,7 +595,14 @@  brw_emit_null_surface_state(struct brw_context *brw,
 		  1 << BRW_SURFACE_WRITEDISABLE_B_SHIFT |
 		  1 << BRW_SURFACE_WRITEDISABLE_A_SHIFT);
    }
-   surf[1] = bo ? bo->offset64 : 0;
+   if (bo)
+      surf[1] = brw_batch_reloc(&brw->batch,
+				*out_offset + 4,
+				bo, 0,
+				I915_GEM_DOMAIN_RENDER,
+				I915_GEM_DOMAIN_RENDER);
+   else
+      surf[1] = 0;
    surf[2] = ((width - 1) << BRW_SURFACE_WIDTH_SHIFT |
               (height - 1) << BRW_SURFACE_HEIGHT_SHIFT);
 
@@ -612,13 +615,6 @@  brw_emit_null_surface_state(struct brw_context *brw,
               pitch_minus_1 << BRW_SURFACE_PITCH_SHIFT);
    surf[4] = multisampling_state;
    surf[5] = 0;
-
-   if (bo) {
-      drm_intel_bo_emit_reloc(brw->batch.bo,
-                              *out_offset + 4,
-                              bo, 0,
-                              I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER);
-   }
 }
 
 /**
@@ -675,8 +671,13 @@  brw_update_renderbuffer_surface(struct brw_context *brw,
 
    /* reloc */
    assert(mt->offset % mt->cpp == 0);
-   surf[1] = (intel_renderbuffer_get_tile_offsets(irb, &tile_x, &tile_y) +
-	      mt->bo->offset64 + mt->offset);
+   surf[1] = brw_batch_reloc(&brw->batch,
+			     brw->wm.base.surf_offset[surf_index] + 4,
+			     mt->bo,
+			     mt->offset +
+			     intel_renderbuffer_get_tile_offsets(irb, &tile_x, &tile_y),
+			     I915_GEM_DOMAIN_RENDER,
+			     I915_GEM_DOMAIN_RENDER);
 
    surf[2] = ((rb->Width - 1) << BRW_SURFACE_WIDTH_SHIFT |
 	      (rb->Height - 1) << BRW_SURFACE_HEIGHT_SHIFT);
@@ -718,13 +719,6 @@  brw_update_renderbuffer_surface(struct brw_context *brw,
       }
    }
 
-   drm_intel_bo_emit_reloc(brw->batch.bo,
-                           offset + 4,
-                           mt->bo,
-                           surf[1] - mt->bo->offset64,
-                           I915_GEM_DOMAIN_RENDER,
-                           I915_GEM_DOMAIN_RENDER);
-
    return offset;
 }
 
@@ -902,7 +896,7 @@  brw_upload_ubo_surfaces(struct brw_context *brw,
 
       binding = &ctx->UniformBufferBindings[shader->UniformBlocks[i].Binding];
       intel_bo = intel_buffer_object(binding->BufferObject);
-      drm_intel_bo *bo =
+      struct brw_bo *bo =
          intel_bufferobj_buffer(brw, intel_bo,
                                 binding->Offset,
                                 binding->BufferObject->Size - binding->Offset);
@@ -961,7 +955,7 @@  brw_upload_abo_surfaces(struct brw_context *brw,
          &ctx->AtomicBufferBindings[prog->AtomicBuffers[i].Binding];
       struct intel_buffer_object *intel_bo =
          intel_buffer_object(binding->BufferObject);
-      drm_intel_bo *bo = intel_bufferobj_buffer(
+      struct brw_bo *bo = intel_bufferobj_buffer(
          brw, intel_bo, binding->Offset, intel_bo->Base.Size - binding->Offset);
 
       brw->vtbl.emit_buffer_surface_state(brw, &surf_offsets[i], bo,
diff --git a/src/mesa/drivers/dri/i965/gen6_blorp.cpp b/src/mesa/drivers/dri/i965/gen6_blorp.cpp
index 54c4a6d..6aa772c 100644
--- a/src/mesa/drivers/dri/i965/gen6_blorp.cpp
+++ b/src/mesa/drivers/dri/i965/gen6_blorp.cpp
@@ -23,7 +23,6 @@ 
 
 #include <assert.h>
 
-#include "intel_batchbuffer.h"
 #include "intel_fbo.h"
 #include "intel_mipmap_tree.h"
 
@@ -386,9 +385,12 @@  gen6_blorp_emit_surface_state(struct brw_context *brw,
               BRW_SURFACE_CUBEFACE_ENABLES |
               surface->brw_surfaceformat << BRW_SURFACE_FORMAT_SHIFT);
 
-   /* reloc */
-   surf[1] = (surface->compute_tile_offsets(&tile_x, &tile_y) +
-              mt->bo->offset64);
+   surf[1] = brw_batch_reloc(&brw->batch,
+			     wm_surf_offset + 4,
+			     mt->bo,
+			     surface->compute_tile_offsets(&tile_x, &tile_y),
+			     read_domains, write_domain);
+
 
    surf[2] = (0 << BRW_SURFACE_LOD_SHIFT |
               (width - 1) << BRW_SURFACE_WIDTH_SHIFT |
@@ -416,13 +418,6 @@  gen6_blorp_emit_surface_state(struct brw_context *brw,
               (surface->mt->align_h == 4 ?
                BRW_SURFACE_VERTICAL_ALIGN_ENABLE : 0));
 
-   /* Emit relocation to surface contents */
-   drm_intel_bo_emit_reloc(brw->batch.bo,
-                           wm_surf_offset + 4,
-                           mt->bo,
-                           surf[1] - mt->bo->offset64,
-                           read_domains, write_domain);
-
    return wm_surf_offset;
 }
 
diff --git a/src/mesa/drivers/dri/i965/gen6_cc.c b/src/mesa/drivers/dri/i965/gen6_cc.c
index 2bfa271..446a07b 100644
--- a/src/mesa/drivers/dri/i965/gen6_cc.c
+++ b/src/mesa/drivers/dri/i965/gen6_cc.c
@@ -29,7 +29,6 @@ 
 #include "brw_state.h"
 #include "brw_defines.h"
 #include "brw_util.h"
-#include "intel_batchbuffer.h"
 #include "main/macros.h"
 #include "main/enums.h"
 #include "main/glformats.h"
diff --git a/src/mesa/drivers/dri/i965/gen6_clip_state.c b/src/mesa/drivers/dri/i965/gen6_clip_state.c
index aaf90df..3622dc1 100644
--- a/src/mesa/drivers/dri/i965/gen6_clip_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_clip_state.c
@@ -29,7 +29,6 @@ 
 #include "brw_state.h"
 #include "brw_defines.h"
 #include "brw_util.h"
-#include "intel_batchbuffer.h"
 #include "main/fbobject.h"
 
 static void
diff --git a/src/mesa/drivers/dri/i965/gen6_depth_state.c b/src/mesa/drivers/dri/i965/gen6_depth_state.c
index 8f0d7dc..b1a9dd1 100644
--- a/src/mesa/drivers/dri/i965/gen6_depth_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_depth_state.c
@@ -22,7 +22,6 @@ 
  */
 
 
-#include "intel_batchbuffer.h"
 #include "intel_fbo.h"
 #include "intel_mipmap_tree.h"
 
diff --git a/src/mesa/drivers/dri/i965/gen6_depthstencil.c b/src/mesa/drivers/dri/i965/gen6_depthstencil.c
index 2c625c9..ed731c5 100644
--- a/src/mesa/drivers/dri/i965/gen6_depthstencil.c
+++ b/src/mesa/drivers/dri/i965/gen6_depthstencil.c
@@ -25,7 +25,6 @@ 
  *
  */
 
-#include "intel_batchbuffer.h"
 #include "intel_fbo.h"
 #include "brw_context.h"
 #include "brw_defines.h"
diff --git a/src/mesa/drivers/dri/i965/gen6_gs_state.c b/src/mesa/drivers/dri/i965/gen6_gs_state.c
index eb4c586..3d4bb68 100644
--- a/src/mesa/drivers/dri/i965/gen6_gs_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_gs_state.c
@@ -28,7 +28,6 @@ 
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
-#include "intel_batchbuffer.h"
 
 static void
 gen6_upload_gs_push_constants(struct brw_context *brw)
diff --git a/src/mesa/drivers/dri/i965/gen6_multisample_state.c b/src/mesa/drivers/dri/i965/gen6_multisample_state.c
index ec46479..74f1478 100644
--- a/src/mesa/drivers/dri/i965/gen6_multisample_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_multisample_state.c
@@ -21,7 +21,6 @@ 
  * IN THE SOFTWARE.
  */
 
-#include "intel_batchbuffer.h"
 
 #include "brw_context.h"
 #include "brw_defines.h"
diff --git a/src/mesa/drivers/dri/i965/gen6_queryobj.c b/src/mesa/drivers/dri/i965/gen6_queryobj.c
index 0092528..55e0e3a 100644
--- a/src/mesa/drivers/dri/i965/gen6_queryobj.c
+++ b/src/mesa/drivers/dri/i965/gen6_queryobj.c
@@ -36,7 +36,6 @@ 
 #include "brw_context.h"
 #include "brw_defines.h"
 #include "brw_state.h"
-#include "intel_batchbuffer.h"
 #include "intel_reg.h"
 
 /*
@@ -50,7 +49,7 @@ 
  */
 void
 brw_store_register_mem64(struct brw_context *brw,
-                         drm_intel_bo *bo, uint32_t reg, int idx)
+                         struct brw_bo *bo, uint32_t reg, int idx)
 {
    assert(brw->gen >= 6);
 
@@ -84,7 +83,7 @@  brw_store_register_mem64(struct brw_context *brw,
 
 static void
 write_primitives_generated(struct brw_context *brw,
-                           drm_intel_bo *query_bo, int stream, int idx)
+                           struct brw_bo *query_bo, int stream, int idx)
 {
    brw_emit_mi_flush(brw);
 
@@ -98,7 +97,7 @@  write_primitives_generated(struct brw_context *brw,
 
 static void
 write_xfb_primitives_written(struct brw_context *brw,
-                             drm_intel_bo *bo, int stream, int idx)
+                             struct brw_bo *bo, int stream, int idx)
 {
    brw_emit_mi_flush(brw);
 
@@ -119,7 +118,7 @@  pipeline_target_to_index(int target)
 }
 
 static void
-emit_pipeline_stat(struct brw_context *brw, drm_intel_bo *bo,
+emit_pipeline_stat(struct brw_context *brw, struct brw_bo *bo,
                    int stream, int target, int idx)
 {
    /* One source of confusion is the tessellation shader statistics. The
@@ -175,8 +174,7 @@  gen6_queryobj_get_results(struct gl_context *ctx,
    if (query->bo == NULL)
       return;
 
-   brw_bo_map(brw, query->bo, false, "query object");
-   uint64_t *results = query->bo->virtual;
+   uint64_t *results = brw_bo_map(query->bo, MAP_READ);
    switch (query->Base.Target) {
    case GL_TIME_ELAPSED:
       /* The query BO contains the starting and ending timestamps.
@@ -255,12 +253,11 @@  gen6_queryobj_get_results(struct gl_context *ctx,
    default:
       unreachable("Unrecognized query target in brw_queryobj_get_results()");
    }
-   drm_intel_bo_unmap(query->bo);
 
    /* Now that we've processed the data stored in the query's buffer object,
     * we can release it.
     */
-   drm_intel_bo_unreference(query->bo);
+   brw_bo_put(query->bo);
    query->bo = NULL;
 
    query->Base.Ready = true;
@@ -279,8 +276,8 @@  gen6_begin_query(struct gl_context *ctx, struct gl_query_object *q)
    struct brw_query_object *query = (struct brw_query_object *)q;
 
    /* Since we're starting a new query, we need to throw away old results. */
-   drm_intel_bo_unreference(query->bo);
-   query->bo = drm_intel_bo_alloc(brw->bufmgr, "query results", 4096, 4096);
+   brw_bo_put(query->bo);
+   query->bo = brw_bo_create(&brw->batch, "query results", 4096, 4096);
 
    switch (query->Base.Target) {
    case GL_TIME_ELAPSED:
@@ -390,27 +387,6 @@  gen6_end_query(struct gl_context *ctx, struct gl_query_object *q)
    default:
       unreachable("Unrecognized query target in brw_end_query()");
    }
-
-   /* The current batch contains the commands to handle EndQuery(),
-    * but they won't actually execute until it is flushed.
-    */
-   query->flushed = false;
-}
-
-/**
- * Flush the batch if it still references the query object BO.
- */
-static void
-flush_batch_if_needed(struct brw_context *brw, struct brw_query_object *query)
-{
-   /* If the batch doesn't reference the BO, it must have been flushed
-    * (for example, due to being full).  Record that it's been flushed.
-    */
-   query->flushed = query->flushed ||
-      !drm_intel_bo_references(brw->batch.bo, query->bo);
-
-   if (!query->flushed)
-      intel_batchbuffer_flush(brw);
 }
 
 /**
@@ -421,14 +397,13 @@  flush_batch_if_needed(struct brw_context *brw, struct brw_query_object *query)
  */
 static void gen6_wait_query(struct gl_context *ctx, struct gl_query_object *q)
 {
-   struct brw_context *brw = brw_context(ctx);
    struct brw_query_object *query = (struct brw_query_object *)q;
 
    /* If the application has requested the query result, but this batch is
     * still contributing to it, flush it now to finish that work so the
     * result will become available (eventually).
     */
-   flush_batch_if_needed(brw, query);
+   brw_bo_flush(query->bo);
 
    gen6_queryobj_get_results(ctx, query);
 }
@@ -441,7 +416,6 @@  static void gen6_wait_query(struct gl_context *ctx, struct gl_query_object *q)
  */
 static void gen6_check_query(struct gl_context *ctx, struct gl_query_object *q)
 {
-   struct brw_context *brw = brw_context(ctx);
    struct brw_query_object *query = (struct brw_query_object *)q;
 
    /* If query->bo is NULL, we've already gathered the results - this is a
@@ -457,9 +431,9 @@  static void gen6_check_query(struct gl_context *ctx, struct gl_query_object *q)
     *      not ready yet on the first time it is queried.  This ensures that
     *      the async query will return true in finite time.
     */
-   flush_batch_if_needed(brw, query);
+   brw_bo_flush(query->bo);
 
-   if (!drm_intel_bo_busy(query->bo)) {
+   if (!brw_bo_busy(query->bo, BUSY_READ | BUSY_FLUSH)) {
       gen6_queryobj_get_results(ctx, query);
    }
 }
diff --git a/src/mesa/drivers/dri/i965/gen6_sampler_state.c b/src/mesa/drivers/dri/i965/gen6_sampler_state.c
index 9e7da58..ecc6b21 100644
--- a/src/mesa/drivers/dri/i965/gen6_sampler_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_sampler_state.c
@@ -28,7 +28,6 @@ 
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
-#include "intel_batchbuffer.h"
 
 static void
 upload_sampler_state_pointers(struct brw_context *brw)
diff --git a/src/mesa/drivers/dri/i965/gen6_scissor_state.c b/src/mesa/drivers/dri/i965/gen6_scissor_state.c
index 0111f15..7809c1e 100644
--- a/src/mesa/drivers/dri/i965/gen6_scissor_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_scissor_state.c
@@ -28,7 +28,6 @@ 
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
-#include "intel_batchbuffer.h"
 #include "main/fbobject.h"
 #include "main/framebuffer.h"
 
diff --git a/src/mesa/drivers/dri/i965/gen6_sf_state.c b/src/mesa/drivers/dri/i965/gen6_sf_state.c
index ea5c47a..c94ebb0 100644
--- a/src/mesa/drivers/dri/i965/gen6_sf_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_sf_state.c
@@ -31,7 +31,6 @@ 
 #include "brw_util.h"
 #include "main/macros.h"
 #include "main/fbobject.h"
-#include "intel_batchbuffer.h"
 
 /**
  * Determine the appropriate attribute override value to store into the
diff --git a/src/mesa/drivers/dri/i965/gen6_sol.c b/src/mesa/drivers/dri/i965/gen6_sol.c
index 3899ce9..4229ea9 100644
--- a/src/mesa/drivers/dri/i965/gen6_sol.c
+++ b/src/mesa/drivers/dri/i965/gen6_sol.c
@@ -29,7 +29,6 @@ 
 #include "main/bufferobj.h"
 #include "main/macros.h"
 #include "brw_context.h"
-#include "intel_batchbuffer.h"
 #include "brw_defines.h"
 #include "brw_state.h"
 #include "main/transformfeedback.h"
@@ -205,9 +204,9 @@  brw_new_transform_feedback(struct gl_context *ctx, GLuint name)
    _mesa_init_transform_feedback_object(&brw_obj->base, name);
 
    brw_obj->offset_bo =
-      drm_intel_bo_alloc(brw->bufmgr, "transform feedback offsets", 16, 64);
+      brw_bo_create(&brw->batch, "transform feedback offsets", 16, 64);
    brw_obj->prim_count_bo =
-      drm_intel_bo_alloc(brw->bufmgr, "xfb primitive counts", 4096, 64);
+      brw_bo_create(&brw->batch, "xfb primitive counts", 4096, 64);
 
    return &brw_obj->base;
 }
@@ -223,8 +222,8 @@  brw_delete_transform_feedback(struct gl_context *ctx,
       _mesa_reference_buffer_object(ctx, &obj->Buffers[i], NULL);
    }
 
-   drm_intel_bo_unreference(brw_obj->offset_bo);
-   drm_intel_bo_unreference(brw_obj->prim_count_bo);
+   brw_bo_put(brw_obj->offset_bo);
+   brw_bo_put(brw_obj->prim_count_bo);
 
    free(brw_obj);
 }
diff --git a/src/mesa/drivers/dri/i965/gen6_surface_state.c b/src/mesa/drivers/dri/i965/gen6_surface_state.c
index 03e913a..9121b41 100644
--- a/src/mesa/drivers/dri/i965/gen6_surface_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_surface_state.c
@@ -30,7 +30,6 @@ 
 #include "program/prog_parameter.h"
 
 #include "intel_mipmap_tree.h"
-#include "intel_batchbuffer.h"
 #include "intel_tex.h"
 #include "intel_fbo.h"
 #include "intel_buffer_objects.h"
@@ -95,7 +94,12 @@  gen6_update_renderbuffer_surface(struct brw_context *brw,
 
    /* reloc */
    assert(mt->offset % mt->cpp == 0);
-   surf[1] = mt->bo->offset64 + mt->offset;
+   surf[1] = brw_batch_reloc(&brw->batch,
+			     brw->wm.base.surf_offset[surf_index] + 4,
+			     mt->bo,
+			     mt->offset,
+			     I915_GEM_DOMAIN_RENDER,
+			     I915_GEM_DOMAIN_RENDER);
 
    /* In the gen6 PRM Volume 1 Part 1: Graphics Core, Section 7.18.3.7.1
     * (Surface Arrays For all surfaces other than separate stencil buffer):
@@ -127,13 +131,6 @@  gen6_update_renderbuffer_surface(struct brw_context *brw,
 
    surf[5] = (mt->align_h == 4 ? BRW_SURFACE_VERTICAL_ALIGN_ENABLE : 0);
 
-   drm_intel_bo_emit_reloc(brw->batch.bo,
-                           offset + 4,
-                           mt->bo,
-                           surf[1] - mt->bo->offset64,
-                           I915_GEM_DOMAIN_RENDER,
-                           I915_GEM_DOMAIN_RENDER);
-
    return offset;
 }
 
diff --git a/src/mesa/drivers/dri/i965/gen6_urb.c b/src/mesa/drivers/dri/i965/gen6_urb.c
index c7311fd..78cb973 100644
--- a/src/mesa/drivers/dri/i965/gen6_urb.c
+++ b/src/mesa/drivers/dri/i965/gen6_urb.c
@@ -26,7 +26,6 @@ 
  */
 
 #include "main/macros.h"
-#include "intel_batchbuffer.h"
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
diff --git a/src/mesa/drivers/dri/i965/gen6_viewport_state.c b/src/mesa/drivers/dri/i965/gen6_viewport_state.c
index 0c63283..daa0c0f 100644
--- a/src/mesa/drivers/dri/i965/gen6_viewport_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_viewport_state.c
@@ -28,7 +28,6 @@ 
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
-#include "intel_batchbuffer.h"
 #include "main/fbobject.h"
 #include "main/viewport.h"
 
diff --git a/src/mesa/drivers/dri/i965/gen6_vs_state.c b/src/mesa/drivers/dri/i965/gen6_vs_state.c
index 35d10ef..89bb426 100644
--- a/src/mesa/drivers/dri/i965/gen6_vs_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_vs_state.c
@@ -29,9 +29,9 @@ 
 #include "brw_state.h"
 #include "brw_defines.h"
 #include "brw_util.h"
+#include "intel_reg.h"
 #include "program/prog_parameter.h"
 #include "program/prog_statevars.h"
-#include "intel_batchbuffer.h"
 #include "glsl/glsl_parser_extras.h"
 
 /**
diff --git a/src/mesa/drivers/dri/i965/gen6_wm_state.c b/src/mesa/drivers/dri/i965/gen6_wm_state.c
index 8e673a4..33d2be2 100644
--- a/src/mesa/drivers/dri/i965/gen6_wm_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_wm_state.c
@@ -33,7 +33,6 @@ 
 #include "program/program.h"
 #include "program/prog_parameter.h"
 #include "program/prog_statevars.h"
-#include "intel_batchbuffer.h"
 
 static void
 gen6_upload_wm_push_constants(struct brw_context *brw)
diff --git a/src/mesa/drivers/dri/i965/gen7_blorp.cpp b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
index abace6d..62d735e 100644
--- a/src/mesa/drivers/dri/i965/gen7_blorp.cpp
+++ b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
@@ -23,7 +23,6 @@ 
 
 #include <assert.h>
 
-#include "intel_batchbuffer.h"
 #include "intel_fbo.h"
 #include "intel_mipmap_tree.h"
 
@@ -168,9 +167,11 @@  gen7_blorp_emit_surface_state(struct brw_context *brw,
    else
       surf[0] |= GEN7_SURFACE_ARYSPC_FULL;
 
-   /* reloc */
-   surf[1] =
-      surface->compute_tile_offsets(&tile_x, &tile_y) + mt->bo->offset64;
+   surf[1] = brw_batch_reloc(&brw->batch,
+			     wm_surf_offset + 4,
+			     mt->bo,
+			     surface->compute_tile_offsets(&tile_x, &tile_y),
+			     read_domains, write_domain);
 
    /* Note that the low bits of these fields are missing, so
     * there's the possibility of getting in trouble.
@@ -204,13 +205,6 @@  gen7_blorp_emit_surface_state(struct brw_context *brw,
                   SET_FIELD(HSW_SCS_ALPHA, GEN7_SURFACE_SCS_A));
    }
 
-   /* Emit relocation to surface contents */
-   drm_intel_bo_emit_reloc(brw->batch.bo,
-                           wm_surf_offset + 4,
-                           mt->bo,
-                           surf[1] - mt->bo->offset64,
-                           read_domains, write_domain);
-
    gen7_check_surface_setup(surf, is_render_target);
 
    return wm_surf_offset;
diff --git a/src/mesa/drivers/dri/i965/gen7_disable.c b/src/mesa/drivers/dri/i965/gen7_disable.c
index 2c43cd7..6d0be45 100644
--- a/src/mesa/drivers/dri/i965/gen7_disable.c
+++ b/src/mesa/drivers/dri/i965/gen7_disable.c
@@ -24,7 +24,6 @@ 
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
-#include "intel_batchbuffer.h"
 
 static void
 disable_stages(struct brw_context *brw)
diff --git a/src/mesa/drivers/dri/i965/gen7_gs_state.c b/src/mesa/drivers/dri/i965/gen7_gs_state.c
index e1c4f8b..abd5ee2 100644
--- a/src/mesa/drivers/dri/i965/gen7_gs_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_gs_state.c
@@ -24,7 +24,6 @@ 
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
-#include "intel_batchbuffer.h"
 
 static void
 upload_gs_state(struct brw_context *brw)
diff --git a/src/mesa/drivers/dri/i965/gen7_misc_state.c b/src/mesa/drivers/dri/i965/gen7_misc_state.c
index a14d4a0..fb20b22 100644
--- a/src/mesa/drivers/dri/i965/gen7_misc_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_misc_state.c
@@ -22,7 +22,6 @@ 
  */
 
 #include "main/mtypes.h"
-#include "intel_batchbuffer.h"
 #include "intel_mipmap_tree.h"
 #include "intel_fbo.h"
 #include "brw_context.h"
@@ -53,7 +52,7 @@  gen7_emit_depth_stencil_hiz(struct brw_context *brw,
 
    /* Skip repeated NULL depth/stencil emits (think 2D rendering). */
    if (!mt && brw->no_depth_or_stencil) {
-      assert(brw->hw_ctx);
+      assert(brw->batch.hw_ctx);
       return;
    }
 
diff --git a/src/mesa/drivers/dri/i965/gen7_sf_state.c b/src/mesa/drivers/dri/i965/gen7_sf_state.c
index 69853e6..fe6bc8c 100644
--- a/src/mesa/drivers/dri/i965/gen7_sf_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_sf_state.c
@@ -27,7 +27,6 @@ 
 #include "brw_util.h"
 #include "main/macros.h"
 #include "main/fbobject.h"
-#include "intel_batchbuffer.h"
 
 static void
 upload_sbe_state(struct brw_context *brw)
diff --git a/src/mesa/drivers/dri/i965/gen7_sol_state.c b/src/mesa/drivers/dri/i965/gen7_sol_state.c
index 41573a8..7a64b41 100644
--- a/src/mesa/drivers/dri/i965/gen7_sol_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_sol_state.c
@@ -31,8 +31,8 @@ 
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
-#include "intel_batchbuffer.h"
 #include "intel_buffer_objects.h"
+#include "intel_reg.h"
 #include "main/transformfeedback.h"
 
 static void
@@ -52,7 +52,7 @@  upload_3dstate_so_buffers(struct brw_context *brw)
    for (i = 0; i < 4; i++) {
       struct intel_buffer_object *bufferobj =
 	 intel_buffer_object(xfb_obj->Buffers[i]);
-      drm_intel_bo *bo;
+      struct brw_bo *bo;
       uint32_t start, end;
       uint32_t stride;
 
@@ -314,14 +314,9 @@  gen7_tally_prims_generated(struct brw_context *brw,
    /* If the current batch is still contributing to the number of primitives
     * generated, flush it now so the results will be present when mapped.
     */
-   if (drm_intel_bo_references(brw->batch.bo, obj->prim_count_bo))
-      intel_batchbuffer_flush(brw);
+   brw_bo_flush(obj->prim_count_bo);
 
-   if (unlikely(brw->perf_debug && drm_intel_bo_busy(obj->prim_count_bo)))
-      perf_debug("Stalling for # of transform feedback primitives written.\n");
-
-   drm_intel_bo_map(obj->prim_count_bo, false);
-   uint64_t *prim_counts = obj->prim_count_bo->virtual;
+   uint64_t *prim_counts = brw_bo_map(obj->prim_count_bo, MAP_READ);
 
    assert(obj->prim_count_buffer_index % (2 * BRW_MAX_XFB_STREAMS) == 0);
    int pairs = obj->prim_count_buffer_index / (2 * BRW_MAX_XFB_STREAMS);
@@ -334,8 +329,6 @@  gen7_tally_prims_generated(struct brw_context *brw,
       prim_counts += 2 * BRW_MAX_XFB_STREAMS; /* move to the next pair */
    }
 
-   drm_intel_bo_unmap(obj->prim_count_bo);
-
    /* We've already gathered up the old data; we can safely overwrite it now. */
    obj->prim_count_buffer_index = 0;
 }
@@ -447,8 +440,8 @@  gen7_begin_transform_feedback(struct gl_context *ctx, GLenum mode,
    if (brw->gen >= 8) {
       brw_obj->zero_offsets = true;
    } else {
-      intel_batchbuffer_flush(brw);
-      brw->batch.needs_sol_reset = true;
+      brw_batch_flush(&brw->batch);
+      brw->batch.batch_flags |= I915_EXEC_GEN7_SOL_RESET;
    }
 
    /* We're about to lose the information needed to compute the number of
diff --git a/src/mesa/drivers/dri/i965/gen7_urb.c b/src/mesa/drivers/dri/i965/gen7_urb.c
index d371c19..bc631b1 100644
--- a/src/mesa/drivers/dri/i965/gen7_urb.c
+++ b/src/mesa/drivers/dri/i965/gen7_urb.c
@@ -22,7 +22,6 @@ 
  */
 
 #include "main/macros.h"
-#include "intel_batchbuffer.h"
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
diff --git a/src/mesa/drivers/dri/i965/gen7_viewport_state.c b/src/mesa/drivers/dri/i965/gen7_viewport_state.c
index eb59684..9d9144e 100644
--- a/src/mesa/drivers/dri/i965/gen7_viewport_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_viewport_state.c
@@ -24,7 +24,6 @@ 
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
-#include "intel_batchbuffer.h"
 #include "main/fbobject.h"
 #include "main/viewport.h"
 
diff --git a/src/mesa/drivers/dri/i965/gen7_vs_state.c b/src/mesa/drivers/dri/i965/gen7_vs_state.c
index 278b3ec..6dde6c6 100644
--- a/src/mesa/drivers/dri/i965/gen7_vs_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_vs_state.c
@@ -27,7 +27,6 @@ 
 #include "brw_util.h"
 #include "program/prog_parameter.h"
 #include "program/prog_statevars.h"
-#include "intel_batchbuffer.h"
 
 
 void
diff --git a/src/mesa/drivers/dri/i965/gen7_wm_state.c b/src/mesa/drivers/dri/i965/gen7_wm_state.c
index b918275..4eaf54f 100644
--- a/src/mesa/drivers/dri/i965/gen7_wm_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_wm_state.c
@@ -30,7 +30,6 @@ 
 #include "program/program.h"
 #include "program/prog_parameter.h"
 #include "program/prog_statevars.h"
-#include "intel_batchbuffer.h"
 
 static void
 upload_wm_state(struct brw_context *brw)
diff --git a/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c b/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c
index 15ab2b0..df6ea98 100644
--- a/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c
@@ -28,7 +28,6 @@ 
 #include "program/prog_parameter.h"
 
 #include "intel_mipmap_tree.h"
-#include "intel_batchbuffer.h"
 #include "intel_tex.h"
 #include "intel_fbo.h"
 #include "intel_buffer_objects.h"
@@ -116,16 +115,11 @@  gen7_set_surface_mcs_info(struct brw_context *brw,
     * thus have their lower 12 bits zero), we can use an ordinary reloc to do
     * the necessary address translation.
     */
-   assert ((mcs_mt->bo->offset64 & 0xfff) == 0);
-
-   surf[6] = GEN7_SURFACE_MCS_ENABLE |
-             SET_FIELD(pitch_tiles - 1, GEN7_SURFACE_MCS_PITCH) |
-             mcs_mt->bo->offset64;
-
-   drm_intel_bo_emit_reloc(brw->batch.bo,
+   surf[6] = brw_batch_reloc(&brw->batch,
                            surf_offset + 6 * 4,
                            mcs_mt->bo,
-                           surf[6] & 0xfff,
+			   GEN7_SURFACE_MCS_ENABLE |
+			   SET_FIELD(pitch_tiles - 1, GEN7_SURFACE_MCS_PITCH),
                            is_render_target ? I915_GEM_DOMAIN_RENDER
                            : I915_GEM_DOMAIN_SAMPLER,
                            is_render_target ? I915_GEM_DOMAIN_RENDER : 0);
@@ -221,7 +215,7 @@  gen7_check_surface_setup(uint32_t *surf, bool is_render_target)
 static void
 gen7_emit_buffer_surface_state(struct brw_context *brw,
                                uint32_t *out_offset,
-                               drm_intel_bo *bo,
+                               struct brw_bo *bo,
                                unsigned buffer_offset,
                                unsigned surface_format,
                                unsigned buffer_size,
@@ -235,7 +229,14 @@  gen7_emit_buffer_surface_state(struct brw_context *brw,
    surf[0] = BRW_SURFACE_BUFFER << BRW_SURFACE_TYPE_SHIFT |
              surface_format << BRW_SURFACE_FORMAT_SHIFT |
              BRW_SURFACE_RC_READ_WRITE;
-   surf[1] = (bo ? bo->offset64 : 0) + buffer_offset; /* reloc */
+   if (bo) {
+      surf[1] = brw_batch_reloc(&brw->batch, *out_offset + 4,
+				bo, buffer_offset, I915_GEM_DOMAIN_SAMPLER,
+				(rw ? I915_GEM_DOMAIN_SAMPLER : 0));
+   } else {
+      surf[1] = buffer_offset;
+   }
+
    surf[2] = SET_FIELD((buffer_size - 1) & 0x7f, GEN7_SURFACE_WIDTH) |
              SET_FIELD(((buffer_size - 1) >> 7) & 0x3fff, GEN7_SURFACE_HEIGHT);
    if (surface_format == BRW_SURFACEFORMAT_RAW)
@@ -253,13 +254,6 @@  gen7_emit_buffer_surface_state(struct brw_context *brw,
                   SET_FIELD(HSW_SCS_ALPHA, GEN7_SURFACE_SCS_A));
    }
 
-   /* Emit relocation to surface contents */
-   if (bo) {
-      drm_intel_bo_emit_reloc(brw->batch.bo, *out_offset + 4,
-                              bo, buffer_offset, I915_GEM_DOMAIN_SAMPLER,
-                              (rw ? I915_GEM_DOMAIN_SAMPLER : 0));
-   }
-
    gen7_check_surface_setup(surf, false /* is_render_target */);
 }
 
@@ -299,7 +293,12 @@  gen7_emit_texture_surface_state(struct brw_context *brw,
    if (mt->array_layout == ALL_SLICES_AT_EACH_LOD)
       surf[0] |= GEN7_SURFACE_ARYSPC_LOD0;
 
-   surf[1] = mt->bo->offset64 + mt->offset; /* reloc */
+   surf[1] = brw_batch_reloc(&brw->batch,
+			     *surf_offset + 4,
+			     mt->bo,
+			     mt->offset,
+			     I915_GEM_DOMAIN_SAMPLER,
+			     (rw ? I915_GEM_DOMAIN_SAMPLER : 0));
 
    surf[2] = SET_FIELD(mt->logical_width0 - 1, GEN7_SURFACE_WIDTH) |
              SET_FIELD(mt->logical_height0 - 1, GEN7_SURFACE_HEIGHT);
@@ -336,14 +335,6 @@  gen7_emit_texture_surface_state(struct brw_context *brw,
                                 mt->mcs_mt, false /* is RT */);
    }
 
-   /* Emit relocation to surface contents */
-   drm_intel_bo_emit_reloc(brw->batch.bo,
-                           *surf_offset + 4,
-                           mt->bo,
-                           surf[1] - mt->bo->offset64,
-                           I915_GEM_DOMAIN_SAMPLER,
-                           (rw ? I915_GEM_DOMAIN_SAMPLER : 0));
-
    gen7_check_surface_setup(surf, false /* is_render_target */);
 }
 
@@ -519,7 +510,12 @@  gen7_update_renderbuffer_surface(struct brw_context *brw,
    }
 
    assert(mt->offset % mt->cpp == 0);
-   surf[1] = mt->bo->offset64 + mt->offset;
+   surf[1] = brw_batch_reloc(&brw->batch,
+			     brw->wm.base.surf_offset[surf_index] + 4,
+			     mt->bo,
+			     mt->offset,
+			     I915_GEM_DOMAIN_RENDER,
+			     I915_GEM_DOMAIN_RENDER);
 
    assert(brw->has_surface_tile_offset);
 
@@ -550,13 +546,6 @@  gen7_update_renderbuffer_surface(struct brw_context *brw,
                   SET_FIELD(HSW_SCS_ALPHA, GEN7_SURFACE_SCS_A));
    }
 
-   drm_intel_bo_emit_reloc(brw->batch.bo,
-                           offset + 4,
-                           mt->bo,
-                           surf[1] - mt->bo->offset64,
-                           I915_GEM_DOMAIN_RENDER,
-                           I915_GEM_DOMAIN_RENDER);
-
    gen7_check_surface_setup(surf, true /* is_render_target */);
 
    return offset;
diff --git a/src/mesa/drivers/dri/i965/gen8_blend_state.c b/src/mesa/drivers/dri/i965/gen8_blend_state.c
index 786c79a..31a3deb 100644
--- a/src/mesa/drivers/dri/i965/gen8_blend_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_blend_state.c
@@ -26,7 +26,6 @@ 
 #include "brw_defines.h"
 #include "brw_util.h"
 #include "brw_wm.h"
-#include "intel_batchbuffer.h"
 #include "main/macros.h"
 #include "main/enums.h"
 #include "main/glformats.h"
diff --git a/src/mesa/drivers/dri/i965/gen8_depth_state.c b/src/mesa/drivers/dri/i965/gen8_depth_state.c
index ded2121..4b6c5d0 100644
--- a/src/mesa/drivers/dri/i965/gen8_depth_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_depth_state.c
@@ -21,9 +21,9 @@ 
  * IN THE SOFTWARE.
  */
 
-#include "intel_batchbuffer.h"
 #include "intel_mipmap_tree.h"
 #include "intel_fbo.h"
+#include "intel_reg.h"
 #include "intel_resolve_map.h"
 #include "brw_context.h"
 #include "brw_state.h"
@@ -53,7 +53,7 @@  emit_depth_packets(struct brw_context *brw,
 
    /* Skip repeated NULL depth/stencil emits (think 2D rendering). */
    if (!depth_mt && !stencil_mt && brw->no_depth_or_stencil) {
-      assert(brw->hw_ctx);
+      assert(brw->batch.hw_ctx);
       return;
    }
 
@@ -500,9 +500,6 @@  gen8_hiz_exec(struct brw_context *brw, struct intel_mipmap_tree *mt,
    OUT_BATCH(0);
    ADVANCE_BATCH();
 
-   /* Mark this buffer as needing a TC flush, as we've rendered to it. */
-   brw_render_cache_set_add_bo(brw, mt->bo);
-
    /* We've clobbered all of the depth packets, and the drawing rectangle,
     * so we need to ensure those packets are re-emitted before the next
     * primitive.
diff --git a/src/mesa/drivers/dri/i965/gen8_disable.c b/src/mesa/drivers/dri/i965/gen8_disable.c
index da0d4a5..ec86fee 100644
--- a/src/mesa/drivers/dri/i965/gen8_disable.c
+++ b/src/mesa/drivers/dri/i965/gen8_disable.c
@@ -24,7 +24,6 @@ 
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
-#include "intel_batchbuffer.h"
 
 static void
 disable_stages(struct brw_context *brw)
diff --git a/src/mesa/drivers/dri/i965/gen8_draw_upload.c b/src/mesa/drivers/dri/i965/gen8_draw_upload.c
index 1af90ec..dc5e915 100644
--- a/src/mesa/drivers/dri/i965/gen8_draw_upload.c
+++ b/src/mesa/drivers/dri/i965/gen8_draw_upload.c
@@ -32,7 +32,6 @@ 
 #include "brw_context.h"
 #include "brw_state.h"
 
-#include "intel_batchbuffer.h"
 #include "intel_buffer_objects.h"
 
 static void
diff --git a/src/mesa/drivers/dri/i965/gen8_gs_state.c b/src/mesa/drivers/dri/i965/gen8_gs_state.c
index 46b9713..1f2d87b 100644
--- a/src/mesa/drivers/dri/i965/gen8_gs_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_gs_state.c
@@ -24,7 +24,6 @@ 
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
-#include "intel_batchbuffer.h"
 
 static void
 gen8_upload_gs_state(struct brw_context *brw)
diff --git a/src/mesa/drivers/dri/i965/gen8_misc_state.c b/src/mesa/drivers/dri/i965/gen8_misc_state.c
index b20038e..83376cd 100644
--- a/src/mesa/drivers/dri/i965/gen8_misc_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_misc_state.c
@@ -21,7 +21,6 @@ 
  * IN THE SOFTWARE.
  */
 
-#include "intel_batchbuffer.h"
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
diff --git a/src/mesa/drivers/dri/i965/gen8_multisample_state.c b/src/mesa/drivers/dri/i965/gen8_multisample_state.c
index 75cbe06..da5b32b 100644
--- a/src/mesa/drivers/dri/i965/gen8_multisample_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_multisample_state.c
@@ -21,7 +21,6 @@ 
  * IN THE SOFTWARE.
  */
 
-#include "intel_batchbuffer.h"
 
 #include "brw_context.h"
 #include "brw_defines.h"
diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c b/src/mesa/drivers/dri/i965/gen8_ps_state.c
index 85ad3b6..f9638d1 100644
--- a/src/mesa/drivers/dri/i965/gen8_ps_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c
@@ -25,7 +25,6 @@ 
 #include "program/program.h"
 #include "brw_state.h"
 #include "brw_defines.h"
-#include "intel_batchbuffer.h"
 
 void
 gen8_upload_ps_extra(struct brw_context *brw,
diff --git a/src/mesa/drivers/dri/i965/gen8_sf_state.c b/src/mesa/drivers/dri/i965/gen8_sf_state.c
index 52a21b6..8c29c25 100644
--- a/src/mesa/drivers/dri/i965/gen8_sf_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_sf_state.c
@@ -27,7 +27,6 @@ 
 #include "brw_util.h"
 #include "main/macros.h"
 #include "main/fbobject.h"
-#include "intel_batchbuffer.h"
 
 static void
 upload_sbe(struct brw_context *brw)
diff --git a/src/mesa/drivers/dri/i965/gen8_sol_state.c b/src/mesa/drivers/dri/i965/gen8_sol_state.c
index 58ead68..07212ab 100644
--- a/src/mesa/drivers/dri/i965/gen8_sol_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_sol_state.c
@@ -31,7 +31,6 @@ 
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
-#include "intel_batchbuffer.h"
 #include "intel_buffer_objects.h"
 #include "main/transformfeedback.h"
 
@@ -70,7 +69,7 @@  gen8_upload_3dstate_so_buffers(struct brw_context *brw)
       uint32_t start = xfb_obj->Offset[i];
       assert(start % 4 == 0);
       uint32_t end = ALIGN(start + xfb_obj->Size[i], 4);
-      drm_intel_bo *bo =
+      struct brw_bo *bo =
          intel_bufferobj_buffer(brw, bufferobj, start, end - start);
       assert(end <= bo->size);
 
diff --git a/src/mesa/drivers/dri/i965/gen8_surface_state.c b/src/mesa/drivers/dri/i965/gen8_surface_state.c
index d0c2d80..fad1862 100644
--- a/src/mesa/drivers/dri/i965/gen8_surface_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_surface_state.c
@@ -29,7 +29,6 @@ 
 #include "program/prog_parameter.h"
 
 #include "intel_mipmap_tree.h"
-#include "intel_batchbuffer.h"
 #include "intel_tex.h"
 #include "intel_fbo.h"
 #include "intel_buffer_objects.h"
@@ -112,7 +111,7 @@  allocate_surface_state(struct brw_context *brw, uint32_t *out_offset)
 static void
 gen8_emit_buffer_surface_state(struct brw_context *brw,
                                uint32_t *out_offset,
-                               drm_intel_bo *bo,
+                               struct brw_bo *bo,
                                unsigned buffer_offset,
                                unsigned surface_format,
                                unsigned buffer_size,
@@ -138,15 +137,11 @@  gen8_emit_buffer_surface_state(struct brw_context *brw,
              SET_FIELD(HSW_SCS_GREEN, GEN7_SURFACE_SCS_G) |
              SET_FIELD(HSW_SCS_BLUE,  GEN7_SURFACE_SCS_B) |
              SET_FIELD(HSW_SCS_ALPHA, GEN7_SURFACE_SCS_A);
-   /* reloc */
-   *((uint64_t *) &surf[8]) = (bo ? bo->offset64 : 0) + buffer_offset;
-
    /* Emit relocation to surface contents. */
-   if (bo) {
-      drm_intel_bo_emit_reloc(brw->batch.bo, *out_offset + 8 * 4,
-                              bo, buffer_offset, I915_GEM_DOMAIN_SAMPLER,
-                              rw ? I915_GEM_DOMAIN_SAMPLER : 0);
-   }
+   *((uint64_t *)&surf[8]) = !bo ? buffer_offset : /* NULL bo: no reloc */
+	   brw_batch_reloc(&brw->batch, *out_offset + 8 * 4,
+			   bo, buffer_offset, I915_GEM_DOMAIN_SAMPLER,
+			   rw ? I915_GEM_DOMAIN_SAMPLER : 0);
 }
 
 static void
@@ -223,27 +218,20 @@  gen8_emit_texture_surface_state(struct brw_context *brw,
       SET_FIELD(swizzle_to_scs(GET_SWZ(swizzle, 2)), GEN7_SURFACE_SCS_B) |
       SET_FIELD(swizzle_to_scs(GET_SWZ(swizzle, 3)), GEN7_SURFACE_SCS_A);
 
-   *((uint64_t *) &surf[8]) = mt->bo->offset64 + mt->offset; /* reloc */
-
-   if (aux_mt) {
-      *((uint64_t *) &surf[10]) = aux_mt->bo->offset64;
-      drm_intel_bo_emit_reloc(brw->batch.bo, *surf_offset + 10 * 4,
-                              aux_mt->bo, 0,
-                              I915_GEM_DOMAIN_SAMPLER,
-                              (rw ? I915_GEM_DOMAIN_SAMPLER : 0));
-   } else {
-      surf[10] = 0;
-      surf[11] = 0;
-   }
-   surf[12] = 0;
-
-   /* Emit relocation to surface contents */
-   drm_intel_bo_emit_reloc(brw->batch.bo,
-                           *surf_offset + 8 * 4,
-                           mt->bo,
-                           mt->offset,
+   *((uint64_t *)&surf[8]) =
+	   brw_batch_reloc(&brw->batch,
+			   *surf_offset + 8 * 4,
+			   mt->bo, mt->offset,
                            I915_GEM_DOMAIN_SAMPLER,
                            (rw ? I915_GEM_DOMAIN_SAMPLER : 0));
+
+   /* Without an aux surface there is nothing to relocate: keep the qword 0. */
+   *((uint64_t *)&surf[10]) = !aux_mt ? 0 :
+	   brw_batch_reloc(&brw->batch,
+			   *surf_offset + 10 * 4,
+			   aux_mt->bo, 0, I915_GEM_DOMAIN_SAMPLER,
+			   (rw ? I915_GEM_DOMAIN_SAMPLER : 0));
+   surf[12] = 0;
 }
 
 static void
@@ -432,27 +420,21 @@  gen8_update_renderbuffer_surface(struct brw_context *brw,
              SET_FIELD(HSW_SCS_ALPHA, GEN7_SURFACE_SCS_A);
 
    assert(mt->offset % mt->cpp == 0);
-   *((uint64_t *) &surf[8]) = mt->bo->offset64 + mt->offset; /* reloc */
-
-   if (aux_mt) {
-      *((uint64_t *) &surf[10]) = aux_mt->bo->offset64;
-      drm_intel_bo_emit_reloc(brw->batch.bo,
-                              offset + 10 * 4,
-                              aux_mt->bo, 0,
-                              I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER);
-   } else {
-      surf[10] = 0;
-      surf[11] = 0;
-   }
+   *((uint64_t *) &surf[8]) =
+      brw_batch_reloc(&brw->batch,
+		      brw->wm.base.surf_offset[surf_index] + 8 * 4,
+		      mt->bo,
+		      mt->offset,
+		      I915_GEM_DOMAIN_RENDER,
+		      I915_GEM_DOMAIN_RENDER);
+
+   /* Without an aux surface there is nothing to relocate: keep the qword 0. */
+   *((uint64_t *)&surf[10]) = !aux_mt ? 0 :
+      brw_batch_reloc(&brw->batch,
+		      brw->wm.base.surf_offset[surf_index] + 10 * 4, aux_mt->bo,
+		      0, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER);
    surf[12] = 0;
 
-   drm_intel_bo_emit_reloc(brw->batch.bo,
-                           offset + 8 * 4,
-                           mt->bo,
-                           mt->offset,
-                           I915_GEM_DOMAIN_RENDER,
-                           I915_GEM_DOMAIN_RENDER);
-
    return offset;
 }
 
diff --git a/src/mesa/drivers/dri/i965/gen8_viewport_state.c b/src/mesa/drivers/dri/i965/gen8_viewport_state.c
index 322e466..1dde8f5 100644
--- a/src/mesa/drivers/dri/i965/gen8_viewport_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_viewport_state.c
@@ -24,7 +24,6 @@ 
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
-#include "intel_batchbuffer.h"
 #include "main/fbobject.h"
 #include "main/viewport.h"
 
diff --git a/src/mesa/drivers/dri/i965/gen8_vs_state.c b/src/mesa/drivers/dri/i965/gen8_vs_state.c
index f92af55..6252fef 100644
--- a/src/mesa/drivers/dri/i965/gen8_vs_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_vs_state.c
@@ -27,7 +27,6 @@ 
 #include "brw_util.h"
 #include "program/prog_parameter.h"
 #include "program/prog_statevars.h"
-#include "intel_batchbuffer.h"
 
 static void
 upload_vs_state(struct brw_context *brw)
diff --git a/src/mesa/drivers/dri/i965/gen8_wm_depth_stencil.c b/src/mesa/drivers/dri/i965/gen8_wm_depth_stencil.c
index 2c843b2..0f1128a 100644
--- a/src/mesa/drivers/dri/i965/gen8_wm_depth_stencil.c
+++ b/src/mesa/drivers/dri/i965/gen8_wm_depth_stencil.c
@@ -21,7 +21,6 @@ 
  * IN THE SOFTWARE.
  */
 
-#include "intel_batchbuffer.h"
 #include "intel_fbo.h"
 #include "brw_context.h"
 #include "brw_defines.h"
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
deleted file mode 100644
index fa3263d..0000000
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ /dev/null
@@ -1,453 +0,0 @@ 
-/**************************************************************************
- *
- * Copyright 2006 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-#include "intel_batchbuffer.h"
-#include "intel_buffer_objects.h"
-#include "intel_reg.h"
-#include "intel_bufmgr.h"
-#include "intel_buffers.h"
-#include "intel_fbo.h"
-#include "brw_context.h"
-
-#include <xf86drm.h>
-#include <i915_drm.h>
-
-static void
-intel_batchbuffer_reset(struct brw_context *brw);
-
-void
-intel_batchbuffer_init(struct brw_context *brw)
-{
-   intel_batchbuffer_reset(brw);
-
-   if (!brw->has_llc) {
-      brw->batch.cpu_map = malloc(BATCH_SZ);
-      brw->batch.map = brw->batch.cpu_map;
-   }
-}
-
-static void
-intel_batchbuffer_reset(struct brw_context *brw)
-{
-   if (brw->batch.last_bo != NULL) {
-      drm_intel_bo_unreference(brw->batch.last_bo);
-      brw->batch.last_bo = NULL;
-   }
-   brw->batch.last_bo = brw->batch.bo;
-
-   brw_render_cache_set_clear(brw);
-
-   brw->batch.bo = drm_intel_bo_alloc(brw->bufmgr, "batchbuffer",
-					BATCH_SZ, 4096);
-   if (brw->has_llc) {
-      drm_intel_bo_map(brw->batch.bo, true);
-      brw->batch.map = brw->batch.bo->virtual;
-   }
-
-   brw->batch.reserved_space = BATCH_RESERVED;
-   brw->batch.state_batch_offset = brw->batch.bo->size;
-   brw->batch.used = 0;
-   brw->batch.needs_sol_reset = false;
-
-   /* We don't know what ring the new batch will be sent to until we see the
-    * first BEGIN_BATCH or BEGIN_BATCH_BLT.  Mark it as unknown.
-    */
-   brw->batch.ring = UNKNOWN_RING;
-}
-
-void
-intel_batchbuffer_save_state(struct brw_context *brw)
-{
-   brw->batch.saved.used = brw->batch.used;
-   brw->batch.saved.reloc_count =
-      drm_intel_gem_bo_get_reloc_count(brw->batch.bo);
-}
-
-void
-intel_batchbuffer_reset_to_saved(struct brw_context *brw)
-{
-   drm_intel_gem_bo_clear_relocs(brw->batch.bo, brw->batch.saved.reloc_count);
-
-   brw->batch.used = brw->batch.saved.used;
-   if (brw->batch.used == 0)
-      brw->batch.ring = UNKNOWN_RING;
-}
-
-void
-intel_batchbuffer_free(struct brw_context *brw)
-{
-   free(brw->batch.cpu_map);
-   drm_intel_bo_unreference(brw->batch.last_bo);
-   drm_intel_bo_unreference(brw->batch.bo);
-}
-
-static void
-do_batch_dump(struct brw_context *brw)
-{
-   struct drm_intel_decode *decode;
-   struct intel_batchbuffer *batch = &brw->batch;
-   int ret;
-
-   decode = drm_intel_decode_context_alloc(brw->intelScreen->deviceID);
-   if (!decode)
-      return;
-
-   ret = drm_intel_bo_map(batch->bo, false);
-   if (ret == 0) {
-      drm_intel_decode_set_batch_pointer(decode,
-					 batch->bo->virtual,
-					 batch->bo->offset64,
-					 batch->used);
-   } else {
-      fprintf(stderr,
-	      "WARNING: failed to map batchbuffer (%s), "
-	      "dumping uploaded data instead.\n", strerror(ret));
-
-      drm_intel_decode_set_batch_pointer(decode,
-					 batch->map,
-					 batch->bo->offset64,
-					 batch->used);
-   }
-
-   drm_intel_decode_set_output_file(decode, stderr);
-   drm_intel_decode(decode);
-
-   drm_intel_decode_context_free(decode);
-
-   if (ret == 0) {
-      drm_intel_bo_unmap(batch->bo);
-
-      brw_debug_batch(brw);
-   }
-}
-
-void
-intel_batchbuffer_emit_render_ring_prelude(struct brw_context *brw)
-{
-   /* We may need to enable and snapshot OA counters. */
-   brw_perf_monitor_new_batch(brw);
-}
-
-/**
- * Called when starting a new batch buffer.
- */
-static void
-brw_new_batch(struct brw_context *brw)
-{
-   /* Create a new batchbuffer and reset the associated state: */
-   drm_intel_gem_bo_clear_relocs(brw->batch.bo, 0);
-   intel_batchbuffer_reset(brw);
-
-   /* If the kernel supports hardware contexts, then most hardware state is
-    * preserved between batches; we only need to re-emit state that is required
-    * to be in every batch.  Otherwise we need to re-emit all the state that
-    * would otherwise be stored in the context (which for all intents and
-    * purposes means everything).
-    */
-   if (brw->hw_ctx == NULL)
-      brw->ctx.NewDriverState |= BRW_NEW_CONTEXT;
-
-   brw->ctx.NewDriverState |= BRW_NEW_BATCH;
-
-   brw->state_batch_count = 0;
-
-   brw->ib.type = -1;
-
-   /* We need to periodically reap the shader time results, because rollover
-    * happens every few seconds.  We also want to see results every once in a
-    * while, because many programs won't cleanly destroy our context, so the
-    * end-of-run printout may not happen.
-    */
-   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
-      brw_collect_and_report_shader_time(brw);
-
-   if (INTEL_DEBUG & DEBUG_PERFMON)
-      brw_dump_perf_monitors(brw);
-}
-
-/**
- * Called from intel_batchbuffer_flush before emitting MI_BATCHBUFFER_END and
- * sending it off.
- *
- * This function can emit state (say, to preserve registers that aren't saved
- * between batches).  All of this state MUST fit in the reserved space at the
- * end of the batchbuffer.  If you add more GPU state, increase the reserved
- * space by updating the BATCH_RESERVED macro.
- */
-static void
-brw_finish_batch(struct brw_context *brw)
-{
-   /* Capture the closing pipeline statistics register values necessary to
-    * support query objects (in the non-hardware context world).
-    */
-   brw_emit_query_end(brw);
-
-   /* We may also need to snapshot and disable OA counters. */
-   if (brw->batch.ring == RENDER_RING)
-      brw_perf_monitor_finish_batch(brw);
-
-   /* Mark that the current program cache BO has been used by the GPU.
-    * It will be reallocated if we need to put new programs in for the
-    * next batch.
-    */
-   brw->cache.bo_used_by_gpu = true;
-}
-
-static void
-throttle(struct brw_context *brw)
-{
-   /* Wait for the swapbuffers before the one we just emitted, so we
-    * don't get too many swaps outstanding for apps that are GPU-heavy
-    * but not CPU-heavy.
-    *
-    * We're using intelDRI2Flush (called from the loader before
-    * swapbuffer) and glFlush (for front buffer rendering) as the
-    * indicator that a frame is done and then throttle when we get
-    * here as we prepare to render the next frame.  At this point for
-    * round trips for swap/copy and getting new buffers are done and
-    * we'll spend less time waiting on the GPU.
-    *
-    * Unfortunately, we don't have a handle to the batch containing
-    * the swap, and getting our hands on that doesn't seem worth it,
-    * so we just use the first batch we emitted after the last swap.
-    */
-   if (brw->need_swap_throttle && brw->throttle_batch[0]) {
-      if (brw->throttle_batch[1]) {
-         if (!brw->disable_throttling)
-            drm_intel_bo_wait_rendering(brw->throttle_batch[1]);
-         drm_intel_bo_unreference(brw->throttle_batch[1]);
-      }
-      brw->throttle_batch[1] = brw->throttle_batch[0];
-      brw->throttle_batch[0] = NULL;
-      brw->need_swap_throttle = false;
-      /* Throttling here is more precise than the throttle ioctl, so skip it */
-      brw->need_flush_throttle = false;
-   }
-
-   if (brw->need_flush_throttle) {
-      __DRIscreen *psp = brw->intelScreen->driScrnPriv;
-      drmCommandNone(psp->fd, DRM_I915_GEM_THROTTLE);
-      brw->need_flush_throttle = false;
-   }
-}
-
-/* TODO: Push this whole function into bufmgr.
- */
-static int
-do_flush_locked(struct brw_context *brw)
-{
-   struct intel_batchbuffer *batch = &brw->batch;
-   int ret = 0;
-
-   if (brw->has_llc) {
-      drm_intel_bo_unmap(batch->bo);
-   } else {
-      ret = drm_intel_bo_subdata(batch->bo, 0, 4*batch->used, batch->map);
-      if (ret == 0 && batch->state_batch_offset != batch->bo->size) {
-	 ret = drm_intel_bo_subdata(batch->bo,
-				    batch->state_batch_offset,
-				    batch->bo->size - batch->state_batch_offset,
-				    (char *)batch->map + batch->state_batch_offset);
-      }
-   }
-
-   if (!brw->intelScreen->no_hw) {
-      int flags;
-
-      if (brw->gen >= 6 && batch->ring == BLT_RING) {
-         flags = I915_EXEC_BLT;
-      } else {
-         flags = I915_EXEC_RENDER;
-      }
-      if (batch->needs_sol_reset)
-	 flags |= I915_EXEC_GEN7_SOL_RESET;
-
-      if (ret == 0) {
-         if (unlikely(INTEL_DEBUG & DEBUG_AUB))
-            brw_annotate_aub(brw);
-
-	 if (brw->hw_ctx == NULL || batch->ring != RENDER_RING) {
-	    ret = drm_intel_bo_mrb_exec(batch->bo, 4 * batch->used, NULL, 0, 0,
-					flags);
-	 } else {
-	    ret = drm_intel_gem_bo_context_exec(batch->bo, brw->hw_ctx,
-						4 * batch->used, flags);
-	 }
-      }
-
-      throttle(brw);
-   }
-
-   if (unlikely(INTEL_DEBUG & DEBUG_BATCH))
-      do_batch_dump(brw);
-
-   if (ret != 0) {
-      fprintf(stderr, "intel_do_flush_locked failed: %s\n", strerror(-ret));
-      exit(1);
-   }
-
-   return ret;
-}
-
-int
-_intel_batchbuffer_flush(struct brw_context *brw,
-			 const char *file, int line)
-{
-   int ret;
-
-   if (brw->batch.used == 0)
-      return 0;
-
-   if (brw->throttle_batch[0] == NULL) {
-      brw->throttle_batch[0] = brw->batch.bo;
-      drm_intel_bo_reference(brw->throttle_batch[0]);
-   }
-
-   if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
-      int bytes_for_commands = 4 * brw->batch.used;
-      int bytes_for_state = brw->batch.bo->size - brw->batch.state_batch_offset;
-      int total_bytes = bytes_for_commands + bytes_for_state;
-      fprintf(stderr, "%s:%d: Batchbuffer flush with %4db (pkt) + "
-              "%4db (state) = %4db (%0.1f%%)\n", file, line,
-              bytes_for_commands, bytes_for_state,
-              total_bytes,
-              100.0f * total_bytes / BATCH_SZ);
-   }
-
-   brw->batch.reserved_space = 0;
-
-   brw_finish_batch(brw);
-
-   /* Mark the end of the buffer. */
-   intel_batchbuffer_emit_dword(brw, MI_BATCH_BUFFER_END);
-   if (brw->batch.used & 1) {
-      /* Round batchbuffer usage to 2 DWORDs. */
-      intel_batchbuffer_emit_dword(brw, MI_NOOP);
-   }
-
-   intel_upload_finish(brw);
-
-   /* Check that we didn't just wrap our batchbuffer at a bad time. */
-   assert(!brw->no_batch_wrap);
-
-   ret = do_flush_locked(brw);
-
-   if (unlikely(INTEL_DEBUG & DEBUG_SYNC)) {
-      fprintf(stderr, "waiting for idle\n");
-      drm_intel_bo_wait_rendering(brw->batch.bo);
-   }
-
-   /* Start a new batch buffer. */
-   brw_new_batch(brw);
-
-   return ret;
-}
-
-
-/*  This is the only way buffers get added to the validate list.
- */
-bool
-intel_batchbuffer_emit_reloc(struct brw_context *brw,
-                             drm_intel_bo *buffer,
-                             uint32_t read_domains, uint32_t write_domain,
-			     uint32_t delta)
-{
-   int ret;
-
-   ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
-				 buffer, delta,
-				 read_domains, write_domain);
-   assert(ret == 0);
-   (void)ret;
-
-   /* Using the old buffer offset, write in what the right data would be, in
-    * case the buffer doesn't move and we can short-circuit the relocation
-    * processing in the kernel
-    */
-   intel_batchbuffer_emit_dword(brw, buffer->offset64 + delta);
-
-   return true;
-}
-
-bool
-intel_batchbuffer_emit_reloc64(struct brw_context *brw,
-                               drm_intel_bo *buffer,
-                               uint32_t read_domains, uint32_t write_domain,
-			       uint32_t delta)
-{
-   int ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
-                                     buffer, delta,
-                                     read_domains, write_domain);
-   assert(ret == 0);
-   (void) ret;
-
-   /* Using the old buffer offset, write in what the right data would be, in
-    * case the buffer doesn't move and we can short-circuit the relocation
-    * processing in the kernel
-    */
-   uint64_t offset = buffer->offset64 + delta;
-   intel_batchbuffer_emit_dword(brw, offset);
-   intel_batchbuffer_emit_dword(brw, offset >> 32);
-
-   return true;
-}
-
-
-void
-intel_batchbuffer_data(struct brw_context *brw,
-                       const void *data, GLuint bytes, enum brw_gpu_ring ring)
-{
-   assert((bytes & 3) == 0);
-   intel_batchbuffer_require_space(brw, bytes, ring);
-   memcpy(brw->batch.map + brw->batch.used, data, bytes);
-   brw->batch.used += bytes >> 2;
-}
-
-void
-brw_load_register_mem(struct brw_context *brw,
-                      uint32_t reg,
-                      drm_intel_bo *bo,
-                      uint32_t read_domains, uint32_t write_domain,
-                      uint32_t offset)
-{
-   /* MI_LOAD_REGISTER_MEM only exists on Gen7+. */
-   assert(brw->gen >= 7);
-
-   if (brw->gen >= 8) {
-      BEGIN_BATCH(4);
-      OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (4 - 2));
-      OUT_BATCH(reg);
-      OUT_RELOC64(bo, read_domains, write_domain, offset);
-      ADVANCE_BATCH();
-   } else {
-      BEGIN_BATCH(3);
-      OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (3 - 2));
-      OUT_BATCH(reg);
-      OUT_RELOC(bo, read_domains, write_domain, offset);
-      ADVANCE_BATCH();
-   }
-}
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.h b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
deleted file mode 100644
index ef8a6ff..0000000
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h
+++ /dev/null
@@ -1,179 +0,0 @@ 
-#ifndef INTEL_BATCHBUFFER_H
-#define INTEL_BATCHBUFFER_H
-
-#include "main/mtypes.h"
-
-#include "brw_context.h"
-#include "intel_bufmgr.h"
-#include "intel_reg.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/**
- * Number of bytes to reserve for commands necessary to complete a batch.
- *
- * This includes:
- * - MI_BATCHBUFFER_END (4 bytes)
- * - Optional MI_NOOP for ensuring the batch length is qword aligned (4 bytes)
- * - Any state emitted by vtbl->finish_batch():
- *   - Gen4-5 record ending occlusion query values (4 * 4 = 16 bytes)
- *   - Disabling OA counters on Gen6+ (3 DWords = 12 bytes)
- *   - Ending MI_REPORT_PERF_COUNT on Gen5+, plus associated PIPE_CONTROLs:
- *     - Two sets of PIPE_CONTROLs, which become 3 PIPE_CONTROLs each on SNB,
- *       which are 4 DWords each ==> 2 * 3 * 4 * 4 = 96 bytes
- *     - 3 DWords for MI_REPORT_PERF_COUNT itself on Gen6+.  ==> 12 bytes.
- *       On Ironlake, it's 6 DWords, but we have some slack due to the lack of
- *       Sandybridge PIPE_CONTROL madness.
- */
-#define BATCH_RESERVED 146
-
-struct intel_batchbuffer;
-
-void intel_batchbuffer_emit_render_ring_prelude(struct brw_context *brw);
-void intel_batchbuffer_init(struct brw_context *brw);
-void intel_batchbuffer_free(struct brw_context *brw);
-void intel_batchbuffer_save_state(struct brw_context *brw);
-void intel_batchbuffer_reset_to_saved(struct brw_context *brw);
-
-int _intel_batchbuffer_flush(struct brw_context *brw,
-			     const char *file, int line);
-
-#define intel_batchbuffer_flush(intel) \
-	_intel_batchbuffer_flush(intel, __FILE__, __LINE__)
-
-
-
-/* Unlike bmBufferData, this currently requires the buffer be mapped.
- * Consider it a convenience function wrapping multple
- * intel_buffer_dword() calls.
- */
-void intel_batchbuffer_data(struct brw_context *brw,
-                            const void *data, GLuint bytes,
-                            enum brw_gpu_ring ring);
-
-bool intel_batchbuffer_emit_reloc(struct brw_context *brw,
-                                       drm_intel_bo *buffer,
-				       uint32_t read_domains,
-				       uint32_t write_domain,
-				       uint32_t offset);
-bool intel_batchbuffer_emit_reloc64(struct brw_context *brw,
-                                    drm_intel_bo *buffer,
-                                    uint32_t read_domains,
-                                    uint32_t write_domain,
-                                    uint32_t offset);
-static inline uint32_t float_as_int(float f)
-{
-   union {
-      float f;
-      uint32_t d;
-   } fi;
-
-   fi.f = f;
-   return fi.d;
-}
-
-/* Inline functions - might actually be better off with these
- * non-inlined.  Certainly better off switching all command packets to
- * be passed as structs rather than dwords, but that's a little bit of
- * work...
- */
-static inline unsigned
-intel_batchbuffer_space(struct brw_context *brw)
-{
-   return (brw->batch.state_batch_offset - brw->batch.reserved_space)
-      - brw->batch.used*4;
-}
-
-
-static inline void
-intel_batchbuffer_emit_dword(struct brw_context *brw, GLuint dword)
-{
-#ifdef DEBUG
-   assert(intel_batchbuffer_space(brw) >= 4);
-#endif
-   brw->batch.map[brw->batch.used++] = dword;
-   assert(brw->batch.ring != UNKNOWN_RING);
-}
-
-static inline void
-intel_batchbuffer_emit_float(struct brw_context *brw, float f)
-{
-   intel_batchbuffer_emit_dword(brw, float_as_int(f));
-}
-
-static inline void
-intel_batchbuffer_require_space(struct brw_context *brw, GLuint sz,
-                                enum brw_gpu_ring ring)
-{
-   /* If we're switching rings, implicitly flush the batch. */
-   if (unlikely(ring != brw->batch.ring) && brw->batch.ring != UNKNOWN_RING &&
-       brw->gen >= 6) {
-      intel_batchbuffer_flush(brw);
-   }
-
-#ifdef DEBUG
-   assert(sz < BATCH_SZ - BATCH_RESERVED);
-#endif
-   if (intel_batchbuffer_space(brw) < sz)
-      intel_batchbuffer_flush(brw);
-
-   enum brw_gpu_ring prev_ring = brw->batch.ring;
-   /* The intel_batchbuffer_flush() calls above might have changed
-    * brw->batch.ring to UNKNOWN_RING, so we need to set it here at the end.
-    */
-   brw->batch.ring = ring;
-
-   if (unlikely(prev_ring == UNKNOWN_RING && ring == RENDER_RING))
-      intel_batchbuffer_emit_render_ring_prelude(brw);
-}
-
-static inline void
-intel_batchbuffer_begin(struct brw_context *brw, int n, enum brw_gpu_ring ring)
-{
-   intel_batchbuffer_require_space(brw, n * 4, ring);
-
-   brw->batch.emit = brw->batch.used;
-#ifdef DEBUG
-   brw->batch.total = n;
-#endif
-}
-
-static inline void
-intel_batchbuffer_advance(struct brw_context *brw)
-{
-#ifdef DEBUG
-   struct intel_batchbuffer *batch = &brw->batch;
-   unsigned int _n = batch->used - batch->emit;
-   assert(batch->total != 0);
-   if (_n != batch->total) {
-      fprintf(stderr, "ADVANCE_BATCH: %d of %d dwords emitted\n",
-	      _n, batch->total);
-      abort();
-   }
-   batch->total = 0;
-#endif
-}
-
-#define BEGIN_BATCH(n) intel_batchbuffer_begin(brw, n, RENDER_RING)
-#define BEGIN_BATCH_BLT(n) intel_batchbuffer_begin(brw, n, BLT_RING)
-#define OUT_BATCH(d) intel_batchbuffer_emit_dword(brw, d)
-#define OUT_BATCH_F(f) intel_batchbuffer_emit_float(brw, f)
-#define OUT_RELOC(buf, read_domains, write_domain, delta) do {		\
-   intel_batchbuffer_emit_reloc(brw, buf,			\
-				read_domains, write_domain, delta);	\
-} while (0)
-
-/* Handle 48-bit address relocations for Gen8+ */
-#define OUT_RELOC64(buf, read_domains, write_domain, delta) do { \
-   intel_batchbuffer_emit_reloc64(brw, buf, read_domains, write_domain, delta);	\
-} while (0)
-
-#define ADVANCE_BATCH() intel_batchbuffer_advance(brw);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/src/mesa/drivers/dri/i965/intel_blit.c b/src/mesa/drivers/dri/i965/intel_blit.c
index 9f44451..8e23586 100644
--- a/src/mesa/drivers/dri/i965/intel_blit.c
+++ b/src/mesa/drivers/dri/i965/intel_blit.c
@@ -38,7 +38,6 @@ 
 #include "intel_buffers.h"
 #include "intel_fbo.h"
 #include "intel_reg.h"
-#include "intel_batchbuffer.h"
 #include "intel_mipmap_tree.h"
 
 #define FILE_DEBUG_FLAG DEBUG_BLIT
@@ -291,11 +290,11 @@  bool
 intelEmitCopyBlit(struct brw_context *brw,
 		  GLuint cpp,
 		  GLshort src_pitch,
-		  drm_intel_bo *src_buffer,
+		  struct brw_bo *src_buffer,
 		  GLuint src_offset,
 		  uint32_t src_tiling,
 		  GLshort dst_pitch,
-		  drm_intel_bo *dst_buffer,
+		  struct brw_bo *dst_buffer,
 		  GLuint dst_offset,
 		  uint32_t dst_tiling,
 		  GLshort src_x, GLshort src_y,
@@ -303,10 +302,9 @@  intelEmitCopyBlit(struct brw_context *brw,
 		  GLshort w, GLshort h,
 		  GLenum logic_op)
 {
-   GLuint CMD, BR13, pass = 0;
+   GLuint CMD, BR13;
    int dst_y2 = dst_y + h;
    int dst_x2 = dst_x + w;
-   drm_intel_bo *aper_array[3];
    bool dst_y_tiled = dst_tiling == I915_TILING_Y;
    bool src_y_tiled = src_tiling == I915_TILING_Y;
 
@@ -321,25 +319,8 @@  intelEmitCopyBlit(struct brw_context *brw,
    assert(!dst_y_tiled || (dst_pitch % 128) == 0);
    assert(!src_y_tiled || (src_pitch % 128) == 0);
 
-   /* do space check before going any further */
-   do {
-       aper_array[0] = brw->batch.bo;
-       aper_array[1] = dst_buffer;
-       aper_array[2] = src_buffer;
-
-       if (dri_bufmgr_check_aperture_space(aper_array, 3) != 0) {
-           intel_batchbuffer_flush(brw);
-           pass++;
-       } else
-           break;
-   } while (pass < 2);
-
-   if (pass >= 2)
-      return false;
-
    unsigned length = brw->gen >= 8 ? 10 : 8;
 
-   intel_batchbuffer_require_space(brw, length * 4, BLT_RING);
    DBG("%s src:buf(%p)/%d+%d %d,%d dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n",
        __func__,
        src_buffer, src_pitch, src_offset, src_x, src_y,
@@ -404,6 +385,9 @@  intelEmitCopyBlit(struct brw_context *brw,
    assert(dst_offset + (dst_y + h - 1) * abs(dst_pitch) +
           (w * cpp) <= dst_buffer->size);
 
+   if (brw_batch_begin(&brw->batch, 20, BLT_RING) < 0)
+      return false;
+
    BEGIN_BATCH_BLT_TILED(length, dst_y_tiled, src_y_tiled);
    OUT_BATCH(CMD | (length - 2));
    OUT_BATCH(BR13 | (uint16_t)dst_pitch);
@@ -431,10 +415,8 @@  intelEmitCopyBlit(struct brw_context *brw,
    }
 
    ADVANCE_BATCH_TILED(dst_y_tiled, src_y_tiled);
-
    brw_emit_mi_flush(brw);
-
-   return true;
+   return brw_batch_end(&brw->batch) == 0;
 }
 
 bool
@@ -443,7 +425,7 @@  intelEmitImmediateColorExpandBlit(struct brw_context *brw,
 				  GLubyte *src_bits, GLuint src_size,
 				  GLuint fg_color,
 				  GLshort dst_pitch,
-				  drm_intel_bo *dst_buffer,
+				  struct brw_bo *dst_buffer,
 				  GLuint dst_offset,
 				  uint32_t dst_tiling,
 				  GLshort x, GLshort y,
@@ -471,9 +453,6 @@  intelEmitImmediateColorExpandBlit(struct brw_context *brw,
        dst_buffer, dst_pitch, dst_offset, x, y, w, h, src_size, dwords);
 
    unsigned xy_setup_blt_length = brw->gen >= 8 ? 10 : 8;
-   intel_batchbuffer_require_space(brw, (xy_setup_blt_length * 4) +
-                                        (3 * 4) + dwords * 4, BLT_RING);
-
    opcode = XY_SETUP_BLT_CMD;
    if (cpp == 4)
       opcode |= XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB;
@@ -489,6 +468,7 @@  intelEmitImmediateColorExpandBlit(struct brw_context *brw,
    if (dst_tiling != I915_TILING_NONE)
       blit_cmd |= XY_DST_TILED;
 
+   if (brw_batch_begin(&brw->batch, 20 + dwords, BLT_RING) < 0) return false;
    BEGIN_BATCH_BLT(xy_setup_blt_length + 3);
    OUT_BATCH(opcode | (xy_setup_blt_length - 2));
    OUT_BATCH(br13);
@@ -514,11 +494,10 @@  intelEmitImmediateColorExpandBlit(struct brw_context *brw,
    OUT_BATCH(SET_FIELD(y + h, BLT_Y) | SET_FIELD(x + w, BLT_X));
    ADVANCE_BATCH();
 
-   intel_batchbuffer_data(brw, src_bits, dwords * 4, BLT_RING);
-
+   brw_batch_data(&brw->batch, src_bits, dwords * 4);
    brw_emit_mi_flush(brw);
 
-   return true;
+   return brw_batch_end(&brw->batch) == 0; /* as intelEmitCopyBlit: true iff end returns 0 */
 }
 
 /* We don't have a memmove-type blit like some other hardware, so we'll do a
@@ -527,9 +506,9 @@  intelEmitImmediateColorExpandBlit(struct brw_context *brw,
  */
 void
 intel_emit_linear_blit(struct brw_context *brw,
-		       drm_intel_bo *dst_bo,
+		       struct brw_bo *dst_bo,
 		       unsigned int dst_offset,
-		       drm_intel_bo *src_bo,
+		       struct brw_bo *src_bo,
 		       unsigned int src_offset,
 		       unsigned int size)
 {
@@ -592,7 +571,6 @@  intel_miptree_set_alpha_to_one(struct brw_context *brw,
 {
    uint32_t BR13, CMD;
    int pitch, cpp;
-   drm_intel_bo *aper_array[2];
 
    pitch = mt->pitch;
    cpp = mt->cpp;
@@ -610,14 +588,7 @@  intel_miptree_set_alpha_to_one(struct brw_context *brw,
    }
    BR13 |= pitch;
 
-   /* do space check before going any further */
-   aper_array[0] = brw->batch.bo;
-   aper_array[1] = mt->bo;
-
-   if (drm_intel_bufmgr_check_aperture_space(aper_array,
-					     ARRAY_SIZE(aper_array)) != 0) {
-      intel_batchbuffer_flush(brw);
-   }
+   brw_batch_begin(&brw->batch, 20, BLT_RING);
 
    unsigned length = brw->gen >= 8 ? 7 : 6;
    bool dst_y_tiled = mt->tiling == I915_TILING_Y;
@@ -640,4 +611,5 @@  intel_miptree_set_alpha_to_one(struct brw_context *brw,
    ADVANCE_BATCH_TILED(dst_y_tiled, false);
 
    brw_emit_mi_flush(brw);
+   brw_batch_end(&brw->batch);
 }
diff --git a/src/mesa/drivers/dri/i965/intel_blit.h b/src/mesa/drivers/dri/i965/intel_blit.h
index f563939..70625c9 100644
--- a/src/mesa/drivers/dri/i965/intel_blit.h
+++ b/src/mesa/drivers/dri/i965/intel_blit.h
@@ -34,11 +34,11 @@  bool
 intelEmitCopyBlit(struct brw_context *brw,
                               GLuint cpp,
                               GLshort src_pitch,
-                              drm_intel_bo *src_buffer,
+                              struct brw_bo *src_buffer,
                               GLuint src_offset,
 			      uint32_t src_tiling,
                               GLshort dst_pitch,
-                              drm_intel_bo *dst_buffer,
+                              struct brw_bo *dst_buffer,
                               GLuint dst_offset,
 			      uint32_t dst_tiling,
                               GLshort srcx, GLshort srcy,
@@ -62,16 +62,16 @@  intelEmitImmediateColorExpandBlit(struct brw_context *brw,
 				  GLubyte *src_bits, GLuint src_size,
 				  GLuint fg_color,
 				  GLshort dst_pitch,
-				  drm_intel_bo *dst_buffer,
+				  struct brw_bo *dst_buffer,
 				  GLuint dst_offset,
 				  uint32_t dst_tiling,
 				  GLshort x, GLshort y,
 				  GLshort w, GLshort h,
 				  GLenum logic_op);
 void intel_emit_linear_blit(struct brw_context *brw,
-			    drm_intel_bo *dst_bo,
+			    struct brw_bo *dst_bo,
 			    unsigned int dst_offset,
-			    drm_intel_bo *src_bo,
+			    struct brw_bo *src_bo,
 			    unsigned int src_offset,
 			    unsigned int size);
 
diff --git a/src/mesa/drivers/dri/i965/intel_buffer_objects.c b/src/mesa/drivers/dri/i965/intel_buffer_objects.c
index ff05b5c..b2bc8c9 100644
--- a/src/mesa/drivers/dri/i965/intel_buffer_objects.c
+++ b/src/mesa/drivers/dri/i965/intel_buffer_objects.c
@@ -39,47 +39,6 @@ 
 #include "brw_context.h"
 #include "intel_blit.h"
 #include "intel_buffer_objects.h"
-#include "intel_batchbuffer.h"
-
-/**
- * Map a buffer object; issue performance warnings if mapping causes stalls.
- *
- * This matches the drm_intel_bo_map API, but takes an additional human-readable
- * name for the buffer object to use in the performance debug message.
- */
-int
-brw_bo_map(struct brw_context *brw,
-           drm_intel_bo *bo, int write_enable,
-           const char *bo_name)
-{
-   if (likely(!brw->perf_debug) || !drm_intel_bo_busy(bo))
-      return drm_intel_bo_map(bo, write_enable);
-
-   double start_time = get_time();
-
-   int ret = drm_intel_bo_map(bo, write_enable);
-
-   perf_debug("CPU mapping a busy %s BO stalled and took %.03f ms.\n",
-              bo_name, (get_time() - start_time) * 1000);
-
-   return ret;
-}
-
-int
-brw_bo_map_gtt(struct brw_context *brw, drm_intel_bo *bo, const char *bo_name)
-{
-   if (likely(!brw->perf_debug) || !drm_intel_bo_busy(bo))
-      return drm_intel_gem_bo_map_gtt(bo);
-
-   double start_time = get_time();
-
-   int ret = drm_intel_gem_bo_map_gtt(bo);
-
-   perf_debug("GTT mapping a busy %s BO stalled and took %.03f ms.\n",
-              bo_name, (get_time() - start_time) * 1000);
-
-   return ret;
-}
 
 static void
 mark_buffer_gpu_usage(struct intel_buffer_object *intel_obj,
@@ -92,17 +51,20 @@  mark_buffer_gpu_usage(struct intel_buffer_object *intel_obj,
 static void
 mark_buffer_inactive(struct intel_buffer_object *intel_obj)
 {
+   if (brw_bo_busy(intel_obj->buffer, BUSY_WRITE))
+      return;
+
    intel_obj->gpu_active_start = ~0;
    intel_obj->gpu_active_end = 0;
 }
 
-/** Allocates a new drm_intel_bo to store the data for the buffer object. */
+/** Allocates a new brw_bo to store the data for the buffer object. */
 static void
 alloc_buffer_object(struct brw_context *brw,
                     struct intel_buffer_object *intel_obj)
 {
-   intel_obj->buffer = drm_intel_bo_alloc(brw->bufmgr, "bufferobj",
-					  intel_obj->Base.Size, 64);
+   intel_obj->buffer =
+	   brw_bo_create(&brw->batch, "bufferobj", intel_obj->Base.Size, 64);
 
    /* the buffer might be bound as a uniform buffer, need to update it
     */
@@ -119,7 +81,7 @@  alloc_buffer_object(struct brw_context *brw,
 static void
 release_buffer(struct intel_buffer_object *intel_obj)
 {
-   drm_intel_bo_unreference(intel_obj->buffer);
+   brw_bo_put(intel_obj->buffer);
    intel_obj->buffer = NULL;
 }
 
@@ -166,7 +128,7 @@  brw_delete_buffer(struct gl_context * ctx, struct gl_buffer_object *obj)
     */
    _mesa_buffer_unmap_all_mappings(ctx, obj);
 
-   drm_intel_bo_unreference(intel_obj->buffer);
+   brw_bo_put(intel_obj->buffer);
    free(intel_obj);
 }
 
@@ -213,7 +175,7 @@  brw_buffer_data(struct gl_context *ctx,
          return false;
 
       if (data != NULL)
-	 drm_intel_bo_subdata(intel_obj->buffer, 0, size, data);
+	 brw_bo_write(intel_obj->buffer, 0, data, size, 0);
    }
 
    return true;
@@ -256,46 +218,34 @@  brw_buffer_subdata(struct gl_context *ctx,
     */
    if (offset + size <= intel_obj->gpu_active_start ||
        intel_obj->gpu_active_end <= offset) {
-      if (brw->has_llc) {
-         drm_intel_gem_bo_map_unsynchronized(intel_obj->buffer);
-         memcpy(intel_obj->buffer->virtual + offset, data, size);
-         drm_intel_bo_unmap(intel_obj->buffer);
-
-         if (intel_obj->gpu_active_end > intel_obj->gpu_active_start)
-            intel_obj->prefer_stall_to_blit = true;
-         return;
-      } else {
-         perf_debug("BufferSubData could be unsynchronized, but !LLC doesn't support it yet\n");
-      }
+      memcpy(brw_bo_map(intel_obj->buffer, MAP_WRITE | MAP_ASYNC) + offset,
+	     data, size); /* write at the requested offset, not the BO start */
+      if (intel_obj->gpu_active_end > intel_obj->gpu_active_start)
+	 intel_obj->prefer_stall_to_blit = brw->has_llc;
+      return;
    }
 
-   busy =
-      drm_intel_bo_busy(intel_obj->buffer) ||
-      drm_intel_bo_references(brw->batch.bo, intel_obj->buffer);
-
+   busy = brw_bo_busy(intel_obj->buffer, BUSY_WRITE | BUSY_FLUSH);
    if (busy) {
       if (size == intel_obj->Base.Size) {
 	 /* Replace the current busy bo so the subdata doesn't stall. */
-	 drm_intel_bo_unreference(intel_obj->buffer);
+	 brw_bo_put(intel_obj->buffer);
 	 alloc_buffer_object(brw, intel_obj);
       } else if (!intel_obj->prefer_stall_to_blit) {
+	 uint32_t upload;
          perf_debug("Using a blit copy to avoid stalling on "
                     "glBufferSubData(%ld, %ld) (%ldkb) to a busy "
                     "(%d-%d) buffer object.\n",
                     (long)offset, (long)offset + size, (long)(size/1024),
                     intel_obj->gpu_active_start,
                     intel_obj->gpu_active_end);
-	 drm_intel_bo *temp_bo =
-	    drm_intel_bo_alloc(brw->bufmgr, "subdata temp", size, 64);
-
-	 drm_intel_bo_subdata(temp_bo, 0, size, data);
-
+	 struct brw_bo *bo = NULL;
+         intel_upload_data(brw, data, size, 64, &bo, &upload);
 	 intel_emit_linear_blit(brw,
 				intel_obj->buffer, offset,
-				temp_bo, 0,
+				bo, upload,
 				size);
-
-	 drm_intel_bo_unreference(temp_bo);
+	 brw_bo_put(bo);
          return;
       } else {
          perf_debug("Stalling on glBufferSubData(%ld, %ld) (%ldkb) to a busy "
@@ -304,11 +254,11 @@  brw_buffer_subdata(struct gl_context *ctx,
                     (long)offset, (long)offset + size, (long)(size/1024),
                     intel_obj->gpu_active_start,
                     intel_obj->gpu_active_end);
-         intel_batchbuffer_flush(brw);
+	 brw_bo_flush(intel_obj->buffer);
       }
    }
 
-   drm_intel_bo_subdata(intel_obj->buffer, offset, size, data);
+   brw_bo_write(intel_obj->buffer, offset, data, size, 0);
    mark_buffer_inactive(intel_obj);
 }
 
@@ -327,14 +277,9 @@  brw_get_buffer_subdata(struct gl_context *ctx,
                        struct gl_buffer_object *obj)
 {
    struct intel_buffer_object *intel_obj = intel_buffer_object(obj);
-   struct brw_context *brw = brw_context(ctx);
 
    assert(intel_obj);
-   if (drm_intel_bo_references(brw->batch.bo, intel_obj->buffer)) {
-      intel_batchbuffer_flush(brw);
-   }
-   drm_intel_bo_get_subdata(intel_obj->buffer, offset, size, data);
-
+   brw_bo_read(intel_obj->buffer, offset, data, size, 0);
    mark_buffer_inactive(intel_obj);
 }
 
@@ -365,6 +310,7 @@  brw_map_buffer_range(struct gl_context *ctx,
 {
    struct brw_context *brw = brw_context(ctx);
    struct intel_buffer_object *intel_obj = intel_buffer_object(obj);
+   unsigned map_flags;
 
    assert(intel_obj);
 
@@ -389,19 +335,14 @@  brw_map_buffer_range(struct gl_context *ctx,
     * achieve the required synchronization.
     */
    if (!(access & GL_MAP_UNSYNCHRONIZED_BIT)) {
-      if (drm_intel_bo_references(brw->batch.bo, intel_obj->buffer)) {
-	 if (access & GL_MAP_INVALIDATE_BUFFER_BIT) {
-	    drm_intel_bo_unreference(intel_obj->buffer);
+      if ((access & GL_MAP_INVALIDATE_BUFFER_BIT)) {
+	 if (brw_bo_busy(intel_obj->buffer, BUSY_WRITE | BUSY_FLUSH)) {
+	    brw_bo_put(intel_obj->buffer);
 	    alloc_buffer_object(brw, intel_obj);
-	 } else {
-            perf_debug("Stalling on the GPU for mapping a busy buffer "
-                       "object\n");
-	    intel_batchbuffer_flush(brw);
 	 }
-      } else if (drm_intel_bo_busy(intel_obj->buffer) &&
-		 (access & GL_MAP_INVALIDATE_BUFFER_BIT)) {
-	 drm_intel_bo_unreference(intel_obj->buffer);
-	 alloc_buffer_object(brw, intel_obj);
+      } else {
+	 if (access & GL_MAP_WRITE_BIT || intel_obj->buffer->domain == DOMAIN_GPU)
+	    brw_bo_flush(intel_obj->buffer);
       }
    }
 
@@ -416,46 +357,41 @@  brw_map_buffer_range(struct gl_context *ctx,
     */
    if (!(access & (GL_MAP_UNSYNCHRONIZED_BIT | GL_MAP_PERSISTENT_BIT)) &&
        (access & GL_MAP_INVALIDATE_RANGE_BIT) &&
-       drm_intel_bo_busy(intel_obj->buffer)) {
+       brw_bo_busy(intel_obj->buffer, BUSY_WRITE | BUSY_FLUSH)) {
       /* Ensure that the base alignment of the allocation meets the alignment
        * guarantees the driver has advertised to the application.
        */
       const unsigned alignment = ctx->Const.MinMapBufferAlignment;
 
       intel_obj->map_extra[index] = (uintptr_t) offset % alignment;
-      intel_obj->range_map_bo[index] = drm_intel_bo_alloc(brw->bufmgr,
-                                                          "BO blit temp",
-                                                          length +
-                                                          intel_obj->map_extra[index],
-                                                          alignment);
-      if (brw->has_llc) {
-         brw_bo_map(brw, intel_obj->range_map_bo[index],
-                    (access & GL_MAP_WRITE_BIT) != 0, "range-map");
-      } else {
-         drm_intel_gem_bo_map_gtt(intel_obj->range_map_bo[index]);
-      }
+      intel_obj->range_map_bo[index] =
+	      brw_bo_create(&brw->batch,
+			    "BO blit temp",
+			    length + intel_obj->map_extra[index],
+			    alignment);
+
       obj->Mappings[index].Pointer =
-         intel_obj->range_map_bo[index]->virtual + intel_obj->map_extra[index];
+	 brw_bo_map(intel_obj->range_map_bo[index], MAP_WRITE) +
+	 intel_obj->map_extra[index];
+
       return obj->Mappings[index].Pointer;
    }
 
-   if (access & GL_MAP_UNSYNCHRONIZED_BIT) {
-      if (!brw->has_llc && brw->perf_debug &&
-          drm_intel_bo_busy(intel_obj->buffer)) {
-         perf_debug("MapBufferRange with GL_MAP_UNSYNCHRONIZED_BIT stalling (it's actually synchronized on non-LLC platforms)\n");
-      }
-      drm_intel_gem_bo_map_unsynchronized(intel_obj->buffer);
-   } else if (!brw->has_llc && (!(access & GL_MAP_READ_BIT) ||
-                              (access & GL_MAP_PERSISTENT_BIT))) {
-      drm_intel_gem_bo_map_gtt(intel_obj->buffer);
-      mark_buffer_inactive(intel_obj);
-   } else {
-      brw_bo_map(brw, intel_obj->buffer, (access & GL_MAP_WRITE_BIT) != 0,
-                 "MapBufferRange");
-      mark_buffer_inactive(intel_obj);
-   }
+   map_flags = 0;
+   if (access & GL_MAP_UNSYNCHRONIZED_BIT)
+      map_flags |= MAP_ASYNC;
+   if (access & GL_MAP_WRITE_BIT)
+      map_flags |= MAP_WRITE;
+   if (access & GL_MAP_READ_BIT)
+      map_flags |= MAP_READ;
+   if (access & GL_MAP_PERSISTENT_BIT)
+      map_flags |= MAP_COHERENT;
+
+   obj->Mappings[index].Pointer =
+      brw_bo_map(intel_obj->buffer, map_flags) + offset;
+
+   mark_buffer_inactive(intel_obj);
 
-   obj->Mappings[index].Pointer = intel_obj->buffer->virtual + offset;
    return obj->Mappings[index].Pointer;
 }
 
@@ -543,8 +479,6 @@  brw_unmap_buffer(struct gl_context *ctx,
    assert(intel_obj);
    assert(obj->Mappings[index].Pointer);
    if (intel_obj->range_map_bo[index] != NULL) {
-      drm_intel_bo_unmap(intel_obj->range_map_bo[index]);
-
       if (!(obj->Mappings[index].AccessFlags & GL_MAP_FLUSH_EXPLICIT_BIT)) {
          intel_emit_linear_blit(brw,
                                 intel_obj->buffer, obj->Mappings[index].Offset,
@@ -562,11 +496,10 @@  brw_unmap_buffer(struct gl_context *ctx,
        */
       brw_emit_mi_flush(brw);
 
-      drm_intel_bo_unreference(intel_obj->range_map_bo[index]);
+      brw_bo_put(intel_obj->range_map_bo[index]);
       intel_obj->range_map_bo[index] = NULL;
-   } else if (intel_obj->buffer != NULL) {
-      drm_intel_bo_unmap(intel_obj->buffer);
    }
+
    obj->Mappings[index].Pointer = NULL;
    obj->Mappings[index].Offset = 0;
    obj->Mappings[index].Length = 0;
@@ -581,7 +514,7 @@  brw_unmap_buffer(struct gl_context *ctx,
  * Anywhere that uses buffer objects in the pipeline should be using this to
  * mark the range of the buffer that is being accessed by the pipeline.
  */
-drm_intel_bo *
+struct brw_bo *
 intel_bufferobj_buffer(struct brw_context *brw,
                        struct intel_buffer_object *intel_obj,
                        uint32_t offset, uint32_t size)
@@ -615,7 +548,7 @@  brw_copy_buffer_subdata(struct gl_context *ctx,
    struct brw_context *brw = brw_context(ctx);
    struct intel_buffer_object *intel_src = intel_buffer_object(src);
    struct intel_buffer_object *intel_dst = intel_buffer_object(dst);
-   drm_intel_bo *src_bo, *dst_bo;
+   struct brw_bo *src_bo, *dst_bo;
 
    if (size == 0)
       return;
diff --git a/src/mesa/drivers/dri/i965/intel_buffer_objects.h b/src/mesa/drivers/dri/i965/intel_buffer_objects.h
index 5eaf9dc..179c0576 100644
--- a/src/mesa/drivers/dri/i965/intel_buffer_objects.h
+++ b/src/mesa/drivers/dri/i965/intel_buffer_objects.h
@@ -40,9 +40,9 @@  struct gl_buffer_object;
 struct intel_buffer_object
 {
    struct gl_buffer_object Base;
-   drm_intel_bo *buffer;     /* the low-level buffer manager's buffer handle */
+   struct brw_bo *buffer; /* the low-level buffer manager's buffer handle */
 
-   drm_intel_bo *range_map_bo[MAP_COUNT];
+   struct brw_bo *range_map_bo[MAP_COUNT];
 
    /**
     * Alignment offset from the range_map_bo temporary mapping to the returned
@@ -84,26 +84,24 @@  struct intel_buffer_object
 
 /* Get the bm buffer associated with a GL bufferobject:
  */
-drm_intel_bo *intel_bufferobj_buffer(struct brw_context *brw,
-                                     struct intel_buffer_object *obj,
-                                     uint32_t offset,
-                                     uint32_t size);
+struct brw_bo *intel_bufferobj_buffer(struct brw_context *brw,
+				      struct intel_buffer_object *obj,
+				      uint32_t offset,
+				      uint32_t size);
 
 void intel_upload_data(struct brw_context *brw,
                        const void *data,
                        uint32_t size,
                        uint32_t alignment,
-                       drm_intel_bo **out_bo,
+                       struct brw_bo **out_bo,
                        uint32_t *out_offset);
 
 void *intel_upload_space(struct brw_context *brw,
                          uint32_t size,
                          uint32_t alignment,
-                         drm_intel_bo **out_bo,
+                         struct brw_bo **out_bo,
                          uint32_t *out_offset);
 
-void intel_upload_finish(struct brw_context *brw);
-
 /* Hook the bufferobject implementation into mesa:
  */
 void intelInitBufferObjectFuncs(struct dd_function_table *functions);
diff --git a/src/mesa/drivers/dri/i965/intel_debug.c b/src/mesa/drivers/dri/i965/intel_debug.c
index 19be464..42954e9 100644
--- a/src/mesa/drivers/dri/i965/intel_debug.c
+++ b/src/mesa/drivers/dri/i965/intel_debug.c
@@ -93,7 +93,7 @@  brw_process_intel_debug_variable(struct brw_context *brw)
    (void) p_atomic_cmpxchg(&INTEL_DEBUG, 0, intel_debug);
 
    if (INTEL_DEBUG & DEBUG_BUFMGR)
-      dri_bufmgr_set_debug(brw->bufmgr, true);
+      brw_batch_enable_debug(&brw->batch);
 
    if ((INTEL_DEBUG & DEBUG_SHADER_TIME) && brw->gen < 7) {
       fprintf(stderr,
@@ -104,8 +104,10 @@  brw_process_intel_debug_variable(struct brw_context *brw)
    if (INTEL_DEBUG & DEBUG_PERF)
       brw->perf_debug = true;
 
+#if 0
    if (INTEL_DEBUG & DEBUG_AUB)
       drm_intel_bufmgr_gem_set_aub_dump(brw->bufmgr, true);
+#endif
 }
 
 /**
diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c
index aa77a86..fe31e43 100644
--- a/src/mesa/drivers/dri/i965/intel_extensions.c
+++ b/src/mesa/drivers/dri/i965/intel_extensions.c
@@ -28,7 +28,6 @@ 
 #include "main/version.h"
 
 #include "brw_context.h"
-#include "intel_batchbuffer.h"
 #include "intel_reg.h"
 #include "utils.h"
 
@@ -56,6 +55,7 @@  can_do_pipelined_register_writes(struct brw_context *brw)
    const int reg = GEN7_SO_WRITE_OFFSET(0);
    const int expected_value = 0x1337d0d0;
    const int offset = 100;
+   int ret;
 
    /* The register we picked only exists on Gen7+. */
    assert(brw->gen == 7);
@@ -64,10 +64,12 @@  can_do_pipelined_register_writes(struct brw_context *brw)
    /* Set a value in a BO to a known quantity.  The workaround BO already
     * exists and doesn't contain anything important, so we may as well use it.
     */
-   drm_intel_bo_map(brw->workaround_bo, true);
-   data = brw->workaround_bo->virtual;
+   data = brw_bo_map(brw->workaround_bo, MAP_WRITE);
    data[offset] = 0xffffffff;
-   drm_intel_bo_unmap(brw->workaround_bo);
+
+   ret = brw_batch_begin(&brw->batch, 6, RENDER_RING);
+   if (ret < 0)
+	   return false;
 
    /* Write the register. */
    BEGIN_BATCH(3);
@@ -87,17 +89,12 @@  can_do_pipelined_register_writes(struct brw_context *brw)
              offset * sizeof(uint32_t));
    ADVANCE_BATCH();
 
-   intel_batchbuffer_flush(brw);
+   if (brw_batch_end(&brw->batch))
+	   return false;
 
    /* Check whether the value got written. */
-   drm_intel_bo_map(brw->workaround_bo, false);
-   data = brw->workaround_bo->virtual;
-   bool success = data[offset] == expected_value;
-   drm_intel_bo_unmap(brw->workaround_bo);
-
-   result = success;
-
-   return success;
+   data = brw_bo_map(brw->workaround_bo, MAP_READ);
+   return data[offset] == expected_value;
 }
 
 static bool
@@ -120,10 +117,11 @@  can_write_oacontrol(struct brw_context *brw)
    /* Set a value in a BO to a known quantity.  The workaround BO already
     * exists and doesn't contain anything important, so we may as well use it.
     */
-   drm_intel_bo_map(brw->workaround_bo, true);
-   data = brw->workaround_bo->virtual;
+   data = brw_bo_map(brw->workaround_bo, MAP_WRITE);
    data[offset] = 0xffffffff;
-   drm_intel_bo_unmap(brw->workaround_bo);
+
+   if (brw_batch_begin(&brw->batch, 20, RENDER_RING) < 0)
+      return false;
 
    /* Write OACONTROL. */
    BEGIN_BATCH(3);
@@ -152,17 +150,12 @@  can_write_oacontrol(struct brw_context *brw)
    OUT_BATCH(0);
    ADVANCE_BATCH();
 
-   intel_batchbuffer_flush(brw);
+   if (brw_batch_end(&brw->batch))
+      return false;
 
    /* Check whether the value got written. */
-   drm_intel_bo_map(brw->workaround_bo, false);
-   data = brw->workaround_bo->virtual;
-   bool success = data[offset] == expected_value;
-   drm_intel_bo_unmap(brw->workaround_bo);
-
-   result = success;
-
-   return success;
+   data = brw_bo_map(brw->workaround_bo, MAP_READ);
+   return data[offset] == expected_value;
 }
 
 /**
diff --git a/src/mesa/drivers/dri/i965/intel_fbo.c b/src/mesa/drivers/dri/i965/intel_fbo.c
index c6f447a..03728b1 100644
--- a/src/mesa/drivers/dri/i965/intel_fbo.c
+++ b/src/mesa/drivers/dri/i965/intel_fbo.c
@@ -43,7 +43,6 @@ 
 #include "swrast/swrast.h"
 #include "drivers/common/meta.h"
 
-#include "intel_batchbuffer.h"
 #include "intel_buffers.h"
 #include "intel_blit.h"
 #include "intel_fbo.h"
@@ -377,13 +376,15 @@  intel_image_target_renderbuffer_storage(struct gl_context *ctx,
    irb = intel_renderbuffer(rb);
    intel_miptree_release(&irb->mt);
 
+   struct brw_bo *bo = brw_bo_import(&brw->batch, image->bo, true);
+
    /* Disable creation of the miptree's aux buffers because the driver exposes
     * no EGL API to manage them. That is, there is no API for resolving the aux
     * buffer's content to the main buffer nor for invalidating the aux buffer's
     * content.
     */
    irb->mt = intel_miptree_create_for_bo(brw,
-                                         image->bo,
+					 bo,
                                          image->format,
                                          image->offset,
                                          image->width,
@@ -391,6 +392,7 @@  intel_image_target_renderbuffer_storage(struct gl_context *ctx,
                                          1,
                                          image->pitch,
                                          true /*disable_aux_buffers*/);
+   brw_bo_put(bo);
    if (!irb->mt)
       return;
 
@@ -1043,43 +1045,6 @@  intel_renderbuffer_move_to_temp(struct brw_context *brw,
    intel_miptree_release(&new_mt);
 }
 
-void
-brw_render_cache_set_clear(struct brw_context *brw)
-{
-   struct set_entry *entry;
-
-   set_foreach(brw->render_cache, entry) {
-      _mesa_set_remove(brw->render_cache, entry);
-   }
-}
-
-void
-brw_render_cache_set_add_bo(struct brw_context *brw, drm_intel_bo *bo)
-{
-   _mesa_set_add(brw->render_cache, bo);
-}
-
-/**
- * Emits an appropriate flush for a BO if it has been rendered to within the
- * same batchbuffer as a read that's about to be emitted.
- *
- * The GPU has separate, incoherent caches for the render cache and the
- * sampler cache, along with other caches.  Usually data in the different
- * caches don't interact (e.g. we don't render to our driver-generated
- * immediate constant data), but for render-to-texture in FBOs we definitely
- * do.  When a batchbuffer is flushed, the kernel will ensure that everything
- * necessary is flushed before another use of that BO, but for reuse from
- * different caches within a batchbuffer, it's all our responsibility.
- */
-void
-brw_render_cache_set_check_flush(struct brw_context *brw, drm_intel_bo *bo)
-{
-   if (!_mesa_set_search(brw->render_cache, bo))
-      return;
-
-   brw_emit_mi_flush(brw);
-}
-
 /**
  * Do one-time context initializations related to GL_EXT_framebuffer_object.
  * Hook in device driver functions.
@@ -1100,7 +1065,4 @@  intel_fbo_init(struct brw_context *brw)
       dd->BlitFramebuffer = gen4_blit_framebuffer;
    dd->EGLImageTargetRenderbufferStorage =
       intel_image_target_renderbuffer_storage;
-
-   brw->render_cache = _mesa_set_create(brw, _mesa_hash_pointer,
-                                        _mesa_key_pointer_equal);
 }
diff --git a/src/mesa/drivers/dri/i965/intel_fbo.h b/src/mesa/drivers/dri/i965/intel_fbo.h
index c7cc570..6c761e6 100644
--- a/src/mesa/drivers/dri/i965/intel_fbo.h
+++ b/src/mesa/drivers/dri/i965/intel_fbo.h
@@ -240,10 +240,6 @@  void
 intel_renderbuffer_upsample(struct brw_context *brw,
                             struct intel_renderbuffer *irb);
 
-void brw_render_cache_set_clear(struct brw_context *brw);
-void brw_render_cache_set_add_bo(struct brw_context *brw, drm_intel_bo *bo);
-void brw_render_cache_set_check_flush(struct brw_context *brw, drm_intel_bo *bo);
-
 unsigned
 intel_quantize_num_samples(struct intel_screen *intel, unsigned num_samples);
 
diff --git a/src/mesa/drivers/dri/i965/intel_image.h b/src/mesa/drivers/dri/i965/intel_image.h
index a82cf3b..c464a17 100644
--- a/src/mesa/drivers/dri/i965/intel_image.h
+++ b/src/mesa/drivers/dri/i965/intel_image.h
@@ -42,7 +42,6 @@ 
 #include <xf86drm.h>
 
 #include "main/mtypes.h"
-#include "intel_bufmgr.h"
 #include <GL/internal/dri_interface.h>
 
 #ifdef __cplusplus
@@ -66,8 +65,11 @@  struct intel_image_format {
    } planes[3];
 };
 
+struct _drm_intel_bo;
+
 struct __DRIimageRec {
-   drm_intel_bo *bo;
+   struct _drm_intel_bo *bo;
+
    uint32_t pitch; /**< in bytes */
    GLenum internal_format;
    uint32_t dri_format;
diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
index 24a5c3d..3df1fd7 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
@@ -28,7 +28,6 @@ 
 #include <GL/gl.h>
 #include <GL/internal/dri_interface.h>
 
-#include "intel_batchbuffer.h"
 #include "intel_mipmap_tree.h"
 #include "intel_resolve_map.h"
 #include "intel_tex.h"
@@ -671,13 +670,13 @@  intel_miptree_create(struct brw_context *brw,
       mt->tiling = tiling;
    }
 
-   unsigned long pitch;
+   uint32_t pitch;
    mt->etc_format = etc_format;
-   mt->bo = drm_intel_bo_alloc_tiled(brw->bufmgr, "miptree",
-                                     total_width, total_height, mt->cpp,
-                                     &mt->tiling, &pitch,
-                                     (expect_accelerated_upload ?
-                                      BO_ALLOC_FOR_RENDER : 0));
+   mt->bo = brw_bo_create_tiled(&brw->batch, "miptree",
+				total_width, total_height, mt->cpp,
+				&mt->tiling, &pitch,
+				(expect_accelerated_upload ?
+				 BO_ALLOC_FOR_RENDER : 0));
    mt->pitch = pitch;
 
    /* If the BO is too large to fit in the aperture, we need to use the
@@ -689,12 +688,12 @@  intel_miptree_create(struct brw_context *brw,
                  mt->total_width, mt->total_height);
 
       mt->tiling = I915_TILING_X;
-      drm_intel_bo_unreference(mt->bo);
-      mt->bo = drm_intel_bo_alloc_tiled(brw->bufmgr, "miptree",
-                                        total_width, total_height, mt->cpp,
-                                        &mt->tiling, &pitch,
-                                        (expect_accelerated_upload ?
-                                         BO_ALLOC_FOR_RENDER : 0));
+      brw_bo_put(mt->bo);
+      mt->bo = brw_bo_create_tiled(&brw->batch, "miptree",
+				   total_width, total_height, mt->cpp,
+				   &mt->tiling, &pitch,
+				   (expect_accelerated_upload ?
+				    BO_ALLOC_FOR_RENDER : 0));
       mt->pitch = pitch;
    }
 
@@ -726,7 +725,7 @@  intel_miptree_create(struct brw_context *brw,
 
 struct intel_mipmap_tree *
 intel_miptree_create_for_bo(struct brw_context *brw,
-                            drm_intel_bo *bo,
+                            struct brw_bo *bo,
                             mesa_format format,
                             uint32_t offset,
                             uint32_t width,
@@ -736,15 +735,12 @@  intel_miptree_create_for_bo(struct brw_context *brw,
                             bool disable_aux_buffers)
 {
    struct intel_mipmap_tree *mt;
-   uint32_t tiling, swizzle;
    GLenum target;
 
-   drm_intel_bo_get_tiling(bo, &tiling, &swizzle);
-
    /* Nothing will be able to use this miptree with the BO if the offset isn't
     * aligned.
     */
-   if (tiling != I915_TILING_NONE)
+   if (bo->tiling != I915_TILING_NONE)
       assert(offset % 4096 == 0);
 
    /* miptrees can't handle negative pitch.  If you need flipping of images,
@@ -762,11 +758,10 @@  intel_miptree_create_for_bo(struct brw_context *brw,
    if (!mt)
       return NULL;
 
-   drm_intel_bo_reference(bo);
-   mt->bo = bo;
+   mt->bo = brw_bo_get(bo);
    mt->pitch = pitch;
    mt->offset = offset;
-   mt->tiling = tiling;
+   mt->tiling = bo->tiling;
 
    return mt;
 }
@@ -784,7 +779,7 @@  intel_miptree_create_for_bo(struct brw_context *brw,
 void
 intel_update_winsys_renderbuffer_miptree(struct brw_context *intel,
                                          struct intel_renderbuffer *irb,
-                                         drm_intel_bo *bo,
+                                         struct brw_bo *bo,
                                          uint32_t width, uint32_t height,
                                          uint32_t pitch)
 {
@@ -914,13 +909,13 @@  intel_miptree_release(struct intel_mipmap_tree **mt)
 
       DBG("%s deleting %p\n", __func__, *mt);
 
-      drm_intel_bo_unreference((*mt)->bo);
+      brw_bo_put((*mt)->bo);
       intel_miptree_release(&(*mt)->stencil_mt);
       if ((*mt)->hiz_buf) {
          if ((*mt)->hiz_buf->mt)
             intel_miptree_release(&(*mt)->hiz_buf->mt);
          else
-            drm_intel_bo_unreference((*mt)->hiz_buf->bo);
+            brw_bo_put((*mt)->hiz_buf->bo);
          free((*mt)->hiz_buf);
       }
       intel_miptree_release(&(*mt)->mcs_mt);
@@ -1553,17 +1548,17 @@  intel_gen7_hiz_buf_create(struct brw_context *brw,
       }
    }
 
-   unsigned long pitch;
+   uint32_t pitch;
    uint32_t tiling = I915_TILING_Y;
-   buf->bo = drm_intel_bo_alloc_tiled(brw->bufmgr, "hiz",
-                                      hz_width, hz_height, 1,
-                                      &tiling, &pitch,
-                                      BO_ALLOC_FOR_RENDER);
+   buf->bo = brw_bo_create_tiled(&brw->batch, "hiz",
+				 hz_width, hz_height, 1,
+				 &tiling, &pitch,
+				 BO_ALLOC_FOR_RENDER);
    if (!buf->bo) {
       free(buf);
       return NULL;
    } else if (tiling != I915_TILING_Y) {
-      drm_intel_bo_unreference(buf->bo);
+      brw_bo_put(buf->bo);
       free(buf);
       return NULL;
    }
@@ -1656,17 +1651,17 @@  intel_gen8_hiz_buf_create(struct brw_context *brw,
       }
    }
 
-   unsigned long pitch;
+   uint32_t pitch;
    uint32_t tiling = I915_TILING_Y;
-   buf->bo = drm_intel_bo_alloc_tiled(brw->bufmgr, "hiz",
-                                      hz_width, hz_height, 1,
-                                      &tiling, &pitch,
-                                      BO_ALLOC_FOR_RENDER);
+   buf->bo = brw_bo_create_tiled(&brw->batch, "hiz",
+				 hz_width, hz_height, 1,
+				 &tiling, &pitch,
+				 BO_ALLOC_FOR_RENDER);
    if (!buf->bo) {
       free(buf);
       return NULL;
    } else if (tiling != I915_TILING_Y) {
-      drm_intel_bo_unreference(buf->bo);
+      brw_bo_put(buf->bo);
       free(buf);
       return NULL;
    }
@@ -2045,25 +2040,13 @@  intel_miptree_map_raw(struct brw_context *brw, struct intel_mipmap_tree *mt)
     * resolve any pending fast color clears before we map.
     */
    intel_miptree_resolve_color(brw, mt);
-
-   drm_intel_bo *bo = mt->bo;
-
-   if (drm_intel_bo_references(brw->batch.bo, bo))
-      intel_batchbuffer_flush(brw);
-
-   if (mt->tiling != I915_TILING_NONE)
-      brw_bo_map_gtt(brw, bo, "miptree");
-   else
-      brw_bo_map(brw, bo, true, "miptree");
-
-   return bo->virtual;
+   return brw_bo_map(mt->bo, MAP_READ);
 }
 
 void
 intel_miptree_unmap_raw(struct brw_context *brw,
                         struct intel_mipmap_tree *mt)
 {
-   drm_intel_bo_unmap(mt->bo);
 }
 
 static void
diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.h b/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
index 8b42e4a..09b9270 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
@@ -33,7 +33,7 @@ 
  * The hardware has a fixed layout of a texture depending on parameters such
  * as the target/type (2D, 3D, CUBE), width, height, pitch, and number of
  * mipmap levels.  The individual level/layer slices are each 2D rectangles of
- * pixels at some x/y offset from the start of the drm_intel_bo.
+ * pixels at some x/y offset from the start of the brw_bo.
  *
  * Original OpenGL allowed texture miplevels to be specified in arbitrary
  * order, and a texture may change size over time.  Thus, each
@@ -49,7 +49,6 @@ 
 #include <assert.h>
 
 #include "main/mtypes.h"
-#include "intel_bufmgr.h"
 #include "intel_resolve_map.h"
 #include <GL/internal/dri_interface.h>
 
@@ -321,7 +320,7 @@  enum miptree_array_layout {
 struct intel_miptree_aux_buffer
 {
    /** Buffer object containing the pixel data. */
-   drm_intel_bo *bo;
+   struct brw_bo *bo;
 
    uint32_t pitch; /**< pitch in bytes. */
 
@@ -333,7 +332,7 @@  struct intel_miptree_aux_buffer
 struct intel_mipmap_tree
 {
    /** Buffer object containing the pixel data. */
-   drm_intel_bo *bo;
+   struct brw_bo *bo;
 
    uint32_t pitch; /**< pitch in bytes. */
 
@@ -542,7 +541,7 @@  struct intel_mipmap_tree *intel_miptree_create(struct brw_context *brw,
 
 struct intel_mipmap_tree *
 intel_miptree_create_for_bo(struct brw_context *brw,
-                            drm_intel_bo *bo,
+                            struct brw_bo *bo,
                             mesa_format format,
                             uint32_t offset,
                             uint32_t width,
@@ -554,7 +553,7 @@  intel_miptree_create_for_bo(struct brw_context *brw,
 void
 intel_update_winsys_renderbuffer_miptree(struct brw_context *intel,
                                          struct intel_renderbuffer *irb,
-                                         drm_intel_bo *bo,
+					 struct brw_bo *bo,
                                          uint32_t width, uint32_t height,
                                          uint32_t pitch);
 
diff --git a/src/mesa/drivers/dri/i965/intel_pixel_bitmap.c b/src/mesa/drivers/dri/i965/intel_pixel_bitmap.c
index 224dc65..bd40a92 100644
--- a/src/mesa/drivers/dri/i965/intel_pixel_bitmap.c
+++ b/src/mesa/drivers/dri/i965/intel_pixel_bitmap.c
@@ -44,7 +44,6 @@ 
 
 #include "brw_context.h"
 #include "intel_screen.h"
-#include "intel_batchbuffer.h"
 #include "intel_blit.h"
 #include "intel_fbo.h"
 #include "intel_image.h"
@@ -314,7 +313,7 @@  do_blit_bitmap( struct gl_context *ctx,
 out:
 
    if (unlikely(INTEL_DEBUG & DEBUG_SYNC))
-      intel_batchbuffer_flush(brw);
+      brw_batch_flush(&brw->batch);
 
    if (_mesa_is_bufferobj(unpack->BufferObj)) {
       /* done with PBO so unmap it now */
diff --git a/src/mesa/drivers/dri/i965/intel_pixel_copy.c b/src/mesa/drivers/dri/i965/intel_pixel_copy.c
index ce053ed..4313588 100644
--- a/src/mesa/drivers/dri/i965/intel_pixel_copy.c
+++ b/src/mesa/drivers/dri/i965/intel_pixel_copy.c
@@ -39,7 +39,6 @@ 
 #include "intel_pixel.h"
 #include "intel_fbo.h"
 #include "intel_blit.h"
-#include "intel_batchbuffer.h"
 
 #define FILE_DEBUG_FLAG DEBUG_PIXEL
 
@@ -149,8 +148,6 @@  do_blit_copypixels(struct gl_context * ctx,
       return false;
    }
 
-   intel_batchbuffer_flush(brw);
-
    /* Clip to destination buffer. */
    orig_dstx = dstx;
    orig_dsty = dsty;
diff --git a/src/mesa/drivers/dri/i965/intel_pixel_draw.c b/src/mesa/drivers/dri/i965/intel_pixel_draw.c
index 4ecefc8..d8e00f6 100644
--- a/src/mesa/drivers/dri/i965/intel_pixel_draw.c
+++ b/src/mesa/drivers/dri/i965/intel_pixel_draw.c
@@ -59,7 +59,7 @@  do_blit_drawpixels(struct gl_context * ctx,
    struct brw_context *brw = brw_context(ctx);
    struct intel_buffer_object *src = intel_buffer_object(unpack->BufferObj);
    GLuint src_offset;
-   drm_intel_bo *src_buffer;
+   struct brw_bo *src_buffer;
 
    DBG("%s\n", __func__);
 
diff --git a/src/mesa/drivers/dri/i965/intel_pixel_read.c b/src/mesa/drivers/dri/i965/intel_pixel_read.c
index d3ca38b..51cf9f5 100644
--- a/src/mesa/drivers/dri/i965/intel_pixel_read.c
+++ b/src/mesa/drivers/dri/i965/intel_pixel_read.c
@@ -39,7 +39,6 @@ 
 
 #include "brw_context.h"
 #include "intel_screen.h"
-#include "intel_batchbuffer.h"
 #include "intel_blit.h"
 #include "intel_buffers.h"
 #include "intel_fbo.h"
@@ -84,11 +83,6 @@  intel_readpixels_tiled_memcpy(struct gl_context * ctx,
    struct intel_renderbuffer *irb = intel_renderbuffer(rb);
    int dst_pitch;
 
-   /* The miptree's buffer. */
-   drm_intel_bo *bo;
-
-   int error = 0;
-
    uint32_t cpp;
    mem_copy_fn mem_copy = NULL;
 
@@ -155,19 +149,6 @@  intel_readpixels_tiled_memcpy(struct gl_context * ctx,
     */
    intel_miptree_resolve_color(brw, irb->mt);
 
-   bo = irb->mt->bo;
-
-   if (drm_intel_bo_references(brw->batch.bo, bo)) {
-      perf_debug("Flushing before mapping a referenced bo.\n");
-      intel_batchbuffer_flush(brw);
-   }
-
-   error = brw_bo_map(brw, bo, false /* write enable */, "miptree");
-   if (error) {
-      DBG("%s: failed to map bo\n", __func__);
-      return false;
-   }
-
    dst_pitch = _mesa_image_row_stride(pack, width, format, type);
 
    /* For a window-system renderbuffer, the buffer is actually flipped
@@ -200,14 +181,13 @@  intel_readpixels_tiled_memcpy(struct gl_context * ctx,
       xoffset * cpp, (xoffset + width) * cpp,
       yoffset, yoffset + height,
       pixels - (ptrdiff_t) yoffset * dst_pitch - (ptrdiff_t) xoffset * cpp,
-      bo->virtual,
+      brw_bo_map(irb->mt->bo, MAP_READ | MAP_DETILED),
       dst_pitch, irb->mt->pitch,
       brw->has_swizzling,
       irb->mt->tiling,
       mem_copy
    );
 
-   drm_intel_bo_unmap(bo);
    return true;
 }
 
diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c
index 72f5417..0da0bbd 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.c
+++ b/src/mesa/drivers/dri/i965/intel_screen.c
@@ -43,6 +43,8 @@ 
 #include "utils.h"
 #include "xmlpool.h"
 
+#include "intel_reg.h"
+
 static const __DRIconfigOptionsExtension brw_config_options = {
    .base = { __DRI_CONFIG_OPTIONS, 1 },
    .xml =
@@ -90,9 +92,7 @@  DRI_CONF_BEGIN
 DRI_CONF_END
 };
 
-#include "intel_batchbuffer.h"
 #include "intel_buffers.h"
-#include "intel_bufmgr.h"
 #include "intel_fbo.h"
 #include "intel_mipmap_tree.h"
 #include "intel_screen.h"
@@ -137,7 +137,7 @@  aub_dump_bmp(struct gl_context *ctx)
 	    continue;
 	 }
 
-         drm_intel_gem_bo_aub_dump_bmp(irb->mt->bo,
+	 drm_intel_gem_bo_aub_dump_bmp(irb->mt->bo->base,
 				       irb->draw_x,
 				       irb->draw_y,
 				       irb->Base.Base.Width,
@@ -176,11 +176,11 @@  intel_dri2_flush_with_flags(__DRIcontext *cPriv,
       intel_resolve_for_dri2_flush(brw, dPriv);
 
    if (reason == __DRI2_THROTTLE_SWAPBUFFER)
-      brw->need_swap_throttle = true;
+      brw->batch.need_swap_throttle = true;
    if (reason == __DRI2_THROTTLE_FLUSHFRONT)
-      brw->need_flush_throttle = true;
+      brw->batch.need_flush_throttle = true;
 
-   intel_batchbuffer_flush(brw);
+   brw_batch_flush(&brw->batch);
 
    if (INTEL_DEBUG & DEBUG_AUB) {
       aub_dump_bmp(ctx);
@@ -358,8 +358,8 @@  intel_setup_image_from_mipmap_tree(struct brw_context *brw, __DRIimage *image,
                                                   &image->tile_y);
 
    drm_intel_bo_unreference(image->bo);
-   image->bo = mt->bo;
-   drm_intel_bo_reference(mt->bo);
+   image->bo = mt->bo->base;
+   drm_intel_bo_reference(image->bo);
 }
 
 static __DRIimage *
@@ -420,8 +420,8 @@  intel_create_image_from_renderbuffer(__DRIcontext *context,
    image->offset = 0;
    image->data = loaderPrivate;
    drm_intel_bo_unreference(image->bo);
-   image->bo = irb->mt->bo;
-   drm_intel_bo_reference(irb->mt->bo);
+   image->bo = irb->mt->bo->base;
+   drm_intel_bo_reference(image->bo);
    image->width = rb->Width;
    image->height = rb->Height;
    image->pitch = irb->mt->pitch;
@@ -525,7 +525,7 @@  intel_create_image(__DRIscreen *screen,
    if (image == NULL)
       return NULL;
 
-   
+
    cpp = _mesa_get_format_bytes(image->format);
    image->bo = drm_intel_bo_alloc_tiled(intelScreen->bufmgr, "image",
                                         width, height, cpp, &tiling,
@@ -552,7 +552,7 @@  intel_query_image(__DRIimage *image, int attrib, int *value)
       *value = image->bo->handle;
       return true;
    case __DRI_IMAGE_ATTRIB_NAME:
-      return !drm_intel_bo_flink(image->bo, (uint32_t *) value);
+      return drm_intel_bo_flink(image->bo, (uint32_t *)value) == 0;
    case __DRI_IMAGE_ATTRIB_FORMAT:
       *value = image->dri_format;
       return true;
@@ -568,9 +568,7 @@  intel_query_image(__DRIimage *image, int attrib, int *value)
       *value = image->planar_format->components;
       return true;
    case __DRI_IMAGE_ATTRIB_FD:
-      if (drm_intel_bo_gem_export_to_prime(image->bo, value) == 0)
-         return true;
-      return false;
+      return drm_intel_bo_gem_export_to_prime(image->bo, value) == 0;
    case __DRI_IMAGE_ATTRIB_FOURCC:
       if (intel_lookup_fourcc(image->dri_format, value))
          return true;
@@ -1079,13 +1077,27 @@  intel_init_bufmgr(struct intel_screen *intelScreen)
 
    intelScreen->no_hw = getenv("INTEL_NO_HW") != NULL;
 
-   intelScreen->bufmgr = intel_bufmgr_gem_init(spriv->fd, BATCH_SZ);
+   intelScreen->bufmgr = intel_bufmgr_gem_init(spriv->fd, 0);
    if (intelScreen->bufmgr == NULL) {
       fprintf(stderr, "[%s:%u] Error initializing buffer manager.\n",
 	      __func__, __LINE__);
       return false;
    }
 
+#if 0 /* XXX */
+   driParseConfigFiles(options, &brw->intelScreen->optionCache,
+                       brw->driContext->driScreenPriv->myNum, "i965");
+   switch (driQueryOptioni(options, "bo_reuse")) {
+   case DRI_CONF_BO_REUSE_DISABLED:
+      break;
+   case DRI_CONF_BO_REUSE_ALL:
+      drm_intel_bufmgr_gem_enable_reuse(intelScreen->bufmgr);
+      break;
+   }
+#else
+   drm_intel_bufmgr_gem_enable_reuse(intelScreen->bufmgr);
+#endif
+
    drm_intel_bufmgr_gem_enable_fenced_relocs(intelScreen->bufmgr);
 
    if (!intel_get_boolean(spriv, I915_PARAM_HAS_RELAXED_DELTA)) {
diff --git a/src/mesa/drivers/dri/i965/intel_screen.h b/src/mesa/drivers/dri/i965/intel_screen.h
index 2f8faaf..f6f43fa 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.h
+++ b/src/mesa/drivers/dri/i965/intel_screen.h
@@ -31,16 +31,19 @@ 
 #include <stdbool.h>
 #include <sys/time.h>
 #include "dri_util.h"
-#include "intel_bufmgr.h"
 #include "brw_device_info.h"
 #include "i915_drm.h"
 #include "xmlconfig.h"
 
+#include <intel_bufmgr.h>
+
 struct intel_screen
 {
    int deviceID;
    const struct brw_device_info *devinfo;
 
+   drm_intel_bufmgr *bufmgr;
+
    __DRIscreen *driScrnPriv;
 
    bool no_hw;
@@ -56,8 +59,6 @@  struct intel_screen
     */
    bool has_context_reset_notification;
 
-   dri_bufmgr *bufmgr;
-
    /**
     * A unique ID for shader programs.
     */
@@ -73,6 +74,12 @@  struct intel_screen
    driOptionCache optionCache;
 };
 
+static inline int
+intel_screen_to_fd(struct intel_screen *scr)
+{
+   return scr->driScrnPriv->fd; /* fd held by the screen's __DRIscreen */
+}
+
 extern void intelDestroyContext(__DRIcontext * driContextPriv);
 
 extern GLboolean intelUnbindContext(__DRIcontext * driContextPriv);
diff --git a/src/mesa/drivers/dri/i965/intel_syncobj.c b/src/mesa/drivers/dri/i965/intel_syncobj.c
index 0432980..3250d9d 100644
--- a/src/mesa/drivers/dri/i965/intel_syncobj.c
+++ b/src/mesa/drivers/dri/i965/intel_syncobj.c
@@ -41,7 +41,6 @@ 
 #include "main/imports.h"
 
 #include "brw_context.h"
-#include "intel_batchbuffer.h"
 #include "intel_reg.h"
 
 static struct gl_sync_object *
@@ -59,7 +58,7 @@  intel_delete_sync_object(struct gl_context *ctx, struct gl_sync_object *s)
 {
    struct intel_sync_object *sync = (struct intel_sync_object *)s;
 
-   drm_intel_bo_unreference(sync->bo);
+   brw_bo_put(sync->bo);
    free(sync);
 }
 
@@ -73,10 +72,8 @@  intel_fence_sync(struct gl_context *ctx, struct gl_sync_object *s,
    assert(condition == GL_SYNC_GPU_COMMANDS_COMPLETE);
    brw_emit_mi_flush(brw);
 
-   sync->bo = brw->batch.bo;
-   drm_intel_bo_reference(sync->bo);
-
-   intel_batchbuffer_flush(brw);
+   sync->bo = brw_bo_get(brw->batch.bo);
+   brw_batch_flush(&brw->batch);
 }
 
 static void intel_client_wait_sync(struct gl_context *ctx, struct gl_sync_object *s,
@@ -92,9 +89,9 @@  static void intel_client_wait_sync(struct gl_context *ctx, struct gl_sync_object
    if (timeout > INT64_MAX)
       timeout = INT64_MAX;
 
-   if (sync->bo && drm_intel_gem_bo_wait(sync->bo, timeout) == 0) {
+   if (sync->bo && brw_bo_wait(sync->bo, timeout) == 0) {
       s->StatusFlag = 1;
-      drm_intel_bo_unreference(sync->bo);
+      brw_bo_put(sync->bo);
       sync->bo = NULL;
    }
 }
@@ -113,8 +110,8 @@  static void intel_check_sync(struct gl_context *ctx, struct gl_sync_object *s)
 {
    struct intel_sync_object *sync = (struct intel_sync_object *)s;
 
-   if (sync->bo && !drm_intel_bo_busy(sync->bo)) {
-      drm_intel_bo_unreference(sync->bo);
+   if (sync->bo && !brw_bo_busy(sync->bo, BUSY_WRITE | BUSY_FLUSH)) {
+      brw_bo_put(sync->bo);
       sync->bo = NULL;
       s->StatusFlag = 1;
    }
diff --git a/src/mesa/drivers/dri/i965/intel_tex.c b/src/mesa/drivers/dri/i965/intel_tex.c
index 777a682..d4b270f 100644
--- a/src/mesa/drivers/dri/i965/intel_tex.c
+++ b/src/mesa/drivers/dri/i965/intel_tex.c
@@ -332,9 +332,9 @@  intel_set_texture_storage_for_buffer_object(struct gl_context *ctx,
 
    assert(intel_texobj->mt == NULL);
 
-   drm_intel_bo *bo = intel_bufferobj_buffer(brw, intel_buffer_obj,
-                                             buffer_offset,
-                                             row_stride * image->Height);
+   struct brw_bo *bo = intel_bufferobj_buffer(brw, intel_buffer_obj,
+					      buffer_offset,
+					      row_stride * image->Height);
    intel_texobj->mt =
       intel_miptree_create_for_bo(brw, bo,
                                   image->TexFormat,
diff --git a/src/mesa/drivers/dri/i965/intel_tex_image.c b/src/mesa/drivers/dri/i965/intel_tex_image.c
index 7952ee5..df85c8d 100644
--- a/src/mesa/drivers/dri/i965/intel_tex_image.c
+++ b/src/mesa/drivers/dri/i965/intel_tex_image.c
@@ -19,7 +19,6 @@ 
 
 #include "intel_mipmap_tree.h"
 #include "intel_buffer_objects.h"
-#include "intel_batchbuffer.h"
 #include "intel_tex.h"
 #include "intel_blit.h"
 #include "intel_fbo.h"
@@ -95,7 +94,8 @@  intelTexImage(struct gl_context * ctx,
    struct intel_texture_image *intelImage = intel_texture_image(texImage);
    bool ok;
 
-   bool tex_busy = intelImage->mt && drm_intel_bo_busy(intelImage->mt->bo);
+   bool tex_busy =
+      intelImage->mt && brw_bo_busy(intelImage->mt->bo, BUSY_WRITE);
 
    DBG("%s mesa_format %s target %s format %s type %s level %d %dx%dx%d\n",
        __func__, _mesa_get_format_name(texImage->TexFormat),
@@ -147,7 +147,7 @@  intelTexImage(struct gl_context * ctx,
 static void
 intel_set_texture_image_bo(struct gl_context *ctx,
                            struct gl_texture_image *image,
-                           drm_intel_bo *bo,
+                           struct brw_bo *bo,
                            GLenum target,
                            GLenum internalFormat,
                            mesa_format format,
@@ -343,13 +343,15 @@  intel_image_target_texture_2d(struct gl_context *ctx, GLenum target,
     * buffer's content to the main buffer nor for invalidating the aux buffer's
     * content.
     */
-   intel_set_texture_image_bo(ctx, texImage, image->bo,
+   struct brw_bo *bo = brw_bo_import(&brw->batch, image->bo, true);
+   intel_set_texture_image_bo(ctx, texImage, bo,
                               target, image->internal_format,
                               image->format, image->offset,
                               image->width,  image->height,
                               image->pitch,
                               image->tile_x, image->tile_y,
                               true /*disable_aux_buffers*/);
+   brw_bo_put(bo);
 }
 
 /**
@@ -370,11 +372,6 @@  intel_gettexsubimage_tiled_memcpy(struct gl_context *ctx,
    struct intel_texture_image *image = intel_texture_image(texImage);
    int dst_pitch;
 
-   /* The miptree's buffer. */
-   drm_intel_bo *bo;
-
-   int error = 0;
-
    uint32_t cpp;
    mem_copy_fn mem_copy = NULL;
 
@@ -429,18 +426,6 @@  intel_gettexsubimage_tiled_memcpy(struct gl_context *ctx,
     */
    intel_miptree_resolve_color(brw, image->mt);
 
-   bo = image->mt->bo;
-
-   if (drm_intel_bo_references(brw->batch.bo, bo)) {
-      perf_debug("Flushing before mapping a referenced bo.\n");
-      intel_batchbuffer_flush(brw);
-   }
-
-   error = brw_bo_map(brw, bo, false /* write enable */, "miptree");
-   if (error) {
-      DBG("%s: failed to map bo\n", __func__);
-      return false;
-   }
 
    dst_pitch = _mesa_image_row_stride(packing, width, format, type);
 
@@ -462,14 +447,13 @@  intel_gettexsubimage_tiled_memcpy(struct gl_context *ctx,
       xoffset * cpp, (xoffset + width) * cpp,
       yoffset, yoffset + height,
       pixels - (ptrdiff_t) yoffset * dst_pitch - (ptrdiff_t) xoffset * cpp,
-      bo->virtual,
+      brw_bo_map(image->mt->bo, MAP_READ | MAP_DETILED),
       dst_pitch, image->mt->pitch,
       brw->has_swizzling,
       image->mt->tiling,
       mem_copy
    );
 
-   drm_intel_bo_unmap(bo);
    return true;
 }
 
diff --git a/src/mesa/drivers/dri/i965/intel_tex_subimage.c b/src/mesa/drivers/dri/i965/intel_tex_subimage.c
index 7507f76..8c4aa8a 100644
--- a/src/mesa/drivers/dri/i965/intel_tex_subimage.c
+++ b/src/mesa/drivers/dri/i965/intel_tex_subimage.c
@@ -38,7 +38,6 @@ 
 #include "drivers/common/meta.h"
 
 #include "brw_context.h"
-#include "intel_batchbuffer.h"
 #include "intel_tex.h"
 #include "intel_mipmap_tree.h"
 #include "intel_blit.h"
@@ -86,11 +85,6 @@  intel_texsubimage_tiled_memcpy(struct gl_context * ctx,
    struct intel_texture_image *image = intel_texture_image(texImage);
    int src_pitch;
 
-   /* The miptree's buffer. */
-   drm_intel_bo *bo;
-
-   int error = 0;
-
    uint32_t cpp;
    mem_copy_fn mem_copy = NULL;
 
@@ -141,19 +135,6 @@  intel_texsubimage_tiled_memcpy(struct gl_context * ctx,
     */
    intel_miptree_resolve_color(brw, image->mt);
 
-   bo = image->mt->bo;
-
-   if (drm_intel_bo_references(brw->batch.bo, bo)) {
-      perf_debug("Flushing before mapping a referenced bo.\n");
-      intel_batchbuffer_flush(brw);
-   }
-
-   error = brw_bo_map(brw, bo, true /* write enable */, "miptree");
-   if (error || bo->virtual == NULL) {
-      DBG("%s: failed to map bo\n", __func__);
-      return false;
-   }
-
    src_pitch = _mesa_image_row_stride(packing, width, format, type);
 
    /* We postponed printing this message until having committed to executing
@@ -177,7 +158,7 @@  intel_texsubimage_tiled_memcpy(struct gl_context * ctx,
    linear_to_tiled(
       xoffset * cpp, (xoffset + width) * cpp,
       yoffset, yoffset + height,
-      bo->virtual,
+      brw_bo_map(image->mt->bo, MAP_WRITE | MAP_DETILED),
       pixels - (ptrdiff_t) yoffset * src_pitch - (ptrdiff_t) xoffset * cpp,
       image->mt->pitch, src_pitch,
       brw->has_swizzling,
@@ -185,7 +166,6 @@  intel_texsubimage_tiled_memcpy(struct gl_context * ctx,
       mem_copy
    );
 
-   drm_intel_bo_unmap(bo);
    return true;
 }
 
@@ -202,7 +182,8 @@  intelTexSubImage(struct gl_context * ctx,
    struct intel_texture_image *intelImage = intel_texture_image(texImage);
    bool ok;
 
-   bool tex_busy = intelImage->mt && drm_intel_bo_busy(intelImage->mt->bo);
+   bool tex_busy =
+      intelImage->mt && brw_bo_busy(intelImage->mt->bo, BUSY_WRITE);
 
    DBG("%s mesa_format %s target %s format %s type %s level %d %dx%dx%d\n",
        __func__, _mesa_get_format_name(texImage->TexFormat),
diff --git a/src/mesa/drivers/dri/i965/intel_upload.c b/src/mesa/drivers/dri/i965/intel_upload.c
index 870aabc..a5a3a7b 100644
--- a/src/mesa/drivers/dri/i965/intel_upload.c
+++ b/src/mesa/drivers/dri/i965/intel_upload.c
@@ -36,7 +36,6 @@ 
 #include "brw_context.h"
 #include "intel_blit.h"
 #include "intel_buffer_objects.h"
-#include "intel_batchbuffer.h"
 #include "intel_fbo.h"
 #include "intel_mipmap_tree.h"
 
@@ -50,14 +49,10 @@ 
 #define ALIGN_NPOT(value, alignment) \
    (((value) + (alignment) - 1) / (alignment) * (alignment))
 
-void
+static void
 intel_upload_finish(struct brw_context *brw)
 {
-   if (!brw->upload.bo)
-      return;
-
-   drm_intel_bo_unmap(brw->upload.bo);
-   drm_intel_bo_unreference(brw->upload.bo);
+   brw_bo_put(brw->upload.bo);
    brw->upload.bo = NULL;
    brw->upload.next_offset = 0;
 }
@@ -89,7 +84,7 @@  void *
 intel_upload_space(struct brw_context *brw,
                    uint32_t size,
                    uint32_t alignment,
-                   drm_intel_bo **out_bo,
+                   struct brw_bo **out_bo,
                    uint32_t *out_offset)
 {
    uint32_t offset;
@@ -101,24 +96,20 @@  intel_upload_space(struct brw_context *brw,
    }
 
    if (!brw->upload.bo) {
-      brw->upload.bo = drm_intel_bo_alloc(brw->bufmgr, "streamed data",
-                                          MAX2(INTEL_UPLOAD_SIZE, size), 4096);
-      if (brw->has_llc)
-         drm_intel_bo_map(brw->upload.bo, true);
-      else
-         drm_intel_gem_bo_map_gtt(brw->upload.bo);
+      brw->upload.bo = brw_bo_create(&brw->batch, "streamed data",
+                                     MAX2(INTEL_UPLOAD_SIZE, size), 4096);
    }
 
    brw->upload.next_offset = offset + size;
 
    *out_offset = offset;
+
    if (*out_bo != brw->upload.bo) {
-      drm_intel_bo_unreference(*out_bo);
-      *out_bo = brw->upload.bo;
-      drm_intel_bo_reference(brw->upload.bo);
+      brw_bo_put(*out_bo);
+      *out_bo = brw_bo_get(brw->upload.bo);
    }
 
-   return brw->upload.bo->virtual + offset;
+   return brw_bo_map(brw->upload.bo, MAP_WRITE | MAP_ASYNC) + offset;
 }
 
 /**
@@ -131,9 +122,9 @@  intel_upload_data(struct brw_context *brw,
                   const void *data,
                   uint32_t size,
                   uint32_t alignment,
-                  drm_intel_bo **out_bo,
+                  struct brw_bo **out_bo,
                   uint32_t *out_offset)
 {
-   void *dst = intel_upload_space(brw, size, alignment, out_bo, out_offset);
-   memcpy(dst, data, size);
+   memcpy(intel_upload_space(brw, size, alignment, out_bo, out_offset),
+          data, size);
 }