Message ID | 1402673891-14618-13-git-send-email-oscar.mateo@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Fri, Jun 13, 2014 at 08:37:30AM -0700, oscar.mateo@intel.com wrote: > From: Oscar Mateo <oscar.mateo@intel.com> > > For the most part, logical ring context objects are similar to hardware > contexts in that the backing object is meant to be opaque. There are > some exceptions where we need to poke certain offsets of the object for > initialization, updating the tail pointer or updating the PDPs. > > For our basic execlist implementation we'll only need our PPGTT PDs, > and ringbuffer addresses in order to set up the context. With previous > patches, we have both, so start prepping the context to be load. > > Before running a context for the first time you must populate some > fields in the context object. These fields begin 1 PAGE + LRCA, ie. the > first page (in 0 based counting) of the context image. These same > fields will be read and written to as contexts are saved and restored > once the system is up and running. > > Many of these fields are completely reused from previous global > registers: ringbuffer head/tail/control, context control matches some > previous MI_SET_CONTEXT flags, and page directories. There are other > fields which we don't touch which we may want in the future. > > v2: CTX_LRI_HEADER_0 is MI_LOAD_REGISTER_IMM(14) for render and (11) > for other engines. > > v3: Several rebases and general changes to the code. > > v4: Squash with "Extract LR context object populating" > Also, Damien's review comments: > - Set the Force Posted bit on the LRI header, as the BSpec suggest we do. > - Prevent warning when compiling a 32-bits kernel without HIGHMEM64. > - Add a clarifying comment to the context population code. > > v5: Damien's review comments: > - The third MI_LOAD_REGISTER_IMM in the context does not set Force Posted. > - Remove dead code. > > Signed-off-by: Ben Widawsky <ben@bwidawsk.net> (v1) > Signed-off-by: Rafael Barbalho <rafael.barbalho@intel.com> (v2) > Signed-off-by: Oscar Mateo <oscar.mateo@intel.com> (v3-5) > --- > drivers/gpu/drm/i915/i915_reg.h | 1 + > drivers/gpu/drm/i915/intel_lrc.c | 154 ++++++++++++++++++++++++++++++++++++++- > 2 files changed, 151 insertions(+), 4 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h > index 286f05c..9c8692a 100644 > --- a/drivers/gpu/drm/i915/i915_reg.h > +++ b/drivers/gpu/drm/i915/i915_reg.h > @@ -277,6 +277,7 @@ > * address/value pairs. Don't overdue it, though, x <= 2^4 must hold! > */ > #define MI_LOAD_REGISTER_IMM(x) MI_INSTR(0x22, 2*(x)-1) > +#define MI_LRI_FORCE_POSTED (1<<12) > #define MI_STORE_REGISTER_MEM(x) MI_INSTR(0x24, 2*(x)-1) > #define MI_STORE_REGISTER_MEM_GEN8(x) MI_INSTR(0x24, 3*(x)-1) > #define MI_SRM_LRM_GLOBAL_GTT (1<<22) > diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c > index b3a23e0..b96bb45 100644 > --- a/drivers/gpu/drm/i915/intel_lrc.c > +++ b/drivers/gpu/drm/i915/intel_lrc.c > @@ -46,6 +46,38 @@ > > #define GEN8_LR_CONTEXT_ALIGN 4096 > > +#define RING_ELSP(ring) ((ring)->mmio_base+0x230) > +#define RING_CONTEXT_CONTROL(ring) ((ring)->mmio_base+0x244) > + > +#define CTX_LRI_HEADER_0 0x01 > +#define CTX_CONTEXT_CONTROL 0x02 > +#define CTX_RING_HEAD 0x04 > +#define CTX_RING_TAIL 0x06 > +#define CTX_RING_BUFFER_START 0x08 > +#define CTX_RING_BUFFER_CONTROL 0x0a > +#define CTX_BB_HEAD_U 0x0c > +#define CTX_BB_HEAD_L 0x0e > +#define CTX_BB_STATE 0x10 > +#define CTX_SECOND_BB_HEAD_U 0x12 > +#define CTX_SECOND_BB_HEAD_L 0x14 > +#define CTX_SECOND_BB_STATE 0x16 > +#define CTX_BB_PER_CTX_PTR 0x18 > +#define CTX_RCS_INDIRECT_CTX 0x1a > +#define CTX_RCS_INDIRECT_CTX_OFFSET 0x1c > +#define CTX_LRI_HEADER_1 0x21 > +#define CTX_CTX_TIMESTAMP 0x22 > +#define CTX_PDP3_UDW 0x24 > +#define CTX_PDP3_LDW 0x26 > +#define CTX_PDP2_UDW 0x28 > +#define CTX_PDP2_LDW 0x2a > +#define CTX_PDP1_UDW 0x2c > +#define CTX_PDP1_LDW 0x2e > +#define CTX_PDP0_UDW 0x30 > +#define CTX_PDP0_LDW 0x32 > +#define CTX_LRI_HEADER_2 0x41 > +#define CTX_R_PWR_CLK_STATE 0x42 > +#define CTX_GPGPU_CSR_BASE_ADDRESS 0x44 > + > bool intel_enable_execlists(struct drm_device *dev) > { > if (!i915.enable_execlists) > @@ -54,6 +86,110 @@ bool intel_enable_execlists(struct drm_device *dev) > return HAS_LOGICAL_RING_CONTEXTS(dev) && USES_PPGTT(dev); > } > > +static int > +populate_lr_context(struct intel_context *ctx, struct drm_i915_gem_object *ctx_obj, > + struct intel_engine_cs *ring, struct drm_i915_gem_object *ring_obj) > +{ > + struct i915_hw_ppgtt *ppgtt = ctx_to_ppgtt(ctx); > + struct page *page; > + uint32_t *reg_state; > + int ret; > + > + ret = i915_gem_object_set_to_cpu_domain(ctx_obj, true); > + if (ret) { > + DRM_DEBUG_DRIVER("Could not set to CPU domain\n"); > + return ret; > + } > + > + ret = i915_gem_object_get_pages(ctx_obj); > + if (ret) { > + DRM_DEBUG_DRIVER("Could not get object pages\n"); > + return ret; > + } > + > + i915_gem_object_pin_pages(ctx_obj); > + > + /* The second page of the context object contains some fields which must > + * be set up prior to the first execution. */ > + page = i915_gem_object_get_page(ctx_obj, 1); > + reg_state = kmap_atomic(page); > + > + /* A context is actually a big batch buffer with several MI_LOAD_REGISTER_IMM > + * commands followed by (reg, value) pairs. The values we are setting here are > + * only for the first context restore: on a subsequent save, the GPU will > + * recreate this batchbuffer with new values (including all the missing > + * MI_LOAD_REGISTER_IMM commands that we are not initializing here). */ > + if (ring->id == RCS) > + reg_state[CTX_LRI_HEADER_0] = MI_LOAD_REGISTER_IMM(14); > + else > + reg_state[CTX_LRI_HEADER_0] = MI_LOAD_REGISTER_IMM(11); > + reg_state[CTX_LRI_HEADER_0] |= MI_LRI_FORCE_POSTED; > + reg_state[CTX_CONTEXT_CONTROL] = RING_CONTEXT_CONTROL(ring); > + reg_state[CTX_CONTEXT_CONTROL+1] = (1<<3) | MI_RESTORE_INHIBIT; > + reg_state[CTX_CONTEXT_CONTROL+1] |= reg_state[CTX_CONTEXT_CONTROL+1] << 16; If we can, we should probably use _MASKED_BIT_ENABLE() here to make it more obvious why we're doing the or+shift. > + reg_state[CTX_RING_HEAD] = RING_HEAD(ring->mmio_base); > + reg_state[CTX_RING_HEAD+1] = 0; > + reg_state[CTX_RING_TAIL] = RING_TAIL(ring->mmio_base); > + reg_state[CTX_RING_TAIL+1] = 0; > + reg_state[CTX_RING_BUFFER_START] = RING_START(ring->mmio_base); > + reg_state[CTX_RING_BUFFER_START+1] = i915_gem_obj_ggtt_offset(ring_obj); > + reg_state[CTX_RING_BUFFER_CONTROL] = RING_CTL(ring->mmio_base); > + reg_state[CTX_RING_BUFFER_CONTROL+1] = (31 * PAGE_SIZE) | RING_VALID; The size here doesn't look right to me. Shouldn't it be (number of pages - 1)? See init_ring_common() > + reg_state[CTX_BB_HEAD_U] = ring->mmio_base + 0x168; > + reg_state[CTX_BB_HEAD_U+1] = 0; > + reg_state[CTX_BB_HEAD_L] = ring->mmio_base + 0x140; > + reg_state[CTX_BB_HEAD_L+1] = 0; > + reg_state[CTX_BB_STATE] = ring->mmio_base + 0x110; > + reg_state[CTX_BB_STATE+1] = (1<<5); > + reg_state[CTX_SECOND_BB_HEAD_U] = ring->mmio_base + 0x11c; > + reg_state[CTX_SECOND_BB_HEAD_U+1] = 0; > + reg_state[CTX_SECOND_BB_HEAD_L] = ring->mmio_base + 0x114; > + reg_state[CTX_SECOND_BB_HEAD_L+1] = 0; > + reg_state[CTX_SECOND_BB_STATE] = ring->mmio_base + 0x118; > + reg_state[CTX_SECOND_BB_STATE+1] = 0; > + if (ring->id == RCS) { > + reg_state[CTX_BB_PER_CTX_PTR] = ring->mmio_base + 0x1c0; > + reg_state[CTX_BB_PER_CTX_PTR+1] = 0; > + reg_state[CTX_RCS_INDIRECT_CTX] = ring->mmio_base + 0x1c4; > + reg_state[CTX_RCS_INDIRECT_CTX+1] = 0; > + reg_state[CTX_RCS_INDIRECT_CTX_OFFSET] = ring->mmio_base + 0x1c8; > + reg_state[CTX_RCS_INDIRECT_CTX_OFFSET+1] = 0; > + } > + reg_state[CTX_LRI_HEADER_1] = MI_LOAD_REGISTER_IMM(9); > + reg_state[CTX_LRI_HEADER_1] |= MI_LRI_FORCE_POSTED; > + reg_state[CTX_CTX_TIMESTAMP] = ring->mmio_base + 0x3a8; > + reg_state[CTX_CTX_TIMESTAMP+1] = 0; > + reg_state[CTX_PDP3_UDW] = GEN8_RING_PDP_UDW(ring, 3); > + reg_state[CTX_PDP3_LDW] = GEN8_RING_PDP_LDW(ring, 3); > + reg_state[CTX_PDP2_UDW] = GEN8_RING_PDP_UDW(ring, 2); > + reg_state[CTX_PDP2_LDW] = GEN8_RING_PDP_LDW(ring, 2); > + reg_state[CTX_PDP1_UDW] = GEN8_RING_PDP_UDW(ring, 1); > + reg_state[CTX_PDP1_LDW] = GEN8_RING_PDP_LDW(ring, 1); > + reg_state[CTX_PDP0_UDW] = GEN8_RING_PDP_UDW(ring, 0); > + reg_state[CTX_PDP0_LDW] = GEN8_RING_PDP_LDW(ring, 0); > + reg_state[CTX_PDP3_UDW+1] = (u64)ppgtt->pd_dma_addr[3] >> 32; > + reg_state[CTX_PDP3_LDW+1] = ppgtt->pd_dma_addr[3]; > + reg_state[CTX_PDP2_UDW+1] = (u64)ppgtt->pd_dma_addr[2] >> 32; > + reg_state[CTX_PDP2_LDW+1] = ppgtt->pd_dma_addr[2]; > + reg_state[CTX_PDP1_UDW+1] = (u64)ppgtt->pd_dma_addr[1] >> 32; > + reg_state[CTX_PDP1_LDW+1] = ppgtt->pd_dma_addr[1]; > + reg_state[CTX_PDP0_UDW+1] = (u64)ppgtt->pd_dma_addr[0] >> 32; > + reg_state[CTX_PDP0_LDW+1] = ppgtt->pd_dma_addr[0]; Are we able to use upper_32_bits() and lower_32_bits() for these? Brad > + if (ring->id == RCS) { > + reg_state[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1); > + reg_state[CTX_R_PWR_CLK_STATE] = 0x20c8; > + reg_state[CTX_R_PWR_CLK_STATE+1] = 0; > + } > + > + kunmap_atomic(reg_state); > + > + ctx_obj->dirty = 1; > + set_page_dirty(page); > + i915_gem_object_unpin_pages(ctx_obj); > + > + return 0; > +} > + > void intel_lr_context_free(struct intel_context *ctx) > { > int i; > @@ -145,14 +281,24 @@ int intel_lr_context_deferred_create(struct intel_context *ctx, > if (ret) { > DRM_DEBUG_DRIVER("Failed to allocate ringbuffer obj %s: %d\n", > ring->name, ret); > - kfree(ringbuf); > - i915_gem_object_ggtt_unpin(ctx_obj); > - drm_gem_object_unreference(&ctx_obj->base); > - return ret; > + goto error; > + } > + > + ret = populate_lr_context(ctx, ctx_obj, ring, ringbuf->obj); > + if (ret) { > + DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret); > + intel_destroy_ring_buffer(ringbuf); > + goto error; > } > > ctx->engine[ring->id].ringbuf = ringbuf; > ctx->engine[ring->id].obj = ctx_obj; > > return 0; > + > +error: > + kfree(ringbuf); > + i915_gem_object_ggtt_unpin(ctx_obj); > + drm_gem_object_unreference(&ctx_obj->base); > + return ret; > } > -- > 1.9.0 > > _______________________________________________ > Intel-gfx mailing list > Intel-gfx@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
> -----Original Message----- > From: Volkin, Bradley D > Sent: Thursday, June 19, 2014 12:24 AM > To: Mateo Lozano, Oscar > Cc: intel-gfx@lists.freedesktop.org > Subject: Re: [Intel-gfx] [PATCH 12/53] drm/i915/bdw: Populate LR contexts > (somewhat) > > On Fri, Jun 13, 2014 at 08:37:30AM -0700, oscar.mateo@intel.com wrote: > > From: Oscar Mateo <oscar.mateo@intel.com> > > > > For the most part, logical ring context objects are similar to > > hardware contexts in that the backing object is meant to be opaque. > > There are some exceptions where we need to poke certain offsets of the > > object for initialization, updating the tail pointer or updating the PDPs. > > > > For our basic execlist implementation we'll only need our PPGTT PDs, > > and ringbuffer addresses in order to set up the context. With previous > > patches, we have both, so start prepping the context to be load. > > > > Before running a context for the first time you must populate some > > fields in the context object. These fields begin 1 PAGE + LRCA, ie. > > the first page (in 0 based counting) of the context image. These same > > fields will be read and written to as contexts are saved and restored > > once the system is up and running. > > > > Many of these fields are completely reused from previous global > > registers: ringbuffer head/tail/control, context control matches some > > previous MI_SET_CONTEXT flags, and page directories. There are other > > fields which we don't touch which we may want in the future. > > > > v2: CTX_LRI_HEADER_0 is MI_LOAD_REGISTER_IMM(14) for render and > (11) > > for other engines. > > > > v3: Several rebases and general changes to the code. > > > > v4: Squash with "Extract LR context object populating" > > Also, Damien's review comments: > > - Set the Force Posted bit on the LRI header, as the BSpec suggest we do. > > - Prevent warning when compiling a 32-bits kernel without HIGHMEM64. > > - Add a clarifying comment to the context population code. > > > > v5: Damien's review comments: > > - The third MI_LOAD_REGISTER_IMM in the context does not set Force > Posted. > > - Remove dead code. > > > > Signed-off-by: Ben Widawsky <ben@bwidawsk.net> (v1) > > Signed-off-by: Rafael Barbalho <rafael.barbalho@intel.com> (v2) > > Signed-off-by: Oscar Mateo <oscar.mateo@intel.com> (v3-5) > > --- > > drivers/gpu/drm/i915/i915_reg.h | 1 + > > drivers/gpu/drm/i915/intel_lrc.c | 154 > > ++++++++++++++++++++++++++++++++++++++- > > 2 files changed, 151 insertions(+), 4 deletions(-) > > > > diff --git a/drivers/gpu/drm/i915/i915_reg.h > > b/drivers/gpu/drm/i915/i915_reg.h index 286f05c..9c8692a 100644 > > --- a/drivers/gpu/drm/i915/i915_reg.h > > +++ b/drivers/gpu/drm/i915/i915_reg.h > > @@ -277,6 +277,7 @@ > > * address/value pairs. Don't overdue it, though, x <= 2^4 must hold! > > */ > > #define MI_LOAD_REGISTER_IMM(x) MI_INSTR(0x22, 2*(x)-1) > > +#define MI_LRI_FORCE_POSTED (1<<12) > > #define MI_STORE_REGISTER_MEM(x) MI_INSTR(0x24, 2*(x)-1) #define > > MI_STORE_REGISTER_MEM_GEN8(x) MI_INSTR(0x24, 3*(x)-1) > > #define MI_SRM_LRM_GLOBAL_GTT (1<<22) > > diff --git a/drivers/gpu/drm/i915/intel_lrc.c > > b/drivers/gpu/drm/i915/intel_lrc.c > > index b3a23e0..b96bb45 100644 > > --- a/drivers/gpu/drm/i915/intel_lrc.c > > +++ b/drivers/gpu/drm/i915/intel_lrc.c > > @@ -46,6 +46,38 @@ > > > > #define GEN8_LR_CONTEXT_ALIGN 4096 > > > > +#define RING_ELSP(ring) ((ring)->mmio_base+0x230) > > +#define RING_CONTEXT_CONTROL(ring) ((ring)->mmio_base+0x244) > > + > > +#define CTX_LRI_HEADER_0 0x01 > > +#define CTX_CONTEXT_CONTROL 0x02 > > +#define CTX_RING_HEAD 0x04 > > +#define CTX_RING_TAIL 0x06 > > +#define CTX_RING_BUFFER_START 0x08 > > +#define CTX_RING_BUFFER_CONTROL 0x0a > > +#define CTX_BB_HEAD_U 0x0c > > +#define CTX_BB_HEAD_L 0x0e > > +#define CTX_BB_STATE 0x10 > > +#define CTX_SECOND_BB_HEAD_U 0x12 > > +#define CTX_SECOND_BB_HEAD_L 0x14 > > +#define CTX_SECOND_BB_STATE 0x16 > > +#define CTX_BB_PER_CTX_PTR 0x18 > > +#define CTX_RCS_INDIRECT_CTX 0x1a > > +#define CTX_RCS_INDIRECT_CTX_OFFSET 0x1c > > +#define CTX_LRI_HEADER_1 0x21 > > +#define CTX_CTX_TIMESTAMP 0x22 > > +#define CTX_PDP3_UDW 0x24 > > +#define CTX_PDP3_LDW 0x26 > > +#define CTX_PDP2_UDW 0x28 > > +#define CTX_PDP2_LDW 0x2a > > +#define CTX_PDP1_UDW 0x2c > > +#define CTX_PDP1_LDW 0x2e > > +#define CTX_PDP0_UDW 0x30 > > +#define CTX_PDP0_LDW 0x32 > > +#define CTX_LRI_HEADER_2 0x41 > > +#define CTX_R_PWR_CLK_STATE 0x42 > > +#define CTX_GPGPU_CSR_BASE_ADDRESS 0x44 > > + > > bool intel_enable_execlists(struct drm_device *dev) { > > if (!i915.enable_execlists) > > @@ -54,6 +86,110 @@ bool intel_enable_execlists(struct drm_device > *dev) > > return HAS_LOGICAL_RING_CONTEXTS(dev) && USES_PPGTT(dev); } > > > > +static int > > +populate_lr_context(struct intel_context *ctx, struct > drm_i915_gem_object *ctx_obj, > > + struct intel_engine_cs *ring, struct drm_i915_gem_object > > +*ring_obj) { > > + struct i915_hw_ppgtt *ppgtt = ctx_to_ppgtt(ctx); > > + struct page *page; > > + uint32_t *reg_state; > > + int ret; > > + > > + ret = i915_gem_object_set_to_cpu_domain(ctx_obj, true); > > + if (ret) { > > + DRM_DEBUG_DRIVER("Could not set to CPU domain\n"); > > + return ret; > > + } > > + > > + ret = i915_gem_object_get_pages(ctx_obj); > > + if (ret) { > > + DRM_DEBUG_DRIVER("Could not get object pages\n"); > > + return ret; > > + } > > + > > + i915_gem_object_pin_pages(ctx_obj); > > + > > + /* The second page of the context object contains some fields which > must > > + * be set up prior to the first execution. */ > > + page = i915_gem_object_get_page(ctx_obj, 1); > > + reg_state = kmap_atomic(page); > > + > > + /* A context is actually a big batch buffer with several > MI_LOAD_REGISTER_IMM > > + * commands followed by (reg, value) pairs. The values we are > setting here are > > + * only for the first context restore: on a subsequent save, the GPU > will > > + * recreate this batchbuffer with new values (including all the missing > > + * MI_LOAD_REGISTER_IMM commands that we are not initializing > here). */ > > + if (ring->id == RCS) > > + reg_state[CTX_LRI_HEADER_0] = > MI_LOAD_REGISTER_IMM(14); > > + else > > + reg_state[CTX_LRI_HEADER_0] = > MI_LOAD_REGISTER_IMM(11); > > + reg_state[CTX_LRI_HEADER_0] |= MI_LRI_FORCE_POSTED; > > + reg_state[CTX_CONTEXT_CONTROL] = > RING_CONTEXT_CONTROL(ring); > > + reg_state[CTX_CONTEXT_CONTROL+1] = (1<<3) | > MI_RESTORE_INHIBIT; > > + reg_state[CTX_CONTEXT_CONTROL+1] |= > reg_state[CTX_CONTEXT_CONTROL+1] > > +<< 16; > > If we can, we should probably use _MASKED_BIT_ENABLE() here to make it > more obvious why we're doing the or+shift. ACK > > + reg_state[CTX_RING_HEAD] = RING_HEAD(ring->mmio_base); > > + reg_state[CTX_RING_HEAD+1] = 0; > > + reg_state[CTX_RING_TAIL] = RING_TAIL(ring->mmio_base); > > + reg_state[CTX_RING_TAIL+1] = 0; > > + reg_state[CTX_RING_BUFFER_START] = RING_START(ring- > >mmio_base); > > + reg_state[CTX_RING_BUFFER_START+1] = > i915_gem_obj_ggtt_offset(ring_obj); > > + reg_state[CTX_RING_BUFFER_CONTROL] = RING_CTL(ring- > >mmio_base); > > + reg_state[CTX_RING_BUFFER_CONTROL+1] = (31 * PAGE_SIZE) | > > +RING_VALID; > > The size here doesn't look right to me. Shouldn't it be (number of pages - 1)? > See init_ring_common() But that´s exactly what it is, right? Our ringbuf->size = 32 * PAGE_SIZE; so we are setting 31 * PAGE_SIZE > > + reg_state[CTX_BB_HEAD_U] = ring->mmio_base + 0x168; > > + reg_state[CTX_BB_HEAD_U+1] = 0; > > + reg_state[CTX_BB_HEAD_L] = ring->mmio_base + 0x140; > > + reg_state[CTX_BB_HEAD_L+1] = 0; > > + reg_state[CTX_BB_STATE] = ring->mmio_base + 0x110; > > + reg_state[CTX_BB_STATE+1] = (1<<5); > > + reg_state[CTX_SECOND_BB_HEAD_U] = ring->mmio_base + 0x11c; > > + reg_state[CTX_SECOND_BB_HEAD_U+1] = 0; > > + reg_state[CTX_SECOND_BB_HEAD_L] = ring->mmio_base + 0x114; > > + reg_state[CTX_SECOND_BB_HEAD_L+1] = 0; > > + reg_state[CTX_SECOND_BB_STATE] = ring->mmio_base + 0x118; > > + reg_state[CTX_SECOND_BB_STATE+1] = 0; > > + if (ring->id == RCS) { > > + reg_state[CTX_BB_PER_CTX_PTR] = ring->mmio_base + > 0x1c0; > > + reg_state[CTX_BB_PER_CTX_PTR+1] = 0; > > + reg_state[CTX_RCS_INDIRECT_CTX] = ring->mmio_base + > 0x1c4; > > + reg_state[CTX_RCS_INDIRECT_CTX+1] = 0; > > + reg_state[CTX_RCS_INDIRECT_CTX_OFFSET] = ring- > >mmio_base + 0x1c8; > > + reg_state[CTX_RCS_INDIRECT_CTX_OFFSET+1] = 0; > > + } > > + reg_state[CTX_LRI_HEADER_1] = MI_LOAD_REGISTER_IMM(9); > > + reg_state[CTX_LRI_HEADER_1] |= MI_LRI_FORCE_POSTED; > > + reg_state[CTX_CTX_TIMESTAMP] = ring->mmio_base + 0x3a8; > > + reg_state[CTX_CTX_TIMESTAMP+1] = 0; > > + reg_state[CTX_PDP3_UDW] = GEN8_RING_PDP_UDW(ring, 3); > > + reg_state[CTX_PDP3_LDW] = GEN8_RING_PDP_LDW(ring, 3); > > + reg_state[CTX_PDP2_UDW] = GEN8_RING_PDP_UDW(ring, 2); > > + reg_state[CTX_PDP2_LDW] = GEN8_RING_PDP_LDW(ring, 2); > > + reg_state[CTX_PDP1_UDW] = GEN8_RING_PDP_UDW(ring, 1); > > + reg_state[CTX_PDP1_LDW] = GEN8_RING_PDP_LDW(ring, 1); > > + reg_state[CTX_PDP0_UDW] = GEN8_RING_PDP_UDW(ring, 0); > > + reg_state[CTX_PDP0_LDW] = GEN8_RING_PDP_LDW(ring, 0); > > + reg_state[CTX_PDP3_UDW+1] = (u64)ppgtt->pd_dma_addr[3] >> 32; > > + reg_state[CTX_PDP3_LDW+1] = ppgtt->pd_dma_addr[3]; > > + reg_state[CTX_PDP2_UDW+1] = (u64)ppgtt->pd_dma_addr[2] >> 32; > > + reg_state[CTX_PDP2_LDW+1] = ppgtt->pd_dma_addr[2]; > > + reg_state[CTX_PDP1_UDW+1] = (u64)ppgtt->pd_dma_addr[1] >> 32; > > + reg_state[CTX_PDP1_LDW+1] = ppgtt->pd_dma_addr[1]; > > + reg_state[CTX_PDP0_UDW+1] = (u64)ppgtt->pd_dma_addr[0] >> 32; > > + reg_state[CTX_PDP0_LDW+1] = ppgtt->pd_dma_addr[0]; > > Are we able to use upper_32_bits() and lower_32_bits() for these? ACK -- Oscar
On Mon, Jun 23, 2014 at 05:42:50AM -0700, Mateo Lozano, Oscar wrote: > > -----Original Message----- > > From: Volkin, Bradley D > > > + reg_state[CTX_RING_HEAD+1] = 0; > > > + reg_state[CTX_RING_TAIL] = RING_TAIL(ring->mmio_base); > > > + reg_state[CTX_RING_TAIL+1] = 0; > > > + reg_state[CTX_RING_BUFFER_START] = RING_START(ring- > > >mmio_base); > > > + reg_state[CTX_RING_BUFFER_START+1] = > > i915_gem_obj_ggtt_offset(ring_obj); > > > + reg_state[CTX_RING_BUFFER_CONTROL] = RING_CTL(ring- > > >mmio_base); > > > + reg_state[CTX_RING_BUFFER_CONTROL+1] = (31 * PAGE_SIZE) | > > > +RING_VALID; > > > > The size here doesn't look right to me. Shouldn't it be (number of pages - 1)? > > See init_ring_common() > > But that´s exactly what it is, right? > > Our ringbuf->size = 32 * PAGE_SIZE; > so we are setting 31 * PAGE_SIZE Ok, on closer inspection, the result is correct because the Buffer Length field happens to be bits 20:12. But it looked to me like a size-in-bytes rather than the encoding I expected. So I guess I'd prefer that we do it as in init_ring_common(), using ring_obj->base.size and the RING_NR_PAGES mask so that it's a bit more obvious what's going on. Thanks, Brad > > > > + reg_state[CTX_BB_HEAD_U] = ring->mmio_base + 0x168; > > > + reg_state[CTX_BB_HEAD_U+1] = 0; > > > + reg_state[CTX_BB_HEAD_L] = ring->mmio_base + 0x140; > > > + reg_state[CTX_BB_HEAD_L+1] = 0; > > > + reg_state[CTX_BB_STATE] = ring->mmio_base + 0x110; > > > + reg_state[CTX_BB_STATE+1] = (1<<5); > > > + reg_state[CTX_SECOND_BB_HEAD_U] = ring->mmio_base + 0x11c; > > > + reg_state[CTX_SECOND_BB_HEAD_U+1] = 0; > > > + reg_state[CTX_SECOND_BB_HEAD_L] = ring->mmio_base + 0x114; > > > + reg_state[CTX_SECOND_BB_HEAD_L+1] = 0; > > > + reg_state[CTX_SECOND_BB_STATE] = ring->mmio_base + 0x118; > > > + reg_state[CTX_SECOND_BB_STATE+1] = 0; > > > + if (ring->id == RCS) { > > > + reg_state[CTX_BB_PER_CTX_PTR] = ring->mmio_base + > > 0x1c0; > > > + reg_state[CTX_BB_PER_CTX_PTR+1] = 0; > > > + reg_state[CTX_RCS_INDIRECT_CTX] = ring->mmio_base + > > 0x1c4; > > > + reg_state[CTX_RCS_INDIRECT_CTX+1] = 0; > > > + reg_state[CTX_RCS_INDIRECT_CTX_OFFSET] = ring- > > >mmio_base + 0x1c8; > > > + reg_state[CTX_RCS_INDIRECT_CTX_OFFSET+1] = 0; > > > + } > > > + reg_state[CTX_LRI_HEADER_1] = MI_LOAD_REGISTER_IMM(9); > > > + reg_state[CTX_LRI_HEADER_1] |= MI_LRI_FORCE_POSTED; > > > + reg_state[CTX_CTX_TIMESTAMP] = ring->mmio_base + 0x3a8; > > > + reg_state[CTX_CTX_TIMESTAMP+1] = 0; > > > + reg_state[CTX_PDP3_UDW] = GEN8_RING_PDP_UDW(ring, 3); > > > + reg_state[CTX_PDP3_LDW] = GEN8_RING_PDP_LDW(ring, 3); > > > + reg_state[CTX_PDP2_UDW] = GEN8_RING_PDP_UDW(ring, 2); > > > + reg_state[CTX_PDP2_LDW] = GEN8_RING_PDP_LDW(ring, 2); > > > + reg_state[CTX_PDP1_UDW] = GEN8_RING_PDP_UDW(ring, 1); > > > + reg_state[CTX_PDP1_LDW] = GEN8_RING_PDP_LDW(ring, 1); > > > + reg_state[CTX_PDP0_UDW] = GEN8_RING_PDP_UDW(ring, 0); > > > + reg_state[CTX_PDP0_LDW] = GEN8_RING_PDP_LDW(ring, 0); > > > + reg_state[CTX_PDP3_UDW+1] = (u64)ppgtt->pd_dma_addr[3] >> 32; > > > + reg_state[CTX_PDP3_LDW+1] = ppgtt->pd_dma_addr[3]; > > > + reg_state[CTX_PDP2_UDW+1] = (u64)ppgtt->pd_dma_addr[2] >> 32; > > > + reg_state[CTX_PDP2_LDW+1] = ppgtt->pd_dma_addr[2]; > > > + reg_state[CTX_PDP1_UDW+1] = (u64)ppgtt->pd_dma_addr[1] >> 32; > > > + reg_state[CTX_PDP1_LDW+1] = ppgtt->pd_dma_addr[1]; > > > + reg_state[CTX_PDP0_UDW+1] = (u64)ppgtt->pd_dma_addr[0] >> 32; > > > + reg_state[CTX_PDP0_LDW+1] = ppgtt->pd_dma_addr[0]; > > > > Are we able to use upper_32_bits() and lower_32_bits() for these? > > ACK > > -- Oscar
> -----Original Message----- > From: Volkin, Bradley D > Sent: Monday, June 23, 2014 4:06 PM > To: Mateo Lozano, Oscar > Cc: intel-gfx@lists.freedesktop.org > Subject: Re: [Intel-gfx] [PATCH 12/53] drm/i915/bdw: Populate LR contexts > (somewhat) > > On Mon, Jun 23, 2014 at 05:42:50AM -0700, Mateo Lozano, Oscar wrote: > > > -----Original Message----- > > > From: Volkin, Bradley D > > > > + reg_state[CTX_RING_HEAD+1] = 0; > > > > + reg_state[CTX_RING_TAIL] = RING_TAIL(ring->mmio_base); > > > > + reg_state[CTX_RING_TAIL+1] = 0; > > > > + reg_state[CTX_RING_BUFFER_START] = RING_START(ring- > > > >mmio_base); > > > > + reg_state[CTX_RING_BUFFER_START+1] = > > > i915_gem_obj_ggtt_offset(ring_obj); > > > > + reg_state[CTX_RING_BUFFER_CONTROL] = RING_CTL(ring- > > > >mmio_base); > > > > + reg_state[CTX_RING_BUFFER_CONTROL+1] = (31 * PAGE_SIZE) | > > > > +RING_VALID; > > > > > > The size here doesn't look right to me. Shouldn't it be (number of pages - > 1)? > > > See init_ring_common() > > > > But that´s exactly what it is, right? > > > > Our ringbuf->size = 32 * PAGE_SIZE; > > so we are setting 31 * PAGE_SIZE > > Ok, on closer inspection, the result is correct because the Buffer Length field > happens to be bits 20:12. But it looked to me like a size-in-bytes rather than > the encoding I expected. So I guess I'd prefer that we do it as in > init_ring_common(), using ring_obj->base.size and the RING_NR_PAGES > mask so that it's a bit more obvious what's going on. > > Thanks, > Brad Yeah, it probably looks less magical that way. Ok, will do.
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index 286f05c..9c8692a 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -277,6 +277,7 @@ * address/value pairs. Don't overdue it, though, x <= 2^4 must hold! */ #define MI_LOAD_REGISTER_IMM(x) MI_INSTR(0x22, 2*(x)-1) +#define MI_LRI_FORCE_POSTED (1<<12) #define MI_STORE_REGISTER_MEM(x) MI_INSTR(0x24, 2*(x)-1) #define MI_STORE_REGISTER_MEM_GEN8(x) MI_INSTR(0x24, 3*(x)-1) #define MI_SRM_LRM_GLOBAL_GTT (1<<22) diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index b3a23e0..b96bb45 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -46,6 +46,38 @@ #define GEN8_LR_CONTEXT_ALIGN 4096 +#define RING_ELSP(ring) ((ring)->mmio_base+0x230) +#define RING_CONTEXT_CONTROL(ring) ((ring)->mmio_base+0x244) + +#define CTX_LRI_HEADER_0 0x01 +#define CTX_CONTEXT_CONTROL 0x02 +#define CTX_RING_HEAD 0x04 +#define CTX_RING_TAIL 0x06 +#define CTX_RING_BUFFER_START 0x08 +#define CTX_RING_BUFFER_CONTROL 0x0a +#define CTX_BB_HEAD_U 0x0c +#define CTX_BB_HEAD_L 0x0e +#define CTX_BB_STATE 0x10 +#define CTX_SECOND_BB_HEAD_U 0x12 +#define CTX_SECOND_BB_HEAD_L 0x14 +#define CTX_SECOND_BB_STATE 0x16 +#define CTX_BB_PER_CTX_PTR 0x18 +#define CTX_RCS_INDIRECT_CTX 0x1a +#define CTX_RCS_INDIRECT_CTX_OFFSET 0x1c +#define CTX_LRI_HEADER_1 0x21 +#define CTX_CTX_TIMESTAMP 0x22 +#define CTX_PDP3_UDW 0x24 +#define CTX_PDP3_LDW 0x26 +#define CTX_PDP2_UDW 0x28 +#define CTX_PDP2_LDW 0x2a +#define CTX_PDP1_UDW 0x2c +#define CTX_PDP1_LDW 0x2e +#define CTX_PDP0_UDW 0x30 +#define CTX_PDP0_LDW 0x32 +#define CTX_LRI_HEADER_2 0x41 +#define CTX_R_PWR_CLK_STATE 0x42 +#define CTX_GPGPU_CSR_BASE_ADDRESS 0x44 + bool intel_enable_execlists(struct drm_device *dev) { if (!i915.enable_execlists) @@ -54,6 +86,110 @@ bool intel_enable_execlists(struct drm_device *dev) return HAS_LOGICAL_RING_CONTEXTS(dev) && USES_PPGTT(dev); } +static int +populate_lr_context(struct intel_context *ctx, struct drm_i915_gem_object *ctx_obj, + struct intel_engine_cs *ring, struct drm_i915_gem_object *ring_obj) +{ + struct i915_hw_ppgtt *ppgtt = ctx_to_ppgtt(ctx); + struct page *page; + uint32_t *reg_state; + int ret; + + ret = i915_gem_object_set_to_cpu_domain(ctx_obj, true); + if (ret) { + DRM_DEBUG_DRIVER("Could not set to CPU domain\n"); + return ret; + } + + ret = i915_gem_object_get_pages(ctx_obj); + if (ret) { + DRM_DEBUG_DRIVER("Could not get object pages\n"); + return ret; + } + + i915_gem_object_pin_pages(ctx_obj); + + /* The second page of the context object contains some fields which must + * be set up prior to the first execution. */ + page = i915_gem_object_get_page(ctx_obj, 1); + reg_state = kmap_atomic(page); + + /* A context is actually a big batch buffer with several MI_LOAD_REGISTER_IMM + * commands followed by (reg, value) pairs. The values we are setting here are + * only for the first context restore: on a subsequent save, the GPU will + * recreate this batchbuffer with new values (including all the missing + * MI_LOAD_REGISTER_IMM commands that we are not initializing here). */ + if (ring->id == RCS) + reg_state[CTX_LRI_HEADER_0] = MI_LOAD_REGISTER_IMM(14); + else + reg_state[CTX_LRI_HEADER_0] = MI_LOAD_REGISTER_IMM(11); + reg_state[CTX_LRI_HEADER_0] |= MI_LRI_FORCE_POSTED; + reg_state[CTX_CONTEXT_CONTROL] = RING_CONTEXT_CONTROL(ring); + reg_state[CTX_CONTEXT_CONTROL+1] = (1<<3) | MI_RESTORE_INHIBIT; + reg_state[CTX_CONTEXT_CONTROL+1] |= reg_state[CTX_CONTEXT_CONTROL+1] << 16; + reg_state[CTX_RING_HEAD] = RING_HEAD(ring->mmio_base); + reg_state[CTX_RING_HEAD+1] = 0; + reg_state[CTX_RING_TAIL] = RING_TAIL(ring->mmio_base); + reg_state[CTX_RING_TAIL+1] = 0; + reg_state[CTX_RING_BUFFER_START] = RING_START(ring->mmio_base); + reg_state[CTX_RING_BUFFER_START+1] = i915_gem_obj_ggtt_offset(ring_obj); + reg_state[CTX_RING_BUFFER_CONTROL] = RING_CTL(ring->mmio_base); + reg_state[CTX_RING_BUFFER_CONTROL+1] = (31 * PAGE_SIZE) | RING_VALID; + reg_state[CTX_BB_HEAD_U] = ring->mmio_base + 0x168; + reg_state[CTX_BB_HEAD_U+1] = 0; + reg_state[CTX_BB_HEAD_L] = ring->mmio_base + 0x140; + reg_state[CTX_BB_HEAD_L+1] = 0; + reg_state[CTX_BB_STATE] = ring->mmio_base + 0x110; + reg_state[CTX_BB_STATE+1] = (1<<5); + reg_state[CTX_SECOND_BB_HEAD_U] = ring->mmio_base + 0x11c; + reg_state[CTX_SECOND_BB_HEAD_U+1] = 0; + reg_state[CTX_SECOND_BB_HEAD_L] = ring->mmio_base + 0x114; + reg_state[CTX_SECOND_BB_HEAD_L+1] = 0; + reg_state[CTX_SECOND_BB_STATE] = ring->mmio_base + 0x118; + reg_state[CTX_SECOND_BB_STATE+1] = 0; + if (ring->id == RCS) { + reg_state[CTX_BB_PER_CTX_PTR] = ring->mmio_base + 0x1c0; + reg_state[CTX_BB_PER_CTX_PTR+1] = 0; + reg_state[CTX_RCS_INDIRECT_CTX] = ring->mmio_base + 0x1c4; + reg_state[CTX_RCS_INDIRECT_CTX+1] = 0; + reg_state[CTX_RCS_INDIRECT_CTX_OFFSET] = ring->mmio_base + 0x1c8; + reg_state[CTX_RCS_INDIRECT_CTX_OFFSET+1] = 0; + } + reg_state[CTX_LRI_HEADER_1] = MI_LOAD_REGISTER_IMM(9); + reg_state[CTX_LRI_HEADER_1] |= MI_LRI_FORCE_POSTED; + reg_state[CTX_CTX_TIMESTAMP] = ring->mmio_base + 0x3a8; + reg_state[CTX_CTX_TIMESTAMP+1] = 0; + reg_state[CTX_PDP3_UDW] = GEN8_RING_PDP_UDW(ring, 3); + reg_state[CTX_PDP3_LDW] = GEN8_RING_PDP_LDW(ring, 3); + reg_state[CTX_PDP2_UDW] = GEN8_RING_PDP_UDW(ring, 2); + reg_state[CTX_PDP2_LDW] = GEN8_RING_PDP_LDW(ring, 2); + reg_state[CTX_PDP1_UDW] = GEN8_RING_PDP_UDW(ring, 1); + reg_state[CTX_PDP1_LDW] = GEN8_RING_PDP_LDW(ring, 1); + reg_state[CTX_PDP0_UDW] = GEN8_RING_PDP_UDW(ring, 0); + reg_state[CTX_PDP0_LDW] = GEN8_RING_PDP_LDW(ring, 0); + reg_state[CTX_PDP3_UDW+1] = (u64)ppgtt->pd_dma_addr[3] >> 32; + reg_state[CTX_PDP3_LDW+1] = ppgtt->pd_dma_addr[3]; + reg_state[CTX_PDP2_UDW+1] = (u64)ppgtt->pd_dma_addr[2] >> 32; + reg_state[CTX_PDP2_LDW+1] = ppgtt->pd_dma_addr[2]; + reg_state[CTX_PDP1_UDW+1] = (u64)ppgtt->pd_dma_addr[1] >> 32; + reg_state[CTX_PDP1_LDW+1] = ppgtt->pd_dma_addr[1]; + reg_state[CTX_PDP0_UDW+1] = (u64)ppgtt->pd_dma_addr[0] >> 32; + reg_state[CTX_PDP0_LDW+1] = ppgtt->pd_dma_addr[0]; + if (ring->id == RCS) { + reg_state[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1); + reg_state[CTX_R_PWR_CLK_STATE] = 0x20c8; + reg_state[CTX_R_PWR_CLK_STATE+1] = 0; + } + + kunmap_atomic(reg_state); + + ctx_obj->dirty = 1; + set_page_dirty(page); + i915_gem_object_unpin_pages(ctx_obj); + + return 0; +} + void intel_lr_context_free(struct intel_context *ctx) { int i; @@ -145,14 +281,24 @@ int intel_lr_context_deferred_create(struct intel_context *ctx, if (ret) { DRM_DEBUG_DRIVER("Failed to allocate ringbuffer obj %s: %d\n", ring->name, ret); - kfree(ringbuf); - i915_gem_object_ggtt_unpin(ctx_obj); - drm_gem_object_unreference(&ctx_obj->base); - return ret; + goto error; + } + + ret = populate_lr_context(ctx, ctx_obj, ring, ringbuf->obj); + if (ret) { + DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret); + intel_destroy_ring_buffer(ringbuf); + goto error; } ctx->engine[ring->id].ringbuf = ringbuf; ctx->engine[ring->id].obj = ctx_obj; return 0; + +error: + kfree(ringbuf); + i915_gem_object_ggtt_unpin(ctx_obj); + drm_gem_object_unreference(&ctx_obj->base); + return ret; }