Message ID: 1398698528-25256-1-git-send-email-bradley.d.volkin@intel.com (mailing list archive)
State: New, archived
On Mon, Apr 28, 2014 at 08:22:08AM -0700, bradley.d.volkin@intel.com wrote:
> From: Brad Volkin <bradley.d.volkin@intel.com>
>
> For clients that submit large batch buffers the command parser has
> a substantial impact on performance. On my HSW ULT system performance
> drops as much as ~20% on some tests. Most of the time is spent in the
> command lookup code. Converting that from the current naive search to
> a hash table lookup reduces the performance impact by as much as ~10%.
>
> The choice of value for I915_CMD_HASH_ORDER allows all commands
> currently used in the parser tables to hash to their own bucket (except
> for one collision on the render ring). The tradeoff is that it wastes
> memory. Because the opcodes for the commands in the tables are not
> particularly well distributed, reducing the order still leaves many
> buckets empty. The increased collisions don't seem to have a huge
> impact on the performance gain, but for now anyhow, the parser trades
> memory for performance.
>
> Signed-off-by: Brad Volkin <bradley.d.volkin@intel.com>

Nice. One idea on top which could be worth a shot is a bloomfilter to
handle all the non-special cases without a (likely) cache miss in the
hashtable. The per-ring bloomfilter would be only loaded once (and if we
place it nearby other stuff the cmdparser needs anyway even that is
amortized).

Also Chris mentioned that blitter loads under X are about the worst case
wrt impact of the cmdparser. Benchmarking x11perf might be a useful
extreme testcase for optimizing this. I guess Chris will jump in with
more ideas?

Thanks, Daniel

> ---
>  drivers/gpu/drm/i915/i915_cmd_parser.c  | 136 ++++++++++++++++++++++++--------
>  drivers/gpu/drm/i915/i915_drv.h         |   1 +
>  drivers/gpu/drm/i915/intel_ringbuffer.c |   2 +
>  drivers/gpu/drm/i915/intel_ringbuffer.h |  11 ++-
>  4 files changed, 116 insertions(+), 34 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
> index 9bac097..9dca899 100644
> --- a/drivers/gpu/drm/i915/i915_cmd_parser.c
> +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
> @@ -498,16 +498,18 @@ static u32 gen7_blt_get_cmd_length_mask(u32 cmd_header)
>          return 0;
>  }
>
> -static bool validate_cmds_sorted(struct intel_ring_buffer *ring)
> +static bool validate_cmds_sorted(struct intel_ring_buffer *ring,
> +                                 const struct drm_i915_cmd_table *cmd_tables,
> +                                 int cmd_table_count)
>  {
>          int i;
>          bool ret = true;
>
> -        if (!ring->cmd_tables || ring->cmd_table_count == 0)
> +        if (!cmd_tables || cmd_table_count == 0)
>                  return true;
>
> -        for (i = 0; i < ring->cmd_table_count; i++) {
> -                const struct drm_i915_cmd_table *table = &ring->cmd_tables[i];
> +        for (i = 0; i < cmd_table_count; i++) {
> +                const struct drm_i915_cmd_table *table = &cmd_tables[i];
>                  u32 previous = 0;
>                  int j;
>
> @@ -557,6 +559,60 @@ static bool validate_regs_sorted(struct intel_ring_buffer *ring)
>                                      ring->master_reg_count);
>  }
>
> +struct cmd_node {
> +        const struct drm_i915_cmd_descriptor *desc;
> +        struct hlist_node node;
> +};
> +
> +/*
> + * Different command ranges have different numbers of bits for the opcode.
> + * In order to use the opcode bits, and only the opcode bits, for the hash key
> + * we should use the MI_* command opcode mask (since those commands use the
> + * fewest bits for the opcode.)
> + */
> +#define CMD_HASH_MASK STD_MI_OPCODE_MASK
> +
> +static int init_hash_table(struct intel_ring_buffer *ring,
> +                           const struct drm_i915_cmd_table *cmd_tables,
> +                           int cmd_table_count)
> +{
> +        int i, j;
> +
> +        hash_init(ring->cmd_hash);
> +
> +        for (i = 0; i < cmd_table_count; i++) {
> +                const struct drm_i915_cmd_table *table = &cmd_tables[i];
> +
> +                for (j = 0; j < table->count; j++) {
> +                        const struct drm_i915_cmd_descriptor *desc =
> +                                &table->table[j];
> +                        struct cmd_node *desc_node =
> +                                kmalloc(sizeof(*desc_node), GFP_KERNEL);
> +
> +                        if (!desc_node)
> +                                return -ENOMEM;
> +
> +                        desc_node->desc = desc;
> +                        hash_add(ring->cmd_hash, &desc_node->node,
> +                                 desc->cmd.value & CMD_HASH_MASK);
> +                }
> +        }
> +
> +        return 0;
> +}
> +
> +static void fini_hash_table(struct intel_ring_buffer *ring)
> +{
> +        struct hlist_node *tmp;
> +        struct cmd_node *desc_node;
> +        int i;
> +
> +        hash_for_each_safe(ring->cmd_hash, i, tmp, desc_node, node) {
> +                hash_del(&desc_node->node);
> +                kfree(desc_node);
> +        }
> +}
> +
>  /**
>   * i915_cmd_parser_init_ring() - set cmd parser related fields for a ringbuffer
>   * @ring: the ringbuffer to initialize
> @@ -567,18 +623,21 @@ static bool validate_regs_sorted(struct intel_ring_buffer *ring)
>   */
>  void i915_cmd_parser_init_ring(struct intel_ring_buffer *ring)
>  {
> +        const struct drm_i915_cmd_table *cmd_tables;
> +        int cmd_table_count;
> +
>          if (!IS_GEN7(ring->dev))
>                  return;
>
>          switch (ring->id) {
>          case RCS:
>                  if (IS_HASWELL(ring->dev)) {
> -                        ring->cmd_tables = hsw_render_ring_cmds;
> -                        ring->cmd_table_count =
> +                        cmd_tables = hsw_render_ring_cmds;
> +                        cmd_table_count =
>                                  ARRAY_SIZE(hsw_render_ring_cmds);
>                  } else {
> -                        ring->cmd_tables = gen7_render_cmds;
> -                        ring->cmd_table_count = ARRAY_SIZE(gen7_render_cmds);
> +                        cmd_tables = gen7_render_cmds;
> +                        cmd_table_count = ARRAY_SIZE(gen7_render_cmds);
>                  }
>
>                  ring->reg_table = gen7_render_regs;
> @@ -595,17 +654,17 @@ void i915_cmd_parser_init_ring(struct intel_ring_buffer *ring)
>                  ring->get_cmd_length_mask = gen7_render_get_cmd_length_mask;
>                  break;
>          case VCS:
> -                ring->cmd_tables = gen7_video_cmds;
> -                ring->cmd_table_count = ARRAY_SIZE(gen7_video_cmds);
> +                cmd_tables = gen7_video_cmds;
> +                cmd_table_count = ARRAY_SIZE(gen7_video_cmds);
>                  ring->get_cmd_length_mask = gen7_bsd_get_cmd_length_mask;
>                  break;
>          case BCS:
>                  if (IS_HASWELL(ring->dev)) {
> -                        ring->cmd_tables = hsw_blt_ring_cmds;
> -                        ring->cmd_table_count = ARRAY_SIZE(hsw_blt_ring_cmds);
> +                        cmd_tables = hsw_blt_ring_cmds;
> +                        cmd_table_count = ARRAY_SIZE(hsw_blt_ring_cmds);
>                  } else {
> -                        ring->cmd_tables = gen7_blt_cmds;
> -                        ring->cmd_table_count = ARRAY_SIZE(gen7_blt_cmds);
> +                        cmd_tables = gen7_blt_cmds;
> +                        cmd_table_count = ARRAY_SIZE(gen7_blt_cmds);
>                  }
>
>                  ring->reg_table = gen7_blt_regs;
> @@ -622,8 +681,8 @@ void i915_cmd_parser_init_ring(struct intel_ring_buffer *ring)
>                  ring->get_cmd_length_mask = gen7_blt_get_cmd_length_mask;
>                  break;
>          case VECS:
> -                ring->cmd_tables = hsw_vebox_cmds;
> -                ring->cmd_table_count = ARRAY_SIZE(hsw_vebox_cmds);
> +                cmd_tables = hsw_vebox_cmds;
> +                cmd_table_count = ARRAY_SIZE(hsw_vebox_cmds);
>                  /* VECS can use the same length_mask function as VCS */
>                  ring->get_cmd_length_mask = gen7_bsd_get_cmd_length_mask;
>                  break;
> @@ -633,18 +692,38 @@ void i915_cmd_parser_init_ring(struct intel_ring_buffer *ring)
>                  BUG();
>          }
>
> -        BUG_ON(!validate_cmds_sorted(ring));
> +        BUG_ON(!validate_cmds_sorted(ring, cmd_tables, cmd_table_count));
>          BUG_ON(!validate_regs_sorted(ring));
> +
> +        BUG_ON(init_hash_table(ring, cmd_tables, cmd_table_count));
> +
> +        ring->needs_cmd_parser = true;
> +}
> +
> +/**
> + * i915_cmd_parser_fini_ring() - clean up cmd parser related fields
> + * @ring: the ringbuffer to clean up
> + *
> + * Releases any resources related to command parsing that may have been
> + * initialized for the specified ring.
> + */
> +void i915_cmd_parser_fini_ring(struct intel_ring_buffer *ring)
> +{
> +        if (!ring->needs_cmd_parser)
> +                return;
> +
> +        fini_hash_table(ring);
>  }
>
>  static const struct drm_i915_cmd_descriptor*
> -find_cmd_in_table(const struct drm_i915_cmd_table *table,
> +find_cmd_in_table(struct intel_ring_buffer *ring,
>                    u32 cmd_header)
>  {
> -        int i;
> +        struct cmd_node *desc_node;
>
> -        for (i = 0; i < table->count; i++) {
> -                const struct drm_i915_cmd_descriptor *desc = &table->table[i];
> +        hash_for_each_possible(ring->cmd_hash, desc_node, node,
> +                               cmd_header & CMD_HASH_MASK) {
> +                const struct drm_i915_cmd_descriptor *desc = desc_node->desc;
>                  u32 masked_cmd = desc->cmd.mask & cmd_header;
>                  u32 masked_value = desc->cmd.value & desc->cmd.mask;
>
> @@ -668,16 +747,12 @@ find_cmd(struct intel_ring_buffer *ring,
>           u32 cmd_header,
>           struct drm_i915_cmd_descriptor *default_desc)
>  {
> +        const struct drm_i915_cmd_descriptor *desc;
>          u32 mask;
> -        int i;
> -
> -        for (i = 0; i < ring->cmd_table_count; i++) {
> -                const struct drm_i915_cmd_descriptor *desc;
>
> -                desc = find_cmd_in_table(&ring->cmd_tables[i], cmd_header);
> -                if (desc)
> -                        return desc;
> -        }
> +        desc = find_cmd_in_table(ring, cmd_header);
> +        if (desc)
> +                return desc;
>
>          mask = ring->get_cmd_length_mask(cmd_header);
>          if (!mask)
> @@ -748,8 +823,7 @@ bool i915_needs_cmd_parser(struct intel_ring_buffer *ring)
>  {
>          struct drm_i915_private *dev_priv = ring->dev->dev_private;
>
> -        /* No command tables indicates a platform without parsing */
> -        if (!ring->cmd_tables)
> +        if (!ring->needs_cmd_parser)
>                  return false;
>
>          /*
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 7d6acb4..8c8388f 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2414,6 +2414,7 @@ const char *i915_cache_level_str(int type);
>  /* i915_cmd_parser.c */
>  int i915_cmd_parser_get_version(void);
>  void i915_cmd_parser_init_ring(struct intel_ring_buffer *ring);
> +void i915_cmd_parser_fini_ring(struct intel_ring_buffer *ring);
>  bool i915_needs_cmd_parser(struct intel_ring_buffer *ring);
>  int i915_parse_cmds(struct intel_ring_buffer *ring,
>                      struct drm_i915_gem_object *batch_obj,
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index eb3dd26..06fa9a3 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -1469,6 +1469,8 @@ void intel_cleanup_ring_buffer(struct intel_ring_buffer *ring)
>                  ring->cleanup(ring);
>
>          cleanup_status_page(ring);
> +
> +        i915_cmd_parser_fini_ring(ring);
>  }
>
>  static int intel_ring_wait_request(struct intel_ring_buffer *ring, int n)
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 413cdc7..48daa91 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -1,6 +1,10 @@
>  #ifndef _INTEL_RINGBUFFER_H_
>  #define _INTEL_RINGBUFFER_H_
>
> +#include <linux/hashtable.h>
> +
> +#define I915_CMD_HASH_ORDER 9
> +
>  /*
>   * Gen2 BSpec "1. Programming Environment" / 1.4.4.6 "Ring Buffer Use"
>   * Gen3 BSpec "vol1c Memory Interface Functions" / 2.3.4.5 "Ring Buffer Use"
> @@ -164,12 +168,13 @@ struct intel_ring_buffer {
>                  volatile u32 *cpu_page;
>          } scratch;
>
> +        bool needs_cmd_parser;
> +
>          /*
> -         * Tables of commands the command parser needs to know about
> +         * Table of commands the command parser needs to know about
>           * for this ring.
>           */
> -        const struct drm_i915_cmd_table *cmd_tables;
> -        int cmd_table_count;
> +        DECLARE_HASHTABLE(cmd_hash, I915_CMD_HASH_ORDER);
>
>          /*
>           * Table of registers allowed in commands that read/write registers.
> --
> 1.8.3.2
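
As a rough illustration of the bloomfilter idea above -- not part of the
patch; the filter field, the helper names cmd_filter_add() and
cmd_maybe_in_table(), and the single-hash-function design are all
assumptions -- a per-ring pre-check might look like this:

#include <linux/bitops.h>       /* BIT_ULL() */
#include <linux/hash.h>         /* hash_32() */

/* Hypothetical: a 64-bit filter that would live in struct intel_ring_buffer. */
static u64 cmd_filter;

/* Called once per descriptor at init time, next to hash_add(). */
static void cmd_filter_add(u32 cmd_value)
{
        cmd_filter |= BIT_ULL(hash_32(cmd_value & CMD_HASH_MASK, 6));
}

/*
 * Called from find_cmd() before find_cmd_in_table(). A clear bit proves the
 * opcode is not in the tables (false positives are possible, false negatives
 * are not), so the common case falls through to the default length-mask path
 * without ever loading a hash bucket.
 */
static bool cmd_maybe_in_table(u32 cmd_header)
{
        return cmd_filter & BIT_ULL(hash_32(cmd_header & CMD_HASH_MASK, 6));
}

Whether the extra branch actually pays off would of course need the kind of
x11perf-style measurement discussed in the thread.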
On Mon, Apr 28, 2014 at 08:22:08AM -0700, Volkin, Bradley D wrote:
> From: Brad Volkin <bradley.d.volkin@intel.com>
>
> For clients that submit large batch buffers the command parser has
> a substantial impact on performance. On my HSW ULT system performance
> drops as much as ~20% on some tests. Most of the time is spent in the
> command lookup code. Converting that from the current naive search to
> a hash table lookup reduces the performance impact by as much as ~10%.

Tvrtko pointed out that what I wrote here is a bit ambiguous. To clarify:

Without the patch, perf drops ~20%.
With the patch, perf drops ~10%.

Brad

> [...]
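
To put a number on the commit message's memory tradeoff:
DECLARE_HASHTABLE(name, bits) from <linux/hashtable.h> expands to an array
of 1 << bits hlist heads, so the order chosen in the patch costs roughly
4 KiB of bucket pointers per ring on a 64-bit kernel:

#include <linux/hashtable.h>

#define I915_CMD_HASH_ORDER 9

/* Expands to: struct hlist_head cmd_hash[1 << 9], i.e. 512 buckets. */
DECLARE_HASHTABLE(cmd_hash, I915_CMD_HASH_ORDER);

/*
 * Each bucket is a single head pointer, so on a 64-bit kernel this is
 * 512 * 8 bytes = 4 KiB per ring -- mostly empty buckets, as the commit
 * message notes, since the parser tables hold far fewer than 512 commands.
 */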
On Mon, Apr 28, 2014 at 08:22:08AM -0700, bradley.d.volkin@intel.com wrote:
> From: Brad Volkin <bradley.d.volkin@intel.com>
>
> [...]
>
> The choice of value for I915_CMD_HASH_ORDER allows all commands
> currently used in the parser tables to hash to their own bucket (except
> for one collision on the render ring). The tradeoff is that it wastes
> memory. Because the opcodes for the commands in the tables are not
> particularly well distributed, reducing the order still leaves many
> buckets empty. The increased collisions don't seem to have a huge
> impact on the performance gain, but for now anyhow, the parser trades
> memory for performance.

For the collisions have you looked into pre-munging the key a bit so that
we use more bits? A few shifts and xors shouldn't affect perf much really.

Also since the tables are mostly empty we could just overflow to the next
hashtable entry, but unfortunately that would require a bit of custom
insert and lookup code.

Finally if we manage to get 0 collisions a WARN_ON would be good for
that, to make sure we don't accidentally regress.

Anyway just a few more ideas.
-Daniel

> [...]
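
A minimal sketch of the pre-munging idea, assuming a made-up mixing helper
(cmd_hash_key() and its shift amounts are illustrative, not a tested
tuning): insert and lookup must both feed the header through the same
function, and only bits that CMD_HASH_MASK keeps -- bits fixed in every
descriptor -- may contribute to the key, otherwise a command could land in
a different bucket than the one its lookup probes.

/* Hypothetical key-mixing helper; insert and lookup must both use it. */
static inline u32 cmd_hash_key(u32 cmd_header)
{
        u32 key = cmd_header & CMD_HASH_MASK;

        /*
         * Smear the high opcode bits across the word before hash_add() /
         * hash_for_each_possible() reduce the key to a bucket index.
         */
        key ^= key >> 11;
        key ^= key >> 23;
        return key;
}

init_hash_table() would then pass cmd_hash_key(desc->cmd.value) to
hash_add(), and find_cmd_in_table() would probe with
cmd_hash_key(cmd_header).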
On Mon, Apr 28, 2014 at 08:42:56AM -0700, Daniel Vetter wrote:
> On Mon, Apr 28, 2014 at 08:22:08AM -0700, bradley.d.volkin@intel.com wrote:
> > From: Brad Volkin <bradley.d.volkin@intel.com>
> >
> > [...]
>
> Nice. One idea on top which could be worth a shot is a bloomfilter to
> handle all the non-special cases without a (likely) cache miss in the
> hashtable. The per-ring bloomfilter would be only loaded once (and if we
> place it nearby other stuff the cmdparser needs anyway even that is
> amortized).

Good suggestion. Noted.

> Also Chris mentioned that blitter loads under X are about the worst case
> wrt impact of the cmdparser. Benchmarking x11perf might be a useful
> extreme testcase for optimizing this. I guess Chris will jump in with more
> ideas?

Ok, I'll see how x11perf looks with and without this patch as a starting
point.

Brad

> Thanks, Daniel
>
> > [...]
On Mon, Apr 28, 2014 at 08:53:30AM -0700, Daniel Vetter wrote:
> On Mon, Apr 28, 2014 at 08:22:08AM -0700, bradley.d.volkin@intel.com wrote:
> > From: Brad Volkin <bradley.d.volkin@intel.com>
> >
> > [...]
>
> For the collisions have you looked into pre-munging the key a bit so that
> we use more bits? A few shifts and xors shouldn't affect perf much really.

I looked at this briefly but didn't find a substantial improvement. The
basic patch improved things enough that I wanted to just get it out. I
can look into this more, but I'd like to think about implementing the
batch buffer copy portion next. I don't want to optimize this, make people
happy, and then introduce another perf drop from the copy. Better to just
take the full hit now and then continue the improvements. Sound reasonable?

Brad

> Also since the tables are mostly empty we could just overflow to the next
> hashtable entry, but unfortunately that would require a bit of custom
> insert and lookup code.
>
> Finally if we manage to get 0 collisions a WARN_ON would be good for
> that, to make sure we don't accidentally regress.
>
> Anyway just a few more ideas.
> -Daniel
>
> > [...]
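
For reference, the "overflow to the next hashtable entry" idea mentioned
above is classic open addressing. A purely hypothetical sketch of the
custom insert it would need (cmd_slots and cmd_slot_insert() are invented
names; lookup would walk the same probe sequence, mask-matching each
occupied slot until it hits an empty one):

#include <linux/hash.h>         /* hash_32() */
#include <linux/kernel.h>       /* ARRAY_SIZE() */

/* Hypothetical flat slot array in place of the hlist-based buckets. */
static const struct drm_i915_cmd_descriptor *cmd_slots[1 << I915_CMD_HASH_ORDER];

static int cmd_slot_insert(const struct drm_i915_cmd_descriptor *desc)
{
        u32 home = hash_32(desc->cmd.value & CMD_HASH_MASK, I915_CMD_HASH_ORDER);
        u32 probe;

        for (probe = 0; probe < ARRAY_SIZE(cmd_slots); probe++) {
                /* Linear probing; the table size is a power of two. */
                u32 slot = (home + probe) & (ARRAY_SIZE(cmd_slots) - 1);

                if (!cmd_slots[slot]) {
                        cmd_slots[slot] = desc;
                        return 0;
                }
        }

        return -ENOSPC; /* table full; can't happen with ~100 commands in 512 slots */
}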
On Mon, Apr 28, 2014 at 6:07 PM, Volkin, Bradley D <bradley.d.volkin@intel.com> wrote: > On Mon, Apr 28, 2014 at 08:53:30AM -0700, Daniel Vetter wrote: >> On Mon, Apr 28, 2014 at 08:22:08AM -0700, bradley.d.volkin@intel.com wrote: >> > From: Brad Volkin <bradley.d.volkin@intel.com> >> > >> > For clients that submit large batch buffers the command parser has >> > a substantial impact on performance. On my HSW ULT system performance >> > drops as much as ~20% on some tests. Most of the time is spent in the >> > command lookup code. Converting that from the current naive search to >> > a hash table lookup reduces the performance impact by as much as ~10%. >> > >> > The choice of value for I915_CMD_HASH_ORDER allows all commands >> > currently used in the parser tables to hash to their own bucket (except >> > for one collision on the render ring). The tradeoff is that it wastes >> > memory. Because the opcodes for the commands in the tables are not >> > particularly well distributed, reducing the order still leaves many >> > buckets empty. The increased collisions don't seem to have a huge >> > impact on the performance gain, but for now anyhow, the parser trades >> > memory for performance. >> >> For the collisions have you looked into pre-munging the key a bit so that >> we use more bits? A few shifts and xors shouldn't affect perf much really. > > I looked at this briefly but didn't find a substantial improvement. The > basic patch improved things enough that I wanted to just get it out. I > can look into this more, but I'd like to think about implementing the > batch buffer copy portion next. I don't want to optimize this, make people > happy, and then introduce another perf drop from the copy. Better to just > take the full hit now and then continue the improvements. Sound reasonable? Yeah, makes sense. Like I've said just throwing around ideas. -Daniel
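For concreteness, a minimal sketch of the key pre-munging idea discussed above, assuming a hypothetical cmd_hash_key() helper used by both init_hash_table() and find_cmd_in_table() in place of the bare cmd_header & CMD_HASH_MASK expression:

static inline u32 cmd_hash_key(u32 cmd_header)
{
	u32 key = cmd_header & CMD_HASH_MASK;

	/*
	 * CMD_HASH_MASK keeps only bits 31:23, so fold those bits down
	 * across the word with a couple of shifts and xors. Insert and
	 * lookup must apply the same munging, or a command would land
	 * in a different bucket than the one it was added to.
	 */
	key ^= key >> 16;
	key ^= key >> 8;
	return key;
}

Since the key still depends only on the masked opcode bits, the variable low bits of MI command headers cannot perturb the bucket choice.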
Could someone help to review this patch please? It provides a nice improvement to the command parser's performance, so I'd like to get this one in.

Thanks,
Brad

On Mon, Apr 28, 2014 at 08:22:08AM -0700, Volkin, Bradley D wrote:
> From: Brad Volkin <bradley.d.volkin@intel.com>
>
> [snip]
Hi Brad, On 04/28/2014 04:22 PM, bradley.d.volkin@intel.com wrote: [snip] > - BUG_ON(!validate_cmds_sorted(ring)); > + BUG_ON(!validate_cmds_sorted(ring, cmd_tables, cmd_table_count)); > BUG_ON(!validate_regs_sorted(ring)); > + > + BUG_ON(init_hash_table(ring, cmd_tables, cmd_table_count)); Is a BUG_ON a bit harsh since the above fails only on ENOMEM condition? If the concern is not allowing any command execution if parser setup has failed, it would be nicer to the system as whole to just keep rejecting everything, but let the rest of the kernel live to enable debug or whatever? I know it won't happen almost ever so it's a minor point really. I just dislike actively hosing the whole system if it is avoidable. Regards, Tvrtko
On Thu, May 08, 2014 at 10:56:05AM +0100, Tvrtko Ursulin wrote:
>
> Hi Brad,
>
> On 04/28/2014 04:22 PM, bradley.d.volkin@intel.com wrote:
> [snip]
> >- BUG_ON(!validate_cmds_sorted(ring));
> >+ BUG_ON(!validate_cmds_sorted(ring, cmd_tables, cmd_table_count));
> > BUG_ON(!validate_regs_sorted(ring));
> >+
> >+ BUG_ON(init_hash_table(ring, cmd_tables, cmd_table_count));
>
> Is a BUG_ON a bit harsh since the above fails only on ENOMEM condition?
>
> If the concern is not allowing any command execution if parser setup
> has failed, it would be nicer to the system as whole to just keep
> rejecting everything, but let the rest of the kernel live to enable
> debug or whatever?

Those number_of_cmds allocations are a bit awkward though, couldn't we just embed the hlist_node into the descriptor struct?

I was hoping we could compute a (near) minimal perfect hash function though. Let me try to dig a bit.
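As a sketch of what the embedding would look like (with a simplified descriptor; the real drm_i915_cmd_descriptor has more fields than shown here):

struct cmd_descriptor_embedded {
	struct {
		u32 value;
		u32 mask;
	} cmd;
	/*
	 * A single embedded hlist_node can only sit on one hash chain
	 * at a time, which is the catch raised in the reply below:
	 * common descriptors are shared by several rings' tables.
	 */
	struct hlist_node node;
};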
On 05/08/2014 12:44 PM, Damien Lespiau wrote:
> On Thu, May 08, 2014 at 10:56:05AM +0100, Tvrtko Ursulin wrote:
>> [snip]
>
> Those number_of_cmds allocations are a bit awkward though, couldn't we
> just embed the hlist_node into the descriptor struct?

Until Brad comes online, I think that is because the mapping from command descriptors to hash table entries is not 1-to-1.

Tvrtko
On Thu, May 08, 2014 at 01:25:33PM +0100, Tvrtko Ursulin wrote: > > On 05/08/2014 12:44 PM, Damien Lespiau wrote: > >On Thu, May 08, 2014 at 10:56:05AM +0100, Tvrtko Ursulin wrote: > >> > >>Hi Brad, > >> > >>On 04/28/2014 04:22 PM, bradley.d.volkin@intel.com wrote: > >>[snip] > >>>- BUG_ON(!validate_cmds_sorted(ring)); > >>>+ BUG_ON(!validate_cmds_sorted(ring, cmd_tables, cmd_table_count)); > >>> BUG_ON(!validate_regs_sorted(ring)); > >>>+ > >>>+ BUG_ON(init_hash_table(ring, cmd_tables, cmd_table_count)); > >> > >>Is a BUG_ON a bit harsh since the above fails only on ENOMEM condition? > >> > >>If the concern is not allowing any command execution if parser setup > >>has failed, it would be nicer to the system as whole to just keep > >>rejecting everything, but let the rest of the kernel live to enable > >>debug or whatever? > > > >Those number_of_cmds allocations are a bit awkward though, couldn't we > >just embed the hlist_node into the desciptor struct? > > Until Brad comes online, I think that is because command descriptors > to hash table entries are not 1-to-1. Ah, I guess the common cmds are part of several hash tables. We could at least turn the multiple allocations into one big allocation though.
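A sketch of the single-allocation variant; everything here mirrors the patch except for the one kcalloc(), and the ring would additionally need to remember the nodes pointer (a hypothetical extra field) so fini_hash_table() could release it with a single kfree():

static int init_hash_table_onealloc(struct intel_ring_buffer *ring,
				    const struct drm_i915_cmd_table *cmd_tables,
				    int cmd_table_count)
{
	struct cmd_node *nodes;
	int i, j, total = 0, n = 0;

	for (i = 0; i < cmd_table_count; i++)
		total += cmd_tables[i].count;

	/* One allocation covering every descriptor node on this ring. */
	nodes = kcalloc(total, sizeof(*nodes), GFP_KERNEL);
	if (!nodes)
		return -ENOMEM;

	hash_init(ring->cmd_hash);

	for (i = 0; i < cmd_table_count; i++) {
		for (j = 0; j < cmd_tables[i].count; j++) {
			nodes[n].desc = &cmd_tables[i].table[j];
			hash_add(ring->cmd_hash, &nodes[n].node,
				 nodes[n].desc->cmd.value & CMD_HASH_MASK);
			n++;
		}
	}

	return 0;
}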
On Mon, Apr 28, 2014 at 08:22:08AM -0700, bradley.d.volkin@intel.com wrote:
> From: Brad Volkin <bradley.d.volkin@intel.com>
> +/*
> + * Different command ranges have different numbers of bits for the opcode.
> + * In order to use the opcode bits, and only the opcode bits, for the hash key
> + * we should use the MI_* command opcode mask (since those commands use the
> + * fewest bits for the opcode.)
> + */
> +#define CMD_HASH_MASK STD_MI_OPCODE_MASK

This is not very convincing (but could well be correct).

#define STD_MI_OPCODE_MASK 0xFF800000
#define STD_3D_OPCODE_MASK 0xFFFF0000

So it only works if the 3D opcodes have the top 9 bits all distinct?
On Thu, May 08, 2014 at 02:05:07PM +0100, Damien Lespiau wrote:
> On Mon, Apr 28, 2014 at 08:22:08AM -0700, bradley.d.volkin@intel.com wrote:
> > From: Brad Volkin <bradley.d.volkin@intel.com>
> > +/*
> > + * Different command ranges have different numbers of bits for the opcode.
> > + * In order to use the opcode bits, and only the opcode bits, for the hash key
> > + * we should use the MI_* command opcode mask (since those commands use the
> > + * fewest bits for the opcode.)
> > + */
> > +#define CMD_HASH_MASK STD_MI_OPCODE_MASK
>
> This is not very convincing (but could well be correct).
>
> #define STD_MI_OPCODE_MASK 0xFF800000
> #define STD_3D_OPCODE_MASK 0xFFFF0000
>
> So it only works if the 3D opcodes have the top 9 bits all distinct?

To expand on that, with the attached program:

$ ./minimal-hash-hsw-render | wc -l
44

$ ./minimal-hash-hsw-render | sort -u | wc -l
37
On 05/08/2014 02:02 PM, Damien Lespiau wrote:
> On Thu, May 08, 2014 at 01:25:33PM +0100, Tvrtko Ursulin wrote:
>> [snip]
>> Until Brad comes online, I think that is because the mapping from command
>> descriptors to hash table entries is not 1-to-1.
>
> Ah, I guess the common cmds are part of several hash tables. We could at
> least turn the multiple allocations into one big allocation though.

Probably a problem since commands can be added dynamically.

Also from the memory use point of view, single allocations are 12/24 bytes (32-bit/64-bit) so they fit into 16/32 byte slabs. That's some wastage on the face of it, but bunching them up would mean, say, common plus render commands at ~30 in total, 360/720 bytes. That would waste 152/304 bytes (512 and 1024 byte slabs) vs. 120/240 bytes for individual allocations. So for a 30-command array the current approach is even better. :)

Maybe make a dedicated cache for struct cmd_node?

Tvrtko
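A sketch of the dedicated-cache idea; the cache pointer and the init/fini pairing are hypothetical names, but a slab sized exactly to struct cmd_node would avoid the 16/32-byte round-up computed above:

static struct kmem_cache *i915_cmd_node_cache;

static int cmd_node_cache_init(void)
{
	/* Objects are packed at sizeof(struct cmd_node), no round-up. */
	i915_cmd_node_cache = kmem_cache_create("i915_cmd_node",
						sizeof(struct cmd_node),
						0, 0, NULL);
	return i915_cmd_node_cache ? 0 : -ENOMEM;
}

static void cmd_node_cache_fini(void)
{
	kmem_cache_destroy(i915_cmd_node_cache);
}

init_hash_table() would then allocate with kmem_cache_alloc(i915_cmd_node_cache, GFP_KERNEL) in place of kmalloc(), and fini_hash_table() would free with kmem_cache_free().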
On 04/28/2014 04:22 PM, bradley.d.volkin@intel.com wrote: > From: Brad Volkin <bradley.d.volkin@intel.com> > > For clients that submit large batch buffers the command parser has > a substantial impact on performance. On my HSW ULT system performance > drops as much as ~20% on some tests. Most of the time is spent in the > command lookup code. Converting that from the current naive search to > a hash table lookup reduces the performance impact by as much as ~10%. I have another question - what about register lookups, do they come up in profiling as significant? Regards, Tvrtko
On Thu, May 08, 2014 at 12:44:57PM +0100, Damien Lespiau wrote:
> I was hoping we could compute a (near) minimal perfect hash function
> though. Let me try to dig a bit.

So, I went a bit further here and we can actually generate a minimal perfect hash function. I took the 44 HSW render opcodes and fed them into:

http://burtleburtle.net/bob/hash/perfect.html
https://github.com/driedfruit/jenkins-minimal-perfect-hash

The resulting hash function is:

static uint8_t mph_tab[] = {
	0,0,10,0,51,0,34,0,12,22,0,0,28,0,9,0,
	9,0,19,0,0,0,11,0,10,61,20,50,49,60,45,55,
};

static uint32_t mph_hash(uint32_t val)
{
	uint32_t a, b, rsl;

	val += 0xa195a44b;
	val += (val << 8);
	val ^= (val >> 4);
	b = (val >> 18) & 0x1f;
	a = val >> 27;
	rsl = (a ^ mph_tab[b]);
	return rsl;
}

When used with the 44 HSW opcodes (opcode & 0xffff0000) it produces distinct numbers (a perfect hash) from 0 to 43 (minimal) that could then be used to address an array. You can play with the attached test:

$ ./minimal-hash-hsw-render | sort -nk2

So now, how useful that is is an interesting question. We do have static tables upstream (IIUC), but Tvrtko reminded me that they may not match other trees, which is sad.
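To show how such a function could be consumed, a sketch against a hypothetical hsw_render_desc_table[] holding the 44 descriptors ordered by their mph_hash() value (note Brad's caveat further down the thread about MI flag bits in 22:16 when masking with 0xffff0000):

static const struct drm_i915_cmd_descriptor *
find_cmd_mph(u32 cmd_header)
{
	const struct drm_i915_cmd_descriptor *desc;
	u32 idx = mph_hash(cmd_header & 0xffff0000);

	/* The hash is only perfect for known opcodes; bound and verify. */
	if (idx >= ARRAY_SIZE(hsw_render_desc_table))
		return NULL;

	desc = &hsw_render_desc_table[idx];
	if ((desc->cmd.mask & cmd_header) == (desc->cmd.value & desc->cmd.mask))
		return desc;

	return NULL;
}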
On Thu, May 08, 2014 at 02:56:05AM -0700, Tvrtko Ursulin wrote: > > Hi Brad, > > On 04/28/2014 04:22 PM, bradley.d.volkin@intel.com wrote: > [snip] > > - BUG_ON(!validate_cmds_sorted(ring)); > > + BUG_ON(!validate_cmds_sorted(ring, cmd_tables, cmd_table_count)); > > BUG_ON(!validate_regs_sorted(ring)); > > + > > + BUG_ON(init_hash_table(ring, cmd_tables, cmd_table_count)); > > Is a BUG_ON a bit harsh since the above fails only on ENOMEM condition? > > If the concern is not allowing any command execution if parser setup has > failed, it would be nicer to the system as whole to just keep rejecting > everything, but let the rest of the kernel live to enable debug or whatever? > > I know it won't happen almost ever so it's a minor point really. I just > dislike actively hosing the whole system if it is avoidable. Hi Tvrtko, I agree that a BUG_ON might be harsh here. I suppose we could log an error and disable the command parser. Most command buffers would still go through fine but HW parsing would reject some that the SW parser might otherwise allow. That could be a bit odd if we ever did get a failure - apps/functionality that worked the last time I booted suddenly don't this time. The issue would be in the log though. I don't have a strong preference on this. Whatever people prefer. Thanks, Brad > > Regards, > > Tvrtko
On Thu, May 08, 2014 at 06:42:16AM -0700, Tvrtko Ursulin wrote:
>
> On 04/28/2014 04:22 PM, bradley.d.volkin@intel.com wrote:
> > From: Brad Volkin <bradley.d.volkin@intel.com>
> >
> > For clients that submit large batch buffers the command parser has
> > a substantial impact on performance. On my HSW ULT system performance
> > drops as much as ~20% on some tests. Most of the time is spent in the
> > command lookup code. Converting that from the current naive search to
> > a hash table lookup reduces the performance impact by as much as ~10%.
>
> I have another question - what about register lookups, do they come up
> in profiling as significant?

So far no. The command lookup happens for every command in every batch. The register lookup only happens for certain commands. I think most batches don't include such commands and those that do may only have a few instances. Ken might have a better sense of the frequency.

Brad

>
> Regards,
>
> Tvrtko
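For reference, a sketch of what a cheaper register check could look like if it ever did show up in profiles; since validate_regs_sorted() already enforces ascending order, a binary search over the per-ring table would be the natural counterpart to the command-side hashing (illustrative only, not part of this patch):

static bool valid_reg_bsearch(const u32 *table, int count, u32 addr)
{
	int lo = 0, hi = count - 1;

	while (lo <= hi) {
		int mid = lo + (hi - lo) / 2;

		if (table[mid] == addr)
			return true;
		else if (table[mid] < addr)
			lo = mid + 1;
		else
			hi = mid - 1;
	}

	return false;
}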
On Thu, May 08, 2014 at 08:27:16AM -0700, Volkin, Bradley D wrote: > On Thu, May 08, 2014 at 02:56:05AM -0700, Tvrtko Ursulin wrote: > > > > Hi Brad, > > > > On 04/28/2014 04:22 PM, bradley.d.volkin@intel.com wrote: > > [snip] > > > - BUG_ON(!validate_cmds_sorted(ring)); > > > + BUG_ON(!validate_cmds_sorted(ring, cmd_tables, cmd_table_count)); > > > BUG_ON(!validate_regs_sorted(ring)); > > > + > > > + BUG_ON(init_hash_table(ring, cmd_tables, cmd_table_count)); > > > > Is a BUG_ON a bit harsh since the above fails only on ENOMEM condition? > > > > If the concern is not allowing any command execution if parser setup has > > failed, it would be nicer to the system as whole to just keep rejecting > > everything, but let the rest of the kernel live to enable debug or whatever? > > > > I know it won't happen almost ever so it's a minor point really. I just > > dislike actively hosing the whole system if it is avoidable. > > Hi Tvrtko, > > I agree that a BUG_ON might be harsh here. I suppose we could log an > error and disable the command parser. Most command buffers would > still go through fine but HW parsing would reject some that the SW > parser might otherwise allow. That could be a bit odd if we ever did > get a failure - apps/functionality that worked the last time I booted > suddenly don't this time. The issue would be in the log though. > > I don't have a strong preference on this. Whatever people prefer. If the memory allocation fails there's probably not much point in trying to limp along and continue the driver init. So just pass error up and let the caller deal with it. Looking at the error paths up from ring init, we probably leak a ton of junk but at least the kernel should remain otherwise operational.
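A sketch of that propagation, with the BUG_ON()s folded into a return value; the helper name here is hypothetical, but this is roughly the shape the discussion converges on, with i915_cmd_parser_init_ring() becoming non-void and the ring-init caller unwinding on error:

static int i915_cmd_parser_init_tables(struct intel_ring_buffer *ring,
				       const struct drm_i915_cmd_table *cmd_tables,
				       int cmd_table_count)
{
	int ret;

	if (!validate_cmds_sorted(ring, cmd_tables, cmd_table_count) ||
	    !validate_regs_sorted(ring))
		return -EINVAL;

	ret = init_hash_table(ring, cmd_tables, cmd_table_count);
	if (ret) {
		DRM_ERROR("CMD: cmd parser hash table setup failed\n");
		/* init_hash_table() may have partially filled the table. */
		fini_hash_table(ring);
		return ret;
	}

	ring->needs_cmd_parser = true;
	return 0;
}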
On 05/08/2014 04:27 PM, Volkin, Bradley D wrote: > On Thu, May 08, 2014 at 02:56:05AM -0700, Tvrtko Ursulin wrote: >> >> Hi Brad, >> >> On 04/28/2014 04:22 PM, bradley.d.volkin@intel.com wrote: >> [snip] >>> - BUG_ON(!validate_cmds_sorted(ring)); >>> + BUG_ON(!validate_cmds_sorted(ring, cmd_tables, cmd_table_count)); >>> BUG_ON(!validate_regs_sorted(ring)); >>> + >>> + BUG_ON(init_hash_table(ring, cmd_tables, cmd_table_count)); >> >> Is a BUG_ON a bit harsh since the above fails only on ENOMEM condition? >> >> If the concern is not allowing any command execution if parser setup has >> failed, it would be nicer to the system as whole to just keep rejecting >> everything, but let the rest of the kernel live to enable debug or whatever? >> >> I know it won't happen almost ever so it's a minor point really. I just >> dislike actively hosing the whole system if it is avoidable. > > Hi Tvrtko, > > I agree that a BUG_ON might be harsh here. I suppose we could log an > error and disable the command parser. Most command buffers would > still go through fine but HW parsing would reject some that the SW > parser might otherwise allow. That could be a bit odd if we ever did > get a failure - apps/functionality that worked the last time I booted > suddenly don't this time. The issue would be in the log though. That would indeed be weird. So command parser is just a more permissive whitelist, a super set of hardware parser? > I don't have a strong preference on this. Whatever people prefer. Initially I was thinking of just rejecting all batch buffers if such fundamental initialisation fails. It beats BUG_ON in the sense that it leaves the system alive and it doesn't suffer from the random subtle behaviour like if it would just disable the command parser. It is a bit theoretical because it is likely system won't be too healthy anyway if such a tiny allocation fails but you never know. It's better not to crash it if we don't have to. Regards, Tvrtko
On Thu, May 08, 2014 at 06:15:44AM -0700, Lespiau, Damien wrote: > On Thu, May 08, 2014 at 02:05:07PM +0100, Damien Lespiau wrote: > > On Mon, Apr 28, 2014 at 08:22:08AM -0700, bradley.d.volkin@intel.com wrote: > > > From: Brad Volkin <bradley.d.volkin@intel.com> > > > +/* > > > + * Different command ranges have different numbers of bits for the opcode. > > > + * In order to use the opcode bits, and only the opcode bits, for the hash key > > > + * we should use the MI_* command opcode mask (since those commands use the > > > + * fewest bits for the opcode.) > > > + */ > > > +#define CMD_HASH_MASK STD_MI_OPCODE_MASK > > > > This is not very convicing (but could well be correct). > > > > #define STD_MI_OPCODE_MASK 0xFF800000 > > #define STD_3D_OPCODE_MASK 0xFFFF0000 > > > > So it only works if the 3D opcodes have the top 9 bits all distinct? > > To expand on that, with the attached program: > > $ ./minimal-hash-hsw-render | wc -l > 44 > > $ ./minimal-hash-hsw-render | sort -u | wc -l > 37 Yes, as it's currently written, some commands may hash to the same bucket. The per-bucket search will use the full mask from the cmd descriptor to get an exact match. The issue is that, for example, MI commands in a batch may use bits 22:16 for something other than the opcode (e.g. GGTT vs PPGTT). If we mask a command from a batch with 0xFFFF0000 then an MI command may hash to the wrong bucket. If we want a perfect hash then I suppose we should look at bits 31:29 and mask with the exact STD_xx_OPCODE_MASK for the client. The existing INSTR_CLIENT_MASK/SHIFT defines could be reused for that. Brad > > -- > Damien > #include <stdint.h> > #include <stdio.h> > > #define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) > > #define MI_INSTR(opcode, flags) (((opcode) << 23) | (flags)) > > #define MI_NOOP MI_INSTR(0, 0) > #define MI_USER_INTERRUPT MI_INSTR(0x02, 0) > #define MI_WAIT_FOR_EVENT MI_INSTR(0x03, 0) > #define MI_ARB_CHECK MI_INSTR(0x05, 0) > #define MI_REPORT_HEAD MI_INSTR(0x07, 0) > #define MI_ARB_ON_OFF MI_INSTR(0x08, 0) > #define MI_BATCH_BUFFER_END MI_INSTR(0x0a, 0) > #define MI_SUSPEND_FLUSH MI_INSTR(0x0b, 0) > #define MI_OVERLAY_FLIP MI_INSTR(0x11, 0) > #define MI_SET_PREDICATE MI_INSTR(0x01, 0) > #define MI_ARB_CHECK MI_INSTR(0x05, 0) > #define MI_RS_CONTROL MI_INSTR(0x06, 0) > #define MI_URB_ATOMIC_ALLOC MI_INSTR(0x09, 0) > #define MI_PREDICATE MI_INSTR(0x0C, 0) > #define MI_RS_CONTEXT MI_INSTR(0x0F, 0) > #define MI_TOPOLOGY_FILTER MI_INSTR(0x0D, 0) > #define MI_LOAD_SCAN_LINES_EXCL MI_INSTR(0x13, 0) > #define MI_URB_CLEAR MI_INSTR(0x19, 0) > #define MI_UPDATE_GTT MI_INSTR(0x23, 0) > #define MI_CLFLUSH MI_INSTR(0x27, 0) > #define MI_REPORT_PERF_COUNT MI_INSTR(0x28, 0) > #define MI_LOAD_REGISTER_MEM MI_INSTR(0x29, 0) > #define MI_LOAD_REGISTER_REG MI_INSTR(0x2A, 0) > #define MI_RS_STORE_DATA_IMM MI_INSTR(0x2B, 0) > #define MI_LOAD_URB_MEM MI_INSTR(0x2C, 0) > #define MI_STORE_URB_MEM MI_INSTR(0x2D, 0) > #define MI_CONDITIONAL_BATCH_BUFFER_END MI_INSTR(0x36, 0) > #define MI_SEMAPHORE_MBOX MI_INSTR(0x16, 1) /* gen6+ */ > #define MI_STORE_DWORD_IMM MI_INSTR(0x20, 1) > #define MI_STORE_DWORD_INDEX MI_INSTR(0x21, 1) > #define MI_SET_CONTEXT MI_INSTR(0x18, 0) > #define MI_LOAD_REGISTER_IMM(x) MI_INSTR(0x22, 2*(x)-1) > #define MI_STORE_REGISTER_MEM(x) MI_INSTR(0x24, 2*(x)-1) > #define MI_BATCH_BUFFER_START MI_INSTR(0x31, 0) > #define MI_FLUSH MI_INSTR(0x04, 0) > #define MI_DISPLAY_FLIP MI_INSTR(0x14, 2) > #define MI_LOAD_SCAN_LINES_INCL MI_INSTR(0x12, 0) > > #define GFX_OP_PIPE_CONTROL(len) 
((0x3<<29)|(0x3<<27)|(0x2<<24)|(len-2)) > #define PIPELINE_SELECT ((0x3<<29)|(0x1<<27)|(0x1<<24)|(0x4<<16)) > #define GFX_OP_3DSTATE_VF_STATISTICS ((0x3<<29)|(0x1<<27)|(0x0<<24)|(0xB<<16)) > #define MEDIA_VFE_STATE ((0x3<<29)|(0x2<<27)|(0x0<<24)|(0x0<<16)) > #define MEDIA_VFE_STATE_MMIO_ACCESS_MASK (0x18) > #define GPGPU_OBJECT ((0x3<<29)|(0x2<<27)|(0x1<<24)|(0x4<<16)) > #define GPGPU_WALKER ((0x3<<29)|(0x2<<27)|(0x1<<24)|(0x5<<16)) > #define GFX_OP_3DSTATE_DX9_CONSTANTF_VS \ > ((0x3<<29)|(0x3<<27)|(0x0<<24)|(0x39<<16)) > #define GFX_OP_3DSTATE_DX9_CONSTANTF_PS \ > ((0x3<<29)|(0x3<<27)|(0x0<<24)|(0x3A<<16)) > #define GFX_OP_3DSTATE_SO_DECL_LIST \ > ((0x3<<29)|(0x3<<27)|(0x1<<24)|(0x17<<16)) > > #define GFX_OP_3DSTATE_BINDING_TABLE_EDIT_VS \ > ((0x3<<29)|(0x3<<27)|(0x0<<24)|(0x43<<16)) > #define GFX_OP_3DSTATE_BINDING_TABLE_EDIT_GS \ > ((0x3<<29)|(0x3<<27)|(0x0<<24)|(0x44<<16)) > #define GFX_OP_3DSTATE_BINDING_TABLE_EDIT_HS \ > ((0x3<<29)|(0x3<<27)|(0x0<<24)|(0x45<<16)) > #define GFX_OP_3DSTATE_BINDING_TABLE_EDIT_DS \ > ((0x3<<29)|(0x3<<27)|(0x0<<24)|(0x46<<16)) > #define GFX_OP_3DSTATE_BINDING_TABLE_EDIT_PS \ > ((0x3<<29)|(0x3<<27)|(0x0<<24)|(0x47<<16)) > > #define MFX_WAIT ((0x3<<29)|(0x1<<27)|(0x0<<16)) > > #define COLOR_BLT ((0x2<<29)|(0x40<<22)) > #define SRC_COPY_BLT ((0x2<<29)|(0x43<<22)) > > uint32_t opcodes[] = { > MI_NOOP, > MI_USER_INTERRUPT, > MI_WAIT_FOR_EVENT, > MI_ARB_CHECK, > MI_REPORT_HEAD, > MI_SUSPEND_FLUSH, > MI_SEMAPHORE_MBOX, > MI_STORE_DWORD_INDEX, > MI_LOAD_REGISTER_IMM(1), > MI_STORE_REGISTER_MEM(1), > MI_LOAD_REGISTER_MEM, > MI_BATCH_BUFFER_START, > MI_FLUSH, > MI_ARB_ON_OFF, > MI_PREDICATE, > MI_TOPOLOGY_FILTER, > MI_DISPLAY_FLIP, > MI_SET_CONTEXT, > MI_URB_CLEAR, > MI_STORE_DWORD_IMM, > MI_UPDATE_GTT, > MI_CLFLUSH, > MI_REPORT_PERF_COUNT, > MI_CONDITIONAL_BATCH_BUFFER_END, > GFX_OP_3DSTATE_VF_STATISTICS, > PIPELINE_SELECT, > MEDIA_VFE_STATE, > GPGPU_OBJECT, > GPGPU_WALKER, > GFX_OP_3DSTATE_SO_DECL_LIST, > GFX_OP_PIPE_CONTROL(5), > MI_LOAD_SCAN_LINES_INCL, > MI_LOAD_SCAN_LINES_EXCL, > MI_LOAD_REGISTER_REG, > MI_RS_STORE_DATA_IMM, > MI_LOAD_URB_MEM, > MI_STORE_URB_MEM, > GFX_OP_3DSTATE_DX9_CONSTANTF_VS, > GFX_OP_3DSTATE_DX9_CONSTANTF_PS, > GFX_OP_3DSTATE_BINDING_TABLE_EDIT_VS, > GFX_OP_3DSTATE_BINDING_TABLE_EDIT_GS, > GFX_OP_3DSTATE_BINDING_TABLE_EDIT_HS, > GFX_OP_3DSTATE_BINDING_TABLE_EDIT_DS, > GFX_OP_3DSTATE_BINDING_TABLE_EDIT_PS, > }; > > #define STD_MI_OPCODE_MASK 0xFF800000 > #define STD_3D_OPCODE_MASK 0xFFFF0000 > #define STD_2D_OPCODE_MASK 0xFFC00000 > #define STD_MFX_OPCODE_MASK 0xFFFF0000 > #define CMD_HASH_MASK STD_MI_OPCODE_MASK > > int main(void) > { > int i; > > for (i = 0; i < ARRAY_SIZE(opcodes); i++) > printf("%08x\n", opcodes[i] & CMD_HASH_MASK); > > return 0; > }
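What Brad describes above could look something like this hypothetical helper, using the existing INSTR_CLIENT_MASK/SHIFT definitions to pick the exact opcode mask per client, so that MI flag bits in 22:16 can no longer move a command into the wrong bucket:

static u32 cmd_header_key(u32 cmd_header)
{
	u32 client = (cmd_header & INSTR_CLIENT_MASK) >> INSTR_CLIENT_SHIFT;

	switch (client) {
	case INSTR_MI_CLIENT:
		return cmd_header & STD_MI_OPCODE_MASK;	/* bits 31:23 */
	case INSTR_BC_CLIENT:
		return cmd_header & STD_2D_OPCODE_MASK;	/* bits 31:22 */
	case INSTR_RC_CLIENT:
		return cmd_header & STD_3D_OPCODE_MASK;	/* bits 31:16 */
	default:
		/* Unknown client: fall back to the coarsest mask. */
		return cmd_header & STD_MI_OPCODE_MASK;
	}
}

Both hash_add() in init_hash_table() and hash_for_each_possible() in find_cmd_in_table() would need to key on cmd_header_key() for insert and lookup to agree.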
On Thu, May 08, 2014 at 08:53:38AM -0700, Volkin, Bradley D wrote:
> [snip]
> The issue is that, for example, MI commands in a batch may use bits
> 22:16 for something other than the opcode (e.g. GGTT vs PPGTT). If we
> mask a command from a batch with 0xFFFF0000 then an MI command may hash
> to the wrong bucket. If we want a perfect hash then I suppose we should
> look at bits 31:29 and mask with the exact STD_xx_OPCODE_MASK for the
> client. The existing INSTR_CLIENT_MASK/SHIFT defines could be reused
> for that.

Ah that works then, not super convinced that's the best way we can do it, but it seems to be an improvement, so well... can't argue with that.
On Thu, May 08, 2014 at 08:45:07AM -0700, Ville Syrjälä wrote: > On Thu, May 08, 2014 at 08:27:16AM -0700, Volkin, Bradley D wrote: > > On Thu, May 08, 2014 at 02:56:05AM -0700, Tvrtko Ursulin wrote: > > > > > > Hi Brad, > > > > > > On 04/28/2014 04:22 PM, bradley.d.volkin@intel.com wrote: > > > [snip] > > > > - BUG_ON(!validate_cmds_sorted(ring)); > > > > + BUG_ON(!validate_cmds_sorted(ring, cmd_tables, cmd_table_count)); > > > > BUG_ON(!validate_regs_sorted(ring)); > > > > + > > > > + BUG_ON(init_hash_table(ring, cmd_tables, cmd_table_count)); > > > > > > Is a BUG_ON a bit harsh since the above fails only on ENOMEM condition? > > > > > > If the concern is not allowing any command execution if parser setup has > > > failed, it would be nicer to the system as whole to just keep rejecting > > > everything, but let the rest of the kernel live to enable debug or whatever? > > > > > > I know it won't happen almost ever so it's a minor point really. I just > > > dislike actively hosing the whole system if it is avoidable. > > > > Hi Tvrtko, > > > > I agree that a BUG_ON might be harsh here. I suppose we could log an > > error and disable the command parser. Most command buffers would > > still go through fine but HW parsing would reject some that the SW > > parser might otherwise allow. That could be a bit odd if we ever did > > get a failure - apps/functionality that worked the last time I booted > > suddenly don't this time. The issue would be in the log though. > > > > I don't have a strong preference on this. Whatever people prefer. > > If the memory allocation fails there's probably not much point in > trying to limp along and continue the driver init. So just pass error > up and let the caller deal with it. > > Looking at the error paths up from ring init, we probably leak a ton > of junk but at least the kernel should remain otherwise operational. Passing the error up is probably a good option. There was a recent change to mark the GPU as wedged if ring init fails, which seems like a reasonable enough response to parser init failure. I'll have to look at the cleanup paths. The per-ring parser init can probably move around to avoid excessive additional cleanup code on failure paths. Brad > > -- > Ville Syrjälä > Intel OTC
On Thu, May 08, 2014 at 08:50:40AM -0700, Tvrtko Ursulin wrote: > > On 05/08/2014 04:27 PM, Volkin, Bradley D wrote: > > On Thu, May 08, 2014 at 02:56:05AM -0700, Tvrtko Ursulin wrote: > >> > >> Hi Brad, > >> > >> On 04/28/2014 04:22 PM, bradley.d.volkin@intel.com wrote: > >> [snip] > >>> - BUG_ON(!validate_cmds_sorted(ring)); > >>> + BUG_ON(!validate_cmds_sorted(ring, cmd_tables, cmd_table_count)); > >>> BUG_ON(!validate_regs_sorted(ring)); > >>> + > >>> + BUG_ON(init_hash_table(ring, cmd_tables, cmd_table_count)); > >> > >> Is a BUG_ON a bit harsh since the above fails only on ENOMEM condition? > >> > >> If the concern is not allowing any command execution if parser setup has > >> failed, it would be nicer to the system as whole to just keep rejecting > >> everything, but let the rest of the kernel live to enable debug or whatever? > >> > >> I know it won't happen almost ever so it's a minor point really. I just > >> dislike actively hosing the whole system if it is avoidable. > > > > Hi Tvrtko, > > > > I agree that a BUG_ON might be harsh here. I suppose we could log an > > error and disable the command parser. Most command buffers would > > still go through fine but HW parsing would reject some that the SW > > parser might otherwise allow. That could be a bit odd if we ever did > > get a failure - apps/functionality that worked the last time I booted > > suddenly don't this time. The issue would be in the log though. > > That would indeed be weird. > > So command parser is just a more permissive whitelist, a super set of > hardware parser? Yes. The thing we're trying to accomplish is that HW rejects all MI_LOAD/STORE_REGISTER_* commands, but it's safe and required to access some registers from a batch in order to implement GL functionality. > > > I don't have a strong preference on this. Whatever people prefer. > > Initially I was thinking of just rejecting all batch buffers if such > fundamental initialisation fails. It beats BUG_ON in the sense that it > leaves the system alive and it doesn't suffer from the random subtle > behaviour like if it would just disable the command parser. > > It is a bit theoretical because it is likely system won't be too healthy > anyway if such a tiny allocation fails but you never know. It's better > not to crash it if we don't have to. Yeah, see my response to Ville. I think there's a good solution there. Brad > > Regards, > > Tvrtko
On Thu, May 08, 2014 at 09:02:18AM -0700, Volkin, Bradley D wrote:
> [snip]
> Passing the error up is probably a good option. There was a recent change
> to mark the GPU as wedged if ring init fails, which seems like a reasonable
> enough response to parser init failure. I'll have to look at the cleanup
> paths. The per-ring parser init can probably move around to avoid excessive
> additional cleanup code on failure paths.

With the latest ring init rework from Chris we should just fail the gem side of things and declare the gpu terminally wedged. But the modeset side should keep on working, which means users will be able to take a look at dmesg instead of a black screen (since that's what you get from BUG_ON in driver load often).

Working modeset with broken gpu is infinitely better than a black screen, so no BUG_ON here please. At least if the error code isn't much more than a simple return, we always have to balance bugs in the error code with better abilities to limp on long enough for a useful bug report.
-Daniel
On Mon, May 12, 2014 at 09:24:06AM -0700, Daniel Vetter wrote:
> [snip]
> With the latest ring init rework from Chris we should just fail the gem
> side of things and declare the gpu terminally wedged. But the modeset side
> should keep on working, which means users will be able to take a look at
> dmesg instead of a black screen (since that's what you get from BUG_ON in
> driver load often).
>
> Working modeset with broken gpu is infinitely better than a black screen,
> so no BUG_ON here please. At least if the error code isn't much more than
> a simple return, we always have to balance bugs in the error code with
> better abilities to limp on long enough for a useful bug report.

Ok. I sent a v2 and in that I have i915_cmd_parser_init_ring() pass the error returned by init_hash_table() back to the caller, who passes it up, etc. When I looked more closely at Chris' change, the wedge behavior only happens if the return is -EIO, which it will not be in this case.
The only failure from init_hash_table() is -ENOMEM so if we specifically want the wedging behavior (vs whatever would normally happen if ring init fails with -ENOMEM) then we need to modify the return from i915_cmd_parser_init_ring() to be -EIO. Probably also add a comment explaining why we do the modification. I'm fine either way. The feedback earlier seemed to be that if we have an allocation failure we probably can't do much else anyhow, hence just returning the -ENOMEM. Brad > -Daniel > -- > Daniel Vetter > Software Engineer, Intel Corporation > +41 (0) 79 365 57 48 - http://blog.ffwll.ch
On Mon, May 12, 2014 at 09:41:02AM -0700, Volkin, Bradley D wrote:
> [snip]
> Ok. I sent a v2 and in that I have i915_cmd_parser_init_ring() pass the
> error returned by init_hash_table() back to the caller, who passes it up,
> etc.
> When I looked more closely at Chris' change, the wedge behavior only
> happens if the return is -EIO, which it will not be in this case. The only
> failure from init_hash_table() is -ENOMEM so if we specifically want the
> wedging behavior (vs whatever would normally happen if ring init fails with
> -ENOMEM) then we need to modify the return from i915_cmd_parser_init_ring()
> to be -EIO. Probably also add a comment explaining why we do the modification.
>
> I'm fine either way. The feedback earlier seemed to be that if we have an
> allocation failure we probably can't do much else anyhow, hence just returning
> the -ENOMEM.

Oh right. Since -ENOMEM should really be extremely exceptional I guess the current code is ok. I've mixed things up a bit ;-) But generally if the hw goes bananas we should continue to limp along, if it makes sense.
-Daniel
diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c index 9bac097..9dca899 100644 --- a/drivers/gpu/drm/i915/i915_cmd_parser.c +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c @@ -498,16 +498,18 @@ static u32 gen7_blt_get_cmd_length_mask(u32 cmd_header) return 0; } -static bool validate_cmds_sorted(struct intel_ring_buffer *ring) +static bool validate_cmds_sorted(struct intel_ring_buffer *ring, + const struct drm_i915_cmd_table *cmd_tables, + int cmd_table_count) { int i; bool ret = true; - if (!ring->cmd_tables || ring->cmd_table_count == 0) + if (!cmd_tables || cmd_table_count == 0) return true; - for (i = 0; i < ring->cmd_table_count; i++) { - const struct drm_i915_cmd_table *table = &ring->cmd_tables[i]; + for (i = 0; i < cmd_table_count; i++) { + const struct drm_i915_cmd_table *table = &cmd_tables[i]; u32 previous = 0; int j; @@ -557,6 +559,60 @@ static bool validate_regs_sorted(struct intel_ring_buffer *ring) ring->master_reg_count); } +struct cmd_node { + const struct drm_i915_cmd_descriptor *desc; + struct hlist_node node; +}; + +/* + * Different command ranges have different numbers of bits for the opcode. + * In order to use the opcode bits, and only the opcode bits, for the hash key + * we should use the MI_* command opcode mask (since those commands use the + * fewest bits for the opcode.) + */ +#define CMD_HASH_MASK STD_MI_OPCODE_MASK + +static int init_hash_table(struct intel_ring_buffer *ring, + const struct drm_i915_cmd_table *cmd_tables, + int cmd_table_count) +{ + int i, j; + + hash_init(ring->cmd_hash); + + for (i = 0; i < cmd_table_count; i++) { + const struct drm_i915_cmd_table *table = &cmd_tables[i]; + + for (j = 0; j < table->count; j++) { + const struct drm_i915_cmd_descriptor *desc = + &table->table[j]; + struct cmd_node *desc_node = + kmalloc(sizeof(*desc_node), GFP_KERNEL); + + if (!desc_node) + return -ENOMEM; + + desc_node->desc = desc; + hash_add(ring->cmd_hash, &desc_node->node, + desc->cmd.value & CMD_HASH_MASK); + } + } + + return 0; +} + +static void fini_hash_table(struct intel_ring_buffer *ring) +{ + struct hlist_node *tmp; + struct cmd_node *desc_node; + int i; + + hash_for_each_safe(ring->cmd_hash, i, tmp, desc_node, node) { + hash_del(&desc_node->node); + kfree(desc_node); + } +} + /** * i915_cmd_parser_init_ring() - set cmd parser related fields for a ringbuffer * @ring: the ringbuffer to initialize @@ -567,18 +623,21 @@ static bool validate_regs_sorted(struct intel_ring_buffer *ring) */ void i915_cmd_parser_init_ring(struct intel_ring_buffer *ring) { + const struct drm_i915_cmd_table *cmd_tables; + int cmd_table_count; + if (!IS_GEN7(ring->dev)) return; switch (ring->id) { case RCS: if (IS_HASWELL(ring->dev)) { - ring->cmd_tables = hsw_render_ring_cmds; - ring->cmd_table_count = + cmd_tables = hsw_render_ring_cmds; + cmd_table_count = ARRAY_SIZE(hsw_render_ring_cmds); } else { - ring->cmd_tables = gen7_render_cmds; - ring->cmd_table_count = ARRAY_SIZE(gen7_render_cmds); + cmd_tables = gen7_render_cmds; + cmd_table_count = ARRAY_SIZE(gen7_render_cmds); } ring->reg_table = gen7_render_regs; @@ -595,17 +654,17 @@ void i915_cmd_parser_init_ring(struct intel_ring_buffer *ring) ring->get_cmd_length_mask = gen7_render_get_cmd_length_mask; break; case VCS: - ring->cmd_tables = gen7_video_cmds; - ring->cmd_table_count = ARRAY_SIZE(gen7_video_cmds); + cmd_tables = gen7_video_cmds; + cmd_table_count = ARRAY_SIZE(gen7_video_cmds); ring->get_cmd_length_mask = gen7_bsd_get_cmd_length_mask; break; case BCS: if 
(IS_HASWELL(ring->dev)) { - ring->cmd_tables = hsw_blt_ring_cmds; - ring->cmd_table_count = ARRAY_SIZE(hsw_blt_ring_cmds); + cmd_tables = hsw_blt_ring_cmds; + cmd_table_count = ARRAY_SIZE(hsw_blt_ring_cmds); } else { - ring->cmd_tables = gen7_blt_cmds; - ring->cmd_table_count = ARRAY_SIZE(gen7_blt_cmds); + cmd_tables = gen7_blt_cmds; + cmd_table_count = ARRAY_SIZE(gen7_blt_cmds); } ring->reg_table = gen7_blt_regs; @@ -622,8 +681,8 @@ void i915_cmd_parser_init_ring(struct intel_ring_buffer *ring) ring->get_cmd_length_mask = gen7_blt_get_cmd_length_mask; break; case VECS: - ring->cmd_tables = hsw_vebox_cmds; - ring->cmd_table_count = ARRAY_SIZE(hsw_vebox_cmds); + cmd_tables = hsw_vebox_cmds; + cmd_table_count = ARRAY_SIZE(hsw_vebox_cmds); /* VECS can use the same length_mask function as VCS */ ring->get_cmd_length_mask = gen7_bsd_get_cmd_length_mask; break; @@ -633,18 +692,38 @@ void i915_cmd_parser_init_ring(struct intel_ring_buffer *ring) BUG(); } - BUG_ON(!validate_cmds_sorted(ring)); + BUG_ON(!validate_cmds_sorted(ring, cmd_tables, cmd_table_count)); BUG_ON(!validate_regs_sorted(ring)); + + BUG_ON(init_hash_table(ring, cmd_tables, cmd_table_count)); + + ring->needs_cmd_parser = true; +} + +/** + * i915_cmd_parser_fini_ring() - clean up cmd parser related fields + * @ring: the ringbuffer to clean up + * + * Releases any resources related to command parsing that may have been + * initialized for the specified ring. + */ +void i915_cmd_parser_fini_ring(struct intel_ring_buffer *ring) +{ + if (!ring->needs_cmd_parser) + return; + + fini_hash_table(ring); } static const struct drm_i915_cmd_descriptor* -find_cmd_in_table(const struct drm_i915_cmd_table *table, +find_cmd_in_table(struct intel_ring_buffer *ring, u32 cmd_header) { - int i; + struct cmd_node *desc_node; - for (i = 0; i < table->count; i++) { - const struct drm_i915_cmd_descriptor *desc = &table->table[i]; + hash_for_each_possible(ring->cmd_hash, desc_node, node, + cmd_header & CMD_HASH_MASK) { + const struct drm_i915_cmd_descriptor *desc = desc_node->desc; u32 masked_cmd = desc->cmd.mask & cmd_header; u32 masked_value = desc->cmd.value & desc->cmd.mask; @@ -668,16 +747,12 @@ find_cmd(struct intel_ring_buffer *ring, u32 cmd_header, struct drm_i915_cmd_descriptor *default_desc) { + const struct drm_i915_cmd_descriptor *desc; u32 mask; - int i; - - for (i = 0; i < ring->cmd_table_count; i++) { - const struct drm_i915_cmd_descriptor *desc; - desc = find_cmd_in_table(&ring->cmd_tables[i], cmd_header); - if (desc) - return desc; - } + desc = find_cmd_in_table(ring, cmd_header); + if (desc) + return desc; mask = ring->get_cmd_length_mask(cmd_header); if (!mask) @@ -748,8 +823,7 @@ bool i915_needs_cmd_parser(struct intel_ring_buffer *ring) { struct drm_i915_private *dev_priv = ring->dev->dev_private; - /* No command tables indicates a platform without parsing */ - if (!ring->cmd_tables) + if (!ring->needs_cmd_parser) return false; /* diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 7d6acb4..8c8388f 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2414,6 +2414,7 @@ const char *i915_cache_level_str(int type); /* i915_cmd_parser.c */ int i915_cmd_parser_get_version(void); void i915_cmd_parser_init_ring(struct intel_ring_buffer *ring); +void i915_cmd_parser_fini_ring(struct intel_ring_buffer *ring); bool i915_needs_cmd_parser(struct intel_ring_buffer *ring); int i915_parse_cmds(struct intel_ring_buffer *ring, struct drm_i915_gem_object *batch_obj, diff --git 
a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index eb3dd26..06fa9a3 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -1469,6 +1469,8 @@ void intel_cleanup_ring_buffer(struct intel_ring_buffer *ring) ring->cleanup(ring); cleanup_status_page(ring); + + i915_cmd_parser_fini_ring(ring); } static int intel_ring_wait_request(struct intel_ring_buffer *ring, int n) diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index 413cdc7..48daa91 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -1,6 +1,10 @@ #ifndef _INTEL_RINGBUFFER_H_ #define _INTEL_RINGBUFFER_H_ +#include <linux/hashtable.h> + +#define I915_CMD_HASH_ORDER 9 + /* * Gen2 BSpec "1. Programming Environment" / 1.4.4.6 "Ring Buffer Use" * Gen3 BSpec "vol1c Memory Interface Functions" / 2.3.4.5 "Ring Buffer Use" @@ -164,12 +168,13 @@ struct intel_ring_buffer { volatile u32 *cpu_page; } scratch; + bool needs_cmd_parser; + /* - * Tables of commands the command parser needs to know about + * Table of commands the command parser needs to know about * for this ring. */ - const struct drm_i915_cmd_table *cmd_tables; - int cmd_table_count; + DECLARE_HASHTABLE(cmd_hash, I915_CMD_HASH_ORDER); /* * Table of registers allowed in commands that read/write registers.