diff mbox

[06/18] drm/i915: Handle log buffer flush interrupt event from GuC

Message ID 1471272599-14586-7-git-send-email-akash.goel@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

akash.goel@intel.com Aug. 15, 2016, 2:49 p.m. UTC
From: Sagar Arun Kamble <sagar.a.kamble@intel.com>

GuC ukernel sends an interrupt to Host to flush the log buffer
and expects Host to correspondingly update the read pointer
information in the state structure, once it has consumed the
log buffer contents by copying them to a file or buffer.
Even if Host couldn't copy the contents, it can still update the
read pointer so that logging state is not disturbed on GuC side.

v2:
- Use a dedicated workqueue for handling flush interrupt. (Tvrtko)
- Reduce the overall log buffer copying time by skipping the copy of
  crash buffer area for regular cases and copying only the state
  structure data in first page.

v3:
 - Create a vmalloc mapping of log buffer. (Chris)
 - Cover the flush acknowledgment under rpm get & put.(Chris)
 - Revert the change of skipping the copy of crash dump area, as
   not really needed, will be covered by subsequent patch.

v4:
 - Destroy the wq under the same condition in which it was created,
   pass dev_piv pointer instead of dev to newly added GuC function,
   add more comments & rename variable for clarity. (Tvrtko)

v5:
- Allocate & destroy the dedicated wq, for handling flush interrupt,
  from the setup/teardown routines of GuC logging. (Chris)
- Validate the log buffer size value retrieved from state structure
  and do some minor cleanup. (Tvrtko)
- Fix error/warnings reported by checkpatch. (Tvrtko)
- Rebase.

Signed-off-by: Sagar Arun Kamble <sagar.a.kamble@intel.com>
Signed-off-by: Akash Goel <akash.goel@intel.com>
---
 drivers/gpu/drm/i915/i915_guc_submission.c | 202 +++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/i915_irq.c            |  29 ++++-
 drivers/gpu/drm/i915/intel_guc.h           |   4 +
 3 files changed, 234 insertions(+), 1 deletion(-)

Comments

Tvrtko Ursulin Aug. 15, 2016, 3:20 p.m. UTC | #1
On 15/08/16 15:49, akash.goel@intel.com wrote:
> From: Sagar Arun Kamble <sagar.a.kamble@intel.com>
>
> GuC ukernel sends an interrupt to Host to flush the log buffer
> and expects Host to correspondingly update the read pointer
> information in the state structure, once it has consumed the
> log buffer contents by copying them to a file or buffer.
> Even if Host couldn't copy the contents, it can still update the
> read pointer so that logging state is not disturbed on GuC side.
>
> v2:
> - Use a dedicated workqueue for handling flush interrupt. (Tvrtko)
> - Reduce the overall log buffer copying time by skipping the copy of
>    crash buffer area for regular cases and copying only the state
>    structure data in first page.
>
> v3:
>   - Create a vmalloc mapping of log buffer. (Chris)
>   - Cover the flush acknowledgment under rpm get & put.(Chris)
>   - Revert the change of skipping the copy of crash dump area, as
>     not really needed, will be covered by subsequent patch.
>
> v4:
>   - Destroy the wq under the same condition in which it was created,
>     pass dev_piv pointer instead of dev to newly added GuC function,
>     add more comments & rename variable for clarity. (Tvrtko)
>
> v5:
> - Allocate & destroy the dedicated wq, for handling flush interrupt,
>    from the setup/teardown routines of GuC logging. (Chris)
> - Validate the log buffer size value retrieved from state structure
>    and do some minor cleanup. (Tvrtko)
> - Fix error/warnings reported by checkpatch. (Tvrtko)
> - Rebase.
>
> Signed-off-by: Sagar Arun Kamble <sagar.a.kamble@intel.com>
> Signed-off-by: Akash Goel <akash.goel@intel.com>
> ---
>   drivers/gpu/drm/i915/i915_guc_submission.c | 202 +++++++++++++++++++++++++++++
>   drivers/gpu/drm/i915/i915_irq.c            |  29 ++++-
>   drivers/gpu/drm/i915/intel_guc.h           |   4 +
>   3 files changed, 234 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
> index b062da6..2b27b87 100644
> --- a/drivers/gpu/drm/i915/i915_guc_submission.c
> +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> @@ -172,6 +172,15 @@ static int host2guc_sample_forcewake(struct intel_guc *guc,
>   	return host2guc_action(guc, data, ARRAY_SIZE(data));
>   }
>
> +static int host2guc_logbuffer_flush_complete(struct intel_guc *guc)
> +{
> +	u32 data[1];
> +
> +	data[0] = HOST2GUC_ACTION_LOG_BUFFER_FILE_FLUSH_COMPLETE;
> +
> +	return host2guc_action(guc, data, 1);
> +}
> +
>   /*
>    * Initialise, update, or clear doorbell data shared with the GuC
>    *
> @@ -828,6 +837,179 @@ err:
>   	return NULL;
>   }
>
> +static void guc_move_to_next_buf(struct intel_guc *guc)
> +{
> +}
> +
> +static void *guc_get_write_buffer(struct intel_guc *guc)
> +{
> +	return NULL;
> +}
> +
> +static void guc_read_update_log_buffer(struct intel_guc *guc)
> +{
> +	struct guc_log_buffer_state *log_buffer_state, *log_buffer_snapshot_state;
> +	struct guc_log_buffer_state log_buffer_state_local;
> +	void *src_data_ptr, *dst_data_ptr;
> +	unsigned int buffer_size, expected_size;
> +	enum guc_log_buffer_type type;
> +
> +	if (WARN_ON(!guc->log.buf_addr))
> +		return;
> +
> +	/* Get the pointer to shared GuC log buffer */
> +	log_buffer_state = src_data_ptr = guc->log.buf_addr;
> +
> +	/* Get the pointer to local buffer to store the logs */
> +	dst_data_ptr = log_buffer_snapshot_state = guc_get_write_buffer(guc);
> +
> +	/* Actual logs are present from the 2nd page */
> +	src_data_ptr += PAGE_SIZE;
> +	dst_data_ptr += PAGE_SIZE;
> +
> +	for (type = GUC_ISR_LOG_BUFFER; type < GUC_MAX_LOG_BUFFER; type++) {
> +		/* Make a copy of the state structure in GuC log buffer (which
> +		 * is uncached mapped) on the stack to avoid reading from it
> +		 * multiple times.
> +		 */
> +		memcpy(&log_buffer_state_local, log_buffer_state,
> +		       sizeof(struct guc_log_buffer_state));
> +		buffer_size = log_buffer_state_local.size;
> +
> +		if (log_buffer_snapshot_state) {
> +			/* First copy the state structure in snapshot buffer */
> +			memcpy(log_buffer_snapshot_state, &log_buffer_state_local,
> +			       sizeof(struct guc_log_buffer_state));
> +
> +			/* The write pointer could have been updated by the GuC
> +			 * firmware, after sending the flush interrupt to Host,
> +			 * for consistency set the write pointer value to same
> +			 * value of sampled_write_ptr in the snapshot buffer.
> +			 */
> +			log_buffer_snapshot_state->write_ptr =
> +				log_buffer_snapshot_state->sampled_write_ptr;
> +
> +			log_buffer_snapshot_state++;
> +
> +			/* Now copy the actual logs, but before that validate
> +			 * the buffer size value retrieved from state structure.
> +			 */
> +			if (type == GUC_ISR_LOG_BUFFER)
> +				expected_size = (GUC_LOG_ISR_PAGES+1)*PAGE_SIZE;
> +			else if (type == GUC_DPC_LOG_BUFFER)
> +				expected_size = (GUC_LOG_DPC_PAGES+1)*PAGE_SIZE;
> +			else
> +				expected_size = (GUC_LOG_CRASH_PAGES+1)*PAGE_SIZE;

Candidate for extraction to a helper function - to keep the body of the 
loop shorter.

I would also do this above after the line which loads the buffer_size so 
operations are logically together.

And a nitpick: spaces around arithmetic operators are standard - 
checkpatch did not complain? Maybe it does only in strict mode?

> +
> +			if (unlikely(buffer_size != expected_size)) {
> +				DRM_ERROR("unexpected log buffer size\n");
> +				/* Continue with further copying, already state
> +				 * structure has been copied which is enough to
> +				 * let Userspace know about the anomaly.
> +				 */
> +				buffer_size = expected_size;

Would it be worth checking and protecting against reported buffer size 
being smaller than expected?

> +			}
> +
> +			memcpy(dst_data_ptr, src_data_ptr, buffer_size);
> +
> +			src_data_ptr += buffer_size;
> +			dst_data_ptr += buffer_size;
> +		}
> +
> +		/* FIXME: invalidate/flush for log buffer needed */
> +
> +		/* Update the read pointer in the shared log buffer */
> +		log_buffer_state->read_ptr =
> +			log_buffer_state_local.sampled_write_ptr;
> +
> +		/* Clear the 'flush to file' flag */
> +		log_buffer_state->flush_to_file = 0;
> +		log_buffer_state++;
> +	}
> +
> +	if (log_buffer_snapshot_state)
> +		guc_move_to_next_buf(guc);
> +}
> +
> +static void guc_capture_logs_work(struct work_struct *work)
> +{
> +	struct drm_i915_private *dev_priv =
> +		container_of(work, struct drm_i915_private, guc.log.flush_work);
> +
> +	spin_lock_irq(&dev_priv->irq_lock);
> +	/* Speed up work cancellation during disabling guc interrupts. */

Hm I can't see work cancellation in this patch?

> +	if (!dev_priv->guc.interrupts_enabled) {
> +		spin_unlock_irq(&dev_priv->irq_lock);
> +		return;
> +	}
> +	spin_unlock_irq(&dev_priv->irq_lock);

Can gen9_disable_guc_interrupts run on another CPU just at this point? 
So after a spinlock here has been released. If so the spinlock dance 
above does not do anything.

> +
> +	i915_guc_capture_logs(dev_priv);
> +}
> +
> +static void guc_log_cleanup(struct intel_guc *guc)
> +{
> +	struct drm_i915_private *dev_priv = guc_to_i915(guc);
> +
> +	lockdep_assert_held(&dev_priv->drm.struct_mutex);
> +
> +	if (i915.guc_log_level < 0)
> +		return;
> +
> +	/* First disable the flush interrupt */
> +	gen9_disable_guc_interrupts(dev_priv);
> +
> +	if (guc->log.flush_wq)
> +		destroy_workqueue(guc->log.flush_wq);

It doesn't need cancel_work_sync before or something?

> +
> +	guc->log.flush_wq = NULL;
> +
> +	if (guc->log.buf_addr)
> +		i915_gem_object_unpin_map(guc->log.vma->obj);
> +
> +	guc->log.buf_addr = NULL;
> +}
> +
> +static int guc_create_log_extras(struct intel_guc *guc)
> +{
> +	struct drm_i915_private *dev_priv = guc_to_i915(guc);
> +	void *vaddr;
> +	int ret;
> +
> +	lockdep_assert_held(&dev_priv->drm.struct_mutex);
> +
> +	/* Nothing to do */
> +	if (i915.guc_log_level < 0)
> +		return 0;
> +
> +	if (!guc->log.buf_addr) {
> +		/* Create a vmalloc mapping of log buffer pages */
> +		vaddr = i915_gem_object_pin_map(guc->log.vma->obj, I915_MAP_WB);
> +		if (IS_ERR(vaddr)) {
> +			ret = PTR_ERR(vaddr);
> +			DRM_ERROR("Couldn't map log buffer pages %d\n", ret);
> +			return ret;
> +		}
> +
> +		guc->log.buf_addr = vaddr;
> +	}
> +
> +	if (!guc->log.flush_wq) {
> +		INIT_WORK(&guc->log.flush_work, guc_capture_logs_work);
> +
> +		/* Need a dedicated wq to process log buffer flush interrupts
> +		 * from GuC without much delay so as to avoid any loss of logs.
> +		 */
> +		guc->log.flush_wq = alloc_ordered_workqueue("i915-guc_log", 0);
> +		if (guc->log.flush_wq == NULL) {
> +			DRM_ERROR("Couldn't allocate the wq for GuC logging\n");
> +			return -ENOMEM;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
>   static void guc_create_log(struct intel_guc *guc)
>   {
>   	struct i915_vma *vma;
> @@ -853,6 +1035,13 @@ static void guc_create_log(struct intel_guc *guc)
>   		}
>
>   		guc->log.vma = vma;
> +
> +		if (guc_create_log_extras(guc)) {
> +			guc_log_cleanup(guc);
> +			i915_vma_unpin_and_release(&guc->log.vma);
> +			i915.guc_log_level = -1;
> +			return;
> +		}
>   	}
>
>   	/* each allocated unit is a page */
> @@ -1034,6 +1223,7 @@ void i915_guc_submission_fini(struct drm_i915_private *dev_priv)
>   	struct intel_guc *guc = &dev_priv->guc;
>
>   	i915_vma_unpin_and_release(&guc->ads_vma);
> +	guc_log_cleanup(guc);
>   	i915_vma_unpin_and_release(&guc->log.vma);

Would it make sense to move the unpin_and_release into the guc_log_cleanup?

>
>   	if (guc->ctx_pool_vma)
> @@ -1095,3 +1285,15 @@ int intel_guc_resume(struct drm_device *dev)
>
>   	return host2guc_action(guc, data, ARRAY_SIZE(data));
>   }
> +
> +void i915_guc_capture_logs(struct drm_i915_private *dev_priv)
> +{
> +	guc_read_update_log_buffer(&dev_priv->guc);
> +
> +	/* Generally device is expected to be active only at this
> +	 * time, so get/put should be really quick.
> +	 */
> +	intel_runtime_pm_get(dev_priv);
> +	host2guc_logbuffer_flush_complete(&dev_priv->guc);
> +	intel_runtime_pm_put(dev_priv);
> +}
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index eb4774f..7e43d7d 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1668,7 +1668,34 @@ static void gen9_guc_irq_handler(struct drm_i915_private *dev_priv, u32 gt_iir)
>   		interrupts_enabled = dev_priv->guc.interrupts_enabled;
>   		spin_unlock(&dev_priv->irq_lock);
>   		if (interrupts_enabled) {
> -			/* TODO: Handle events for which GuC interrupted host */
> +			/* Sample the log buffer flush related bits & clear them
> +			 * out now itself from the message identity register to
> +			 * minimize the probability of losing a flush interrupt,
> +			 * when there are back to back flush interrupts.
> +			 * There can be a new flush interrupt, for different log
> +			 * buffer type (like for ISR), whilst Host is handling
> +			 * one (for DPC). Since same bit is used in message
> +			 * register for ISR & DPC, it could happen that GuC
> +			 * sets the bit for 2nd interrupt but Host clears out
> +			 * the bit on handling the 1st interrupt.
> +			 */
> +			u32 msg, flush;
> +
> +			msg = I915_READ(SOFT_SCRATCH(15));
> +			flush = msg & (GUC2HOST_MSG_CRASH_DUMP_POSTED |
> +				       GUC2HOST_MSG_FLUSH_LOG_BUFFER);
> +			if (flush) {
> +				/* Clear the message bits that are handled */
> +				I915_WRITE(SOFT_SCRATCH(15), msg & ~flush);
> +
> +				/* Handle flush interrupt in bottom half */
> +				queue_work(dev_priv->guc.log.flush_wq,
> +					   &dev_priv->guc.log.flush_work);
> +			} else {
> +				/* Not clearing of unhandled event bits won't
> +				 * result in re-triggering of the interrupt.
> +				 */
> +			}
>   		}
>   	}
>   }
> diff --git a/drivers/gpu/drm/i915/intel_guc.h b/drivers/gpu/drm/i915/intel_guc.h
> index 1fc63fe..d053a18 100644
> --- a/drivers/gpu/drm/i915/intel_guc.h
> +++ b/drivers/gpu/drm/i915/intel_guc.h
> @@ -124,6 +124,9 @@ struct intel_guc_fw {
>   struct intel_guc_log {
>   	uint32_t flags;
>   	struct i915_vma *vma;
> +	void *buf_addr;
> +	struct workqueue_struct *flush_wq;
> +	struct work_struct flush_work;
>   };
>
>   struct intel_guc {
> @@ -167,5 +170,6 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv);
>   int i915_guc_wq_check_space(struct drm_i915_gem_request *rq);
>   void i915_guc_submission_disable(struct drm_i915_private *dev_priv);
>   void i915_guc_submission_fini(struct drm_i915_private *dev_priv);
> +void i915_guc_capture_logs(struct drm_i915_private *dev_priv);
>
>   #endif
>

Regards,

Tvrtko
akash.goel@intel.com Aug. 15, 2016, 3:57 p.m. UTC | #2
On 8/15/2016 8:50 PM, Tvrtko Ursulin wrote:
>
>
> On 15/08/16 15:49, akash.goel@intel.com wrote:
>> From: Sagar Arun Kamble <sagar.a.kamble@intel.com>
>>
>> GuC ukernel sends an interrupt to Host to flush the log buffer
>> and expects Host to correspondingly update the read pointer
>> information in the state structure, once it has consumed the
>> log buffer contents by copying them to a file or buffer.
>> Even if Host couldn't copy the contents, it can still update the
>> read pointer so that logging state is not disturbed on GuC side.
>>
>> v2:
>> - Use a dedicated workqueue for handling flush interrupt. (Tvrtko)
>> - Reduce the overall log buffer copying time by skipping the copy of
>>    crash buffer area for regular cases and copying only the state
>>    structure data in first page.
>>
>> v3:
>>   - Create a vmalloc mapping of log buffer. (Chris)
>>   - Cover the flush acknowledgment under rpm get & put.(Chris)
>>   - Revert the change of skipping the copy of crash dump area, as
>>     not really needed, will be covered by subsequent patch.
>>
>> v4:
>>   - Destroy the wq under the same condition in which it was created,
>>     pass dev_piv pointer instead of dev to newly added GuC function,
>>     add more comments & rename variable for clarity. (Tvrtko)
>>
>> v5:
>> - Allocate & destroy the dedicated wq, for handling flush interrupt,
>>    from the setup/teardown routines of GuC logging. (Chris)
>> - Validate the log buffer size value retrieved from state structure
>>    and do some minor cleanup. (Tvrtko)
>> - Fix error/warnings reported by checkpatch. (Tvrtko)
>> - Rebase.
>>
>> Signed-off-by: Sagar Arun Kamble <sagar.a.kamble@intel.com>
>> Signed-off-by: Akash Goel <akash.goel@intel.com>
>> ---
>>   drivers/gpu/drm/i915/i915_guc_submission.c | 202
>> +++++++++++++++++++++++++++++
>>   drivers/gpu/drm/i915/i915_irq.c            |  29 ++++-
>>   drivers/gpu/drm/i915/intel_guc.h           |   4 +
>>   3 files changed, 234 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c
>> b/drivers/gpu/drm/i915/i915_guc_submission.c
>> index b062da6..2b27b87 100644
>> --- a/drivers/gpu/drm/i915/i915_guc_submission.c
>> +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
>> @@ -172,6 +172,15 @@ static int host2guc_sample_forcewake(struct
>> intel_guc *guc,
>>       return host2guc_action(guc, data, ARRAY_SIZE(data));
>>   }
>>
>> +static int host2guc_logbuffer_flush_complete(struct intel_guc *guc)
>> +{
>> +    u32 data[1];
>> +
>> +    data[0] = HOST2GUC_ACTION_LOG_BUFFER_FILE_FLUSH_COMPLETE;
>> +
>> +    return host2guc_action(guc, data, 1);
>> +}
>> +
>>   /*
>>    * Initialise, update, or clear doorbell data shared with the GuC
>>    *
>> @@ -828,6 +837,179 @@ err:
>>       return NULL;
>>   }
>>
>> +static void guc_move_to_next_buf(struct intel_guc *guc)
>> +{
>> +}
>> +
>> +static void *guc_get_write_buffer(struct intel_guc *guc)
>> +{
>> +    return NULL;
>> +}
>> +
>> +static void guc_read_update_log_buffer(struct intel_guc *guc)
>> +{
>> +    struct guc_log_buffer_state *log_buffer_state,
>> *log_buffer_snapshot_state;
>> +    struct guc_log_buffer_state log_buffer_state_local;
>> +    void *src_data_ptr, *dst_data_ptr;
>> +    unsigned int buffer_size, expected_size;
>> +    enum guc_log_buffer_type type;
>> +
>> +    if (WARN_ON(!guc->log.buf_addr))
>> +        return;
>> +
>> +    /* Get the pointer to shared GuC log buffer */
>> +    log_buffer_state = src_data_ptr = guc->log.buf_addr;
>> +
>> +    /* Get the pointer to local buffer to store the logs */
>> +    dst_data_ptr = log_buffer_snapshot_state =
>> guc_get_write_buffer(guc);
>> +
>> +    /* Actual logs are present from the 2nd page */
>> +    src_data_ptr += PAGE_SIZE;
>> +    dst_data_ptr += PAGE_SIZE;
>> +
>> +    for (type = GUC_ISR_LOG_BUFFER; type < GUC_MAX_LOG_BUFFER; type++) {
>> +        /* Make a copy of the state structure in GuC log buffer (which
>> +         * is uncached mapped) on the stack to avoid reading from it
>> +         * multiple times.
>> +         */
>> +        memcpy(&log_buffer_state_local, log_buffer_state,
>> +               sizeof(struct guc_log_buffer_state));
>> +        buffer_size = log_buffer_state_local.size;
>> +
>> +        if (log_buffer_snapshot_state) {
>> +            /* First copy the state structure in snapshot buffer */
>> +            memcpy(log_buffer_snapshot_state, &log_buffer_state_local,
>> +                   sizeof(struct guc_log_buffer_state));
>> +
>> +            /* The write pointer could have been updated by the GuC
>> +             * firmware, after sending the flush interrupt to Host,
>> +             * for consistency set the write pointer value to same
>> +             * value of sampled_write_ptr in the snapshot buffer.
>> +             */
>> +            log_buffer_snapshot_state->write_ptr =
>> +                log_buffer_snapshot_state->sampled_write_ptr;
>> +
>> +            log_buffer_snapshot_state++;
>> +
>> +            /* Now copy the actual logs, but before that validate
>> +             * the buffer size value retrieved from state structure.
>> +             */
>> +            if (type == GUC_ISR_LOG_BUFFER)
>> +                expected_size = (GUC_LOG_ISR_PAGES+1)*PAGE_SIZE;
>> +            else if (type == GUC_DPC_LOG_BUFFER)
>> +                expected_size = (GUC_LOG_DPC_PAGES+1)*PAGE_SIZE;
>> +            else
>> +                expected_size = (GUC_LOG_CRASH_PAGES+1)*PAGE_SIZE;
>
> Candidate for extraction to a helper function - to keep the body of the
> loop shorter.
>
Did mulled about this, then finally decided to do it as per your comment.
So would this be fine ?
	expected_size = guc_get_log_buffer_size(type);
And will define the helper in this source file only.

> I would also do this above after the line which loads the buffer_size so
> operations are logically together.
>
> And a nitpick: spaces around arithmetic operators are standard -
> checkpatch did not complain? Maybe it does only in strict mode?
>
Yes it did complain but then had to remove the space to avoid the 80 
character warning.

>> +
>> +            if (unlikely(buffer_size != expected_size)) {
>> +                DRM_ERROR("unexpected log buffer size\n");
>> +                /* Continue with further copying, already state
>> +                 * structure has been copied which is enough to
>> +                 * let Userspace know about the anomaly.
>> +                 */
>> +                buffer_size = expected_size;
>
> Would it be worth checking and protecting against reported buffer size
> being smaller than expected?
>
Fine, will then select the minimum of 2
	buffer_size = min(buffer_size, expected_size);

>> +            }
>> +
>> +            memcpy(dst_data_ptr, src_data_ptr, buffer_size);
>> +
>> +            src_data_ptr += buffer_size;
>> +            dst_data_ptr += buffer_size;
>> +        }
>> +
>> +        /* FIXME: invalidate/flush for log buffer needed */
>> +
>> +        /* Update the read pointer in the shared log buffer */
>> +        log_buffer_state->read_ptr =
>> +            log_buffer_state_local.sampled_write_ptr;
>> +
>> +        /* Clear the 'flush to file' flag */
>> +        log_buffer_state->flush_to_file = 0;
>> +        log_buffer_state++;
>> +    }
>> +
>> +    if (log_buffer_snapshot_state)
>> +        guc_move_to_next_buf(guc);
>> +}
>> +
>> +static void guc_capture_logs_work(struct work_struct *work)
>> +{
>> +    struct drm_i915_private *dev_priv =
>> +        container_of(work, struct drm_i915_private, guc.log.flush_work);
>> +
>> +    spin_lock_irq(&dev_priv->irq_lock);
>> +    /* Speed up work cancellation during disabling guc interrupts. */
>
> Hm I can't see work cancellation in this patch?
>
There is no explicit work cancellation done from anywhere, just 
interrupt disabling is there.

>> +    if (!dev_priv->guc.interrupts_enabled) {
>> +        spin_unlock_irq(&dev_priv->irq_lock);
>> +        return;
>> +    }
>> +    spin_unlock_irq(&dev_priv->irq_lock);
>
> Can gen9_disable_guc_interrupts run on another CPU just at this point?
> So after a spinlock here has been released. If so the spinlock dance
> above does not do anything.
>
Yes gen9_disable_guc_interrupts() can run on another CPU (in case of 
forceful suspend or disabling logging through debugfs control) after the 
spinlock has been released.
But this check is for the case where interrupt disabling happens just
before this work item starts the execution.

Will consider removing this check.

>> +
>> +    i915_guc_capture_logs(dev_priv);
>> +}
>> +
>> +static void guc_log_cleanup(struct intel_guc *guc)
>> +{
>> +    struct drm_i915_private *dev_priv = guc_to_i915(guc);
>> +
>> +    lockdep_assert_held(&dev_priv->drm.struct_mutex);
>> +
>> +    if (i915.guc_log_level < 0)
>> +        return;
>> +
>> +    /* First disable the flush interrupt */
>> +    gen9_disable_guc_interrupts(dev_priv);
>> +
>> +    if (guc->log.flush_wq)
>> +        destroy_workqueue(guc->log.flush_wq);
>
> It doesn't need cancel_work_sync before or something?
>
This is done by destroy_workqueue only, any work items queued on the wq 
are first flushed out before wq is destroyed.

>> +
>> +    guc->log.flush_wq = NULL;
>> +
>> +    if (guc->log.buf_addr)
>> +        i915_gem_object_unpin_map(guc->log.vma->obj);
>> +
>> +    guc->log.buf_addr = NULL;
>> +}
>> +
>> +static int guc_create_log_extras(struct intel_guc *guc)
>> +{
>> +    struct drm_i915_private *dev_priv = guc_to_i915(guc);
>> +    void *vaddr;
>> +    int ret;
>> +
>> +    lockdep_assert_held(&dev_priv->drm.struct_mutex);
>> +
>> +    /* Nothing to do */
>> +    if (i915.guc_log_level < 0)
>> +        return 0;
>> +
>> +    if (!guc->log.buf_addr) {
>> +        /* Create a vmalloc mapping of log buffer pages */
>> +        vaddr = i915_gem_object_pin_map(guc->log.vma->obj, I915_MAP_WB);
>> +        if (IS_ERR(vaddr)) {
>> +            ret = PTR_ERR(vaddr);
>> +            DRM_ERROR("Couldn't map log buffer pages %d\n", ret);
>> +            return ret;
>> +        }
>> +
>> +        guc->log.buf_addr = vaddr;
>> +    }
>> +
>> +    if (!guc->log.flush_wq) {
>> +        INIT_WORK(&guc->log.flush_work, guc_capture_logs_work);
>> +
>> +        /* Need a dedicated wq to process log buffer flush interrupts
>> +         * from GuC without much delay so as to avoid any loss of logs.
>> +         */
>> +        guc->log.flush_wq = alloc_ordered_workqueue("i915-guc_log", 0);
>> +        if (guc->log.flush_wq == NULL) {
>> +            DRM_ERROR("Couldn't allocate the wq for GuC logging\n");
>> +            return -ENOMEM;
>> +        }
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>>   static void guc_create_log(struct intel_guc *guc)
>>   {
>>       struct i915_vma *vma;
>> @@ -853,6 +1035,13 @@ static void guc_create_log(struct intel_guc *guc)
>>           }
>>
>>           guc->log.vma = vma;
>> +
>> +        if (guc_create_log_extras(guc)) {
>> +            guc_log_cleanup(guc);
>> +            i915_vma_unpin_and_release(&guc->log.vma);
>> +            i915.guc_log_level = -1;
>> +            return;
>> +        }
>>       }
>>
>>       /* each allocated unit is a page */
>> @@ -1034,6 +1223,7 @@ void i915_guc_submission_fini(struct
>> drm_i915_private *dev_priv)
>>       struct intel_guc *guc = &dev_priv->guc;
>>
>>       i915_vma_unpin_and_release(&guc->ads_vma);
>> +    guc_log_cleanup(guc);
>>       i915_vma_unpin_and_release(&guc->log.vma);
>
> Would it make sense to move the unpin_and_release into the guc_log_cleanup?
>
Have moved guc_log_cleanup to i915_guc_unregister() function in the 
later patch (to match the i915_guc_register()).
Would like to keep unpin_and_release here only to match the 
guc_submission_init().

>>
>>       if (guc->ctx_pool_vma)
>> @@ -1095,3 +1285,15 @@ int intel_guc_resume(struct drm_device *dev)
>>
>>       return host2guc_action(guc, data, ARRAY_SIZE(data));
>>   }
>> +
>> +void i915_guc_capture_logs(struct drm_i915_private *dev_priv)
>> +{
>> +    guc_read_update_log_buffer(&dev_priv->guc);
>> +
>> +    /* Generally device is expected to be active only at this
>> +     * time, so get/put should be really quick.
>> +     */
>> +    intel_runtime_pm_get(dev_priv);
>> +    host2guc_logbuffer_flush_complete(&dev_priv->guc);
>> +    intel_runtime_pm_put(dev_priv);
>> +}
>> diff --git a/drivers/gpu/drm/i915/i915_irq.c
>> b/drivers/gpu/drm/i915/i915_irq.c
>> index eb4774f..7e43d7d 100644
>> --- a/drivers/gpu/drm/i915/i915_irq.c
>> +++ b/drivers/gpu/drm/i915/i915_irq.c
>> @@ -1668,7 +1668,34 @@ static void gen9_guc_irq_handler(struct
>> drm_i915_private *dev_priv, u32 gt_iir)
>>           interrupts_enabled = dev_priv->guc.interrupts_enabled;
>>           spin_unlock(&dev_priv->irq_lock);
>>           if (interrupts_enabled) {
>> -            /* TODO: Handle events for which GuC interrupted host */
>> +            /* Sample the log buffer flush related bits & clear them
>> +             * out now itself from the message identity register to
>> +             * minimize the probability of losing a flush interrupt,
>> +             * when there are back to back flush interrupts.
>> +             * There can be a new flush interrupt, for different log
>> +             * buffer type (like for ISR), whilst Host is handling
>> +             * one (for DPC). Since same bit is used in message
>> +             * register for ISR & DPC, it could happen that GuC
>> +             * sets the bit for 2nd interrupt but Host clears out
>> +             * the bit on handling the 1st interrupt.
>> +             */
>> +            u32 msg, flush;
>> +
>> +            msg = I915_READ(SOFT_SCRATCH(15));
>> +            flush = msg & (GUC2HOST_MSG_CRASH_DUMP_POSTED |
>> +                       GUC2HOST_MSG_FLUSH_LOG_BUFFER);
>> +            if (flush) {
>> +                /* Clear the message bits that are handled */
>> +                I915_WRITE(SOFT_SCRATCH(15), msg & ~flush);
>> +
>> +                /* Handle flush interrupt in bottom half */
>> +                queue_work(dev_priv->guc.log.flush_wq,
>> +                       &dev_priv->guc.log.flush_work);
>> +            } else {
>> +                /* Not clearing of unhandled event bits won't
>> +                 * result in re-triggering of the interrupt.
>> +                 */
>> +            }
>>           }
>>       }
>>   }
>> diff --git a/drivers/gpu/drm/i915/intel_guc.h
>> b/drivers/gpu/drm/i915/intel_guc.h
>> index 1fc63fe..d053a18 100644
>> --- a/drivers/gpu/drm/i915/intel_guc.h
>> +++ b/drivers/gpu/drm/i915/intel_guc.h
>> @@ -124,6 +124,9 @@ struct intel_guc_fw {
>>   struct intel_guc_log {
>>       uint32_t flags;
>>       struct i915_vma *vma;
>> +    void *buf_addr;
>> +    struct workqueue_struct *flush_wq;
>> +    struct work_struct flush_work;
>>   };
>>
>>   struct intel_guc {
>> @@ -167,5 +170,6 @@ int i915_guc_submission_enable(struct
>> drm_i915_private *dev_priv);
>>   int i915_guc_wq_check_space(struct drm_i915_gem_request *rq);
>>   void i915_guc_submission_disable(struct drm_i915_private *dev_priv);
>>   void i915_guc_submission_fini(struct drm_i915_private *dev_priv);
>> +void i915_guc_capture_logs(struct drm_i915_private *dev_priv);
>>
>>   #endif
>>
>
> Regards,
>
> Tvrtko
Chris Wilson Aug. 15, 2016, 4:06 p.m. UTC | #3
On Mon, Aug 15, 2016 at 08:19:47PM +0530, akash.goel@intel.com wrote:
> +static void guc_read_update_log_buffer(struct intel_guc *guc)
> +{
> +	struct guc_log_buffer_state *log_buffer_state, *log_buffer_snapshot_state;
> +	struct guc_log_buffer_state log_buffer_state_local;
> +	void *src_data_ptr, *dst_data_ptr;
> +	unsigned int buffer_size, expected_size;
> +	enum guc_log_buffer_type type;
> +
> +	if (WARN_ON(!guc->log.buf_addr))
> +		return;
> +
> +	/* Get the pointer to shared GuC log buffer */
> +	log_buffer_state = src_data_ptr = guc->log.buf_addr;
> +
> +	/* Get the pointer to local buffer to store the logs */
> +	dst_data_ptr = log_buffer_snapshot_state = guc_get_write_buffer(guc);
> +
> +	/* Actual logs are present from the 2nd page */
> +	src_data_ptr += PAGE_SIZE;
> +	dst_data_ptr += PAGE_SIZE;
> +
> +	for (type = GUC_ISR_LOG_BUFFER; type < GUC_MAX_LOG_BUFFER; type++) {
> +		/* Make a copy of the state structure in GuC log buffer (which
> +		 * is uncached mapped) on the stack to avoid reading from it
> +		 * multiple times.
> +		 */
> +		memcpy(&log_buffer_state_local, log_buffer_state,
> +		       sizeof(struct guc_log_buffer_state));
> +		buffer_size = log_buffer_state_local.size;
> +
> +		if (log_buffer_snapshot_state) {
> +			/* First copy the state structure in snapshot buffer */
> +			memcpy(log_buffer_snapshot_state, &log_buffer_state_local,
> +			       sizeof(struct guc_log_buffer_state));
> +
> +			/* The write pointer could have been updated by the GuC
> +			 * firmware, after sending the flush interrupt to Host,
> +			 * for consistency set the write pointer value to same
> +			 * value of sampled_write_ptr in the snapshot buffer.
> +			 */
> +			log_buffer_snapshot_state->write_ptr =
> +				log_buffer_snapshot_state->sampled_write_ptr;
> +
> +			log_buffer_snapshot_state++;
> +
> +			/* Now copy the actual logs, but before that validate
> +			 * the buffer size value retrieved from state structure.
> +			 */
> +			if (type == GUC_ISR_LOG_BUFFER)
> +				expected_size = (GUC_LOG_ISR_PAGES+1)*PAGE_SIZE;
> +			else if (type == GUC_DPC_LOG_BUFFER)
> +				expected_size = (GUC_LOG_DPC_PAGES+1)*PAGE_SIZE;
> +			else
> +				expected_size = (GUC_LOG_CRASH_PAGES+1)*PAGE_SIZE;
> +
> +			if (unlikely(buffer_size != expected_size)) {
> +				DRM_ERROR("unexpected log buffer size\n");
> +				/* Continue with further copying, already state
> +				 * structure has been copied which is enough to
> +				 * let Userspace know about the anomaly.
> +				 */
> +				buffer_size = expected_size;

Urm, no.

You tell userspace one thing and then do another. This code should just
be a conduit and not apply its own outdated interpretation.

> +			}
> +
> +			memcpy(dst_data_ptr, src_data_ptr, buffer_size);

Where do you validate that buffer_size is sane before copying?
-Chris
akash.goel@intel.com Aug. 15, 2016, 4:46 p.m. UTC | #4
On 8/15/2016 9:36 PM, Chris Wilson wrote:
> On Mon, Aug 15, 2016 at 08:19:47PM +0530, akash.goel@intel.com wrote:
>> +static void guc_read_update_log_buffer(struct intel_guc *guc)
>> +{
>> +	struct guc_log_buffer_state *log_buffer_state, *log_buffer_snapshot_state;
>> +	struct guc_log_buffer_state log_buffer_state_local;
>> +	void *src_data_ptr, *dst_data_ptr;
>> +	unsigned int buffer_size, expected_size;
>> +	enum guc_log_buffer_type type;
>> +
>> +	if (WARN_ON(!guc->log.buf_addr))
>> +		return;
>> +
>> +	/* Get the pointer to shared GuC log buffer */
>> +	log_buffer_state = src_data_ptr = guc->log.buf_addr;
>> +
>> +	/* Get the pointer to local buffer to store the logs */
>> +	dst_data_ptr = log_buffer_snapshot_state = guc_get_write_buffer(guc);
>> +
>> +	/* Actual logs are present from the 2nd page */
>> +	src_data_ptr += PAGE_SIZE;
>> +	dst_data_ptr += PAGE_SIZE;
>> +
>> +	for (type = GUC_ISR_LOG_BUFFER; type < GUC_MAX_LOG_BUFFER; type++) {
>> +		/* Make a copy of the state structure in GuC log buffer (which
>> +		 * is uncached mapped) on the stack to avoid reading from it
>> +		 * multiple times.
>> +		 */
>> +		memcpy(&log_buffer_state_local, log_buffer_state,
>> +		       sizeof(struct guc_log_buffer_state));
>> +		buffer_size = log_buffer_state_local.size;
>> +
>> +		if (log_buffer_snapshot_state) {
>> +			/* First copy the state structure in snapshot buffer */
>> +			memcpy(log_buffer_snapshot_state, &log_buffer_state_local,
>> +			       sizeof(struct guc_log_buffer_state));
>> +
>> +			/* The write pointer could have been updated by the GuC
>> +			 * firmware, after sending the flush interrupt to Host,
>> +			 * for consistency set the write pointer value to same
>> +			 * value of sampled_write_ptr in the snapshot buffer.
>> +			 */
>> +			log_buffer_snapshot_state->write_ptr =
>> +				log_buffer_snapshot_state->sampled_write_ptr;
>> +
>> +			log_buffer_snapshot_state++;
>> +
>> +			/* Now copy the actual logs, but before that validate
>> +			 * the buffer size value retrieved from state structure.
>> +			 */
>> +			if (type == GUC_ISR_LOG_BUFFER)
>> +				expected_size = (GUC_LOG_ISR_PAGES+1)*PAGE_SIZE;
>> +			else if (type == GUC_DPC_LOG_BUFFER)
>> +				expected_size = (GUC_LOG_DPC_PAGES+1)*PAGE_SIZE;
>> +			else
>> +				expected_size = (GUC_LOG_CRASH_PAGES+1)*PAGE_SIZE;
>> +
>> +			if (unlikely(buffer_size != expected_size)) {
>> +				DRM_ERROR("unexpected log buffer size\n");
>> +				/* Continue with further copying, already state
>> +				 * structure has been copied which is enough to
>> +				 * let Userspace know about the anomaly.
>> +				 */
>> +				buffer_size = expected_size;
>
> Urm, no.
>
> You tell userspace one thing and then do another. This code should just
> be a conduit and not apply its own outdated interpretation.
>
Userspace parser would get to know from the state structure about the 
anomalous buffer size.

Please suggest that what should be done here ideally.

Should the further copying (for this snapshot) be skipped ?

>> +			}
>> +
>> +			memcpy(dst_data_ptr, src_data_ptr, buffer_size);
>
> Where do you validate that buffer_size is sane before copying?
Sorry didn't get you, the check for buffer_size is being done right 
before this memcpy.

Best regards
Akash
> -Chris
>
Chris Wilson Aug. 15, 2016, 4:56 p.m. UTC | #5
On Mon, Aug 15, 2016 at 10:16:56PM +0530, Goel, Akash wrote:
> 
> 
> On 8/15/2016 9:36 PM, Chris Wilson wrote:
> >On Mon, Aug 15, 2016 at 08:19:47PM +0530, akash.goel@intel.com wrote:
> >>+static void guc_read_update_log_buffer(struct intel_guc *guc)
> >>+{
> >>+	struct guc_log_buffer_state *log_buffer_state, *log_buffer_snapshot_state;
> >>+	struct guc_log_buffer_state log_buffer_state_local;
> >>+	void *src_data_ptr, *dst_data_ptr;
> >>+	unsigned int buffer_size, expected_size;
> >>+	enum guc_log_buffer_type type;
> >>+
> >>+	if (WARN_ON(!guc->log.buf_addr))
> >>+		return;
> >>+
> >>+	/* Get the pointer to shared GuC log buffer */
> >>+	log_buffer_state = src_data_ptr = guc->log.buf_addr;
> >>+
> >>+	/* Get the pointer to local buffer to store the logs */
> >>+	dst_data_ptr = log_buffer_snapshot_state = guc_get_write_buffer(guc);
> >>+
> >>+	/* Actual logs are present from the 2nd page */
> >>+	src_data_ptr += PAGE_SIZE;
> >>+	dst_data_ptr += PAGE_SIZE;
> >>+
> >>+	for (type = GUC_ISR_LOG_BUFFER; type < GUC_MAX_LOG_BUFFER; type++) {
> >>+		/* Make a copy of the state structure in GuC log buffer (which
> >>+		 * is uncached mapped) on the stack to avoid reading from it
> >>+		 * multiple times.
> >>+		 */
> >>+		memcpy(&log_buffer_state_local, log_buffer_state,
> >>+		       sizeof(struct guc_log_buffer_state));
> >>+		buffer_size = log_buffer_state_local.size;
> >>+
> >>+		if (log_buffer_snapshot_state) {
> >>+			/* First copy the state structure in snapshot buffer */
> >>+			memcpy(log_buffer_snapshot_state, &log_buffer_state_local,
> >>+			       sizeof(struct guc_log_buffer_state));
> >>+
> >>+			/* The write pointer could have been updated by the GuC
> >>+			 * firmware, after sending the flush interrupt to Host,
> >>+			 * for consistency set the write pointer value to same
> >>+			 * value of sampled_write_ptr in the snapshot buffer.
> >>+			 */
> >>+			log_buffer_snapshot_state->write_ptr =
> >>+				log_buffer_snapshot_state->sampled_write_ptr;
> >>+
> >>+			log_buffer_snapshot_state++;
> >>+
> >>+			/* Now copy the actual logs, but before that validate
> >>+			 * the buffer size value retrieved from state structure.
> >>+			 */
> >>+			if (type == GUC_ISR_LOG_BUFFER)
> >>+				expected_size = (GUC_LOG_ISR_PAGES+1)*PAGE_SIZE;
> >>+			else if (type == GUC_DPC_LOG_BUFFER)
> >>+				expected_size = (GUC_LOG_DPC_PAGES+1)*PAGE_SIZE;
> >>+			else
> >>+				expected_size = (GUC_LOG_CRASH_PAGES+1)*PAGE_SIZE;
> >>+
> >>+			if (unlikely(buffer_size != expected_size)) {
> >>+				DRM_ERROR("unexpected log buffer size\n");
> >>+				/* Continue with further copying, already state
> >>+				 * structure has been copied which is enough to
> >>+				 * let Userspace know about the anomaly.
> >>+				 */
> >>+				buffer_size = expected_size;
> >
> >Urm, no.
> >
> >You tell userspace one thing and then do another. This code should just
> >be a conduit and not apply its own outdated interpretation.
> >
> Userspace parser would get to know from the state structure about
> the anomalous buffer size.

It will, but it won't be told what the kernel did. So if believes the
GuC (as it should since it is a packet that should be unadulterated) the
entire stream is then corrupt.

> Please suggest that what should be done here ideally.
> 
> Should the further copying (for this snapshot) be skipped ?

The kernel should be avoiding interpretting the log packets as much as
possible - I would prefer it if we just moved the byte stream without
trying to interpret it as datagrams. But there is probably some merit to
at least using the log packets (datagrams).

> >>+			}
> >>+
> >>+			memcpy(dst_data_ptr, src_data_ptr, buffer_size);
> >
> >Where do you validate that buffer_size is sane before copying?
> Sorry didn't get you, the check for buffer_size is being done right
> before this memcpy.

There is no explicit check for valid src_data_ptr + buffer_size or
dst_data_ptr + buffer_size, and a quick glance at the code suggested no
reason to believe they must be valid.
-Chris
akash.goel@intel.com Aug. 16, 2016, 5:37 a.m. UTC | #6
On 8/15/2016 10:26 PM, Chris Wilson wrote:
> On Mon, Aug 15, 2016 at 10:16:56PM +0530, Goel, Akash wrote:
>>
>>
>> On 8/15/2016 9:36 PM, Chris Wilson wrote:
>>> On Mon, Aug 15, 2016 at 08:19:47PM +0530, akash.goel@intel.com wrote:
>>>> +static void guc_read_update_log_buffer(struct intel_guc *guc)
>>>> +{
>>>> +	struct guc_log_buffer_state *log_buffer_state, *log_buffer_snapshot_state;
>>>> +	struct guc_log_buffer_state log_buffer_state_local;
>>>> +	void *src_data_ptr, *dst_data_ptr;
>>>> +	unsigned int buffer_size, expected_size;
>>>> +	enum guc_log_buffer_type type;
>>>> +
>>>> +	if (WARN_ON(!guc->log.buf_addr))
>>>> +		return;
>>>> +
>>>> +	/* Get the pointer to shared GuC log buffer */
>>>> +	log_buffer_state = src_data_ptr = guc->log.buf_addr;
>>>> +
>>>> +	/* Get the pointer to local buffer to store the logs */
>>>> +	dst_data_ptr = log_buffer_snapshot_state = guc_get_write_buffer(guc);
>>>> +
>>>> +	/* Actual logs are present from the 2nd page */
>>>> +	src_data_ptr += PAGE_SIZE;
>>>> +	dst_data_ptr += PAGE_SIZE;
>>>> +
>>>> +	for (type = GUC_ISR_LOG_BUFFER; type < GUC_MAX_LOG_BUFFER; type++) {
>>>> +		/* Make a copy of the state structure in GuC log buffer (which
>>>> +		 * is uncached mapped) on the stack to avoid reading from it
>>>> +		 * multiple times.
>>>> +		 */
>>>> +		memcpy(&log_buffer_state_local, log_buffer_state,
>>>> +		       sizeof(struct guc_log_buffer_state));
>>>> +		buffer_size = log_buffer_state_local.size;
>>>> +
>>>> +		if (log_buffer_snapshot_state) {
>>>> +			/* First copy the state structure in snapshot buffer */
>>>> +			memcpy(log_buffer_snapshot_state, &log_buffer_state_local,
>>>> +			       sizeof(struct guc_log_buffer_state));
>>>> +
>>>> +			/* The write pointer could have been updated by the GuC
>>>> +			 * firmware, after sending the flush interrupt to Host,
>>>> +			 * for consistency set the write pointer value to same
>>>> +			 * value of sampled_write_ptr in the snapshot buffer.
>>>> +			 */
>>>> +			log_buffer_snapshot_state->write_ptr =
>>>> +				log_buffer_snapshot_state->sampled_write_ptr;
>>>> +
>>>> +			log_buffer_snapshot_state++;
>>>> +
>>>> +			/* Now copy the actual logs, but before that validate
>>>> +			 * the buffer size value retrieved from state structure.
>>>> +			 */
>>>> +			if (type == GUC_ISR_LOG_BUFFER)
>>>> +				expected_size = (GUC_LOG_ISR_PAGES+1)*PAGE_SIZE;
>>>> +			else if (type == GUC_DPC_LOG_BUFFER)
>>>> +				expected_size = (GUC_LOG_DPC_PAGES+1)*PAGE_SIZE;
>>>> +			else
>>>> +				expected_size = (GUC_LOG_CRASH_PAGES+1)*PAGE_SIZE;
>>>> +
>>>> +			if (unlikely(buffer_size != expected_size)) {
>>>> +				DRM_ERROR("unexpected log buffer size\n");
>>>> +				/* Continue with further copying, already state
>>>> +				 * structure has been copied which is enough to
>>>> +				 * let Userspace know about the anomaly.
>>>> +				 */
>>>> +				buffer_size = expected_size;
>>>
>>> Urm, no.
>>>
>>> You tell userspace one thing and then do another. This code should just
>>> be a conduit and not apply its own outdated interpretation.
>>>
>> Userspace parser would get to know from the state structure about
>> the anomalous buffer size.
>
> It will, but it won't be told what the kernel did. So if believes the
> GuC (as it should since it is a packet that should be unadulterated) the
> entire stream is then corrupt.
>
>> Please suggest that what should be done here ideally.
>>
>> Should the further copying (for this snapshot) be skipped ?
>
> The kernel should be avoiding interpretting the log packets as much as
> possible - I would prefer it if we just moved the byte stream without
> trying to interpret it as datagrams. But there is probably some merit to
> at least using the log packets (datagrams).
>
It would have been ideal if log packets can be dumped without any 
interpretation.

We copy the payload without any interpretation, only some bits of header 
we parse.

We also have to interpret the header (in subsequent patch) to copy only 
the updated payload data, for better performance.

>>>> +			}
>>>> +
>>>> +			memcpy(dst_data_ptr, src_data_ptr, buffer_size);
>>>
>>> Where do you validate that buffer_size is sane before copying?
>> Sorry didn't get you, the check for buffer_size is being done right
>> before this memcpy.
>
> There is no explicit check for valid src_data_ptr + buffer_size or
> dst_data_ptr + buffer_size, and a quick glance at the code suggested no
> reason to believe they must be valid.
Actually if buffer_size has been validated & corrected, then both 
src_data_ptr + buffer_size and dst_data_ptr + buffer_size should be 
valid. Both buffers have been allocated by Driver.

Will avoid reading the buffer_size from the header and just move the 
pointer & do the copy as per the expected buffer size, which cannot go 
wrong (out of bounds) as Driver only allocated the log buffer.

Best regards
Akash

> -Chris
>
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index b062da6..2b27b87 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -172,6 +172,15 @@  static int host2guc_sample_forcewake(struct intel_guc *guc,
 	return host2guc_action(guc, data, ARRAY_SIZE(data));
 }
 
+static int host2guc_logbuffer_flush_complete(struct intel_guc *guc)
+{
+	u32 data[1];
+
+	data[0] = HOST2GUC_ACTION_LOG_BUFFER_FILE_FLUSH_COMPLETE;
+
+	return host2guc_action(guc, data, 1);
+}
+
 /*
  * Initialise, update, or clear doorbell data shared with the GuC
  *
@@ -828,6 +837,179 @@  err:
 	return NULL;
 }
 
+static void guc_move_to_next_buf(struct intel_guc *guc)
+{
+}
+
+static void *guc_get_write_buffer(struct intel_guc *guc)
+{
+	return NULL;
+}
+
+static void guc_read_update_log_buffer(struct intel_guc *guc)
+{
+	struct guc_log_buffer_state *log_buffer_state, *log_buffer_snapshot_state;
+	struct guc_log_buffer_state log_buffer_state_local;
+	void *src_data_ptr, *dst_data_ptr;
+	unsigned int buffer_size, expected_size;
+	enum guc_log_buffer_type type;
+
+	if (WARN_ON(!guc->log.buf_addr))
+		return;
+
+	/* Get the pointer to shared GuC log buffer */
+	log_buffer_state = src_data_ptr = guc->log.buf_addr;
+
+	/* Get the pointer to local buffer to store the logs */
+	dst_data_ptr = log_buffer_snapshot_state = guc_get_write_buffer(guc);
+
+	/* Actual logs are present from the 2nd page */
+	src_data_ptr += PAGE_SIZE;
+	dst_data_ptr += PAGE_SIZE;
+
+	for (type = GUC_ISR_LOG_BUFFER; type < GUC_MAX_LOG_BUFFER; type++) {
+		/* Make a copy of the state structure in GuC log buffer (which
+		 * is uncached mapped) on the stack to avoid reading from it
+		 * multiple times.
+		 */
+		memcpy(&log_buffer_state_local, log_buffer_state,
+		       sizeof(struct guc_log_buffer_state));
+		buffer_size = log_buffer_state_local.size;
+
+		if (log_buffer_snapshot_state) {
+			/* First copy the state structure in snapshot buffer */
+			memcpy(log_buffer_snapshot_state, &log_buffer_state_local,
+			       sizeof(struct guc_log_buffer_state));
+
+			/* The write pointer could have been updated by the GuC
+			 * firmware, after sending the flush interrupt to Host,
+			 * for consistency set the write pointer value to same
+			 * value of sampled_write_ptr in the snapshot buffer.
+			 */
+			log_buffer_snapshot_state->write_ptr =
+				log_buffer_snapshot_state->sampled_write_ptr;
+
+			log_buffer_snapshot_state++;
+
+			/* Now copy the actual logs, but before that validate
+			 * the buffer size value retrieved from state structure.
+			 */
+			if (type == GUC_ISR_LOG_BUFFER)
+				expected_size = (GUC_LOG_ISR_PAGES+1)*PAGE_SIZE;
+			else if (type == GUC_DPC_LOG_BUFFER)
+				expected_size = (GUC_LOG_DPC_PAGES+1)*PAGE_SIZE;
+			else
+				expected_size = (GUC_LOG_CRASH_PAGES+1)*PAGE_SIZE;
+
+			if (unlikely(buffer_size != expected_size)) {
+				DRM_ERROR("unexpected log buffer size\n");
+				/* Continue with further copying, already state
+				 * structure has been copied which is enough to
+				 * let Userspace know about the anomaly.
+				 */
+				buffer_size = expected_size;
+			}
+
+			memcpy(dst_data_ptr, src_data_ptr, buffer_size);
+
+			src_data_ptr += buffer_size;
+			dst_data_ptr += buffer_size;
+		}
+
+		/* FIXME: invalidate/flush for log buffer needed */
+
+		/* Update the read pointer in the shared log buffer */
+		log_buffer_state->read_ptr =
+			log_buffer_state_local.sampled_write_ptr;
+
+		/* Clear the 'flush to file' flag */
+		log_buffer_state->flush_to_file = 0;
+		log_buffer_state++;
+	}
+
+	if (log_buffer_snapshot_state)
+		guc_move_to_next_buf(guc);
+}
+
+static void guc_capture_logs_work(struct work_struct *work)
+{
+	struct drm_i915_private *dev_priv =
+		container_of(work, struct drm_i915_private, guc.log.flush_work);
+
+	spin_lock_irq(&dev_priv->irq_lock);
+	/* Speed up work cancellation during disabling guc interrupts. */
+	if (!dev_priv->guc.interrupts_enabled) {
+		spin_unlock_irq(&dev_priv->irq_lock);
+		return;
+	}
+	spin_unlock_irq(&dev_priv->irq_lock);
+
+	i915_guc_capture_logs(dev_priv);
+}
+
+static void guc_log_cleanup(struct intel_guc *guc)
+{
+	struct drm_i915_private *dev_priv = guc_to_i915(guc);
+
+	lockdep_assert_held(&dev_priv->drm.struct_mutex);
+
+	if (i915.guc_log_level < 0)
+		return;
+
+	/* First disable the flush interrupt */
+	gen9_disable_guc_interrupts(dev_priv);
+
+	if (guc->log.flush_wq)
+		destroy_workqueue(guc->log.flush_wq);
+
+	guc->log.flush_wq = NULL;
+
+	if (guc->log.buf_addr)
+		i915_gem_object_unpin_map(guc->log.vma->obj);
+
+	guc->log.buf_addr = NULL;
+}
+
+static int guc_create_log_extras(struct intel_guc *guc)
+{
+	struct drm_i915_private *dev_priv = guc_to_i915(guc);
+	void *vaddr;
+	int ret;
+
+	lockdep_assert_held(&dev_priv->drm.struct_mutex);
+
+	/* Nothing to do */
+	if (i915.guc_log_level < 0)
+		return 0;
+
+	if (!guc->log.buf_addr) {
+		/* Create a vmalloc mapping of log buffer pages */
+		vaddr = i915_gem_object_pin_map(guc->log.vma->obj, I915_MAP_WB);
+		if (IS_ERR(vaddr)) {
+			ret = PTR_ERR(vaddr);
+			DRM_ERROR("Couldn't map log buffer pages %d\n", ret);
+			return ret;
+		}
+
+		guc->log.buf_addr = vaddr;
+	}
+
+	if (!guc->log.flush_wq) {
+		INIT_WORK(&guc->log.flush_work, guc_capture_logs_work);
+
+		/* Need a dedicated wq to process log buffer flush interrupts
+		 * from GuC without much delay so as to avoid any loss of logs.
+		 */
+		guc->log.flush_wq = alloc_ordered_workqueue("i915-guc_log", 0);
+		if (guc->log.flush_wq == NULL) {
+			DRM_ERROR("Couldn't allocate the wq for GuC logging\n");
+			return -ENOMEM;
+		}
+	}
+
+	return 0;
+}
+
 static void guc_create_log(struct intel_guc *guc)
 {
 	struct i915_vma *vma;
@@ -853,6 +1035,13 @@  static void guc_create_log(struct intel_guc *guc)
 		}
 
 		guc->log.vma = vma;
+
+		if (guc_create_log_extras(guc)) {
+			guc_log_cleanup(guc);
+			i915_vma_unpin_and_release(&guc->log.vma);
+			i915.guc_log_level = -1;
+			return;
+		}
 	}
 
 	/* each allocated unit is a page */
@@ -1034,6 +1223,7 @@  void i915_guc_submission_fini(struct drm_i915_private *dev_priv)
 	struct intel_guc *guc = &dev_priv->guc;
 
 	i915_vma_unpin_and_release(&guc->ads_vma);
+	guc_log_cleanup(guc);
 	i915_vma_unpin_and_release(&guc->log.vma);
 
 	if (guc->ctx_pool_vma)
@@ -1095,3 +1285,15 @@  int intel_guc_resume(struct drm_device *dev)
 
 	return host2guc_action(guc, data, ARRAY_SIZE(data));
 }
+
+void i915_guc_capture_logs(struct drm_i915_private *dev_priv)
+{
+	guc_read_update_log_buffer(&dev_priv->guc);
+
+	/* Generally device is expected to be active only at this
+	 * time, so get/put should be really quick.
+	 */
+	intel_runtime_pm_get(dev_priv);
+	host2guc_logbuffer_flush_complete(&dev_priv->guc);
+	intel_runtime_pm_put(dev_priv);
+}
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index eb4774f..7e43d7d 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1668,7 +1668,34 @@  static void gen9_guc_irq_handler(struct drm_i915_private *dev_priv, u32 gt_iir)
 		interrupts_enabled = dev_priv->guc.interrupts_enabled;
 		spin_unlock(&dev_priv->irq_lock);
 		if (interrupts_enabled) {
-			/* TODO: Handle events for which GuC interrupted host */
+			/* Sample the log buffer flush related bits & clear them
+			 * out now itself from the message identity register to
+			 * minimize the probability of losing a flush interrupt,
+			 * when there are back to back flush interrupts.
+			 * There can be a new flush interrupt, for different log
+			 * buffer type (like for ISR), whilst Host is handling
+			 * one (for DPC). Since same bit is used in message
+			 * register for ISR & DPC, it could happen that GuC
+			 * sets the bit for 2nd interrupt but Host clears out
+			 * the bit on handling the 1st interrupt.
+			 */
+			u32 msg, flush;
+
+			msg = I915_READ(SOFT_SCRATCH(15));
+			flush = msg & (GUC2HOST_MSG_CRASH_DUMP_POSTED |
+				       GUC2HOST_MSG_FLUSH_LOG_BUFFER);
+			if (flush) {
+				/* Clear the message bits that are handled */
+				I915_WRITE(SOFT_SCRATCH(15), msg & ~flush);
+
+				/* Handle flush interrupt in bottom half */
+				queue_work(dev_priv->guc.log.flush_wq,
+					   &dev_priv->guc.log.flush_work);
+			} else {
+				/* Not clearing of unhandled event bits won't
+				 * result in re-triggering of the interrupt.
+				 */
+			}
 		}
 	}
 }
diff --git a/drivers/gpu/drm/i915/intel_guc.h b/drivers/gpu/drm/i915/intel_guc.h
index 1fc63fe..d053a18 100644
--- a/drivers/gpu/drm/i915/intel_guc.h
+++ b/drivers/gpu/drm/i915/intel_guc.h
@@ -124,6 +124,9 @@  struct intel_guc_fw {
 struct intel_guc_log {
 	uint32_t flags;
 	struct i915_vma *vma;
+	void *buf_addr;
+	struct workqueue_struct *flush_wq;
+	struct work_struct flush_work;
 };
 
 struct intel_guc {
@@ -167,5 +170,6 @@  int i915_guc_submission_enable(struct drm_i915_private *dev_priv);
 int i915_guc_wq_check_space(struct drm_i915_gem_request *rq);
 void i915_guc_submission_disable(struct drm_i915_private *dev_priv);
 void i915_guc_submission_fini(struct drm_i915_private *dev_priv);
+void i915_guc_capture_logs(struct drm_i915_private *dev_priv);
 
 #endif