From ecd1e401ad6bb663c910b0a943dc8a276d1300e2 Mon Sep 17 00:00:00 2001 From: Mika Kuoppala Date: Wed, 5 Feb 2014 15:00:43 +0200 Subject: [PATCH] drm/i915: add reason for capturing the error state We capture error state not only when the GPU hangs but also in other situations, such as interrupt errors and cases where we can kick things forward without a GPU reset. There will be a log entry in most of these cases. But as the error state capture might be the only thing we have, or, as in the GEN4 case, an interrupt can trigger error state capture without a log entry, the exact reason why the capture was made is hard to decipher. To avoid confusion about why the error state was captured, record the reason and stick it into the error state. Reference: https://bugs.freedesktop.org/show_bug.cgi?id=74193 Signed-off-by: Mika Kuoppala --- drivers/gpu/drm/i915/i915_drv.h | 15 +++++++++++++-- drivers/gpu/drm/i915/i915_gpu_error.c | 26 +++++++++++++++++++++++++- drivers/gpu/drm/i915/i915_irq.c | 22 +++++++++++----------- 3 files changed, 49 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index e908c99..ca4837a 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -293,10 +293,20 @@ struct sdvo_device_mapping { struct intel_display_error_state; +enum error_capture_reason { + ERROR_HANGCHECK_RINGS_HUNG, + ERROR_HANGCHECK_KICK_WAIT, + ERROR_HANGCHECK_KICK_SEMAPHORE, + ERROR_IIR_INTERRUPT, + ERROR_PM_INTERRUPT, + ERROR_GT_INTERRUPT +}; + struct drm_i915_error_state { struct kref ref; struct timeval time; + enum error_capture_reason reason; /* Generic register state */ u32 eir; u32 pgtbl_er; @@ -1982,7 +1992,7 @@ extern void intel_console_resume(struct work_struct *work); /* i915_irq.c */ void i915_queue_hangcheck(struct drm_device *dev); -void i915_handle_error(struct drm_device *dev, bool wedged); +void i915_handle_error(struct drm_device *dev, enum error_capture_reason r); void gen6_set_pm_mask(struct 
drm_i915_private *dev_priv, u32 pm_iir, int new_delay); @@ -2449,7 +2459,8 @@ static inline void i915_error_state_buf_release( { kfree(eb->buf); } -void i915_capture_error_state(struct drm_device *dev); +void i915_capture_error_state(struct drm_device *dev, + enum error_capture_reason reason); void i915_error_state_get(struct drm_device *dev, struct i915_error_state_file_priv *error_priv); void i915_error_state_put(struct i915_error_state_file_priv *error_priv); diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 94542d4..7046e90 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -233,6 +233,26 @@ static const char *hangcheck_action_to_str(enum intel_ring_hangcheck_action a) return "unknown"; } +static const char *capture_reason_to_str(enum error_capture_reason r) +{ + switch(r) { + case ERROR_HANGCHECK_RINGS_HUNG: + return "ring(s) hung"; + case ERROR_HANGCHECK_KICK_WAIT: + return "kicked stuck wait"; + case ERROR_HANGCHECK_KICK_SEMAPHORE: + return "kicked stuck semaphore"; + case ERROR_IIR_INTERRUPT: + return "iir interrupt"; + case ERROR_PM_INTERRUPT: + return "pm interrupt"; + case ERROR_GT_INTERRUPT: + return "gt interrupt"; + } + + return "unknown"; +} + static void i915_ring_error_state(struct drm_i915_error_state_buf *m, struct drm_device *dev, struct drm_i915_error_ring *ring) @@ -318,6 +338,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, error->time.tv_usec); err_printf(m, "Kernel: " UTS_RELEASE "\n"); err_printf(m, "PCI ID: 0x%04x\n", dev->pdev->device); + err_printf(m, "Reason: %s\n", capture_reason_to_str(error->reason)); err_printf(m, "EIR: 0x%08x\n", error->eir); err_printf(m, "IER: 0x%08x\n", error->ier); err_printf(m, "PGTBL_ER: 0x%08x\n", error->pgtbl_er); @@ -1092,7 +1113,8 @@ static void i915_capture_reg_state(struct drm_i915_private *dev_priv, * out a structure which becomes available in debugfs for user level tools * to pick up. 
*/ -void i915_capture_error_state(struct drm_device *dev) +void i915_capture_error_state(struct drm_device *dev, + enum error_capture_reason reason) { static bool warned; struct drm_i915_private *dev_priv = dev->dev_private; @@ -1124,6 +1146,8 @@ void i915_capture_error_state(struct drm_device *dev) kref_init(&error->ref); + error->reason = reason; + i915_capture_reg_state(dev_priv, error); i915_gem_capture_buffers(dev_priv, error); i915_gem_record_fences(dev, error); diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index 56edff3..b14b507 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -1224,7 +1224,7 @@ static void snb_gt_irq_handler(struct drm_device *dev, GT_BSD_CS_ERROR_INTERRUPT | GT_RENDER_CS_MASTER_ERROR_INTERRUPT)) { DRM_ERROR("GT error interrupt 0x%08x\n", gt_iir); - i915_handle_error(dev, false); + i915_handle_error(dev, ERROR_GT_INTERRUPT); } if (gt_iir & GT_PARITY_ERROR(dev)) @@ -1472,7 +1472,7 @@ static void gen6_rps_irq_handler(struct drm_i915_private *dev_priv, u32 pm_iir) if (pm_iir & PM_VEBOX_CS_ERROR_INTERRUPT) { DRM_ERROR("VEBOX CS error interrupt 0x%08x\n", pm_iir); - i915_handle_error(dev_priv->dev, false); + i915_handle_error(dev_priv->dev, ERROR_PM_INTERRUPT); } } } @@ -2167,14 +2167,14 @@ static void i915_report_and_clear_eir(struct drm_device *dev) * so userspace knows something bad happened (should trigger collection * of a ring dump etc.). 
*/ -void i915_handle_error(struct drm_device *dev, bool wedged) +void i915_handle_error(struct drm_device *dev, enum error_capture_reason reason) { struct drm_i915_private *dev_priv = dev->dev_private; - i915_capture_error_state(dev); + i915_capture_error_state(dev, reason); i915_report_and_clear_eir(dev); - if (wedged) { + if (reason == ERROR_HANGCHECK_RINGS_HUNG) { atomic_set_mask(I915_RESET_IN_PROGRESS_FLAG, &dev_priv->gpu_error.reset_counter); @@ -2490,7 +2490,7 @@ ring_stuck(struct intel_ring_buffer *ring, u32 acthd) if (tmp & RING_WAIT) { DRM_ERROR("Kicking stuck wait on %s\n", ring->name); - i915_handle_error(dev, false); + i915_handle_error(dev, ERROR_HANGCHECK_KICK_WAIT); I915_WRITE_CTL(ring, tmp); return HANGCHECK_KICK; } @@ -2502,7 +2502,7 @@ ring_stuck(struct intel_ring_buffer *ring, u32 acthd) case 1: DRM_ERROR("Kicking stuck semaphore on %s\n", ring->name); - i915_handle_error(dev, false); + i915_handle_error(dev, ERROR_HANGCHECK_KICK_SEMAPHORE); I915_WRITE_CTL(ring, tmp); return HANGCHECK_KICK; case 0: @@ -2624,7 +2624,7 @@ static void i915_hangcheck_elapsed(unsigned long data) } if (rings_hung) - return i915_handle_error(dev, true); + return i915_handle_error(dev, ERROR_HANGCHECK_RINGS_HUNG); if (busy_count) /* Reset timer case chip hangs without another request @@ -3241,7 +3241,7 @@ static irqreturn_t i8xx_irq_handler(int irq, void *arg) */ spin_lock_irqsave(&dev_priv->irq_lock, irqflags); if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT) - i915_handle_error(dev, false); + i915_handle_error(dev, ERROR_IIR_INTERRUPT); for_each_pipe(pipe) { int reg = PIPESTAT(pipe); @@ -3423,7 +3423,7 @@ static irqreturn_t i915_irq_handler(int irq, void *arg) */ spin_lock_irqsave(&dev_priv->irq_lock, irqflags); if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT) - i915_handle_error(dev, false); + i915_handle_error(dev, ERROR_IIR_INTERRUPT); for_each_pipe(pipe) { int reg = PIPESTAT(pipe); @@ -3660,7 +3660,7 @@ static irqreturn_t i965_irq_handler(int irq, void 
*arg) */ spin_lock_irqsave(&dev_priv->irq_lock, irqflags); if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT) - i915_handle_error(dev, false); + i915_handle_error(dev, ERROR_IIR_INTERRUPT); for_each_pipe(pipe) { int reg = PIPESTAT(pipe); -- 1.7.9.5