powerpc: Rework pseries machine check handler
authorAnton Blanchard <anton@samba.org>
Tue, 11 Jan 2011 19:49:19 +0000 (19:49 +0000)
committerBenjamin Herrenschmidt <benh@kernel.crashing.org>
Fri, 21 Jan 2011 03:08:38 +0000 (14:08 +1100)
Rework pseries machine check handler:

- If MSR_RI isn't set, we cannot recover even if the machine check was fully
  recovered

- Rename nonfatal to recovered

- Handle RTAS_DISP_LIMITED_RECOVERY

- Use BUS_MCEERR_AR instead of BUS_ADRERR

- Don't check all the RTAS error log fields when receiving a synchronous
  machine check. Recent versions of the pseries firmware do not fill them
  in during a machine check and instead send a follow up error log with
  the detailed information. If we see a synchronous machine check, and we
  came from userspace then kill the task.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
arch/powerpc/platforms/pseries/ras.c

index 048e257..d194150 100644 (file)
@@ -259,31 +259,43 @@ int pSeries_system_reset_exception(struct pt_regs *regs)
  * Return 1 if corrected (or delivered a signal).
  * Return 0 if there is nothing we can do.
  */
-static int recover_mce(struct pt_regs *regs, struct rtas_error_log * err)
+static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err)
 {
-       int nonfatal = 0;
+       int recovered = 0;
 
-       if (err->disposition == RTAS_DISP_FULLY_RECOVERED) {
+       if (!(regs->msr & MSR_RI)) {
+               /* If MSR_RI isn't set, we cannot recover */
+               recovered = 0;
+
+       } else if (err->disposition == RTAS_DISP_FULLY_RECOVERED) {
                /* Platform corrected itself */
-               nonfatal = 1;
-       } else if ((regs->msr & MSR_RI) &&
-                  user_mode(regs) &&
-                  err->severity == RTAS_SEVERITY_ERROR_SYNC &&
-                  err->disposition == RTAS_DISP_NOT_RECOVERED &&
-                  err->target == RTAS_TARGET_MEMORY &&
-                  err->type == RTAS_TYPE_ECC_UNCORR &&
-                  !(current->pid == 0 || is_global_init(current))) {
-               /* Kill off a user process with an ECC error */
-               printk(KERN_ERR "MCE: uncorrectable ecc error for pid %d\n",
-                      current->pid);
-               /* XXX something better for ECC error? */
-               _exception(SIGBUS, regs, BUS_ADRERR, regs->nip);
-               nonfatal = 1;
+               recovered = 1;
+
+       } else if (err->disposition == RTAS_DISP_LIMITED_RECOVERY) {
+               /* Platform corrected itself but could be degraded */
+               printk(KERN_ERR "MCE: limited recovery, system may "
+                      "be degraded\n");
+               recovered = 1;
+
+       } else if (user_mode(regs) && !is_global_init(current) &&
+                  err->severity == RTAS_SEVERITY_ERROR_SYNC) {
+
+               /*
+                * If we received a synchronous error when in userspace
+                * kill the task. Firmware may report details of the fail
+                * asynchronously, so we can't rely on the target and type
+                * fields being valid here.
+                */
+               printk(KERN_ERR "MCE: uncorrectable error, killing task "
+                      "%s:%d\n", current->comm, current->pid);
+
+               _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
+               recovered = 1;
        }
 
        log_error((char *)err, ERR_TYPE_RTAS_LOG, 0);
 
-       return nonfatal;
+       return recovered;
 }
 
 /*