Merge branch 'for_paulus' of master.kernel.org:/pub/scm/linux/kernel/git/galak/powerpc

[cascardo/linux.git] / arch / powerpc / platforms / pseries / eeh_event.c
diff --git a/arch/powerpc/platforms/pseries/eeh_event.c b/arch/powerpc/platforms/pseries/eeh_event.c

index 9249733..a0b3964 100644 (file)
--- a/arch/powerpc/platforms/pseries/eeh_event.c
+++ b/arch/powerpc/platforms/pseries/eeh_event.c
@@ -18,9 +18,13 @@
   * Copyright (c) 2005 Linas Vepstas <linas@linas.org>
   */
  
+#include <linux/delay.h>
  #include <linux/list.h>
+#include <linux/mutex.h>
  #include <linux/pci.h>
+#include <linux/workqueue.h>
  #include <asm/eeh_event.h>
+#include <asm/ppc-pci.h>
  
  /** Overview:
   *  EEH error states may be detected within exception handlers;
@@ -36,66 +40,60 @@ LIST_HEAD(eeh_eventlist);
  static void eeh_thread_launcher(void *);
  DECLARE_WORK(eeh_event_wq, eeh_thread_launcher, NULL);
  
-/**
- * eeh_panic - call panic() for an eeh event that cannot be handled.
- * The philosophy of this routine is that it is better to panic and
- * halt the OS than it is to risk possible data corruption by
- * oblivious device drivers that don't know better.
- *
- * @dev pci device that had an eeh event
- * @reset_state current reset state of the device slot
- */
-static void eeh_panic(struct pci_dev *dev, int reset_state)
-{
-       /*
-        * Since the panic_on_oops sysctl is used to halt the system
-        * in light of potential corruption, we can use it here.
-        */
-       if (panic_on_oops) {
-               panic("EEH: MMIO failure (%d) on device:%s\n", reset_state,
-                     pci_name(dev));
-       }
-       else {
-               printk(KERN_INFO "EEH: Ignored MMIO failure (%d) on device:%s\n",
-                      reset_state, pci_name(dev));
-       }
-}
+/* Serialize reset sequences for a given pci device */
+DEFINE_MUTEX(eeh_event_mutex);
  
  /**
- * eeh_event_handler - dispatch EEH events.  The detection of a frozen
- * slot can occur inside an interrupt, where it can be hard to do
- * anything about it.  The goal of this routine is to pull these
- * detection events out of the context of the interrupt handler, and
- * re-dispatch them for processing at a later time in a normal context.
- *
+ * eeh_event_handler - dispatch EEH events.
   * @dummy - unused
+ *
+ * The detection of a frozen slot can occur inside an interrupt,
+ * where it can be hard to do anything about it.  The goal of this
+ * routine is to pull these detection events out of the context
+ * of the interrupt handler, and re-dispatch them for processing
+ * at a later time in a normal context.
   */
  static int eeh_event_handler(void * dummy)
  {
         unsigned long flags;
         struct eeh_event        *event;
+       struct pci_dn *pdn;
  
         daemonize ("eehd");
+       set_current_state(TASK_INTERRUPTIBLE);
+
+       spin_lock_irqsave(&eeh_eventlist_lock, flags);
+       event = NULL;
+
+       /* Unqueue the event, get ready to process. */
+       if (!list_empty(&eeh_eventlist)) {
+               event = list_entry(eeh_eventlist.next, struct eeh_event, list);
+               list_del(&event->list);
+       }
+       spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
  
-       while (1) {
-               set_current_state(TASK_INTERRUPTIBLE);
+       if (event == NULL)
+               return 0;
  
-               spin_lock_irqsave(&eeh_eventlist_lock, flags);
-               event = NULL;
-               if (!list_empty(&eeh_eventlist)) {
-                       event = list_entry(eeh_eventlist.next, struct eeh_event, list);
-                       list_del(&event->list);
-               }
-               spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
-               if (event == NULL)
-                       break;
+       /* Serialize processing of EEH events */
+       mutex_lock(&eeh_event_mutex);
+       eeh_mark_slot(event->dn, EEH_MODE_RECOVERING);
  
-               printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n",
-                      pci_name(event->dev));
+       printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n",
+              pci_name(event->dev));
  
-               eeh_panic (event->dev, event->state);
+       pdn = handle_eeh_events(event);
  
-               kfree(event);
+       eeh_clear_slot(event->dn, EEH_MODE_RECOVERING);
+       pci_dev_put(event->dev);
+       kfree(event);
+       mutex_unlock(&eeh_event_mutex);
+
+       /* If there are no new errors after an hour, clear the counter. */
+       if (pdn && pdn->eeh_freeze_count>0) {
+               msleep_interruptible (3600*1000);
+               if (pdn->eeh_freeze_count>0)
+                       pdn->eeh_freeze_count--;
         }
  
         return 0;
@@ -103,7 +101,6 @@ static int eeh_event_handler(void * dummy)
  
  /**
   * eeh_thread_launcher
- *
   * @dummy - unused
   */
  static void eeh_thread_launcher(void *dummy)
@@ -122,7 +119,7 @@ static void eeh_thread_launcher(void *dummy)
   */
  int eeh_send_failure_event (struct device_node *dn,
                              struct pci_dev *dev,
-                            int state,
+                            enum pci_channel_state state,
                              int time_unavail)
  {
         unsigned long flags;