patch-2.1.75 linux/drivers/scsi/scsi_error.c

Next file: linux/drivers/scsi/scsi_ioctl.c
Previous file: linux/drivers/scsi/scsi_debug.h
Back to the patch index
Back to the overall index
Lines: 1896
Date: Sun Dec 21 17:04:49 1997
Orig file: v2.1.74/linux/drivers/scsi/scsi_error.c
Orig date: Wed Dec 31 16:00:00 1969

diff -u --recursive --new-file v2.1.74/linux/drivers/scsi/scsi_error.c linux/drivers/scsi/scsi_error.c
@@ -0,0 +1,1895 @@
+/*
+ *  scsi_error.c Copyright (C) 1997 Eric Youngdale
+ *
+ *  SCSI error/timeout handling
+ *      Initial versions: Eric Youngdale.  Based upon conversations with
+ *			  Leonard Zubkoff and David Miller at Linux Expo, 
+ *			  ideas originating from all over the place.
+ *
+ */
+
+#include <linux/config.h>
+#define __NO_VERSION__
+#include <linux/module.h>
+
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/malloc.h>
+#include <linux/ioport.h>
+#include <linux/kernel.h>
+#include <linux/stat.h>
+#include <linux/blk.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <asm/smp_lock.h>
+
+#define __KERNEL_SYSCALLS__
+
+#include <linux/unistd.h>
+
+#include <asm/system.h>
+#include <asm/irq.h>
+#include <asm/dma.h>
+
+#include "scsi.h"
+#include "hosts.h"
+#include "constants.h"
+
+#define SHUTDOWN_SIGS	(sigmask(SIGKILL)|sigmask(SIGINT)|sigmask(SIGTERM))
+
+#ifdef CONFIG_KERNELD
+#include <linux/kerneld.h>
+#endif
+
+#ifdef DEBUG
+    #define SENSE_TIMEOUT SCSI_TIMEOUT
+    #define ABORT_TIMEOUT SCSI_TIMEOUT
+    #define RESET_TIMEOUT SCSI_TIMEOUT
+#else
+    #define SENSE_TIMEOUT (10*HZ)
+    #define RESET_TIMEOUT (2*HZ)
+    #define ABORT_TIMEOUT (15*HZ)
+#endif
+
+#define STATIC
+
+/*
+ * Settle delays after a bus/host reset, in jiffies.  These should *probably*
+ * be handled by the host itself; since it is allowed to sleep, it should.
+ */
+#define BUS_RESET_SETTLE_TIME   (5*HZ)
+#define HOST_RESET_SETTLE_TIME  (10*HZ)
+
+
+static const char RCSid[] = "$Header: /mnt/ide/home/eric/CVSROOT/linux/drivers/scsi/scsi_error.c,v 1.9 1997/12/07 23:38:23 eric Exp $";
+
+STATIC int         scsi_check_sense (Scsi_Cmnd * SCpnt);
+STATIC int         scsi_request_sense(Scsi_Cmnd *);
+STATIC void        scsi_send_eh_cmnd (Scsi_Cmnd * SCpnt, int timeout);
+STATIC int         scsi_try_to_abort_command(Scsi_Cmnd *, int);
+STATIC int         scsi_test_unit_ready(Scsi_Cmnd *);
+STATIC int         scsi_try_bus_device_reset(Scsi_Cmnd *, int timeout);
+STATIC int         scsi_try_bus_reset(Scsi_Cmnd *);
+STATIC int         scsi_try_host_reset(Scsi_Cmnd *);
+STATIC int         scsi_unit_is_ready(Scsi_Cmnd *);
+STATIC void        scsi_eh_action_done(Scsi_Cmnd *, int);
+STATIC int         scsi_eh_retry_command(Scsi_Cmnd *);
+STATIC int	   scsi_eh_completed_normally(Scsi_Cmnd * SCpnt);
+STATIC void        scsi_restart_operations(struct Scsi_Host *);
+STATIC void        scsi_eh_finish_command(Scsi_Cmnd ** SClist, Scsi_Cmnd * SCpnt);
+
+
+/*
+ * Function:    scsi_add_timer()
+ *
+ * Purpose:     Arm the per-command timeout timer for a single scsi command.
+ *
+ * Arguments:   SCset    - command that is about to start running.
+ *              timeout  - number of jiffies to allow this command to run.
+ *              complete - function to call with SCset if the timer expires
+ *                      before it is canceled.
+ *
+ * Returns:     Nothing
+ *
+ * Notes:	This should be turned into an inline function.
+ *
+ * More Notes:  Each scsi command has its own timer (SCset->eh_timeout).  It
+ *              is set up as the command is queued and canceled when the
+ *              command completes.  Pretty simple, really, especially
+ *              compared to the old way of handling this crap.
+ */
+void
+scsi_add_timer(Scsi_Cmnd * SCset, 
+			int timeout, 
+			void (*complete)(Scsi_Cmnd *))
+{
+    struct timer_list * tp = &SCset->eh_timeout;
+
+    /*
+     * A non-NULL handler means the clock may already be running for this
+     * command.  Pull the timer off the list first - the timer handling
+     * code gets rather confused if we don't do this.
+     */
+    if( tp->function != NULL )
+    {
+        del_timer(tp);
+    }
+
+    tp->data = (unsigned long) SCset;
+    tp->expires = jiffies + timeout;
+    tp->function = (void (*)(unsigned long))complete;
+
+    SCSI_LOG_ERROR_RECOVERY(5,printk("Adding timer for command %p at %d (%p)\n", SCset, timeout, complete));
+
+    add_timer(tp);
+}
+
+/*
+ * Function:    scsi_delete_timer()
+ *
+ * Purpose:     Delete/cancel the timeout timer for a given command.
+ *
+ * Arguments:   SCset   - command that we are canceling timer for.
+ *
+ * Returns:     Jiffies left before the command would have timed out (< 0 if late).
+ *
+ * Notes:	This should be turned into an inline function.
+ */
+int
+scsi_delete_timer(Scsi_Cmnd * SCset)
+{
+  int rtn;
+  /* Time left is expiry minus now, matching the documented return value. */
+  rtn = SCset->eh_timeout.expires - jiffies;
+  del_timer(&SCset->eh_timeout);
+
+  SCSI_LOG_ERROR_RECOVERY(5,printk("Clearing timer for command %p\n", SCset));
+  /* A NULL handler tells scsi_add_timer() that no timer is pending. */
+  SCset->eh_timeout.data = (unsigned long) NULL;
+  SCset->eh_timeout.expires = 0;
+  SCset->eh_timeout.function = NULL;
+
+  return rtn;
+}
+
+/*
+ * Function:    scsi_times_out()
+ *
+ * Purpose:     Timeout function for normal scsi commands.
+ *
+ * Arguments:   SCpnt   - command that is timing out.
+ *
+ * Returns:     Nothing.
+ *
+ * Notes:       Hands the command over to the error handler and flags the
+ *              host as being in recovery.
+ */
+void scsi_times_out (Scsi_Cmnd * SCpnt)
+{
+    /* 
+     * Notify the low-level code that this operation failed and we are
+     * repossessing the command.  
+     */
+#ifdef ERIC_neverdef
+    /*
+     * FIXME(eric)
+     * Allow the host adapter to push a queue ordering tag
+     * out to the bus to force the command in question to complete.
+     * If the host wants to do this, then we just restart the timer
+     * for the command.  Before we really do this, some real thought
+     * as to the optimum way to handle this should be done.  We *do*
+     * need to force ordering every so often to ensure that all requests
+     * do eventually complete, but I am not sure if this is the best way
+     * to actually go about it.
+     *
+     * Better yet, force a sync here, but don't block since we are in an
+     * interrupt.
+     */
+    if( SCpnt->host->hostt->eh_ordered_queue_tag )
+    {
+        if( (*SCpnt->host->hostt->eh_ordered_queue_tag)(SCpnt))
+        {
+            scsi_add_timer(SCpnt, SCpnt->internal_timeout,
+                           scsi_times_out);
+            return;
+        }
+    }
+    /*
+     * FIXME(eric) - add a second special interface to handle this
+     * case.  Ideally that interface can also be used to request
+     * a queu[e ...] - NOTE(review): the original comment is cut off here.
+     */
+     if (SCpnt->host->can_queue)
+     {
+         SCpnt->host->hostt->queuecommand (SCpnt, NULL);
+     }
+#endif
+
+    SCpnt->state = SCSI_STATE_TIMEOUT;
+    SCpnt->owner = SCSI_OWNER_ERROR_HANDLER;
+    /* Put the host into recovery and count this command as failed. */
+    SCpnt->host->in_recovery = 1;
+    SCpnt->host->host_failed++;
+    
+    SCSI_LOG_TIMEOUT(3,printk("Command timed out active=%d busy=%d failed=%d\n", 
+                              atomic_read(&SCpnt->host->host_active),
+                              SCpnt->host->host_busy, 
+                              SCpnt->host->host_failed));
+    
+    /*
+     * If the host is having troubles, then look to see if this was the last
+     * active command that had not yet failed.  If so (active == failed),
+     * wake up the error handler through the host's eh_wait semaphore.
+     */
+    if( atomic_read(&SCpnt->host->host_active) == SCpnt->host->host_failed )
+    {
+        up(SCpnt->host->eh_wait);
+    }
+}
+
+/*
+ * Function:    scsi_block_when_processing_errors()
+ *
+ * Purpose:     Hold off callers that want to queue more commands while
+ *              error recovery is taking place on the host.
+ *
+ * Arguments:   SDpnt - device on which we are performing recovery.
+ *
+ * Returns:     FALSE   The device is offline (SDpnt->online is clear).
+ *              TRUE    OK to proceed.
+ *
+ * Notes:       We block until the host is out of error recovery, and only
+ *              then look at whether the device is still online.
+ */
+int  
+scsi_block_when_processing_errors(Scsi_Device * SDpnt)
+{
+  struct Scsi_Host * host = SDpnt->host;
+
+  SCSI_SLEEP( &host->host_wait, host->in_recovery);
+  SCSI_LOG_ERROR_RECOVERY(5,printk("Open returning %d\n", SDpnt->online));
+
+  return SDpnt->online;
+}
+
+/*
+ * Function:    scsi_eh_times_out()
+ *
+ * Purpose:     Timer callback for commands issued by the error handler.
+ *
+ * Arguments:   SCpnt   - error handling command whose timer expired.
+ *
+ * Returns:     Nothing.
+ *
+ * Notes:	The error handler thread sleeps on host->eh_action while it
+ *		waits for its command.  We just flag the command as timed
+ *		out, hand it back to the error handler and wake the thread.
+ *		A missing semaphore means nobody is waiting, so we panic.
+ */
+STATIC
+void scsi_eh_times_out (Scsi_Cmnd * SCpnt)
+{
+  SCpnt->eh_state = SCSI_STATE_TIMEOUT;
+  SCpnt->owner = SCSI_OWNER_ERROR_HANDLER;
+  SCpnt->request.rq_status = RQ_SCSI_DONE;
+
+  SCSI_LOG_ERROR_RECOVERY(5,printk("In scsi_eh_times_out %p\n", SCpnt));
+
+  if (SCpnt->host->eh_action == NULL)
+    panic("Missing scsi error handler thread");
+  else
+    up(SCpnt->host->eh_action);
+}
+
+
+/*
+ * Function:    scsi_eh_done()
+ *
+ * Purpose:     Completion function for commands sent by the error handler.
+ *
+ * Arguments:   SCpnt   - command that has completed.
+ *
+ * Returns:     Nothing.
+ *
+ * Notes:	During error handling, the kernel thread will be sleeping
+ *		waiting for some action to complete on the device.  Our only
+ *		job is to record that the action completed, and to wake up the
+ *		thread (if one is in fact waiting on host->eh_action).
+ */
+STATIC
+void scsi_eh_done (Scsi_Cmnd * SCpnt)
+{
+  SCpnt->eh_state = SUCCESS;
+  SCpnt->owner = SCSI_OWNER_ERROR_HANDLER;
+  SCpnt->request.rq_status = RQ_SCSI_DONE;
+
+  SCSI_LOG_ERROR_RECOVERY(5,printk("In eh_done %p result:%x\n", SCpnt, 
+                                   SCpnt->result));
+
+  if (SCpnt->host->eh_action == NULL)
+    return;
+  up(SCpnt->host->eh_action);
+}
+
+/*
+ * Function:    scsi_eh_action_done()
+ *
+ * Purpose:     Completion function for abort/reset requests.
+ *
+ * Arguments:   SCpnt   - command the operation was performed for.
+ *		answer  - non-zero if the operation succeeded, zero if not.
+ * Returns:     Nothing.
+ *
+ * Notes:	This callback is only used for abort and reset operations.
+ */
+STATIC
+void scsi_eh_action_done (Scsi_Cmnd * SCpnt, int answer)
+{
+  if (answer)
+    SCpnt->eh_state = SUCCESS;
+  else
+    SCpnt->eh_state = FAILED;
+  SCpnt->owner = SCSI_OWNER_ERROR_HANDLER;
+  SCpnt->request.rq_status = RQ_SCSI_DONE;
+  if (SCpnt->host->eh_action != NULL)
+    up(SCpnt->host->eh_action);
+}
+
+/*
+ * Function:	scsi_sense_valid()
+ *
+ * Purpose:	Determine whether the command's sense buffer holds sense
+ *		data, judged by bits 4-6 of its first byte all being set.
+ *
+ * Returns:	TRUE if the sense data looks valid, FALSE otherwise.
+ */
+int
+scsi_sense_valid(Scsi_Cmnd * SCpnt)
+{
+  /* Compare the masked bits in place rather than shifting them down. */
+  if ((SCpnt->sense_buffer[0] & 0x70) == 0x70)
+    return TRUE;
+  return FALSE;
+}
+
+/*
+ * Function:	scsi_eh_retry_command()
+ *
+ * Purpose:	Retry the original command
+ *
+ * Returns:	SUCCESS - the retried command completed normally.
+ *		FAILED  - the retried command failed or timed out.
+ * 
+ * Notes:	This function will *NOT* return until the command either
+ *		times out, or it completes.
+ */
+STATIC int
+scsi_eh_retry_command(Scsi_Cmnd * SCpnt)
+{
+  /* Put back the original command and transfer setup saved in SCpnt. */
+  memcpy ((void *) SCpnt->cmnd,  (void*) SCpnt->data_cmnd,
+          sizeof(SCpnt->data_cmnd));
+  SCpnt->request_buffer = SCpnt->buffer;
+  SCpnt->request_bufflen = SCpnt->bufflen;
+  SCpnt->use_sg = SCpnt->old_use_sg;
+  /* Keep the saved length instead of re-deriving it from the opcode. */
+  SCpnt->cmd_len = SCpnt->old_cmd_len;
+
+  scsi_send_eh_cmnd (SCpnt, SCpnt->timeout_per_command);
+
+  /*
+   * Hey, we are done.  Let's look to see what happened.
+   */
+  return SCpnt->eh_state;
+}
+
+/*
+ * Function:	scsi_request_sense()
+ *
+ * Purpose:	Issue REQUEST SENSE to the command's target and LUN.
+ *
+ * Returns:	SUCCESS - we were able to get the sense data.
+ *		FAILED  - we were not able to get the sense data.
+ * 
+ * Notes:	Some hosts automatically obtain this information, others
+ *		require that we obtain it on our own.
+ *
+ *		This function will *NOT* return until the command either
+ *		times out, or it completes.
+ */
+STATIC int
+scsi_request_sense(Scsi_Cmnd * SCpnt)
+{
+  /*
+   * Build the 6 byte REQUEST SENSE command block in place: opcode, LUN
+   * in the top three bits of byte 1, allocation length in byte 4.
+   */
+  SCpnt->cmnd[0] = REQUEST_SENSE;
+  SCpnt->cmnd[1] = SCpnt->lun << 5;
+  SCpnt->cmnd[2] = 0;
+  SCpnt->cmnd[3] = 0;
+  SCpnt->cmnd[4] = sizeof(SCpnt->sense_buffer);
+  SCpnt->cmnd[5] = 0;
+  SCpnt->cmd_len = COMMAND_SIZE(SCpnt->cmnd[0]);
+  /* The reply lands directly in the command's own sense buffer. */
+  SCpnt->use_sg = 0;
+  SCpnt->request_bufflen = sizeof(SCpnt->sense_buffer);
+  SCpnt->request_buffer = &SCpnt->sense_buffer;
+
+  scsi_send_eh_cmnd (SCpnt, SENSE_TIMEOUT);
+
+  return SCpnt->eh_state;
+}
+
+/*
+ * Function:	scsi_test_unit_ready()
+ *
+ * Purpose:	Run test unit ready command to see if the device is talking to us or not.
+ *
+ * Returns:	SUCCESS or FAILED, as left in eh_state by scsi_send_eh_cmnd().
+ */
+STATIC int
+scsi_test_unit_ready(Scsi_Cmnd * SCpnt)
+{
+  static unsigned char tur_command[6] = {TEST_UNIT_READY, 0,0,0,0,0};
+
+  memcpy ((void *) SCpnt->cmnd , (void *) tur_command,
+	  sizeof(tur_command));
+
+  /* TEST UNIT READY has no allocation length, so byte 4 stays zero;
+   * only the LUN goes into the command block. */
+  SCpnt->cmnd[1] = SCpnt->lun << 5;
+
+  SCpnt->request_buffer = &SCpnt->sense_buffer;
+  SCpnt->request_bufflen = sizeof(SCpnt->sense_buffer);
+  SCpnt->use_sg = 0;
+  SCpnt->cmd_len = COMMAND_SIZE(SCpnt->cmnd[0]);
+
+  scsi_send_eh_cmnd (SCpnt, SENSE_TIMEOUT);
+
+  /* Hey, we are done.  Let's look to see what happened. */
+  return SCpnt->eh_state;
+}
+
+/* Timer callback for scsi_sleep(): release the sleeper's semaphore, if any. */
+STATIC
+void scsi_sleep_done (struct semaphore * sem)
+{
+    if( sem == NULL )
+        return;
+    up(sem);
+}
+
+
+/* Sleep for 'timeout' jiffies: arm a one-shot timer, block until it fires. */
+void scsi_sleep (int timeout)
+{
+    struct semaphore sem = MUTEX_LOCKED;
+    struct timer_list timer;
+
+    /* The timer lives on the stack; clear its list links before add_timer(). */
+    init_timer(&timer);
+    timer.data = (unsigned long) &sem;
+    timer.expires = jiffies + timeout;
+    timer.function = (void (*)(unsigned long))scsi_sleep_done;
+
+    SCSI_LOG_ERROR_RECOVERY(5,printk("Sleeping for timer tics %d\n", timeout));
+    add_timer(&timer);
+    down(&sem);
+    del_timer(&timer);
+}
+
+/*
+ * Function:	scsi_send_eh_cmnd
+ *
+ * Purpose:	Send a command out to a device as part of error recovery.
+ *
+ * Arguments:	SCpnt   - command block, already set up by the caller.
+ *		timeout - jiffies to wait when the host queues commands.
+ *
+ * Notes:	Blocks until the command completes or times out.  The outcome
+ *		is left in SCpnt->eh_state as either SUCCESS or FAILED; a
+ *		NEEDS_RETRY verdict resends the command from the top.
+ */
+STATIC void scsi_send_eh_cmnd (Scsi_Cmnd * SCpnt, int timeout)
+{
+    struct Scsi_Host * host;
+    host = SCpnt->host;
+
+retry:
+    /*
+     * We will use a queued command if possible, otherwise we will emulate the
+     * queuing and calling of completion function ourselves.
+     */
+    SCpnt->owner = SCSI_OWNER_LOWLEVEL;
+
+    if (host->can_queue)
+    {
+        struct semaphore sem = MUTEX_LOCKED;
+
+        SCpnt->eh_state = SCSI_STATE_QUEUED;
+        scsi_add_timer(SCpnt, timeout, scsi_eh_times_out);
+
+	/*
+	 * Set up the semaphore so we wait for the command to complete.
+	 */
+	SCpnt->host->eh_action = &sem;
+	SCpnt->request.rq_status = RQ_SCSI_BUSY;
+
+	host->hostt->queuecommand (SCpnt, scsi_eh_done);
+	down(&sem);
+        SCpnt->host->eh_action = NULL;
+	del_timer(&SCpnt->eh_timeout);
+
+	/*
+	 * Timed out?  Record it as a failure.  NOTE(review): nothing here
+	 * actually tells the host to forget the command - confirm intent.
+	 */
+	if( SCpnt->eh_state == SCSI_STATE_TIMEOUT )
+	  {
+	    SCpnt->eh_state = FAILED;
+	  }
+
+        SCSI_LOG_ERROR_RECOVERY(5,printk("send_eh_cmnd: %p eh_state:%x\n", 
+                                         SCpnt, SCpnt->eh_state));
+    }
+    else
+      {
+	int temp;
+
+	/*
+	 * We damn well had better never use this code.  There is no timeout
+	 * protection here, since we would end up waiting in the actual low
+	 * level driver, we don't know how to wake it up.
+	 */
+	temp = host->hostt->command (SCpnt);
+	SCpnt->result = temp;
+	if( scsi_eh_completed_normally(SCpnt) )	/* NOTE(review): used as a truth value - confirm */
+	  {
+	    SCpnt->eh_state = SUCCESS;
+	  }
+	else
+	  {
+	    SCpnt->eh_state = FAILED;
+	  }
+      }
+
+    /*
+     * Now examine the actual status codes to see whether the command actually
+     * did complete normally.  Anything not already SUCCESS ends up FAILED.
+     */
+    if( SCpnt->eh_state == SUCCESS )
+      {
+	switch( scsi_eh_completed_normally(SCpnt) )
+	  {
+	  case SUCCESS:
+	    SCpnt->eh_state = SUCCESS;
+	    break;
+	  case NEEDS_RETRY:
+	    goto retry;
+	  case FAILED:
+	  default:
+	    SCpnt->eh_state = FAILED;
+	    break;
+	  }
+      }
+    else
+      {
+	SCpnt->eh_state = FAILED;
+      }
+}
+
+/*
+ * Function:	scsi_unit_is_ready()
+ *
+ * Purpose:	Called after TEST_UNIT_READY is run.  Returns 0 only when valid
+ *		sense came back with an unexpected sense key, 1 otherwise.
+ */
+STATIC int
+scsi_unit_is_ready(Scsi_Cmnd * SCpnt)
+{
+  int key;
+
+  if (!SCpnt->result)
+    return 1;
+  /* No sense flagged, or sense data not valid: treat the unit as ready. */
+  if (!(driver_byte (SCpnt->result) & DRIVER_SENSE) &&
+      !(status_byte (SCpnt->result) & CHECK_CONDITION))
+    return 1;
+  if ((SCpnt->sense_buffer[0] & 0x70) != 0x70)
+    return 1;
+
+  key = SCpnt->sense_buffer[2] & 0xf;
+  if (key == NOT_READY || key == UNIT_ATTENTION || key == ILLEGAL_REQUEST)
+    return 1;
+
+  return 0;
+}
+
+/*
+ * Function:    scsi_eh_finish_command
+ *
+ * Purpose:     Handle a command that we are finished with WRT error handling.
+ *
+ * Arguments:   SClist - pointer to list into which we are putting completed commands.
+ *              SCpnt  - command that is completing
+ *
+ * Notes:       We don't want to use the normal command completion while we
+ *              are still handling errors - it may cause other commands to be queued,
+ *              and that would disturb what we are doing.  So the command is only
+ *              parked on a list of pending commands here; once we are ready to
+ *              leave error handling we handle completion for real.
+ */
+STATIC void
+scsi_eh_finish_command(Scsi_Cmnd **SClist, Scsi_Cmnd * SCpnt)
+{
+    /*
+     * Restore use_sg so that the upper level can correctly free up things.
+     */
+    SCpnt->use_sg = SCpnt->old_use_sg;
+    SCpnt->state = SCSI_STATE_BHQUEUE;
+    /* Push the command onto the head of the completed list. */
+    SCpnt->bh_next = *SClist;
+    *SClist = SCpnt;
+}
+
+/*
+ * Function:	scsi_try_to_abort_command
+ *
+ * Purpose:	Ask host adapter to abort a running command.
+ *
+ * Arguments:	SCpnt	- command to abort.
+ *		timeout	- currently unused.
+ *
+ * Returns:	FAILED if the host has no abort handler, otherwise whatever
+ *		the handler itself returns.
+ *
+ * Notes:	There is no timeout on this operation.  If the author of
+ *		the low-level driver wishes this operation to be timed, they
+ *		can provide this facility themselves.
+ *
+ * Notes:	It may be possible to combine this with all of the reset
+ *		handling to eliminate a lot of code duplication.  I don't
+ *		know what makes more sense at the moment - this is just a
+ *		prototype.
+ */
+STATIC int
+scsi_try_to_abort_command(Scsi_Cmnd * SCpnt, int timeout)
+{
+  struct Scsi_Host * host = SCpnt->host;
+
+  SCpnt->eh_state = FAILED; /* Until we come up with something better */
+
+  if( host->hostt->eh_abort_handler != NULL )
+    {
+      SCpnt->owner = SCSI_OWNER_LOWLEVEL;
+      return host->hostt->eh_abort_handler(SCpnt);
+    }
+  return FAILED;
+}
+
+/*
+ * Function:	scsi_try_bus_device_reset
+ *
+ * Purpose:	Ask host adapter to perform a bus device reset for a given
+ *		device.
+ *
+ * Returns:	FAILED if the host has no device reset handler, otherwise
+ *		whatever the handler itself returns.
+ *
+ * Notes:	The timeout argument is unused, so this operation is not
+ *              timed.  If it is unreliable for a given host, then the host
+ *              itself needs to put a timer on it, and set the host back to
+ *              a consistent state prior to returning.
+ */
+STATIC int
+scsi_try_bus_device_reset(Scsi_Cmnd * SCpnt, int timeout)
+{
+  struct Scsi_Host * host = SCpnt->host;
+
+  SCpnt->eh_state = FAILED; /* Until we come up with something better */
+
+  if( host->hostt->eh_device_reset_handler != NULL )
+    {
+      SCpnt->owner = SCSI_OWNER_LOWLEVEL;
+      return host->hostt->eh_device_reset_handler(SCpnt);
+    }
+  return FAILED;
+}
+
+/*
+ * Function:	scsi_try_bus_reset
+ *
+ * Purpose:	Ask host adapter to perform a bus reset for a host.
+ *
+ * Returns:	FAILED		Operation failed or not supported.
+ *		SUCCESS		Succeeded.
+ *
+ * Notes:	Always sleeps BUS_RESET_SETTLE_TIME once the handler was called.
+ */
+STATIC int
+scsi_try_bus_reset(Scsi_Cmnd * SCpnt)
+{
+  int		   rtn;
+
+  SCpnt->eh_state = FAILED; /* Until we come up with something better */
+  SCpnt->owner = SCSI_OWNER_LOWLEVEL;
+
+  if( SCpnt->host->hostt->eh_bus_reset_handler == NULL )
+      return FAILED;
+
+  /* Honor the handler's verdict instead of silently dropping it. */
+  rtn = SCpnt->host->hostt->eh_bus_reset_handler(SCpnt);
+  if( rtn == SUCCESS )
+    SCpnt->eh_state = SUCCESS;
+  /*
+   * Give the bus time to settle.  If the reset succeeded, mark every
+   * device on the same channel to expect a unit attention.
+   */
+  scsi_sleep(BUS_RESET_SETTLE_TIME);
+  if( SCpnt->eh_state == SUCCESS )
+    {
+      Scsi_Device * SDloop;
+      for (SDloop = SCpnt->host->host_queue; SDloop; SDloop = SDloop->next)
+	{
+            if( SCpnt->channel == SDloop->channel )
+            {
+                SDloop->was_reset = 1;
+                SDloop->expecting_cc_ua = 1;
+            }
+	}
+    }
+
+  return SCpnt->eh_state;
+}
+
+/*
+ * Function:	scsi_try_host_reset
+ *
+ * Purpose:	Ask host adapter to reset itself, and the bus.
+ *
+ * Returns:	FAILED		Operation failed or not supported.
+ *		SUCCESS		Succeeded.
+ *
+ * Notes:	Always sleeps HOST_RESET_SETTLE_TIME once the handler was called.
+ */
+STATIC int
+scsi_try_host_reset(Scsi_Cmnd * SCpnt)
+{
+    int		   rtn;
+
+    SCpnt->eh_state = FAILED; /* Until we come up with something better */
+    SCpnt->owner = SCSI_OWNER_LOWLEVEL;
+
+    if( SCpnt->host->hostt->eh_host_reset_handler == NULL )
+        return FAILED;
+
+    /* Honor the handler's verdict instead of silently dropping it. */
+    rtn = SCpnt->host->hostt->eh_host_reset_handler(SCpnt);
+    if( rtn == SUCCESS )
+        SCpnt->eh_state = SUCCESS;
+    /*
+     * Give things time to settle.  If the reset succeeded, mark every
+     * device on this host to expect a unit attention.
+     */
+    scsi_sleep(HOST_RESET_SETTLE_TIME);
+    if( SCpnt->eh_state == SUCCESS )
+    {
+        Scsi_Device * SDloop;
+        for (SDloop = SCpnt->host->host_queue; SDloop; SDloop = SDloop->next)
+	{
+            SDloop->was_reset = 1;
+            SDloop->expecting_cc_ua = 1;
+	}
+    }
+
+    return SCpnt->eh_state;
+}
+
+/*
+ * Function:	scsi_decide_disposition
+ *
+ * Purpose:	Examine a command block that has come back from the low-level
+ *		and figure out what to do next.
+ *
+ * Returns:	SUCCESS		- pass on to upper level.
+ *		FAILED		- pass on to error handler thread.
+ *		NEEDS_RETRY	- command should be retried.
+ *		SOFT_ERROR	- command succeeded, but we need to log
+ *				  a soft error.
+ *		ADD_TO_MLQUEUE	- device reported QUEUE_FULL.
+ *
+ * Notes:	This is *ONLY* called when we are examining the status
+ *		after sending out the actual data command.  Any commands
+ *		that are queued for error recovery (i.e. TEST_UNIT_READY)
+ *		do *NOT* come through here.
+ *
+ *              NOTE - When this routine returns FAILED, it means the error
+ *              handler thread is woken.  In cases where the error code
+ *              indicates an error that doesn't require the error handler
+ *              thread (i.e. we don't need to abort/reset), then this function
+ *              should return SUCCESS.
+ */
+int scsi_decide_disposition (Scsi_Cmnd * SCpnt)
+{
+  int	rtn;
+
+  /*
+   * If the device is offline, then we clearly just pass the result back
+   * up to the top level.
+   */
+  if( SCpnt->device->online == FALSE )
+  {
+      SCSI_LOG_ERROR_RECOVERY(5,printk("scsi_error.c: device offline - report as SUCCESS\n"));
+      return SUCCESS;
+  }
+
+  /*
+   * First check the host byte, to see if there is anything in there
+   * that would indicate what we need to do.
+   */
+  switch(host_byte(SCpnt->result))
+    {
+    case DID_PASSTHROUGH:
+        /*
+         * No matter what, pass this through to the upper layer.
+         * Nuke this special code so that it looks like we are saying
+         * DID_OK.
+         */
+        SCpnt->result &= 0xff00ffff;
+        return SUCCESS;
+    case DID_OK:
+      /*
+       * Looks good.  Drop through, and check the next byte.
+       */
+      break;
+    case DID_NO_CONNECT:
+    case DID_BAD_TARGET:
+    case DID_ABORT:
+      /*
+       * Note - this means that we just report the status back to the
+       * top level driver, not that we actually think that it indicates
+       * success.
+       */
+      return SUCCESS;
+    case DID_PARITY:
+    case DID_BUS_BUSY:
+    case DID_ERROR:
+      goto maybe_retry;
+    case DID_TIME_OUT:
+      /*
+         * When we scan the bus, we get timeout messages for
+         * these commands if there is no device available.
+         * Other hosts report DID_NO_CONNECT for the same thing.
+         */
+        if( (SCpnt->cmnd[0] == TEST_UNIT_READY ||
+             SCpnt->cmnd[0] == INQUIRY) )
+        {
+            return SUCCESS;
+        }
+        else
+        {
+            return FAILED;
+        }
+    case DID_RESET:
+      /*
+       * If we initiated the reset ourselves (IS_RESETTING), clear the
+       * flag and consider a retry; otherwise look at the sense data.
+       */
+      if( SCpnt->flags & IS_RESETTING )
+	{
+	  SCpnt->flags &= ~IS_RESETTING;
+	  goto maybe_retry;
+	}
+
+      /*
+       * Examine the sense data to figure out how to proceed from here.
+       * If there is no sense data, we will be forced into the error
+       * handler thread, where we get to examine the thing in a lot more
+       * detail.
+       */
+      return scsi_check_sense (SCpnt);
+    default:
+      return FAILED;
+    }
+
+  /*
+   * Next, check the message byte.
+   */
+  if( msg_byte(SCpnt->result) != COMMAND_COMPLETE )
+    {
+      return FAILED;
+    }
+
+  /*
+   * Now, check the status byte to see if this indicates anything special.
+   */
+  switch (status_byte(SCpnt->result))
+    {
+    case QUEUE_FULL:
+      /*
+       * The case of trying to send too many commands to a tagged queueing
+       * device.
+       */
+      return ADD_TO_MLQUEUE;
+    case GOOD:
+    case COMMAND_TERMINATED:
+      return SUCCESS;
+    case CHECK_CONDITION:
+      rtn = scsi_check_sense(SCpnt);
+      if( rtn == NEEDS_RETRY )
+	{
+	  goto maybe_retry;
+	}
+      return rtn;
+    case CONDITION_GOOD:
+    case INTERMEDIATE_GOOD:
+    case INTERMEDIATE_C_GOOD:
+      /*
+       * Who knows?  FIXME(eric)
+       */
+      return SUCCESS;
+    case BUSY:
+    case RESERVATION_CONFLICT:
+      goto maybe_retry;
+    default:
+      return FAILED;
+    }
+  return FAILED;	/* not reached: every case above returns or jumps */
+
+maybe_retry:
+  /* Count this attempt; fail once the allowed number has been used up. */
+  if ((++SCpnt->retries) < SCpnt->allowed)
+    {
+      return NEEDS_RETRY;
+    }
+  else
+    {
+      return FAILED;
+    }
+}
+
+/*
+ * Function:	scsi_eh_completed_normally
+ *
+ * Purpose:	Examine a command block that has come back from the low-level
+ *		and figure out what to do next.
+ *
+ * Returns:	SUCCESS		- command completed normally.
+ *		FAILED		- command did not complete normally.
+ *		NEEDS_RETRY	- reset seen, or sense data asks for a retry
+ *				  (DID_RESET path only).
+ *		SOFT_ERROR	- passed through from scsi_check_sense().
+ *
+ * Notes:	This is *ONLY* called when we are examining the status
+ *		of commands queued during error recovery.  The main
+ *		difference here is that a CHECK_CONDITION never leads to a
+ *		retry, and we are a lot more restrictive about what
+ *              we consider acceptable.
+ */
+STATIC int scsi_eh_completed_normally (Scsi_Cmnd * SCpnt)
+{
+  int	rtn;
+  /*
+   * First check the host byte, to see if there is anything in there
+   * that would indicate what we need to do.
+   */
+  if( host_byte(SCpnt->result) == DID_RESET )
+    {
+     if (SCpnt->flags & IS_RESETTING )
+       {
+	 /*
+	  * OK, this is normal.  We don't know whether in fact the
+	  * command in question really needs to be rerun or not, so
+	  * we clear the flag and ask for a retry in every case.
+	  * NOTE(review): the original comment promised "success" for
+	  * non-data commands, which the code below does not do.
+	  */
+	 SCpnt->flags &= ~IS_RESETTING;
+	 return NEEDS_RETRY;
+       }
+
+     /*
+      * Rats.  We are already in the error handler, so we now get to try
+      * and figure out what to do next.  If the sense is valid, we have
+      * a pretty good idea of what to do.  If not, we mark it as failed.
+      */
+     return scsi_check_sense (SCpnt);
+    }
+
+  if(host_byte(SCpnt->result) != DID_OK )
+  {
+      return FAILED;
+  }
+
+  /*
+   * Next, check the message byte.
+   */
+  if( msg_byte(SCpnt->result) != COMMAND_COMPLETE )
+    {
+      return FAILED;
+    }
+
+  /*
+   * Now, check the status byte to see if this indicates anything special.
+   */
+  switch (status_byte(SCpnt->result))
+    {
+    case GOOD:
+    case COMMAND_TERMINATED:
+      return SUCCESS;
+    case CHECK_CONDITION:
+      rtn = scsi_check_sense(SCpnt);
+      if( rtn == NEEDS_RETRY )
+	{
+	  return FAILED;
+	}
+      return rtn;
+    case CONDITION_GOOD:
+    case INTERMEDIATE_GOOD:
+    case INTERMEDIATE_C_GOOD:
+      /*
+       * Who knows?  FIXME(eric)
+       */
+      return SUCCESS;
+    case BUSY:
+    case QUEUE_FULL:
+    case RESERVATION_CONFLICT:
+    default:
+      return FAILED;
+    }
+  return FAILED;
+}
+
+/*
+ * Function:	scsi_check_sense
+ *
+ * Purpose:	Look at the sense data attached to a command and suggest
+ *		SUCCESS, SOFT_ERROR, NEEDS_RETRY or FAILED as the next step.
+ */
+STATIC  int scsi_check_sense (Scsi_Cmnd * SCpnt)
+{
+    int sense_key;
+
+    /*
+     * Without valid sense data there is nothing to base a decision on.
+     * Likewise bail out if any of the top three bits of byte 2 are set.
+     */
+    if ( !scsi_sense_valid(SCpnt) || (SCpnt->sense_buffer[2] & 0xe0) )
+      {
+	return FAILED;
+      }
+
+    sense_key = SCpnt->sense_buffer[2] & 0xf;
+
+    if (sense_key == NO_SENSE)
+      {
+	return SUCCESS;
+      }
+
+    if (sense_key == RECOVERED_ERROR)
+      {
+	return SOFT_ERROR;
+      }
+
+    if (sense_key == ABORTED_COMMAND)
+      {
+	return NEEDS_RETRY;
+      }
+
+    if (sense_key == NOT_READY || sense_key == UNIT_ATTENTION)
+      {
+        /*
+         * If we are expecting a CC/UA because of a bus reset that we
+         * performed, consume that expectation and ask for a retry.
+         * Otherwise pass the condition up to the upper-level driver.
+         */
+        if( !SCpnt->device->expecting_cc_ua )
+          {
+	    return SUCCESS;
+          }
+        SCpnt->device->expecting_cc_ua = 0;
+        return NEEDS_RETRY;
+      }
+
+    /*
+     * Every other sense key, supported or not, is treated as a failure.
+     */
+    return FAILED;
+}
+
+
+/*
+ * Function:	scsi_restart_operations
+ *
+ * Purpose:	Restart IO operations to the specified host.
+ *
+ * Arguments:	host  - host that we are restarting
+ *
+ * Returns:	Nothing
+ *
+ * Notes:	When we entered the error handler, we blocked all further
+ *		I/O to this host.  We need to 'reverse' this process.
+ */
+STATIC void
+scsi_restart_operations(struct Scsi_Host * host)
+{
+  Scsi_Device * dev;
+
+  /*
+   * First release anything waiting directly on the host: requests for
+   * character device operations, and ioctls to queued block devices.
+   */
+  SCSI_LOG_ERROR_RECOVERY(5,printk("scsi_error.c: Waking up host to restart\n"));
+  wake_up(&host->host_wait);
+
+  /*
+   * Block devices need an extra kick in the pants.  The request queueing
+   * mechanism may have queued lots of pending requests, with no process
+   * waiting anywhere that we could simply wake up.  So walk the host's
+   * devices and call each request function to get things moving again.
+   */
+  dev = host->host_queue;
+  while( dev != NULL )
+    {
+      SCSI_LOG_ERROR_RECOVERY(5,printk("Calling request function to restart things...\n"));
+      if( dev->scsi_request_fn != NULL )
+	{
+	  (*dev->scsi_request_fn)();
+	}
+      dev = dev->next;
+    }
+}
+
+/*
+ * Function:	scsi_unjam_host
+ *
+ * Purpose:	Attempt to fix a host which has a command that failed for
+ *		some reason.
+ *
+ * Arguments:	host	- host that needs unjamming.
+ * 
+ * Returns:	TRUE if host_failed reached zero early; FALSE after the final offline pass.
+ *
+ * Notes:	When we come in here, we *know* that all commands on the
+ *		bus have either completed, failed or timed out.  We also
+ *		know that no further commands are being sent to the host,
+ *		so things are relatively quiet and we have freedom to
+ *		fiddle with things as we wish.
+ *
+ * Additional note:  This is only the *default* implementation.  It is possible
+ *		for individual drivers to supply their own version of this
+ *		function, and if the maintainer wishes to do this, it is
+ *		strongly suggested that this function be taken as a template
+ *		and modified.  This function was designed to correctly handle
+ *		problems for about 95% of the different cases out there, and
+ *		it should always provide at least a reasonable amount of error
+ *		recovery.
+ *
+ * Note3:       Any command marked 'FAILED' or 'TIMEOUT' must eventually
+ *              have scsi_finish_command() called for it.  We do all of
+ *              the retry stuff here, so when we restart the host after we
+ *              return it should have an empty queue.
+ */
+STATIC int
+scsi_unjam_host(struct Scsi_Host * host)
+{
+  int           devices_failed;
+  int           numfailed;
+  int           ourrtn;
+  int		rtn = FALSE;
+  int		result;
+  Scsi_Cmnd   * SCloop;
+  Scsi_Cmnd   * SCpnt;
+  Scsi_Device * SDpnt;
+  Scsi_Device * SDloop;
+  Scsi_Cmnd   * SCdone;
+  int           timed_out;
+
+  SCdone = NULL;
+
+  /*
+   * First, protect against any sort of race condition.  If any of the outstanding
+   * commands are in states that indicate that we are not yet blocked (i.e. we are
+   * not in a quiet state) then we got woken up in error.  If we ever end up here,
+   * we need to re-examine some of the assumptions.
+   */
+  for(SDpnt=host->host_queue; SDpnt; SDpnt = SDpnt->next)
+  {
+      for(SCpnt=SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next)
+      {
+          if( SCpnt->state == SCSI_STATE_FAILED 
+              || SCpnt->state == SCSI_STATE_TIMEOUT 
+              || SCpnt->state == SCSI_STATE_UNUSED)
+          {
+              continue;
+          }
+
+          /*
+           * Rats.  Something is still floating around out there.  This could
+           * be the result of the fact that the upper level drivers are still frobbing
+           * commands that might have succeeded.  There are two outcomes.  One is that
+           * the command block will eventually be freed, and the other one is that
+           * the command will be queued and will be finished along the way.
+           */
+          SCSI_LOG_ERROR_RECOVERY(1,printk("Error handler prematurely woken - commands still active (%p %x %d)\n", SCpnt, SCpnt->state, SCpnt->target));
+          panic("SCSI Error handler woken too early\n");
+      }
+  }
+
+  /*
+   * Next, see if we need to request sense information.  if so,
+   * then get it now, so we have a better idea of what to do.
+   * FIXME(eric) this has the unfortunate side effect that if a host
+   * adapter does not automatically request sense information, that we end
+   * up shutting it down before we request it.  All hosts should be doing this
+   * anyways, so for now all I have to say is tough noogies if you end up in here.
+   * On second thought, this is probably a good idea.  We *really* want to give
+   * authors an incentive to automatically request this.
+   */
+  SCSI_LOG_ERROR_RECOVERY(3,printk("scsi_unjam_host: Checking to see if we need to request sense\n"));
+
+  for(SDpnt=host->host_queue; SDpnt; SDpnt = SDpnt->next)
+  {
+      for(SCpnt=SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next)
+      {
+          if( SCpnt->state != SCSI_STATE_FAILED || scsi_sense_valid(SCpnt) )
+          {
+              continue;
+          }
+
+          SCSI_LOG_ERROR_RECOVERY(2,printk("scsi_unjam_host: Requesting sense for %d\n",
+                                           SCpnt->target));
+          rtn = scsi_request_sense(SCpnt);
+          if( rtn != SUCCESS )
+          {
+              continue;
+          }
+
+          SCSI_LOG_ERROR_RECOVERY(3,printk("Sense requested for %p - result %x\n",
+                                           SCpnt, SCpnt->result));
+          SCSI_LOG_ERROR_RECOVERY(3,print_sense("bh",SCpnt));
+                  
+          result = scsi_decide_disposition(SCpnt);
+
+          /*
+           * If the result was normal, then just pass it along to the
+           * upper level.
+           */
+          if( result == SUCCESS )
+          {
+              SCpnt->host->host_failed--;
+              scsi_eh_finish_command(&SCdone, SCpnt);
+          }
+
+          if( result != NEEDS_RETRY )
+          {
+              continue;
+          }
+
+          /* 
+           * We only come in here if we want to retry a command; the retry
+           * test is expected to count tries so we don't loop forever.
+           * NOTE(review): NEEDS_RETRY looks like a disposition code, not one
+           * of the SCSI_STATE_* values the later loops test for - confirm
+           * this is what we want when the retry fails.
+           */
+          SCpnt->state = NEEDS_RETRY;
+          rtn = scsi_eh_retry_command(SCpnt);
+          if( rtn != SUCCESS )
+          {
+              continue;
+          }
+
+          /*
+           * We eventually hand this one back to the top level.
+           */
+          SCpnt->host->host_failed--;
+          scsi_eh_finish_command(&SCdone, SCpnt);
+      }
+  }
+
+  /*
+   * Go through the list of commands and figure out where we stand and how bad things
+   * really are.
+   */
+  numfailed = 0;
+  timed_out = 0;
+  devices_failed = 0;
+  for(SDpnt=host->host_queue; SDpnt; SDpnt = SDpnt->next)
+  {
+      unsigned int device_error = 0;
+
+      for(SCpnt=SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next)
+      {
+          if( SCpnt->state == SCSI_STATE_FAILED )
+          {
+              SCSI_LOG_ERROR_RECOVERY(5,printk("Command to ID %d failed\n", 
+                                               SCpnt->target));
+              numfailed++;
+              device_error++;
+          }
+          if( SCpnt->state == SCSI_STATE_TIMEOUT )
+          {
+              SCSI_LOG_ERROR_RECOVERY(5,printk("Command to ID %d timedout\n", 
+                                               SCpnt->target));
+              timed_out++;
+              device_error++;
+          }
+      }
+      if( device_error > 0 )
+      {
+          devices_failed++;
+      }
+  }
+
+  SCSI_LOG_ERROR_RECOVERY(2,printk("Total of %d+%d commands on %d devices require eh work\n", 
+                                   numfailed, timed_out, devices_failed));
+
+  if( host->host_failed == 0 )
+  {
+      ourrtn = TRUE;
+      goto leave;
+  }
+
+  /*
+   * Next, try and see whether or not it makes sense to try and abort
+   * the running command.  This only works out to be the case if we have
+   * one command that has timed out.  If the command simply failed, it
+   * makes no sense to try and abort the command, since as far as the
+   * host adapter is concerned, it isn't running.
+   */
+
+  SCSI_LOG_ERROR_RECOVERY(3,printk("scsi_unjam_host: Checking to see if we want to try abort\n"));
+
+  for(SDpnt=host->host_queue; SDpnt; SDpnt = SDpnt->next)
+  {
+      for(SCloop=SDpnt->device_queue; SCloop; SCloop = SCloop->next)
+      {
+          if( SCloop->state != SCSI_STATE_TIMEOUT )
+          {
+              continue;
+          }
+
+	  rtn = scsi_try_to_abort_command(SCloop, ABORT_TIMEOUT);
+
+	  if( rtn == SUCCESS )
+          {
+	      rtn = scsi_test_unit_ready(SCloop);
+              
+	      if( rtn == SUCCESS && scsi_unit_is_ready(SCloop) )
+              {
+		  rtn = scsi_eh_retry_command(SCloop);
+                  
+		  if( rtn == SUCCESS )
+                  {
+                      SCloop->host->host_failed--;
+		      scsi_eh_finish_command(&SCdone,SCloop);
+                  }
+              }
+          }
+      }
+  }
+  
+  /*
+   * If we have corrected all of the problems, then we are done.
+   */
+  if( host->host_failed == 0 )
+  {
+      ourrtn = TRUE;
+      goto leave;
+  }
+
+  /*
+   * Either the abort wasn't appropriate, or it didn't succeed.
+   * Now try a bus device reset.  Still, look to see whether we have
+   * multiple devices that are jammed or not - if we have multiple devices,
+   * it makes no sense to try BUS_DEVICE_RESET - we really would need
+   * to try a BUS_RESET instead.
+   *
+   * Does this make sense - should we try BDR on each device individually?
+   * Yes, definitely.
+   */
+  SCSI_LOG_ERROR_RECOVERY(3,printk("scsi_unjam_host: Checking to see if we want to try BDR\n"));
+
+  for(SDpnt=host->host_queue; SDpnt; SDpnt = SDpnt->next)
+  {
+      for(SCloop=SDpnt->device_queue; SCloop; SCloop = SCloop->next)
+      {
+          if( SCloop->state == SCSI_STATE_FAILED 
+              || SCloop->state == SCSI_STATE_TIMEOUT )
+          {
+              break;
+          }
+      }
+
+      if( SCloop == NULL )
+      {
+          continue;
+      }
+
+      /*
+       * OK, we have a device that is having problems.  Try and send
+       * a bus device reset to it.
+       *
+       * FIXME(eric) - make sure we handle the case where multiple
+       * commands to the same device have failed. They all must
+       * get properly restarted.
+       */
+      rtn = scsi_try_bus_device_reset(SCloop, RESET_TIMEOUT);
+      
+      if( rtn == SUCCESS )
+      {
+	  rtn = scsi_test_unit_ready(SCloop);
+	  
+	  if( rtn == SUCCESS && scsi_unit_is_ready(SCloop) )
+          {
+	      rtn = scsi_eh_retry_command(SCloop);
+	      
+	      if( rtn == SUCCESS )
+              {
+                  SCloop->host->host_failed--;
+		  scsi_eh_finish_command(&SCdone,SCloop);
+              }
+          }
+      }
+      
+  }
+  
+  if( host->host_failed == 0 )
+  {
+      ourrtn = TRUE;
+      goto leave;
+  }
+
+  /*
+   * If we ended up here, we have serious problems.  The only thing left
+   * to try is a full bus reset.  If someone has grabbed the bus and isn't
+   * letting go, then perhaps this will help.
+   */
+  SCSI_LOG_ERROR_RECOVERY(3,printk("scsi_unjam_host: Try hard bus reset\n"));
+
+  /* 
+   * We really want to loop over the various channels, and do this on
+   * a channel by channel basis.  We should also check to see if any
+   * of the failed commands are on soft_reset devices, and if so, skip
+   * the reset.  
+   */
+  for(SDpnt=host->host_queue; SDpnt; SDpnt = SDpnt->next)
+  {
+      /* A 'goto next_device' below abandons the bus reset for this device. */
+      for(SCpnt=SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next)
+      {
+          if( SCpnt->state != SCSI_STATE_FAILED 
+              && SCpnt->state != SCSI_STATE_TIMEOUT )
+          {
+              continue;
+          }
+          /*
+           * We have a failed command.  Make sure there are no other failed
+           * commands on the same channel that are timed out and implement a
+           * soft reset.
+           */
+          for(SDloop=host->host_queue; SDloop; SDloop = SDloop->next)
+          {
+              for(SCloop=SDloop->device_queue; SCloop; SCloop = SCloop->next)
+              {
+                  if( SCloop->channel != SCpnt->channel )
+                  {
+                      continue;
+                  }
+                  
+                  if( SCloop->state != SCSI_STATE_FAILED 
+                      && SCloop->state != SCSI_STATE_TIMEOUT )
+                  {
+                      continue;
+                  }
+                  
+                  if( SDloop->soft_reset && SCloop->state == SCSI_STATE_TIMEOUT )
+                  {
+                      /* 
+                       * If this device uses the soft reset option, and this
+                       * is one of the devices acting up, then our only
+                       * option is to wait a bit, since the command is
+                       * supposedly still running.  
+                       *
+                       * FIXME(eric) - right now we will just end up falling
+                       * through to the 'take device offline' case.
+                       *
+                       * FIXME(eric) - It is possible that the command completed
+                       * *after* the error recovery procedure started, and if this
+                       * is the case, we are worrying about nothing here.
+                       */
+                      goto next_device;
+                  }
+              }
+          }
+
+          /*
+           * We now know that we are able to perform a reset for the
+           * bus that SCpnt points to.  There are no soft-reset devices
+           * with outstanding timed out commands.
+           */
+          rtn = scsi_try_bus_reset(SCpnt);
+          if( rtn == SUCCESS )
+          {
+              for(SDloop=host->host_queue; SDloop; SDloop = SDloop->next)
+              {
+                  for(SCloop=SDloop->device_queue; SCloop; SCloop = SCloop->next)
+                  {
+                      if( SCloop->channel != SCpnt->channel )
+                      {
+                          continue;
+                      }
+                      
+                      if( SCloop->state != SCSI_STATE_FAILED 
+                          && SCloop->state != SCSI_STATE_TIMEOUT )
+                      {
+                          continue;
+                      }
+                      
+                      rtn = scsi_test_unit_ready(SCloop);
+                      
+                      if( rtn == SUCCESS && scsi_unit_is_ready(SCloop) )
+                      {
+                          rtn = scsi_eh_retry_command(SCloop);
+                          
+                          if( rtn == SUCCESS )
+                          {
+                              SCloop->host->host_failed--;
+                              scsi_eh_finish_command(&SCdone,SCloop);
+                          }
+                      }
+                      
+                      /*
+                       * If the bus reset worked, but we are still unable to
+                       * talk to the device, take it offline.
+                       * FIXME(eric) - is this really the correct thing to do?
+                       */
+                      if( rtn != SUCCESS )
+                      {
+                          SCloop->device->online = FALSE;
+                          SCloop->host->host_failed--;
+                          scsi_eh_finish_command(&SCdone,SCloop);
+                      }
+                  }
+              }
+          }
+      }
+next_device: ;  /* bottom of the per-device loop - carry on with the next device */
+  }
+
+  if( host->host_failed == 0 )
+  {
+      ourrtn = TRUE;
+      goto leave;
+  }
+  /*
+   * If we ended up here, we have serious problems.  The only thing left
+   * to try is a full host reset - perhaps the firmware on the device
+   * crashed, or something like that.
+   *
+   * It is assumed that a successful host reset will cause *all* information
+   * about the command to be flushed from both the host adapter *and* the
+   * device.
+   *
+   * FIXME(eric) - it isn't clear that devices that implement the soft reset
+   * option can ever be cleared except via cycling the power.  The problem is
+   * that sending the host reset command will cause the host to forget
+   * about the pending command, but the device won't forget.  For now, we
+   * skip the host reset option if any of the failed devices are configured
+   * to use the soft reset option.
+   */
+  for(SDpnt=host->host_queue; SDpnt; SDpnt = SDpnt->next)
+  {
+      /* A 'goto next_device2' below abandons the host reset for this device. */
+      for(SCpnt=SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next)
+      {
+          if( SCpnt->state != SCSI_STATE_FAILED 
+              && SCpnt->state != SCSI_STATE_TIMEOUT )
+          {
+              continue;
+          }
+          if( SDpnt->soft_reset && SCpnt->state == SCSI_STATE_TIMEOUT )
+          {
+              /* 
+               * If this device uses the soft reset option, and this
+               * is one of the devices acting up, then our only
+               * option is to wait a bit, since the command is
+               * supposedly still running.  
+               *
+               * FIXME(eric) - right now we will just end up falling
+               * through to the 'take device offline' case.
+               */
+              SCSI_LOG_ERROR_RECOVERY(3,
+                        printk("scsi_unjam_host: Unable to try hard host reset\n"));
+              goto next_device2;
+          }
+
+          SCSI_LOG_ERROR_RECOVERY(3,printk("scsi_unjam_host: Try hard host reset\n"));
+
+          /*
+           * FIXME(eric) - we need to obtain a valid SCpnt to perform this call.
+           */
+          rtn = scsi_try_host_reset(SCpnt);
+          if( rtn == SUCCESS )
+          {
+              /*
+               * FIXME(eric) we assume that all commands are flushed from the
+               * controller.  We should get a DID_RESET for all of the commands
+               * that were pending.  We should ignore these so that we can
+               * guarantee that we are in a consistent state.
+               *
+               * I believe this to be the case right now, but this needs to be
+               * tested.
+               */
+            for(SDloop=host->host_queue; SDloop; SDloop = SDloop->next)
+              {
+                  for(SCloop=SDloop->device_queue; SCloop; SCloop = SCloop->next)
+                  {
+                      if( SCloop->state != SCSI_STATE_FAILED 
+                          && SCloop->state != SCSI_STATE_TIMEOUT )
+                      {
+                          continue;
+                      }
+                      
+                      rtn = scsi_test_unit_ready(SCloop);
+                      
+                      if( rtn == SUCCESS && scsi_unit_is_ready(SCloop) )
+                      {
+                          rtn = scsi_eh_retry_command(SCloop);
+                          
+                          if( rtn == SUCCESS )
+                          {
+                              SCloop->host->host_failed--;
+                              scsi_eh_finish_command(&SCdone,SCloop);
+                          }
+                      }
+                      if( rtn != SUCCESS )
+                      {
+                          SCloop->device->online = FALSE;
+                          SCloop->host->host_failed--;
+                          scsi_eh_finish_command(&SCdone,SCloop);
+                      }
+                  }
+              }
+          }
+      }
+next_device2: ; /* bottom of the per-device loop - carry on with the next device */
+  }
+
+  /*
+   * If we solved all of the problems, then let's rev up the engines again.
+   */
+  if( host->host_failed == 0 )
+  {
+      ourrtn = TRUE;
+      goto leave;
+  }
+
+  /*
+   * If the HOST RESET failed, then for now we assume that the entire host
+   * adapter is too hosed to be of any use.  For our purposes, however, it is
+   * easier to simply take the devices offline that correspond to commands
+   * that failed.
+   */
+  SCSI_LOG_ERROR_RECOVERY(1,printk("scsi_unjam_host: Take device offline\n"));
+
+  for(SDpnt=host->host_queue; SDpnt; SDpnt = SDpnt->next)
+  {
+      for(SCloop=SDpnt->device_queue; SCloop; SCloop = SCloop->next)
+      {
+          if( SCloop->state == SCSI_STATE_FAILED || SCloop->state == SCSI_STATE_TIMEOUT )
+          {
+              SCloop->device->online = FALSE;
+              
+              /*
+               * This should pass the failure up to the top level driver, and
+               * it will have to try and do something intelligent with it.
+               */
+              SCloop->host->host_failed--;
+              
+              if( SCloop->state == SCSI_STATE_TIMEOUT )
+              {
+                  SCloop->result |= (DRIVER_TIMEOUT << 24);
+              }
+
+              SCSI_LOG_ERROR_RECOVERY(3,printk("Finishing command for device %d %x\n",
+                     SCloop->device->id, SCloop->result));
+              
+              scsi_eh_finish_command(&SCdone,SCloop);
+          }
+      }
+  }
+
+  if( host->host_failed != 0 )
+  {
+      panic("scsi_unjam_host: Miscount of number of failed commands.\n");
+  }
+
+  SCSI_LOG_ERROR_RECOVERY(3,printk("scsi_unjam_host: Returning\n"));
+
+  ourrtn = FALSE;
+
+leave:
+
+  /*
+   * We should have a list of commands that we 'finished' during the course of
+   * error recovery.  This should be the same as the list of commands that timed out
+   * or failed.  We are currently holding these things in a linked list - we didn't
+   * put them in the bottom half queue because we wanted to keep things quiet while
+   * we were working on recovery, and passing them up to the top level could easily
+   * cause the top level to try and queue something else again.
+   *
+   * Start by marking that the host is no longer in error recovery.
+   */
+  host->in_recovery = 0;
+
+  /*
+   * Take the list of commands, and stick them in the bottom half queue.
+   * The current implementation of scsi_done will do this for us - if need
+   * be we can create a special version of this function to do the
+   * same job for us.
+   */
+  for(SCpnt = SCdone; SCpnt != NULL; SCpnt = SCdone)
+  {
+      SCdone = SCpnt->bh_next;
+      SCpnt->bh_next = NULL;
+      scsi_done(SCpnt);
+  }
+
+  return (ourrtn);
+}
+
+
+/*
+ * Function:	scsi_error_handler
+ *
+ * Purpose:	Per-host thread body: sleep until a command fails or times
+ *		out, run error recovery, then restart I/O on the host.
+ *
+ * Arguments:	data	- the struct Scsi_Host this thread serves.
+ *
+ * Returns:	Nothing.  Loops until a signal is pending, then returns.
+ *
+ * Notes:	This is always run in the context of a kernel thread.  It
+ *		publishes a locked semaphore in host->eh_wait and sleeps on
+ *		it; each wakeup without a pending signal runs either the
+ *		driver's eh_strategy_handler or scsi_unjam_host, followed by
+ *		scsi_restart_operations.  host->eh_notify is upped once at
+ *		startup and once more (if non-NULL) on the way out.
+ *
+ */
+void
+scsi_error_handler(void * data)
+{
+	struct Scsi_Host * host = data;
+	struct semaphore   wakeup = MUTEX_LOCKED;
+	int                status;
+
+	lock_kernel();
+
+	/*
+	 * We may have been started as a result of loading a module.  Close
+	 * all of the user space pages: we don't need them, and if we kept
+	 * them they would be locked into memory.
+	 */
+	exit_mm(current);
+
+	/*
+	 * FIXME(eric) this is still a child process of the one that did the
+	 * insmod.  This needs to be attached to task[0] instead.
+	 */
+	current->session = 1;
+	current->pgrp = 1;
+
+	siginitsetinv(&current->blocked, SHUTDOWN_SIGS);
+	current->fs->umask = 0;
+
+	/*
+	 * Name this process after the host it looks after.
+	 */
+	sprintf(current->comm, "scsi_eh_%d", host->host_no);
+
+	/*
+	 * Publish the semaphore we are going to sleep on, and ourselves as
+	 * the handler thread for this host.
+	 */
+	host->eh_wait = &wakeup;
+	host->ehandler = current;
+
+	unlock_kernel();
+
+	/*
+	 * Setup is done - wake up the thread that created us.
+	 */
+	SCSI_LOG_ERROR_RECOVERY(3,printk("Wake up parent %d\n", host->eh_notify->count.counter));
+
+	up(host->eh_notify);
+
+	for(;;)
+	  {
+	    /*
+	     * Sleep until somebody ups our semaphore.  If a signal is
+	     * pending once we are awake, we are supposed to go away and
+	     * die; this typically happens if the user is trying to unload
+	     * a module.
+	     */
+            SCSI_LOG_ERROR_RECOVERY(1,printk("Error handler sleeping\n"));
+	    down_interruptible (&wakeup);
+
+	    if( signal_pending(current) )
+	      break;
+
+            SCSI_LOG_ERROR_RECOVERY(1,printk("Error handler waking up\n"));
+
+            host->eh_active = 1;
+
+	    /*
+	     * We have a host that is failing for some reason.  Try to get
+	     * it up and online again, through the driver's own strategy
+	     * routine when it supplies one and the default otherwise.
+	     * If we fail, we end up taking the thing offline.
+	     */
+	    if( host->hostt->eh_strategy_handler == NULL )
+	      status = scsi_unjam_host(host);
+	    else
+	      status = host->hostt->eh_strategy_handler(host);
+
+            host->eh_active = 0;
+
+	    /*
+	     * Note - if the above fails completely, the action is to take
+	     * individual devices offline and flush the queue of any
+	     * outstanding requests that may have been pending.  When we
+	     * restart, we restart any I/O to any other devices on the bus
+	     * which are still online.
+	     */
+	    scsi_restart_operations(host);
+	  }
+
+        SCSI_LOG_ERROR_RECOVERY(1,printk("Error handler exiting\n"));
+
+	/*
+	 * Make sure that nobody tries to wake us up again: the semaphore
+	 * lives on our stack and is about to disappear.
+	 */
+	host->eh_wait = NULL;
+
+	/*
+	 * Knock these down too.  From this point on, the host is flying
+	 * without a pilot.  If this is because the module is being unloaded,
+	 * that's fine.  If the user sent a signal to this thing, we are
+	 * potentially in real danger.
+	 */
+	host->in_recovery = 0;
+	host->eh_active = 0;
+	host->ehandler = NULL;
+
+	/*
+	 * If anyone is waiting for us to exit (i.e. someone trying to unload
+	 * a driver), wake that process up to let them know we are on the way
+	 * out the door.  This may be overkill - unloading the driver and
+	 * sending the signal might be enough on its own.
+	 */
+	if( host->eh_notify != NULL )
+	  up(host->eh_notify);
+}
+
+/*
+ * Overrides for Emacs so that we follow Linus's tabbing style.
+ * Emacs will notice this stuff at the end of the file and automatically
+ * adjust the settings for this buffer only.  This must remain at the end
+ * of the file.
+ * ---------------------------------------------------------------------------
+ * Local variables:
+ * c-indent-level: 4
+ * c-brace-imaginary-offset: 0
+ * c-brace-offset: -4
+ * c-argdecl-indent: 4
+ * c-label-offset: -4
+ * c-continued-statement-offset: 4
+ * c-continued-brace-offset: 0
+ * indent-tabs-mode: nil
+ * tab-width: 8
+ * End:
+ */
FUNET's LINUX-ADM group, linux-adm@nic.funet.fi
TCL-scripts by Sam Shen, slshen@lbl.gov