patch-2.1.75 linux/drivers/scsi/scsi_error.c
Next file: linux/drivers/scsi/scsi_ioctl.c
Previous file: linux/drivers/scsi/scsi_debug.h
Back to the patch index
Back to the overall index
- Lines: 1896
- Date:
Sun Dec 21 17:04:49 1997
- Orig file:
v2.1.74/linux/drivers/scsi/scsi_error.c
- Orig date:
Wed Dec 31 16:00:00 1969
diff -u --recursive --new-file v2.1.74/linux/drivers/scsi/scsi_error.c linux/drivers/scsi/scsi_error.c
@@ -0,0 +1,1895 @@
+/*
+ * scsi_error.c Copyright (C) 1997 Eric Youngdale
+ *
+ * SCSI error/timeout handling
+ * Initial versions: Eric Youngdale. Based upon conversations with
+ * Leonard Zubkoff and David Miller at Linux Expo,
+ * ideas originating from all over the place.
+ *
+ */
+
+#include <linux/config.h>
+#define __NO_VERSION__
+#include <linux/module.h>
+
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/malloc.h>
+#include <linux/ioport.h>
+#include <linux/kernel.h>
+#include <linux/stat.h>
+#include <linux/blk.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <asm/smp_lock.h>
+
+#define __KERNEL_SYSCALLS__
+
+#include <linux/unistd.h>
+
+#include <asm/system.h>
+#include <asm/irq.h>
+#include <asm/dma.h>
+
+#include "scsi.h"
+#include "hosts.h"
+#include "constants.h"
+
+#define SHUTDOWN_SIGS (sigmask(SIGKILL)|sigmask(SIGINT)|sigmask(SIGTERM))
+
+#ifdef CONFIG_KERNELD
+#include <linux/kerneld.h>
+#endif
+
+#ifdef DEBUG
+ #define SENSE_TIMEOUT SCSI_TIMEOUT
+ #define ABORT_TIMEOUT SCSI_TIMEOUT
+ #define RESET_TIMEOUT SCSI_TIMEOUT
+#else
+ #define SENSE_TIMEOUT (10*HZ)
+ #define RESET_TIMEOUT (2*HZ)
+ #define ABORT_TIMEOUT (15*HZ)
+#endif
+
+#define STATIC
+
+/*
+ * These should *probably* be handled by the host itself.
+ * Since it is allowed to sleep, it probably should.
+ */
+#define BUS_RESET_SETTLE_TIME 5*HZ
+#define HOST_RESET_SETTLE_TIME 10*HZ
+
+
+static const char RCSid[] = "$Header: /mnt/ide/home/eric/CVSROOT/linux/drivers/scsi/scsi_error.c,v 1.9 1997/12/07 23:38:23 eric Exp $";
+
+STATIC int scsi_check_sense (Scsi_Cmnd * SCpnt);
+STATIC int scsi_request_sense(Scsi_Cmnd *);
+STATIC void scsi_send_eh_cmnd (Scsi_Cmnd * SCpnt, int timeout);
+STATIC int scsi_try_to_abort_command(Scsi_Cmnd *, int);
+STATIC int scsi_test_unit_ready(Scsi_Cmnd *);
+STATIC int scsi_try_bus_device_reset(Scsi_Cmnd *, int timeout);
+STATIC int scsi_try_bus_reset(Scsi_Cmnd *);
+STATIC int scsi_try_host_reset(Scsi_Cmnd *);
+STATIC int scsi_unit_is_ready(Scsi_Cmnd *);
+STATIC void scsi_eh_action_done(Scsi_Cmnd *, int);
+STATIC int scsi_eh_retry_command(Scsi_Cmnd *);
+STATIC int scsi_eh_completed_normally(Scsi_Cmnd * SCpnt);
+STATIC void scsi_restart_operations(struct Scsi_Host *);
+STATIC void scsi_eh_finish_command(Scsi_Cmnd ** SClist, Scsi_Cmnd * SCpnt);
+
+
+/*
+ * Function: scsi_add_timer()
+ *
+ * Purpose: Arm the timeout timer of a single scsi command.
+ *
+ * Arguments: SCset - command whose eh_timeout timer is to be armed.
+ * timeout - delay, in jiffies from now, until the timer fires.
+ * complete - function the timer calls, with SCset as its
+ * argument, if the timer isn't canceled first.
+ *
+ * Returns: Nothing
+ *
+ * Notes: This should be turned into an inline function.
+ *
+ * More Notes: Each scsi command has its own timer. It is set up as the
+ * command is added to the queue and canceled when the
+ * command completes. Pretty simple, really, especially
+ * compared to the old way of handling this.
+ */
+void
+scsi_add_timer(Scsi_Cmnd * SCset,
+               int timeout,
+               void (*complete)(Scsi_Cmnd *))
+{
+    struct timer_list * tmr = &SCset->eh_timeout;
+
+    /*
+     * A non-NULL handler means the clock may already be running for this
+     * command. Delete the timer first - the timer handling code gets
+     * rather confused if we don't do this.
+     */
+    if( tmr->function != NULL )
+    {
+        del_timer(tmr);
+    }
+
+    /* The command pointer travels to the callback in the data field. */
+    tmr->data = (unsigned long) SCset;
+    tmr->expires = jiffies + timeout;
+    tmr->function = (void (*)(unsigned long))complete;
+
+    SCSI_LOG_ERROR_RECOVERY(5,printk("Adding timer for command %p at %d (%p)\n", SCset, timeout, complete));
+    add_timer(tmr);
+}
+
+/*
+ * Function: scsi_delete_timer()
+ *
+ * Purpose: Delete/cancel the timeout timer of a given command.
+ *
+ * Arguments: SCset - command that we are canceling timer for.
+ *
+ * Returns: Jiffies that remained before the command would have timed out
+ * (zero or negative if the expiry time had already passed).
+ * Notes: This should be turned into an inline function.
+ */
+int
+scsi_delete_timer(Scsi_Cmnd * SCset)
+{
+    int rtn;
+
+    /* Remaining time is expiry minus now; sample it before disarming. */
+    rtn = SCset->eh_timeout.expires - jiffies;
+    del_timer(&SCset->eh_timeout);
+
+    SCSI_LOG_ERROR_RECOVERY(5,printk("Clearing timer for command %p\n", SCset));
+
+    SCset->eh_timeout.data = (unsigned long) NULL;
+    SCset->eh_timeout.expires = 0;
+    SCset->eh_timeout.function = NULL;
+    return rtn;
+}
+
+/*
+ * Function: scsi_times_out()
+ *
+ * Purpose: Timeout function for normal scsi commands.
+ *
+ * Arguments: SCpnt - command that is timing out.
+ *
+ * Returns: Nothing.
+ * Notes: Hands the command over to the error handler, and wakes the
+ * handler once every active command on the host has failed.
+ */
+void scsi_times_out (Scsi_Cmnd * SCpnt)
+{
+    struct Scsi_Host * host = SCpnt->host;
+
+    /*
+     * Notify the low-level code that this operation failed and we are
+     * repossessing the command.
+     */
+#ifdef ERIC_neverdef
+    /*
+     * FIXME(eric)
+     * Allow the host adapter to push a queue ordering tag
+     * out to the bus to force the command in question to complete.
+     * If the host wants to do this, then we just restart the timer
+     * for the command. Before we really do this, some real thought
+     * as to the optimum way to handle this should be done. We *do*
+     * need to force ordering every so often to ensure that all requests
+     * do eventually complete, but I am not sure if this is the best way
+     * to actually go about it.
+     *
+     * Better yet, force a sync here, but don't block since we are in an
+     * interrupt.
+     */
+    if( SCpnt->host->hostt->eh_ordered_queue_tag )
+    {
+        if( (*SCpnt->host->hostt->eh_ordered_queue_tag)(SCpnt))
+        {
+            scsi_add_timer(SCpnt, SCpnt->internal_timeout,
+                           scsi_times_out);
+            return;
+        }
+    }
+    /*
+     * FIXME(eric) - add a second special interface to handle this
+     * case. Ideally that interface can also be used to request
+     * a queu
+     */
+    if (SCpnt->host->can_queue)
+    {
+        SCpnt->host->hostt->queuecommand (SCpnt, NULL);
+    }
+#endif
+
+    SCpnt->state = SCSI_STATE_TIMEOUT;
+    SCpnt->owner = SCSI_OWNER_ERROR_HANDLER;
+    host->in_recovery = 1;
+    host->host_failed++;
+
+    SCSI_LOG_TIMEOUT(3,printk("Command timed out active=%d busy=%d failed=%d\n",
+                              atomic_read(&host->host_active),
+                              host->host_busy,
+                              host->host_failed));
+
+    /*
+     * The error handler is only woken once the number of failed commands
+     * has caught up with the number of active ones, i.e. when this was
+     * the last command that might still have been outstanding.
+     */
+    if( atomic_read(&host->host_active) == host->host_failed )
+    {
+        up(host->eh_wait);
+    }
+}
+
+/*
+ * Function: scsi_block_when_processing_errors
+ *
+ * Purpose: Prevent more commands from being queued while error recovery
+ * is taking place.
+ *
+ * Arguments: SDpnt - device on which we are performing recovery.
+ *
+ * Returns: FALSE The device was taken offline by error recovery.
+ * TRUE OK to proceed.
+ *
+ * Notes: We block until the host is out of error recovery, and then
+ * report the device's online flag (the value actually returned).
+ */
+int
+scsi_block_when_processing_errors(Scsi_Device * SDpnt)
+{
+ /* NOTE(review): presumably sleeps on host_wait while in_recovery is set - confirm against SCSI_SLEEP. */
+ SCSI_SLEEP( &SDpnt->host->host_wait, SDpnt->host->in_recovery);
+
+ SCSI_LOG_ERROR_RECOVERY(5,printk("Open returning %d\n", SDpnt->online));
+
+ return SDpnt->online;
+}
+
+/*
+ * Function: scsi_eh_times_out()
+ *
+ * Purpose: Timeout function for error handling.
+ *
+ * Arguments: SCpnt - command that is timing out.
+ *
+ * Returns: Nothing.
+ *
+ * Notes: While a recovery command is outstanding, the error handler
+ * thread sleeps on the host's eh_action semaphore. Our only
+ * job is to record that it timed out, and to wake up the
+ * thread. A missing semaphore is treated as fatal.
+ */
+STATIC
+void scsi_eh_times_out (Scsi_Cmnd * SCpnt)
+{
+    SCpnt->request.rq_status = RQ_SCSI_DONE;
+    SCpnt->owner = SCSI_OWNER_ERROR_HANDLER;
+    SCpnt->eh_state = SCSI_STATE_TIMEOUT;
+
+    SCSI_LOG_ERROR_RECOVERY(5,printk("In scsi_eh_times_out %p\n", SCpnt));
+
+    if (SCpnt->host->eh_action == NULL)
+        panic("Missing scsi error handler thread");
+    else
+        up(SCpnt->host->eh_action);
+}
+
+
+/*
+ * Function: scsi_eh_done()
+ *
+ * Purpose: Completion function for error handling.
+ *
+ * Arguments: SCpnt - command that has completed.
+ *
+ * Returns: Nothing.
+ *
+ * Notes: During error handling, the kernel thread will be sleeping
+ * waiting for some action to complete on the device. Our only
+ * job is to record that the action completed, and to wake up the
+ * thread - if there is an eh_action semaphore to wake it through.
+ */
+STATIC
+void scsi_eh_done (Scsi_Cmnd * SCpnt)
+{
+    SCpnt->request.rq_status = RQ_SCSI_DONE;
+    SCpnt->owner = SCSI_OWNER_ERROR_HANDLER;
+    SCpnt->eh_state = SUCCESS;
+
+    SCSI_LOG_ERROR_RECOVERY(5,printk("In eh_done %p result:%x\n", SCpnt,
+                                     SCpnt->result));
+
+    if (SCpnt->host->eh_action == NULL)
+        return;
+    up(SCpnt->host->eh_action);
+}
+
+/*
+ * Function: scsi_eh_action_done()
+ *
+ * Purpose: Completion function for error handling.
+ *
+ * Arguments: SCpnt - command the abort or reset was issued for.
+ * answer - boolean that indicates whether operation succeeded.
+ *
+ * Returns: Nothing.
+ *
+ * Notes: This callback is only used for abort and reset operations.
+ */
+STATIC
+void scsi_eh_action_done (Scsi_Cmnd * SCpnt, int answer)
+{
+    SCpnt->request.rq_status = RQ_SCSI_DONE;
+    SCpnt->owner = SCSI_OWNER_ERROR_HANDLER;
+    SCpnt->eh_state = (answer != 0) ? SUCCESS : FAILED;
+
+    /* Wake whoever waits on the action semaphore, if one is installed. */
+    if (SCpnt->host->eh_action != NULL)
+        up(SCpnt->host->eh_action);
+}
+
+/*
+ * Function: scsi_sense_valid()
+ *
+ * Purpose: Determine whether the command's sense buffer holds sense
+ * information, i.e. whether the 0x70 bits of byte 0 are all set.
+ * Returns: TRUE if it does, FALSE otherwise.
+ */
+int
+scsi_sense_valid(Scsi_Cmnd * SCpnt)
+{
+    int code_bits = SCpnt->sense_buffer[0] & 0x70;
+
+    if (code_bits == 0x70)
+        return TRUE;
+    return FALSE;
+}
+
+/*
+ * Function: scsi_eh_retry_command()
+ *
+ * Purpose: Retry the original command
+ *
+ * Returns: SUCCESS - the retried command completed normally.
+ * FAILED - the retried command failed again or timed out.
+ *
+ * Notes: This function will *NOT* return until the command either
+ * times out, or it completes.
+ */
+STATIC int
+scsi_eh_retry_command(Scsi_Cmnd * SCpnt)
+{
+    memcpy ((void *) SCpnt->cmnd, (void*) SCpnt->data_cmnd,
+            sizeof(SCpnt->data_cmnd));
+    SCpnt->request_buffer = SCpnt->buffer;
+    SCpnt->request_bufflen = SCpnt->bufflen;
+    SCpnt->use_sg = SCpnt->old_use_sg;
+
+    /* Reuse the length the command was originally sent with. COMMAND_SIZE()
+     * only guesses from the opcode, so use it only if no length was saved. */
+    SCpnt->cmd_len = SCpnt->old_cmd_len;
+    if( SCpnt->cmd_len == 0 )
+        SCpnt->cmd_len = COMMAND_SIZE(SCpnt->cmnd[0]);
+
+    scsi_send_eh_cmnd (SCpnt, SCpnt->timeout_per_command);
+    /* scsi_send_eh_cmnd() has left the verdict in eh_state. */
+    return SCpnt->eh_state;
+}
+
+/*
+ * Function: scsi_request_sense()
+ *
+ * Purpose: Request sense data from a particular target.
+ *
+ * Returns: SUCCESS - we were able to get the sense data.
+ * FAILED - we were not able to get the sense data.
+ *
+ * Notes: Some hosts automatically obtain this information, others
+ * require that we obtain it on our own.
+ *
+ * This function will *NOT* return until the command either
+ * times out, or it completes.
+ */
+STATIC int
+scsi_request_sense(Scsi_Cmnd * SCpnt)
+{
+    static unsigned char generic_sense[6] = {REQUEST_SENSE, 0,0,0, 255, 0};
+
+    memcpy ((void *) SCpnt->cmnd , (void *) generic_sense,
+            sizeof(generic_sense));
+    SCpnt->cmnd[1] = SCpnt->lun << 5;
+    SCpnt->cmnd[4] = sizeof(SCpnt->sense_buffer);
+
+    /*
+     * Zero the sense buffer first: any bytes the target does not transfer
+     * must read as zero rather than as leftovers of an earlier command.
+     */
+    memset ((void *) SCpnt->sense_buffer, 0, sizeof(SCpnt->sense_buffer));
+    SCpnt->request_buffer = &SCpnt->sense_buffer;
+    SCpnt->request_bufflen = sizeof(SCpnt->sense_buffer);
+    SCpnt->use_sg = 0;
+    SCpnt->cmd_len = COMMAND_SIZE(SCpnt->cmnd[0]);
+
+    scsi_send_eh_cmnd (SCpnt, SENSE_TIMEOUT);
+    return SCpnt->eh_state;
+}
+
+/*
+ * Function: scsi_test_unit_ready()
+ *
+ * Purpose: Run test unit ready command to see if the device is talking to us or not.
+ * Returns: SUCCESS or FAILED, as left in eh_state by scsi_send_eh_cmnd().
+ */
+STATIC int
+scsi_test_unit_ready(Scsi_Cmnd * SCpnt)
+{
+    static unsigned char tur_command[6] = {TEST_UNIT_READY, 0,0,0,0,0};
+
+    memcpy ((void *) SCpnt->cmnd , (void *) tur_command,
+            sizeof(tur_command));
+    SCpnt->cmnd[1] = SCpnt->lun << 5;
+
+    /*
+     * TEST UNIT READY has no data phase and no allocation length, so no
+     * buffer is attached and cmnd[4] stays zero. Stale sense data is
+     * cleared so that only what this command produces is looked at later.
+     */
+    memset ((void *) SCpnt->sense_buffer, 0, sizeof(SCpnt->sense_buffer));
+    SCpnt->request_buffer = NULL;
+    SCpnt->request_bufflen = 0;
+    SCpnt->use_sg = 0;
+    SCpnt->cmd_len = COMMAND_SIZE(SCpnt->cmnd[0]);
+
+    scsi_send_eh_cmnd (SCpnt, SENSE_TIMEOUT);
+    return SCpnt->eh_state;
+}
+
+STATIC
+void scsi_sleep_done (struct semaphore * sem)
+{
+    /* Timer callback installed by scsi_sleep(): release the semaphore, if any. */
+    if( sem == NULL )
+        return;
+    up(sem);
+}
+
+
+void scsi_sleep (int timeout)
+{
+    /* Block the caller for 'timeout' jiffies: one-shot timer plus a locked semaphore. */
+    struct semaphore sem = MUTEX_LOCKED;
+    struct timer_list timer;
+
+    /* The timer lives on the stack: clear its list links before adding it. */
+    init_timer(&timer);
+    timer.data = (unsigned long) &sem;
+    timer.expires = jiffies + timeout;
+    timer.function = (void (*)(unsigned long))scsi_sleep_done;
+
+    SCSI_LOG_ERROR_RECOVERY(5,printk("Sleeping for timer tics %d\n", timeout));
+    add_timer(&timer);
+    down(&sem); /* released by scsi_sleep_done() when the timer fires */
+    del_timer(&timer);
+}
+
+/*
+ * Function: scsi_send_eh_cmnd
+ *
+ * Purpose: Send a command out to a device as part of error recovery.
+ *
+ * Notes: Does not return until the command has completed or timed out;
+ * the verdict (SUCCESS or FAILED) is left in SCpnt->eh_state. The
+ * structure setup and the completion handler differ from normal I/O.
+ */
+STATIC void scsi_send_eh_cmnd (Scsi_Cmnd * SCpnt, int timeout)
+{
+    struct Scsi_Host * host;
+
+    host = SCpnt->host;
+
+retry:
+    /*
+     * We will use a queued command if possible, otherwise we will emulate the
+     * queuing and calling of completion function ourselves.
+     */
+    SCpnt->owner = SCSI_OWNER_LOWLEVEL;
+
+    if (host->can_queue)
+    {
+        struct semaphore sem = MUTEX_LOCKED;
+
+        SCpnt->eh_state = SCSI_STATE_QUEUED;
+
+        scsi_add_timer(SCpnt, timeout, scsi_eh_times_out);
+
+        /*
+         * Set up the semaphore so we wait for the command to complete.
+         */
+        SCpnt->host->eh_action = &sem;
+        SCpnt->request.rq_status = RQ_SCSI_BUSY;
+
+        host->hostt->queuecommand (SCpnt, scsi_eh_done);
+        down(&sem);
+        SCpnt->host->eh_action = NULL;
+
+        del_timer(&SCpnt->eh_timeout);
+
+        /*
+         * See if timeout. If so, the command is simply recorded as failed.
+         * NOTE: nothing here tells the host to forget about the command.
+         */
+        if( SCpnt->eh_state == SCSI_STATE_TIMEOUT )
+        {
+            SCpnt->eh_state = FAILED;
+        }
+
+        SCSI_LOG_ERROR_RECOVERY(5,printk("send_eh_cmnd: %p eh_state:%x\n",
+                                         SCpnt, SCpnt->eh_state));
+    }
+    else
+    {
+        int temp;
+
+        /*
+         * We damn well had better never use this code. There is no timeout
+         * protection here, since we would end up waiting in the actual low
+         * level driver, we don't know how to wake it up.
+         */
+        temp = host->hostt->command (SCpnt);
+        SCpnt->result = temp;
+        /*
+         * The command has completed synchronously; leave the verdict to
+         * the status check below. scsi_eh_completed_normally() returns a
+         * disposition code rather than a boolean, and clears state
+         * (IS_RESETTING, expecting_cc_ua) as it goes, so it must run
+         * exactly once per completion.
+         */
+        SCpnt->eh_state = SUCCESS;
+    }
+
+    /*
+     * Now examine the actual status codes to see whether the command actually
+     * did complete normally.
+     */
+    if( SCpnt->eh_state == SUCCESS )
+    {
+        switch( scsi_eh_completed_normally(SCpnt) )
+        {
+        case SUCCESS:
+            SCpnt->eh_state = SUCCESS;
+            break;
+        case NEEDS_RETRY:
+            goto retry;
+        case FAILED:
+        default:
+            SCpnt->eh_state = FAILED;
+            break;
+        }
+    }
+    else
+    {
+        SCpnt->eh_state = FAILED;
+    }
+}
+
+/*
+ * Function: scsi_unit_is_ready()
+ * Purpose: Called after TEST_UNIT_READY is run, to test to see if
+ * the unit responded in a way that indicates it is ready.
+ * Returns: 0 if valid sense names a key other than NOT_READY, UNIT_ATTENTION or ILLEGAL_REQUEST; else 1.
+ */
+STATIC int
+scsi_unit_is_ready(Scsi_Cmnd * SCpnt)
+{
+    int sense_key;
+
+    /* A zero result needs no further inspection. */
+    if (!SCpnt->result)
+        return 1;
+
+    /* Sense data only counts if it was flagged and is well formed. */
+    if (!((driver_byte (SCpnt->result) & DRIVER_SENSE) ||
+          (status_byte (SCpnt->result) & CHECK_CONDITION)))
+        return 1;
+    if ((SCpnt->sense_buffer[0] & 0x70) != 0x70)
+        return 1;
+
+    sense_key = SCpnt->sense_buffer[2] & 0xf;
+    return (sense_key == NOT_READY || sense_key == UNIT_ATTENTION ||
+            sense_key == ILLEGAL_REQUEST);
+}
+
+/*
+ * Function: scsi_eh_finish_command
+ *
+ * Purpose: Handle a command that we are finished with WRT error handling.
+ *
+ * Arguments: SClist - pointer to list into which we are putting completed commands.
+ * SCpnt - command that is completing
+ *
+ * Notes: We don't want to use the normal command completion while we
+ * are still handling errors - it may cause other commands to be queued,
+ * and that would disturb what we are doing. Thus we really want to keep
+ * a list of pending commands for final completion, and once we
+ * are ready to leave error handling we handle completion for real.
+ */
+STATIC void
+scsi_eh_finish_command(Scsi_Cmnd **SClist, Scsi_Cmnd * SCpnt)
+{
+    /*
+     * Put use_sg back to its saved value so that the upper level can
+     * correctly free up things.
+     */
+    SCpnt->use_sg = SCpnt->old_use_sg;
+    SCpnt->state = SCSI_STATE_BHQUEUE;
+    SCpnt->bh_next = *SClist; /* push onto the head of the list */
+    *SClist = SCpnt;
+}
+
+/*
+ * Function: scsi_try_to_abort_command
+ *
+ * Purpose: Ask host adapter to abort a running command.
+ *
+ * Returns: FAILED Operation failed or not supported.
+ * SUCCESS Succeeded.
+ *
+ * Notes: This function will not return until the user's completion
+ * function has been called. There is no timeout on this
+ * operation (the timeout argument is not used). If the author
+ * of the low-level driver wishes this operation to be timed,
+ * they can provide this facility themselves. Helper functions
+ * in scsi_error.c can be supplied to make this easier to do.
+ *
+ * Notes: It may be possible to combine this with all of the reset
+ * handling to eliminate a lot of code duplication. I don't
+ * know what makes more sense at the moment - this is just a
+ * prototype.
+ */
+STATIC int
+scsi_try_to_abort_command(Scsi_Cmnd * SCpnt, int timeout)
+{
+    SCpnt->eh_state = FAILED; /* Until we come up with something better */
+
+    /* Only hosts that supply an abort handler can be asked to abort. */
+    if( SCpnt->host->hostt->eh_abort_handler != NULL )
+    {
+        SCpnt->owner = SCSI_OWNER_LOWLEVEL;
+        return SCpnt->host->hostt->eh_abort_handler(SCpnt);
+    }
+
+    return FAILED;
+}
+
+/*
+ * Function: scsi_try_bus_device_reset
+ *
+ * Purpose: Ask host adapter to perform a bus device reset for a given
+ * device.
+ *
+ * Returns: FAILED Operation failed or not supported.
+ * SUCCESS Succeeded.
+ *
+ * Notes: There is no timeout for this operation (the timeout argument is
+ * not used). If this operation is unreliable for a given host, then
+ * the host itself needs to put a timer on it, and set the host back
+ * to a consistent state prior to returning.
+ */
+STATIC int
+scsi_try_bus_device_reset(Scsi_Cmnd * SCpnt, int timeout)
+{
+    SCpnt->eh_state = FAILED; /* Until we come up with something better */
+
+    /* Only hosts that supply a device reset handler can be asked. */
+    if( SCpnt->host->hostt->eh_device_reset_handler != NULL )
+    {
+        SCpnt->owner = SCSI_OWNER_LOWLEVEL;
+        return SCpnt->host->hostt->eh_device_reset_handler(SCpnt);
+    }
+
+    return FAILED;
+}
+
+/*
+ * Function: scsi_try_bus_reset
+ *
+ * Purpose: Ask host adapter to perform a bus reset for a host.
+ *
+ * Returns: FAILED Operation failed or not supported.
+ * SUCCESS Succeeded.
+ *
+ * Notes: Sleeps for BUS_RESET_SETTLE_TIME once the handler has been run.
+ */
+STATIC int
+scsi_try_bus_reset(Scsi_Cmnd * SCpnt)
+{
+    int rtn;
+
+    SCpnt->eh_state = FAILED; /* Until we come up with something better */
+    SCpnt->owner = SCSI_OWNER_LOWLEVEL;
+
+    if( SCpnt->host->hostt->eh_bus_reset_handler == NULL )
+        return FAILED;
+
+    /* The handler reports through its return value; record a success. */
+    rtn = SCpnt->host->hostt->eh_bus_reset_handler(SCpnt);
+    if( rtn == SUCCESS )
+        SCpnt->eh_state = SUCCESS;
+
+    /*
+     * Let the bus settle. If we had a successful bus reset, mark the
+     * devices on this channel to expect a condition code of unit attention.
+     */
+    scsi_sleep(BUS_RESET_SETTLE_TIME);
+    if( SCpnt->eh_state == SUCCESS )
+    {
+        Scsi_Device * SDloop;
+        for (SDloop = SCpnt->host->host_queue; SDloop; SDloop = SDloop->next)
+        {
+            if( SCpnt->channel == SDloop->channel )
+            {
+                SDloop->was_reset = 1;
+                SDloop->expecting_cc_ua = 1;
+            }
+        }
+    }
+    return SCpnt->eh_state;
+}
+
+/*
+ * Function: scsi_try_host_reset
+ *
+ * Purpose: Ask host adapter to reset itself, and the bus.
+ *
+ * Returns: FAILED Operation failed or not supported.
+ * SUCCESS Succeeded.
+ *
+ * Notes: Sleeps for HOST_RESET_SETTLE_TIME once the handler has been run.
+ */
+STATIC int
+scsi_try_host_reset(Scsi_Cmnd * SCpnt)
+{
+    int rtn;
+
+    SCpnt->eh_state = FAILED; /* Until we come up with something better */
+    SCpnt->owner = SCSI_OWNER_LOWLEVEL;
+
+    if( SCpnt->host->hostt->eh_host_reset_handler == NULL )
+        return FAILED;
+
+    /* The handler reports through its return value; record a success. */
+    rtn = SCpnt->host->hostt->eh_host_reset_handler(SCpnt);
+    if( rtn == SUCCESS )
+        SCpnt->eh_state = SUCCESS;
+
+    /*
+     * Let things settle. If we had a successful host reset, mark every
+     * device on the host to expect a condition code of unit attention.
+     */
+    scsi_sleep(HOST_RESET_SETTLE_TIME);
+    if( SCpnt->eh_state == SUCCESS )
+    {
+        Scsi_Device * SDloop;
+        for (SDloop = SCpnt->host->host_queue; SDloop; SDloop = SDloop->next)
+        {
+            SDloop->was_reset = 1;
+            SDloop->expecting_cc_ua = 1;
+        }
+    }
+    return SCpnt->eh_state;
+}
+
+/*
+ * Function: scsi_decide_disposition
+ *
+ * Purpose: Examine a command block that has come back from the low-level
+ * and figure out what to do next.
+ *
+ * Returns: SUCCESS - pass on to upper level.
+ * FAILED - pass on to error handler thread.
+ * NEEDS_RETRY - command should be retried.
+ * SOFT_ERROR - command succeeded, but we need to log a soft error.
+ * ADD_TO_MLQUEUE - the device reported QUEUE_FULL.
+ *
+ * Notes: This is *ONLY* called when we are examining the status
+ * after sending out the actual data command. Any commands
+ * that are queued for error recovery (i.e. TEST_UNIT_READY)
+ * do *NOT* come through here.
+ *
+ * NOTE - When this routine returns FAILED, it means the error
+ * handler thread is woken. In cases where the error code
+ * indicates an error that doesn't require the error handler
+ * thread (i.e. we don't need to abort/reset), then this function
+ * should return SUCCESS.
+ */
+int scsi_decide_disposition (Scsi_Cmnd * SCpnt)
+{
+ int rtn;
+
+ /*
+ * If the device is offline, then we clearly just pass the result back
+ * up to the top level.
+ */
+ if( SCpnt->device->online == FALSE )
+ {
+ SCSI_LOG_ERROR_RECOVERY(5,printk("scsi_error.c: device offline - report as SUCCESS\n"));
+ return SUCCESS;
+ }
+
+ /*
+ * First check the host byte, to see if there is anything in there
+ * that would indicate what we need to do.
+ */
+
+ switch(host_byte(SCpnt->result))
+ {
+ case DID_PASSTHROUGH:
+ /*
+ * No matter what, pass this through to the upper layer.
+ * Nuke this special code so that it looks like we are saying
+ * DID_OK.
+ */
+ SCpnt->result &= 0xff00ffff;
+ return SUCCESS;
+ case DID_OK:
+ /*
+ * Looks good. Drop through, and check the next byte.
+ */
+ break;
+ case DID_NO_CONNECT:
+ case DID_BAD_TARGET:
+ case DID_ABORT:
+ /*
+ * Note - this means that we just report the status back to the
+ * top level driver, not that we actually think that it indicates
+ * success.
+ */
+ return SUCCESS;
+ case DID_PARITY:
+ case DID_BUS_BUSY:
+ case DID_ERROR:
+ goto maybe_retry;
+ case DID_TIME_OUT:
+ /*
+ * When we scan the bus, we get timeout messages for
+ * these commands if there is no device available.
+ * Other hosts report DID_NO_CONNECT for the same thing.
+ */
+ if( (SCpnt->cmnd[0] == TEST_UNIT_READY ||
+ SCpnt->cmnd[0] == INQUIRY) )
+ {
+ return SUCCESS;
+ }
+ else
+ {
+ return FAILED;
+ }
+ case DID_RESET:
+ /*
+ * If we initiated the reset ourselves (IS_RESETTING), clear the flag
+ * and treat the command as a candidate for retry.
+ */
+ if( SCpnt->flags & IS_RESETTING )
+ {
+ SCpnt->flags &= ~IS_RESETTING;
+ goto maybe_retry;
+ }
+
+ /*
+ * Examine the sense data to figure out how to proceed from here.
+ * If there is no sense data, we will be forced into the error
+ * handler thread, where we get to examine the thing in a lot more
+ * detail.
+ */
+ return scsi_check_sense (SCpnt);
+ default:
+ return FAILED;
+ }
+
+ /*
+ * Next, check the message byte.
+ */
+ if( msg_byte(SCpnt->result) != COMMAND_COMPLETE )
+ {
+ return FAILED;
+ }
+
+ /*
+ * Now, check the status byte to see if this indicates anything special.
+ */
+ switch (status_byte(SCpnt->result))
+ {
+ case QUEUE_FULL:
+ /*
+ * The case of trying to send too many commands to a tagged queueing
+ * device.
+ */
+ return ADD_TO_MLQUEUE;
+ case GOOD:
+ case COMMAND_TERMINATED:
+ return SUCCESS;
+ case CHECK_CONDITION:
+ rtn = scsi_check_sense(SCpnt);
+ if( rtn == NEEDS_RETRY )
+ {
+ goto maybe_retry;
+ }
+ return rtn;
+ case CONDITION_GOOD:
+ case INTERMEDIATE_GOOD:
+ case INTERMEDIATE_C_GOOD:
+ /*
+ * Who knows? FIXME(eric)
+ */
+ return SUCCESS;
+ case BUSY:
+ case RESERVATION_CONFLICT:
+ goto maybe_retry;
+ default:
+ return FAILED;
+ }
+ return FAILED; /* not reached: every case above returns or jumps */
+
+maybe_retry:
+ /* Count this attempt; give up once the allowed number is reached. */
+ if ((++SCpnt->retries) < SCpnt->allowed)
+ {
+ return NEEDS_RETRY;
+ }
+ else
+ {
+ return FAILED;
+ }
+}
+
+/*
+ * Function: scsi_eh_completed_normally
+ *
+ * Purpose: Examine a command block that has come back from the low-level
+ * and figure out what to do next.
+ *
+ * Returns: SUCCESS - command completed normally.
+ * FAILED - command did not complete normally.
+ * NEEDS_RETRY - DID_RESET after a reset we started ourselves.
+ * SOFT_ERROR - command succeeded, but we need to log
+ * a soft error.
+ *
+ * Notes: This is *ONLY* called when we are examining the status
+ * of commands queued during error recovery. The main
+ * difference here is that a NEEDS_RETRY from the sense check
+ * is turned into FAILED, and we are a lot more restrictive
+ * about what we consider acceptable.
+ */
+STATIC int scsi_eh_completed_normally (Scsi_Cmnd * SCpnt)
+{
+ int rtn;
+ /*
+ * First check the host byte, to see if there is anything in there
+ * that would indicate what we need to do.
+ */
+ if( host_byte(SCpnt->result) == DID_RESET )
+ {
+ if (SCpnt->flags & IS_RESETTING )
+ {
+ /*
+ * OK, this is normal. We don't know whether in fact the
+ * command in question really needs to be rerun or not -
+ * if this was the original data command then the answer is yes,
+ * otherwise we just flag it as success.
+ */
+ SCpnt->flags &= ~IS_RESETTING;
+ return NEEDS_RETRY;
+ }
+
+ /*
+ * Rats. We are already in the error handler, so we now get to try
+ * and figure out what to do next. If the sense is valid, we have
+ * a pretty good idea of what to do. If not, we mark it as failed.
+ */
+ return scsi_check_sense (SCpnt);
+ }
+
+ if(host_byte(SCpnt->result) != DID_OK )
+ {
+ return FAILED;
+ }
+
+ /*
+ * Next, check the message byte.
+ */
+ if( msg_byte(SCpnt->result) != COMMAND_COMPLETE )
+ {
+ return FAILED;
+ }
+
+ /*
+ * Now, check the status byte to see if this indicates anything special.
+ */
+ switch (status_byte(SCpnt->result))
+ {
+ case GOOD:
+ case COMMAND_TERMINATED:
+ return SUCCESS;
+ case CHECK_CONDITION:
+ rtn = scsi_check_sense(SCpnt);
+ if( rtn == NEEDS_RETRY )
+ {
+ return FAILED;
+ }
+ return rtn;
+ case CONDITION_GOOD:
+ case INTERMEDIATE_GOOD:
+ case INTERMEDIATE_C_GOOD:
+ /*
+ * Who knows? FIXME(eric)
+ */
+ return SUCCESS;
+ case BUSY:
+ case QUEUE_FULL:
+ case RESERVATION_CONFLICT:
+ default:
+ return FAILED;
+ }
+ return FAILED; /* not reached: every case above returns */
+}
+
+/*
+ * Function: scsi_check_sense
+ * Purpose: Examine sense information - give suggestion as to what
+ * we should do with it.
+ * Returns: SUCCESS, SOFT_ERROR, NEEDS_RETRY or FAILED.
+ */
+STATIC int scsi_check_sense (Scsi_Cmnd * SCpnt)
+{
+    int sense_key;
+
+    /*
+     * Without valid sense data, or with any of the top three bits of
+     * byte 2 set, there is nothing here we can act on.
+     */
+    if ( !scsi_sense_valid(SCpnt) || (SCpnt->sense_buffer[2] & 0xe0) )
+        return FAILED;
+
+    sense_key = SCpnt->sense_buffer[2] & 0xf;
+    switch (sense_key)
+    {
+    case NO_SENSE:
+        return SUCCESS;
+    case RECOVERED_ERROR:
+        return SOFT_ERROR;
+    case ABORTED_COMMAND:
+        return NEEDS_RETRY;
+
+    case NOT_READY:
+    case UNIT_ATTENTION:
+        /*
+         * Normally this is information for the upper-level driver to
+         * deal with. Only when we are expecting a CC/UA because of a
+         * bus reset that we performed is it consumed here and turned
+         * into a retry.
+         */
+        if( !SCpnt->device->expecting_cc_ua )
+        {
+            return SUCCESS;
+        }
+        SCpnt->device->expecting_cc_ua = 0;
+        return NEEDS_RETRY;
+
+    /* these three are not supported */
+    case COPY_ABORTED:
+    case VOLUME_OVERFLOW:
+    case MISCOMPARE:
+    /* everything else is reported as a failure as well */
+    case MEDIUM_ERROR:
+    case BLANK_CHECK:
+    case DATA_PROTECT:
+    case HARDWARE_ERROR:
+    case ILLEGAL_REQUEST:
+    default:
+        return FAILED;
+    }
+}
+
+
+/*
+ * Function: scsi_restart_operations
+ *
+ * Purpose: Restart IO operations to the specified host.
+ *
+ * Arguments: host - host that we are restarting
+ *
+ * Returns: Nothing
+ *
+ * Notes: When we entered the error handler, we blocked all further
+ * I/O to this device. We need to 'reverse' this process.
+ */
+STATIC void
+scsi_restart_operations(struct Scsi_Host * host)
+{
+    Scsi_Device * dev;
+
+    /*
+     * First free up anything directly waiting upon the host. This will be
+     * requests for character device operations, and also for ioctls to queued
+     * block devices.
+     */
+    SCSI_LOG_ERROR_RECOVERY(5,printk("scsi_error.c: Waking up host to restart\n"));
+
+    wake_up(&host->host_wait);
+
+    /*
+     * Finally, block devices need an extra kick in the pants. This is because
+     * the request queueing mechanism may have queued lots of pending requests
+     * and there won't be a process waiting in a place where we can simply wake
+     * it up. Thus we simply go through and call the request function to goose
+     * the various top level drivers and get things moving again.
+     */
+    for( dev = host->host_queue; dev != NULL; dev = dev->next )
+    {
+        SCSI_LOG_ERROR_RECOVERY(5,printk("Calling request function to restart things...\n"));
+        if( dev->scsi_request_fn == NULL )
+            continue;
+        (*dev->scsi_request_fn)();
+    }
+}
+
+/*
+ * Function: scsi_unjam_host
+ *
+ * Purpose: Attempt to fix a host which has a command that failed for
+ * some reason.
+ *
+ * Arguments: host - host that needs unjamming.
+ *
+ * Returns: Nothing
+ *
+ * Notes: When we come in here, we *know* that all commands on the
+ * bus have either completed, failed or timed out. We also
+ * know that no further commands are being sent to the host,
+ * so things are relatively quiet and we have freedom to
+ * fiddle with things as we wish.
+ *
+ * Additional note: This is only the *default* implementation. It is possible
+ * for individual drivers to supply their own version of this
+ * function, and if the maintainer wishes to do this, it is
+ * strongly suggested that this function be taken as a template
+ * and modified. This function was designed to correctly handle
+ * problems for about 95% of the different cases out there, and
+ * it should always provide at least a reasonable amount of error
+ * recovery.
+ *
+ * Note3: Any command marked 'FAILED' or 'TIMEOUT' must eventually
+ * have scsi_finish_command() called for it. We do all of
+ * the retry stuff here, so when we restart the host after we
+ * return it should have an empty queue.
+ */
+STATIC int
+scsi_unjam_host(struct Scsi_Host * host)
+{
+    int devices_failed;
+    int numfailed;
+    int ourrtn;
+    int rtn = FALSE;
+    int result;
+    Scsi_Cmnd * SCloop;
+    Scsi_Cmnd * SCpnt;
+    Scsi_Device * SDpnt;
+    Scsi_Device * SDloop;
+    Scsi_Cmnd * SCdone;
+    int timed_out;
+
+    /*
+     * Overview: walk every command on every device of this host and escalate
+     * through request sense -> abort -> bus device reset -> bus reset ->
+     * host reset -> take device offline.  After each stage we stop as soon
+     * as host->host_failed has dropped to zero.
+     *
+     * Return value: TRUE if host_failed reached zero before the final
+     * "take device offline" stage, FALSE if that final stage had to run.
+     *
+     * SCdone is the head of the list of commands we have finished with; it
+     * is handed to scsi_eh_finish_command() throughout and only passed on to
+     * scsi_done() at 'leave', after recovery is over.
+     */
+    SCdone = NULL;
+
+    /*
+     * First, protect against any sort of race condition.  If any of the outstanding
+     * commands are in states that indicate that we are not yet blocked (i.e. we are
+     * not in a quiet state) then we got woken up in error.  If we ever end up here,
+     * we need to re-examine some of the assumptions.
+     */
+    for(SDpnt=host->host_queue; SDpnt; SDpnt = SDpnt->next)
+    {
+        for(SCpnt=SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next)
+        {
+            if( SCpnt->state == SCSI_STATE_FAILED
+                || SCpnt->state == SCSI_STATE_TIMEOUT
+                || SCpnt->state == SCSI_STATE_UNUSED)
+            {
+                continue;
+            }
+
+            /*
+             * Rats.  Something is still floating around out there.  This could
+             * be the result of the fact that the upper level drivers are still frobbing
+             * commands that might have succeeded.  There are two outcomes.  One is that
+             * the command block will eventually be freed, and the other one is that
+             * the command will be queued and will be finished along the way.
+             */
+            SCSI_LOG_ERROR_RECOVERY(1,printk("Error handler prematurely woken - commands still active (%p %x %d)\n", SCpnt, SCpnt->state, SCpnt->target));
+            panic("SCSI Error handler woken too early\n");
+        }
+    }
+
+    /*
+     * Next, see if we need to request sense information.  if so,
+     * then get it now, so we have a better idea of what to do.
+     * FIXME(eric) this has the unfortunate side effect that if a host
+     * adapter does not automatically request sense information, that we end
+     * up shutting it down before we request it.  All hosts should be doing this
+     * anyways, so for now all I have to say is tough noogies if you end up in here.
+     * On second thought, this is probably a good idea.  We *really* want to give
+     * authors an incentive to automatically request this.
+     */
+    SCSI_LOG_ERROR_RECOVERY(3,printk("scsi_unjam_host: Checking to see if we need to request sense\n"));
+
+    for(SDpnt=host->host_queue; SDpnt; SDpnt = SDpnt->next)
+    {
+        for(SCpnt=SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next)
+        {
+            /*
+             * Only failed commands that do not already carry valid sense
+             * data are of interest in this stage.
+             */
+            if( SCpnt->state != SCSI_STATE_FAILED || scsi_sense_valid(SCpnt) )
+            {
+                continue;
+            }
+
+            SCSI_LOG_ERROR_RECOVERY(2,printk("scsi_unjam_host: Requesting sense for %d\n",
+                                             SCpnt->target));
+            rtn = scsi_request_sense(SCpnt);
+            if( rtn != SUCCESS )
+            {
+                continue;
+            }
+
+            SCSI_LOG_ERROR_RECOVERY(3,printk("Sense requested for %p - result %x\n",
+                                             SCpnt, SCpnt->result));
+            SCSI_LOG_ERROR_RECOVERY(3,print_sense("bh",SCpnt));
+
+            result = scsi_decide_disposition(SCpnt);
+
+            /*
+             * If the result was normal, then just pass it along to the
+             * upper level.
+             */
+            if( result == SUCCESS )
+            {
+                SCpnt->host->host_failed--;
+                scsi_eh_finish_command(&SCdone, SCpnt);
+            }
+
+            if( result != NEEDS_RETRY )
+            {
+                continue;
+            }
+
+            /*
+             * We only come in here if we want to retry a
+             * command.  The test to see whether the command
+             * should be retried should be keeping track of the
+             * number of tries, so we don't end up looping, of
+             * course.
+             *
+             * NOTE(review): this stores a disposition code (NEEDS_RETRY)
+             * in the state field, which is otherwise compared against
+             * SCSI_STATE_* values in this function.  If the retry below
+             * fails and nothing resets the state, the later stages will no
+             * longer recognise this command as failed - confirm against
+             * scsi_eh_retry_command().
+             */
+            SCpnt->state = NEEDS_RETRY;
+            rtn = scsi_eh_retry_command(SCpnt);
+            if( rtn != SUCCESS )
+            {
+                continue;
+            }
+
+            /*
+             * We eventually hand this one back to the top level.
+             */
+            SCpnt->host->host_failed--;
+            scsi_eh_finish_command(&SCdone, SCpnt);
+        }
+    }
+
+    /*
+     * Go through the list of commands and figure out where we stand and how bad things
+     * really are.  The three counters are only used for the log message below.
+     */
+    numfailed = 0;
+    timed_out = 0;
+    devices_failed = 0;
+    for(SDpnt=host->host_queue; SDpnt; SDpnt = SDpnt->next)
+    {
+        unsigned int device_error = 0;
+
+        for(SCpnt=SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next)
+        {
+            if( SCpnt->state == SCSI_STATE_FAILED )
+            {
+                SCSI_LOG_ERROR_RECOVERY(5,printk("Command to ID %d failed\n",
+                                                 SCpnt->target));
+                numfailed++;
+                device_error++;
+            }
+            if( SCpnt->state == SCSI_STATE_TIMEOUT )
+            {
+                SCSI_LOG_ERROR_RECOVERY(5,printk("Command to ID %d timedout\n",
+                                                 SCpnt->target));
+                timed_out++;
+                device_error++;
+            }
+        }
+        if( device_error > 0 )
+        {
+            devices_failed++;
+        }
+    }
+
+    SCSI_LOG_ERROR_RECOVERY(2,printk("Total of %d+%d commands on %d devices require eh work\n",
+                                     numfailed, timed_out, devices_failed));
+
+    if( host->host_failed == 0 )
+    {
+        ourrtn = TRUE;
+        goto leave;
+    }
+
+
+    /*
+     * Next, try and see whether or not it makes sense to try and abort
+     * the running command.  This only works out to be the case if we have
+     * one command that has timed out.  If the command simply failed, it
+     * makes no sense to try and abort the command, since as far as the
+     * host adapter is concerned, it isn't running.
+     */
+
+    SCSI_LOG_ERROR_RECOVERY(3,printk("scsi_unjam_host: Checking to see if we want to try abort\n"));
+
+    for(SDpnt=host->host_queue; SDpnt; SDpnt = SDpnt->next)
+    {
+        for(SCloop=SDpnt->device_queue; SCloop; SCloop = SCloop->next)
+        {
+            if( SCloop->state != SCSI_STATE_TIMEOUT )
+            {
+                continue;
+            }
+
+            rtn = scsi_try_to_abort_command(SCloop, ABORT_TIMEOUT);
+
+            /*
+             * A command is only handed back once the abort worked, the
+             * unit answers TEST UNIT READY as ready, and the retry worked.
+             */
+            if( rtn == SUCCESS )
+            {
+                rtn = scsi_test_unit_ready(SCloop);
+
+                if( rtn == SUCCESS && scsi_unit_is_ready(SCloop) )
+                {
+                    rtn = scsi_eh_retry_command(SCloop);
+
+                    if( rtn == SUCCESS )
+                    {
+                        SCloop->host->host_failed--;
+                        scsi_eh_finish_command(&SCdone,SCloop);
+                    }
+                }
+            }
+        }
+    }
+
+    /*
+     * If we have corrected all of the problems, then we are done.
+     */
+    if( host->host_failed == 0 )
+    {
+        ourrtn = TRUE;
+        goto leave;
+    }
+
+    /*
+     * Either the abort wasn't appropriate, or it didn't succeed.
+     * Now try a bus device reset.  Still, look to see whether we have
+     * multiple devices that are jammed or not - if we have multiple devices,
+     * it makes no sense to try BUS_DEVICE_RESET - we really would need
+     * to try a BUS_RESET instead.
+     *
+     * Does this make sense - should we try BDR on each device individually?
+     * Yes, definitely.
+     */
+    SCSI_LOG_ERROR_RECOVERY(3,printk("scsi_unjam_host: Checking to see if we want to try BDR\n"));
+
+    for(SDpnt=host->host_queue; SDpnt; SDpnt = SDpnt->next)
+    {
+        /*
+         * Find the first failed or timed-out command on this device, if
+         * any.  SCloop is NULL afterwards when the device has none.
+         */
+        for(SCloop=SDpnt->device_queue; SCloop; SCloop = SCloop->next)
+        {
+            if( SCloop->state == SCSI_STATE_FAILED
+                || SCloop->state == SCSI_STATE_TIMEOUT )
+            {
+                break;
+            }
+        }
+
+        if( SCloop == NULL )
+        {
+            continue;
+        }
+
+        /*
+         * OK, we have a device that is having problems.  Try and send
+         * a bus device reset to it.
+         *
+         * FIXME(eric) - make sure we handle the case where multiple
+         * commands to the same device have failed.  They all must
+         * get properly restarted.
+         */
+        rtn = scsi_try_bus_device_reset(SCloop, RESET_TIMEOUT);
+
+        if( rtn == SUCCESS )
+        {
+            rtn = scsi_test_unit_ready(SCloop);
+
+            if( rtn == SUCCESS && scsi_unit_is_ready(SCloop) )
+            {
+                rtn = scsi_eh_retry_command(SCloop);
+
+                if( rtn == SUCCESS )
+                {
+                    SCloop->host->host_failed--;
+                    scsi_eh_finish_command(&SCdone,SCloop);
+                }
+            }
+        }
+
+    }
+
+    if( host->host_failed == 0 )
+    {
+        ourrtn = TRUE;
+        goto leave;
+    }
+
+    /*
+     * If we ended up here, we have serious problems.  The only thing left
+     * to try is a full bus reset.  If someone has grabbed the bus and isn't
+     * letting go, then perhaps this will help.
+     */
+    SCSI_LOG_ERROR_RECOVERY(3,printk("scsi_unjam_host: Try hard bus reset\n"));
+
+    /*
+     * We really want to loop over the various channels, and do this on
+     * a channel by channel basis.  We should also check to see if any
+     * of the failed commands are on soft_reset devices, and if so, skip
+     * the reset.
+     */
+    for(SDpnt=host->host_queue; SDpnt; SDpnt = SDpnt->next)
+    {
+        for(SCpnt=SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next)
+        {
+            if( SCpnt->state != SCSI_STATE_FAILED
+                && SCpnt->state != SCSI_STATE_TIMEOUT )
+            {
+                continue;
+            }
+            /*
+             * We have a failed command.  Make sure there are no other failed
+             * commands on the same channel that are timed out and implement a
+             * soft reset.
+             */
+            for(SDloop=host->host_queue; SDloop; SDloop = SDloop->next)
+            {
+                for(SCloop=SDloop->device_queue; SCloop; SCloop = SCloop->next)
+                {
+                    if( SCloop->channel != SCpnt->channel )
+                    {
+                        continue;
+                    }
+
+                    if( SCloop->state != SCSI_STATE_FAILED
+                        && SCloop->state != SCSI_STATE_TIMEOUT )
+                    {
+                        continue;
+                    }
+
+                    if( SDloop->soft_reset && SCloop->state == SCSI_STATE_TIMEOUT )
+                    {
+                        /*
+                         * If this device uses the soft reset option, and this
+                         * is one of the devices acting up, then our only
+                         * option is to wait a bit, since the command is
+                         * supposedly still running.
+                         *
+                         * FIXME(eric) - right now we will just end up falling
+                         * through to the 'take device offline' case.
+                         *
+                         * FIXME(eric) - It is possible that the command completed
+                         * *after* the error recovery procedure started, and if this
+                         * is the case, we are worrying about nothing here.
+                         *
+                         * Give up on a bus reset for the device we are
+                         * scanning and move on to the next one.  The label
+                         * must sit at the *end* of the device loop body:
+                         * jumping back to the top of the body would rescan
+                         * the same device with nothing changed and spin
+                         * here forever.
+                         */
+                        goto next_device;
+                    }
+                }
+            }
+
+            /*
+             * We now know that we are able to perform a reset for the
+             * bus that SCpnt points to.  There are no soft-reset devices
+             * with outstanding timed out commands.
+             */
+            rtn = scsi_try_bus_reset(SCpnt);
+            if( rtn == SUCCESS )
+            {
+                /*
+                 * The reset worked: deal with every failed or timed-out
+                 * command on the channel that was just reset.
+                 */
+                for(SDloop=host->host_queue; SDloop; SDloop = SDloop->next)
+                {
+                    for(SCloop=SDloop->device_queue; SCloop; SCloop = SCloop->next)
+                    {
+                        if( SCloop->channel != SCpnt->channel )
+                        {
+                            continue;
+                        }
+
+                        if( SCloop->state != SCSI_STATE_FAILED
+                            && SCloop->state != SCSI_STATE_TIMEOUT )
+                        {
+                            continue;
+                        }
+
+                        rtn = scsi_test_unit_ready(SCloop);
+
+                        if( rtn == SUCCESS && scsi_unit_is_ready(SCloop) )
+                        {
+                            rtn = scsi_eh_retry_command(SCloop);
+
+                            if( rtn == SUCCESS )
+                            {
+                                SCloop->host->host_failed--;
+                                scsi_eh_finish_command(&SCdone,SCloop);
+                            }
+                        }
+
+                        /*
+                         * If the bus reset worked, but we are still unable to
+                         * talk to the device, take it offline.
+                         * FIXME(eric) - is this really the correct thing to do?
+                         */
+                        if( rtn != SUCCESS )
+                        {
+                            SCloop->device->online = FALSE;
+                            SCloop->host->host_failed--;
+                            scsi_eh_finish_command(&SCdone,SCloop);
+                        }
+                    }
+                }
+            }
+        }
+next_device:
+        ;
+    }
+
+    if( host->host_failed == 0 )
+    {
+        ourrtn = TRUE;
+        goto leave;
+    }
+    /*
+     * If we ended up here, we have serious problems.  The only thing left
+     * to try is a full host reset - perhaps the firmware on the device
+     * crashed, or something like that.
+     *
+     * It is assumed that a successful host reset will cause *all* information
+     * about the command to be flushed from both the host adapter *and* the
+     * device.
+     *
+     * FIXME(eric) - it isn't clear that devices that implement the soft reset
+     * option can ever be cleared except via cycling the power.  The problem is
+     * that sending the host reset command will cause the host to forget
+     * about the pending command, but the device won't forget.  For now, we
+     * skip the host reset option if any of the failed devices are configured
+     * to use the soft reset option.
+     */
+    for(SDpnt=host->host_queue; SDpnt; SDpnt = SDpnt->next)
+    {
+        for(SCpnt=SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next)
+        {
+            if( SCpnt->state != SCSI_STATE_FAILED
+                && SCpnt->state != SCSI_STATE_TIMEOUT )
+            {
+                continue;
+            }
+            if( SDpnt->soft_reset && SCpnt->state == SCSI_STATE_TIMEOUT )
+            {
+                /*
+                 * If this device uses the soft reset option, and this
+                 * is one of the devices acting up, then our only
+                 * option is to wait a bit, since the command is
+                 * supposedly still running.
+                 *
+                 * FIXME(eric) - right now we will just end up falling
+                 * through to the 'take device offline' case.
+                 *
+                 * As in the bus reset stage, the label is at the end of
+                 * the device loop body so that we really do advance to the
+                 * next device instead of rescanning this one forever.
+                 */
+                SCSI_LOG_ERROR_RECOVERY(3,
+                                        printk("scsi_unjam_host: Unable to try hard host reset\n"));
+                goto next_device2;
+            }
+
+            SCSI_LOG_ERROR_RECOVERY(3,printk("scsi_unjam_host: Try hard host reset\n"));
+
+            /*
+             * FIXME(eric) - we need to obtain a valid SCpnt to perform this call.
+             */
+            rtn = scsi_try_host_reset(SCpnt);
+            if( rtn == SUCCESS )
+            {
+                /*
+                 * FIXME(eric) we assume that all commands are flushed from the
+                 * controller.  We should get a DID_RESET for all of the commands
+                 * that were pending.  We should ignore these so that we can
+                 * guarantee that we are in a consistent state.
+                 *
+                 * I believe this to be the case right now, but this needs to be
+                 * tested.
+                 */
+                for(SDloop=host->host_queue; SDloop; SDloop = SDloop->next)
+                {
+                    for(SCloop=SDloop->device_queue; SCloop; SCloop = SCloop->next)
+                    {
+                        if( SCloop->state != SCSI_STATE_FAILED
+                            && SCloop->state != SCSI_STATE_TIMEOUT )
+                        {
+                            continue;
+                        }
+
+                        rtn = scsi_test_unit_ready(SCloop);
+
+                        if( rtn == SUCCESS && scsi_unit_is_ready(SCloop) )
+                        {
+                            rtn = scsi_eh_retry_command(SCloop);
+
+                            if( rtn == SUCCESS )
+                            {
+                                SCloop->host->host_failed--;
+                                scsi_eh_finish_command(&SCdone,SCloop);
+                            }
+                        }
+                        /*
+                         * Same policy as after a bus reset: a device we
+                         * still cannot talk to is taken offline.
+                         */
+                        if( rtn != SUCCESS )
+                        {
+                            SCloop->device->online = FALSE;
+                            SCloop->host->host_failed--;
+                            scsi_eh_finish_command(&SCdone,SCloop);
+                        }
+                    }
+                }
+            }
+        }
+next_device2:
+        ;
+    }
+
+
+    /*
+     * If we solved all of the problems, then let's rev up the engines again.
+     */
+    if( host->host_failed == 0 )
+    {
+        ourrtn = TRUE;
+        goto leave;
+    }
+
+    /*
+     * If the HOST RESET failed, then for now we assume that the entire host
+     * adapter is too hosed to be of any use.  For our purposes, however, it is
+     * easier to simply take the devices offline that correspond to commands
+     * that failed.
+     */
+    SCSI_LOG_ERROR_RECOVERY(1,printk("scsi_unjam_host: Take device offline\n"));
+
+    for(SDpnt=host->host_queue; SDpnt; SDpnt = SDpnt->next)
+    {
+        for(SCloop=SDpnt->device_queue; SCloop; SCloop = SCloop->next)
+        {
+            if( SCloop->state == SCSI_STATE_FAILED || SCloop->state == SCSI_STATE_TIMEOUT )
+            {
+                SCloop->device->online = FALSE;
+
+                /*
+                 * This should pass the failure up to the top level driver, and
+                 * it will have to try and do something intelligent with it.
+                 */
+                SCloop->host->host_failed--;
+
+                if( SCloop->state == SCSI_STATE_TIMEOUT )
+                {
+                    SCloop->result |= (DRIVER_TIMEOUT << 24);
+                }
+
+                SCSI_LOG_ERROR_RECOVERY(3,printk("Finishing command for device %d %x\n",
+                                                 SCloop->device->id, SCloop->result));
+
+                scsi_eh_finish_command(&SCdone,SCloop);
+            }
+        }
+    }
+
+    /*
+     * Every failed or timed-out command has now been accounted for, so the
+     * counter must be back at zero.
+     */
+    if( host->host_failed != 0 )
+    {
+        panic("scsi_unjam_host: Miscount of number of failed commands.\n");
+    }
+
+    SCSI_LOG_ERROR_RECOVERY(3,printk("scsi_unjam_host: Returning\n"));
+
+    ourrtn = FALSE;
+
+leave:
+
+    /*
+     * We should have a list of commands that we 'finished' during the course of
+     * error recovery.  This should be the same as the list of commands that timed out
+     * or failed.  We are currently holding these things in a linked list - we didn't
+     * put them in the bottom half queue because we wanted to keep things quiet while
+     * we were working on recovery, and passing them up to the top level could easily
+     * cause the top level to try and queue something else again.
+     *
+     * Start by marking that the host is no longer in error recovery.
+     */
+    host->in_recovery = 0;
+
+    /*
+     * Take the list of commands, and stick them in the bottom half queue.
+     * The current implementation of scsi_done will do this for us - if need
+     * be we can create a special version of this function to do the
+     * same job for us.
+     *
+     * Each command is unlinked from the list (bh_next cleared) before it is
+     * passed to scsi_done().
+     */
+    for(SCpnt = SCdone; SCpnt != NULL; SCpnt = SCdone)
+    {
+        SCdone = SCpnt->bh_next;
+        SCpnt->bh_next = NULL;
+        scsi_done(SCpnt);
+    }
+
+    return (ourrtn);
+}
+
+
+/*
+ * Function:  scsi_error_handler
+ *
+ * Purpose:   Handle errors/timeouts of scsi commands, try and clean up
+ *            and unjam the bus, and restart things.
+ *
+ * Arguments: data - the struct Scsi_Host for which we are running, passed
+ *                   as a void pointer.
+ *
+ * Returns:   Nothing.  The main loop only ends (and the function only
+ *            returns) once a signal is pending after waiting on the
+ *            semaphore, i.e. when the thread has been told to exit.
+ *
+ * Notes:     This is always run in the context of a kernel thread.  The
+ *            idea is that we start this thing up when the kernel starts
+ *            up (one per host that we detect), and it immediately goes to
+ *            sleep and waits for some event (i.e. failure).  When this
+ *            takes place, we have the job of trying to unjam the bus
+ *            and restarting things.
+ *
+ */
+void
+scsi_error_handler(void * data)
+{
+    struct Scsi_Host * host = (struct Scsi_Host *) data;
+    int rtn;
+    struct semaphore sem = MUTEX_LOCKED;
+
+    lock_kernel();
+
+    /*
+     * If we were started as result of loading a module, close all of the
+     * user space pages.  We don't need them, and if we didn't close them
+     * they would be locked into memory.
+     */
+    exit_mm(current);
+
+
+    current->session = 1;
+    current->pgrp = 1;
+    /*
+     * FIXME(eric) this is still a child process of the one that did the insmod.
+     * This needs to be attached to task[0] instead.
+     */
+
+    /*
+     * Set up the blocked signal mask from SHUTDOWN_SIGS (presumably leaving
+     * only SIGKILL, SIGINT and SIGTERM deliverable - see siginitsetinv()).
+     */
+    siginitsetinv(&current->blocked, SHUTDOWN_SIGS);
+    current->fs->umask = 0;
+
+    /*
+     * Set the name of this process.
+     */
+    sprintf(current->comm, "scsi_eh_%d", host->host_no);
+
+    /*
+     * Publish the semaphore we sleep on and our task pointer in the host
+     * structure.  'sem' lives on this thread's stack, so eh_wait is cleared
+     * again below before we return.
+     */
+    host->eh_wait = &sem;
+    host->ehandler = current;
+
+    unlock_kernel();
+
+    /*
+     * Wake up the thread that created us.
+     */
+    SCSI_LOG_ERROR_RECOVERY(3,printk("Wake up parent %d\n", host->eh_notify->count.counter));
+
+    up(host->eh_notify);
+
+    while(1)
+    {
+        /*
+         * If we get a signal, it means we are supposed to go
+         * away and die.  This typically happens if the user is
+         * trying to unload a module.
+         */
+        SCSI_LOG_ERROR_RECOVERY(1,printk("Error handler sleeping\n"));
+        down_interruptible (&sem);
+
+        if (signal_pending(current) )
+            break;
+
+        SCSI_LOG_ERROR_RECOVERY(1,printk("Error handler waking up\n"));
+
+        host->eh_active = 1;
+
+        /*
+         * We have a host that is failing for some reason.  Figure out
+         * what we need to do to get it up and online again (if we can).
+         * If we fail, we end up taking the thing offline.
+         *
+         * A host template may supply its own strategy routine; otherwise
+         * the generic scsi_unjam_host() is used.  The result is stored in
+         * rtn but is not acted upon here.
+         */
+        if( host->hostt->eh_strategy_handler != NULL )
+        {
+            rtn = host->hostt->eh_strategy_handler(host);
+        }
+        else
+        {
+            rtn = scsi_unjam_host(host);
+        }
+
+        host->eh_active = 0;
+
+        /*
+         * Note - if the above fails completely, the action is to take
+         * individual devices offline and flush the queue of any
+         * outstanding requests that may have been pending.  When we
+         * restart, we restart any I/O to any other devices on the bus
+         * which are still online.
+         */
+        scsi_restart_operations(host);
+    }
+
+    SCSI_LOG_ERROR_RECOVERY(1,printk("Error handler exiting\n"));
+
+    /*
+     * Make sure that nobody tries to wake us up again.
+     */
+    host->eh_wait = NULL;
+
+    /*
+     * Knock this down too.  From this point on, the host is flying
+     * without a pilot.  If this is because the module is being unloaded,
+     * that's fine.  If the user sent a signal to this thing, we are
+     * potentially in real danger.
+     */
+    host->in_recovery = 0;
+    host->eh_active = 0;
+    host->ehandler = NULL;
+
+    /*
+     * If anyone is waiting for us to exit (i.e. someone trying to unload
+     * a driver), then wake up that process to let them know we are on
+     * the way out the door.  This may be overkill - I *think* that we
+     * could probably just unload the driver and send the signal, and when
+     * the error handling thread wakes up that it would just exit without
+     * needing to touch any memory associated with the driver itself.
+     */
+    if( host->eh_notify != NULL )
+        up(host->eh_notify);
+}
+
+/*
+ * Overrides for Emacs so that we follow Linus's tabbing style.
+ * Emacs will notice this stuff at the end of the file and automatically
+ * adjust the settings for this buffer only. This must remain at the end
+ * of the file.
+ * ---------------------------------------------------------------------------
+ * Local variables:
+ * c-indent-level: 4
+ * c-brace-imaginary-offset: 0
+ * c-brace-offset: -4
+ * c-argdecl-indent: 4
+ * c-label-offset: -4
+ * c-continued-statement-offset: 4
+ * c-continued-brace-offset: 0
+ * indent-tabs-mode: nil
+ * tab-width: 8
+ * End:
+ */
FUNET's LINUX-ADM group, linux-adm@nic.funet.fi
TCL-scripts by Sam Shen, slshen@lbl.gov