As des' example shows us, DMA overruns could happen in a situation where retries will never succeed. Add a stop-gap measure that will at least eventually time out the operation instead of retrying it indefinitely.

MFC after: 1 month
2001-07-09 20:46:45 +00:00 · 2001-07-09 20:46:45 +00:00 · aff6414a8d
commit aff6414a8d
parent 8e095496f1
3 changed files with 75 additions and 18 deletions
--- a/sys/dev/fdc/fdc.c
+++ b/sys/dev/fdc/fdc.c
@ -97,6 +97,13 @@
 #define FD_FAILED -1
 #define FD_NOT_VALID -2
 #define FDC_ERRMAX	100	/* do not log more */
+/*
+ * Stop retrying after this many DMA overruns.  Since each retry takes
+ * one revolution, with 300 rpm., 25 retries take approximately 10
+ * seconds which the read attempt will block in case the DMA overrun
+ * is persistent.
+ */
+#define FDC_DMAOV_MAX	25

 #define NUMTYPES 17
 #define NUMDENS  (NUMTYPES - 7)
@ -1451,6 +1458,18 @@ Fdopen(dev_t dev, int flags, int mode, struct proc *p)
 	}
 	fd->ft = fd_types + type - 1;
 	fd->flags |= FD_OPEN;
+	/*
+	 * Clearing the DMA overrun counter at open time is a bit messy.
+	 * Since we're only managing one counter per controller, opening
+	 * the second drive could mess it up.  Anyway, if the DMA overrun
+	 * condition is really persistent, it will eventually time out
+	 * still.  OTOH, clearing it here will ensure we'll at least start
+	 * trying again after a previous (maybe even long ago) failure.
+	 * Also, this is merely a stop-gap measure only that should not
+	 * happen during normal operation, so we can tolerate it to be a
+	 * bit sloppy about this.
+	 */
+	fdc->dma_overruns = 0;

 	return 0;
 }
@ -2033,16 +2052,23 @@ fdstate(fdc_p fdc)
                        if ((fdc->status[0] & NE7_ST0_IC) == NE7_ST0_IC_AT
 			    && fdc->status[1] & NE7_ST1_OR) {
                                /*
-				 * DMA overrun. Someone hogged the bus
-				 * and didn't release it in time for the
-				 * next FDC transfer.
-				 * Just restart it, don't increment retry
-				 * count. (vak)
-                                 */
-                                fdc->state = SEEKCOMPLETE;
-                                return (1);
+				 * DMA overrun. Someone hogged the bus and
+				 * didn't release it in time for the next
+				 * FDC transfer.
+				 *
+				 * We normally restart this without bumping
+				 * the retry counter.  However, in case
+				 * something is seriously messed up (like
+				 * broken hardware), we rather limit the
+				 * number of retries so the IO operation
+				 * doesn't block indefinately.
+				 */
+				if (fdc->dma_overruns++ < FDC_DMAOV_MAX) {
+					fdc->state = SEEKCOMPLETE;
+					return (1);
+				} /* else fall through */
                        }
-			else if((fdc->status[0] & NE7_ST0_IC) == NE7_ST0_IC_IV
+			if((fdc->status[0] & NE7_ST0_IC) == NE7_ST0_IC_IV
 				&& fdc->retry < 6)
 				fdc->retry = 6;	/* force a reset */
 			else if((fdc->status[0] & NE7_ST0_IC) == NE7_ST0_IC_AT
@ -2060,6 +2086,8 @@ fdstate(fdc_p fdc)
 			idp->sec = fdc->status[5];
 			idp->secshift = fdc->status[6];
 		}
+		/* Operation successful, retry DMA overruns again next time. */
+		fdc->dma_overruns = 0;
 		fd->skip += fdblk;
 		if (!rdsectid && !format && fd->skip < bp->bio_bcount) {
 			/* set up next transfer */
--- a/sys/isa/fd.c
+++ b/sys/isa/fd.c
@ -97,6 +97,13 @@
 #define FD_FAILED -1
 #define FD_NOT_VALID -2
 #define FDC_ERRMAX	100	/* do not log more */
+/*
+ * Stop retrying after this many DMA overruns.  Since each retry takes
+ * one revolution, with 300 rpm., 25 retries take approximately 10
+ * seconds which the read attempt will block in case the DMA overrun
+ * is persistent.
+ */
+#define FDC_DMAOV_MAX	25

 #define NUMTYPES 17
 #define NUMDENS  (NUMTYPES - 7)
@ -1451,6 +1458,18 @@ Fdopen(dev_t dev, int flags, int mode, struct proc *p)
 	}
 	fd->ft = fd_types + type - 1;
 	fd->flags |= FD_OPEN;
+	/*
+	 * Clearing the DMA overrun counter at open time is a bit messy.
+	 * Since we're only managing one counter per controller, opening
+	 * the second drive could mess it up.  Anyway, if the DMA overrun
+	 * condition is really persistent, it will eventually time out
+	 * still.  OTOH, clearing it here will ensure we'll at least start
+	 * trying again after a previous (maybe even long ago) failure.
+	 * Also, this is merely a stop-gap measure only that should not
+	 * happen during normal operation, so we can tolerate it to be a
+	 * bit sloppy about this.
+	 */
+	fdc->dma_overruns = 0;

 	return 0;
 }
@ -2033,16 +2052,23 @@ fdstate(fdc_p fdc)
                        if ((fdc->status[0] & NE7_ST0_IC) == NE7_ST0_IC_AT
 			    && fdc->status[1] & NE7_ST1_OR) {
                                /*
-				 * DMA overrun. Someone hogged the bus
-				 * and didn't release it in time for the
-				 * next FDC transfer.
-				 * Just restart it, don't increment retry
-				 * count. (vak)
-                                 */
-                                fdc->state = SEEKCOMPLETE;
-                                return (1);
+				 * DMA overrun. Someone hogged the bus and
+				 * didn't release it in time for the next
+				 * FDC transfer.
+				 *
+				 * We normally restart this without bumping
+				 * the retry counter.  However, in case
+				 * something is seriously messed up (like
+				 * broken hardware), we rather limit the
+				 * number of retries so the IO operation
+				 * doesn't block indefinately.
+				 */
+				if (fdc->dma_overruns++ < FDC_DMAOV_MAX) {
+					fdc->state = SEEKCOMPLETE;
+					return (1);
+				} /* else fall through */
                        }
-			else if((fdc->status[0] & NE7_ST0_IC) == NE7_ST0_IC_IV
+			if((fdc->status[0] & NE7_ST0_IC) == NE7_ST0_IC_IV
 				&& fdc->retry < 6)
 				fdc->retry = 6;	/* force a reset */
 			else if((fdc->status[0] & NE7_ST0_IC) == NE7_ST0_IC_AT
@ -2060,6 +2086,8 @@ fdstate(fdc_p fdc)
 			idp->sec = fdc->status[5];
 			idp->secshift = fdc->status[6];
 		}
+		/* Operation successful, retry DMA overruns again next time. */
+		fdc->dma_overruns = 0;
 		fd->skip += fdblk;
 		if (!rdsectid && !format && fd->skip < bp->bio_bcount) {
 			/* set up next transfer */
--- a/sys/isa/fdc.h
+++ b/sys/isa/fdc.h
@ -80,6 +80,7 @@ struct fdc_data
 	u_int	status[7];	/* copy of the registers */
 	enum	fdc_type fdct;	/* chip version of FDC */
 	int	fdc_errs;	/* number of logged errors */
+	int	dma_overruns;	/* number of DMA overruns */
 	struct	bio_queue_head head;
 	struct	bio *bp;	/* active buffer */
 	struct	resource *res_ioport, *res_ctl, *res_irq, *res_drq;