[net.bugs.4bsd] 4.2 VAX RL02 driver bug found

paul@msdc.UUCP (Paul Manno) (09/18/85)

We were seeing a number of hard "header not found" errors on our RL02
disks during moderate I/O on our VAXen.  After some digging, we found
that the RL02 driver can't recover from soft errors (data late, etc.)
in the later tracks of a multi-track transfer.  The driver correctly
requeues the request but does not re-seek the drive to the correct
position.  This behavior is quite obvious if you print soft errors:
the first "data late" becomes an uncorrectable "header not found".

Below is a context diff between our BRL Release 3 version of the RL02
driver and our fixed version.  To my knowledge, this error exists in the
4.1BSD, 4.2BSD and BRL Release 3 versions of VAX Unix.  It never
showed up under 4.1BSD because the 1KB file system never performed a
multi-track I/O on the device.  This fix has solved our RL02 troubles on
all of our VAX systems, including 750s and 785s.  We still see an
occasional RL02 soft error, but now it's always recovered.

If you have further interest in this change, please contact me by mail.
Thanks.

	Paul Manno
	Medical Systems Development Corp.
	UUCP: ..{akgua, gatech, ihnp4, mcnc}!msdc!paul

*** rl.c	Tue Sep 17 18:10:19 1985
--- newrl.c	Tue Sep 17 18:11:12 1985
***************
*** 169,175
  	rladdr->rlda.getstat = RL_RESET;	/* SHOULD BE REPEATED? */
  	rladdr->rlcs = (ui->ui_slave << 8) | RL_GETSTAT; /* Reset DE bit */
  	rlwait(rladdr);
! 	/* determine disk posistion */
  	rladdr->rlcs = (ui->ui_slave << 8) | RL_RHDR;
  	rlwait(rladdr);
  	/* save disk drive posistion */

--- 169,175 -----
  	rladdr->rlda.getstat = RL_RESET;	/* SHOULD BE REPEATED? */
  	rladdr->rlcs = (ui->ui_slave << 8) | RL_GETSTAT; /* Reset DE bit */
  	rlwait(rladdr);
! 	/* determine disk position */
  	rladdr->rlcs = (ui->ui_slave << 8) | RL_RHDR;
  	rlwait(rladdr);
  	/* save disk drive position */
***************
*** 172,178
  	/* determine disk posistion */
  	rladdr->rlcs = (ui->ui_slave << 8) | RL_RHDR;
  	rlwait(rladdr);
! 	/* save disk drive posistion */
  	rl_stat[ui->ui_ctlr].rl_cyl[ui->ui_slave] =
  	     (rladdr->rlmp.readhdr & 0177700) >> 6;
  	rl_stat[ui->ui_ctlr].rl_dn = -1;

--- 172,178 -----
  	/* determine disk position */
  	rladdr->rlcs = (ui->ui_slave << 8) | RL_RHDR;
  	rlwait(rladdr);
! 	/* save disk drive position */
  	rl_stat[ui->ui_ctlr].rl_cyl[ui->ui_slave] =
  	     (rladdr->rlmp.readhdr & 0177700) >> 6;
  	rl_stat[ui->ui_ctlr].rl_dn = -1;
***************
*** 258,270
  		rl_softc[um->um_ctlr].rl_softas |=  1<<ui->ui_slave;
  		return;
  	}
! 	/*
! 	 * If we have already positioned this drive,
! 	 * then just put it on the ready queue.
! 	 */
! 	if (dp->b_active)
! 		goto done;
! 	dp->b_active = 1;	/* positioning drive */
  	rladdr = (struct rldevice *)um->um_addr;
  
  	/*

--- 258,264 -----
  		rl_softc[um->um_ctlr].rl_softas |=  1<<ui->ui_slave;
  		return;
  	}
! 
  	rladdr = (struct rldevice *)um->um_addr;
  	if (dp->b_active == 0)
  		dp->b_active = 1;	/* positioning drive */
***************
*** 266,271
  		goto done;
  	dp->b_active = 1;	/* positioning drive */
  	rladdr = (struct rldevice *)um->um_addr;
  
  	/*
  	 * Figure out where this transfer is going to

--- 260,267 -----
  	}
  
  	rladdr = (struct rldevice *)um->um_addr;
+ 	if (dp->b_active == 0)
+ 		dp->b_active = 1;	/* positioning drive */
  
  	/*
  	 * Figure out where this transfer is going to
***************
*** 280,285
  	diff = (rl_stat[um->um_ctlr].rl_cyl[ui->ui_slave] >> 1) - bp->b_cylin;
  	if ( diff == 0 && (rl_stat[um->um_ctlr].rl_cyl[ui->ui_slave] & 1) == hd)
  		goto done;		/* on cylinder and head */
  	/*
  	 * Not at correct position.
  	 */

--- 276,282 -----
  	diff = (rl_stat[um->um_ctlr].rl_cyl[ui->ui_slave] >> 1) - bp->b_cylin;
  	if ( diff == 0 && (rl_stat[um->um_ctlr].rl_cyl[ui->ui_slave] & 1) == hd)
  		goto done;		/* on cylinder and head */
+ 
  	/*
  	 * Not at correct position.
  	 */
***************
*** 428,434
  			 */
  			printf("rl%d: write protected\n", dkunit(bp));
  			bp->b_flags |= B_ERROR;
! 		} else if (++um->um_tab.b_errcnt > 10) {
  			/*
  			 * After 10 retries give up.
  			 */

--- 425,431 -----
  			 */
  			printf("rl%d: write protected\n", dkunit(bp));
  			bp->b_flags |= B_ERROR;
! 		} else if (++um->um_tab.b_errcnt > 20) {
  			/*
  			 * After 20 retries give up.
  			 */
***************
*** 430,436
  			bp->b_flags |= B_ERROR;
  		} else if (++um->um_tab.b_errcnt > 10) {
  			/*
! 			 * After 10 retries give up.
  			 */
  			harderr(bp, "rl");
  			printf("cs=%b mp=%b\n", err, RLCS_BITS,

--- 427,433 -----
  			bp->b_flags |= B_ERROR;
  		} else if (++um->um_tab.b_errcnt > 20) {
  			/*
! 			 * After 20 retries give up.
  			 */
  			harderr(bp, "rl");
  			printf("cs=%b mp=%b\n", err, RLCS_BITS,
***************
*** 436,442
  			printf("cs=%b mp=%b\n", err, RLCS_BITS,
  			    status, RLER_BITS);
  			bp->b_flags |= B_ERROR;
! 		} else
  			um->um_tab.b_active = 0;	 /* force retry */
  		/* determine disk position */
  		rladdr->rlcs = (ui->ui_slave << 8) | RL_RHDR;

--- 433,448 -----
  			printf("cs=%b mp=%b\n", err, RLCS_BITS,
  			    status, RLER_BITS);
  			bp->b_flags |= B_ERROR;
! 		} else {
! 			/*
! 			 * Talk about it on retry 1, 6, 11, and 16
! 			 */
! 			if ((um->um_tab.b_errcnt % 5) == 1) {
! 				printf("rl%d%c: soft error sn%d ",dkunit(bp),
! 				'a'+(minor(bp->b_dev)&07), bp->b_blkno);
! 				printf("cs=%b mp=%b - retry %d\n",err,RLCS_BITS,
! 				    status, RLER_BITS, um->um_tab.b_errcnt);
! 			}
  			um->um_tab.b_active = 0;	 /* force retry */
  		}
  		/* determine disk position */
***************
*** 438,443
  			bp->b_flags |= B_ERROR;
  		} else
  			um->um_tab.b_active = 0;	 /* force retry */
  		/* determine disk position */
  		rladdr->rlcs = (ui->ui_slave << 8) | RL_RHDR;
  		rlwait(rladdr);

--- 444,450 -----
  				    status, RLER_BITS, um->um_tab.b_errcnt);
  			}
  			um->um_tab.b_active = 0;	 /* force retry */
+ 		}
  		/* determine disk position */
  		rladdr->rlcs = (ui->ui_slave << 8) | RL_RHDR;
  		rlwait(rladdr);
***************
*** 603,609
  		for (unit = 0; unit < NRL; unit++) {
  			rladdr->rlcs = (unit << 8) | RL_GETSTAT;
  			rlwait(rladdr);
! 			/* Determine disk posistion */
  			rladdr->rlcs = (unit << 8) | RL_RHDR;
  			rlwait(rladdr);
  			/* save disk drive posistion */

--- 610,616 -----
  		for (unit = 0; unit < NRL; unit++) {
  			rladdr->rlcs = (unit << 8) | RL_GETSTAT;
  			rlwait(rladdr);
! 			/* Determine disk position */
  			rladdr->rlcs = (unit << 8) | RL_RHDR;
  			rlwait(rladdr);
  			/* save disk drive position */
***************
*** 606,612
  			/* Determine disk posistion */
  			rladdr->rlcs = (unit << 8) | RL_RHDR;
  			rlwait(rladdr);
! 			/* save disk drive posistion */
  			st->rl_cyl[unit] =
  				(rladdr->rlmp.readhdr & 0177700) >> 6;
  			if ((ui = rldinfo[unit]) == 0)

--- 613,619 -----
  			/* Determine disk position */
  			rladdr->rlcs = (unit << 8) | RL_RHDR;
  			rlwait(rladdr);
! 			/* save disk drive position */
  			st->rl_cyl[unit] =
  				(rladdr->rlmp.readhdr & 0177700) >> 6;
  			if ((ui = rldinfo[unit]) == 0)