paul@msdc.UUCP (Paul Manno) (09/18/85)
We were seeing a number of hard "header not found" errors on our RL02 disks during moderate I/O on our VAXen. After some digging, we found that the RL02 driver can't recover from soft errors (data late, etc.) that occur in the later tracks of a multi-track transfer. The driver correctly requeues the request but does not re-seek the drive to the correct position, so the retry is issued with the drive at the wrong position. This behavior is quite obvious if you print soft errors: the first "data late" becomes an uncorrectable "header not found". Below is a context diff between our BRL Release 3 version of the RL02 driver and our fixed version. To my knowledge, this error exists in the 4.1BSD, 4.2BSD and BRL Release 3 versions of VAX Unix. It never showed up under 4.1BSD because the 1KB file system never performed a multi-track I/O on the device. This fix has solved our RL02 troubles on all of our VAX systems, including 750s and 785s. We still see an occasional RL02 soft error, but now it is always recovered. If you have further interest in this change, please contact me by mail. Thanks. Paul Manno Medical Systems Development Corp. UUCP: ..{akgua, gatech, ihnp4, mcnc}!msdc!paul *** rl.c Tue Sep 17 18:10:19 1985 --- newrl.c Tue Sep 17 18:11:12 1985 *************** *** 169,175 rladdr->rlda.getstat = RL_RESET; /* SHOULD BE REPEATED? */ rladdr->rlcs = (ui->ui_slave << 8) | RL_GETSTAT; /* Reset DE bit */ rlwait(rladdr); ! /* determine disk posistion */ rladdr->rlcs = (ui->ui_slave << 8) | RL_RHDR; rlwait(rladdr); /* save disk drive posistion */ --- 169,175 ----- rladdr->rlda.getstat = RL_RESET; /* SHOULD BE REPEATED? */ rladdr->rlcs = (ui->ui_slave << 8) | RL_GETSTAT; /* Reset DE bit */ rlwait(rladdr); ! /* determine disk position */ rladdr->rlcs = (ui->ui_slave << 8) | RL_RHDR; rlwait(rladdr); /* save disk drive position */ *************** *** 172,178 /* determine disk posistion */ rladdr->rlcs = (ui->ui_slave << 8) | RL_RHDR; rlwait(rladdr); ! 
/* save disk drive posistion */ rl_stat[ui->ui_ctlr].rl_cyl[ui->ui_slave] = (rladdr->rlmp.readhdr & 0177700) >> 6; rl_stat[ui->ui_ctlr].rl_dn = -1; --- 172,178 ----- /* determine disk position */ rladdr->rlcs = (ui->ui_slave << 8) | RL_RHDR; rlwait(rladdr); ! /* save disk drive position */ rl_stat[ui->ui_ctlr].rl_cyl[ui->ui_slave] = (rladdr->rlmp.readhdr & 0177700) >> 6; rl_stat[ui->ui_ctlr].rl_dn = -1; *************** *** 258,270 rl_softc[um->um_ctlr].rl_softas |= 1<<ui->ui_slave; return; } ! /* ! * If we have already positioned this drive, ! * then just put it on the ready queue. ! */ ! if (dp->b_active) ! goto done; ! dp->b_active = 1; /* positioning drive */ rladdr = (struct rldevice *)um->um_addr; /* --- 258,264 ----- rl_softc[um->um_ctlr].rl_softas |= 1<<ui->ui_slave; return; } ! rladdr = (struct rldevice *)um->um_addr; if (dp->b_active == 0) dp->b_active = 1; /* positioning drive */ *************** *** 266,271 goto done; dp->b_active = 1; /* positioning drive */ rladdr = (struct rldevice *)um->um_addr; /* * Figure out where this transfer is going to --- 260,267 ----- } rladdr = (struct rldevice *)um->um_addr; + if (dp->b_active == 0) + dp->b_active = 1; /* positioning drive */ /* * Figure out where this transfer is going to *************** *** 280,285 diff = (rl_stat[um->um_ctlr].rl_cyl[ui->ui_slave] >> 1) - bp->b_cylin; if ( diff == 0 && (rl_stat[um->um_ctlr].rl_cyl[ui->ui_slave] & 1) == hd) goto done; /* on cylinder and head */ /* * Not at correct position. */ --- 276,282 ----- diff = (rl_stat[um->um_ctlr].rl_cyl[ui->ui_slave] >> 1) - bp->b_cylin; if ( diff == 0 && (rl_stat[um->um_ctlr].rl_cyl[ui->ui_slave] & 1) == hd) goto done; /* on cylinder and head */ + /* * Not at correct position. */ *************** *** 428,434 */ printf("rl%d: write protected\n", dkunit(bp)); bp->b_flags |= B_ERROR; ! } else if (++um->um_tab.b_errcnt > 10) { /* * After 10 retries give up. 
*/ --- 425,431 ----- */ printf("rl%d: write protected\n", dkunit(bp)); bp->b_flags |= B_ERROR; ! } else if (++um->um_tab.b_errcnt > 20) { /* * After 20 retries give up. */ *************** *** 430,436 bp->b_flags |= B_ERROR; } else if (++um->um_tab.b_errcnt > 10) { /* ! * After 10 retries give up. */ harderr(bp, "rl"); printf("cs=%b mp=%b\n", err, RLCS_BITS, --- 427,433 ----- bp->b_flags |= B_ERROR; } else if (++um->um_tab.b_errcnt > 20) { /* ! * After 20 retries give up. */ harderr(bp, "rl"); printf("cs=%b mp=%b\n", err, RLCS_BITS, *************** *** 436,442 printf("cs=%b mp=%b\n", err, RLCS_BITS, status, RLER_BITS); bp->b_flags |= B_ERROR; ! } else um->um_tab.b_active = 0; /* force retry */ /* determine disk position */ rladdr->rlcs = (ui->ui_slave << 8) | RL_RHDR; --- 433,448 ----- printf("cs=%b mp=%b\n", err, RLCS_BITS, status, RLER_BITS); bp->b_flags |= B_ERROR; ! } else { ! /* ! * Talk about it on retry 1, 6, 11, and 16 ! */ ! if ((um->um_tab.b_errcnt % 5) == 1) { ! printf("rl%d%c: soft error sn%d ",dkunit(bp), ! 'a'+(minor(bp->b_dev)&07), bp->b_blkno); ! printf("cs=%b mp=%b - retry %d\n",err,RLCS_BITS, ! status, RLER_BITS, um->um_tab.b_errcnt); ! } um->um_tab.b_active = 0; /* force retry */ } /* determine disk position */ *************** *** 438,443 bp->b_flags |= B_ERROR; } else um->um_tab.b_active = 0; /* force retry */ /* determine disk position */ rladdr->rlcs = (ui->ui_slave << 8) | RL_RHDR; rlwait(rladdr); --- 444,450 ----- status, RLER_BITS, um->um_tab.b_errcnt); } um->um_tab.b_active = 0; /* force retry */ + } /* determine disk position */ rladdr->rlcs = (ui->ui_slave << 8) | RL_RHDR; rlwait(rladdr); *************** *** 603,609 for (unit = 0; unit < NRL; unit++) { rladdr->rlcs = (unit << 8) | RL_GETSTAT; rlwait(rladdr); ! /* Determine disk posistion */ rladdr->rlcs = (unit << 8) | RL_RHDR; rlwait(rladdr); /* save disk drive posistion */ --- 610,616 ----- for (unit = 0; unit < NRL; unit++) { rladdr->rlcs = (unit << 8) | RL_GETSTAT; rlwait(rladdr); ! 
/* Determine disk position */ rladdr->rlcs = (unit << 8) | RL_RHDR; rlwait(rladdr); /* save disk drive position */ *************** *** 606,612 /* Determine disk posistion */ rladdr->rlcs = (unit << 8) | RL_RHDR; rlwait(rladdr); ! /* save disk drive posistion */ st->rl_cyl[unit] = (rladdr->rlmp.readhdr & 0177700) >> 6; if ((ui = rldinfo[unit]) == 0) --- 613,619 ----- /* Determine disk position */ rladdr->rlcs = (unit << 8) | RL_RHDR; rlwait(rladdr); ! /* save disk drive position */ st->rl_cyl[unit] = (rladdr->rlmp.readhdr & 0177700) >> 6; if ((ui = rldinfo[unit]) == 0)