rlb@Purdue.ARPA (03/08/84)
From: Bob Brown <rlb@Purdue.ARPA> 4.2BSD kernel memerr() does not properly clear soft ecc errors if you have an 11/780 running the 64kb chip interleaved memory controller. The reason is that the 0x6c controller has (at least) four registers, the address/syndrome register existing twice - once for each side. If you get an error on the second half (the "D" register), memerr() doesn't clear it and the system hangs at high IPL. Below are some context diffs that might give you an idea how to fix it. New copies of vax/machdep.c and vax/mem.h can be had from me if the diffs seem too much to handle. Bob Brown (415)965-5407 ------------------------------------------------------------------------------ *** /sys/vax/machdep.c Wed Mar 7 13:01:02 1984 --- /user/ftp/pub/machdep.c Wed Mar 7 12:27:28 1984 *************** *** 1,4 ! /* $Header: /usr/src/sys/vax/RCS/machdep.c,v 1.1 84/03/07 12:56:10 rlb Exp $ */ /* machdep.c 6.2 83/10/02 */ #include "../machine/reg.h" --- 1,4 ----- ! /* $Header: machdep.c,v 1.1 83/11/22 15:55:13 root Rel $ */ /* machdep.c 6.2 83/10/02 */ #include "../machine/reg.h" *************** *** 26,31 #include "../h/msgbuf.h" #include "../h/quota.h" #include "../vax/frame.h" #include "../vax/cons.h" #include "../vax/cpu.h" --- 26,32 ----- #include "../h/msgbuf.h" #include "../h/quota.h" + #include "../vax/nexus.h" #include "../vax/frame.h" #include "../vax/cons.h" #include "../vax/cpu.h" *************** *** 428,434 switch (cpu) { #if VAX780 case VAX_780: ! M780_ENA(mcr); break; #endif #if VAX750 --- 429,437 ----- switch (cpu) { #if VAX780 case VAX_780: ! M780_ENA(mcr,2); ! if ((mcr->mc_reg[0]&0xff)==NEX_MEM64I) ! M780_ENA(mcr,3); break; #endif #if VAX750 *************** *** 463,471 switch (cpu) { #if VAX780 case VAX_780: ! if (M780_ERR(mcr)) { ! printf("mcr%d: soft ecc addr %x syn %x\n", ! m, M780_ADDR(mcr), M780_SYN(mcr)); #ifdef TRENDATA memlog(m, mcr); #endif --- 466,474 ----- switch (cpu) { #if VAX780 case VAX_780: ! if (M780_ERR(mcr,2)) { ! 
printf("mcr%dc: soft ecc addr %x syn %x\n", ! m, M780_ADDR(mcr,2), M780_SYN(mcr,2)); #ifdef TRENDATA memlog(m, mcr); #endif *************** *** 469,475 #ifdef TRENDATA memlog(m, mcr); #endif ! M780_INH(mcr); } break; #endif --- 472,478 ----- #ifdef TRENDATA memlog(m, mcr); #endif ! M780_INH(mcr,2); } if ((mcr->mc_reg[0]&0xff)==NEX_MEM64I && M780_ERR(mcr,3)) { printf("mcr%dd: soft ecc addr %x syn %x\n", *************** *** 471,476 #endif M780_INH(mcr); } break; #endif #if VAX750 --- 474,484 ----- #endif M780_INH(mcr,2); } + if ((mcr->mc_reg[0]&0xff)==NEX_MEM64I && M780_ERR(mcr,3)) { + printf("mcr%dd: soft ecc addr %x syn %x\n", + m, M780_ADDR(mcr,3), M780_SYN(mcr,3)); + M780_INH(mcr,3); + } break; #endif #if VAX750 *************** *** 543,549 #if VAX780 case VAX_780: for (i = 0; i < (sizeof (memlogtab) / sizeof (memlogtab[0])); i++) ! if ((u_char)(M780_SYN(mcr)) == memlogtab[i].m_syndrome) { printf ( "mcr%d: replace %s chip in %s bank of memory board %d (0-15)\n", m, --- 551,557 ----- #if VAX780 case VAX_780: for (i = 0; i < (sizeof (memlogtab) / sizeof (memlogtab[0])); i++) ! if ((u_char)(M780_SYN(mcr,2)) == memlogtab[i].m_syndrome) { printf ( "mcr%d: replace %s chip in %s bank of memory board %d (0-15)\n", m, *************** *** 548,555 "mcr%d: replace %s chip in %s bank of memory board %d (0-15)\n", m, memlogtab[i].m_chip, ! (M780_ADDR(mcr) & 0x8000) ? "upper" : "lower", ! (M780_ADDR(mcr) >> 16)); return; } printf ("mcr%d: multiple errors, not traceable\n", m); --- 556,563 ----- "mcr%d: replace %s chip in %s bank of memory board %d (0-15)\n", m, memlogtab[i].m_chip, ! (M780_ADDR(mcr,2) & 0x8000) ? "upper" : "lower", ! 
(M780_ADDR(mcr,2) >> 16)); return; } printf ("mcr%d: multiple errors, not traceable\n", m); *** /sys/vax/mem.h Wed Mar 7 13:01:54 1984 --- /user/ftp/pub/mem.h Wed Mar 7 12:27:28 1984 *************** *** 1,4 - /* $Header: /usr/src/sys/vax/RCS/mem.h,v 1.1 84/03/07 12:56:28 rlb Exp $ */ /* mem.h 6.1 83/07/29 */ /* --- 1,3 ----- /* mem.h 6.1 83/07/29 */ /* *************** *** 8,14 * per cpu, so we define macros here to mask that. */ struct mcr { ! int mc_reg[3]; }; /* --- 7,13 ----- * per cpu, so we define macros here to mask that. */ struct mcr { ! int mc_reg[4]; }; /* *************** *** 37,48 /* register; bit 14 there is an error bit which we also clear */ /* these bits are in the back of the ``red book'' (or in the VMS code) */ ! #define M780_INH(mcr) \ ! (((mcr)->mc_reg[2] = (M780_ICRD|M780_HIER|M780_ERLOG)), mtpr(SBIER, 0)) ! #define M780_ENA(mcr) \ ! (((mcr)->mc_reg[2] = (M780_HIER|M780_ERLOG)), mtpr(SBIER, 3<<14)) ! #define M780_ERR(mcr) \ ! ((mcr)->mc_reg[2] & (M780_ERLOG)) #define M780_SYN(mcr) ((mcr)->mc_reg[2] & 0xff) #define M780_ADDR(mcr) (((mcr)->mc_reg[2] >> 8) & 0xfffff) --- 36,47 ----- /* register; bit 14 there is an error bit which we also clear */ /* these bits are in the back of the ``red book'' (or in the VMS code) */ ! #define M780_INH(mcr,i) \ ! (((mcr)->mc_reg[i] = (M780_ICRD|M780_HIER|M780_ERLOG)), mtpr(SBIER, 0)) ! #define M780_ENA(mcr,i) \ ! (((mcr)->mc_reg[i] = (M780_HIER|M780_ERLOG)), mtpr(SBIER, 3<<14)) ! #define M780_ERR(mcr,i) \ ! ((mcr)->mc_reg[i] & (M780_ERLOG)) #define M780_SYN(mcr,i) ((mcr)->mc_reg[i] & 0xff) #define M780_ADDR(mcr,i) (((mcr)->mc_reg[i] >> 8) & 0xfffff) *************** *** 44,51 #define M780_ERR(mcr) \ ((mcr)->mc_reg[2] & (M780_ERLOG)) ! #define M780_SYN(mcr) ((mcr)->mc_reg[2] & 0xff) ! #define M780_ADDR(mcr) (((mcr)->mc_reg[2] >> 8) & 0xfffff) #endif #if VAX750 --- 43,50 ----- #define M780_ERR(mcr,i) \ ((mcr)->mc_reg[i] & (M780_ERLOG)) ! #define M780_SYN(mcr,i) ((mcr)->mc_reg[i] & 0xff) ! 
#define M780_ADDR(mcr,i) (((mcr)->mc_reg[i] >> 8) & 0xfffff) #endif #if VAX750
salkind%nyu@sri-unix.UUCP (03/09/84)
From: Lou Salkind <salkind@nyu> Although the fix handles the usual case (internally interleaved controllers), it doesn't handle some "pathological" configurations (for example, what happens if only the upper controller of the MS780E is enabled?). I also fixed the MS780E problem, but in a different and slightly more general way. Note that the 4.2BSD memory-handling code makes the following assumption: there is a one-to-one correspondence between CPU type and memory controller type. (For example, the case tests in the memory controller routines are by CPU type.) To my way of thinking, this is not a valid assumption. My fix involves setting the memory controller type in autoconf.c, and then changing the memory code to test by controller type, not CPU type. The code works fine on our 780s. Lou