rlb@Purdue.ARPA (03/08/84)
From: Bob Brown <rlb@Purdue.ARPA>
The 4.2BSD kernel memerr() does not properly clear soft ECC errors if you
have an 11/780 running the 64kb chip interleaved memory controller. The
reason is that the 0x6c controller has (at least) four registers: the
address/syndrome register exists twice, once for each side. If you
get an error on the second half (the "D" register), memerr() doesn't
clear it and the system hangs at high IPL.
Below are some context diffs that might give you an idea how to fix it.
New copies of vax/machdep.c and vax/mem.h can be had from me if the diffs
seem too much to handle.
Bob Brown
(415)965-5407
------------------------------------------------------------------------------
*** /sys/vax/machdep.c Wed Mar 7 13:01:02 1984
--- /user/ftp/pub/machdep.c Wed Mar 7 12:27:28 1984
***************
*** 1,4
! /* $Header: /usr/src/sys/vax/RCS/machdep.c,v 1.1 84/03/07 12:56:10 rlb Exp $ */
/* machdep.c 6.2 83/10/02 */
#include "../machine/reg.h"
--- 1,4 -----
! /* $Header: machdep.c,v 1.1 83/11/22 15:55:13 root Rel $ */
/* machdep.c 6.2 83/10/02 */
#include "../machine/reg.h"
***************
*** 26,31
#include "../h/msgbuf.h"
#include "../h/quota.h"
#include "../vax/frame.h"
#include "../vax/cons.h"
#include "../vax/cpu.h"
--- 26,32 -----
#include "../h/msgbuf.h"
#include "../h/quota.h"
+ #include "../vax/nexus.h"
#include "../vax/frame.h"
#include "../vax/cons.h"
#include "../vax/cpu.h"
***************
*** 428,434
switch (cpu) {
#if VAX780
case VAX_780:
! M780_ENA(mcr);
break;
#endif
#if VAX750
--- 429,437 -----
switch (cpu) {
#if VAX780
case VAX_780:
! M780_ENA(mcr,2);
! if ((mcr->mc_reg[0]&0xff)==NEX_MEM64I)
! M780_ENA(mcr,3);
break;
#endif
#if VAX750
***************
*** 463,471
switch (cpu) {
#if VAX780
case VAX_780:
! if (M780_ERR(mcr)) {
! printf("mcr%d: soft ecc addr %x syn %x\n",
! m, M780_ADDR(mcr), M780_SYN(mcr));
#ifdef TRENDATA
memlog(m, mcr);
#endif
--- 466,474 -----
switch (cpu) {
#if VAX780
case VAX_780:
! if (M780_ERR(mcr,2)) {
! printf("mcr%dc: soft ecc addr %x syn %x\n",
! m, M780_ADDR(mcr,2), M780_SYN(mcr,2));
#ifdef TRENDATA
memlog(m, mcr);
#endif
***************
*** 469,475
#ifdef TRENDATA
memlog(m, mcr);
#endif
! M780_INH(mcr);
}
break;
#endif
--- 472,478 -----
#ifdef TRENDATA
memlog(m, mcr);
#endif
! M780_INH(mcr,2);
}
if ((mcr->mc_reg[0]&0xff)==NEX_MEM64I && M780_ERR(mcr,3)) {
printf("mcr%dd: soft ecc addr %x syn %x\n",
***************
*** 471,476
#endif
M780_INH(mcr);
}
break;
#endif
#if VAX750
--- 474,484 -----
#endif
M780_INH(mcr,2);
}
+ if ((mcr->mc_reg[0]&0xff)==NEX_MEM64I && M780_ERR(mcr,3)) {
+ printf("mcr%dd: soft ecc addr %x syn %x\n",
+ m, M780_ADDR(mcr,3), M780_SYN(mcr,3));
+ M780_INH(mcr,3);
+ }
break;
#endif
#if VAX750
***************
*** 543,549
#if VAX780
case VAX_780:
for (i = 0; i < (sizeof (memlogtab) / sizeof (memlogtab[0])); i++)
! if ((u_char)(M780_SYN(mcr)) == memlogtab[i].m_syndrome) {
printf (
"mcr%d: replace %s chip in %s bank of memory board %d (0-15)\n",
m,
--- 551,557 -----
#if VAX780
case VAX_780:
for (i = 0; i < (sizeof (memlogtab) / sizeof (memlogtab[0])); i++)
! if ((u_char)(M780_SYN(mcr,2)) == memlogtab[i].m_syndrome) {
printf (
"mcr%d: replace %s chip in %s bank of memory board %d (0-15)\n",
m,
***************
*** 548,555
"mcr%d: replace %s chip in %s bank of memory board %d (0-15)\n",
m,
memlogtab[i].m_chip,
! (M780_ADDR(mcr) & 0x8000) ? "upper" : "lower",
! (M780_ADDR(mcr) >> 16));
return;
}
printf ("mcr%d: multiple errors, not traceable\n", m);
--- 556,563 -----
"mcr%d: replace %s chip in %s bank of memory board %d (0-15)\n",
m,
memlogtab[i].m_chip,
! (M780_ADDR(mcr,2) & 0x8000) ? "upper" : "lower",
! (M780_ADDR(mcr,2) >> 16));
return;
}
printf ("mcr%d: multiple errors, not traceable\n", m);
*** /sys/vax/mem.h Wed Mar 7 13:01:54 1984
--- /user/ftp/pub/mem.h Wed Mar 7 12:27:28 1984
***************
*** 1,4
- /* $Header: /usr/src/sys/vax/RCS/mem.h,v 1.1 84/03/07 12:56:28 rlb Exp $ */
/* mem.h 6.1 83/07/29 */
/*
--- 1,3 -----
/* mem.h 6.1 83/07/29 */
/*
***************
*** 8,14
* per cpu, so we define macros here to mask that.
*/
struct mcr {
! int mc_reg[3];
};
/*
--- 7,13 -----
* per cpu, so we define macros here to mask that.
*/
struct mcr {
! int mc_reg[4];
};
/*
***************
*** 37,48
/* register; bit 14 there is an error bit which we also clear */
/* these bits are in the back of the ``red book'' (or in the VMS code) */
! #define M780_INH(mcr) \
! (((mcr)->mc_reg[2] = (M780_ICRD|M780_HIER|M780_ERLOG)), mtpr(SBIER, 0))
! #define M780_ENA(mcr) \
! (((mcr)->mc_reg[2] = (M780_HIER|M780_ERLOG)), mtpr(SBIER, 3<<14))
! #define M780_ERR(mcr) \
! ((mcr)->mc_reg[2] & (M780_ERLOG))
#define M780_SYN(mcr) ((mcr)->mc_reg[2] & 0xff)
#define M780_ADDR(mcr) (((mcr)->mc_reg[2] >> 8) & 0xfffff)
--- 36,47 -----
/* register; bit 14 there is an error bit which we also clear */
/* these bits are in the back of the ``red book'' (or in the VMS code) */
! #define M780_INH(mcr,i) \
! (((mcr)->mc_reg[i] = (M780_ICRD|M780_HIER|M780_ERLOG)), mtpr(SBIER, 0))
! #define M780_ENA(mcr,i) \
! (((mcr)->mc_reg[i] = (M780_HIER|M780_ERLOG)), mtpr(SBIER, 3<<14))
! #define M780_ERR(mcr,i) \
! ((mcr)->mc_reg[i] & (M780_ERLOG))
#define M780_SYN(mcr,i) ((mcr)->mc_reg[i] & 0xff)
#define M780_ADDR(mcr,i) (((mcr)->mc_reg[i] >> 8) & 0xfffff)
***************
*** 44,51
#define M780_ERR(mcr) \
((mcr)->mc_reg[2] & (M780_ERLOG))
! #define M780_SYN(mcr) ((mcr)->mc_reg[2] & 0xff)
! #define M780_ADDR(mcr) (((mcr)->mc_reg[2] >> 8) & 0xfffff)
#endif
#if VAX750
--- 43,50 -----
#define M780_ERR(mcr,i) \
((mcr)->mc_reg[i] & (M780_ERLOG))
! #define M780_SYN(mcr,i) ((mcr)->mc_reg[i] & 0xff)
! #define M780_ADDR(mcr,i) (((mcr)->mc_reg[i] >> 8) & 0xfffff)
#endif
#if VAX750
salkind%nyu@sri-unix.UUCP (03/09/84)
From: Lou Salkind <salkind@nyu>
Although the fix handles the usual case (internally interleaved
controllers), it doesn't handle some "pathological" configurations (for
example, what happens if only the upper controller of the MS780E is
enabled?). I also fixed the MS780E problem, but in a different and
slightly more general way.
Note that the 4.2BSD memory handling code makes the following assumption:
there is a 1-1 correspondence between CPU type and memory controller type.
(Note, for example, that the case tests in the memory controller routines
are by CPU type.) To my way of thinking, this is not a valid assumption.
My fix involves setting the memory controller type in autoconf.c, and then
changing the memory code to test by controller type, not CPU type. The
code works fine on our 780's.
Lou