[net.micro.pc] Help for Parity Errors

mccullou@ittral.UUCP (Clifford McCullough) (01/01/86)

I too have parity check problems.  They always seemed to occur after
several hours of unsaved work.  I finally broke down and wrote a program
to check for these latent errors.  It follows.  The preamble should explain
what the program tries to do.

My experience with this program has been somewhat successful.  The third or
fourth program iteration finally came up with an error after 12 hours of
looking.  Unfortunately on subsequent tries, I have not been able to detect
any errors (or even the same error) after 48 hours of continuous looking.
I have not yet changed the chip indicated in the first error report.  I'm
hoping to think of some different way to reliably catch the errors.

I believe the program will work with DOS 1.1 on the IBM PC (i.e. I tried to
not make any function calls unsupported by DOS 1.1).  I have always used it
on DOS 2.1.  If anyone has suggestions or comments please send them to me.

-Cliff McCullough
decvax!ittatc!ittral!mccullou

-----------------------------------------------------------------------------
page	60,132

title	RAMHAM	Hammer the PE out of RAM by Clifford A McCullough
subttl	Version 1.00 December 15, 1985

comment	*	RamHammer is intended to discover latent parity errors.
	This is done by cyclically writing a pattern to memory and then checking
	that the pattern remains.  The pattern is checked several times before
	a new pattern is written.  This requires the RAM to maintain the
	pattern for a period of time.  The pattern progresses through each
	block so that each byte is tested with all patterns and each bit is
	tested both high and low.

	RamHammer checks that it resides completely within the lowest 64K
	of memory.  For this reason the system should be booted with a minimum
	of resident user programs (if any).  RamHammer will not check this
	first block of memory.  Some other method must be used to check this
	block if it is suspected.

	When RamHammer detects a memory error, it sends to the screen a
	message including the time, the memory address, and the bit that
	was found to be in error.  The ports and data bytes it uses to disable
	the NMI and parity reporting mechanism are specific to the IBM PC and
	may not be correct for "compatible" machines.
 *

.radix	16				;unsuffixed numbers from here on are hexadecimal

;***** Equates *********************************************************
;Test limits, I/O port numbers and bit masks, and ASCII control codes
;used by the rest of the program.  Because of the .radix 16 above, every
;value in this table is hexadecimal.

check_no	equ	0FF		;number of check cycles per written pattern
begin_blk	equ	004		;beginning block to be checked
end_blk		equ	040		;last block to be checked
nmi_port	equ	0A0		;NMI on / off register (I/O port)
nmi_on		equ	080		;byte written to nmi_port to turn NMI on
nmi_off		equ	000		;byte written to nmi_port to turn NMI off
port_b		equ	061		;hardware port B (written to clear latches)
port_c		equ	062		;hardware port C (read for parity status)
par_err		equ	0C0		;parity error bits - either brd
					;/  = par_err_m or par_err_e
par_err_m	equ	080		;parity error bit for main brd
par_err_e	equ	040		;parity error bit for exp brd
clr_pe		equ	030		;bits to clear parity error latches
					;/  set, then cleared again, in port_b
stack_size	equ	0100		;minimum size of program stack (bytes)
cr		equ	0Dh		;carriage return character
lf		equ	0A		;line feed character
bel		equ	07		;bell character

;***** Main Program ****************************************************

code	segment
	assume	cs:code, ds:code, es:code, ss:code
	org	100			;where all good .COM programs go

;----- Start:  make some checks and set up stuff -----------------------

start	proc	near
;Program entry point.
;  1. Verifies that the program plus a minimum stack lie inside the first
;     64K of memory.
;  2. Builds en_flags: one flag byte per 16K block, 0FF = "check this
;     block".  Blocks 0-3 (the first 64K) are never enabled.
;  3. Moves the stack pointer to the top of the first 64K, turns the NMI
;     off and jumps to control_loop with dl = 0.
;If a check fails, a message is printed and the program terminates.
;enter:	cs = ds = es = ss = program segment (.COM load state)
;exit:	does not return
;
;All numeric constants in this procedure carry an explicit h suffix so
;their value does not depend on how the assembler reads b / d suffixes
;while .radix 16 is in effect.

blk_count	equ	28h		;40 decimal 16K blocks = 640K maximum

;check the address of the end of the program
	mov	dx,offset too_big_msg	;get error message
	mov	ax,cs			;get program segment
	cmp	ax,1000h		;check for 1st 64K
	jae	bye_bye			;/  unsigned - segments of 8000h and
					;/  above must fail this test too
	mov	cl,4			;convert from this seg to abs number
	shl	ax,cl			;/
	add	ax,offset prog_end	;add in the length of the prog
	jc	bye_bye			;check program size
	add	ax,stack_size		;add in the length of the stack
	jc	bye_bye			;check program size

;find end of user memory and set up block enable flags
	int	12h			;get memory size in K bytes
	mov	dx,offset too_small_msg	;get next error message
	mov	cl,4			;convert to base 16K
	shr	ax,cl			;/  ax = number of whole 16K blocks
	cmp	ax,4			;anything above the first 4 blocks?
	jbe	bye_bye			;/  no - nothing to check (this also
					;/  keeps the count below from
					;/  wrapping around to a huge value)
	cmp	ax,blk_count		;more blocks than enable flags?
	jbe	mem_fits		;/  no
	mov	ax,blk_count		;/  yes - stay inside en_flags
mem_fits:
	sub	ax,4			;skip 1st 4 blocks
	mov	cx,ax			;set up block counter
	cld				;set direction to increment
	mov	al,0FFh			;get the block enable flag
	mov	di,offset en_flags + 4	;point to enable flags after 1st 4
	rep	stosb			;set flag for each block existing

;check for reduced hammering area
	mov	al,0			;get block disable flag
	std				;set direction to decrement
	mov	cx,blk_count		;get last block
	sub	cx,end_blk		;any work to do?
	jle	lower			;/  (signed: difference may be < 0)
	mov	di,offset en_flags + blk_count - 1	;point to last flag
	rep	stosb			;reset flags
lower:
	cld				;set direction to increment
	mov	cx,begin_blk		;get starting point
	sub	cx,4			;any work to do?
	jle	safe_stack		;/
	mov	di,offset en_flags + 4	;point to enable flags
	rep	stosb			;reset flags

;load a "safe" stack pointer
;sp = 10000h - cs*16, so ss:sp addresses the top of the first 64K,
;which is never written by the pattern test
safe_stack:
	mov	ax,cs			;get program segment
	mov	cl,4			;convert from seg to abs
	shl	ax,cl			;/
	neg	ax			;get difference from 64K
	mov	sp,ax			;load safe stack pointer

;turn off NMI - parity error interrupt
	mov	al,nmi_off		;get off byte
	out	nmi_port,al		;turn off nmi

;start hammering RAM
	sub	dl,dl			;start pattern at 0
	jmp	control_loop

;installation error - either too big system or too small memory.
;print message (dx already points to it) and exit.
bye_bye:
	mov	ah,9			;print message
	int	21h			;/
	int	20h			;bye bye

too_big_msg	db	cr, lf
		db	"Program is not contained in first 64K of memory."
		db	cr, lf, bel
		db	"Reduce system overhead and try again."
		db	cr, lf, "$"

too_small_msg	db	cr, lf
		db	"User memory is not more than 64K bytes."
		db	cr, lf, bel
		db	"Nothing remains to be checked."
		db	cr, lf, "$"

en_flags	db	blk_count dup(0)	;40 block enable flags (cleared)
pattern		db	0B2h		;10110010, par = e
		db	0D9h		;11011001, par = o
		db	06Ch		;01101100, par = e
		db	0B6h		;10110110, par = o
		db	05Bh		;01011011, par = o
		db	02Dh		;00101101, par = e
		db	096h		;10010110, par = e
		db	0CBh		;11001011, par = o
		db	065h		;01100101, par = e
pat_length	equ	this byte - pattern - 1	;length of test pattern - 1

start	endp

;----- Control Loop ----------------------------------------------------

control_loop	proc	near
;Main test loop.  For each pattern starting offset: write the pattern to
;every enabled block, then check every enabled block check_no times,
;calling pacifier after each check pass.  When the pattern pointer wraps,
;the loop counter is incremented and its hex text is updated.
;enter:	dl = pattern pointer
;exit:	does not return.  Runs until find_next_block reports that no
;	block is enabled, then restores parity reporting, prints
;	exit_msg and terminates the program.

;set up pattern pointer to new value
	dec	dl			;start at different spot than last
	jge	write_pattern		;check for wrap-around
	inc	loop_counter		;count loops
	mov	dx,loop_counter		;update printout
	mov	si,offset loop_hex	;load storage location
	mov	bx,offset hex_tbl	;point to hex to ascii xlat table
	call	hex2ascii		;convert
	mov	dl,pat_length		;reset the pattern pointer

write_pattern:
;find first enabled block
	sub	bx,bx			;start at beginning
	call	find_next_block		;find the first enabled block
	jc	exit			;no blocks enabled

;pattern write
blk_write:
	call	pat_write		;do the writing
	call	find_next_block		;find the next enabled block
	jnc	blk_write		;a block was found

;all enabled blocks written with pattern
	mov	dh,check_no		;load number of times to check
cycle:

;find first enabled block
	sub	bx,bx			;start at beginning
	call	find_next_block		;find the first enabled block
	jc	exit			;should not happen

;pattern check
blk_check:
	call	pat_check		;do the checking
	call	find_next_block		;find the next enabled block
	jnc	blk_check		;a block was found

;all enabled blocks checked and disabled if faulty
	call	pacifier		;look busy
	dec	dh			;decrement check counter
	jnz	cycle			;check for more checking
	jmp	control_loop

;no more blocks enabled for checking
exit:
;the NMI was turned off before this loop was entered.  put parity
;reporting back the way it was before returning to DOS: clear the
;parity latches first so that a stale latch cannot fire the NMI the
;moment it is turned on again.
	in	al,port_b		;get current status of port B
	or	al,clr_pe		;clear parity latches
	out	port_b,al		;/
	xor	al,clr_pe		;re-enable parity latches
	out	port_b,al		;/
	mov	al,nmi_on		;get on byte
	out	nmi_port,al		;turn nmi back on

	mov	dx,offset exit_msg	;print exit message
	mov	ah,9			;/
	int	21h			;/
	int	20h			;bye bye

exit_msg	db	cr, lf
		db	"All memory blocks checked have errors."
		db	cr, lf, bel
		db	"Execution terminated."
		db	cr, lf, "$"

loop_counter	dw	0FFFFh		;over all loop counter (first inc -> 0)
loop_hex	db	"0000-"		;ascii of loop counter; no "$" here, so
					;/  printing it runs on into pac_hex
pac_hex		db	"0000$"		;ascii of pacifier

control_loop	endp

;***** Subroutines *****************************************************

;----- Pacifier --------------------------------------------------------

pacifier	proc	near
;Progress indicator.  Formats dx as four hex digits into pac_hex and
;prints the text starting at loop_hex (loop count, "-", pacifier value)
;on the top screen row at column 69 decimal, then puts the cursor back
;where it was.
;enter:	dh = check counter; dl = pattern pointer
;exit:	dx, bx = unchanged
;	ax, cx, bp, si = lost

	push	bx			;keep the caller's block number
	mov	bp,dx			;keep counter and pointer

;format the pacifier value first; this only touches memory and leaves
;dx as it was
	mov	si,offset pac_hex	;where the digits go
	mov	bx,offset hex_tbl	;translation table for hex2ascii
	call	hex2ascii		;dx -> "hhhh" at pac_hex

;remember where the cursor is now
	xor	bh,bh			;video page 0
	mov	ah,3			;BIOS: read cursor position
	int	10h			;/  result in dx
	push	dx			;/  keep it for the restore below

;move to the display position and print
	mov	dx,0045h		;dh = 0 (row), dl = 45h (column 69)
	mov	ah,2			;BIOS: set cursor position
	int	10h			;/  page number still 0 in bh
	mov	dx,offset loop_hex	;text starts with the loop counter
	mov	ah,9			;DOS: print "$" terminated string
	int	21h			;/

;put everything back
	pop	dx			;old cursor position
	xor	bh,bh			;video page 0
	mov	ah,2			;BIOS: set cursor position
	int	10h			;/
	mov	dx,bp			;counter and pointer
	pop	bx			;block number
	ret

pacifier	endp

;----- find_next_block -------------------------------------------------

find_next_block	proc	near
;Scan en_flags upward from the entry after bx for a flag equal to 0FF
;and, when one is found, load es with that block's segment address.
;enter:	bx = current block number (the search starts at bx + 1)
;exit:	cy clear = block found:
;		bx = its block number
;		es = its segment address (block number * 0400h)
;	cy set = no enabled block after the entry value of bx
;	cx, si = lost

	mov	cx,length en_flags - 1	;index of the last flag
	sub	cx,bx			;cx = flags left to look at
	jz	fnb_none		;none left
fnb_scan:
	inc	bx			;step to the next flag
	cmp	byte ptr en_flags[bx],0FFh	;enabled?
	jne	fnb_skip		;/  no - keep looking

;enabled block found.  a 16K block is 0400h paragraphs, so the segment is
;bx shifted left by 10; rotating right by 6 gives the same 16-bit result
;for block numbers this small.
	mov	si,bx			;work on a copy, bx is returned
	mov	cl,6			;/
	ror	si,cl			;/
	mov	es,si			;es -> start of the block
	clc				;report "found"
	ret

fnb_skip:
	loop	fnb_scan		;more flags to look at?
fnb_none:
	stc				;report "end of blocks"
	ret

find_next_block	endp

;----- pat_write -------------------------------------------------------

pat_write	proc	near
;Fill one 16K block (4000h bytes at es:0) with the repeating test
;pattern, beginning with the pattern byte selected by dl and wrapping to
;the first pattern byte after the last one.
;enter:	dl = pattern start pointer (index into pattern)
;	es = segment address to put pattern in
;	ds = segment holding pattern; direction flag clear
;exit:	ax, cx, si, di = lost

	mov	al,dl			;pattern index ...
	cbw				;/  as a word
	add	ax,offset pattern	;/  plus table base
	mov	si,ax			;/  = source address
	xor	di,di			;destination starts at es:0
	mov	cx,4000h		;bytes in one block
pw_store:
	movsb				;copy one pattern byte into the block
	cmp	si,offset pattern + pat_length	;ran past the last entry?
	jg	pw_wrap			;/  yes - go back to the first
pw_count:
	loop	pw_store		;until the block is full
	ret

pw_wrap:
	mov	si,offset pattern	;restart the pattern
	jmp	short pw_count

pat_write	endp

;----- pat_check -------------------------------------------------------

pat_check	proc	near
;Compare one 16K block (4000h bytes at es:0) with the repeating test
;pattern that starts at the pattern byte selected by dl.  After every
;byte the parity error bits in port_c are examined as well.
;enter:	dl = pattern start pointer; es = segment address of pattern
;	bx = current block number
;	ds = segment holding pattern; direction flag clear
;exit:	ax, cx, si, di = lost
;	if error found:	control is passed to bad_byte procedure by a
;		jump, so bad_byte returns to this procedure's caller
;		di = bad byte index + 1; si = pattern index + 1

	mov	al,dl			;pattern index ...
	cbw				;/  as a word
	add	ax,offset pattern	;/  plus table base
	mov	si,ax			;/  = source address
	xor	di,di			;block is read from es:0
	mov	cx,4000h		;bytes in one block
pc_compare:
	cmpsb				;pattern byte against memory byte
	jne	bad_byte		;data differs
	in	al,port_c		;did that read latch a parity error?
	test	al,par_err		;/
	jnz	bad_byte		;/  yes
	cmp	si,offset pattern + pat_length	;ran past the last entry?
	jg	pc_wrap			;/  yes - go back to the first
pc_count:
	loop	pc_compare		;until the whole block is done
	ret

pc_wrap:
	mov	si,offset pattern	;restart the pattern
	jmp	short pc_count

pat_check	endp

;----- bad_byte --------------------------------------------------------

bad_byte	proc	near
;a bad byte was found in memory.  report it and disable the block
;
;this procedure is entered by a conditional jump from pat_check, not by
;a call.  the ret at the end therefore returns to whoever called
;pat_check, and the remainder of the block is not checked on this pass.
;
;three cases are told apart:
;  - the byte read back differs from the pattern: the differing bits are
;    printed as eight "0"/"1" characters (bit_msg) and the block is
;    disabled
;  - the byte matches but port_c shows a parity error bit: mpe_msg or
;    epe_msg is printed and the block is disabled
;  - neither: shit_msg is printed and the block stays enabled
;in every case status_msg (time, segment, address) is printed afterwards
;and the parity error latches are cleared through port_b.
;
;enter:	bx = current block number; es = current block segment address
;	di = bad byte index + 1; si = pattern index + 1
;exit:	ax, cx, si = lost
;	di = bad byte index (one less than on entry)
;	bx, dx = saved and restored

	push	dx			;save pattern start pointer
	mov	dx,offset crlf		;print a new line
	mov	ah,9			;/
	int	21			;/

;check for source of error
	dec	di			;correct for last increment
	dec	si			;/
	cmp	si,offset pattern	;check source index
	jge	skip4			;if necessary ...
	mov	si,offset pattern + pat_length	;/  reset pattern pointer
skip4:
	mov	al,es:[di]		;get bad byte (read from memory again)
	xor	al,[si]			;check for a bad bit
					;/  al = 1 in each bit that differs
	jnz	bit_error		;/
	in	al,port_c		;get parity error bits
	mov	dx,offset mpe_msg	;get main pe message
	test	al,par_err_m		;check for main brd parity error
	jnz	err_recover		;/
	mov	dx,offset epe_msg	;get expansion pe message
	test	al,par_err_e		;check for exp brd parity error
	jnz	err_recover		;/

;error detected by pat_check but the source was not located.
;print message but do not disable this block.  it will be checked again.
	mov	dx,offset shit_msg	;print the message ...
	jmp	short noerr_recover	;/  do not disable the block

bit_error:
;a bit error was located.  separate the bits and include in the message.
;bit 0 of al goes to the rightmost character of err_byte, bit 7 to the
;leftmost.
	mov	si,offset err_byte + 7	;get storage location
	mov	cx,8			;get count
store_bit:
	mov	ah,018			;get ascii "0" pre-shifted
					;/  018 rotated left once is 030 = "0"
	shr	al,1			;put bit in carry
	rcl	ah,1			;get bit from carry
					;/  ah is now "0" or "1"
	mov	[si],ah			;store ascii of bit
	dec	si			;point to next bit
	loop	store_bit		;do all bits
	mov	dx,offset bit_msg	;get bit error message

err_recover:
	mov	byte ptr en_flags[bx],00	;disable this block
noerr_recover:
	mov	ah,9			;DOS print message function
	int	21			;/  dx = message chosen above
	call	get_time		;get time and include in status_msg
	push	bx			;save block number
	mov	bx,offset hex_tbl	;point to hex to ascii xlat table
	mov	dx,es			;separate block segment, convert
	mov	si,offset err_blk	;/  to ascii, and include in
	call	hex2ascii		;/  status_msg
	mov	dx,di			;separate memory address, convert
	mov	si,offset err_addr	;/  to ascii, and include in
	call	hex2ascii		;/  status_msg
	mov	dx,offset status_msg	;print status message
	mov	ah,9			;/
	int	21			;/
	pop	bx			;restore block number
	pop	dx			;restore pattern start pointer

;clear parity error latches
;the clr_pe bits are first set and then cleared again in port_b; all
;other port_b bits are written back as they were read
	in	al,port_b		;get current status of port B
	or	al,clr_pe		;clear parity latches
	out	port_b,al		;/
	xor	al,clr_pe		;re-enable parity latches
	out	port_b,al		;/
	ret

;messages.  each is "$" terminated for DOS function 9.  bit_msg has no
;terminator of its own and runs on into err_byte; the fields of
;status_msg are patched in place by get_time and hex2ascii before it is
;printed.
crlf		db	cr, lf, "$"
shit_msg	db	"An error was detected but not located.$"
bit_msg		db	"The bit indicated was found to be in error: "
err_byte	db	"00000000$"
mpe_msg		db	"The main board parity bit was found to be in error.$"
epe_msg		db	"The expansion board parity bit was found to be in "
		db	"error.$"
status_msg	db	cr, lf
		db	"Time: "
err_hour	db	"00:"
err_min		db	"00   Memory Segment: "
err_blk		db	"0000   Memory Address: "
err_addr	db	"0000", cr, lf, "$"

bad_byte	endp

;----- get_time --------------------------------------------------------

get_time	proc	near
;get the time of day, make decimal, and put it in status_msg.
;the two hour digits are stored at err_hour and the two minute digits
;at err_min.
;exit:	ax, cx = lost
;	dx = preserved (the DOS call returns seconds and hundredths in
;	     dx, so it is saved here to keep the clobber list above true)

	push	dx			;DOS get time overwrites dx
	mov	ah,2Ch			;DOS get time function
	int	21h			;/  ch = hours, cl = minutes
	mov	al,ch			;hours (0-23)
	call	hex2dec			;convert hex to ascii decimal
	mov	word ptr err_hour,ax	;store ascii hours (tens first)
	mov	al,cl			;minutes (0-59); cl survives hex2dec,
					;/  which only uses ch
	call	hex2dec			;convert hex to ascii decimal
	mov	word ptr err_min,ax	;store ascii minutes (tens first)
	pop	dx			;give the caller its dx back
	ret

get_time	endp

;----- hex2dec ---------------------------------------------------------

hex2dec		proc	near
;Turn a binary value below 100 decimal into two ASCII decimal digits.
;enter:	al = number to be converted
;exit:	al = tens digit, ah = ones digit (both ASCII), so storing ax
;	as a word puts the tens digit first in memory
;	ch = lost

	mov	ch,0Ah			;divisor = ten
	cbw				;ax = al (high byte clear for al < 80h)
	div	ch			;al = ax / 10, ah = ax mod 10
	add	ax,3030h		;add ascii "0" to both digits
	ret

hex2dec		endp

;----- hex2ascii -------------------------------------------------------

hex2ascii	proc	near
;Write the 16-bit value in dx as four ASCII hex digits at [si]..[si+3],
;most significant digit first.  The digits are produced from right to
;left, two per call of hex_lookup.
;enter:	dx = number to be converted; si = storage location pointer
;	bx = offset of hex_tbl
;exit:	dx, bx = unchanged
;	ah, cl = lost (al and si are changed by hex_lookup as well)

	lea	si,[si+3]		;begin at the rightmost digit
	mov	ah,dl			;low byte gives digits 3 and 4
	call	hex_lookup		;/
	mov	ah,dh			;high byte gives digits 1 and 2
	jmp	hex_lookup		;/  tail call: hex_lookup's ret
					;/  returns to our caller

hex2ascii	endp

;----- hex_lookup ------------------------------------------------------

hex_lookup	proc	near
;Convert the byte in ah to two ASCII hex digits.  The low nibble's digit
;is stored at [si] and the high nibble's digit at [si-1].
;enter:	ah = two hex digits; si = storage location
;	bx = hex_tbl offset
;exit:	ah = unchanged
;	al, cl = lost; si = decremented by two

	mov	cl,4			;shift count for the high nibble
	mov	al,ah			;low nibble ...
	and	al,0Fh			;/
	xlat	hex_tbl			;/  to ascii
	mov	[si],al			;/  is the right-hand digit
	mov	al,ah			;high nibble ...
	shr	al,cl			;/
	xlat	hex_tbl			;/  to ascii
	mov	[si-1],al		;/  is the left-hand digit
	sub	si,2			;step back over both digits
	ret

hex_tbl		db	"0123456789ABCDEF"	;hex to ascii conversion

hex_lookup	endp

;***** Program End *****************************************************

prog_end	label	byte		;label the end of the program

code	ends

	end	start