mccullou@ittral.UUCP (Clifford McCullough) (01/01/86)
I too have parity check problems. They always seemed to occur after
several hours of unsaved work. I finally broke down and wrote a program
to check for these latent errors. It follows. The preamble should
explain what the program tries to do.

My experience with this program has been somewhat successful. The third
or fourth program iteration finally came up with an error after 12 hours
of looking. Unfortunately, on subsequent tries, I have not been able to
detect any errors (or even the same error) after 48 hours of continuous
looking. I have not yet changed the chip indicated in the first error
report. I'm hoping to think of some different way to reliably catch the
errors.

I believe the program will work with DOS 1.1 on the IBM PC (i.e. I tried
to not make any function calls unsupported by DOS 1.1). I have always
used it on DOS 2.1.

If anyone has suggestions or comments please send them to me.

-Cliff McCullough
decvax!ittatc!ittral!mccullou
-----------------------------------------------------------------------------
        page    60,132
        title   RAMHAM Hammer the PE out of RAM by Clifford A McCullough
        subttl  Version 1.00 December 15, 1985

comment *
RamHammer is intended to discover latent parity errors. This is done by
cyclically writing a pattern to memory and then checking that the pattern
remains. The pattern is checked several times before a new pattern is
written. This requires the RAM to maintain the pattern for a period of
time. The pattern progresses through each block so that each byte is
tested with all patterns and each bit is tested both high and low.

RamHammer checks that it resides completely within the lowest 64K of
memory. For this reason the system should be booted with a minimum of
resident user programs (if any). RamHammer will not check this first
block of memory. Some other method must be used to check this block if
it is suspected.
When RamHammer detects a memory error, it sends to the screen a message
including the time, the memory address, and the bit that was found to be
in error.

The ports and data bytes it uses to disable the NMI and parity reporting
mechanism are specific to the IBM PC and may not be correct for
"compatible" machines.
*

;----- Notes on this listing ------------------------------------------
; Target: 8086 real mode, DOS .COM program (CS = DS = ES = SS on entry).
;
; Numbers: .radix 16 is in effect, so every unsuffixed literal is HEX
; (int 21 is INT 21h, cmp ax,1000 compares with 1000h, ...).
; NOTE(review): the literals 69d, 10d and the ...b bit patterns rely on
; the assembler still honouring a trailing "d"/"b" as decimal/binary
; specifiers under .radix 16. Confirm this for the assembler in use;
; the equates added below are written in plain hex so they do not
; depend on it.
;
; Memory is handled in 16K "blocks"; block n lives at segment n * 0400.
; en_flags holds one byte per block: 0FF = test this block, 00 = skip.

        .radix  16

;***** Equates *********************************************************

check_no        equ     0FF             ;number of check cycles
begin_blk       equ     004             ;beginning block to be checked
end_blk         equ     040             ;last block to be checked
low_blks        equ     004             ;16K blocks in the first 64K (never tested)
max_blks        equ     028             ;entries in en_flags (40 decimal = 640K)
blk_size        equ     04000           ;bytes in one 16K block
nmi_port        equ     0A0             ;NMI on / off register
nmi_on          equ     080             ;turn NMI on
nmi_off         equ     000             ;turn NMI off
port_b          equ     061             ;hardware port B
port_c          equ     062             ;hardware port C
par_err         equ     0C0             ;parity error bits - either brd
par_err_m       equ     080             ;parity error bit for main brd
par_err_e       equ     040             ;parity error bit for exp brd
clr_pe          equ     030             ;bits to clear parity error latches
stack_size      equ     0100            ;minimum size of program stack
cr              equ     0Dh             ;carriage return character
lf              equ     0A              ;line feed character
bel             equ     07              ;bell character

;***** Main Program ****************************************************

code    segment
        assume  cs:code, ds:code, es:code, ss:code
        org     100                     ;where all good .COM programs go

;----- Start: make some checks and set up stuff -----------------------
start   proc    near
;Verifies that program + stack fit below the 64K line, builds the
;en_flags table from the installed memory size, moves the stack to the
;top of the first 64K, masks the NMI and enters control_loop.
;On failure prints too_big_msg / too_small_msg and terminates.

;check the address of the end of the program
        mov     dx,offset too_big_msg   ;get error message
        mov     ax,cs                   ;get program segment
        cmp     ax,1000                 ;check for 1st 64K
        jae     bye_bye                 ;/ unsigned: a segment is never negative
        mov     cl,4                    ;convert from this seg to abs number
        shl     ax,cl                   ;/ (no bits lost, cs < 1000)
        add     ax,offset prog_end      ;add in the length of the prog
        jc      bye_bye                 ;check program size
        add     ax,stack_size           ;add in the length of the stack
        jc      bye_bye                 ;check program size

;find end of user memory and set up block enable flags
        int     12                      ;get memory size in K bytes
        mov     dx,offset too_small_msg ;get next error message
        mov     cl,4                    ;convert K bytes to whole 16K blocks
        shr     ax,cl                   ;/
        cmp     ax,low_blks             ;anything above the first 64K?
        jbe     bye_bye                 ;/ no - nothing to test
        cmp     ax,max_blks             ;more blocks than en_flags can hold?
        jbe     blks_ok                 ;/ no
        mov     ax,max_blks             ;/ yes - test only what has a flag
blks_ok:
        sub     ax,low_blks             ;skip 1st 4 blocks
        mov     cx,ax                   ;set up block counter (1..36)
        cld                             ;set direction to increment
        mov     al,0FF                  ;get the block enable flag
        mov     di,offset en_flags + low_blks ;point to enable flags after 1st 4
        rep     stosb                   ;set flag for each block existing

;check for reduced hammering area
        mov     al,00                   ;get block disable flag
        std                             ;set direction to decrement
        mov     cx,max_blks             ;get last block
        sub     cx,end_blk              ;any work to do?
        jle     lower                   ;/
        mov     di,offset en_flags + max_blks - 1 ;point to last enable flag
        rep     stosb                   ;reset flags from the top down
lower:
        cld                             ;set direction to increment
        mov     cx,begin_blk            ;get starting point
        sub     cx,low_blks             ;any work to do?
        jle     safe_stack              ;/
        mov     di,offset en_flags + low_blks ;point to enable flags
        rep     stosb                   ;reset flags below begin_blk

;load a "safe" stack pointer: the offset in this segment that
;corresponds to absolute address 64K, i.e. just below the first
;block that will be overwritten with test patterns
safe_stack:
        mov     ax,cs                   ;get program segment
        mov     cl,4                    ;convert from seg to abs
        shl     ax,cl                   ;/
        neg     ax                      ;get difference from 64K
        mov     sp,ax                   ;load safe stack pointer

;turn off NMI - parity error interrupt
        mov     al,nmi_off              ;get off byte
        out     nmi_port,al             ;turn off nmi

;start hammering RAM
        sub     dl,dl                   ;start pattern at 0
        jmp     control_loop

;installation error - either too big system or too small memory.
;print message and exit.
;enter: dx = offset of "$"-terminated message
bye_bye:
        mov     ah,9                    ;print message
        int     21                      ;/
        int     20                      ;bye bye

too_big_msg     db      cr, lf
                db      "Program is not contained in first 64K of memory."
                db      cr, lf, bel
                db      "Reduce system overhead and try again."
                db      cr, lf, "$"

too_small_msg   db      cr, lf
                db      "User memory is not more than 64K bytes."
                db      cr, lf, bel
                db      "Nothing remains to be checked."
                db      cr, lf, "$"

en_flags        db      max_blks dup(0) ;40 block enable flags (cleared)

pattern         db      10110010b       ;par = e, B2
                db      11011001b       ;par = o, D9
                db      01101100b       ;par = e, 6C
                db      10110110b       ;par = o, B6
                db      01011011b       ;par = o, 5B
                db      00101101b       ;par = e, 2D
                db      10010110b       ;par = e, 96
                db      11001011b       ;par = o, CB
                db      01100101b       ;par = e, 65
pat_length      equ     this byte - pattern - 1 ;length of test pattern - 1

start   endp

;----- Control Loop ----------------------------------------------------
control_loop    proc    near
;Endless driver: pick the next pattern start index, write the pattern
;to every enabled block, then check every enabled block check_no times,
;calling pacifier after each check pass.
;enter: dl = pattern pointer (index into pattern used last time)
;Leaves via exit (terminates) only when no block is enabled any more.

;set up pattern pointer to new value
        dec     dl                      ;start at different spot than last
        jge     write_pattern           ;check for wrap-around (dl went below 0)
        inc     loop_counter            ;count loops
        mov     dx,loop_counter         ;update printout
        mov     si,offset loop_hex      ;load storage location
        mov     bx,offset hex_tbl       ;point to hex to ascii xlat table
        call    hex2ascii               ;convert
        mov     dl,pat_length           ;reset the pattern pointer

write_pattern:
;find first enabled block
        sub     bx,bx                   ;start at beginning
        call    find_next_block         ;find the first enabled block
        jc      exit                    ;no blocks enabled

;pattern write
blk_write:
        call    pat_write               ;do the writing
        call    find_next_block         ;find the next enabled block
        jnc     blk_write               ;a block was found

;all enabled blocks written with pattern
        mov     dh,check_no             ;load number of times to check

cycle:
;find first enabled block
        sub     bx,bx                   ;start at beginning
        call    find_next_block         ;find the first enabled block
        jc      exit                    ;should not happen

;pattern check
blk_check:
        call    pat_check               ;do the checking (returns here even
                                        ;/ when it left through bad_byte)
        call    find_next_block         ;find the next enabled block
        jnc     blk_check               ;a block was found

;all enabled blocks checked and disabled if faulty
        call    pacifier                ;look busy
        dec     dh                      ;decrement check counter
        jnz     cycle                   ;check for more checking
        jmp     control_loop

;no more blocks enabled for checking
exit:
        mov     dx,offset exit_msg      ;print exit message
        mov     ah,9                    ;/
        int     21                      ;/
        int     20                      ;bye bye

exit_msg        db      cr, lf
                db      "All memory blocks checked have errors."
                db      cr, lf, bel
                db      "Execution terminated."
                db      cr, lf, "$"

loop_counter    dw      0FFFF           ;over all loop counter
loop_hex        db      "0000-"         ;ascii of loop counter
pac_hex         db      "0000$"         ;ascii of pacifier

control_loop    endp

;***** Subroutines *****************************************************

;----- Pacifier --------------------------------------------------------
pacifier        proc    near
;Shows progress: prints "<loop counter>-<check counter><pattern ptr>"
;(loop_hex followed by pac_hex) at row 0, column 69d of video page 0,
;then puts the cursor back where it was.
;enter: dh = check counter; dl = pattern pointer
;exit:  ax, cx, bp, si = lost; bx, dx preserved

        push    bx                      ;save block number
        mov     bp,dx                   ;save counter and pointer

;get current cursor position
        mov     ah,3                    ;read cursor position
        sub     bh,bh                   ;/ page number = 0
        int     10                      ;/ returns position in dx
        push    dx                      ;save current cursor position

;set cursor position
        mov     ah,2                    ;set cursor position
        mov     dh,0                    ;/ row 0 (top line of screen)
        mov     dl,69d                  ;/ column 69 (the 70th column)
        int     10                      ;/ page number = 0 (bh still 0)

;convert pacifier to ascii
        mov     dx,bp                   ;restore counter and pointer
        mov     si,offset pac_hex       ;point to storage location
        mov     bx,offset hex_tbl       ;point to translation table
        call    hex2ascii               ;make conversion

;print pacifier
        mov     dx,offset loop_hex      ;get start of message
        mov     ah,9                    ;print message
        int     21                      ;/

;restore stuff
        mov     ah,2                    ;restore cursor position
        sub     bh,bh                   ;/ page number = 0
        pop     dx                      ;/ get old setting
        int     10                      ;/
        mov     dx,bp                   ;restore counter and pointer
        pop     bx                      ;restore block number
        ret

pacifier        endp

;----- find_next_block -------------------------------------------------
find_next_block proc    near
;find the next enabled block after the one pointed to by bx and
;set up es to point to the block of memory found
;enter: bx = current block number
;exit:  bx = next block number; es = next block segment address
;       cx = lost; si = lost
;       cy set = end of blocks reached; cy clear = next block found
;note:  the search starts at bx + 1, so block 0 is never returned

        mov     cx,length en_flags - 1  ;total number of blocks - 1
        sub     cx,bx                   ;set count to remaining blocks
        jz      none_found              ;already at end of blocks

next_blk:
        inc     bx                      ;point to next block
        cmp     byte ptr en_flags[bx],0FF ;is it enabled?
        je      found_blk               ;/ yes - found a good block
        loop    next_blk                ;/ no  - check next

none_found:
        stc                             ;set end of blocks flag
        ret

found_blk:
        mov     si,bx                   ;do not disturb bx
        mov     cl,6                    ;convert si to seg addr:
        ror     si,cl                   ;/ rotate right 6 = block * 0400
        mov     es,si                   ;es points to 16K memory block
        clc                             ;set found block flag
        ret

find_next_block endp

;----- pat_write -------------------------------------------------------
pat_write       proc    near
;do the actual pattern writing: fill es:0000 .. es:3FFF with the
;pattern bytes, cycling through pattern starting at index dl
;enter: dl = pattern start pointer; es = segment address to put pattern in
;       direction flag clear (set up once in start)
;exit:  ax, cx, si, di = lost

        mov     cx,blk_size             ;set counter to block size
        sub     di,di                   ;clear destination index
        mov     al,dl                   ;get pattern start pointer
        cbw                             ;clear high byte (dl is 0..pat_length)
        add     ax,offset pattern       ;complete index addr to pattern
        mov     si,ax                   ;set source index

write_next:
        movsb                           ;put pattern byte in block
        cmp     si,offset pattern + pat_length ;check source index
        jle     skip2                   ;if necessary ...
        mov     si,offset pattern       ;/ reset pattern pointer
skip2:
        loop    write_next              ;write next byte
        ret

pat_write       endp

;----- pat_check -------------------------------------------------------
pat_check       proc    near
;do the actual pattern checking: compare es:0000 .. es:3FFF against the
;pattern sequence pat_write produced for the same dl, and poll the
;parity bits of port C after every byte
;enter: dl = pattern start pointer; es = segment address of pattern
;       bx = current block number
;exit:  ax, cx, si, di = lost
;       if error found: control is passed to bad_byte procedure by a
;       jump (not a call), so bad_byte's ret returns to our caller and
;       the rest of this block is not checked on this pass
;         di = bad byte index + 1; si = pattern index + 1

        mov     cx,blk_size             ;set counter to block size
        sub     di,di                   ;clear destination index
        mov     al,dl                   ;get pattern start pointer
        cbw                             ;clear high byte (dl is 0..pat_length)
        add     ax,offset pattern       ;complete index addr to pattern
        mov     si,ax                   ;set source index

check_next:
        cmpsb                           ;check pattern byte in block
        jne     bad_byte                ;oops!
        in      al,port_c               ;check for parity error
        test    al,par_err              ;/
        jnz     bad_byte                ;oops!
        cmp     si,offset pattern + pat_length ;check source index
        jle     skip3                   ;if necessary ...
        mov     si,offset pattern       ;/ reset pattern pointer
skip3:
        loop    check_next              ;check next byte
        ret

pat_check       endp

;----- bad_byte --------------------------------------------------------
bad_byte        proc    near
;a bad byte was found in memory. report it and disable the block
;(reached by a jump from pat_check; returns to pat_check's caller)
;enter: bx = current block number; es = current block segment address
;       di = bad byte index + 1; si = pattern index + 1
;exit:  ax, cx, si = lost; bx, dx preserved; di = bad byte index
;       en_flags[bx] cleared unless the error source was not located
;       parity error latches cleared and re-enabled

        push    dx                      ;save pattern start pointer
        mov     dx,offset crlf          ;print a new line
        mov     ah,9                    ;/
        int     21                      ;/

;check for source of error
        dec     di                      ;correct for last increment
        dec     si                      ;/
        cmp     si,offset pattern       ;check source index
        jge     skip4                   ;if necessary ...
        mov     si,offset pattern + pat_length ;/ reset pattern pointer
skip4:
        mov     al,es:[di]              ;get bad byte
        xor     al,[si]                 ;check for a bad bit (1 = differs)
        jnz     bit_error               ;/
        in      al,port_c               ;get parity error bits
        mov     dx,offset mpe_msg       ;get main pe message
        test    al,par_err_m            ;check for main brd parity error
        jnz     err_recover             ;/
        mov     dx,offset epe_msg       ;get expansion pe message
        test    al,par_err_e            ;check for exp brd parity error
        jnz     err_recover             ;/

;error detected by pat_check but the source was not located.
;print message but do not disable this block. it will be checked again.
        mov     dx,offset shit_msg      ;print the message ...
        jmp     short noerr_recover     ;/ do not disable the block

bit_error:
;a bit error was located. separate the bits and include in the message.
;al holds the xor of memory and pattern; bit 0 goes to the last digit.
        mov     si,offset err_byte + 7  ;get storage location
        mov     cx,8                    ;get count
store_bit:
        mov     ah,018                  ;get ascii "0" pre-shifted
        shr     al,1                    ;put bit in carry
        rcl     ah,1                    ;get bit from carry: ah = "0" or "1"
        mov     [si],ah                 ;store ascii of bit
        dec     si                      ;point to next bit
        loop    store_bit               ;do all bits
        mov     dx,offset bit_msg       ;get bit error message

err_recover:
        mov     byte ptr en_flags[bx],00 ;disable this block

noerr_recover:
        mov     ah,9                    ;DOS print message function
        int     21                      ;/
        call    get_time                ;get time and include in status_msg
                                        ;/ (dx is reloaded below)
        push    bx                      ;save block number
        mov     bx,offset hex_tbl       ;point to hex to ascii xlat table
        mov     dx,es                   ;separate block segment, convert
        mov     si,offset err_blk       ;/ to ascii, and include in
        call    hex2ascii               ;/ status_msg
        mov     dx,di                   ;separate memory address, convert
        mov     si,offset err_addr      ;/ to ascii, and include in
        call    hex2ascii               ;/ status_msg
        mov     dx,offset status_msg    ;print status message
        mov     ah,9                    ;/
        int     21                      ;/
        pop     bx                      ;restore block number
        pop     dx                      ;restore pattern start pointer

;clear parity error latches
        in      al,port_b               ;get current status of port B
        or      al,clr_pe               ;clear parity latches
        out     port_b,al               ;/
        xor     al,clr_pe               ;re-enable parity latches
        out     port_b,al               ;/
        ret

crlf            db      cr, lf, "$"
shit_msg        db      "An error was detected but not located.$"
bit_msg         db      "The bit indicated was found to be in error: "
err_byte        db      "00000000$"
mpe_msg         db      "The main board parity bit was found to be in error.$"
epe_msg         db      "The expansion board parity bit was found to be in "
                db      "error.$"
status_msg      db      cr, lf
                db      "Time: "
err_hour        db      "00:"
err_min         db      "00 Memory Segment: "
err_blk         db      "0000 Memory Address: "
err_addr        db      "0000", cr, lf, "$"

bad_byte        endp

;----- get_time --------------------------------------------------------
get_time        proc    near
;get the time of day, make decimal, and put it in status_msg.
;exit:  ax, cx, dx = lost (the DOS call returns seconds and
;       hundredths in dh/dl)

        mov     ah,2C                   ;DOS get time function
        int     21                      ;/ ch = hours, cl = minutes
        mov     al,ch                   ;hours (0-23)
        call    hex2dec                 ;convert hex to ascii decimal
        mov     word ptr err_hour,ax    ;store ascii hours (al = tens first)
        mov     al,cl                   ;minutes (0-59); hex2dec left cl alone
        call    hex2dec                 ;convert hex to ascii decimal
        mov     word ptr err_min,ax     ;store ascii minutes
        ret

get_time        endp

;----- hex2dec ---------------------------------------------------------
hex2dec         proc    near
;convert a hex number <100 in al into ascii decimal in ax.
;enter: al = number to be converted
;exit:  ah = ones; al = tens; ch = lost

        cbw                             ;clear high byte (al < 100d)
        mov     ch,10d                  ;divide ax by 10
        div     ch                      ;/ al = quotient, ah = remainder
        add     ax,"00"                 ;make ascii
        ret

hex2dec         endp

;----- hex2ascii -------------------------------------------------------
hex2ascii       proc    near
;convert a hex number in dx into ascii. si points to storage location.
;enter: dx = number to be converted; si = storage location pointer
;       bx = offset of hex_tbl
;exit:  al, ah, cl = lost; si = storage location - 1

        add     si,3                    ;start at end of number
        mov     ah,dl                   ;do it to low byte
        call    hex_lookup              ;/
        mov     ah,dh                   ;do it to high byte
        call    hex_lookup              ;/
        ret

hex2ascii       endp

;----- hex_lookup ------------------------------------------------------
hex_lookup      proc    near
;separate two hex digits in ah, convert to ascii and store
;(low nibble at [si], high nibble at [si-1])
;enter: ah = two hex digits; si = storage location
;       bx = hex_tbl offset
;exit:  al, cl = lost; si = decremented by two

        mov     al,ah                   ;get low byte
        and     al,0F                   ;get low nibble
        xlat    hex_tbl                 ;translate hex to ascii
        mov     [si],al                 ;store 1st/3rd digit
        dec     si                      ;point to next
        mov     al,ah                   ;get low byte again
        mov     cl,4                    ;set counter
        shr     al,cl                   ;shift high nibble to low nibble
        xlat    hex_tbl                 ;translate hex to ascii
        mov     [si],al                 ;store 2nd/4th digit
        dec     si                      ;point to next
        ret

hex_tbl         db      "0123456789ABCDEF" ;hex to ascii conversion

hex_lookup      endp

;***** Program End *****************************************************

prog_end        label   byte            ;label the end of the program

code    ends
        end     start