[alt.sources] statmon: monitor up/down and time status of hosts

tchrist@convex.com (Tom Christiansen) (03/10/90)

Here's a little program I hacked up the other night to watch when hosts
went up and down, as well as how their clocks strayed.  To find out
how this thing works, you can do these things:

    *	read the following description
    *	call statmon w/o any args for a usage message
    *	type `h' while in the program 
    *   read the source

I suggest doing all of them, in that order.  If nothing else, the
source is a decent example of playing with cbreak and echo mode, using
UDP sockets, using select to multiplex i/o and timeouts, and using
the dump operator to greatly speed up startup time.
It probably won't work very well, if at all, at non-BSD(ish) sites.

Here's what it does:  given a list of hosts, which can be read in from
a file (a simplified ghosts-type file) it tries to talk to the time/udp
service of their inetd's, and if they go too long without any answer
after repeated attempts, it considers them down and tells you so.  When
they come back up again, you get a message that this has happened.
This is better than mere pings, as it requires a coherent inetd to
answer you and is pretty cheap.  The program will also tell you which
hosts have times that are far astray from your own.  The retry,
timeout, clock tolerance, and sleep interval between sends are all
command-line configurable.  This is all done asynchronously with
select()s, including your keyboard inputs, which are in cbreak mode.

Porting notes:  you'll need the following include files, probably in
the perl library directory, which you should have generated from the
corresponding C include files using the makelib program in the perl
source directory:

	sys/errno.h
	sys/socket.h
	sys/ioctl.h

The last one needs a %sizeof array to work right.  I put mine
in sizeof.h in the perl library.  Mine happens to look like this.
Yours, unless you're on a Convex, will almost surely vary.

    $sizeof{'char'} = 1;
    $sizeof{'int'} = 4;
    $sizeof{'long'} = 4;
    $sizeof{'float'} = 4;
    $sizeof{'double'} = 8;
    $sizeof{'long long'} = 8;
    $sizeof{'struct arpreq'} = 36;
    $sizeof{'struct ifconf'} = 8;
    $sizeof{'struct ifreq'} = 32;
    $sizeof{'struct ltchars'} = 6;
    $sizeof{'struct pcntl'} = 116;
    $sizeof{'struct rtentry'} = 52;
    $sizeof{'struct sgttyb'} = 6;
    $sizeof{'struct tchars'} = 6;
    $sizeof{'struct ttychars'} = 14;
    $sizeof{'struct winsize'} = 8;
    $sizeof{'struct system_information'} = 12;
    1;

It also wants getopts.pl and ctime.pl.  

If you find yourself with copious quantities of unwanted disk
space, you can spare yourself the costs of initialization at
each startup by calling 'statmon -u' to dump the state of the
program.  This will skip all the include files and static init
code when restarted.  I suggest you make sure that the program
actually runs first, though, before you bother to dump it.  Also,
those are big include files, so your dump will be pretty huge.

--tom

#! /bin/sh
# This is a shell archive, meaning:
# 1. Remove everything above the #! /bin/sh line.
# 2. Save the resulting text in a file.
# 3. Execute the file with /bin/sh (not csh) to create:
#	statmon
# This archive created: Sat Mar 10 08:23:23 1990
export PATH; PATH=/bin:/usr/bin:$PATH
echo shar: "extracting 'statmon'" '(9588 characters)'
if test -f 'statmon'
then
	echo shar: "will not over-write existing file 'statmon'"
else
sed 's/^	X//' << \SHAR_EOF > 'statmon'
	X#!/usr/bin/perl
	X#
	X# statmon - check for hosts going up and down, or with bad clocks
	X# tom christiansen <tchrist@convex.com> on 3/8/90
	X#
	X
	XRESTART:  			# label named by `dump RESTART' below (shouldn't really need this...)
	X# script name with any leading directory stripped, for the banner and usage text
	X($program = $0) =~ s%.*/%%;
	X$version = 0.3;
	X# nonzero $| so that partial-line progress messages are not held in a buffer
	X$| = 1;
	X# no arguments at all: show the usage message and exit
	X&bad_usage unless $#ARGV >= 0;
	X
	Xprintf "%s v%3.1g; ", $program, $version;
	X# $compiled is assigned only in the -u branch below, just before the dump
	Xif ($compiled) {
	X    print "quick start.... ";
	X} else {
	X    print "initializing... ";
	X    
	X    # some useful constants: pack/unpack templates used by &sockaddr, &inetaddr and &set_cbreak
	X    $sockaddr_t	= 'S n a4 x8';
	X    $inetaddr_t = 'C4';
	X    $sgttyb_t   = 'C4 S';            
	X    # subtracted from each reply in &readsock before it is compared with time
	X    $SINCE_1970 = 2208988800;
	X
	X    $def_timeout  = 5;      # how long we give a host to answer us
	X    $def_timewarp = 10;     # how far time may vary until we complain
	X    $def_retries  = 5;	    # he gets this many tries to answer us
	X    $def_sleep    = 5;      # between send loops
	X    # working values; the -r, -t, -c and -s switches override them below
	X    $retries      = $def_retries;
	X    $timeout      = $def_timeout;
	X    $timewarp     = $def_timewarp;
	X    $sleep        = $def_sleep;
	X    # tail appended to the text of several die messages
	X    $OOPS = ", can't continue";
	X
	X    $dashes = ('-' x 75) . "\n";
	X    # keystroke => name of the routine &readtty invokes for it
	X    %cmds = (
	X	'q',	'quit',
	X	'x',	'quit',
	X	'h',	'help',
	X	'?',	'help',
	X	't',	'timers',
	X	'd',	'downers',
	X	'u',	'uppers' ,
	X	'm',	'missing',
	X	'U',	'usage' 
	X    );
	X    # pull in the converted headers and libraries; &source dies if any of them fails
	X    &source('sys/errno.h');
	X    &source('sys/socket.h');
	X    &source('sizeof.h');
	X    &source('sys/ioctl.h');
	X    &source('ctime.pl');
	X    &source('getopts.pl');
	X} 
	X
	X# parse the switches; their values are read from the $opt_* variables below
	X&Getopts('udmt:r:c:s:') || &bad_usage;
	X
	X$debug = $opt_d;
	X
	X# command-line overrides for the working values
	X$retries  = $opt_r if defined $opt_r;
	X$timeout  = $opt_t if defined $opt_t;
	X$timewarp = $opt_c if defined $opt_c;
	X$sleep    = $opt_s if defined $opt_s;
	X
	X
	Xif ($opt_u) {  # dump this puppy
	X    $compiled = 1;	# tested by the `if ($compiled)' above
	X    print "dumping\n";
	X    reset 'o';		# so the opt_* vars (especially $opt_u!) go away
	X    dump RESTART;
	X    # not reached
	X} 
	X
	X# all of these signals are handled by &quit
	X@SIG{'INT','HUP','TERM','QUIT'} = ('quit','quit','quit','quit');
	X
	X# SIGCONT is handled by &continue
	X$SIG{'CONT'} = 'continue';
	X
	X# if they say -m, then they want to take stuff from a MACHINES file
	X# (named by $GHOSTS or $MACHINES in the environment, else ~/.ghosts,
	X# ~/.machines or /usr/adm/MACHINES)
	X#
	X# which is of the general form:
	X#
	X#	NAME	features
	X#
	X#	spool   vax bsd
	X#	coyote	sunos4 diskserver
	X#	pokey	sunos4 diskless slow
	X#	gort 	convex bsd 
	X#
	X# the remaining command-line words are criteria: a host is kept only
	X# if its line matches every one of them as a whole word
	X#
	Xif ($opt_m) {
	X    # try very hard to find a machines file
	X    $MACHINES = $ENV{'GHOSTS'};
	X    $MACHINES = $ENV{'MACHINES'} 	    unless $MACHINES;
	X    $MACHINES = $ENV{'HOME'} . '/.ghosts'   unless $MACHINES;
	X    $MACHINES = $ENV{'HOME'} . '/.machines' unless -f $MACHINES;
	X    $MACHINES = '/usr/adm/MACHINES'         unless -f $MACHINES;
	X
	X    die "Can't find any MACHINES file"      unless -f $MACHINES;
	X
	X    # the parentheses matter: in `open MACHINES || die ...' the || is
	X    # applied to the word MACHINES, which is always true, so a failed
	X    # open would go unnoticed.  One-argument open takes the file name
	X    # from the scalar $MACHINES.
	X    open(MACHINES) || die "can't open $MACHINES: $!";
	X
	X    print "opened $MACHINES\n"		    if $debug;
	X    @hosts = <MACHINES>;
	X    close MACHINES;
	X
	X    # keep only lines that start with a host name followed by whitespace
	X    @hosts = grep(/^\w+\s/, @hosts);
	X
	X    while ($criterion = shift) {
	X	@hosts = grep(/\b$criterion\b/, @hosts);
	X    } 
	X
	X    # reduce each surviving line to its first word, the host name
	X    for (@hosts) {
	X	chop;
	X	s/^(\w+).*/$1/;
	X    } 
	X} else {
	X    @hosts = @ARGV;
	X} 
	X
	X# nothing to monitor: complain, print the usage text and exit
	Xunless (@hosts) {
	X    print "No hosts\n";
	X    &bad_usage;
	X}
	X
	Xif ($debug) {
	X    print "hosts are @hosts\n";
	X}
	X
	X#
	X# ok, now create the socket that all the replies will come back to
	X#
	X
	X$localhost = `hostname`;
	Xchop($localhost);
	X
	X# our own address, for the bind below
	Xdie "no localhost \"$localhost\"$OOPS"
	X    unless (($name, $aliases, $type, $len, $thisaddr) = gethostbyname($localhost));
	X
	X# $port is the remote port every query is sent to
	Xdie "no udp service for \"time\"$OOPS"
	X    unless (($name, $aliases, $port, $proto) = getservbyname('time', 'udp'));
	X
	Xif ($debug) {
	X    print "service is $name, port is $port\n";
	X}
	X
	Xdie "can't get udp proto$OOPS"
	X    unless (($name, $aliases, $proto) = getprotobyname('udp'));
	X
	Xdie "can't get socket$OOPS"
	X    unless socket(SOCKET, &AF_INET, &SOCK_DGRAM, $proto);
	X
	X# local end: our address with port 0
	X$this = &sockaddr(&AF_INET, 0, $thisaddr);
	X
	Xdie "can't bind socket: $!$OOPS"
	X    unless bind(SOCKET, $this);
	X
	X#
	X# now go find all of our hosts' addresses, storing
	X# these in %hosts keyed on $name (the name gethostbyname
	X# hands back, with a trailing .convex.com removed)
	X#
	X
	X
	Xprint "fetching addrs... ";
	X
	Xfor $host (@hosts) {
	X    (($name, $aliases, $type, $len, @addrs) = gethostbyname($host))
	X	|| die "no remote \"$host\"\n";
	X
	X    $name =~ s/\.convex\.com$//;
	X
	X    # only the first address returned for a host is used
	X    $hosts{$name} = $addrs[0];
	X}
	X
	Xprint "done.\nType 'h' for help.\n";
	X
	X# descriptor masks for select: $rin watches the socket and the keyboard,
	X# $ttyin the keyboard alone; the write and exception masks stay empty
	X$rin = $win = $ein = $ttyin = '';
	Xvec($rin,fileno(SOCKET),1) = 1;
	Xvec($ttyin,fileno(STDIN),1) = 1;
	X$rin |= $ttyin;
	X
	X
	X
	X# now keep interrogating forever
	X#
	X# each pass: send one datagram to every host, collect replies until
	X# everyone has answered or $timeout seconds are used up, count a miss
	X# for every host still in %sent, then sleep (a keystroke ends the sleep)
	Xfor (;;) {
	X    %sent = ();  # haven't sent anybody anything yet
	X    $sent = 0;
	X
	X    &cbreak;
	X
	X    print $dashes, "entering send loop\n" if $debug;
	X
	X    while (($name, $addr) = each %hosts) {
	X	$that = &sockaddr(&AF_INET, $port, $addr);
	X
	X	if (!send(SOCKET,0,0,$that)) {
	X	    printf STDERR "couldn't send to %-12s %-16s\n", $name, &fmtaddr($addr);
	X	    next;
	X	}
	X
	X	$sent{$name}++;
	X	$sent++;
	X
	X	#printf "sent to %-12s %s\n", $name, &fmtaddr($addr) if $debug;
	X    }
	X
	X    print $dashes, "entering recv loop\n" if $debug;
	X
	X    $ntimeout = $timeout;
	X
	X    while ($sent > 0) {
	X	$then = time;
	X
	X	# 0 means the remaining time ran out with nothing to read
	X	last unless $nfound = select($rout=$rin, $wout=$win, $eout=$ein, $ntimeout);
	X
	X	if ($nfound < 0) {
	X	    warn "select failed: $!\n" unless $! == &EINTR;
	X	}
	X
	X	# charge the time spent waiting, interrupted or not, and never
	X	# hand select a negative timeout (time only has whole-second
	X	# resolution, so a fractional -t value could otherwise overshoot)
	X	$ntimeout -= (time - $then);
	X	$ntimeout = 0 if $ntimeout < 0;
	X
	X	next if $nfound < 0;		# nothing to read; try again
	X
	X	&readsock if vec($rout,fileno(SOCKET),1); 
	X	&readtty if vec($rout,fileno(STDIN),1); 
	X    }
	X
	X    # whoever is still in %sent did not answer this round
	X    for $name (sort keys %sent) {
	X	$missed{$name}++;
	X	printf "%-12s missed %d times\n", $name, $missed{$name} if $debug;
	X
	X	next if $down{$name};			# already reported down
	X	next unless $missed{$name} > $retries;	# not enough misses yet
	X
	X	$down{$name} = time;
	X	printf "%-12s %-16s down at %s", 
	X	    $name, &fmtaddr($hosts{$name}), &ctime($down{$name});
	X    } 
	X
	X    print "sleeping $sleep -- hit any key to interrupt\n" if $debug;
	X    select($ttyout = $ttyin, $wout=$win, $eout = $ein, $sleep);
	X    &readtty if vec($ttyout,fileno(STDIN),1); 
	X}
	X
	X# sockaddr - convert between a packed socket address and its fields
	X#   list context:   &sockaddr($packed) returns the fields unpacked
	X#                   with the $sockaddr_t template
	X#   scalar context: &sockaddr($family, $port, $addr) returns them packed
	Xsub sockaddr {
	X    return unpack($sockaddr_t, $_[0]) if wantarray;
	X
	X    local($family, $portnum, $address) = @_;
	X    return pack($sockaddr_t, $family, $portnum, $address);
	X} 
	X
	X# inetaddr - convert between a packed internet address and its octets
	X#   list context:   &inetaddr($packed) returns the octets unpacked
	X#                   with the $inetaddr_t template ('C4')
	X#   scalar context: &inetaddr($a, $b, $c, $d) returns them packed
	Xsub inetaddr {
	X    if (wantarray) {
	X	    unpack($inetaddr_t, $_[0]);
	X    } else {
	X	    # hand pack every argument: the template wants four values,
	X	    # and passing only the first three lost the last octet
	X	    pack($inetaddr_t, @_);
	X    }
	X} 
	X
	X# source - run another perl file with `do', dying with a specific
	X# message if it fails to parse, cannot be read, or returns false
	X#   $file  name of the file to run
	Xsub source {
	X    local($file) = @_;
	X    local($return) = 0;
	X
	X    $return = do $file;
	X
	X    # look at $@ first: when the file does not compile, do also returns
	X    # undef, and $! then says nothing useful about what went wrong
	X    die "couldn't parse \"$file\": $@" if $@;
	X    die "couldn't do \"$file\": $!" unless defined $return;
	X    die "couldn't run \"$file\"" unless $return;
	X}
	X
	X# usage - write the command synopsis and switch summary to STDERR
	Xsub usage {
	X    local($text);
	X
	X    $text = <<EOM;
	Xusage: $program [switches] host ...
	X   or: $program [switches] -m [criterion ...]
	X
	Xswitches are:
	X    -m  look in MACHINES file for hosts matching criteria
	X
	X    -t	timeout for responses (default $def_timeout)
	X    -r	retries until timed-out host considered down (default $def_retries)
	X    -c  clock drift tolerance (default $def_timewarp)
	X    -s  sleep interval between send loops (default $def_sleep)
	X
	X    -d  print out debugging information
	X    -u  dump state to disk for faster init
	XEOM
	X    print STDERR $text;
	X} 
	X
	X# bad_usage - print the usage message, then exit with status 1
	Xsub bad_usage {
	X    &usage();
	X    exit 1;
	X} 
	X
	X# fmtaddr - render a packed internet address as "[a.b.c.d]"
	Xsub fmtaddr {
	X    local(@octets) = &inetaddr($_[0]);	# list context: unpacked octets
	X    sprintf("[%d.%d.%d.%d]", @octets);
	X} 
	X
	X
	X# readsock - read one reply from SOCKET and update the bookkeeping
	X#
	X# Records the sender's clock error in %delta, clears its %missed count,
	X# announces a host that was marked down as being back, and takes the
	X# sender off the list of outstanding queries (%sent / $sent).
	Xsub readsock {
	X    # the sender's port gets a variable of its own: unpacking it into
	X    # the global $port would change where all later queries are sent
	X    local($hisport);
	X
	X    # check recv with a real block; in `warn "...", return' the return
	X    # runs while warn's argument list is being built, so nothing is
	X    # ever printed
	X    $histime = '';
	X    unless ($hisaddr = recv(SOCKET, $histime, 4, 0)) {
	X	warn "couldn't recv: $!\n";
	X	return;
	X    }
	X
	X    ($addrtype, $hisport, $iaddr) = &sockaddr($hisaddr);
	X
	X    # the time service replies with a 32-bit count of seconds since
	X    # 1900 in network byte order, so unpack with 'N', not native 'L'
	X    $histime = unpack('N',$histime);
	X    $histime -= $SINCE_1970;
	X
	X    unless (($name,$aliases,$addrtype,$length,@addrs) =
	X		gethostbyaddr($iaddr,$addrtype)) 
	X    {
	X	printf STDERR "received reply from unknown address %s\n",
	X				&fmtaddr($iaddr);
	X	return;		# leave the sub properly rather than with `next'
	X    } 
	X    $name =~ s/\.convex\.com$//;
	X
	X    printf "%-12s %-16s thinks it's %s", 
	X		$name, &fmtaddr($iaddr), &ctime($histime) if $debug;
	X
	X    # absolute difference between his clock and ours
	X    $delta = ($histime - time);
	X    $delta = -$delta if $delta < 0;
	X    $delta{$name} = $delta;
	X
	X    delete $missed{$name};
	X
	X    if ($down{$name}) {
	X	printf "%-12s %-16s back at %s",
	X		$name, &fmtaddr($iaddr), &ctime(time);
	X	delete $down{$name};
	X    } 
	X
	X    print "funny, i didn't send $name anything\n" unless $hosts{$name};
	X
	X    # count a reply only when a query to this host is outstanding, so
	X    # duplicate or unsolicited packets cannot end the wait for the
	X    # other hosts early
	X    if ($sent{$name}) {
	X	delete $sent{$name};
	X	$sent--;
	X    }
	X}
	X
	X# readtty - read one keystroke and run the command bound to it in %cmds
	Xsub readtty {
	X    local($key) = getc;
	X    local($handler);
	X
	X    # keep only the low seven bits of the character
	X    $key = sprintf("%c", ord($key) & 0x7f);
	X
	X    if (!defined $cmds{$key}) {
	X	printf " -- unknown command: `%s' (0x%02x)\n", $key, ord($key);
	X    } else {
	X	$handler = $cmds{$key};
	X	if ($handler ne 'quit') {
	X	    print "\n", $dashes;
	X	}
	X	&$handler;
	X	print $dashes;
	X    } 
	X} 
	X
	X# quit - ignore SIGTTOU, take the terminal out of cbreak mode
	X# (echo back on), and exit with status 0
	Xsub quit {
	X    $SIG{'TTOU'} = 'IGNORE';
	X    &set_cbreak(0);
	X    exit(0);
	X} 
	X
	X# help - list every command key with the name of its routine
	Xsub help {
	X    local($key);
	X
	X    print "Key\tCommand\n";
	X    foreach $key (sort keys %cmds) {
	X	print $key, "\t", $cmds{$key}, "\n";
	X    } 
	X} 
	X
	X# timers - list the hosts whose recorded clock difference (%delta)
	X# is greater than $timewarp seconds
	Xsub timers {
	X    local($who);
	X
	X    print "Bad Clocks exceeding $timewarp seconds\n";
	X    foreach $who (sort keys %delta) {
	X	if ($delta{$who} > $timewarp) {
	X	    printf "%-12s %-16s has a clock that's %4d seconds off\n", 
	X		$who, &fmtaddr($hosts{$who}), $delta{$who};
	X	}
	X    }
	X}
	X
	X
	X# missing - list every host in %missed with its count of missed timeouts
	Xsub missing {
	X    local($who, $plural);
	X
	X    print "Missing Hosts\n";
	X    foreach $who (sort keys %missed) {
	X	# a single miss gets a blank in place of the plural "s"
	X	$plural = "s";
	X	$plural = " " if $missed{$who} == 1;
	X
	X	printf "%-12s %-16s has missed %d timeout%s of %d seconds\n",
	X	    $who, &fmtaddr($hosts{$who}), $missed{$who}, $plural, $timeout;
	X    }
	X} 
	X
	X# downers - list every host in %down with the time recorded for it
	Xsub downers {
	X    local($who);
	X
	X    print "Down Hosts\n";
	X    foreach $who (sort keys %down) {
	X	printf("%-12s %-16s down since %s",
	X	    $who, &fmtaddr($hosts{$who}), &ctime($down{$who}));
	X    } 
	X} 
	X
	X# uppers - list every monitored host that is not marked in %down
	Xsub uppers {
	X    local($who);
	X
	X    print "Up Hosts\n";
	X
	X    foreach $who (sort keys %hosts) {
	X	printf("%-12s up\n", $who) unless $down{$who};
	X    } 
	X} 
	X
	X# continue - SIGCONT handler: say so, then switch cbreak mode back on
	Xsub continue { 
	X    print "continuing...\n";
	X    &set_cbreak(1);
	X}
	X
	X# cbreak - turn cbreak mode on and echo off for STDIN
	Xsub cbreak {
	X    local($enable) = 1;
	X    &set_cbreak($enable);
	X} 
	X
	X# cooked - turn cbreak mode off and echo back on for STDIN
	Xsub cooked {
	X    local($enable) = 0;
	X    &set_cbreak($enable);
	X} 
	X
	X# set_cbreak - fetch the terminal settings of STDIN, flip the CBREAK
	X# and ECHO bits, and store the settings back
	X#   $on  true:  CBREAK set, ECHO cleared
	X#        false: CBREAK cleared, ECHO set
	X# Dies if either ioctl fails.
	Xsub set_cbreak {
	X    local($on) = @_;
	X    local($flags);
	X
	X    die "Can't ioctl TIOCGETP: $!"
	X	unless ioctl(STDIN,&TIOCGETP,$sgttyb);
	X
	X    # the flag word is the last field of the $sgttyb_t template
	X    @ary = unpack($sgttyb_t,$sgttyb);
	X    $flags = $ary[4];
	X
	X    if ($on) {
	X	$flags = ($flags | &CBREAK) & ~&ECHO;
	X    } else {
	X	$flags = ($flags & ~&CBREAK) | &ECHO;
	X    }
	X
	X    $ary[4] = $flags;
	X    $sgttyb = pack($sgttyb_t,@ary);
	X
	X    die "Can't ioctl TIOCSETP: $!"
	X	unless ioctl(STDIN,&TIOCSETP,$sgttyb);
	X}
SHAR_EOF
if test 9588 -ne "`wc -c < 'statmon'`"
then
	echo shar: "error transmitting 'statmon'" '(should have been 9588 characters)'
fi
chmod 775 'statmon'
fi
exit 0
#	End of shell archive
--

    Tom Christiansen                       {uunet,uiucdcs,sun}!convex!tchrist 
    Convex Computer Corporation                            tchrist@convex.COM
		 "EMACS belongs in <sys/errno.h>: Editor too big!"