[comp.windows.x] Using xdm on IBM RTs running AOS 4.3/Sept 88

ehrlich@cs.psu.edu (Daniel Ehrlich) (09/15/89)

Hello,

We have a number of IBM RT 6152 Academic Systems running AOS 4.3 (aka
BSD 4.3).  It would be nice if we could coerce our users into using X
windows on these machines.  Xdm looked like the way to go, but we have
noticed that when a user logs out zombied processes tend to collect
until the process table is full and the machine must be rebooted.

Xdm is set up to restart, rather than reset, the server as the Xibm
server tends to grow without bound if not restarted.  Has anyone else
tried using xdm on RTs running AOS?  Have any of those experienced
the same problem with zombied processes?  I would appreciate hearing
from anyone who has a clue as to why they are accumulating and/or how
to prevent them from occurring in the first place.

--
Dan Ehrlich <ehrlich@cs.psu.edu>   | "A message is not a message until the
The Pennsylvania State University  | rules for interpreting it are in the
Department of Computer Science     | hands of the receiver."
University Park, PA   16802        |    --Apollo Belvedere Smith

john@acorn.co.uk (John Bowler) (09/19/89)

In article <EHRLICH.89Sep15084200@shire.cs.psu.edu> ehrlich@cs.psu.edu (Daniel Ehrlich) writes:
>
>We have a number of IBM RT 6152 Academic Systems running AOS 4.3 (aka
>BSD 4.3).  It would be nice if we could coerce our users into using X
>windows on these machines.  Xdm looked like the way to go, but we have
>noticed that when a user logs out zombied processes tend to collect
>until the process table is full and the machine must be rebooted.
>
>Xdm is set up to restart, rather than reset, the server as the Xibm
>server tends to grow without bound if not restarted.  Has anyone else
>tried using xdm on RTs running AOS?

No - but I have suffered from zombied xdm's on a system where the server
shuts down instead of resetting when all the client connections are
closed.  (This was done for precisely the same reasons - to ensure that
a new X session didn't inherit an enormous server from a previous one).

The problem is that xdm suffers from at least one race condition when
the server dies - heavily loaded machines in particular are likely to
promote it.  The following (context) diffs are to display.c with patches
1-8 (neither patch 9 nor 10 affect it).  They fix the problem on Acorn's
system:-

*** /tmp/,SMSt1022767	Mon Feb 27 19:44:09 1989 (version with patches 1-8)
--- display.c	Thu Jun  1 19:31:11 1989
***************
*** 38,44 ****
  
  static jmp_buf	terminated;
  
! static CatchTerm (), someoneDied (), abortOpen (), StartServer ();
  static WaitForServer (), TerminateServer (), HupServer (), StartSession ();
  
  extern unsigned sleep ();
--- 38,44 ----
  
  static jmp_buf	terminated;
  
! static CatchTerm (), CatchChild (), abortOpen (), StartServer ();
  static WaitForServer (), TerminateServer (), HupServer (), StartSession ();
  
  extern unsigned sleep ();
***************
*** 58,70 ****
  	abort ();
  }
  
! static int	someoneDead;
  
  static
! someoneDied ()
  {
  	Debug ("someone died\n");
! 	someoneDead = 1;
  }
  
  ManageDisplay (d)
--- 58,77 ----
  	abort ();
  }
  
! /*
!  * This modification is to deal with a server (the ARM one) which
!  * insists on shutting down when the last client closes the display.
!  * This exposes an interesting race/bug in this code - WaitForServer
!  * fails to clean up (because the SIGCHLD handler is removed - below).
!  */
! static int	deaths;
! static int	burials;
  
  static
! CatchChild ()
  {
  	Debug ("someone died\n");
! 	++deaths;
  }
  
  ManageDisplay (d)
***************
*** 86,92 ****
  	}
  	(void) signal (SIGTERM, CatchTerm);
  	(void) signal (SIGHUP, CatchHup);
! 	(void) signal (SIGCHLD, someoneDied);
  	(void) signal (SIGPIPE, SIG_IGN);
  	/*
  	 * Step 4: Start server control program
--- 93,99 ----
  	}
  	(void) signal (SIGTERM, CatchTerm);
  	(void) signal (SIGHUP, CatchHup);
! 	(void) signal (SIGCHLD, CatchChild);
  	(void) signal (SIGPIPE, SIG_IGN);
  	/*
  	 * Step 4: Start server control program
***************
*** 95,101 ****
  		Debug ("aborting display %s\n", d->name);
  		exit (1);
  	}
- 	(void) signal (SIGCHLD, SIG_DFL);
  	/*
  	 * keep a session running on this display
  	 */
--- 102,107 ----
***************
*** 109,114 ****
--- 115,121 ----
  			else
  				continue;
  		}
+ 		++burials;
  		if (pid == sessionPid) {
  			Debug ("session died %s\n", d->name);
  			switch (waitVal (status)) {
***************
*** 266,274 ****
  				RegisterCloseOnFork (ConnectionNumber (dpy));
  				return 1;
  			}
! 			if (someoneDead) {
  				pid = wait ((waitType *) 0);
! 				if (pid == serverPid) {
  					Debug ("server died\n");
  					return 0;
  				}
--- 273,282 ----
  				RegisterCloseOnFork (ConnectionNumber (dpy));
  				return 1;
  			}
! 			if (burials < deaths) {
  				pid = wait ((waitType *) 0);
! 				if (pid != (-1)) ++burials;
! 				if (pid == serverPid || pid == (-1) && errno == ECHILD) {
  					Debug ("server died\n");
  					return 0;
  				}
***************
*** 285,292 ****
  	Debug ("giving up on server\n");
  	LogError ("server open failed for %s, giving up\n", d->name);
  	pid = 0;
! 	if (someoneDead)
! 		pid = wait ((waitType *) 0);
  	if (pid != serverPid)
  		TerminateServer (d, serverPid);
  	return 0;
--- 293,301 ----
  	Debug ("giving up on server\n");
  	LogError ("server open failed for %s, giving up\n", d->name);
  	pid = 0;
! 	while (burials < deaths && pid != serverPid)
! 		if ((pid = wait ((waitType *) 0)) != (-1)) ++burials;
! 		else break;
  	if (pid != serverPid)
  		TerminateServer (d, serverPid);
  	return 0;
***************
*** 338,343 ****
--- 347,353 ----
  				(void) alarm (d->openTimeout);
  				pid = wait ((waitType *) 0);
  				(void) alarm (0);
+ 				if (pid >= 0) ++burials;
  				if (pid == serverPid)
  					break;
  			}