[net.sources] Unhang TCP connections stuck in FIN_WAIT_2 state

dennis@rlgvax.UUCP (02/05/86)

"Fixtcp" is a shell script which is useful for getting rid of 4.2bsd TCP
connections hung in the FIN_WAIT_2 state.

Steps:

	Save this file in some directory.
	Remove the leading lines from this file so that "#! /bin/sh"
	is the first line.
	Type "sh file", where file is the name of this file.
	Read fixtcp.mk (for example, "cat fixtcp.mk") and follow its directions.

Enjoy,
-dennis


#--------------- CUT HERE ---------------
#! /bin/sh
# This is a shell archive, meaning:
# 1. Remove everything above the #! /bin/sh line.
# 2. Save the resulting text in a file.
# 3. Execute the file with /bin/sh (not csh) to create the files:
#	_get_tcp_.c
#	fixtcp
#	fixtcp.mk
# This archive created: Wed Feb  5 15:16:26 EST 1986
#
if test -f _get_tcp_.c
then
echo shar: will not over-write existing file '_get_tcp_.c'
else
echo x - _get_tcp_.c
# ............    F  I   L   E      B  E  G  .......... _get_tcp_.c
cat << '\SHAR_EOF' > _get_tcp_.c
/*
 * dennis@rlgvax
 * prints offsets of fields in TCP connection control block.
 * called by fixtcp sh script
 */
#include <stdio.h>
#include <sys/types.h>		/* u_char */
#include <netinet/tcp.h>	/* tcp_seq typedef */
#include <netinet/tcp_timer.h>	/* tcp timers */
#include <netinet/tcp_var.h>	/* tcp connection control block */
#include <netinet/tcp_fsm.h>	/* defines for tcp states */

/* use S3/S5 strrchr(), but on 4.x systems, remap to Berkeley rindex */
#ifdef BSD4
#	define strrchr	rindex
#endif

#define STR_SAME !strcmp
#define STR_DIFF strcmp

/* fw non-int functions */
char *basename();

/* external non-int functions */
extern	char	*strrchr();

/*
 * Print, in hex, one TCP control-block constant selected by argv[1]:
 *	state		offset of t_state within struct tcpcb
 *	2msl		offset of t_timer[TCPT_2MSL] within struct tcpcb
 *	FIN_WAIT2	value of TCPS_FIN_WAIT_2
 *	TIME_CLOSE	value of TCPS_TIME_WAIT
 * Exits 0 after printing a value.  If there is not exactly one argument,
 * or the argument is not one of the keywords above, prints a usage
 * message on stderr and exits 1.
 */
main(argc, argv)
	int	argc;
	char	**argv;
{
	char	*cmd;
	struct	tcpcb	*p = 0;	/* never dereferenced; only member addresses are formed */
	int	known = 1;	/* cleared when argv[1] is not a recognized keyword */

	cmd = basename(argv[0]);

	if (argc == 2)
		{
		/*
		 * With p == 0, the address of a member is its offset.
		 * The address is cast to long and printed with %lx so that
		 * the argument type agrees with the format even where
		 * pointers and ints differ in size.
		 */
		if (STR_SAME(argv[1], "state"))
			printf("0x%lx\n", (long)&p->t_state);	/* state offset */
		else if (STR_SAME(argv[1], "2msl"))
			printf("0x%lx\n", (long)&p->t_timer[TCPT_2MSL]);	/* timer offset */
		else if (STR_SAME(argv[1], "FIN_WAIT2"))
			printf("0x%x\n", TCPS_FIN_WAIT_2);	/* state value */
		else if (STR_SAME(argv[1], "TIME_CLOSE"))
			printf("0x%x\n", TCPS_TIME_WAIT);	/* state value */
		else
			known = 0;

		/* explicit status: do not fall off the end of main */
		if (known)
			exit(0);
		}

	fprintf(stderr, "usage: %s state|2msl|FIN_WAIT2|TIME_CLOSE\n", cmd);
	exit(1);
}

/*
 * Return the last component of a path name: the text that follows the
 * rightmost '/', or the whole string when it contains no '/'.
 * The result points into path; nothing is copied or modified.
 */
char *
basename(path)
	char	*path;
{
	char	*cp;		/* rightmost slash in path, if any */

	if ((cp = strrchr(path, '/')) == NULL)	/* no rightmost slash */
		return path;
	else
		return cp + 1;	/* step past the slash itself */
}
\SHAR_EOF
# ............    F  I   L   E      E  N  D  .......... _get_tcp_.c
fi # end of overwriting check
if test -f fixtcp
then
echo shar: will not over-write existing file 'fixtcp'
else
echo x - fixtcp
# ............    F  I   L   E      B  E  G  .......... fixtcp
cat << '\SHAR_EOF' > fixtcp
#! /bin/sh
# fixtcp
# dennis bednar jan 24 86	dennis@rlgvax.uucp
#
# The "#! /bin/sh" line must stay first.  Without it, csh treats a script
# whose first character is "#" as a csh script, and this is an sh script.
#
# Unhang tcp connections which are stuck in the FIN_WAIT2 state
# These connections can be seen by doing a 4.2 netstat -a command.
#
# Usage:
# invoke as "fixtcp" to display kernel stuff for connections.
# "fixtcp" by itself is HIGHLY RECOMMENDED for the first time!
#
# invoke as "fixtcp fix" to patch kernel memory - you must be root.
# Then do a netstat -a command, and it should have gone away.
#
# CCI only symptom:
# A symptom of this problem is that "startoftp" goes wild restarting
# the receive daemon, and you see a lot of rcvlog.pid files being
# created in the oftp spool directory.
#
# Symptom for everybody else:
# In general, a symptom of this problem is that a tcpopen passive
# will fail with the errno UNIX reason being "Address Already In Use".
#
#
# To correct OFTP problem (CCI only):
# su root
# killoftp; fixtcp fix; startoftp
#
# Internals of how this script works:
# Works by loading the 2 * msl timer (addr+16) in the Connection Control Block
# with a 1, which means it will time out in 1/2 second from now, and
# enter the CLOSE state, and then the CCB will be freed (so you will not
# see it with netstat -a).
# The proper offset for the 2 * msl timer can be seen by examining
# /usr/include/netinet/tcp_var.h include file, plus other tcp*.h files
# in the same directory.
#
# relies on
#	_get_tcp_	a.out file that returns the offset of various
#			fields in a connection control block.
#			There is a _get_tcp_.c file to create this.
#			This was created to avoid problems of offsets
#			being site-dependent, if your OS uses different
#			offsets.
#
#

# don't print full path name of command in error messages
cmd=`basename $0`

# name of state to look for in the netstat command
# state=ESTABLISHED	# debugging
state=FIN_WAIT_2	# really

# get the values of the offsets of the fields in the structure for adb
stateoff=`_get_tcp_ state`	# probably 0x8
timer2msloff=`_get_tcp_ 2msl`	# probably 0x10
FIN_WAIT2=`_get_tcp_ FIN_WAIT2`	# probably 9
FIN_CLOSE=`_get_tcp_ TIME_CLOSE`	# probably 10 (not used below)

# Refuse to go on if _get_tcp_ produced nothing (not built, or not in PATH).
# An empty offset would make adb examine or patch the wrong kernel address.
if [ -z "$stateoff" -o -z "$timer2msloff" -o -z "$FIN_WAIT2" ]
then
	echo "$cmd: cannot get offsets from _get_tcp_; see fixtcp.mk" 1>&2
	exit 1
fi

# remove temp file if SIGHUP, SIGINT, SIGTERM
# (-f: the signal may arrive before the file has been created)
trap "echo $cmd: interrupted; rm -f /tmp/fixtcp.$$; exit 1" 1 2 15

# get kernel address of TCP CCB's in FIN_WAIT2 and save in a temporary file
# (keep only the first blank-separated field; one output line per connection)
netstat -A | grep $state | sed 's/ .*//' >/tmp/fixtcp.$$

# check if we got any addresses
if [ ! -s /tmp/fixtcp.$$ ]
then
#	file doesn't exist or is zero in length, therefore no addresses
	echo "$cmd: Sorry, no tcp connections stuck in $state state."
	rm -f /tmp/fixtcp.$$
	exit 0
fi

echo "Before: only connections in state $state"
netstat -a | grep $state


# cat /tmp/fixtcp.$$	# debug

# see if we want to patch kernel memory or just display it
if [ "$1" = "fix" ]
then
#	patch by writing a 1 into the 2 * msl timer of each hung CCB
	for addr in `cat /tmp/fixtcp.$$`
	do
adb -w /vmunix /dev/kmem <<EOF
0x$addr+$timer2msloff/w 1
\$q
EOF
	done

	sleep 2			# wait for connection to clear

#	make sure it really got unstuck
	netstat -a | grep $state >/tmp/fixtcp.$$
	if [ -s /tmp/fixtcp.$$ ]	# file exists and size > 0
	then
		echo "$cmd: Sorry, TCP connections still hung!!"
		rm -f /tmp/fixtcp.$$
		exit 1
	else
		echo "$cmd: TCP connections in state $state have been unstuck."
	fi
else
#	just display the current state flag and current 2 * msl timer,
#	using the same site-specific offsets the "fix" mode uses
	for addr in `cat /tmp/fixtcp.$$`
	do
		echo "The next two numbers displayed by adb should be $FIN_WAIT2 and 0."
		echo "The state flag value of $FIN_WAIT2 represents the FIN_WAIT_2 state."
		echo "The decimal 0 means the 2 * msl timer is off."
adb /vmunix /dev/kmem <<EOF
0x$addr+$stateoff/d
0x$addr+$timer2msloff/d
\$q
EOF
	done
fi

# cleanup intermediate file
rm -f /tmp/fixtcp.$$

echo "After: only connections in state $state"
netstat -a | grep $state
exit 0
\SHAR_EOF
# ............    F  I   L   E      E  N  D  .......... fixtcp
fi # end of overwriting check
if test -f fixtcp.mk
then
echo shar: will not over-write existing file 'fixtcp.mk'
else
echo x - fixtcp.mk
# ............    F  I   L   E      B  E  G  .......... fixtcp.mk
cat << '\SHAR_EOF' > fixtcp.mk
#
# dennis@rlgvax 2/4/86
#
# fixtcp.mk	Makefile, this file
# fixtcp	shell script
# _get_tcp_.c	C program
# _get_tcp_	a.out program called by fixtcp
# .fixtcp.mail	header for mail
#
# directions, type
#	make -f fixtcp.mk	# to make necessary files
#	edit fixtcp.mk and change INSTALLDIR
#	make -f fixtcp.mk install
#	cd $INSTALLDIR		# directory where you really installed it
#	fixtcp			# to display tcp connections hung in finwait2
#
#				# don't do this if you have none to unstick
#	su root			# required for adb write mode
#	fixtcp fix		# to actually unstick tcp connections

# change this at your site
INSTALLDIR = .

# _get_tcp_.c remaps strrchr() to rindex() when BSD4 is defined.
# Uncomment the next line if your C library has rindex() but no strrchr().
# CFLAGS = -DBSD4

# _get_tcp_ is built from _get_tcp_.c by make's built-in rules
all: _get_tcp_

clean:
	rm -f _get_tcp_

# The cp commands are allowed to fail (leading "-"), as they do when
# INSTALLDIR is the current directory.  The chmod is needed because
# unpacking the shell archive does not make fixtcp executable.
install: _get_tcp_
	-cp _get_tcp_ $(INSTALLDIR)
	-cp fixtcp $(INSTALLDIR)
	chmod 755 $(INSTALLDIR)/_get_tcp_ $(INSTALLDIR)/fixtcp

# distribute the latest version to the world, private for dennis@rlgvax
dist:
	rm -rf /tmp/dpb
	mkdir /tmp/dpb
	cp fixtcp.mk /tmp/dpb
	cp _get_tcp_.c /tmp/dpb
	cp ../cmd/fixtcp /tmp/dpb
	cp .fixtcp.mail	/tmp/dpb
	(cd /tmp/dpb; makeshar * >>.fixtcp.mail)

# please note that .fixtcp.mail was chosen so that makeshar *
# doesn't try to append to itself.
\SHAR_EOF
# ............    F  I   L   E      E  N  D  .......... fixtcp.mk
fi # end of overwriting check
# end of shell archive
exit 0
-- 
-Dennis Bednar
{decvax,ihnp4,harpo,allegra}!seismo!rlgvax!dennis	UUCP

steve@umcp-cs.UUCP (Steve D. Miller) (02/06/86)

   Here's a better fix for the 4.2 FIN_WAIT_2 problem.  I don't remember
where I got it, but it works; the basic problem is that the code to
drop the connection when nothing cares about it is there in vanilla
4.2BSD, but is in the wrong place.  This fix moves it to the right
spot and mungs a conditional a little bit...

   The fix is to netinet/tcp_input.c; I think this is for vanilla
4.2, but your line numbers may vary.


*** Vanilla (??) 4.2 tcp_input.c	Fri Jan 24 12:24:00 1986
--- Fixed 4.2 tcp_input.c	Fri Jan 24 12:24:03 1986
***************
*** 358,363 ****
--- 358,372 ----
  			goto dropafterack;
  		if (ti->ti_len > 0) {
  			m_adj(m, ti->ti_len);
+   			/*
+  			 * If data is received on a connection after the
+  			 * user processes are gone, then RST the other end.
+  			 */
+  			if ((so->so_state & SS_NOFDREF) 
+  			    && tp->t_state > TCPS_CLOSE_WAIT) {
+  				tp = tcp_close(tp);
+  				goto dropwithreset;
+  			}
  			ti->ti_len = 0;
  			ti->ti_flags &= ~(TH_PUSH|TH_FIN);
  		}
***************
*** 404,419 ****
  			ti->ti_len -= todrop;
  			ti->ti_flags &= ~(TH_PUSH|TH_FIN);
  		}
- 	}
- 
- 	/*
- 	 * If data is received on a connection after the
- 	 * user processes are gone, then RST the other end.
- 	 */
- 	if ((so->so_state & SS_NOFDREF) && tp->t_state > TCPS_CLOSE_WAIT &&
- 	    ti->ti_len) {
- 		tp = tcp_close(tp);
- 		goto dropwithreset;
  	}
  
  	/*
--- 413,418 ----


-- 
Spoken: Steve Miller 	ARPA:	steve@mimsy.umd.edu	Phone: +1-301-454-4251
CSNet:	steve@umcp-cs 	UUCP:	{seismo,allegra}!umcp-cs!steve
USPS: Computer Science Dept., University of Maryland, College Park, MD 20742