[net.bugs.4bsd] Swap space mapping bug fix

mjb (02/24/83)

>>>4.1/4.1aBSD<<<

Description:
------------
The swap space mapping works correctly only if the size of
the per-disk swap space is a multiple of DMMAX blocks (thus the
use of 33440 in all those swap*.c files is an error). This is because
the code which initializes the swap map only frees in chunks of size
DMMAX, and frees one block too many if the per-disk swap size is not
a multiple of DMMAX.

Symptoms:
---------
The observable symptom is a panic (IO err in push) when you actually try
to use the blocks erroneously "freed" at the end of the per-disk swap space
(it tries to write past the end of the file system).

For those of you who have no idea what I'm talking about:
---------------------------------------------------------
The compile-time parameter DMMAX is used for two different purposes in 4BSD:
1) The virtual address space of a growing process is expanded by powers
of 2 (starting with 16K) until you hit DMMAX (normally 1024 disk blocks, or
512K), at which point you just add DMMAX.
2) The swap space is interleaved across devices in DMMAX-sized chunks (i.e. if
you have 2 swap devices the first DMMAX blocks of swap space come from the
first device, the second chunk from the second device, the third from the
first, etc). Where you get into trouble is the system claims you have a full
DMMAX-sized chunk at the end of the swap device, but if the device's size is
not a multiple of DMMAX, you lose. Note that this usually means you are getting
real close to overflowing the swap area anyhow, but it's nicer if the system
doesn't crash because of it (it should just abort the expansion request).

Fixes:
------
Well, you could just redefine your "nswap" in your /sys/dev/swap*.c file
to be a multiple of DMMAX blocks, thereby losing some swap space (less than
a half meg/device), or redefine DMMAX to evenly divide "nswap" (not
recommended!), or you can install the following code.

This fix is to leave recognized, unallocatable holes in the swap map, thus
"padding" each file system used to match the length of the longest one
rounded up to a multiple of DMMAX.

This has no effect on performance and no other unpleasant side effects (i.e.,
there's no reason NOT to do it). As a goodie, you get to have different-size
file systems on different disks for swapping (this isn't really
necessary to the fix, but it was trivial to do, and we wanted it).
The fix has been running for over a year with no problems.


In /sys/dev/sw.c:

/*
 * Swfree(index) frees the index'th portion of the swap map.
 * Each of the nswdev devices provides 1/nswdev'th of the swap
 * space, which is laid out with blocks of DMMAX pages circularly
 * among the devices.
#ifdef BRUNIX
 *	Each of the nswdev devices physically provides UP TO
 *	1/nswdev'th of the swap map, depending on its length;
 *	thus there will be holes in the swap space (which are,
 *	of course, not freed) if the swap devices have different
 *	lengths or do not contain a multiple of DMMAX blocks.
 *	Nswap is computed at boot to be nswdev*(size of the largest
 *	swap device rounded up to the nearest multiple of DMMAX).
#endif
 */
swfree(index)
	int index;		/* which swdevt[] entry (swap device) to free */
{
	register swblk_t vsbase;	/* chunk start within the interleaved swap space */
	register int blk;		/* number of blocks in the current chunk */
#ifdef BRUNIX
	register swblk_t dvbase;	/* chunk start on the device itself */
	register int nblks;		/* length of this device, from sw_nblks */

	swdevt[index].sw_freed = 1;
	nblks = swdevt[index].sw_nblks;
	/*
	 * Walk the device's own blocks one DMMAX-sized chunk at a time,
	 * so the final chunk is clipped to what the device really has.
	 */
	for (dvbase = 0; dvbase < nblks; dvbase += DMMAX) {
		/* blocks remaining on this device from dvbase onward */
		blk = nblks - dvbase;
		/*
		 * Chunks are interleaved circularly among the nswdev devices,
		 * so this device's chunk at dvbase lives at virtual address
		 * index*DMMAX + dvbase*nswdev.  That must fall inside the
		 * nswap extent; otherwise the sizes are inconsistent.
		 */
		if ((vsbase = index*DMMAX + dvbase*nswdev) >= nswap)
			panic("swfree");
#else
	swdevt[index].sw_freed = 1;
	/*
	 * Walk the virtual swap space starting at this device's first
	 * chunk, striding over the chunks owned by the other devices.
	 */
	for (vsbase = index*DMMAX; vsbase < nswap; vsbase += nswdev*DMMAX) {
		/* blocks remaining to the end of the whole swap space */
		blk = nswap - vsbase;
#endif
		/* never handle more than one DMMAX chunk per iteration */
		if (blk > DMMAX)
			blk = DMMAX;
		/*
		 * Remainder of the loop body is elided in this posting;
		 * presumably it frees blk blocks at vsbase -- confirm
		 * against the full sw.c.
		 */
		.
		.
		.
	}
}

-------------------------------------------------------------------

In binit() in /sys/sys/main.c:
	/*
	 * Count swap devices, and adjust total swap space available.
	 * Some of this space will not be available until a vswapon()
	 * system call is issued, usually when the system goes multi-user.
#ifdef BRUNIX
	 *	compute swap space "size" as:
	 *	(# of swap devices) * ((size of largest) rounded up to
	 *					nearest multiple of DMMAX)
#endif
	 */
	nswdev = 0;
#ifdef BRUNIX
	nswap = 0;
	for (swp = swdevt; swp->sw_dev; swp++) {
		nswdev++;
		if (swp->sw_nblks > nswap)
			nswap = swp->sw_nblks;
	}
	nswap = ((nswap + DMMAX - 1) / DMMAX) * DMMAX;
#else
	for (swp = swdevt; swp->sw_dev; swp++)
		nswdev++;
#endif
	if (nswdev == 0)
		panic("binit");
	nswap *= nswdev;
	maxpgio *= nswdev;
	swfree(0);
}
-----------------------------------------------------
A sample /sys/dev/swaphp.c:

#include "../h/param.h"
#include "../h/conf.h"
/*
 * Single rp0?/rm?? configuration
 *	root on hp0a
 *	paging on hp0b
 */
dev_t	rootdev = makedev(0, 0);	/* hp0a */
dev_t	pipedev = makedev(0, 0);	/* hp0a */
dev_t	argdev	= makedev(0, 1);	/* hp0b */
dev_t	dumpdev = makedev(0, 1);	/* hp0b */
/*
 * 33440 is the same hp0b length (in disk blocks) that this file uses
 * for the swap size; 4 * 2048 blocks is the 4 Mbytes backed off from
 * the end of that area.
 */
int	dumplo	= 33440 - 4 * 2048;	/* 4 Mbytes from end */

#ifdef BRUNIX
/*
 * Nswap is the size in disk blocks of the extent of
 * the swap space. It will have holes depending on the
 * relative sizes of the swap devices and whether their
 * sizes are multiples of DMMAX. It is left zero here and
 * filled in at boot by binit() from the sw_nblks entries
 * of swdevt[].
 */
int	nswap;
#else
/*
 * Nswap is the basic number of blocks of swap per
 * swap device, and is multiplied by nswdev after
 * nswdev is determined at boot.
 */
int	nswap = 33440;
#endif /* BRUNIX */

/*
 * Table of swap devices.  One row per device; the table is
 * terminated by a row whose device field is zero (binit() and
 * friends stop scanning at the first zero sw_dev).
 * Each row is braced so that it always matches the field count of
 * struct swdevt for the variant being built.
 */
struct	swdevt swdevt[] =
{
#ifdef BRUNIX	/* device, swap on flag, length */
	{ makedev(0, 1),	0,	33440 },	/* hp0b */
	{ 0,		0,	0 },		/* end of table */
#else		/* device, swap on flag */
	{ makedev(0, 1),	0 },		/* hp0b */
	{ 0,		0 },			/* end of table */
#endif
};
--------------------------------------------------------
in /sys/h/conf.h:

/*
 * Swap device information
 */
struct swdevt
{
	dev_t	sw_dev;		/* the swap device; a zero value ends the swdevt[] table */
	int	sw_freed;	/* set to 1 by swfree() once this device has been freed */
#ifdef BRUNIX
	int	sw_nblks;	/* length of this swap device in disk blocks */
#endif
};
---------------------------------------------------------
Mike Braca, Brown CS, ..!decvax!brunix!mjb, mjb.brown@udel-relay