mjb (02/24/83)
>>>4.1/4.1aBSD<<< Description: ------------ The swap space mapping works correctly only if the size of the per-disk swap space is a multiple of DMMAX blocks (thus the use of 33440 in all those swap*.c files is an error). This is because the code which initializes the swap map only frees in chunks of size DMMAX, and frees one block too many if the per-disk swap size is not a multiple of DMMAX. Symptoms: --------- The observable symptom is a panic (IO err in push) when you actually try to use the blocks erroneously "freed" at the end of the per-disk swap space (it tries to write past the end of the file system). For those of you who have no idea what I'm talking about: --------------------------------------------------------- The compile-time parameter DMMAX is used for two different purposes in 4BSD: 1) The virtual address space of a growing process is expanded by powers of 2 (starting with 16K) until you hit DMMAX (normally 1024 disk blocks, or 512K), at which point you just add DMMAX. 2) The swap space is interleaved across devices in DMMAX-sized chunks (i.e., if you have 2 swap devices, the first DMMAX blocks of swap space come from the first device, the second chunk from the second device, the third from the first, etc.). Where you get into trouble is that the system claims you have a full DMMAX-sized chunk at the end of the swap device, but if the device's size is not a multiple of DMMAX, you lose. Note that this usually means you are getting real close to overflowing the swap area anyhow, but it's nicer if the system doesn't crash because of it (it should just abort the expansion request). Fixes: ------ Well, you could just redefine your "nswap" in your /sys/dev/swap*.c file to be a multiple of DMMAX blocks, thereby losing some swap space (less than a half meg/device), or redefine DMMAX to evenly divide "nswap" (not recommended!), or you can install the following code. 
This fix leaves recognized, unallocatable holes in the swap map, thus "padding" each file system used to match the length of the longest one rounded up to a multiple of DMMAX. This has no effect on performance and no other unpleasant side effects (i.e., there's no reason NOT to do it). As a goodie, you get to have different size file systems on different disks for swapping (it's not really necessary to the fix, but it was trivial to do, and we wanted it). The fix has been running for over a year with no problems. In /sys/dev/sw.c: /* * Swfree(index) frees the index'th portion of the swap map. * Each of the nswdev devices provides 1/nswdev'th of the swap * space, which is laid out with blocks of DMMAX pages circularly * among the devices. #ifdef BRUNIX * Each of the nswdev devices physically provides UP TO * 1/nswdev'th of the swap map, depending on its length; * thus there will be holes in the swap space (which are, * of course, not freed) if the swap devices have different * lengths or do not contain a multiple of DMMAX blocks. * Nswap is computed at boot to be nswdev*(size of the largest * swap device rounded up to the nearest multiple of DMMAX). #endif */ swfree(index) int index; { register swblk_t vsbase; register int blk; #ifdef BRUNIX register swblk_t dvbase; register int nblks; swdevt[index].sw_freed = 1; nblks = swdevt[index].sw_nblks; for (dvbase = 0; dvbase < nblks; dvbase += DMMAX) { blk = nblks - dvbase; if ((vsbase = index*DMMAX + dvbase*nswdev) >= nswap) panic("swfree"); #else swdevt[index].sw_freed = 1; for (vsbase = index*DMMAX; vsbase < nswap; vsbase += nswdev*DMMAX) { blk = nswap - vsbase; #endif if (blk > DMMAX) blk = DMMAX; . . . } } ------------------------------------------------------------------- In binit() in /sys/sys/main.c: /* * Count swap devices, and adjust total swap space available. * Some of this space will not be available until a vswapon() * system is issued, usually when the system goes multi-user. 
#ifdef BRUNIX * compute swap space "size" as: * (# of swap devices) * ((size of largest) rounded up to * nearest multiple of DMMAX) #endif */ nswdev = 0; #ifdef BRUNIX nswap = 0; for (swp = swdevt; swp->sw_dev; swp++) { nswdev++; if (swp->sw_nblks > nswap) nswap = swp->sw_nblks; } nswap = ((nswap + DMMAX - 1) / DMMAX) * DMMAX; #else for (swp = swdevt; swp->sw_dev; swp++) nswdev++; #endif if (nswdev == 0) panic("binit"); nswap *= nswdev; maxpgio *= nswdev; swfree(0); } ----------------------------------------------------- A sample /sys/dev/swaphp.c: #include "../h/param.h" #include "../h/conf.h" /* * Single rp0?/rm?? configuration * root on hp0a * paging on hp0b */ dev_t rootdev = makedev(0, 0); /* hp0a */ dev_t pipedev = makedev(0, 0); /* hp0a */ dev_t argdev = makedev(0, 1); /* hp0b */ dev_t dumpdev = makedev(0, 1); /* hp0b */ int dumplo = 33440 - 4 * 2048; /* 4 Mbytes from end */ #ifdef BRUNIX /* * Nswap is the size in disk blocks of the extent of * the swap space. It will have holes depending on the * relative sizes of the swap devices and whether their * sizes are multiples of DMMAX. It is filled in at boot. */ int nswap; #else /* * Nswap is the basic number of blocks of swap per * swap device, and is multiplied by nswdev after * nswdev is determined at boot. */ int nswap = 33440; #endif BRUNIX struct swdevt swdevt[] = { #ifdef BRUNIX /* device, swap on flag, length */ makedev(0, 1), 0, 33440, /* hp0b */ #else makedev(0, 1), 0, /* hp0b */ #endif 0, 0, 0, }; -------------------------------------------------------- in /sys/h/conf.h: /* * Swap device information */ struct swdevt { dev_t sw_dev; int sw_freed; #ifdef BRUNIX int sw_nblks; #endif }; --------------------------------------------------------- Mike Braca, Brown CS, ..!decvax!brunix!mjb, mjb.brown@udel-relay