allbery@uunet.UU.NET (Brandon S. Allbery - comp.sources.misc) (07/09/89)
Posting-number: Volume 7, Issue 63 Submitted-by: gwyn@BRL.MIL Archive-name: tot_info #!/bin/sh # Self-unpacking archive format. To unbundle, sh this file. echo 'Makefile' 1>&2 cat >'Makefile' <<'END OF Makefile' # Makefile -- -- makefile for "tot_info" utility # last edit: 89/02/06 D A Gwyn # SCCS ID: @(#)tot_info.mk 1.1 (edited for publication) PRODUCT = tot_info MAKEFIL = Makefile HEADERS = chisq.h gamma.h CFILES = $(PRODUCT).c info.c gamma.c chisq.c # chisq.c is a "freebie" OBJECTS = $(PRODUCT).o info.o gamma.o # chisq.o LIBTEST = g_test TSRC = $(LIBTEST).c TOBJ = $(LIBTEST).o TFILES = $(PRODUCT).in $(PRODUCT).exp LIBES = -lm LLIBES = $(LIBES) MANSRCS = $(PRODUCT).1 chisq.3 gamma.3 # "standard" UNIX utilities that may need invocations tweaked: INS = cp LINT = lint FLOW = cflow XREF = cxref -c -s -w132 # printer configuration: LPR = lp PR = pr -f # DWB configuration: TOPT = -Ti300 TMAC = -man EQN = eqn $(TOPT) TROFF = troff $(TOPT) $(TMAC) TPOST = | dimp $(PRODUCT): $(OBJECTS) $(CC) $(LDFLAGS) -o $@ $(OBJECTS) $(LIBES) chisq.o info.o: chisq.h gamma.o: gamma.h print: $(MAKEFIL) $(HEADERS) $(CFILES) $(TFILES) $(MANSRCS) $(PR) $(MAKEFIL) $(HEADERS) $(CFILES) $(TFILES) $(MANSRCS) | $(LPR) typeset: $(MANSRCS) $(EQN) $(PRODUCT).1 | $(TROFF) $(TPOST) $(EQN) chisq.3 | $(TROFF) $(TPOST) $(EQN) gamma.3 | $(TROFF) $(TPOST) lint: $(HEADERS) $(CFILES) $(LINT) $(CFILES) $(LLIBES) > $@ flow: $(HEADERS) $(CFILES) $(FLOW) $(CFILES) > $@ xref: $(HEADERS) $(CFILES) $(XREF) $(CFILES) > $@ test: gamma_test tot_test tot_test: $(PRODUCT) $(TFILES) ./$(PRODUCT) < $(PRODUCT).in > $(PRODUCT).out @if cmp -s $(PRODUCT).exp $(PRODUCT).out ; \ then echo 'Tested okay' ; \ else echo '*** Test failed; differences:' ; \ diff $(PRODUCT).exp $(PRODUCT).out ; \ exit 1 ; \ fi gamma_test: $(LIBTEST) ./$(LIBTEST) $(LIBTEST): $(TOBJ) gamma.o $(CC) $(LDFLAGS) -o $@ $(TOBJ) gamma.o $(LIBES) $(LIBTEST).o: gamma.h clean: -rm -f $(OBJECTS) $(TOBJ) $(LIBTEST) $(PRODUCT).out \ lint flow xref core mon.out clobber: clean -rm -f $(PRODUCT) END OF Makefile echo 'chisq.3' 1>&2 cat >'chisq.3' <<'END OF chisq.3' '\" e .TH CHISQ 3V VMB '\" last edit: 88/09/19 D A Gwyn '\" SCCS ID: @(#)chisq.3 1.2 (edited for publication) '\" '\" This source must be typeset via "eqn | troff -man" '\" .EQ delim @@ .EN .SH NAME chisq \- independence tests for 2-way contingency tables .SH SYNOPSIS .ds cW (CW\" change to B (without the paren) if you don't have a CW font \f\*(cW #include "chisq.h" .br /* \fPThe following functions are declared in this header file.\f\*(cW */ .sp double ChiSqTbl(int r, int c, const long *f, int *df); .br double InfoTbl(int r, int c, const long *f, int *df);\fP .SH DESCRIPTION \f\*(cWChiSqTbl\fP returns Pearson's @chi sup 2@ statistic for the \f\*(cWr\fP-by-\f\*(cWc\fP contingency table of observed frequencies of occurrence for each combination of row and column categories stored in row-major order in the vector starting at \f\*(cWf\fP. The computed number of degrees of freedom is stored into the location pointed to by \f\*(cWdf\fP. .br .EQ chi sup 2 ~==~ roman sum from i=0 to r-1 roman sum from j=0 to c-1 { left ( { f sub ij ~-~ { f sub { i. } f sub { .j }} over N } right ) sup 2 } over { { f sub { i. } f sub { .j }} over N } .EN where the row and column sums are denoted @f sub { i. }~==~roman sum from j=0 to c-1 f sub ij@ and @f sub { .j }~==~roman sum from i=0 to r-1 f sub ij@ and the total number of occurrences is @N~==~roman sum from i=0 to r-1 roman sum from j=0 to c-1 f sub ij@. Terms involving zero row or column sum are omitted and the returned number of degrees of freedom are correspondingly reduced from the nominal value @(r-1)(c-1)@. .P \f\*(cWInfoTbl\fP returns 2 times Kullback's @"\v'-0.2'^\v'0.2'\h'-\w;^;u'I\^" ( H sub 1 : H sub 2 )@ statistic for a similar contingency table. The number of degrees of freedom to be used when relating this statistic to the @chi sup 2@ distribution is stored into the location pointed to by \f\*(cWdf\fP. (See \s-1DISCUSSION\s0 below.) .br .EQ 2 "\v'-0.2'^\v'0.2'\h'-\w;^;u'I\^" ( H sub 1 : H sub 2 ) ~==~ 2 roman sum from i=0 to r-1 roman sum from j=0 to c-1 { f sub ij log { { N f sub ij } over { f sub { i. } f sub { .j } } } } .EN where the row and column sums @f sub { i. }@ and @f sub { .j }@ and the total number of observations @N@ are the same as for Pearson's @chi sup 2@. .SH DISCUSSION @2 "\v'-0.2'^\v'0.2'\h'-\w;^;u'I\^" ( H sub 1 : H sub 2 )@ represents twice the estimated information in favor of hypothesis @H sub 1@ over hypothesis @H sub 2@ contained in the observed frequencies, where the \fInull hypothesis\fP @H sub 2@ is that the row and column categorizations are statistically independent and the \fIalternative hypothesis\fP @H sub 1@ is that they aren't. This statistic is asymptotically distributed as @chi sup 2@ with @(r-1)(c-1)@ degrees of freedom; therefore by use of the \f\*(cWQChiSq\fP function (see \fIgamma\fP(3V)) it can be converted to the probability that the statistic would be as large as was observed if the categorizations really are independent. This is of course the traditional use of Pearson's @chi sup 2@ statistic, which @2 "\v'-0.2'^\v'0.2'\h'-\w;^;u'I\^"@ approaches for large samples when @H sub 2@ is true. The main difference is that Pearson's @chi sup 2@ is not useful for small sample sizes, whereas @2 "\v'-0.2'^\v'0.2'\h'-\w;^;u'I\^"@ fully takes into account all available information for even the smallest samples. .P @2 "\v'-0.2'^\v'0.2'\h'-\w;^;u'I\^" ( H sub 1 : H sub 2 )@ (along with its corresponding degrees of freedom for use with \f\*(cWQChiSq\fP) is \fIadditive\fP for independent samples, so that the information can be accumulated over the course of many independent experiments in order to improve one's ability to detect a violation of the null hypothesis. .SH CAVEATS Pearson's @chi sup 2@ test is unreliable for low frequencies; consequently it is usually recommended that categories be chosen to tally at least 5 occurrences in each bin. However, excessive combination of bins causes loss of information and consequently loss of discriminating power. Because @2 "\v'-0.2'^\v'0.2'\h'-\w;^;u'I\^"@ is correct for any sample size, it does not require combination of bins and is therefore immune from this problem. .bp .SH EXAMPLE \f\*(cW .ta 8n 16n 24n 32n 40n 48n 56n 64n .nf \& /* Example program to read table dimensions, then frequencies, and to print both statistics along with significance levels. */ #include <stdio.h> #include <stdlib.h> /* for EXIT_SUCCESS */ #include "chisq.h" #include "gamma.h" /* for QChiSq(\|) */ #define MAXTBL 1000 static long f[MAXTBL]; /* frequency tallies */ static int r; /* # of rows */ static int c; /* # of columns */ #define x(i,j) f[(i)*c+(j)] /* convenient way to access freqs */ static void Calc(char *name, double (*func)(int, int, const long *, int *)) { int df; /* degrees of freedom for chi-square */ double stat = (*func)(r, c, f, &df); /* computed statistic */ /* print results */ if ( stat >= 0.0 ) (void)printf("%s = %5.2f\etdf = %2d\etq = %7.4f\en", name, stat, df, QChiSq(stat, df) ); else (void)printf(stat < -3.5 ? "out of memory\en" : stat < -2.5 ? "table too small\en" : stat < -1.5 ? "negative frequency\en" : "too many zeros\en" ); } int main(int argc, char *argv[]) { register int i; /* row index */ register int j; /* column index */ while ( scanf("%d %d\en", &r, &c) == 2 ) /* start new table */ { /* input tallies */ for ( i = 0; i < r; ++i ) for ( j = 0; j < c; ++j ) (void)scanf(" %ld", &x(i,j)); /* compute statistics and print results */ Calc("chisq", ChiSqTbl); Calc("2info", InfoTbl); } return EXIT_SUCCESS; }\fP .bp .SH FILES .ta 3i chisq.h header file .br chisq.o, info.o object modules .SH REFERENCE Solomon Kullback, \fIInformation Theory and Statistics\fP (Dover, 1968). .SH "SEE ALSO" gamma(3V). .SH DIAGNOSTICS Both these functions a return negative value for the statistic when it cannot be meaningfully computed, as follows: .RS .PD 0 .TP \-1.0 too many 0 entries in the table (for \f\*(cWChiSqTbl\fP, this means insufficient degrees of freedom; for \f\*(cWInfoTbl\fP, this means \fIevery\fP entry in the table was 0) .TP \-2.0 some frequency was specified as less than 0 .TP \-3.0 specified number of rows or columns was less than 2 .TP \-4.0 unable to dynamically allocate enough working storage .PD .RE .SH AUTHOR Douglas A.\& Gwyn, BRL/VLD-VMB END OF chisq.3 echo 'chisq.c' 1>&2 cat >'chisq.c' <<'END OF chisq.c' /* ChiSqTbl -- Pearson's chi-square test for a 2-way contingency table last edit: 88/09/19 D A Gwyn SCCS ID: @(#)chisq.c 1.1 (edited for publication) Special return values: -1.0 insufficient degrees of freedom (too many 0 entries) -2.0 invalid table entry (frequency less than 0) -3.0 invalid table dimensions (r or c less than 2) -4.0 unable to allocate enough working storage */ #ifdef __STDC__ #include <stdlib.h> /* malloc, free */ #include "std.h" #else #include "std.h" extern pointer malloc(); extern void free(); #endif double ChiSqTbl( r, c, f, pdf ) int r; /* # rows in table */ int c; /* # columns in table */ const long *f; /* -> r*c frequency tallies */ int *pdf; /* -> return # degrees of freedom */ { #define x(i,j) f[(i)*c+(j)] /* convenient way to access freqs */ register int i; /* row index */ register int j; /* column index */ double N; /* (double)n */ double chisq; /* accumulates chi-square */ double *xi; /* row sums */ double *xj; /* col sums */ int rdf = r - 1; /* row degrees of freedom */ int cdf = c - 1; /* column degrees of freedom */ if ( rdf <= 0 || cdf <= 0 ) { chisq = -3.0; goto ret3; } if ( (xi = (double *)malloc( r * sizeof(double) )) == NULL ) { chisq = -4.0; goto ret3; } if ( (xj = (double *)malloc( c * sizeof(double) )) == NULL ) { chisq = -4.0; goto ret2; } /* compute row sums and total */ N = 0.0; for ( i = 0; i < r; ++i ) { double sum = 0.0; /* accumulator */ for ( j = 0; j < c; ++j ) { register long k = x(i,j); if ( k < 0L ) { chisq = -2.0; goto ret1; } sum += (double)k; } if ( (xi[i] = sum) <= 0.0 ) --rdf; else N += sum; } /* compute column sums */ for ( j = 0; j < c; ++j ) { double sum = 0.0; /* accumulator */ for ( i = 0; i < r; ++i ) sum += (double)x(i,j); if ( (xj[j] = sum) <= 0.0 ) --cdf; } if ( rdf <= 0 || cdf <= 0 ) { chisq = -1.0; goto ret1; } *pdf = rdf * cdf; /* total degrees of freedom */ /* compute chi-square */ chisq = 0.0; for ( i = 0; i < r; ++i ) { double pi = xi[i]; /* row sum */ if ( pi > 0.0 ) for ( j = 0; j < c; ++j ) { double pj = xj[j]; /* column sum */ if ( pj > 0.0 ) { double expected = pi * pj / N; double delta = (double)x(i,j) - expected; chisq += delta * delta / expected; } } } ret1: free( (pointer)xj ); ret2: free( (pointer)xi ); ret3: return chisq; } END OF chisq.c echo 'chisq.h' 1>&2 cat >'chisq.h' <<'END OF chisq.h' /* chisq.h -- definitions for contingency-table routines last edit: 88/09/19 D A Gwyn SCCS ID: @(#)chisq.h 1.1 (edited for publication) */ /* library routines: */ #ifdef __STDC__ extern double ChiSqTbl( int r, int c, const long *f, int *pdf ); extern double InfoTbl( int r, int c, const long *f, int *pdf ); #else extern double ChiSqTbl(); extern double InfoTbl(); #endif END OF chisq.h echo 'g_test.c' 1>&2 cat >'g_test.c' <<'END OF g_test.c' /* g_test -- tests for gamma and related functions last edit: 88/09/09 D A Gwyn SCCS ID: @(#)g_test.c 1.1 (edited for publication) */ #include <stdio.h> #ifdef __STDC__ #include <stdlib.h> /* for NULL etc. */ #endif #include "std.h" #include "gamma.h" #define Printf (void)printf #define TOL 1.0e-5 /* tolerance for checks */ static int errs = 0; /* tally errors */ static double RelDif( a, b ) /* returns relative difference: */ double a, b; /* 0.0 if exactly the same, otherwise ratio of difference to the larger of the two */ { double c = Abs( a ); double d = Abs( b ); d = Max( c, d ); return d == 0.0 ? 0.0 : Abs( a - b ) / d; } static void RCheck( d, r ) /* check real number */ double d; /* real to be checked */ double r; /* expected value */ { if ( RelDif( d, r ) > TOL ) { errs = 1; Printf( "value s.b. %g, was %g\n", r, d ); } } void GTest() { static struct { double in; /* input values */ double exp; /* expected output values */ } tbl[] = /* table of test values */ { { 1.0, 1.0 }, { 2.0, 1.0 }, { 3.0, 2.0 }, { 4.0, 6.0 }, { 5.0, 24.0 }, { 6.0, 120.0 }, { 0.5, 1.7724538509 }, { 1.5, 0.8862269255 }, { 0.25, 3.6256099082 }, { 0.333333333333, 2.6789385347 }, { 0.666666666667, 1.3541179394 }, { 0.75, 1.2254167024 }, { 10.0, 362880.0 }, { 20.0, 1.2164510041e+17 }, }; register int i; /* indexes tbl[] test values */ for ( i = 0; i < sizeof tbl / sizeof tbl[0]; ++i ) RCheck( Gamma( tbl[i].in ), tbl[i].exp ); } void FTest() { static struct { int in; /* input values */ double exp; /* expected output values */ } tbl[] = /* table of test values */ { { 0, 1.0 }, { 1, 1.0 }, { 2, 2.0 }, { 3, 6.0 }, { 4, 24.0 }, { 5, 120.0 }, { 6, 720.0 }, { 10, 3628800.0 }, { 20, 2.4329020082e+18 }, }; register int i; /* indexes tbl[] test values */ for ( i = 0; i < sizeof tbl / sizeof tbl[0]; ++i ) RCheck( Factorial( tbl[i].in ), tbl[i].exp ); } void BCTest() { static struct { int n; /* top parts of inputs */ int k; /* bottom parts of inputs */ double exp; /* expected output values */ } tbl[] = /* table of test values */ { { 1, 0, 1.0 }, { 1, 1, 1.0 }, { 2, 0, 1.0 }, { 2, 1, 2.0 }, { 2, 2, 1.0 }, { 3, 0, 1.0 }, { 3, 1, 3.0 }, { 3, 2, 3.0 }, { 5, 3, 10.0 }, { 10, 4, 210.0 }, { 10, 5, 252.0 }, { 40, 6, 3838380.0 }, { 50, 20, 47129212243960.0}, }; register int i; /* indexes tbl[] test values */ for ( i = 0; i < sizeof tbl / sizeof tbl[0]; ++i ) RCheck( BCoeff( tbl[i].n, tbl[i].k ), tbl[i].exp ); } void ETest() { static struct { double in; /* input values */ double exp; /* expected output values */ } tbl[] = /* table of test values */ { { 0.0, 0.0 }, { 0.1, 0.1124629160 }, { 0.2, 0.2227025892 }, { 0.5, 0.5204998778 }, { 0.8, 0.7421009647 }, { 1.0, 0.8427007929 }, { 1.5, 0.9661051465 }, { 2.0, 0.9953222650 }, }; register int i; /* indexes tbl[] test values */ for ( i = 0; i < sizeof tbl / sizeof tbl[0]; ++i ) RCheck( Erf( tbl[i].in ), tbl[i].exp ); } void QCTest() { static struct { double chisq; /* chi-square limit */ int df; /* degrees of freedom */ double exp; /* expected output values */ } tbl[] = /* table of test values */ { { 0.001, 1, 0.97477 }, { 0.01, 1, 0.92034 }, { 0.01, 2, 0.99501 }, { 0.05, 1, 0.82306 }, { 0.1, 1, 0.75183 }, { 0.1, 2, 0.95123 }, { 1.0, 1, 0.31731 }, { 1.0, 2, 0.60653 }, { 1.0, 3, 0.80125 }, { 1.0, 4, 0.90980 }, { 1.0, 5, 0.96257 }, { 1.5, 2, 0.47237 }, { 2.0, 1, 0.15730 }, { 2.0, 3, 0.57241 }, { 2.0, 5, 0.84915 }, { 4.0, 6, 0.67668 }, { 5.0, 5, 0.41588 }, { 10.0, 7, 0.188573 }, { 10.0, 10, 0.44049 }, { 10.0, 20, 0.96817 }, { 20.0, 30, 0.91654 }, { 40.0, 30, 0.104864 }, { 8.26040, 20, 0.99 }, { 10.8508, 20, 0.95 }, { 12.4426, 20, 0.90 }, { 15.4518, 20, 0.75 }, { 19.3374, 20, 0.50 }, { 23.8277, 20, 0.25 }, { 28.4120, 20, 0.10 }, { 31.4104, 20, 0.05 }, { 37.5662, 20, 0.01 }, }; register int i; /* indexes tbl[] test values */ for ( i = 0; i < sizeof tbl / sizeof tbl[0]; ++i ) RCheck( QChiSq( tbl[i].chisq, tbl[i].df ), tbl[i].exp ); } /*ARGSUSED*/ main( argc, argv ) int argc; char *argv[]; { GTest(); FTest(); BCTest(); ETest(); QCTest(); return errs; } END OF g_test.c echo 'gamma.3' 1>&2 cat >'gamma.3' <<'END OF gamma.3' '\" e .TH GAMMA 3V VMB '\" last edit: 88/09/09 D A Gwyn '\" SCCS ID: @(#)gamma.3 1.2 (edited for publication) '\" '\" This source must be typeset via "eqn | troff -man" '\" .EQ delim @@ .EN .SH NAME gamma \- gamma and related functions .SH SYNOPSIS .ds cW (CW\" change to B (without the paren) if you don't have a CW font \f\*(cW #include "gamma.h" .br /* \fPAll the following functions are declared in this header file.\f\*(cW */ .sp double Gamma(double z); .br double LGamma(double z); .br double Factorial(int n); .br double LFactorial(int n); .br double BCoeff(int n, int k); .br double Beta(double z, double w); .br double PGamma(double a, double x); .br double QGamma(double a, double x); .br double Erf(double x); .br double Erfc(double x); .br double CPoisson(double x, int k); .br double PChiSq(double chisq, int nu); .br double QChiSq(double chisq, int nu); .ft .SH DESCRIPTION \f\*(cWGamma\fP returns the real gamma function value @GAMMA (z)~==~ int "" from 0 to inf t sup z-1 e sup -t dt@ for @z>0@. .P \f\*(cWLGamma\fP returns the value @ln ~ GAMMA (z)@ for @z>0@. .P \f\*(cWFactorial\fP returns the factorial function value @n! ~=~ GAMMA (n+1)@ for @n>=0@. .P \f\*(cWLFactorial\fP returns the value @ln ~n!@\& for @n>=0@. .P \f\*(cWBCoeff\fP returns the binomial coefficient value @size +8 ( pile { n above k } size +8 ) ~==~n! over k!(n-k)!@\& for @0<=k<=n@. .P \f\*(cWBeta\fP returns the real beta function value @B(z,w)~==~ int "" from 0 to 1 t sup z-1 (1-t) sup w-1 dt@ for @z>0@ and @w>0@. .P \f\*(cWPGamma\fP returns the incomplete gamma function value @P(a,x)~==~ 1 over { GAMMA (a) } int "" from 0 to x e sup -t t sup a-1 dt@ for @a>0@ and @0<=x<= inf@. .P \f\*(cWQGamma\fP returns the value @1-P(a,x)@. .P \f\*(cWErf\fP returns the error function value @roman erf (x)~==~2 over sqrt pi int "" from 0 to x e sup {-t sup 2 } dt@. .P \f\*(cWErfc\fP returns the complementary error function value @1- roman erf (x)@. .P \f\*(cWCPoisson\fP returns the cumulative Poisson probability value @P sub x (<k)@, which is the probability that the number of Poisson random events occurring will be between 0 and @k-1@ inclusive, where the expected mean is @x@. .P \f\*(cWPChiSq\fP returns the value @P( chi sup 2 | nu )@, which is the probability that the observed chi-square for a correct model will be less than @chi sup 2@, where @nu@ is the number of degrees of freedom. .P \f\*(cWQChiSq\fP returns the value @1-P( chi sup 2 | nu )@, which is the probability that the observed chi-square will exceed @chi sup 2@ by chance even for a correct model, where @nu@ is the number of degrees of freedom. .P All these functions return .I approximations to the exact results, generally accurate to about seven significant digits. .SH CAVEATS Overflow can occur. .P Invalid parameters cause a diagnostic and termination of the process. .SH FILES .ta 3i gamma.h header file .br gamma.o object module .SH AUTHORS William H.\& Press, Brian P.\& Flannery, Saul A.\& Teukolsky, and William T.\& Vetterling (\fINumerical Recipes in C\fP, Chapter 6) .br Douglas A.\& Gwyn, BRL/VLD-VMB END OF gamma.3 echo 'gamma.c' 1>&2 cat >'gamma.c' <<'END OF gamma.c' /* Gamma -- gamma and related functions last edit: 88/09/09 D A Gwyn SCCS ID: @(#)gamma.c 1.1 (edited for publication) Acknowledgement: Code based on that found in "Numerical Methods in C". */ #include <assert.h> #include <math.h> #include "std.h" double LGamma( x ) double x; { static const double cof[6] = { 76.18009173, -86.50532033, 24.01409822, -1.231739516, 0.120858003e-2, -0.536382e-5 }; double tmp, ser; register int j; assert(x > 0.0); if ( --x < 0.0 ) /* use reflection formula for accuracy */ { double pix = PI * x; return log( pix / sin( pix ) ) - LGamma( 1.0 - x ); } tmp = x + 5.5; tmp -= (x + 0.5) * log( tmp ); ser = 1.0; for ( j = 0; j < 6; ++j ) ser += cof[j] / ++x; return -tmp + log( 2.50662827465 * ser ); } double Gamma( x ) double x; { return exp( LGamma( x ) ); } double Factorial( n ) register int n; { static double a[33] = { 1.0, 1.0, 2.0, 6.0, 24.0 }; static int ntop = 4; assert(n >= 0); if ( n > 32 ) return Gamma( (double)n + 1.0 ); while ( ntop < n ) { register int j = ntop++; a[ntop] = a[j] * (double)ntop; } return a[n]; } double LFactorial( n ) register int n; { static double a[101]; /* init 0.0 */ assert(n >= 0); if ( n <= 1 ) return 0.0; if ( n <= 100 ) if ( a[n] > 0.0 ) return a[n]; /* table value already set up */ else return a[n] = LGamma( (double)n + 1.0 ); else return LGamma( (double)n + 1.0 ); /* beyond table */ } double BCoeff( n, k ) register int n, k; { assert(k >= 0); assert(n >= k); return Round( exp( LFactorial( n ) - (LFactorial( k ) + LFactorial( n - k )) ) ); } double Beta( z, w ) double z, w; { return exp( LGamma( z ) + LGamma( w ) - LGamma( z + w ) ); } #define ITMAX 100 #define EPS 3.0e-7 static double gser( a, x ) double a, x; { double ap, del, sum; register int n; assert(x >= 0.0); if ( x <= 0.0 ) return 0.0; del = sum = 1.0 / (ap = a); for ( n = 1; n <= ITMAX; ++n ) { sum += del *= x / ++ap; if ( Abs( del ) < Abs( sum ) * EPS ) return sum * exp( -x + a * log( x ) - LGamma( a ) ); } assert(n <= ITMAX); /*NOTREACHED*/ } static double gcf( a, x ) double a, x; { register int n; double gold = 0.0, fac = 1.0, b1 = 1.0, b0 = 0.0, a0 = 1.0, a1 = x; for ( n = 1; n <= ITMAX; ++n ) { double anf; double an = (double)n; double ana = an - a; a0 = (a1 + a0 * ana) * fac; b0 = (b1 + b0 * ana) * fac; anf = an * fac; b1 = x * b0 + anf * b1; a1 = x * a0 + anf * a1; if ( a1 != 0.0 ) { /* renormalize */ double g = b1 * (fac = 1.0 / a1); gold = g - gold; if ( Abs( gold ) < EPS * Abs( g ) ) return exp( -x + a * log( x ) - LGamma( a ) ) * g; gold = g; } } assert(n <= ITMAX); /*NOTREACHED*/ } double PGamma( a, x ) double a, x; { assert(x >= 0.0); assert(a > 0.0); return x < a + 1.0 ? gser( a, x ) : 1.0 - gcf( a, x ); } double QGamma( a, x ) double a, x; { assert(x >= 0.0); assert(a > 0.0); return x < a + 1.0 ? 1.0 - gser( a, x ) : gcf( a, x ); } double Erf( x ) double x; { return x < 0.0 ? -PGamma( 0.5, x * x ) : PGamma( 0.5, x * x ); } double Erfc( x ) double x; { return x < 0.0 ? 1.0 + PGamma( 0.5, x * x ) : QGamma( 0.5, x * x ); } double CPoisson( x, k ) double x; int k; { return QGamma( (double)k, x ); } double PChiSq( chisq, df ) double chisq; int df; { return PGamma( (double)df / 2.0, chisq / 2.0 ); } double QChiSq( chisq, df ) double chisq; int df; { return QGamma( (double)df / 2.0, chisq / 2.0 ); } END OF gamma.c echo 'gamma.h' 1>&2 cat >'gamma.h' <<'END OF gamma.h' /* <gamma.h> -- definitions for gamma-function routines last edit: 88/09/09 D A Gwyn SCCS ID: @(#)gamma.h 1.1 (edited for publication) */ /* library routines: */ #ifdef __STDC__ extern double Gamma( double z ), LGamma( double z ), Factorial( int n ), LFactorial( int n ), BCoeff( int n, int k ), Beta( double z, double w ), PGamma( double a, double x ), QGamma( double a, double x ), Erf( double x ), Erfc( double x ), CPoisson( double x, int k ), PChiSq( double chisq, int nu ), QChiSq( double chisq, int nu ); #else extern double Gamma(), LGamma(), Factorial(), LFactorial(), BCoeff(), Beta(), PGamma(), QGamma(), Erf(), Erfc(), CPoisson(), PChiSq(), QChiSq(); #endif END OF gamma.h echo 'info.c' 1>&2 cat >'info.c' <<'END OF info.c' /* InfoTbl -- Kullback's information measure for a 2-way contingency table last edit: 88/09/19 D A Gwyn SCCS ID: @(#)info.c 1.1 (edited for publication) Special return values: -1.0 entire table consisted of 0 entries -2.0 invalid table entry (frequency less than 0) -3.0 invalid table dimensions (r or c less than 2) -4.0 unable to allocate enough working storage */ #include <math.h> /* for log() */ #if __STDC__ #include <stdlib.h> /* malloc, free */ #include "std.h" #else #include "std.h" extern pointer malloc(); extern void free(); #endif double InfoTbl( r, c, f, pdf ) int r; /* # rows in table */ int c; /* # columns in table */ const long *f; /* -> r*c frequency tallies */ int *pdf; /* -> return # d.f. for chi-square */ { #define x(i,j) f[(i)*c+(j)] /* convenient way to access freqs */ register int i; /* row index */ register int j; /* column index */ double N; /* (double)n */ double info; /* accumulates information measure */ double *xi; /* row sums */ double *xj; /* col sums */ int rdf = r - 1; /* row degrees of freedom */ int cdf = c - 1; /* column degrees of freedom */ if ( rdf <= 0 || cdf <= 0 ) { info = -3.0; goto ret3; } *pdf = rdf * cdf; /* total degrees of freedom */ if ( (xi = (double *)malloc( r * sizeof(double) )) == NULL ) { info = -4.0; goto ret3; } if ( (xj = (double *)malloc( c * sizeof(double) )) == NULL ) { info = -4.0; goto ret2; } /* compute row sums and total */ N = 0.0; for ( i = 0; i < r; ++i ) { double sum = 0.0; /* accumulator */ for ( j = 0; j < c; ++j ) { register long k = x(i,j); if ( k < 0L ) { info = -2.0; goto ret1; } sum += (double)k; } N += xi[i] = sum; } if ( N <= 0.0 ) { info = -1.0; goto ret1; } /* compute column sums */ for ( j = 0; j < c; ++j ) { double sum = 0.0; /* accumulator */ for ( i = 0; i < r; ++i ) sum += (double)x(i,j); xj[j] = sum; } /* compute information measure (four parts) */ info = N * log( N ); /* part 1 */ for ( i = 0; i < r; ++i ) { double pi = xi[i]; /* row sum */ if ( pi > 0.0 ) info -= pi * log( pi ); /* part 2 */ for ( j = 0; j < c; ++j ) { double pij = (double)x(i,j); if ( pij > 0.0 ) info += pij * log( pij ); /* part 3 */ } } for ( j = 0; j < c; ++j ) { double pj = xj[j]; /* column sum */ if ( pj > 0.0 ) info -= pj * log( pj ); /* part 4 */ } info *= 2.0; /* for comparability with chi-square */ ret1: free( (pointer)xj ); ret2: free( (pointer)xi ); ret3: return info; } END OF info.c echo 'std.h' 1>&2 cat >'std.h' <<'END OF std.h' /* std.h -- Douglas A. Gwyn's standard C programming definitions Prerequisite: <math.h> (if you invoke Round()) last edit: 89/06/18 D A Gwyn SCCS ID: @(#)std.h 1.30 The master source file is to be modified only by Douglas A. Gwyn <Gwyn@BRL.MIL>. When installing a VLD/VMB software distribution, this file may need to be tailored slightly to fit the target system. Usually this just involves enabling some of the "kludges for deficient C implementations" at the end of this file. */ #ifndef VLD_STD_H_ #define VLD_STD_H_ /* once-only latch */ /* Extended data types */ typedef int bool; /* Boolean data */ #define false 0 #define true 1 /* ANSI C definitions */ /* Defense against some silly systems defining __STDC__ to random things. */ #ifdef STD_C #undef STD_C #endif #ifdef __STDC__ #if __STDC__ > 0 #define STD_C __STDC__ /* use this instead of __STDC__ */ #endif #endif #ifdef STD_C typedef void *pointer; /* generic pointer */ #else typedef char *pointer; /* generic pointer */ #define const /* nothing */ /* ANSI C type qualifier */ /* There really is no substitute for the following, but these might work: */ #define signed /* nothing */ /* ANSI C type specifier */ #define volatile /* nothing */ /* ANSI C type qualifier */ #endif #ifndef EXIT_SUCCESS #define EXIT_SUCCESS 0 #endif #ifndef EXIT_FAILURE #define EXIT_FAILURE 1 #endif #ifndef NULL #define NULL 0 /* null pointer constant, all types */ #endif /* Universal constants */ #define DEGRAD 57.2957795130823208767981548141051703324054724665642 /* degrees per radian */ #define E 2.71828182845904523536028747135266249775724709369996 /* base of natural logs */ #define GAMMA 0.57721566490153286061 /* Euler's constant */ #define LOG10E 0.43429448190325182765112891891660508229439700580367 /* log of e to the base 10 */ #define PHI 1.618033988749894848204586834365638117720309180 /* golden ratio */ #define PI 3.14159265358979323846264338327950288419716939937511 /* ratio of circumf. to diam. */ /* Useful macros */ /* The comment "UNSAFE" means that the macro argument(s) may be evaluated more than once, so the programmer must realize that the macro doesn't quite act like a genuine function. This matters only when evaluating an argument produces "side effects". */ /* arbitrary numerical arguments and value: */ #define Abs( x ) ((x) < 0 ? -(x) : (x)) /* UNSAFE */ #define Max( a, b ) ((a) > (b) ? (a) : (b)) /* UNSAFE */ #define Min( a, b ) ((a) < (b) ? (a) : (b)) /* UNSAFE */ /* floating-point arguments and value: */ #define Round( d ) floor( (d) + 0.5 ) /* requires <math.h> */ /* arbitrary numerical arguments, integer value: */ #define Sgn( x ) ((x) == 0 ? 0 : (x) > 0 ? 1 : -1) /* UNSAFE */ /* string arguments, boolean value: */ #ifdef gould /* UTX-32 2.0 compiler has problems with "..."[] */ #define StrEq( a, b ) (strcmp( a, b ) == 0) /* UNSAFE */ #else #define StrEq( a, b ) (*(a) == *(b) && strcmp( a, b ) == 0) /* UNSAFE */ #endif /* array argument, integer value: */ #define Elements( a ) (sizeof a / sizeof a[0]) /* integer (or character) arguments and value: */ #define fromhostc( c ) (c) /* map host char set to ASCII */ #define tohostc( c ) (c) /* map ASCII to host char set */ #define tonumber( c ) ((c) - '0') /* convt digit char to number */ #define todigit( n ) ((n) + '0') /* convt digit number to char */ /* Kludges for deficient C implementations */ /*#define strchr index /* 7th Edition UNIX, 4.2BSD */ /*#define strrchr rindex /* 7th Edition UNIX, 4.2BSD */ /*#define void int /* K&R Appendix A followers */ #endif /* VLD_STD_H_ */ END OF std.h echo 'tot_info.1' 1>&2 cat >'tot_info.1' <<'END OF tot_info.1' '\" e .TH TOT_INFO 1V VMB '\" last edit: 89/02/07 D A Gwyn '\" SCCS ID: @(#)tot_info.1 1.2 (edited for publication) '\" '\" This source must be typeset via "eqn | troff -man" '\" .EQ delim @@ .EN .SH NAME tot_info \- total information for multiple 2-way contingency tables .SH SYNOPSIS .ds cW (CW\" change to I (without the paren) if you don't have a CW font .ds cB (CB\" change to B (without the paren) if you don't have a CB font \f\*(cBtot_info\fP .SH DESCRIPTION \f\*(cBtot_info\fP reads multiple contingency-table data sets from the standard input and prints Kullback's information statistic for each set, as well as for the aggregate over all sets, on the standard output. (See \fIchisq\^\fP(3V) for a discussion of this statistic.) .SH "INPUT FORMAT" Input consists of one or more data sets, each constituting a 2-way contingency table (not necessarily all of the same size). A data set may be preceded by any number of blank or comment lines; a comment line has a \f\*(cB#\fP character as the first non-whitespace character on the line. Following the optional comment lines is a header line containing the row and column dimensions of the contingency table (in that order), separated by white space. Finally, the contents of the contingency table (frequency counts) must be given in ``row major'' order. The table may be freely formatted with white-space separators; a row need not be on a single line. .SH "OUTPUT FORMAT" Input comments are copied to the output as they are encountered. Otherwise the output consists solely of an information line for each data set (or a diagnostic if the data is invalid) and a final information line for the aggregate over all data sets (preceded by a blank line). An information line exhibits twice the value of Kullback's @"\v'-0.2'^\v'0.2'\h'-\w;^;u'I\^" ( H sub 1 : H sub 2 )@ statistic, the corresponding number of degrees of freedom, and the probability that the statistic would be as large as was actually observed if the row and column categorizations really were independent. .P The aggregate statistic is valid if the data sets represent independent tests of the same hypothesis. .SH DIAGNOSTICS The diagnostic messages are intended to be self-explanatory. .SH "EXIT STATUS" \f\*(cBtot_info\fP returns a zero exit status if and only if no problems were encountered. .SH EXAMPLE .RS \f\*(cW$ \fP\f\*(cBtot_info .br # MilCrypI.60 biliteral cryptogram (trial pairings against G) .br # G vs B .br 2 10 .br 2 2 2 0 3 0 0 1 0 1 .br 3 1 1 1 1 2 2 1 2 1 .br # G vs D .br 2 10 .br 2 2 2 0 3 0 0 1 0 1 .br 4 1 4 4 1 1 1 3 4 2 .br # G vs J .br 2 10 .br 2 2 2 0 3 0 0 1 0 1 .br 1 1 1 1 1 1 2 1 1 1 .br # G vs L .br 2 10 .br 2 2 2 0 3 0 0 1 0 1 .br 1 4 0 4 3 4 5 3 3 4 .br # G vs N .br 2 10 .br 2 2 2 0 3 0 0 1 0 1 .br 4 1 4 3 1 1 1 2 3 3 .br # G vs Q .br 2 10 .br 2 2 2 0 3 0 0 1 0 1 .br 0 2 0 2 1 1 1 0 1 0 .br # G vs S (correct pairing) .br 2 10 .br 2 2 2 0 3 0 0 1 0 1 .br 1 2 2 0 2 1 0 0 0 1 .br # G vs V .br 2 10 .br 2 2 2 0 3 0 0 1 0 1 .br 1 4 1 3 4 4 4 3 4 3 .br # G vs X .br 2 10 .br 2 2 2 0 3 0 0 1 0 1 .br 0 1 0 1 2 1 1 0 2 0 .br # Since most pairings are incorrect, .br # the aggregate probability is small. .br ^D\fP\f\*(cW .br # MilCrypI.60 biliteral cryptogram (trial pairings against G) .br # G vs B .br 2info = 11.01 df = 9 q = 0.2748 .br # G vs D .br 2info = 12.40 df = 9 q = 0.1915 .br # G vs J .br 2info = 9.00 df = 9 q = 0.4375 .br # G vs L .br 2info = 19.03 df = 9 q = 0.0250 .br # G vs N .br 2info = 10.89 df = 9 q = 0.2830 .br # G vs Q .br 2info = 15.82 df = 9 q = 0.0707 .br # G vs S (correct pairing) .br 2info = 3.11 df = 9 q = 0.9596 .br # G vs V .br 2info = 14.47 df = 9 q = 0.1066 .br # G vs X .br 2info = 15.31 df = 9 q = 0.0826 .br # Since most pairings are incorrect, .br # the aggregate probability is small. .br .sp total 2info = 111.05 df = 81 q = 0.0150 \fP .RE .SH REFERENCES Solomon Kullback, \fIInformation Theory and Statistics\fP (Dover, 1968). .br William F.\& Friedman and Lambros D.\& Callimahos, \fIMilitary Cryptanalytics, Part I \(em Volume 1\fP (reprinted by Aegean Park Press, 1985). .SH "SEE ALSO" chisq(3V). .SH AUTHOR Douglas A.\& Gwyn, BRL/VLD-VMB END OF tot_info.1 echo 'tot_info.c' 1>&2 cat >'tot_info.c' <<'END OF tot_info.c' /* tot_info -- combine information statistics for multiple tables last edit: 89/02/06 D A Gwyn SCCS ID: @(#)tot_info.c 1.1 (edited for publication) */ #include <ctype.h> #include <stdio.h> #include "std.h" #include "chisq.h" #include "gamma.h" /* for QChiSq() */ #ifndef MAXLINE #define MAXLINE 256 #endif #ifndef MAXTBL #define MAXTBL 1000 #endif static char line[MAXLINE]; /* row/column header input line */ static long f[MAXTBL]; /* frequency tallies */ static int r; /* # of rows */ static int c; /* # of columns */ #define x(i,j) f[(i)*c+(j)] /* convenient way to access freqs */ #define COMMENT '#' /* comment character */ /*ARGSUSED*/ int main( argc, argv ) int argc; char *argv[]; { register char *p; /* input line scan location */ register int i; /* row index */ register int j; /* column index */ double info; /* computed information measure */ int infodf; /* degrees of freedom for information */ double totinfo = 0.0; /* accumulated information */ int totdf = 0; /* accumulated degrees of freedom */ while ( fgets( line, MAXLINE, stdin ) != NULL ) /* start new table */ { for ( p = line; *p != '\0' && isspace( (int)*p ); ++p ) ; if ( *p == '\0' ) continue; /* skip blank line */ if ( *p == COMMENT ) { /* copy comment through */ (void)fputs( line, stdout ); continue; } if ( sscanf( p, "%d %d\n", &r, &c ) != 2 ) { (void)fputs( "* invalid row/column line *\n", stderr ); return EXIT_FAILURE; } if ( r * c > MAXTBL ) { (void)fputs( "* table too large *\n", stderr ); return EXIT_FAILURE; } /* input tallies */ for ( i = 0; i < r; ++i ) for ( j = 0; j < c; ++j ) if ( scanf( " %ld", &x(i,j) ) != 1 ) { (void)fputs( "* EOF in table *\n", stderr ); return EXIT_FAILURE; } /* compute statistic */ info = InfoTbl( r, c, f, &infodf ); /* print results */ if ( info >= 0.0 ) { (void)printf( "2info = %5.2f\tdf = %2d\tq = %7.4f\n", info, infodf, QChiSq( info, infodf ) ); totinfo += info; totdf += infodf; } else (void)fputs( info < -3.5 ? "out of memory\n" : info < -2.5 ? "table too small\n" : info < -1.5 ? "negative freq\n" : "table all zeros\n", stdout ); } if ( totdf <= 0 ) { (void)fputs( "\n*** no information accumulated ***\n", stdout ); return EXIT_FAILURE; } (void)printf( "\ntotal 2info = %5.2f\tdf = %2d\tq = %7.4f\n", totinfo, totdf, QChiSq( totinfo, totdf ) ); return EXIT_SUCCESS; } END OF tot_info.c echo 'tot_info.exp' 1>&2 cat >'tot_info.exp' <<'END OF tot_info.exp' # MilCrypI.60 biliteral cryptogram (trial pairings against G) # G vs B 2info = 11.01 df = 9 q = 0.2748 # G vs D 2info = 12.40 df = 9 q = 0.1915 # G vs J 2info = 9.00 df = 9 q = 0.4375 # G vs L 2info = 19.03 df = 9 q = 0.0250 # G vs N 2info = 10.89 df = 9 q = 0.2830 # G vs Q 2info = 15.82 df = 9 q = 0.0707 # G vs S (correct pairing) 2info = 3.11 df = 9 q = 0.9596 # G vs V 2info = 14.47 df = 9 q = 0.1066 # G vs X 2info = 15.31 df = 9 q = 0.0826 # Since most pairings are incorrect, # the aggregate probability is small. total 2info = 111.05 df = 81 q = 0.0150 END OF tot_info.exp echo 'tot_info.in' 1>&2 cat >'tot_info.in' <<'END OF tot_info.in' # MilCrypI.60 biliteral cryptogram (trial pairings against G) # G vs B 2 10 2 2 2 0 3 0 0 1 0 1 3 1 1 1 1 2 2 1 2 1 # G vs D 2 10 2 2 2 0 3 0 0 1 0 1 4 1 4 4 1 1 1 3 4 2 # G vs J 2 10 2 2 2 0 3 0 0 1 0 1 1 1 1 1 1 1 2 1 1 1 # G vs L 2 10 2 2 2 0 3 0 0 1 0 1 1 4 0 4 3 4 5 3 3 4 # G vs N 2 10 2 2 2 0 3 0 0 1 0 1 4 1 4 3 1 1 1 2 3 3 # G vs Q 2 10 2 2 2 0 3 0 0 1 0 1 0 2 0 2 1 1 1 0 1 0 # G vs S (correct pairing) 2 10 2 2 2 0 3 0 0 1 0 1 1 2 2 0 2 1 0 0 0 1 # G vs V 2 10 2 2 2 0 3 0 0 1 0 1 1 4 1 3 4 4 4 3 4 3 # G vs X 2 10 2 2 2 0 3 0 0 1 0 1 0 1 0 1 2 1 1 0 2 0 # Since most pairings are incorrect, # the aggregate probability is small. END OF tot_info.in