[net.sources] atype.c & ctype.c -- simple text statistics

rsk@j.cc.purdue.edu (Tyrannosaurus Wombat) (10/13/86)

The following two short programs calculate simple text statistics,
and occasionally come in handy; I'm sending these out in net.sources
in the hopes of garnering useful comments on them.  They tend to
be useful in debugging from time to time.

Atype prints a table of ASCII occurrences like this...

  468 nul    4 soh    3 stx    1 etx    0 eot    0 enq    0 ack    3 bel
    0 bs     0 ht     0 nl     0 vt     2 np     0 cr     0 so     0 si 
    5 dle    0 dc1    0 dc2    0 dc3    0 dc4    0 nak    0 syn    0 etb
    0 can    0 em     0 sub    0 esc    0 fs     0 gs     0 rs     0 us 
    0 sp     0  !     0  "     0  #     0  $     0  %     0  &     0  ' 
    0  (     0  )     0  *     0  +     0  ,     0  -     6  .     0  / 
    0  0     1  1     0  2     0  3     0  4     0  5     0  6     0  7 
    0  8     0  9     0  :     0  ;     0  <     0  =     0  >     0  ? 
    0  @     0  A     0  B     0  C     0  D     0  E     0  F     0  G 
    1  H     0  I     0  J     0  K     0  L     0  M     0  N     0  O 
    0  P     0  Q     0  R     0  S     0  T     0  U     0  V     0  W 
    1  X     0  Y     0  Z     0  [     0  \     0  ]     0  ^     0  _ 
    0  `     2  a     0  b     3  c     0  d     3  e     0  f     0  g 
    0  h     0  i     0  j     0  k     0  l     0  m     0  n     0  o 
    3  p     0  q     0  r     0  s     3  t     0  u     0  v     0  w 
    0  x     3  y     0  z     0  {     0  |     0  }     0  ~     0 del

...and reads either stdin or whatever file arguments are provided.

Ctype prints a table of ctype(3) occurrences like this...

ascii	cntrl	print	space	punct	alnum	digit	alpha	upper	lower
510	487	25	18	7	17	1	0	17	3	

...and reads either stdin or whatever file arguments are provided.

Both work on 4.2bsd.

One shortcoming of each is known: very large input can cause the printed
output fields to overflow, making the display messy.

A future release (in mod.sources) will include appropriate manual pages,
and whatever enhancements result from comments made by readers.

--------------------------------------------------
#include <stdio.h>

/*	Atype.c  find numbers of different types of characters in
*	a file...Rich Kulawiec, 8/2/82  revised 10/86
*	Note that characters 200-377 octal are mapped down.
*/

/*
 * Printable three-column label for each of the 128 ASCII codes.
 * Laid out as 16 rows of 8, so the label for code c is
 * maptable[c / 8][c % 8]; each row is one line of the output table.
 */
char *maptable[16][8] = {
	/* 000-007 */
	{ "nul", "soh", "stx", "etx", "eot", "enq", "ack", "bel" },
	/* 010-017 */
	{ "bs ", "ht ", "nl ", "vt ", "np ", "cr ", "so ", "si " },
	/* 020-027 */
	{ "dle", "dc1", "dc2", "dc3", "dc4", "nak", "syn", "etb" },
	/* 030-037 */
	{ "can", "em ", "sub", "esc", "fs ", "gs ", "rs ", "us " },
	/* 040-047 */
	{ "sp ", " ! ", " \" ", " # ", " $ ", " % ", " & ", " ' " },
	/* 050-057 */
	{ " ( ", " ) ", " * ", " + ", " , ", " - ", " . ", " / " },
	/* 060-067 */
	{ " 0 ", " 1 ", " 2 ", " 3 ", " 4 ", " 5 ", " 6 ", " 7 " },
	/* 070-077 */
	{ " 8 ", " 9 ", " : ", " ; ", " < ", " = ", " > ", " ? " },
	/* 100-107 */
	{ " @ ", " A ", " B ", " C ", " D ", " E ", " F ", " G " },
	/* 110-117 */
	{ " H ", " I ", " J ", " K ", " L ", " M ", " N ", " O " },
	/* 120-127 */
	{ " P ", " Q ", " R ", " S ", " T ", " U ", " V ", " W " },
	/* 130-137 */
	{ " X ", " Y ", " Z ", " [ ", " \\ ", " ] ", " ^ ", " _ " },
	/* 140-147 */
	{ " ` ", " a ", " b ", " c ", " d ", " e ", " f ", " g " },
	/* 150-157 */
	{ " h ", " i ", " j ", " k ", " l ", " m ", " n ", " o " },
	/* 160-167 */
	{ " p ", " q ", " r ", " s ", " t ", " u ", " v ", " w " },
	/* 170-177 */
	{ " x ", " y ", " z ", " { ", " | ", " } ", " ~ ", "del" }
};

/*
 * Occurrence counts for the 128 ASCII codes.  The count for code c is
 * count[c % 8][c / 8].  Counters are long (as in ctype.c) so that large
 * inputs do not overflow an int.
 */
long    count[8][16];

FILE	*fp;
FILE	*fopen();

/*
 * Read the stream to EOF, adding each byte to count[][].
 * Bytes 0200-0377 are folded onto 0-0177 by masking off the high bit.
 */
static void
countchars(in)
FILE *in;
{
	int c;

	while ((c = getc(in)) != EOF) {
		c &= 0177;
		count[c % 8][c / 8]++;
	}
}

/*
 * atype: count the characters in stdin (no arguments) or in each file
 * named on the command line, then print a 16-row by 8-column table of
 * counts, each followed by its label from maptable.
 *
 * A file that cannot be opened is reported on stderr and skipped.
 * Returns 0 if every input was read, 1 if any file could not be opened.
 */
int
main(argc, argv)
int argc;
char *argv[];
{
	int i, row, col;
	int status = 0;

	if (argc == 1) {
		fp = stdin;
		countchars(fp);
	}
	else {
		for (i = 1; i < argc; i++) {
			if ((fp = fopen(argv[i], "r")) == NULL) {
				(void) fprintf(stderr,"atype: can't open %s\n",argv[i]);
				status = 1;
				continue;
			}
			countchars(fp);
			(void) fclose(fp);
		}
	}

	/* maptable is [row][col]; count is stored transposed as [col][row]. */
	for (row = 0; row < 16; row++) {
		for (col = 0; col < 8; col++)
			(void) printf("%5ld %s",count[col][row],maptable[row][col]);
		(void) printf("\n");
	}
	return status;
}
--------------------------------------------------
#include <stdio.h>
#include <ctype.h>

/*     Ctype.c  find numbers of different types of characters in
*	a file...Rich Kulawiec, 4/20/81  revised 10/86
*/

/* Input stream currently in use: stdin, or a file named on the command line. */
FILE *fp;
FILE *fopen();

/* Counts the characters of one stream into class[]; defined below main(). */
void	tally();

/*
 * Indices into class[], one per <ctype.h> classification counted.
 * Note: these values are NOT in the same order as the column header that
 * main() prints (ascii, cntrl, print, space, punct, alnum, digit, alpha,
 * upper, lower), so class[] must not be printed in plain index order
 * under that header.
 */
#define	NASCII	0
#define	NCNTRL	1
#define	NPRINT	2
#define	NALNUM	3
#define	NPUNCT	4
#define	NALPHA	5
#define	NDIGIT	6
#define	NUPPER	7
#define	NLOWER	8
#define	NSPACE	9

/* Number of classifications, i.e. the size of class[]. */
#define NCLASS	10

/* Running totals for each classification, accumulated over all inputs. */
long	class[NCLASS];

main(argc, argv)
int argc;
char *argv[];
{
	int i,j;

	for( j = 0; j < NCLASS; j++)
		class[j] = 0L;

	(void) printf("ascii\tcntrl\tprint\tspace\tpunct\talnum\tdigit\talpha\tupper\tlower\n");

	if( argc == 1) {
		fp = stdin;
		tally(fp);
	}
	else {
		for ( i = 1; i < argc;  i++) {
			if( (fp=fopen(argv[i],"r")) == NULL) {
				(void) fprintf(stderr,"ctype: can't open %s\n",argv[i]);
				continue;
			}
			tally(fp);
			(void) fclose(fp);
		}
	}

	for ( j = 0; j <NCLASS; j++)
		(void) printf("%ld\t",class[j]);
	(void) printf("\n");
}

/*
 * Read filep to EOF and, for each character, increment the class[]
 * counter of every <ctype.h> classification the character belongs to.
 * Totals accumulate across calls; nothing is reset here.
 *
 * Characters outside the ASCII range (isascii() false) are counted in
 * no class at all.  The remaining tests are deliberately skipped for
 * them: traditional table-driven <ctype.h> implementations define those
 * tests only for values where isascii() is true (and for EOF), so
 * applying them to bytes 0200-0377 could index past the table.
 */
void tally(filep)
FILE *filep;
{
	int	c;

	while((c = getc(filep)) != EOF){
		if(isascii(c) == 0)
			continue;
		class[NASCII]++;

		if(iscntrl(c) != 0)
			class[NCNTRL]++;
		if(isprint(c) != 0)
			class[NPRINT]++;
		if(isspace(c) != 0)
			class[NSPACE]++;
		if(ispunct(c) != 0)
			class[NPUNCT]++;
		if(isalnum(c) != 0)
			class[NALNUM]++;
		if(isdigit(c) != 0)
			class[NDIGIT]++;
		if(isalpha(c) != 0)
			class[NALPHA]++;
		if(isupper(c) != 0)
			class[NUPPER]++;
		if(islower(c) != 0)
			class[NLOWER]++;
	}
}
--------------------------------------------------

uh@bsiao.UUCP (Uul Haanstra) (10/28/86)

> Keywords: ctype, ascii, classification
> 
> The following two short programs calculate simple text statistics,
> and occasionally come in handy; I'm sending these out in net.sources
> in the hopes of garnering useful comments on them.  They tend to
> be useful in debugging from time to time.
> 
This surely is a joke. I remember, my first Fortran program had to
count character frequencies. Then later on, my first Algol and my first
Pascal did the same (actually, my first Pascal program made the 
Pascal triangle, but that is neither here nor there).
But this is ridiculous, now the net is being swamped by this sort
of 'utilities'. Useful in debugging it says. In debugging what?
The compiler? The disks (I wrote 256 a's and 321 b's, and now I only get
255 a's)? If I want to know what's in a file, I look at it, or use
od, or even sed or awk if I need a filter.
> 
> A future release (in mod.sources) will include appropriate manual pages,
> and whatever enhancements result from comments made by readers.
> 
Please don't. 

-- 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Uul Haanstra, Postbank N.V. Amsterdam                ...!mcvax!bsiao!uh
              Pb 21009
	      1000 EX AMSTERDAM                         +31-20 584 3312
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

swi@cs.paisley.ac.uk (S. Wilson) (10/30/86)

In article <194@bsiao.UUCP> uh@bsiao.UUCP (Uul Haanstra) writes:
>> Keywords: ctype, ascii, classification
>> 
>> The following two short programs calculate simple text statistics,
>> and occasionally come in handy; I'm sending these out in net.sources
>> in the hopes of garnering useful comments on them.  They tend to
>> be useful in debugging from time to time.
>> 
>This surely is a joke. I remember, my first Fortran program had to
>count character frequencies. Then later on, my first Algol and my first
>Pascal did the same (actually, my first Pascal program made the 
>Pascal triangle, but that is neither here nor there).
>But this is ridiculous, now the net is being swamped by this sort
>of 'utilities'. Useful in debugging it says. In debugging what?
>The compiler? The disks (I wrote 256 a's and 321 b's, and now I only get
>255 a's)? If I want to know what's in a file, I look at it, or use
>od, or even sed or awk if I need a filter.
>> 
>> A future release (in mod.sources) will include appropriate manual pages,
>> and whatever enhancements result from comments made by readers.
>> 
>Please don't. 
>
>-- 
>- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
>Uul Haanstra, Postbank N.V. Amsterdam                ...!mcvax!bsiao!uh
>              Pb 21009
>	      1000 EX AMSTERDAM                         +31-20 584 3312
>- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Totaly agree with you, what a load of absolute rubbish. Manual page; INDEED!.


JANET:  swi@uk.ac.paisley.cs                    Scott Wilson
EMAIL:	swi@cs.paisley.ac.uk		| Post: Paisley College
UUCP:	...!seismo!mcvax!ukc!paisley!swi|	Department of Mathematics,
Phone:	+44 41 887 1241 Ext. 260	|	High St. Paisley.
					|	Scotland.
					|	PA1 2BE

rbl@nitrex.UUCP ( Dr. Robin Lake ) (10/30/86)

A similar tool solved a difficult technical problem.  By analysis of the
byte-frequency on a tape, we determined it had been mis-copied on a Cyber
(63 byte character set) and not corrupted on a tape drive in a remote
location.

Also, it's been useful in finding characters to use as alias bytes when
transforming data file formats.

Robin Lake
decvax!cwruecmp!nitrex!rbl
cbatt!nitrex!rbl

ajs@hpfcla.HP.COM (Alan Silverstein) (11/03/86)

Re: atype, ctype

> ...it's been useful...

Ditto, I've been using a local variant called lc (letter count) for
several years.  It's part of my basic toolset, and it's surprising how
often I need to use this "trivial" tool.  (If anyone wants my code and
manual entry, mail me.)

The flame about the original posting is akin to saying: "Anyone can
build a screwdriver.  So don't bother giving away copies of yours."
Well, I must own 20 screwdrivers, scattered here and there, but I'll
still consider another one, especially if it's free.

Alan Silverstein, Hewlett-Packard Systems Software Operation, Fort Collins,
Colorado; {ihnp4 | hplabs}!hpfcla!ajs; 303-229-3053; (lat-long on request :-)

cire@hpisoa1.HP.COM (Eric B. Decker) (11/11/86)

> / hpisoa1:net.sources / swi@cs.paisley.ac.uk (S. Wilson) /  2:13 am  Oct 30, 1986 /
> In article <194@bsiao.UUCP> uh@bsiao.UUCP (Uul Haanstra) writes:
> >> Keywords: ctype, ascii, classification
> >> 
> >> The following two short programs calculate simple text statistics,
> >> and occasionally come in handy; I'm sending these out in net.sources
> >> in the hopes of garnering useful comments on them.  They tend to
> >> be useful in debugging from time to time.
> >> 
> >This surely is a joke. I remember, my first Fortran program had to
> >count character frequencies. Then later on, my first Algol and my first
> >Pascal did the same (actually, my first Pascal program made the 
> >Pascal triangle, but that is neither here nor there).
> >But this is ridiculous, now the net is being swamped by this sort
> >of 'utilities'. Useful in debugging it says. In debugging what?
> >The compiler? The disks (I wrote 256 a's and 321 b's, and now I only get
> >255 a's)? If I want to know what's in a file, I look at it, or use
> >od, or even sed or awk if I need a filter.
> >> 
> >> A future release (in mod.sources) will include appropriate manual pages,
> >> and whatever enhancements result from comments made by readers.
> >> 
> >Please don't. 
> >
> >-- 
> >- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
> >Uul Haanstra, Postbank N.V. Amsterdam                ...!mcvax!bsiao!uh
> >              Pb 21009
> >	      1000 EX AMSTERDAM                         +31-20 584 3312
> >- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
> 
> Totaly agree with you, what a load of absolute rubbish. Manual page; INDEED!.
> 
> 
> JANET:  swi@uk.ac.paisley.cs                    Scott Wilson
> EMAIL:	swi@cs.paisley.ac.uk		| Post: Paisley College
> UUCP:	...!seismo!mcvax!ukc!paisley!swi|	Department of Mathematics,
> Phone:	+44 41 887 1241 Ext. 260	|	High St. Paisley.
> 					|	Scotland.
> 					|	PA1 2BE
> ----------

Oh sit on it!  If the group is moderated then there is an appropriate amount
of screening.  At least when Rich Salz is doing it.  This is fine (the
posting of the ctype whatever program) by me.

If you don't care about this particular program, then don't read it.