[comp.sources.misc] v14i035: Key word program

root@ozdaltx.UUCP (07/27/90)
Posting-number: Volume 14, Issue 35
Submitted-by: root@ozdaltx.UUCP
Archive-name: mkkey/part01

---- Cut Here and unpack ----
#!/bin/sh
# This is a shell archive (shar 3.11)
# made 05/24/1990 17:12 UTC by root@ozdaltx
# Source directory /tmp
#
# existing files WILL be overwritten
#
# This shar contains:
# length  mode       name
# ------ ---------- ------------------------------------------
#   1883 -rw-r--r-- README
#   1738 -rw-r--r-- bkey.c
#     45 -rwxr-xr-x makeit
#   1891 -rwxr-xr-x mkkey
#
# Probe touch(1): run without arguments it prints a usage message, and a
# message that mentions "[-amc]" means this touch accepts the -a/-m options
# used further down to restore the archived timestamps.
# The message is examined straight from the pipe, so no scratch file with a
# guessable name has to be created (and removed again) in /tmp.
if touch 2>&1 | grep -F '[-amc]' > /dev/null
then
	TOUCH=can
else
	TOUCH=cannot
fi
# ============= README ==============
# Unpack README.  The lines up to the SHAR_EOF marker are the file's text,
# each stored with a leading "X" that sed removes again; the delimiter is
# quoted, so the shell passes the text through without expanding anything.
echo "x - extracting README (Text)"
sed 's/^X//' << 'SHAR_EOF' > README &&
XREADME
X
XMkkey - A group of programs to generate a list of key words
Xand their related files from text files.
X
XThis group of programs relies on certain (I assume) standard
X*NIX text processing programs:
X   hyphen - a program to extract and join hyphenated words. Could
X            be done with sed.
X   comm - Finds common words in two files.  Probably awk could
X          accomplish this, but would be slower.
X   sort & uniq - should be available anywhere.
X   vi - Use your favorite editor here.
X
XThe following is used in the program:
X    tolower - converts all upper case to lower.  Tr can do the same
X              thing.
X    bkey.c - The program that actually does the KEY file generation.
X             Makeit is the cc command. This one is for SCO XENIX.
X
XNot many comments are in the files - as they should be fairly self-
Xexplanatory.
XTwo files, apart from the text file(s), are needed;  ignore and
Xinclude.  These should be placed in the directory described by $KPATH.
XKPATH will also need to be changed to your preference.  You will
Xprobably want to add changes to mkkey to remove the temporary files
Xafter each key update.  For the ignore file, /usr/lib/eign would be a
Xgood place to start as it contains the common words; the, he, she,
Xthat...  etc.  Include builds itself.  The program does copy KEY to
XKEY.O - just in case.  I keep KEY.O around until I'm sure KEY is OK.
X
XThere are some sed lines that may confuse some people.
XThese are there to try to pare down the size of the ignore file.
X
XI'd like to know of changes and improvements.  Especially in the area
Xof stripping ignore words to their base form.  There are no
Xrestrictions on this collection of files.
X
XEnjoy....
X
XScotty
X------
XAIDS INFORMATION EXCHANGE BBS      (214) 247-2367/247-5609
X               "Education is the best weapon"
X{mic,void,egsner}!ozdaltx!sysop || {uunet,smu,ames}!sulaco!ozdaltx!sysop 
X-
SHAR_EOF
chmod 0644 README || echo "restore of README fails"
# Put back the archived timestamp, but only when the touch probe said "can".
if [ $TOUCH = can ]; then touch -am 0524120890 README; fi
# wc prints the byte count first; set splits the output so that $1 holds it.
set `wc -c README`
Wc_c=$1
# Complain when the unpacked size is not the 1883 bytes recorded at packing.
test "$Wc_c" = "1883" || echo original size 1883, current size $Wc_c
# ============= bkey.c ==============
# bkey.c is the C program that merges the sorted "word: file" list into KEY.
# The payload is a corrected revision of the posted source:
#   - the whole file is read into memory, so neither the number of lines nor
#     their length can overrun a fixed-size buffer any more;
#   - a last line without a newline is processed instead of being cut short;
#   - the line after the last one is no longer examined;
#   - a pair that is too long to merge is written back unchanged (the second
#     line used to come out as "word::  files");
#   - diagnostics go to stderr, because mkkey redirects stdout into KEY;
#   - it is written in ANSI C and includes <stdlib.h>.
# The size checked after extraction is therefore that of the revision, not
# the 1738 bytes that the manifest at the top of the archive lists.
echo "x - extracting bkey.c (Text)"
sed 's/^X//' << 'SHAR_EOF' > bkey.c &&
X/*
X * bkey - merge neighbouring "word: file ..." lines that share a word.
X *
X * Usage: bkey file
X *
X * The input is expected to be sorted.  When the first word of a line
X * also starts the next line, the file list of that next line is added
X * to the first line and the next line is dropped.  A line is merged
X * with at most one neighbour per run.  Output goes to stdout and all
X * diagnostics go to stderr, so a redirected result file never ends up
X * containing an error message.
X */
X#include <stdio.h>
X#include <stdlib.h>
X#include <string.h>
X
X#define KEEPLEN 65 /* longer lines are copied through unchanged */
X#define JOINLEN 75 /* a merge must not produce a longer line */
X/* characters that end a word */
X#define BLANKS " \t\n\v\f\r"
X/* characters that may be carried over from the next line */
X#define REFSET "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
X    "abcdefghijklmnopqrstuvwxyz. \t"
X
X/* Report memory exhaustion and stop. */
Xstatic void nomem(void)
X{
X    fprintf(stderr, "Can't allocate memory\n");
X    exit(1);
X}
X
X/* Return all of fp as one NUL-terminated, heap-allocated string. */
Xstatic char *slurp(FILE *fp)
X{
X    size_t cap = 4096, len = 0, n;
X    char *buf = malloc(cap);
X
X    if (buf == NULL)
X        nomem();
X    /* always keep one byte in reserve for the terminating NUL */
X    while ((n = fread(buf + len, 1, cap - len - 1, fp)) > 0) {
X        len += n;
X        if (cap - len < 2) {
X            cap *= 2;
X            if ((buf = realloc(buf, cap)) == NULL)
X                nomem();
X        }
X    }
X    buf[len] = '\0';
X    return buf;
X}
X
X/* Entry point: merge the lines of the file named on the command line. */
Xint main(int argc, char *argv[])
X{
X    FILE *fp;
X    char *text, *p, *key, *refs = NULL;
X    char **line;
X    size_t nlines = 0, j, keylen, reflen;
X
X    if (argc == 1) {
X        fprintf(stderr, "Usage: %s file\n", argv[0]);
X        exit(1);
X    }
X    if ((fp = fopen(argv[1], "r")) == NULL) {
X        fprintf(stderr, "%s: Can't open %s\n", argv[0], argv[1]);
X        exit(1);
X    }
X    text = slurp(fp);
X    fclose(fp);
X
X    /* one slot per newline, plus one for an unterminated last line */
X    for (p = text; (p = strchr(p, '\n')) != NULL; p++)
X        nlines++;
X    if ((line = malloc((nlines + 1) * sizeof *line)) == NULL)
X        nomem();
X
X    /* split the text in place: each newline becomes a terminator */
X    nlines = 0;
X    for (p = text; *p != '\0'; p++) {
X        line[nlines++] = p;
X        if ((p = strchr(p, '\n')) == NULL)
X            break;
X        *p = '\0';
X    }
X
X    /* emit each line, merged with its successor where possible */
X    for (j = 0; j < nlines; j++) {
X        reflen = 0;
X        key = line[j] + strspn(line[j], BLANKS);
X        keylen = strcspn(key, BLANKS);
X        if (strlen(line[j]) <= KEEPLEN && keylen > 0 && j + 1 < nlines
X            && strncmp(line[j + 1], key, keylen) == 0) {
X            /* text after the first word of the next line */
X            refs = line[j + 1] + strcspn(line[j + 1], BLANKS);
X            reflen = strspn(refs, REFSET);
X        }
X        if (reflen == 0) {
X            /* nothing to merge */
X            printf("%s\n", line[j]);
X        } else if (strlen(line[j]) + reflen > JOINLEN) {
X            /* too long to merge: emit both lines as they are */
X            printf("%s\n%s\n", line[j], line[j + 1]);
X            j++;
X        } else {
X            /* append it, adding a separating space if it brings none */
X            printf("%s%s%.*s\n", line[j], *refs == ' ' ? "" : " ",
X                (int)reflen, refs);
X            j++;
X        }
X    }
X    return 0;
X}
SHAR_EOF
chmod 0644 bkey.c || echo "restore of bkey.c fails"
# TOUCH is quoted so that an empty or unset value simply skips this step.
if [ "$TOUCH" = can ]
then
    touch -am 0524114390 bkey.c
fi
# wc prints the byte count first; set splits the output so that $1 holds it.
set `wc -c bkey.c`;Wc_c=$1
if test "$Wc_c" != "3566"
then echo original size 3566, current size $Wc_c;fi
# ============= makeit ==============
# makeit holds the single cc command line that builds bkey from bkey.c.
echo "x - extracting makeit (Text)"
# sed strips the leading "X" from the payload line; chmod runs only when sed
# succeeded, and the failure notice appears if either of the two failed.
sed 's/^X//' > makeit << 'SHAR_EOF' &&
Xcc -LARGE -Ml2e -F 6000 -s -O bkey.c -o bkey
SHAR_EOF
chmod 0755 makeit || echo "restore of makeit fails"
# Put back the archived timestamp, but only when the touch probe said "can".
if [ $TOUCH = can ]; then touch -am 0524114390 makeit; fi
# wc prints the byte count first; set splits the output so that $1 holds it.
set `wc -c makeit`
Wc_c=$1
# Complain when the unpacked size is not the 45 bytes recorded at packing.
test "$Wc_c" = "45" || echo original size 45, current size $Wc_c
# ============= mkkey ==============
# mkkey is the driver script of the package.  Reading the payload below, it:
#   - expects one argument, the text file to index, and stops early when
#     that file does not exist or "grep $1 KEY" already finds its name;
#   - builds a word list (key.raw) from the file with hyphen, tolower, tr,
#     sed, sort and uniq;
#   - drops the words already listed in $KPATH/ignore or $KPATH/include and
#     opens the rest in vi so that the user can mark words with "%";
#   - appends the marked words to $KPATH/include and merges the others into
#     $KPATH/ignore, then copies KEY to KEY.O and rebuilds KEY with bkey.
# Every payload line is stored with a leading "X" that sed removes again.
# NOTE(review): the payload is kept byte-for-byte because its length is
# verified after extraction.  Points worth confirming in the script itself:
#   - $1 is used unquoted, also inside the grep and sed patterns;
#   - "tr ' ' '\012' < $1  hy.inc" hands hy.inc to tr as an operand,
#     presumably meant as a second input file - TODO confirm;
#   - KPATH is hard-coded to /bbs/lib/key.d;
#   - the work files (hyph, hy.inc, hy.ign, key.raw1, key.raw, klist, inc,
#     ign, ktemp) stay behind in the current directory; only ked is removed.
echo "x - extracting mkkey (Text)"
sed 's/^X//' << 'SHAR_EOF' > mkkey &&
XKPATH=/bbs/lib/key.d; export KPATH
Xcase $# in
X0)
X     echo "Usage : $0 sourcefile"
X     exit;;
Xesac
Xif test -f "$1"
Xthen
X   FILE=$1; export FILE
Xelse
X   echo "No file: $1"
X   exit
Xfi
Xgrep $1 KEY > /dev/null
Xcase $? in
X0) echo "$1 has already been processed"
X   exit;;
Xesac
Xecho "Processing $1, building key.raw
XFinding hyphanated words..."
Xhyphen $1 | tolower > hyph
Xsed '
X/'$1'/d
X/^$/d
Xs/-//' hyph > hy.inc
Xsed '
X/'$1'/d
X/^$/d
X:a
X/[\/-]/{
Xs//\
X/
Xb a
X}
Xs/-$//' hyph > hy.ign
Xsort -u hy.inc -o hy.inc
Xsort -u hy.ign -o hy.ign
Xtr ' ' '\012' < $1  hy.inc | tolower | sort -u > key.raw1
Xcat hy.ign key.raw1 | sort | uniq -u |
Xsed -n '
X/[	 :;,._'\'')("!*?]\[{}+=\\#@$%&*<>^]/s///g
X/['\''\`][dst]$/s/['\''\`]//
Xs/ness$//
X/[^e][cdfrtslnp]ies$/s/ies$/y/
X/[\/-]/s//\
X/g
X/^$/d
X/^[0-9]*$/d
X/^[a-z]*[0-9]*$/p
X/^[a-z]*$/p' | sort -u > key.raw
Xecho "Removing common words from key.raw"
Xcomm -23 key.raw $KPATH/ignore | comm -23 - $KPATH/include |
X sort -u > klist
X
Xif test -s klist
Xthen
X   echo "# put a % by each word going to the $KPATH/include list,
X# anything else will be sent to the $KPATH/ignore list." > ked
X   cat klist >> ked
X   vi ked
Xecho "C)ontinue or Q)uit?\c "
Xread cq
Xcase $cq in
X  [Cc]) ;;
X  [Qq]) exit;;
Xesac
Xecho "building inc and and ign"
Xsed '/^#/d
X/^%/{
Xs///w 'inc'
Xd
X} ' ked |
Xsed 's/[\/-]/\
X/g
Xs/+//g
Xs/\]//g' > ign
Xrm ked
Xfi
X
Xecho "Building ktemp"
Xif test -s inc
Xthen
X   echo "Adding inc to $KPATH/include list..."
X   cat inc >> $KPATH/include
X   sort -u $KPATH/include -o $KPATH/include
X   comm -12 key.raw $KPATH/include | sed 's/$/: '$1'/' > ktemp
Xelse
X    comm -12 key.raw $KPATH/include | sed 's/$/: '$1'/' > ktemp
Xfi
X
Xif test -s ign
Xthen
X   echo "Adding ign to $KPATH/ignore list..."
X   sort -u ign $KPATH/ignore -o $KPATH/ignore
Xfi
X
Xecho "Adding KEY to ktemp file"
X
Xif test -f KEY
Xthen
X   cp KEY KEY.O
Xfi
Xsort KEY ktemp -o ktemp
Xecho "Rebuilding KEY file"
Xbkey ktemp > KEY
SHAR_EOF
chmod 0755 mkkey || echo "restore of mkkey fails"
# Put back the archived timestamp, but only when the touch probe near the
# top of the archive set TOUCH to "can".
if [ $TOUCH = can ]
then
    touch -am 0524114290 mkkey
fi
# wc prints the byte count first; set splits the output so that $1 holds it,
# and the count is compared with the size recorded when the archive was made.
set `wc -c mkkey`;Wc_c=$1
if test "$Wc_c" != "1891"
then echo original size 1891, current size $Wc_c;fi
exit 0