[net.sources] mailsplit

req@warwick.UUCP (Russell Quin) (11/19/86)

Mailsplit splits up a file (or lots of files) into lots of little files.
It reads its input a line at a time, and starts a new output file when
*	the input line matches a pattern, or
*	there have been n lines written to the current output file.

You can use it to split a mailbox or an archive of news articles into one
article per file, for example.
In fact, you can do this with about 5 lines of awk, but you run into problems
with long lines (and speed, if it bothers you!).

Source, Makefile and manual entry enclosed.
1:	Edit the Makefile: you'll need to alter the "R=/usr/local" if you
	don't want mailsplit to live in /usr/local/usr/bin.
	You may also want to rename it "fsplit".  I prefer "fsplit", but it's
	called mailsplit for historical reasons.

2:	make mailsplit

3:	have a play with it & satisfy yourself that it behaves reasonably

4:	make install

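"make install" runs "$(MAKE) $(CLEAN)" afterwards.  If you don't want the
build directory cleaned up (object files etc. removed), say
CLEAN="" make -e install
at step 4; the -e is needed so that the environment overrides the Makefile's
own CLEAN=clean.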

I shall be leaving Warwick to move to a new job at the end of the week, so
although mail will be forwarded (& I'd be glad of any changes that you needed
to make), I may take a while to respond!

Russell



#! /bin/sh
# This is a shell archive, meaning:
# 1. Remove everything above the #! /bin/sh line.
# 2. Save the resulting text in a file.
# 3. Execute the file with /bin/sh (not csh) to create the files:
#	Makefile
#	mailsplit.1
#	mailsplit.c
#	opts.h
# This archive created: Wed Nov 19 19:54:38 1986
# Search /bin ahead of whatever PATH the caller had; presumably this is so
# that the stock versions of the tools used below are the ones picked up.
export PATH; PATH=/bin:$PATH
# ---- Makefile ----
# Unpack the Makefile unless a file of that name already exists here.
# sed strips the leading "X" guard from each payload line, and the
# backslash-quoted delimiter (\SHAR_EOF) keeps the shell from expanding
# the $(...) make variables inside the payload.
# The character count must equal the size of the unpacked file; wc -c
# re-checks it after extraction.
# Note that several payload lines end in a single space, which is counted.
echo shar: extracting "'Makefile'" '(1041 characters)'
if test -f 'Makefile'
then
	echo shar: will not over-write existing file "'Makefile'"
else
sed 's/^X//' << \SHAR_EOF > 'Makefile'
X# Makefile for mailsplit
X# R E Quin, October 1986 University of Warwick (UK) Computer Science
X# warwick!req     +44 203 523193
X# 
X# This makefile is intended for the sys5 Augmented make.
X# 
XMAKE=make 
XCLEAN=clean 
XCC=cc 
XHACKS= 
XCFLAGS=-O $(HACKS)
X# R is the root of the filesystem -- i.e. where to install things.
X# The binaries are installed in $(DESTDIR).
XR=/usr/local
XDESTDIR=$R/usr/bin 
XMANDIR=$R/usr/man/man1
XPROG=mailsplit 
X
X# PROG is what to make; DESTDIR is where to put it.
X# HACKS are for -DBUGFIX style things.
X
X# R is intended to be the root of the filesystem if it isn't "/"
X
X# "make install " does a $(MAKE) $(CLEAN) at the end, so you can say
X# CLEAN=  make -e install
X# if you don't want to remove the garbage at the end, for example.
X# This is useful primarily for testing the install: entry!
X
Xall: $(PROG)
X 
Xmailsplit: mailsplit.o
X	$(CC) -o $(PROG) mailsplit.o
X 
Xmailsplit.o: opts.h
X 
Xinstall: mailsplit
X	/bin/mv $(PROG) $(DESTDIR)
X	/bin/cp mailsplit.1 $(MANDIR)
X	$(MAKE) $(CLEAN)
X 
Xclean: 
X	rm -rf core *.o $(PROG) a.out
SHAR_EOF
if test 1041 -ne "`wc -c < 'Makefile'`"
then
	echo shar: error transmitting "'Makefile'" '(should have been 1041 characters)'
fi
fi # end of overwriting check
# ---- mailsplit.1 ----
# Unpack the manual page (nroff -man source) unless a file of that name
# already exists here.  sed strips the leading "X" guard from each payload
# line; the backslash-quoted delimiter (\SHAR_EOF) keeps the shell from
# touching the backslashes and backquotes in the nroff text.
# The character count must equal the size of the unpacked file; wc -c
# re-checks it after extraction.
echo shar: extracting "'mailsplit.1'" '(2505 characters)'
if test -f 'mailsplit.1'
then
	echo shar: will not over-write existing file "'mailsplit.1'"
else
sed 's/^X//' << \SHAR_EOF > 'mailsplit.1'
X.TH MAILSPLIT 1L Local
X.SH NAME
Xmailsplit \- split a file or mailbox into single items (smaller files)
X.SH SYNOPSIS
X.B mailsplit
X[
X.B \-?
X]
X[
X.B \-o\fIformat\fP
X]
X[
X.B \-p\fIpattern\fP
X]
X[
X.B \-\fIn\fP
X]
X[\ file...\ ]
X.SH DESCRIPTION
X.I Mailsplit
Xsplits large files into smaller ones.  The splits occur on lines that match a
X.I pattern ,
Xwhich defaults to
X.br
X\ \ \ \ ``^From ''
X.br
Xso that the command
X.br
X\ \ \ \ mailsplit mbox
X.br
Xwill split a
X.I mail (1)
Xformat mail-box, putting each message in a different file.
X.PP
XOptions are:
X.IP -?
XPrint a summary of options.  Any unknown option will also do this
X.IP -p
XThe following string is taken to be a pattern to be used to match input lines
Xto determine points at which to split the input.
XSee
X.I ed (1)
Xfor details of the patterns.  The pattern may contain newlines (which match
Xthemselves).
X.IP -o
XThe following string is taken to be
X.I printf -style
Xformat to be used in the generation of output filenames.
XThere should be a %d in the string, which will be used to insert a
Xdisambiguating number.  This number is zero for the first file, and is
Xincremented at the start of each output file.  The \-\fIi\fP option can be used
Xto start the number with another value, however.
XThe default format is
X.br
X\ \ \ \ split:%-06.d
X.br
Xwhich results in files having names split:000000, split:000001, split:000002,
Xand so on.  Thus,
X.br
X\ \ \ \ mailsplit -o%d
X.br
Xwould produce files having the names 0, 1, 2, and so on.  The default format
Xwas chosen because the resulting files are listed in numerical order by
X.I ls (1),
Xor by
X.br
X\ \ \ \ echo *
X.br
Xwhich is sometimes useful.
X.IP \-\fBi\fP\fIn\fP
XThe number
X.I n
Xwill be used to number the first file; the number used each time will
Xthereafter be incremented as normal.  See the -\fBo\fP option for the use of
Xthis.
X.IP \-\fBn\fP\fIn\fP
XSplit the input every
X.I n
Xlines.  In this case, no pattern matching is performed.  This is the behaviour
Xof
X.I split (1),
Xexcept that
X.I mailsplit
Xnormally produces different filenames.
X.SH DIAGNOSTICS
XMostly straight-forward.
X``Internal Error'' indicates a bug in
X.I mailsplit ,
Xand should be reported.
XExit status 1 indicates an error parsing options \- for example, if an unknown
Xflag was used.
XExit status 2 indicates a meaningless combination was detected and rejected
X(this is rare in practice).
XExit status 3 indicates a run-time problem \- for example, if a file couldn't
Xbe opened.
X.SH "SEE ALSO"
X.I ed (1),
X.I mail (1),
X.I ls (1),
X.I split (1),
X.I printf (3).
SHAR_EOF
if test 2505 -ne "`wc -c < 'mailsplit.1'`"
then
	echo shar: error transmitting "'mailsplit.1'" '(should have been 2505 characters)'
fi
fi # end of overwriting check
# ---- mailsplit.c ----
# Unpack the program source (old-style "K&R" C) unless a file of that name
# already exists here.  sed strips the leading "X" guard from each payload
# line; the backslash-quoted delimiter (\SHAR_EOF) keeps the shell from
# expanding anything inside the C text.
# The character count must equal the size of the unpacked file; wc -c
# re-checks it after extraction.
#
# Payload overview:
#   usage()   print the option summary, exit if given a non-zero status
#   main()    parse -p/-o/-i/-n, then split each named file (or stdin)
#   fsplit()  open a named file and hand it to split()
#   split()   copy lines to numbered output files, starting a new file on
#             a pattern match or every n lines
#   error()   print "progname: " followed by a printf-style message
#   getnum()  atoi() that returns -1 if the string has a non-digit in it
echo shar: extracting "'mailsplit.c'" '(5552 characters)'
if test -f 'mailsplit.c'
then
	echo shar: will not over-write existing file "'mailsplit.c'"
else
sed 's/^X//' << \SHAR_EOF > 'mailsplit.c'
X/* mailsplit -- split files at lines that match a pattern */
X#include <stdio.h>
X#include <ctype.h>
X
X#include "opts.h"  /* defines nextstr() etc */
X
Xchar *progname = "filesplit";	/* for error messages */
Xchar *pattern = DFLTPAT;
Xchar *outformat = DFLTOUTNAME;
Xint filenumber = -1;
Xint every_n_lines = 0;	/* split every n lines if set -- overrides pattern */
X
Xusage(status)
X     int status;	/* exit if status != 0 */
X{
X     fprintf(stderr,"Usage: %s [-i n] [-o fmt] [-p pat] [-n n] [file...]\n", progname);
X     if (status)
X	  exit(status);
X}
X
Xmain(argc, argv)
X     char *argv[];
X{
X     /* split files at points that match a given pattern */
X     /* initialise things */
X     bool donefiles = FALSE;
X     char *buffer;
X
X     int getnum();	/* does more checking than atoi */
X
X     progname = argv[0];
X
X     /* now remove possible leading pathname
X      * (e.g. /usr/bin/mailsplit is to report it's errors as mailsplit
X      */
X     {
X	  register char *p;
X	  char *q = (char *) NULL;
X
X	  for (p = progname; p && *p; p++) {
X	       if (*p == '/')
X		    q = p;
X	  }
X	  if (q && q[1]) {
X	       progname = q + 1;
X	  }
X     }
X
X
X     while (--argc) {
X	  if (**++argv == '-') {
X	       switch(*++*argv) {
X		    case 'p': {		/* -p pattern */
X			 nextstr(pattern,argc,argv,usage(2));
X			 break;
X		    }
X		    case 'o': {	/* -o pattern_for_output_filenames */
X			 nextstr(outformat,argc,argv,usage(2));
X			 break;
X		    }
X		    case 'i': {	/* -i initial_number */
X			 nextstr(buffer,argc,argv,usage(2));
X			 filenumber = getnum(buffer);
X			 if (filenumber < 0) {
X			    error("-i must be followed by a positive number\n");
X			    exit(EXIT_SYNTAX);
X			 }
X			 filenumber--;	/* needs to be one less to start with */
X			 break;
X		    }
X		    case 'n': {	/* -n n_lines --- split every n lines */
X			 nextstr(buffer,argc,argv,usage(2));
X			 every_n_lines = getnum(buffer);
X			 if (every_n_lines <= 0) {
X			      error("-n: number must be at least 1\n");
X			      exit(EXIT_SYNTAX);
X			 }
X			 break;
X		    }
X
X		    default: {
X			 fprintf(stderr, "Unknown flag -%c\n", **argv);
X			 usage(1);
X		    }
X	       }
X	  } else {	/* not a "-" flag */
X	       fsplit(*argv, pattern);
X	       donefiles++;
X	  }
X     }
X
X     if (!donefiles) {
X	  split(stdin, DFLTNAME, pattern);
X     }
X
X     exit(0);
X}
X
Xfsplit(name, pat)
X     char *name;
X     char *pat;
X{
X     FILE *fd;
X
X     if (!name || !*name) {
X	  error("Can't split a file with an empty name\n");
X	  usage(2);
X     }
X
X     if ( (fd = fopen(name, "r")) == NULL) {
X	  error("Can't open %s\n", name);
X	  return;
X     }
X
X     (void) split(fd, name, pat);
X
X     if (fclose(fd) == EOF) {	/* something's gone wrong */
X	  error("Can't close %s -- giving up\n", name);
X	  exit(EXIT_RUNERR);
X     }
X}
X
Xchar buffer[BUFLEN];
X
Xint
Xsplit(input, name, pattern)
X     FILE *input;
X     char *name;
X     char *pattern;
X{
X     /* do the real work here. Oh dear, I don't know how... */
X     /* we are always called with an open file. */
X
X     extern char *re_comp();	/* compile string into automaton */
X     extern int re_exec();	/* try to match string */
X#define REMATCH 1
X#define RENOMATCH 0
X#define REFAULT -1
X
X     char *errmessage;
X     FILE *output = NULL;
X     char fnambuf[MAXFILENAMELEN + 2];  /* +1 for null, +1 for overflow */
X     int reg_status = 0;	/* regular expression status */
X     int line = 0;
X
X     if (index(outformat, '%') == NULL) {
X	  error("Output filename format (\"%s\") must contain %%\n",outformat);
X	  usage(2);
X     }
X     if (!pattern || (!*pattern && !every_n_lines)) {
X	  error("Can't match an empty pattern\n");
X	  usage(2);
X     }
X
X     if (!every_n_lines && (errmessage = re_comp(pattern)) != NULL) {
X	  error("Error in pattern <%s>: %s\n", pattern, errmessage);
X	  exit(EXIT_RUNERR);
X     }
X     /* errmessage is NULL here */
X
X
X     /* the -2 to fgets is because of the null and \n appended */
X     while (fgets(buffer, BUFLEN - 2, input) != NULL) {
X	  if (!output ||	/* first line */
X	     (every_n_lines > 0 && (++line == every_n_lines)) || /* nth line */
X	     (!every_n_lines &&
X	     ((reg_status = re_exec(buffer)) == REMATCH)) ) { /* matches pat */
X	       /* don't look at 1st line of file, to avoid an infinite */
X	       /* recursion... */
X
X	       /* start a new file */
X	       if (output) {
X		    if (fclose(output) == EOF) {
X			 error("Can't close output file \"%s\"\n", fnambuf);
X			 exit(EXIT_RUNERR);
X		    }
X		    output = NULL;
X	       }
X	       line = 0;
X	       sprintf(fnambuf, outformat, ++filenumber, name);
X	       if ((output = fopen(fnambuf, "w")) == NULL) {
X		    error("Can't open output file %s\n", fnambuf);
X		    exit(EXIT_RUNERR);
X	       }
X	  } else if (reg_status == REFAULT) {
X	       /* the re_exec failed */
X	       error("Internal error trying to match <%s> to <%s>\n",
X			      pattern, buffer);
X	       exit(EXIT_INTERN);
X	  }
X	  fputs(buffer, output);
X     }
X     /* close the last output file so we don't leak one per input file */
X     if (output && fclose(output) == EOF) {
X	  error("Can't close output file \"%s\"\n", fnambuf);
X	  exit(EXIT_RUNERR);
X     }
X     return (filenumber == -1);	/* exit status for main */
X}
X
Xerror(fmt, a1, a2, a3, a4)
X     char *fmt;
X{
X     fputs(progname, stderr);
X     fputs(": ", stderr);
X     fprintf(stderr, fmt, a1, a2, a3, a4);
X}
X
X/* getnum(s) returns the value of the unsigned int in s.  If there's any
X * trailing garbage, or the number isn't +ve, we return -1
X */
Xint
Xgetnum(s)
X     char *s;
X{
X     register char *p;
X
X     for (p = s; *p; p++) {
X	  if (!isdigit(*p)) {
X	       return -1;
X	  }
X     }
X     return atoi(s);
X}
SHAR_EOF
if test 5552 -ne "`wc -c < 'mailsplit.c'`"
then
	echo shar: error transmitting "'mailsplit.c'" '(should have been 5552 characters)'
fi
fi # end of overwriting check
# ---- opts.h ----
# Unpack the header included by mailsplit.c unless a file of that name
# already exists here.  It supplies the bool typedef, the EXIT_* status
# codes, the nextstr() option-argument macro and the DFLT*/BUFLEN defaults.
# sed strips the leading "X" guard from each payload line; the
# backslash-quoted delimiter (\SHAR_EOF) keeps the shell from interpreting
# the backslash line continuations inside the macro.
# The character count must equal the size of the unpacked file; wc -c
# re-checks it after extraction.
echo shar: extracting "'opts.h'" '(858 characters)'
if test -f 'opts.h'
then
	echo shar: will not over-write existing file "'opts.h'"
else
sed 's/^X//' << \SHAR_EOF > 'opts.h'
X
X#define FALSE 0
X#define TRUE 1
Xtypedef int bool;
X
X#define EXIT_SYNTAX 1	/* syntax error parsing commandline options */
X#define EXIT_SEMANT 2	/* options are correct but meaningless */
X#define EXIT_RUNERR 3	/* error opening a file, for example */
X#define EXIT_INTERN 4	/* internal error -- bug!! */
X
X#define nextstr(s,count,array,failure)	\
X	{if (((count)<2) && !((array)[0][1])) {failure;}\
X	else {if ((array)[0][1]) { s = &((array)[0][1]); } \
X	      else {s = array[1]; --count; array++;}}}
X
X#define DFLTNAME "/dev/stdin"	/* default input filename (for errors) */
X#define DFLTPAT	 "^From "	/* default pattern for where to split lines */
X#define BUFLEN BUFSIZ	/* the maximum length of an input line (incl. "\n\0") */
X#define MAXFILENAMELEN BUFSIZ	/* longer than the longest possible file name */
X#define DFLTOUTNAME	"split:%06d"	/* o/p file name format */
X
SHAR_EOF
if test 858 -ne "`wc -c < 'opts.h'`"
then
	echo shar: error transmitting "'opts.h'" '(should have been 858 characters)'
fi
fi # end of overwriting check
#	End of shell archive
# Stop here so that sh never tries to execute the mail signature that
# follows the archive in the posting.
exit 0
-- 
ARPA		req%uu.warwick.ac.uk@ucl-cs.arpa
EARN/BITNET	req%UK.AC.WARWICK.UU@AC.UK
JANET		req@uk.ac.warwick.uu
UUCP		seismo!mcvax!ukc!warwick!req  (req@warwick.UUCP)
PHONE		+44 203 523193 Until 20th November.
No new FRPList requests, please.  Details in net.games.frp.  Thanks!
The BITNET path only works from sites that have AC.UK in their tables.  Sorry.