[comp.mail.mh] Sorting messages by Date and Subject?

forys@sigi.Colorado.EDU (Jeff Forys) (06/27/88)

Has anyone written a script to sort messages by Date and Subject?
Sortm(1) appears to only handle date sorts, which would be the first
thing to do.  However, after that, I'd like something to go thru and
resort the messages based on their subjects (e.g. "X", "Re: X", ...)
and still maintain the chronological order for each new subject.

If no one has done this, and I didn't miss an existing MH tool, I'll
write one using the existing UNIX tools when I get the time.  I'd
rather not reinvent the wheel, though...

Please respond thru mail; if there's enough interest, I'll summarize.

Thanks,
---
Jeff Forys @ UC/Boulder Engineering Research Comp Cntr (303-492-4991)
forys@boulder.Colorado.EDU  -or-  ..!{ncar|nbires}!boulder!forys

forys@sigi.Colorado.EDU (Jeff Forys) (07/05/88)

In article <6848@sigi.Colorado.EDU> I wrote:
> Has anyone written a script to sort messages by Date and Subject?

Craig Leres (leres@helios.ee.lbl.gov) sent me a version of sortm(1)
that sorts messages by Date and Subject (when given the "-subj" flag).
The code was written by Van Jacobson (now, where have you heard that
name before?).  Several people have expressed interest in this, and
I've received permission to post the revised version.  What follows,
is a revised sortm(1) that does "just what I wanted"...

Thanks!
Jeff Forys
--------------------------------- cut here ------------------------------

/*
 * revision 1.3        
 * date: 87/05/20 21:30:02; author: van; state: Exp; lines added/del: 122/82
 * corrected sorting of subsets of folder.  Just permute original
 * numbers.  Don't pack or renumber.
 *
 * revision 1.2        
 * date: 87/05/19 05:33:38; author: van; state: Exp; lines added/del: 360/256
 * added subject sorting (-subj flag)
 *
 * revision 1.1        
 * date: 87/05/18 22:58:33; author: van; state: Exp;  
 * Initial revision
 */

/* sortm.c - sort messages in a folder by date/time, optionally grouped by subject */

#include "../h/mh.h"
#include "../zotnet/tws.h"
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <ctype.h>

/*
 * Command-line switches.  Each entry is a switch name followed by an
 * integer; the #define placed above an entry is that entry's position
 * in the table, and main() uses those values as the case labels for
 * the result of smatch().  A NULL name terminates the table.
 */
static struct swit switches[] = {
#define	DATESW	0
	 "datefield field", 0,

#define	VERBSW	1
	 "verbose", 0,
#define	NVERBSW	2
	 "noverbose", 0,

#define	SUBJSW	3
	 "subject", 0,

#define	HELPSW	4
	 "help", 4,

	 /* terminator: the second member is an integer, so use 0, not NULL */
	 NULL, 0
};


/*
 * One record per message taking part in the sort.
 */
struct smsg {
	int		 s_msg;		/* message number in the folder */
	unsigned long	 s_clock;	/* sort key: time taken from the date field
					 * (or the file's mtime as a fallback) */
	char		*s_subj;	/* canonicalised subject; only set when
					 * sorting on subject */
};

static struct smsg *smsgs;	/* array of sort records, allocated by read_hdrs() */
int nmsgs;			/* number of valid entries in smsgs[] */

int subjsort;			/* sort on subject if != 0 */
int verbose;			/* report each rename if != 0 */

/*
 * Forward declarations.  The linkage given here matches the definitions
 * further down, so the helpers that are defined "static" are not first
 * seen (explicitly or implicitly) as external functions.
 */
static int dsort ();
static int subsort ();
static int read_hdrs ();
static int getws ();
static int rename_msgs ();
int rename_chain ();


/*
 * main - sort the selected messages of a folder.
 *
 * Steps, in order:
 *   1. build the argument vector from the profile entry for this
 *      program name (if any) followed by the command-line arguments;
 *   2. parse switches, an optional +folder/@folder and message specs;
 *   3. read the folder, select the messages and collect their sort
 *      keys with read_hdrs();
 *   4. sort a list of pointers to the records by date and, when
 *      subject sorting was requested, regroup that list so messages
 *      with the same subject are adjacent;
 *   5. rename the message files to match the final order and write
 *      the folder state back.
 */
/* ARGSUSED */
main (argc, argv)
	int argc;
	char **argv;
{
	int msgp = 0;
	int i;
	int msgnum;
	char *cp;
	char *maildir;
	char *datesw = NULL;
	char *folder = NULL;
	char buf[100];
	char **ap;
	char **argp;
	char *arguments[MAXARGS];
	char *msgs[MAXARGS];
	struct msgs *mp;
	struct smsg **dlist;

	/* arguments[] = profile arguments (if present) followed by argv[1..] */
	invo_name = r1bindex (argv[0], '/');
	if ((cp = m_find (invo_name)) != NULL) {
		ap = brkstring (cp = getcpy (cp), " ", "\n");
		ap = copyip (ap, arguments);
	} else
		ap = arguments;
	(void) copyip (argv + 1, ap);
	argp = arguments;

	while (cp = *argp++) {
		if (*cp == '-')
			switch (smatch (++cp, switches)) {
			case AMBIGSW:
				ambigsw (cp, switches);
				done (1);
			case UNKWNSW:
				adios (NULLCP, "-%s unknown", cp);
			case HELPSW:
				(void) sprintf(buf,
					      "%s [+folder] [msgs] [switches]",
					      invo_name);
				help (buf, switches);
				done (1);

			case DATESW:
				/* the switch takes one argument: the header field name */
				if (datesw)
					adios (NULLCP,
					       "only one date field at a time");
				if (!(datesw = *argp++) || *datesw == '-')
					adios (NULLCP, "missing argument to %s", argp[-2]);
				continue;

			case SUBJSW:
				subjsort = 1;
				continue;

			case VERBSW:
				verbose++;
				continue;
			case NVERBSW:
				verbose = 0;
				continue;
			}
		/* not a switch: either a folder name or a message spec */
		if (*cp == '+' || *cp == '@') {
			if (folder)
				adios (NULLCP, "only one folder at a time!");
			else
				folder = path (cp + 1, *cp == '+' ? TFOLDER : TSUBCWF);
		} else
			msgs[msgp++] = cp;
	}


	/* defaults: all messages, the "date" field, the folder from m_getfolder() */
	if (!m_find ("path"))
		free (path ("./", TFOLDER));
	if (!msgp)
		msgs[msgp++] = "all";
	if (!datesw)
		datesw = "date";
	if (!folder)
		folder = m_getfolder ();
	maildir = m_maildir (folder);

	if (chdir (maildir) == NOTOK)
		adios (maildir, "unable to change directory to");
	if (!(mp = m_gmsg (folder)))
		adios (NULLCP, "unable to read folder %s", folder);
	if (mp->hghmsg == 0)
		adios (NULLCP, "no messages in %s", folder);

	for (msgnum = 0; msgnum < msgp; msgnum++)
		if (!m_convert (mp, msgs[msgnum]))
			done (1);
	m_setseq (mp);

	/* fills smsgs[]; the value it returns is used as the record count */
	if ((nmsgs = read_hdrs (mp, datesw)) <= 0)
		adios (NULLCP, "no messages to sort");

	/*
	 * sort a list of pointers to our "messages to be sorted".
	 */
	/* one extra slot so the list can be null-terminated */
	dlist = (struct smsg **) malloc ((nmsgs+1) * sizeof(*dlist));
	if (! dlist)
		adios (NULLCP, "couldn't allocate sort memory");
	for (i = 0; i < nmsgs; i++)
		dlist[i] = &smsgs[i];
	dlist[nmsgs] = 0;

	qsort ((char *) dlist, nmsgs, sizeof(*dlist), dsort);

	/*
	 * if we're sorting on subject, we need another list
	 * in subject order, then a merge pass to collate the
	 * two sorts.
	 */
	if (subjsort) {
		struct smsg **slist;
		struct smsg **flist;
		register struct smsg ***il;
		register struct smsg **fp;
		register struct smsg **dp;

		/* slist: copy of dlist (terminator included), re-sorted by subject */
		slist = (struct smsg **) malloc ((nmsgs+1) * sizeof(*slist));
		if (! slist)
			adios (NULLCP, "couldn't allocate sort memory");
		bcopy ((char *)dlist, (char *)slist, (nmsgs+1)*sizeof(*slist));
		qsort ((char *)slist, nmsgs, sizeof(*slist), subsort);

		/*
		 * make an inversion list so we can quickly find
		 * the collection of messages with the same subj
		 * given a message number.
		 */
		/* il[] is indexed by message number and points into slist[] */
		il = (struct smsg ***) calloc (mp->hghsel+1, sizeof(*il));
		if (! il)
			adios (NULLCP, "couldn't allocate msg list");
		for (i = 0; i < nmsgs; i++)
			il[slist[i]->s_msg] = &slist[i];
		/*
		 * make up the final list, chronological but with
		 * all the same subjects grouped together.
		 */
		flist = (struct smsg **) malloc ((nmsgs+1) * sizeof(*flist));
		if (! flist)
			adios (NULLCP, "couldn't allocate msg list");
		fp = flist;
		/* walk the date-ordered list; dlist's null terminator ends the loop */
		for (dp = dlist; *dp;) {
			register struct smsg **s = il[(*dp++)->s_msg];

			/* see if we already did this guy */
			if (! s)
				continue;

			*fp++ = *s++;
			/*
			 * take the next message(s) if there is one,
			 * its subject isn't null and its subject
			 * is the same as this one.
			 */
			/* clearing il[] marks the follower as already emitted */
			while (*s && (*s)->s_subj[0] &&
			       strcmp((*s)->s_subj, s[-1]->s_subj) == 0) {
				il[(*s)->s_msg] = 0;
				*fp++ = *s++;
			}
		}
		*fp = 0;
		(void) free (slist);
		(void) free (dlist);
		/* the regrouped list replaces the plain date-ordered one */
		dlist = flist;
	}
	/* move the message files so their numbers follow the order in dlist */
	rename_msgs (mp, dlist);

	m_replace (pfolder, folder);
	m_sync (mp);
	m_update ();

	done (0);
}


/*
 * read_hdrs - build the table of sort records for the selected messages.
 *
 * Allocates smsgs[] with room for every message number in the range
 * mp->lowsel..mp->hghsel plus a terminating entry, then calls getws()
 * for each message marked SELECTED.  Messages for which getws() returns
 * 0 are left out of the table.  The entry after the last record has
 * s_msg set to 0.
 *
 * mp      folder description; lowsel, hghsel and msgstats[] are read.
 * datesw  name of the header field holding the date, passed to getws().
 *
 * Sets the global nmsgs and returns the same value: the number of
 * records stored in smsgs[].
 */
static int 
read_hdrs (mp, datesw)
	register struct msgs *mp;
	register char *datesw;
{
	int msgnum;
	struct tws tb;
	register struct smsg *s;

	/* NOTE(review): tb is filled in here but not read again in this function */
	twscopy (&tb, dtwstime ());

	smsgs = (struct smsg *)
		calloc ((unsigned) (mp->hghsel - mp->lowsel + 2),
			sizeof *smsgs);
	if (smsgs == NULL)
		adios (NULLCP, "unable to allocate sort storage");

	s = smsgs;
	for (msgnum = mp->lowsel; msgnum <= mp->hghsel; msgnum++) {
		if (mp->msgstats[msgnum] & SELECTED) {
			/* only advance when the message yielded usable keys */
			if (getws (datesw, msgnum, s)) {
				s->s_msg = msgnum;
				s++;
			}
		}
	}
	s->s_msg = 0;
	nmsgs = s - smsgs;

	/* the caller uses the return value as the record count */
	return (nmsgs);
}


/*
 * getws - extract the sort keys of one message.
 *
 * Opens the message file, scans its header fields with m_getfld() and
 * collects the text of the field named by datesw and, when subjsort is
 * set, of the "subject" field.  Scanning stops as soon as everything
 * that is needed has been seen, or at the end of the headers.
 *
 * datesw  name of the date field to look for (compared with uleq()).
 * msg     message number; m_name() turns it into the file name.
 * smsg    record to fill in: s_clock always, s_subj when subjsort is set.
 *
 * Returns 1 when the record was filled in, 0 when the file could not be
 * opened or m_getfld() reported a length/format error.
 *
 * If the date field is missing or cannot be parsed, a warning is printed
 * and the file's modification time is used instead.
 *
 * The subject is canonicalised in place: letters are lower-cased,
 * everything except letters, digits and ':' is dropped, and leading
 * "re:" prefixes are skipped.  s_subj may therefore point into the
 * middle of the allocated text, or at a constant empty string when the
 * message has no subject, so it must not be passed to free().
 */
static int
getws (datesw, msg, smsg)
	register char *datesw;
	int msg;
	register struct smsg *smsg;
{
	int compnum;
	register int state;
	char *msgnam;
	char buf[BUFSIZ], nam[NAMESZ];
	register struct tws *tw;
	register char *datecomp = NULLCP;
	register char *subjcomp = NULLCP;
	register FILE *in;

	if ((in = fopen (msgnam = m_name (msg), "r")) == NULL) {
		admonish (msgnam, "unable to read message");
		return (0);
	}
	for (compnum = 1, state = FLD;;) {
		switch (state = m_getfld (state, nam, buf, sizeof buf, in)) {
		case FLD:
		case FLDEOF:
		case FLDPLUS:
			compnum++;
			if (uleq (nam, datesw)) {
				/* FLDPLUS means more of this field's text follows */
				datecomp = add (buf, datecomp);
				while (state == FLDPLUS) {
					state = m_getfld (state, nam, buf, sizeof buf, in);
					datecomp = add (buf, datecomp);
				}
				/* done once the subject is not wanted or already found */
				if (!subjsort || subjcomp)
					break;
			} else if (subjsort && uleq (nam, "subject")) {
				subjcomp = add (buf, subjcomp);
				while (state == FLDPLUS) {
					state = m_getfld (state, nam, buf, sizeof buf, in);
					subjcomp = add (buf, subjcomp);
				}
				if (datecomp)
					break;
			} else {
				/* just flush this guy */
				while (state == FLDPLUS)
					state = m_getfld (state, nam, buf, sizeof buf, in);
			}
			/*
			 * NOTE(review): after FLDEOF this asks m_getfld() for
			 * another field; presumably it then reports FILEEOF --
			 * confirm against m_getfld().
			 */
			continue;

		case BODY:
		case BODYEOF:
		case FILEEOF:
			break;

		case LENERR:
		case FMTERR:
			admonish (NULLCP,
			   "format error in message %d (header #%d)",
				  msg, compnum);
			if (datecomp)
				free (datecomp);
			if (subjcomp)
				free (subjcomp);
			(void) fclose (in);
			return (0);

		default:
			adios (NULLCP, "internal error -- you lose");
		}
		/* reached only by a "break" out of the switch: stop scanning */
		break;
	}

	if (!datecomp || (tw = dparsetime (datecomp)) == NULL) {
		struct stat st;

		admonish (NULLCP, "can't parse %s field in message %d",
			  datesw, msg);

		/* use the modify time of the file as its date */
		if (fstat (fileno (in), &st) < 0)
			smsg->s_clock = 0;	/* don't read an unset st */
		else
			smsg->s_clock = st.st_mtime;
	} else {
		smsg->s_clock = twclock (tw);
	}

	if (subjsort) {
		if (!subjcomp) {
			/*
			 * no subject header: use a constant empty key.  It is
			 * never written through, unlike the allocated text
			 * handled below.
			 */
			smsg->s_subj = "";
		} else {
			register char *cp = subjcomp;
			register char *cp2 = subjcomp;
			register int c;

			/*
			 * try to make the subject "canonical": delete leading
			 * "re:", punctuation, white space & smash everything
			 * to lower case.  The ctype routines need a value in
			 * the unsigned char range, hence the cast.
			 */
			while ((c = (unsigned char) *cp++) != '\0') {
				if (isupper (c))
					*cp2++ = tolower (c);
				else if (isalnum (c) || c == ':')
					*cp2++ = c;
			}
			*cp2 = '\0';

			while (subjcomp[0] == 'r' && subjcomp[1] == 'e'
			       && subjcomp[2] == ':')
				subjcomp += 3;

			smsg->s_subj = subjcomp;
		}
	}
	(void) fclose (in);
	if (datecomp)
		free (datecomp);

	return (1);
}

/*
 * dsort - qsort() comparison routine: order by date.
 *
 * a and b point at elements of an array of "struct smsg *".  Records
 * are ordered by s_clock; records with the same clock value are ordered
 * by message number.  Returns a negative value, a positive value, or 0
 * when both keys are equal (e.g. an element compared with itself), as
 * qsort() requires of its comparison function.
 */
static int 
dsort (a, b)
	register struct smsg **a, **b;
{
	if ((*a)->s_clock < (*b)->s_clock)
		return (-1);
	if ((*a)->s_clock > (*b)->s_clock)
		return (1);

	/* same time: fall back to the message number */
	if ((*a)->s_msg < (*b)->s_msg)
		return (-1);
	if ((*a)->s_msg > (*b)->s_msg)
		return (1);

	return (0);
}


/*
 * subsort - qsort() comparison routine: order by subject.
 *
 * Compares the s_subj strings of the two records with strcmp(); when
 * the subjects are identical the result of dsort() decides.
 */
static int 
subsort (a, b)
	register struct smsg **a, **b;
{
	register int order = strcmp ((*a)->s_subj, (*b)->s_subj);

	if (order == 0)
		order = dsort (a, b);

	return (order);
}


/*
 * rename_msgs - rename the message files so that they follow mlist.
 *
 * mlist[i] points at the record (in smsgs[]) of the message that has to
 * end up with the message number currently held by smsgs[i].  No new
 * numbers are handed out; the existing numbers are permuted.
 *
 * For every position whose message has to move, the message that goes
 * there is parked under a scratch name, rename_chain() shifts the other
 * members of the same cycle into place (clearing their mlist slots, so
 * later iterations skip them), and the parked file is finally renamed
 * to its destination.  The msgstats[] entry travels with each message
 * and the folder is flagged SEQMOD.
 *
 * mp     folder description; msgstats[] and msgflags are updated.
 * mlist  final ordering; entries are zeroed as they are processed.
 *
 * Any rename() failure is fatal (adios).
 */
static 
rename_msgs (mp, mlist)
	register struct msgs *mp;
	register struct smsg **mlist;
{
	register int i, j, old, new;
	register struct smsg *sp;
	short stats;
	char f1[BUFSIZ], tmpfil[BUFSIZ];

	(void) strcpy (tmpfil, m_scratch ("", invo_name));

	for (i = 0; i < nmsgs; i++) {
		if (! (sp = mlist[i])) 
			continue;	/* did this one */

		/* index in smsgs[] of the record that belongs at position i */
		j = sp - smsgs;
		if (j == i)
			continue;	/* this one doesn't move */

		/*
		 * the guy that was msg j is about to become msg i.
		 * rename 'j' to make a hole, then rename the other
		 * guys in the chain to fill up the hole.
		 */
		old = smsgs[j].s_msg;
		new = smsgs[i].s_msg;
		(void) strcpy (f1, m_name (old));

		if (verbose)
			printf ("renaming chain from %d to %d\n", old, new);

		if (rename (f1, tmpfil) == NOTOK)
			adios (tmpfil, "unable to rename %s to ", f1);
		/* remember the status before the chain overwrites msgstats[old] */
		stats = mp->msgstats[old];

		rename_chain (mp, mlist, j, i);
		if (rename (tmpfil, m_name(new)) == NOTOK)
			adios (m_name(new), "unable to rename %s to", tmpfil);

		mp->msgstats[new] = stats;
		mp->msgflags |= SEQMOD;
	}
}

/*
 * rename_chain - shift the members of one cycle into a freed slot.
 *
 * On entry the file for position msg has been moved out of the way by
 * the caller, leaving a hole.  Each step looks up which record belongs
 * in the hole (mlist[msg]), renames that message's file to the hole's
 * number, copies its msgstats[] entry, moves the current-message
 * pointer along if it referred to the renamed message, and clears
 * mlist[msg].  The slot just vacated is the next hole.  The walk ends
 * after the message at position endmsg has been moved.
 *
 * This is a loop rather than a recursion so that a long cycle does not
 * need one stack frame (with a BUFSIZ name buffer) per message.
 *
 * mp      folder description; msgstats[] and the current message are updated.
 * mlist   final ordering, as passed to rename_msgs().
 * msg     index (into smsgs[]/mlist[]) of the hole to start from.
 * endmsg  index of the last position belonging to the chain.
 *
 * A rename() failure is fatal (adios).
 */
rename_chain (mp, mlist, msg, endmsg)
	register struct msgs *mp;
	struct smsg **mlist;
	int msg, endmsg;
{
	int nxt, old, new;
	char *newname;
	char oldname[BUFSIZ];

	for (;;) {
		nxt = mlist[msg] - smsgs;
		mlist[msg] = 0;		/* position msg is now taken care of */
		old = smsgs[nxt].s_msg;
		new = smsgs[msg].s_msg;

		/*
		 * the old name is copied before m_name() is called again;
		 * presumably m_name() reuses its result buffer -- confirm.
		 */
		(void) strcpy (oldname, m_name (old));
		newname = m_name (new);
		if (verbose)
			printf ("    %s becomes %s\n", oldname, newname);

		if (rename (oldname, newname) == NOTOK)
			adios (newname, "unable to rename %s to", oldname);

		mp->msgstats[new] = mp->msgstats[old];
		if (mp->curmsg == old)
			m_setcur (mp, new);

		if (nxt == endmsg)
			break;
		msg = nxt;		/* the slot just emptied is the next hole */
	}
}

sharat@CVL.UMD.EDU (08/07/88)

Recently  Jeff posted the question

> Has anyone written a script to sort messages by Date and Subject?
  > Sortm(1) appears to only handle date sorts, which would be the first
  > thing to do.  However, after that, I'd like something to go thru and
  > resort the messages based on their subjects (e.g. "X", "Re: X", ...)
  > and still maintain the chronological order for each new subject.

and subsequently answered the question by providing  a
`newer' sortm.  

I really wanted something like that, but the problem with
the newer version is that it can't distinguish
subject headers which are the same but come from unrelated
people.  Specifically, suppose Nancy sends me a msg on 1-1-86
titled "Hi" (and I reply); if a year later Bob sends me a msg
with the same title "Hi", the newer `sortm' puts Bob's
message after Nancy's but before several messages in 1986.

One would also like a `modified lexicographic' behavior.
What I am thinking of is that if the subject heading is the
same, and within a window of dates, sortm should do the
`Jeff' sort.  A window of dates could be something like 10
days, a normal period within which people would have sent
replies.  In my example Bob's new message falls outside the
window and thus will not be considered in the `subject'
sorting.  This will also allow multiple messages to be
`sorted' (ie X, Re:X, Re:X, Re:X posted by several people
on a mailing list).  Another option would be to sort by
subject only if the sender is the same (loses above
advantage).

Thus formally, messages are sorted by subject and dates
within blocks.  Blocks are sorted strictly on date.  (X and
Re:x count as the same subject).  Block size is dependent
on user.

Makes sense?  Suggestions?  Comments?  Do I have a kludge to
get this behaviour?  (I could of course do a sortm without
the subject and the sortm with the subject flag on specific
messages; something on the lines of using pick on the dates
and then sortm on those messages --- but it gets a trifle
hairy; i.e.  I don't know how to do that!)

----------------------------------------------------
Electronic:
sharat@cvl.umd.edu 			Arpanet/Domain
...!uunet!mimsy!sharat			Uucp