[comp.mail.elm] Parsing ARPA Dates

solomon@gjetost.cs.wisc.edu (Marvin Solomon) (06/22/89)
A documented "feature" of elm is that it ignores time zones when parsing
dates.  The result is that even when I use the "sort by time sent"
feature, I frequently see responses to postings from Europe before I
see the original message.  I decided to fix that, and found, to my
shock and amazement, that the routine parse_arpa_date() in src/addr_util.c,
despite its name, doesn't come close to parsing "arpa" (i.e., rfc822) dates.
For example, it immediately gives up if the date contains fewer than
6 words, even though the date
	9 Jun 89 18:30:57 GMT
is perfectly legitimate.  If this routine cannot handle the date in a Date:
line, it silently ignores it, using the date on the "From " line instead.

I completely rewrote parse_arpa_date() to accept all legal rfc822 dates
as well as a variety of deviant syntaxes discovered by looking through a
large sample of messages (for example, a version of the Unix date format,
which puts the month name before the day of the month--which seems to
be used by mush, the Mail Users' Shell).  I also try to interpret the time
zone indication, if present, and adjust the date to GMT, so that date
comparisons work out right.  To preserve the interface between
parse_arpa_date() and the rest of the program, I translate the results
back to a symbolic form.  This is silly, since it appears that the only
use made of this information is to do date comparisons, which requires
decoding it all over again.  It would make much more sense to simply
translate dates into "Unix time" (seconds past midnight Jan 1, 1970 GMT), but
I didn't want to change too much of the program.

I recognize all the time zones listed in rfc822 as well as others
gleaned from a variety of sources.  No doubt there are zones I've missed
(for example, I don't have Indian time).  In cases of conflicting use
of zone names, I've favored the US version (e.g. CST is -6:00 for central
US rather than +10:30 for central Australia).

The one exception to upward compatibility with 822 is that I don't recognize
"military" one-letter zones except for "Z" (==GMT, commonly called "Zulu
time").  The other zones seem to be rarely used (but perhaps that's because
I don't get much correspondence from soldiers :-), and besides, rfc822 has
it backwards.

This is not an "official" patch.  Perhaps the developers would like to try
it out before "blessing" it.  Others can install and try it at their own
risk.

*** /tmp/,RCSt1a06365	Wed Jun 21 10:40:21 1989
--- addr_util.c	Wed Jun 21 10:40:15 1989
***************
*** 1,8 ****
  
! static char rcsid[] = "@(#)$Id: addr_util.c,v 2.16 89/04/24 20:34:42 syd Exp $";
  
  /*******************************************************************************
!  *  The Elm Mail System  -  $Revision: 2.16 $   $State: Exp $
   *
   * 			Copyright (c) 1986, 1987 Dave Taylor
   * 			Copyright (c) 1988, 1989 USENET Community Trust
--- 1,8 ----
  
! static char rcsid[] = "@(#)$Id: addr_util.c,v 2.17 89/06/21 10:40:10 solomon Stab $";
  
  /*******************************************************************************
!  *  The Elm Mail System  -  $Revision: 2.17 $   $State: Stab $
   *
   * 			Copyright (c) 1986, 1987 Dave Taylor
   * 			Copyright (c) 1988, 1989 USENET Community Trust
***************
*** 14,21 ****
   *
   *******************************************************************************
   * $Log:	addr_util.c,v $
!  * Revision 2.16  89/04/24  20:34:42  syd
!  * checked in with -k by solomon at 89.06.21.10.28.50.
   * 
   * Revision 2.16  89/04/24  20:34:42  syd
   * Fix month lower case
--- 14,22 ----
   *
   *******************************************************************************
   * $Log:	addr_util.c,v $
!  * Revision 2.17  89/06/21  10:40:10  solomon
!  * Replace parse_arpa_date() with a routine that actually parses ARPA
!  * dates (and somewhat more).  Properly take time zones into consideration.
   * 
   * Revision 2.16  89/04/24  20:34:42  syd
   * Fix month lower case
***************
*** 625,634 ****
--- 626,1043 ----
  	}
  }
  
+ #ifdef UW
+ /* Revised version of parse_arpa_date() by
+  * Marvin Solomon <solomon@cs.wisc.edu>, June 1989.
+  * The original version ignored the time zone, making sorting by date sent
+  * (for example) much less useful.  When I went to fix that, I found that
+  * the syntax accepted didn't seem to correspond to much of anything--it
+  * certainly wasn't even a subset of what rfc822 specifies.  I wrote the
+  * following ad hoc parsing routines, which accept all of 822 plus some
+  * of the more common violations that I have seen in my incoming mail.
+  *
+  * It would make much more sense to simply translate dates into "Unix time"
+  * (seconds past midnight Jan 1, 1970), but the rest of this program wants
+  * everything in symbolic form, and I'm not about to change that.
+  */
+ /*
+ Quoting from RFC 822:
+      5.  DATE AND TIME SPECIFICATION
+ 
+      5.1.  SYNTAX
+ 
+      date-time   =  [ day "," ] date time        ; dd mm yy
+ 						 ;  hh:mm:ss zzz
+ 
+      day         =  "Mon"  / "Tue" /  "Wed"  / "Thu"
+ 		 /  "Fri"  / "Sat" /  "Sun"
+ 
+      date        =  1*2DIGIT month 2DIGIT        ; day month year
+ 						 ;  e.g. 20 Jun 82
+ 
+      month       =  "Jan"  /  "Feb" /  "Mar"  /  "Apr"
+ 		 /  "May"  /  "Jun" /  "Jul"  /  "Aug"
+ 		 /  "Sep"  /  "Oct" /  "Nov"  /  "Dec"
+ 
+      time        =  hour zone                    ; ANSI and Military
+ 
+      hour        =  2DIGIT ":" 2DIGIT [":" 2DIGIT]
+ 						 ; 00:00:00 - 23:59:59
+ 
+      zone        =  "UT"  / "GMT"                ; Universal Time
+ 						 ; North American : UT
+ 		 /  "EST" / "EDT"                ;  Eastern:  - 5/ - 4
+ 		 /  "CST" / "CDT"                ;  Central:  - 6/ - 5
+ 		 /  "MST" / "MDT"                ;  Mountain: - 7/ - 6
+ 		 /  "PST" / "PDT"                ;  Pacific:  - 8/ - 7
+ 		 /  1ALPHA                       ; Military: Z = UT;
+ 						 ;  A:-1; (J not used)
+ 						 ;  M:-12; N:+1; Y:+12
+ 		 / ( ("+" / "-") 4DIGIT )        ; Local differential
+ 						 ;  hours+min. (HHMM)
+ */
+ 
+ #define SKIP_WS(p) while (isspace(*p)) p++	/* advance p past whitespace; p is expanded unparenthesized, so pass a plain pointer variable */
+ #define SKIP_ALPHA(p) while (isalpha(*p)) p++	/* advance p past letters */
+ #define SKIP_DIGITS(p) while (isdigit(*p)) p++	/* advance p past decimal digits */
+ 
+ static char *day_name[8] = {	/* lower-case day abbreviations; the trailing 0 ends the scan in prefix() */
+     "sun", "mon", "tue", "wed", "thu", "fri", "sat", 0
+ };
+ 
+ static char *month_name[13] = {	/* lower-case month abbreviations; the index is the 0-based month number */
+     "jan", "feb", "mar", "apr",
+     "may", "jun", "jul", "aug",
+     "sep", "oct", "nov", "dec", 0	/* trailing 0 ends the scan in prefix() */
+ };
+ 
+ static int month_len[12] = {	/* days per month, indexed like month_name */
+     31, 28, 31, 30, 31, 30, 31,	/* entry [1] (Feb) is overwritten by parse_arpa_date() on every call */
+     31, 30, 31, 30, 31 };
+ 
+ /* Table of known time-zone names (lower case; matched exactly by tz_lookup())
+  * and their offsets.  The zones are taken from a variety of sources.  They
+  * are by no means exhaustive, but seem to include most of those in common
+  * usage.  A comprehensive list is impossible, since the same abbreviation is
+  * sometimes used to mean different things in different parts of the world.
+  */
+ static struct tzone {
+     char *str;
+     int offset; /* offset, in minutes, EAST of GMT */
+ } tzone_info[] = {
+     /* the following are from rfc822 */
+     "ut", 0, "gmt", 0,
+     "est", -5*60, "edt", -4*60,
+     "cst", -6*60, "cdt", -5*60,
+     "mst", -7*60, "mdt", -6*60,
+     "pst", -8*60, "pdt", -7*60,
+     "z", 0, /* zulu time (the rest of the military codes are bogus) */
+ 
+     /* these are also popular in Europe; NOTE(review): parse_arpa_date() cuts the zone at the first whitespace, so the "xxx dst" names look unreachable from there */
+     "wet", 0*60, "wet dst", 1*60, /* western european */
+     "met", 1*60, "met dst", 2*60, /* middle european */
+     "eet", 2*60, "eet dst", 3*60, /* eastern european */
+     "bst", 1*60, /* ??? british summer time (=+0100) */
+ 
+     /* ... and Canada */
+     "ast", -4*60, "adt", -3*60, /* atlantic */
+     "nst", -3*60-30, "ndt", -2*60-30, /* newfoundland */
+     "yst", -9*60, "ydt", -8*60, /* yukon */
+     "hst", -10*60, /* hawaii (not really canada) */
+ 
+     /* ... and Asia */
+     "jst", 9*60, /* japan */
+     "sst", 8*60, /* singapore */
+ 
+     /* ... and the South Pacific */
+     "nzst", 12*60, "nzdt", 13*60, /* new zealand */
+     "wst", 8*60, "wdt", 9*60, /* western australia */
+     /* there's also central and eastern australia, but they insist on using
+      * cst, est, etc., which would be indistinguishable from the US zones */
+      (char *)0, 0 /* end-of-table marker: tz_lookup() stops at the null str */
+ };
+ 
+ /* Translate a symbolic timezone name (e.g. "edt" or "nzst") to a number of
+  * minutes *east* of gmt (if the local time is t, the gmt equivalent is
+  * t - tz_lookup(zone)).  str must exactly equal a (lower-case) tzone_info name.
+  * Return 0 if the timezone is not recognized, which is the same as GMT.
+  */
+ static int tz_lookup(str)
+ char *str;
+ {
+     struct tzone *p; /* walks tzone_info up to the null-str end marker */
+ 
+     for (p = tzone_info; p->str; p++) {
+ 	if (strcmp(p->str,str)==0) return p->offset; /* whole-string match only */
+     }
+     dprint(5,(debugfile,"unknown time zone %s\n",str));
+     return 0;
+ }
+ 
+ /* Return smallest i such that table[i] is a prefix of str.  Return -1 if not
+  * found.  table must end with a null pointer; each entry is compared over its
+  * own length (not table[0]'s), so entries need not all be the same length. */
+ static int prefix(table, str)
+ char **table;
+ char *str;
+ {
+     int i;
+ 
+     for (i=0;table[i];i++)
+ 	if (strncmp(table[i],str,strlen(table[i]))==0)
+ 	    return i;
+     return -1;
+ }
+ 
+ /* The following routines, get_XXX(p,...), expect p to point to a string
+  * of the appropriate syntax.  They return decoded values in result parameters,
+  * and return p updated to point past the parsed substring (also stripping
+  * trailing whitespace).
+  * Return 0 on syntax errors.
+  */
+ 
+ /* Parse a year: ['1' '9'] digit digit WS.  On success store 0..99 in *result
+  * and return p advanced past the digits and any whitespace; return 0 on error. */
+ static char *
+ get_year(p, result)
+ char *p;
+ int *result;
+ {
+     int year;
+ 
+     if (!isdigit(*p)) {
+ 	dprint(5,(debugfile,"missing year: %s\n",p));
+ 	return 0;
+     }
+     year = atoi(p);
+     /* be nice and allow 19xx, although that's not really kosher */
+     if (year>=1900 && year <=1999) year -= 1900;
+     if (year<0 || year>99) { /* anything else outside two digits is rejected */
+ 	dprint(5,(debugfile,"ridiculous year %d\n",year));
+ 	return 0;
+     }
+     SKIP_DIGITS(p);
+     SKIP_WS(p);
+     *result = year;
+     return p;
+ }
+ 
+ /* Parse a time: hours ':' minutes [ ':' seconds ] WS
+  * Check that 0<=hours<24, 0<=minutes,seconds<60 (in both syntaxes).
+  * Also allow the syntax "digit digit digit digit" with implied ':' in the
+  * middle, i.e. hhmm is read as hours = hhmm/100, minutes = hhmm%100.
+  * Store minutes since midnight in *m and seconds in *s.  Return p advanced
+  * past the time and trailing whitespace, or 0 on a syntax/range error. */
+ static char *
+ get_time(p,m,s)
+ char *p;
+ int *m, *s;
+ {
+     int hours, minutes, seconds;
+ 
+     /* hour */
+     if (!isdigit(*p)) {
+ 	dprint(5,(debugfile,"missing time: %s\n",p));
+ 	return 0;
+     }
+     hours = atoi(p);
+     SKIP_DIGITS(p);
+     if (*p != ':') {
+ 	/* perhaps they just wrote hhmm instead of hh:mm; leave p alone so */
+ 	/* that we never step over the character (maybe NUL) that ended it */
+ 	minutes = hours % 100;
+ 	hours /= 100;
+     }
+     else minutes = atoi(++p); /* step over the ':' to the minutes */
+     if (hours<0 || hours>23) {
+ 	dprint(5,(debugfile,"ridiculous hour: %d\n",hours));
+ 	return 0;
+     }
+     if (minutes<0 || minutes>59) {
+ 	dprint(5,(debugfile,"ridiculous minutes: %d\n",minutes));
+ 	return 0;
+     }
+     /* skip the minutes digits, if any (a no-op in the hhmm case) */
+     SKIP_DIGITS(p);
+     if (*p == ':') {
+ 	p++;
+ 	seconds = atoi(p);
+ 	if (seconds<0 || seconds>59) {
+ 	    dprint(5,(debugfile,"ridiculous seconds: %d\n",seconds));
+ 	    return 0;
+ 	}
+ 	SKIP_DIGITS(p);
+     }
+     else seconds = 0;
+     minutes += hours*60;
+     SKIP_WS(p);
+     *m = minutes;
+     *s = seconds;
+     return p;
+ }
+ 
+ /* Parse a Unix date from which the leading week-day has been stripped.
+  * The syntax is "Jun 21 06:45:44 CDT 1989" with timezone optional.
+  * i.e., month day time [ zone ] year
+  * where day::=digit*, year and time are as defined above, month is an alpha
+  * string starting with a known 3-char prefix, and zone is an alpha string.
+  * The month has already been processed by the caller, so we just skip over
+  * a leading alpha* WS.  Results go to *y (year), *d (day of month), *m and
+  * *s (as from get_time), and *t (zone offset from tz_lookup, 0 if absent).
+  * The zone word is NUL-terminated in place only while it is looked up.
+  * Unlike the preceding routines, the result is not an updated pointer, but
+  * simply 1 for success and 0 for failure. */
+ static int
+ get_unix_date(p,y,d,m,s,t)
+ char *p;
+ int *y, *d, *m, *s, *t;
+ {
+     char *q, save; /* end of the zone word and the character found there */
+     SKIP_ALPHA(p);
+     SKIP_WS(p);
+     if (!isdigit(*p)) return 0;
+     *d = atoi(p);  /* check the value for sanity after we know the month */
+     SKIP_DIGITS(p);
+     SKIP_WS(p);
+     p = get_time(p,m,s);
+     if (!p) return 0;
+     if (isalpha(*p)) {
+ 	for (q=p; isalpha(*q); q++) continue; /* find the end of the zone word */
+ 	save = *q; *q = 0; *t = tz_lookup(p); *q = save; /* look up bare name */
+ 	p = q; SKIP_WS(p);
+     }
+     else *t = 0;
+     p = get_year(p,y);
+     if (!p) return 0;
+     return 1;
+ }
+ 
+ 
+ /* Parse an rfc822 (with extensions) date in string, shift it to GMT, and store it in symbolic form in
+  * entry->dayname, year, month, day and time.  Return 1 on success, 0 on failure (dayname may already be set). */
  parse_arpa_date(string, entry)
  char *string;
  struct header_rec *entry;
  {
+     char buffer[BUFSIZ], *p, *q;
+     int mday, month, year, minutes, seconds, tz;
+ 
+     /* first get everything into lower case (leave room for the final NUL) */
+     for (p=buffer, q=buffer+sizeof buffer-1; *string && p<q; p++, string++) {
+ 	*p = isupper(*string) ? tolower(*string) : *string;
+     }
+     *p = 0;
+     p = buffer;
+     SKIP_WS(p);
+ 
+     if (prefix(day_name,p)>=0) {
+ 	/* accept anything that *starts* with a valid day name */
+ 	/* also, don't check whether it's right! */
+ 
+ 	(void)strncpy(entry->dayname, p, 3);
+ 	entry->dayname[3] = 0;
+ 	SKIP_ALPHA(p);
+ 	SKIP_WS(p);
+ 
+ 	if (*p==',') {
+ 	    p++;
+ 	    SKIP_WS(p);
+ 	}
+ 	/* A comma is required here, but we'll be nice guys and look the other
+ 	 * way if it's missing.
+ 	 */
+     }
+ 
+     /* date */
+ 
+     /* day of the month */
+     if (!isdigit(*p)) {
+ 	/* Missing day.  Maybe this is a Unix date?
+ 	 */
+ 	month = prefix(month_name,p);
+ 	if (month >= 0 &&
+ 	    get_unix_date(p, &year, &mday, &minutes, &seconds, &tz)) {
+ 		goto got_date;
+ 	}
+ 	dprint(5,(debugfile,"missing day: %s\n",p));
+ 	return 0;
+     }
+     mday = atoi(p);  /* check the value for sanity after we know the month */
+     SKIP_DIGITS(p);
+     SKIP_WS(p);
+ 
+     /* month name */
+     month = prefix(month_name,p);
+     if (month < 0) {
+ 	dprint(5,(debugfile,"missing month: %s\n",p));
+ 	return 0;
+     }
+     SKIP_ALPHA(p);
+     SKIP_WS(p);
+ 
+     /* year */
+     if (!(p = get_year(p,&year))) return 0;
+ 
+     /* time */
+     if (!(p = get_time(p,&minutes,&seconds))) return 0;
+ 
+     /* zone: a single whitespace-free word, NUL-terminated in place */
+     for (q=p; *q && !isspace(*q); q++) continue;
+     *q = 0;
+     if (*p=='-' || *p=='+') {
+ 	char sign = *p++;
+ 
+ 	if (isdigit(*p)) {
+ 	    int i;
+ 
+ 	    for (i=0; i<4; i++) {
+ 		if (!isdigit(p[i])) {
+ 		    dprint(5,(debugfile,"ridiculous numeric timezone: %s\n",p));
+ 		    return 0;
+ 		}
+ 		p[i] -= '0';
+ 	    }
+ 	    tz = (p[0]*10 + p[1])*60 + p[2]*10 + p[3];
+ 	    if (sign=='-') tz = -tz;
+ 	}
+ 	else {
+ 	    /* some brain-damaged dates use a '-' before a symbolic time zone */
+ 	    SKIP_WS(p);
+ 	    tz = tz_lookup(p);
+ 	}
+     }
+     else tz = tz_lookup(p);
+ 
+ got_date:
+     month_len[1] = (year%4) ? 28 : 29;
+     if (mday<1 || mday>month_len[month]) { /* days of the month are 1-based */
+ 	dprint(5,(debugfile,"ridiculous day %d of month %d\n",mday,month));
+ 	return 0;
+     }
+ 
+     /* shift everything to UTC (aka GMT), one whole day (24*60 min) at a time */
+     minutes -= tz;
+     if (tz > 0) { /* east of Greenwich */
+ 	while (minutes < 0) {
+ 	    if (--mday < 1) {
+ 		if (--month < 0) {
+ 		    year--; /* don't worry about 1900! */
+ 		    month = 11;
+ 		}
+ 		mday = month_len[month]; /* last day of the previous month */
+ 	    }
+ 	    minutes += 24*60;
+ 	}
+     }
+     if (tz < 0) { /* west of Greenwich */
+ 	while (minutes >= 24*60) {
+ 	    if (++mday > month_len[month]) {
+ 		if (++month >= 12) {
+ 		    year++; /* don't worry about 1999! */
+ 		    month = 0;
+ 		}
+ 		mday = 1; /* first day of the next month */
+ 	    }
+ 	    minutes -= 24*60;
+ 	}
+     }
+ 
+     /* convert back to symbolic form (silly, but the rest of the program
+      * expects it and I'm not about to change all that!)
+      */
+     sprintf(entry->year, "%02d", year);
+     sprintf(entry->month, "%s", month_name[month]);
+     entry->month[0] = toupper(entry->month[0]);
+     sprintf(entry->day, "%d", mday);
+     sprintf(entry->time, "%02d:%02d:%02d",minutes/60,minutes%60,seconds);
+     return 1;
+ }
+ #else /* UW */
+ /* old version */
+ parse_arpa_date(string, entry)
+ char *string;
+ struct header_rec *entry;
+ {
  	/** Parse and figure out the given date format... return
  	    the entry fields changed iff it turns out we have a
  	    valid parse of the date!  **/
***************
*** 737,742 ****
--- 1146,1153 ----
  	strcpy(entry->month, shift_lower(entry->month));
  	entry->month[0] = toupper(entry->month[0]);
  }
+ /* end of old version */
+ #endif /* UW */
  
  fix_arpa_address(address)
  char *address;
--
	Marvin Solomon
	Computer Sciences Department
	University of Wisconsin, Madison WI
	solomon@cs.wisc.edu