gls@corona.ATT.COM (Col. G. L. Sicherman) (03/14/91)
A few months ago _Unix World_ ran a contest to see who could write
the nicest program for stripping C++ comments.  A C++ comment is

    /* a C comment, */ or
    // everything from double-slash to end of line,

except that the key tokens don't count when they are part of a
string (or char) literal "/*".

Recently the three award winners were published: a C program, a lex
program, and a sh program.  The columnist challenged readers to find
bugs in them.

As it happens, all three programs, simple though they are, have bugs!
Challenge yourself -- for each of the following three programs, devise
a source text (preferably a valid C program) on which it fails to strip
comments properly.  (If you post your solutions, please put "SPOILER"
in the Subject: line.  I'll post answers in a couple of weeks -- if
necessary!)

The C program:

#include <stdio.h>

char *sccsID="@(#) cstrip.c 1.1 Bart J. Besseling, 8/90";

int m[9][8] = {     /* finite-state machine */
/* events: / * " ' \ \n sp ch states: */
    { 0x01,0x80,0x85,0x87,0x80,0x80,0x80,0x80 }, /* 0: hunt */
    { 0x02,0x33,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0 }, /* 1: maybe */
    { 0x02,0x02,0x02,0x02,0x02,0x80,0x02,0x02 }, /* 2: c++ */
    { 0x13,0x14,0x13,0x13,0x13,0x83,0x83,0x13 }, /* 3: c */
    { 0x10,0x13,0x13,0x13,0x13,0x83,0x83,0x13 }, /* 4: end c */
    { 0x85,0x85,0x80,0x85,0x86,0x80,0x85,0x85 }, /* 5: string */
    { 0x85,0x85,0x85,0x85,0x85,0x85,0x85,0x85 }, /* 6: \ in str */
    { 0x87,0x87,0x87,0x80,0x88,0x80,0x87,0x87 }, /* 7: char */
    { 0x87,0x87,0x87,0x87,0x87,0x87,0x87,0x87 }, /* 8: \ in char */
};

int main()  /* Input parser and output generator */
{
    register int ch, event, state;

    for (state = 0; (ch = getchar()) != EOF;) {

        /* translate character into event */
        switch (ch) {
        case '/': event = 0; break;
        case '*': event = 1; break;
        case '"': event = 2; break;
        case '\'': event = 3; break;
        case '\\': event = 4; break;
        case '\n': event = 5; break;
        case '\t':
        case ' ': event = 6; break;
        default: event = 7; break;
        }

        /* obtain next state and operation from machine */
        state = m[state & 0x0f][event];

        /* perform operation */
        if (state & 0x10) putchar(' ');
        if (state & 0x20) putchar(' ');
        if (state & 0x40) putchar('/');
        if (state & 0x80) putchar(ch);
    }
    return 0;
}

The lex program:

%Start CODE CCOM STRING CHAR CPLUS
%%
%{
char *sccsID = "@(#) sc 1.0 Andre van Dalen, 6/90";
BEGIN CODE;
%}
<STRING>([^\\]\")|(\\\\\") |
<CHAR>([^.\\]\')|(\\\\\') |
<CPLUS>\n { ECHO; BEGIN CODE; }
<CCOM>"*/" { two_space(); BEGIN CODE; }
<CCOM,CPLUS>. { output(*yytext=='\t'?'\t':' ');}
<CODE>"/*" { two_space(); BEGIN CCOM ; }
<CODE>"//" { two_space(); BEGIN CPLUS ;}
<CODE>\" { ECHO; BEGIN STRING; }
<CODE>\' { ECHO; BEGIN CHAR; }
<STRING,CODE>. { ECHO; }
%%
two_space()
{
    output(' '); output(' ');
}

main(argc, argv)
int argc;
char **argv;
{
    if (argc==1) yylex();
    else while (*++argv) {
        fclose(yyin);
        if (!(yyin=fopen(*argv,"r"))) {
            perror(*argv);
            exit(1);
        }
        yylex();
    }
    exit(0);
}

The sh program:

# @(#) sc Strip comments from a C/C++ source file
# Author: Carl Bergerson, August 1990

# set -x # Uncomment for debugging

# Define correct usage message:
USAGE="Usage: $0 [sourcefile]"

case $# in
0) sed -e 's/^#/a#/' | /lib/cpp | sed -e '/^#/d' -e 's/^a#/#/';;
1) sed -e 's/^#/a#/' $1 | /lib/cpp | sed -e '/^#/d' -e 's/^a#/#/';;
*) echo $USAGE >&2
   exit 1 ;;
esac

--
Col. G. L. Sicherman
gls@corona.att.COM