[comp.lang.c] Unix World contest

gls@corona.ATT.COM (Col. G. L. Sicherman) (03/14/91)

A few months ago _Unix World_ ran a contest to see who could write
the nicest program for stripping C++ comments.  A C++ comment is

	/* a C
	   comment, */
or
	// everything from double-slash to end of line,

except that the key tokens don't count when they are part of
a string (or char) literal "/*".

Recently the three award winners were published: a C program,
a lex program, and a sh program.  The columnist challenged
readers to find bugs in them.  As it happens, all three programs,
simple though they are, have bugs!

Challenge yourself -- for each of the following three programs,
devise a source text (preferably a valid C program) on which it
fails to strip comments properly.  (If you post your solutions,
please put "SPOILER" in the Subject: line.  I'll post answers
in a couple of weeks -- if necessary!)

	The C program:

#include <stdio.h>
char *sccsID="@(#) cstrip.c 1.1 Bart J. Besseling, 8/90";

int m[9][8]  = { /* finite-state machine */
/* events:
        /    *    "    '    \   \n   sp    ch    states: */
    { 0x01,0x80,0x85,0x87,0x80,0x80,0x80,0x80 }, /* 0: hunt */
    { 0x02,0x33,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0 }, /* 1: maybe */
    { 0x02,0x02,0x02,0x02,0x02,0x80,0x02,0x02 }, /* 2: c++ */
    { 0x13,0x14,0x13,0x13,0x13,0x83,0x83,0x13 }, /* 3: c */
    { 0x10,0x13,0x13,0x13,0x13,0x83,0x83,0x13 }, /* 4: end c */
    { 0x85,0x85,0x80,0x85,0x86,0x80,0x85,0x85 }, /* 5: string */
    { 0x85,0x85,0x85,0x85,0x85,0x85,0x85,0x85 }, /* 6: \ in str */
    { 0x87,0x87,0x87,0x80,0x88,0x80,0x87,0x87 }, /* 7: char */
    { 0x87,0x87,0x87,0x87,0x87,0x87,0x87,0x87 }, /* 8: \ in char */
};

int
main() /* Input parser and output generator */
{
    register int    ch, event, state;

    for (state = 0; (ch = getchar()) != EOF;) {
        /* translate character into event */
        switch (ch) {
            case '/':   event = 0; break;
            case '*':   event = 1; break;
            case '"':   event = 2; break;
            case '\'':  event = 3; break;
            case '\\':  event = 4; break;
            case '\n':  event = 5; break;
            case '\t':
            case ' ':   event = 6; break;
            default:    event = 7; break;
        }
        /* obtain next state and operation from machine */
        state = m[state & 0x0f][event];
        /* perform operation */
        if (state & 0x10) putchar(' ');
        if (state & 0x20) putchar(' ');
        if (state & 0x40) putchar('/');
        if (state & 0x80) putchar(ch);
    }
    return 0;
}

	The lex program:

%Start CODE CCOM STRING CHAR CPLUS
%%
%{
	char *sccsID = "@(#) sc 1.0 Andre van Dalen, 6/90";
	BEGIN CODE;
%}
<STRING>([^\\]\")|(\\\\\")	|
<CHAR>([^.\\]\')|(\\\\\')	|
<CPLUS>\n	{ ECHO; BEGIN CODE; }
<CCOM>"*/"	{ two_space(); BEGIN CODE; }
<CCOM,CPLUS>.	{ output(*yytext=='\t'?'\t':' ');} 
<CODE>"/*"	{ two_space(); BEGIN CCOM ; }
<CODE>"//"	{ two_space(); BEGIN CPLUS ;}
<CODE>\"	{ ECHO; BEGIN STRING; }
<CODE>\'	{ ECHO; BEGIN CHAR; }
<STRING,CODE>.	{ ECHO; }
%%
two_space()
{
	output(' '); output(' ');
}
main(argc, argv)
int argc; char **argv;
{
	if (argc==1) yylex();
	else while (*++argv) {
		fclose(yyin);
		if (!(yyin=fopen(*argv,"r"))) {
			perror(*argv);
			exit(1);
		}
		yylex();
	}
	exit(0);
}

	The sh program:

# @(#) sc  Strip comments from a C/C++ source file
# Author: Carl Bergerson, August 1990
# set -x    # Uncomment for debugging
# Define correct usage message:
USAGE="Usage: $0 [sourcefile]"
case $# in
    0)  sed -e 's/^#/a#/' | /lib/cpp |
        sed -e '/^#/d' -e 's/^a#/#/';;
    1)  sed -e 's/^#/a#/' $1 | /lib/cpp |
        sed -e '/^#/d' -e 's/^a#/#/';;
    *)  echo $USAGE >&2
        exit 1 ;;
esac
-- 
Col. G. L. Sicherman
gls@corona.att.COM