[comp.lang.c] Pattern-matching source code

gordon@osiris.cso.uiuc.edu (John Gordon) (02/14/91)

	Hi, Netters.  Here is source code for a "grep"-like function.  Use
it in your own programs.  Note that there are two listings, match.c and
esc.c .

---
John Gordon
Internet: gordon@osiris.cso.uiuc.edu        #include <disclaimer.h>
          gordon@cerl.cecer.army.mil       #include <clever_saying.h>

--------CUT HERE-----------

/***********  MATCH.C  ************************  Listing 1  ***********
 * Author:    Allen I. Holub
 * Function:  A group of subroutines to find a substring represented
 *            by a grep-like regular expression in a second string.
 * Compiler:  Any ANSI Standard C compiler.
 * Verified for: Turbo C++ 1.0, Microsoft C 6.0
 * Memory Models: any
 *
 * Compile-time switches: -DMAIN to make a small grep-like main()
 *
 *  (c) C Gazette. May be used freely as long as author and publication
 *                 are acknowledged
 *----------------------------------------------------------------------
 */
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>


#ifdef DEBUG
#define D(x) x   /* If DEBUG is defined, D(printf("hello");) expands   */
#else            /* to printf("hello"). If DEBUG isn't defined, D(...) */
#define D(x)     /* is expanded to an empty string, effectively        */
#endif           /* removing the printf() statement from the input.    */

                        /* Metacharacters in the input:         */
#define BOL     '^'     /* start-of-line anchor                 */
#define EOL     '$'     /* end-of-line anchor                   */
#define ANY     '.'     /* matches any character                */
#define CCL     '['     /* start a character class              */
#define CCLEND  ']'     /* end a character class                */
#define NCCL    '^'     /* negates character class if 1st char. */
#define CLOSURE '*'     /* Kleene closure (matches 0 or more)   */
#define PCLOSE  '+'     /* Positive closure (1 or more)         */
#define OPT     '?'     /* Optional closure (0 or 1)            */

typedef enum action     /* These are put in the pattern string  */ {
                        /* to represent metacharacters.         */
    M_BOL    = ( 0x80 | '^' ),  M_EOL   = ( 0x80 | '$' ),
    M_ANY    = ( 0x80 | '.' ),  M_CCL   = ( 0x80 | '[' ),
    M_OPT    = ( 0x80 | '?' ),  M_CLOSE = ( 0x80 | '*' ),
    M_PCLOSE = ( 0x80 | '+' )
}
action;

typedef unsigned char pattern;  /* pattern strings are unsigned char */

#define IS_ACTION(x) ((x)&0x80) /* true => element of pat. string is an   */
/* action that represents a metacharacter */

#define MAXPAT  128     /* max num. of pattern elements. Remember that   */
/* character classes require 17 pattern elements */

/*----------------------------------------------------------------------*/
#define MAPSIZE 16   /* need this many bytes for character class bit map */

    /* Advance a pointer into the pattern template   */
    /* to the next pattern element, this is a +1 for */
    /* all pattern elements but M_CCL, where you     */
    /* to skip past both the M_CCL character and the */
    /* bitmap that follows that character            */

#define ADVANCE(pat) (pat += (*pat == M_CCL) ? (MAPSIZE+1) : 1)

    /* Bitmap functions. Set bit b in the map and */
    /* test bit b to see if it was set previously */

#define SETBIT(b,map) ((map)[((b) & 0x7f) >>3] |= (1<< ((b) & 0x07)) )
#define TSTBIT(b,map) ((map)[((b) & 0x7f) >>3] &  (1<< ((b) & 0x07)) )
/*----------------------------------------------------------------------*/
#define E_NONE       0          /* Possible return values from pat_err.*/
#define E_ILLEGAL    1          /* Set in makepat() to indicate prob-   */
#define E_NOMEM      2          /* lems that came up while making the   */
#define E_PAT        3          /* pattern template.                    */

static int Error = E_NONE;              /* error flag, like errno       */
int   pat_err(){ return Error; }        /* returns current error status */

/*----------------------------------------------------------------------*/
static  pattern * doccl     (pattern *map, unsigned char *src);
extern  pattern * makepat   (char *exp);
extern  char    * match     (char *look_for, char *in_this_string);
extern  int       pat_err   (void );
extern  char    * matchs    (char *str,   pattern *pat, int ret_endp);
static  int       omatch    (char **strp, pattern *pat, char *start);
extern  char    * patcmp    (char *str,   pattern *pat, char *start);
extern  int       esc       (char **);

/*----------------------------------------------------------------------*/
pattern *makepat( exp )
char    *exp;                   /* regular expression */
{
     /*  Make a pattern template from the string pointed to by exp.
      *  Stop when '\0' or '\n' is found in exp.
      *  Return:  a pointer to the pattern template on success, NULL
      *           on failure (in which case the pat_err() subroutine
      *           will return one of the following values:
      *
      *                  E_ILLEGAL       Illegal input pattern.
      *                  E_NOMEM         out of memory
      *                  E_PAT           pattern too long.
      */

     pattern *pat;  /* pattern template being assembled    */
     pattern *cur;  /* pointer to current pattern element  */
     pattern *prev; /* pointer to previous pattern element */

     pat   = NULL;
     Error = E_ILLEGAL;
     if(!*exp || *exp=='\n' )
          goto exit;

     if( *exp==CLOSURE || *exp==PCLOSE || *exp==OPT )
          goto exit;

     Error = E_NOMEM;
     if( ! (pat = malloc(MAXPAT)) )      /* get pattern buffer       */
          goto exit;
     D( memset(pat, 0, MAXPAT); )        /* zero buffer if debugging */

     prev  = cur = pat;
     Error = E_PAT;

     while( *exp  &&  *exp != '\n' ) {
          if( cur >= &pat[MAXPAT-1] ) {
               free( pat );
               pat   = NULL;
               goto exit;
          }

          switch( *exp ) {
            case ANY:
              *cur = M_ANY;
              prev = cur++;
              ++exp;
              break;

            case BOL:       
              *cur = (cur == pat) ? M_BOL : *exp;
              prev = cur++;
              ++exp;
              break;

            case EOL:       
              *cur = (!exp[1] || exp[1]=='\n') ? M_EOL : *exp;
              prev = cur++;
              ++exp;
              break;

            case CCL:       
              if( ((cur - pat) + MAPSIZE) >= MAXPAT ) {
                   free( pat );
                   pat = NULL;
                   goto exit;   /* not enough room for bit map */
              }

              prev   = cur;
              *cur++ = M_CCL;
              exp    = doccl( cur, exp );
              cur   += MAPSIZE ;
              break;

            case OPT:  case CLOSURE:  case PCLOSE:
              switch( *prev ) {
                case M_BOL:     case M_EOL:
                case M_OPT:
                case M_PCLOSE:  case M_CLOSE:   
                  free( pat );
                  pat = NULL;
                  goto exit;
              }

              memmove( prev+1, prev, cur-prev );
              *prev = (*exp == OPT)    ? M_OPT    :
                   (*exp == PCLOSE) ? M_PCLOSE : M_CLOSE  ;
              ++cur;
              ++exp;
              break;

            default:
              prev   = cur;
              *cur++ = esc( &exp );
              break;
          }
     }

     *cur  = '\0';
     Error = E_NONE;

 exit:  return( pat );
}

/*----------------------------------------------------------------------*/
static  pattern *doccl( map, src )
pattern *src, *map;
{
     /* Set bits in the map corresponding to characters specified in
      * the src character class.  */

     int first, last, negative;
     pattern *start = src;

     ++src;                              /* skip past the [          */
     if( negative = (*src == NCCL) )     /* check for negative ccl   */
          ++src;
     start = src;                        /* start of characters in class */
     memset(map, 0, MAPSIZE);            /* bitmap initially empty       */

     while( *src && *src != CCLEND ) {
          if( *src != '-') {
               first = esc(&src);          /* Use temp. to avoid macro  */
               SETBIT( first, map );       /* side effects.             */
          }

          else if( src == start  ) {
               SETBIT( '-', map );   /* literal dash at start or end */
               ++src;
          }
          else {
               ++src;                /* skip to end-of-sequence char */
               if( *src < src[-2] ) {
                    first = *src;
                    last  = src[-2];
               }
               else {
                    first = src[-2];
                    last  = *src;
               }
               while( ++first <= last )
                    SETBIT( first, map );
               src++;
          }
     }

     if( *src == CCLEND )
          ++src;                  /* Skip CCLEND */

     if( negative )
          for( first = MAPSIZE; --first >= 0 ;)
               *map++ ^= ~0;                       /* invert all bits */

     return src;
}

/*---------------------------------------------------------------*/
char    *match( look_for, in_this_string )
char    *look_for;
char    *in_this_string;
{
     /* Return a pointer to the characters matching look_for
      * if it's in the string, else return NULL. You can use
      * the pat_err() subroutine to distinguish a bad regular
      * expression from a string-not-found condition [NULL is
      * returned from match() in both cases].
      */

     pattern     *pat;
     char        *rval = NULL;

     if( pat = makepat(look_for) )                 /* make the pattern  */ {
          rval = matchs( in_this_string, pat, 0 );  /* see if it's there */
          free( pat );
     }
     return rval;
}

/*----------------------------------------------------------------------*/
char    *matchs( str, pat, ret_endp )
char    *str;
pattern *pat;
int     ret_endp;
{
     /* Uses patcmp() to look for a match of pat anywhere in str using
      * a brute-force algorithm. str is a character string while pat is
      * a pattern template made by makepat(). Returns:
      *          1. NULL if no match was found.
      *          2. A pointer the last character satisfying the match if
      *             ret_endp is true.
      *          3. A pointer to the beginning of the matched string
      *             if ret_endp is false.
      */

     char  *start;
     char  *end = NULL;

     if( !pat )          /* This lets you do: matchs(str,makepat(...),?);*/
          return NULL;    /* without horrible consequences if makepat()  */
     /* fails.                                       */
     if( !*str ) {
          if((*pat == M_EOL) ||
               (*pat == M_BOL && (!pat[1] || pat[1]==M_EOL)))
               end = str;
     }
     else {
          start = str;   /* Do a brute-force substring search, com-   */
          while( *str )  /* paring a pattern against the input string */ {
               if( !(end = patcmp(str, pat, start)) )
                    str++;
               else                /* successful match */ {
                    if( !ret_endp )
                         end = str ;
                    break;
               }
          }
     }
     return( end );
}

/*----------------------------------------------------------------------*/
char    *patcmp( str, pat, start )              /* formerly amatch() */
char    *str, *start;
pattern *pat;
{
     /* Like strcmp, but compares str against pat. Each element of str
      * is compared with the template until either a mis-match is
      * found or the end of the template is reached. In the former
      * case a 0 is returned; in the latter, a pointer into str
      * (pointing to the last character in the matched pattern)
      * is returned. Strstart points at the first character in the
      * string, which might not be the same thing as line if the
      * search started in the middle of the string.
      */

     char *bocl,         /* beginning of closure string.         */
          *end;          /* return value: end-of-string pointer. */

     if( !pat )          /* make sure pattern is valid   */
          return( NULL );

     while( *pat ) {
          if( *pat == M_OPT ) {
               /* Zero or one matches. It doesn't matter if omatch
                * fails---it will advance str past the character on
                * success, though. Always advance the pattern past
                * both the M_OPT and the operand.
                */

               omatch( &str, ++pat, start );
               ADVANCE(pat);
          }
          else if( !(*pat == M_CLOSE || *pat == M_PCLOSE) ) {
               /* Do a simple match. Note that omatch() fails if there's
                * still something in pat but we're at end of string.
                */

               if( !omatch( &str, pat, start ) )
                    return NULL;

               ADVANCE(pat);
          }
          else            /* Process a Kleene or positive closure */ {
               if( *pat++ == M_PCLOSE )    /* one match required */
                    if( !omatch(&str, pat, start ) )
                         return NULL;

               /* Match as many as possible, zero is okay */

               bocl = str;
               while ( *str  &&  omatch(&str, pat, start)  )
                    ;

               /* 'str' now points to the character that made
                * made us fail. Try to process the rest of the
                * string. If the character following the closure
                * could have been in the closure (as in the pattern
                * "[a-z]*t") the final 't' will be sucked up in the
                * while loop. So, if the match fails, back up a notch
                * and try to match the rest of the string again,
                * repeating this process recursively until we get back
                * to the beginning of the closure. The recursion
                * goes, at most, one levels deep.
                */

               if( *ADVANCE(pat) ) {
                    for(; bocl <= str; --str )
                         if( end = patcmp(str, pat, start) )
                              break;
                    return end;
               }
               break;
          }
     }

     /* omatch() advances str to point at the next
      * character to be matched. So str points at the
      * character following the last character matched
      * when you reach the end of the template. The
      * exceptions are templates containing only a
      * BOLN or EOLN token. In these cases omatch doesn't
      * advance. Since we must return a pointer to the last
      * matched character, decrement str to make it point at
      * the end of the matched string, making sure that the
      * decrement hasn't gone past the beginning of the string.
      *
      * Note that $ is a position, not a character, but in the case
      * of a pattern ^$, a pointer to the end of line character is
      * returned. In ^xyz$, a pointer to the z is returned.
      *
      * The --str is done outside the return statement because
      * max() is often a macro that has side-effects.
      */

     --str;
     return( max(start, str) );
}

/*----------------------------------------------------------------------*/
static  int     omatch( strp, pat, start )
char    **strp;
pattern *pat;
char    *start;
{
     /*  Match one pattern element, pointed at by pat, against the
      *  character at **strp. Return 0 on a failure, 1 on success.
      *  *strp is advanced to skip over the matched character on a
      *  successful match. Closure is handled one level up by patcmp().
      *
      *  "start" points at the character at the left edge of the
      *  line. This might not be the same thing as *strp if the
      *  search is starting in the middle of the string. An end-of-
      *  line anchor matches '\n' or '\0'.
      */

     int advance = -1;   /* amount to advance *strp, -1 == error  */

     switch( *pat ) {
       case M_BOL:                         /* First char in string? */
         if( *strp == start )        /* Only one star here.   */
              advance = 0;
         break;

       case M_ANY:                         /* . = anything but newline */
         if( **strp != '\n' )
              advance = 1;
         break;

       case M_EOL:
         if( **strp == '\n'  ||  **strp == '\0' )
              advance = 0;
         break;

       case M_CCL:
         if( TSTBIT( **strp, pat + 1 ) )
              advance = 1;
         break;

       default:                            /* literal match */
         if( **strp == *pat )
              advance = 1;
         break;
     }

     if( advance > 0 )
          *strp += advance;

     return( advance + 1 );
}

/*----------------------------------------------------------------------*/
#ifdef MAIN
main( int argc, char **argv )
{
     static pattern *pat;
     static FILE    *inp;
     static char     inp_buf[ 1024 ];

     if( argc < 2 || argv[1][0]=='-' ) {
          fprintf(stderr, "Usage: match reg_exp filename\n");
          fprintf(stderr, "Usage: match reg_exp < filename\n");
          exit( 1 );
     }

     if( !(pat = makepat( argv[1] )) ) {
          fprintf(stderr, "Can't make expression template\n");
          exit( 1 );
     }

     if( argc == 2 )
          inp = stdin;
     else if( !(inp = fopen(argv[2],"r")) ) {
          perror( argv[2] );
          exit( 1 );
     }

     while( fgets( inp_buf, sizeof(inp_buf), inp ) )
          if( matchs(inp_buf, pat, 0) )
               fputs( inp_buf, stdout );

     exit( 0 );
}
#endif

/***********  Listing 2  ******************  ESC.C  *******************
 * Program:       ESC.C
 * Author:        Allen I. Holub
 * Function:      A subroutine to parse escape sequences
 * Compiler:      Any.
 * Memory Models: Any
 *
 *  (c) 1990 C Gazette. Use freely if authorship acknowledged.
 *----------------------------------------------------------------------
 */

#include <stdio.h>
#include <ctype.h>


static int hex2bin ( int c );
static int oct2bin ( int c );

/*------------------------------------------------------------*/
#define ISHEXDIGIT(x) (isdigit(x)                       \
                            || ('a'<=(x) && (x)<='f')   \
                            || ('A'<=(x) && (x)<='F')   )

#define ISOCTDIGIT(x) ('0'<=(x) && (x)<='7')

static int hex2bin(c)
int     c;
{
     /* Convert the hex digit represented by 'c' to an int. 'c'
      * must be one of: 0123456789abcdefABCDEF
      */
     return (isdigit(c) ? (c)-'0': ((toupper(c))-'A')+10)  & 0xf;
}

static int oct2bin(c)
int     c;
{
     /* Convert the hex digit represented by 'c' to an int. 'c'
      * must be a digit in the range '0'-'7'.
      */
     return ( ((c)-'0')  &  0x7 );
}

/*------------------------------------------------------------*/

int     esc(s)
char    **s;
{
     /* Map escape sequences into their equivalent symbols. Return
      * the equivalent ASCII character. *s is advanced past the
      * escape sequence. If no escape sequence is present, the
      * current character is returned and the string is advanced by
      * one. The following are recognized:
      *
      *  \b     backspace
      *  \f     formfeed
      *  \n     newline
      *  \r     carriage return
      *  \s     space
      *  \t     tab
      *  \e     ASCII ESC character ('\033')
      *  \DDD   number formed of 1-3 octal digits
      *  \xDDD  number formed of 1-3 hex digits
      *  \^C    C = any letter. Control code
      */

     int rval;

     if( **s != '\\' )
          rval = *( (*s)++ );
     else {
          ++(*s);                                 /* Skip the \ */
          switch( toupper(**s) ) {
            case '\0':  rval = '\\';             break;
            case 'B':   rval = '\b' ;            break;
            case 'F':   rval = '\f' ;            break;
            case 'N':   rval = '\n' ;            break;
            case 'R':   rval = '\r' ;            break;
            case 'S':   rval = ' '  ;            break;
            case 'T':   rval = '\t' ;            break;
            case 'E':   rval = '\033';           break;

            case '^':   
              rval = *++(*s) ;
              rval = toupper(rval) - '@' ;
              break;

            case 'X':   
              rval = 0;
              ++(*s);
              if( ISHEXDIGIT(**s) ) {
                   rval  = hex2bin( *(*s)++ );
              }
              if( ISHEXDIGIT(**s) ) {
                   rval <<= 4;
                   rval  |= hex2bin( *(*s)++ );
              }
              if( ISHEXDIGIT(**s) ) {
                   rval <<= 4;
                   rval  |= hex2bin( *(*s)++ );
              }
              --(*s);
              break;

            default:
              if( !ISOCTDIGIT(**s) )
                   rval = **s;
              else {
                   ++(*s);
                   rval = oct2bin( *(*s)++ );
                   if( ISOCTDIGIT(**s) ) {
                        rval <<= 3;
                        rval  |= oct2bin( *(*s)++ );
                   }
                   if( ISOCTDIGIT(**s) ) {
                        rval <<= 3;
                        rval  |= oct2bin( *(*s)++ );
                   }
                   --(*s);
              }
              break;
          }
          ++(*s);
     }
     return rval;
}

rsalz@bbn.com (Rich Salz) (02/16/91)

In <1991Feb14.143705.3911@ux1.cso.uiuc.edu> gordon@osiris.cso.uiuc.edu (John Gordon) writes:
|/***********  MATCH.C  ************************  Listing 1  ***********
| * Author:    Allen I. Holub
| * Function:  A group of subroutines to find a substring represented
| *            by a grep-like regular expression in a second string.
|typedef enum action     /* These are put in the pattern string  */ {
|                        /* to represent metacharacters.         */
|    M_BOL    = ( 0x80 | '^' ),  M_EOL   = ( 0x80 | '$' ),
...
|}
|action;
|
|typedef unsigned char pattern;  /* pattern strings are unsigned char */

Note that this code assumes 7bit chars.  It won't work with extended alphabets,
or even on a PC where you could have graphics characters.
	/r$
-- 
Please send comp.sources.unix-related mail to rsalz@uunet.uu.net.
Use a domain-based address or give alternate paths, or you may lose out.

oz@yunexus.yorku.ca (Ozan Yigit) (02/19/91)

In <1991Feb14.143705.3911@ux1.cso.uiuc.edu> gordon@osiris.cso.uiuc.edu
(John Gordon) writes:

>        Hi, Netters.  Here is source code for a "grep"-like function.  Use
>it in your own programs.

[Holub's verbose implementation of ad-hoc regex omitted]

Hi Gordon, Holub's regex stuff is old hat. A PD implementation of all of
this + more in a package that is a clone of BSD regex library was posted to
this net around 1986. A much more advanced [read proper] and influential
implementation of regular expression package by Henry Spencer was also
posted around that time, and has been in heavy use ever since. I think you
may be much better served by one of these packages in your programs.

oz
---
We only know ... what we know, and    | Internet: oz@nexus.yorku.ca
that is very little. -- Dan Rather    | UUCP: utzoo/utai!yunexus!oz