gordon@osiris.cso.uiuc.edu (John Gordon) (02/14/91)
Hi, Netters. Here is source code for a "grep"-like function. Use it in your own programs. Note that there are two listings, match.c and esc.c . --- John Gordon Internet: gordon@osiris.cso.uiuc.edu #include <disclaimer.h> gordon@cerl.cecer.army.mil #include <clever_saying.h> --------CUT HERE----------- /*********** MATCH.C ************************ Listing 1 *********** * Author: Allen I. Holub * Function: A group of subroutines to find a substring represented * by a grep-like regular expression in a second string. * Compiler: Any ANSI Standard C compiler. * Verified for: Turbo C++ 1.0, Microsoft C 6.0 * Memory Models: any * * Compile-time switches: -DMAIN to make a small grep-like main() * * (c) C Gazette. May be used freely as long as author and publication * are acknowledged *---------------------------------------------------------------------- */ #include <stdio.h> #include <stdlib.h> #include <ctype.h> #include <string.h> #ifdef DEBUG #define D(x) x /* If DEBUG is defined, D(printf("hello");) expands */ #else /* to printf("hello"). If DEBUG isn't defined, D(...) */ #define D(x) /* is expanded to an empty string, effectively */ #endif /* removing the printf() statement from the input. */ /* Metacharacters in the input: */ #define BOL '^' /* start-of-line anchor */ #define EOL '$' /* end-of-line anchor */ #define ANY '.' /* matches any character */ #define CCL '[' /* start a character class */ #define CCLEND ']' /* end a character class */ #define NCCL '^' /* negates character class if 1st char. */ #define CLOSURE '*' /* Kleene closure (matches 0 or more) */ #define PCLOSE '+' /* Positive closure (1 or more) */ #define OPT '?' /* Optional closure (0 or 1) */ typedef enum action /* These are put in the pattern string */ { /* to represent metacharacters. */ M_BOL = ( 0x80 | '^' ), M_EOL = ( 0x80 | '$' ), M_ANY = ( 0x80 | '.' ), M_CCL = ( 0x80 | '[' ), M_OPT = ( 0x80 | '?' ), M_CLOSE = ( 0x80 | '*' ), M_PCLOSE = ( 0x80 | '+' ) } action; typedef unsigned char pattern; /* pattern strings are unsigned char */ #define IS_ACTION(x) ((x)&0x80) /* true => element of pat. string is an */ /* action that represents a metacharacter */ #define MAXPAT 128 /* max num. of pattern elements. Remember that */ /* character classes require 17 pattern elements */ /*----------------------------------------------------------------------*/ #define MAPSIZE 16 /* need this many bytes for character class bit map */ /* Advance a pointer into the pattern template */ /* to the next pattern element, this is a +1 for */ /* all pattern elements but M_CCL, where you */ /* to skip past both the M_CCL character and the */ /* bitmap that follows that character */ #define ADVANCE(pat) (pat += (*pat == M_CCL) ? (MAPSIZE+1) : 1) /* Bitmap functions. Set bit b in the map and */ /* test bit b to see if it was set previously */ #define SETBIT(b,map) ((map)[((b) & 0x7f) >>3] |= (1<< ((b) & 0x07)) ) #define TSTBIT(b,map) ((map)[((b) & 0x7f) >>3] & (1<< ((b) & 0x07)) ) /*----------------------------------------------------------------------*/ #define E_NONE 0 /* Possible return values from pat_err.*/ #define E_ILLEGAL 1 /* Set in makepat() to indicate prob- */ #define E_NOMEM 2 /* lems that came up while making the */ #define E_PAT 3 /* pattern template. */ static int Error = E_NONE; /* error flag, like errno */ int pat_err(){ return Error; } /* returns current error status */ /*----------------------------------------------------------------------*/ static pattern * doccl (pattern *map, unsigned char *src); extern pattern * makepat (char *exp); extern char * match (char *look_for, char *in_this_string); extern int pat_err (void ); extern char * matchs (char *str, pattern *pat, int ret_endp); static int omatch (char **strp, pattern *pat, char *start); extern char * patcmp (char *str, pattern *pat, char *start); extern int esc (char **); /*----------------------------------------------------------------------*/ pattern *makepat( exp ) char *exp; /* regular expression */ { /* Make a pattern template from the string pointed to by exp. * Stop when '\0' or '\n' is found in exp. * Return: a pointer to the pattern template on success, NULL * on failure (in which case the pat_err() subroutine * will return one of the following values: * * E_ILLEGAL Illegal input pattern. * E_NOMEM out of memory * E_PAT pattern too long. */ pattern *pat; /* pattern template being assembled */ pattern *cur; /* pointer to current pattern element */ pattern *prev; /* pointer to previous pattern element */ pat = NULL; Error = E_ILLEGAL; if(!*exp || *exp=='\n' ) goto exit; if( *exp==CLOSURE || *exp==PCLOSE || *exp==OPT ) goto exit; Error = E_NOMEM; if( ! (pat = malloc(MAXPAT)) ) /* get pattern buffer */ goto exit; D( memset(pat, 0, MAXPAT); ) /* zero buffer if debugging */ prev = cur = pat; Error = E_PAT; while( *exp && *exp != '\n' ) { if( cur >= &pat[MAXPAT-1] ) { free( pat ); pat = NULL; goto exit; } switch( *exp ) { case ANY: *cur = M_ANY; prev = cur++; ++exp; break; case BOL: *cur = (cur == pat) ? M_BOL : *exp; prev = cur++; ++exp; break; case EOL: *cur = (!exp[1] || exp[1]=='\n') ? M_EOL : *exp; prev = cur++; ++exp; break; case CCL: if( ((cur - pat) + MAPSIZE) >= MAXPAT ) { free( pat ); pat = NULL; goto exit; /* not enough room for bit map */ } prev = cur; *cur++ = M_CCL; exp = doccl( cur, exp ); cur += MAPSIZE ; break; case OPT: case CLOSURE: case PCLOSE: switch( *prev ) { case M_BOL: case M_EOL: case M_OPT: case M_PCLOSE: case M_CLOSE: free( pat ); pat = NULL; goto exit; } memmove( prev+1, prev, cur-prev ); *prev = (*exp == OPT) ? M_OPT : (*exp == PCLOSE) ? M_PCLOSE : M_CLOSE ; ++cur; ++exp; break; default: prev = cur; *cur++ = esc( &exp ); break; } } *cur = '\0'; Error = E_NONE; exit: return( pat ); } /*----------------------------------------------------------------------*/ static pattern *doccl( map, src ) pattern *src, *map; { /* Set bits in the map corresponding to characters specified in * the src character class. */ int first, last, negative; pattern *start = src; ++src; /* skip past the [ */ if( negative = (*src == NCCL) ) /* check for negative ccl */ ++src; start = src; /* start of characters in class */ memset(map, 0, MAPSIZE); /* bitmap initially empty */ while( *src && *src != CCLEND ) { if( *src != '-') { first = esc(&src); /* Use temp. to avoid macro */ SETBIT( first, map ); /* side effects. */ } else if( src == start ) { SETBIT( '-', map ); /* literal dash at start or end */ ++src; } else { ++src; /* skip to end-of-sequence char */ if( *src < src[-2] ) { first = *src; last = src[-2]; } else { first = src[-2]; last = *src; } while( ++first <= last ) SETBIT( first, map ); src++; } } if( *src == CCLEND ) ++src; /* Skip CCLEND */ if( negative ) for( first = MAPSIZE; --first >= 0 ;) *map++ ^= ~0; /* invert all bits */ return src; } /*---------------------------------------------------------------*/ char *match( look_for, in_this_string ) char *look_for; char *in_this_string; { /* Return a pointer to the characters matching look_for * if it's in the string, else return NULL. You can use * the pat_err() subroutine to distinguish a bad regular * expression from a string-not-found condition [NULL is * returned from match() in both cases]. */ pattern *pat; char *rval = NULL; if( pat = makepat(look_for) ) /* make the pattern */ { rval = matchs( in_this_string, pat, 0 ); /* see if it's there */ free( pat ); } return rval; } /*----------------------------------------------------------------------*/ char *matchs( str, pat, ret_endp ) char *str; pattern *pat; int ret_endp; { /* Uses patcmp() to look for a match of pat anywhere in str using * a brute-force algorithm. str is a character string while pat is * a pattern template made by makepat(). Returns: * 1. NULL if no match was found. * 2. A pointer the last character satisfying the match if * ret_endp is true. * 3. A pointer to the beginning of the matched string * if ret_endp is false. */ char *start; char *end = NULL; if( !pat ) /* This lets you do: matchs(str,makepat(...),?);*/ return NULL; /* without horrible consequences if makepat() */ /* fails. */ if( !*str ) { if((*pat == M_EOL) || (*pat == M_BOL && (!pat[1] || pat[1]==M_EOL))) end = str; } else { start = str; /* Do a brute-force substring search, com- */ while( *str ) /* paring a pattern against the input string */ { if( !(end = patcmp(str, pat, start)) ) str++; else /* successful match */ { if( !ret_endp ) end = str ; break; } } } return( end ); } /*----------------------------------------------------------------------*/ char *patcmp( str, pat, start ) /* formerly amatch() */ char *str, *start; pattern *pat; { /* Like strcmp, but compares str against pat. Each element of str * is compared with the template until either a mis-match is * found or the end of the template is reached. In the former * case a 0 is returned; in the latter, a pointer into str * (pointing to the last character in the matched pattern) * is returned. Strstart points at the first character in the * string, which might not be the same thing as line if the * search started in the middle of the string. */ char *bocl, /* beginning of closure string. */ *end; /* return value: end-of-string pointer. */ if( !pat ) /* make sure pattern is valid */ return( NULL ); while( *pat ) { if( *pat == M_OPT ) { /* Zero or one matches. It doesn't matter if omatch * fails---it will advance str past the character on * success, though. Always advance the pattern past * both the M_OPT and the operand. */ omatch( &str, ++pat, start ); ADVANCE(pat); } else if( !(*pat == M_CLOSE || *pat == M_PCLOSE) ) { /* Do a simple match. Note that omatch() fails if there's * still something in pat but we're at end of string. */ if( !omatch( &str, pat, start ) ) return NULL; ADVANCE(pat); } else /* Process a Kleene or positive closure */ { if( *pat++ == M_PCLOSE ) /* one match required */ if( !omatch(&str, pat, start ) ) return NULL; /* Match as many as possible, zero is okay */ bocl = str; while ( *str && omatch(&str, pat, start) ) ; /* 'str' now points to the character that made * made us fail. Try to process the rest of the * string. If the character following the closure * could have been in the closure (as in the pattern * "[a-z]*t") the final 't' will be sucked up in the * while loop. So, if the match fails, back up a notch * and try to match the rest of the string again, * repeating this process recursively until we get back * to the beginning of the closure. The recursion * goes, at most, one levels deep. */ if( *ADVANCE(pat) ) { for(; bocl <= str; --str ) if( end = patcmp(str, pat, start) ) break; return end; } break; } } /* omatch() advances str to point at the next * character to be matched. So str points at the * character following the last character matched * when you reach the end of the template. The * exceptions are templates containing only a * BOLN or EOLN token. In these cases omatch doesn't * advance. Since we must return a pointer to the last * matched character, decrement str to make it point at * the end of the matched string, making sure that the * decrement hasn't gone past the beginning of the string. * * Note that $ is a position, not a character, but in the case * of a pattern ^$, a pointer to the end of line character is * returned. In ^xyz$, a pointer to the z is returned. * * The --str is done outside the return statement because * max() is often a macro that has side-effects. */ --str; return( max(start, str) ); } /*----------------------------------------------------------------------*/ static int omatch( strp, pat, start ) char **strp; pattern *pat; char *start; { /* Match one pattern element, pointed at by pat, against the * character at **strp. Return 0 on a failure, 1 on success. * *strp is advanced to skip over the matched character on a * successful match. Closure is handled one level up by patcmp(). * * "start" points at the character at the left edge of the * line. This might not be the same thing as *strp if the * search is starting in the middle of the string. An end-of- * line anchor matches '\n' or '\0'. */ int advance = -1; /* amount to advance *strp, -1 == error */ switch( *pat ) { case M_BOL: /* First char in string? */ if( *strp == start ) /* Only one star here. */ advance = 0; break; case M_ANY: /* . = anything but newline */ if( **strp != '\n' ) advance = 1; break; case M_EOL: if( **strp == '\n' || **strp == '\0' ) advance = 0; break; case M_CCL: if( TSTBIT( **strp, pat + 1 ) ) advance = 1; break; default: /* literal match */ if( **strp == *pat ) advance = 1; break; } if( advance > 0 ) *strp += advance; return( advance + 1 ); } /*----------------------------------------------------------------------*/ #ifdef MAIN main( int argc, char **argv ) { static pattern *pat; static FILE *inp; static char inp_buf[ 1024 ]; if( argc < 2 || argv[1][0]=='-' ) { fprintf(stderr, "Usage: match reg_exp filename\n"); fprintf(stderr, "Usage: match reg_exp < filename\n"); exit( 1 ); } if( !(pat = makepat( argv[1] )) ) { fprintf(stderr, "Can't make expression template\n"); exit( 1 ); } if( argc == 2 ) inp = stdin; else if( !(inp = fopen(argv[2],"r")) ) { perror( argv[2] ); exit( 1 ); } while( fgets( inp_buf, sizeof(inp_buf), inp ) ) if( matchs(inp_buf, pat, 0) ) fputs( inp_buf, stdout ); exit( 0 ); } #endif /*********** Listing 2 ****************** ESC.C ******************* * Program: ESC.C * Author: Allen I. Holub * Function: A subroutine to parse escape sequences * Compiler: Any. * Memory Models: Any * * (c) 1990 C Gazette. Use freely if authorship acknowledged. *---------------------------------------------------------------------- */ #include <stdio.h> #include <ctype.h> static int hex2bin ( int c ); static int oct2bin ( int c ); /*------------------------------------------------------------*/ #define ISHEXDIGIT(x) (isdigit(x) \ || ('a'<=(x) && (x)<='f') \ || ('A'<=(x) && (x)<='F') ) #define ISOCTDIGIT(x) ('0'<=(x) && (x)<='7') static int hex2bin(c) int c; { /* Convert the hex digit represented by 'c' to an int. 'c' * must be one of: 0123456789abcdefABCDEF */ return (isdigit(c) ? (c)-'0': ((toupper(c))-'A')+10) & 0xf; } static int oct2bin(c) int c; { /* Convert the hex digit represented by 'c' to an int. 'c' * must be a digit in the range '0'-'7'. */ return ( ((c)-'0') & 0x7 ); } /*------------------------------------------------------------*/ int esc(s) char **s; { /* Map escape sequences into their equivalent symbols. Return * the equivalent ASCII character. *s is advanced past the * escape sequence. If no escape sequence is present, the * current character is returned and the string is advanced by * one. The following are recognized: * * \b backspace * \f formfeed * \n newline * \r carriage return * \s space * \t tab * \e ASCII ESC character ('\033') * \DDD number formed of 1-3 octal digits * \xDDD number formed of 1-3 hex digits * \^C C = any letter. Control code */ int rval; if( **s != '\\' ) rval = *( (*s)++ ); else { ++(*s); /* Skip the \ */ switch( toupper(**s) ) { case '\0': rval = '\\'; break; case 'B': rval = '\b' ; break; case 'F': rval = '\f' ; break; case 'N': rval = '\n' ; break; case 'R': rval = '\r' ; break; case 'S': rval = ' ' ; break; case 'T': rval = '\t' ; break; case 'E': rval = '\033'; break; case '^': rval = *++(*s) ; rval = toupper(rval) - '@' ; break; case 'X': rval = 0; ++(*s); if( ISHEXDIGIT(**s) ) { rval = hex2bin( *(*s)++ ); } if( ISHEXDIGIT(**s) ) { rval <<= 4; rval |= hex2bin( *(*s)++ ); } if( ISHEXDIGIT(**s) ) { rval <<= 4; rval |= hex2bin( *(*s)++ ); } --(*s); break; default: if( !ISOCTDIGIT(**s) ) rval = **s; else { ++(*s); rval = oct2bin( *(*s)++ ); if( ISOCTDIGIT(**s) ) { rval <<= 3; rval |= oct2bin( *(*s)++ ); } if( ISOCTDIGIT(**s) ) { rval <<= 3; rval |= oct2bin( *(*s)++ ); } --(*s); } break; } ++(*s); } return rval; }
rsalz@bbn.com (Rich Salz) (02/16/91)
In <1991Feb14.143705.3911@ux1.cso.uiuc.edu> gordon@osiris.cso.uiuc.edu (John Gordon) writes: |/*********** MATCH.C ************************ Listing 1 *********** | * Author: Allen I. Holub | * Function: A group of subroutines to find a substring represented | * by a grep-like regular expression in a second string. |typedef enum action /* These are put in the pattern string */ { | /* to represent metacharacters. */ | M_BOL = ( 0x80 | '^' ), M_EOL = ( 0x80 | '$' ), ... |} |action; | |typedef unsigned char pattern; /* pattern strings are unsigned char */ Note that this code assumes 7bit chars. It won't work with extended alphabets, or even on a PC where you could have graphics characters. /r$ -- Please send comp.sources.unix-related mail to rsalz@uunet.uu.net. Use a domain-based address or give alternate paths, or you may lose out.
oz@yunexus.yorku.ca (Ozan Yigit) (02/19/91)
In <1991Feb14.143705.3911@ux1.cso.uiuc.edu> gordon@osiris.cso.uiuc.edu (John Gordon) writes: > Hi, Netters. Here is source code for a "grep"-like function. Use >it in your own programs. [Holub's verbose implementation of ad-hoc regex omitted] Hi Gordon, Holub's regex stuff is old hat. A PD implementation of all of this + more in a package that is a clone of BSD regex library was posted to this net around 1986. A much more advanced [read proper] and influential implementation of regular expression package by Henry Spencer was also posted around that time, and has been in heavy use ever since. I think you may be much better served by one of these packages in your programs. oz --- We only know ... what we know, and | Internet: oz@nexus.yorku.ca that is very little. -- Dan Rather | UUCP: utzoo/utai!yunexus!oz