[net.lang.c] Pattern recognition in C

josh@unm-cvax.UUCP (01/15/85)

Does anybody have a routines written in C that can be used for
 something such as finding which string in a list looks the most
 like the control string.  It would also be nice if the routine
 could handle wild card characters.

If you don't have a program, helpful ideas would also be appreciated.

              --thanks much

          Josh Siegel

 {convex,ucbvax,gatech,csu-cs,anl-mcs,lanl-a}!unmvax!unm-cvax!josh

"Gary S. Moss (AMXBR-VLD-V)" <moss@Brl-Vld.ARPA> (01/17/85)

#!/bin/sh
#	Josh -
#	I wrote a boolean function which mimics UNIX shell file name
#	expansion in the attempt to match its first argument with its
#	second.  Matchtest.c is a test driver for it.
#
#
# Self-unpacking archive format.  To unbundle, sh this file.
echo 'match.c' 1>&2
cat >'match.c' <<'END OF match.c'
/*
	SCCS id:	@(#) match.c	1.3
	Last edit: 	1/16/85 at 21:18:57
	Retrieved: 	1/16/85 at 21:19:58
	SCCS archive:	/vld/moss/work/libWIND/s.match.c

	Author:		Gary S. Moss
			U. S. Army Ballistic Research Laboratory
			Aberdeen Proving Ground
			Maryland 21005-5066
			(301)278-6647 or AV-283-6647
 */
static
char	sccsTag[] = "@(#) match.c	1.3	last edit 1/16/85 at 21:18:57";
#include <stdio.h>
#include <string.h>
#include "./ascii.h"
extern void	prnt1Err();

/*	 m a t c h ( )
	if string matches pattern, return 1, else return 0
	special characters:
		*	Matches any string including the null string.
		?	Matches any single character.
		[...]	Matches any one of the characters enclosed.
		[!..]	Matchea any character NOT enclosed.
		-	May be used inside brackets to specify range
			(i.e. str[1-58] matches str1, str2, ... str5, str8)
		\	Escapes special characters.
 */
match(	 pattern,  string )
register
char	*pattern, *string;
	{
	do
		{
		switch( pattern[0] )
		{
		case '*': /* Match any string including null string.	*/
			if( pattern[1] == NUL || string[0] == NUL )
				return	1;
			while( string[0] != NUL )
				{
				if( match( &pattern[1], string ) )
					return	1;
				++string;
				}
			return	0;
		case '?': /* Match any character.			*/
			break;
		case '[': /* Match one of the characters in brackets
				unless first is a '!', then match
				any character not inside brackets.
			   */
			{ register char	*rgtBracket;
			  static int	negation;

			++pattern; /* Skip over left bracket.		*/
			/* Find matching right bracket.			*/
			if( (rgtBracket = strchr( pattern, ']' )) == NULL )
				{
				prnt1Err( "Unmatched '['." );
				return	0;
				}
			/* Check for negation operator.			*/
			if( pattern[0] == '!' )
				{
				++pattern;
				negation = 1;
				}
			else	{
				negation = 0;
				}	
			/* Traverse pattern inside brackets.		*/
			for(	;
				pattern < rgtBracket
			     &&	pattern[0] != string[0];
				++pattern
				)
				{
				if(	pattern[ 0] == '-'
				    &&	pattern[-1] != '\\'
					)
					{
					if(	pattern[-1] <= string[0]
					    &&	pattern[-1] != '['
					    &&	pattern[ 1] >= string[0]
					    &&	pattern[ 1] != ']'
					)
						break;
					}
				}
			if( pattern == rgtBracket )
				{
				if( ! negation )
					{
					return	0;
					}
				}
			else
				{
				if( negation )
					{
					return	0;
					}
				}
			pattern = rgtBracket; /* Skip to right bracket.	*/
			break;
			}
		case '\\': /* Escape special character.			*/
			++pattern;
			/* WARNING: falls through to default case.	*/
		default:  /* Compare characters.			*/
			if( pattern[0] != string[0] )
				return	0;
		}
		++pattern;
		++string;
		}
	while( pattern[0] != NUL && string[0]  != NUL );
	if( (pattern[0] == NUL || pattern[0] == '*' ) && string[0]  == NUL )
		{
		return	1;
		}
	else
		{
		return	0;
		}
	}
END OF match.c
echo 'matchtest.c' 1>&2
cat >'matchtest.c' <<'END OF matchtest.c'
/*
	SCCS id:	@(#) matchtest.c	1.2
	Last edit: 	1/16/85 at 21:19:12
	Retrieved: 	1/16/85 at 21:20:06
	SCCS archive:	/vld/moss/work/libWIND/s.matchtest.c

	Author:		Gary S. Moss
			U. S. Army Ballistic Research Laboratory
			Aberdeen Proving Ground
			Maryland 21005-5066
			(301)278-6647 or AV-283-6647
 */
#if ! defined( lint )
static
char	sccsTag[] = "@(#) matchtest.c	1.2	last edit 1/16/85 at 21:19:12";
#endif
#include <stdio.h>
extern int	match();
char	*usage[] = {
"",
"matchtest(1.2)",
"",
"Usage: matchtest [pattern string]",
"",
"If no arguments are given, the program reads words on its standard input.",
"The program writes to its standard output.",
0
};
void		prntUsage(), prnt1Err();
static char	*pattern, *string;
static char	patbuf[BUFSIZ], strbuf[BUFSIZ];
/*	m a i n ( )
 */
main( argc, argv )
char	*argv[];
	{
	if( ! parsArgv( argc, argv ) )
		{
		prntUsage();
		exit( 1 );
		}
	if( pattern != NULL )
		{
		if( match( pattern, string ) )
			{
			(void) printf( "'%s' matches '%s'.\n", pattern, string );
			exit( 0 );
			}
		else
			{
			(void) printf(	"'%s' does not match '%s'.\n",
					pattern,
					string
					);
			exit( 1 );
			}
		}
	while( scanf( "%s %s", patbuf, strbuf ) == 2 )
		{
		if( match( patbuf, strbuf ) )
			{
			(void) printf( "'%s' matches '%s'.\n", patbuf, strbuf );
			}
		else
			{
			(void) printf(	"'%s' does not match '%s'.\n",
					patbuf,
					strbuf
					);
			}
		}		
	exit( 0 );
	}


/*	p a r s A r g v ( )
 */
parsArgv( argc, argv )
register char	**argv;
	{
	register int	c;
	extern int	optind;
	extern char	*optarg;

	/* Parse options.					*/
	while( (c = getopt( argc, argv, "" )) != EOF )
		{
		switch( c )
			{
			case '?' :
				return	0;
			}
		}
	if( argc - optind != 2 )
		{
		if( argc == optind )
			{
			pattern = string = NULL;
			} 
		else 
			{
			(void) fprintf( stderr, "Arg count wrong!\n" );
			return	0;
			}
		} 
	else
		{
		pattern = argv[optind++];
		string = argv[optind++];
		}
	return	1;
	}

/*	p r n t U s a g e ( )
	Print usage message.
 */
void
prntUsage()
	{
	register char	**p = usage;
	while( *p )
		(void) fprintf( stderr, "%s\n", *p++ );
	return;
	}

END OF matchtest.c

fnf@unisoft.UUCP (Fred Fish) (01/21/85)

[--------------]

A recent posting to net.lang.c of a "shell style" pattern matching
routine by Gary Moss appeared to be incomplete so I decided to post
an alternate.

This posting also contains my version (nmatch.c) and Gary's two
routines (match.c and matchtest.c) for the benefit of those
who might have missed the original posting since it wasn't
in net.sources.  By the way, it appears that the usage message
for matchtest is wrong.  It apparently wants the arguments
[string pattern] rather than [pattern string].

I'm sure there are many ways to do this, and probably some that
are far more compact than my solution.  I would love to see some
alternate versions (that I can understand that is :-).

[----  see net.sources for actual posting  ----]

friesen@psivax.UUCP (Stanley Friesen) (01/22/85)

	And another solution, if you are using BSD 4.1(and probably 4.2)
there are a pair of library routines, re_comp & re_exec, in libc.a
which implement ex style regular expressions.  To find out how to
use them read the manual page titled "regex" in section 3.
-- 

				Sarima (Stanley Friesen)

{trwrb|allegra|burdvax|cbosgd|hplabs|ihnp4|sdcsvax}!sdcrdcf!psivax!friesen

"Gary S. Moss (AMXBR-VLD-V)" <moss@Brl-Vld.ARPA> (01/23/85)

~ A recent posting to net.lang.c of a "shell style" pattern matching
~ routine by Gary Moss appeared to be incomplete so I decided to post
~ an alternate.

Now Fred, this makes me nervous, the posted routine was written for
a table-of-contents lookup routine, and complete functionality a la
UNIX shell, was not critical, however I am interested in what I left
out since I am likely to find other uses for it.

~ ... By the way, it appears that the usage message
~ for matchtest is wrong.  It apparently wants the arguments
~ [string pattern] rather than [pattern string].
Sorry Fred, it works as advertised, match [pattern string], if you leave
out the arguments, it reads pattern-string pairs on stdin until EOF.
YOUR match() function switches the arguments, i.e.,
	BOOLEAN nmatch( string, pattern ) {...}
while mine is
	int match( pattern, string ) {...}
So, if you substituted your function for mine ... :-)
Thanks for posting my sources, I should have done that originally.
__Moss__

robert@gitpyr.UUCP (Robert Viduya) (01/25/85)

> 
> 	And another solution, if you are using BSD 4.1(and probably 4.2)
> there are a pair of library routines, re_comp & re_exec, in libc.a
> which implement ex style regular expressions.  To find out how to
> use them read the manual page titled "regex" in section 3.
> -- 
> 

Regex(3) is also in System V release 1.  I don't know for sure, but I
think it's safe to assume it's also part of Sys V R2.0.  Anyone know
for sure?

				robert
-- 
Robert Viduya
    Office of Computing Services
    Georgia Institute of Technology, Atlanta GA 30332
    Phone:  (404) 894-4669

...!{akgua,allegra,amd,hplabs,ihnp4,masscomp,ut-ngp}!gatech!gitpyr!robert
...!{rlgvax,sb1,uf-cgrl,unmvax,ut-sally}!gatech!gitpyr!robert