[alt.sources] Revised C filename wildcard routine

rsalz@bbn.com (Rich Salz) (01/04/91)

This is a revised version of my pattern-matching routine.  It has been
posted to the net several times, and shows up in several places including
Gilmore's public domain TAR.  You might want to test this, but I did
and it seems solid.
	/r$

#! /bin/sh
# This is a shell archive.  Remove anything before this line, then feed it
# into a shell via "sh file" or similar.  To overwrite existing files,
# type "sh file -c".
# The tool that generated this appeared in the comp.sources.unix newsgroup;
# send mail to comp-sources-unix@uunet.uu.net if you want that tool.
# Contents:  wildmat.c
# Wrapped by rsalz@litchi.bbn.com on Thu Jan  3 13:27:38 1991
PATH=/bin:/usr/bin:/usr/ucb ; export PATH
echo If this archive is complete, you will see the following message:
echo '          "shar: End of archive."'
if test -f 'wildmat.c' -a "${1}" != "-c" ; then 
  echo shar: Will not clobber existing file \"'wildmat.c'\"
else
  echo shar: Extracting \"'wildmat.c'\" \(2677 characters\)
  sed "s/^X//" >'wildmat.c' <<'END_OF_FILE'
X/*
X**  Do shell-style pattern matching for ?, \, [], and * characters.
X**  Might not be robust in face of malformed patterns; e.g., "foo[a-"
X**  could cause a segmentation violation.  It is 8bit clean.
X**
X**  Written by Rich $alz, mirror!rs, Wed Nov 26 19:03:17 EST 1986.
X**  Special thanks to Lars Mathiesen for the ABORT code.  This can greatly
X**  speed up failing wildcard patterns.  For example:
X**	pattern: -*-*-*-*-*-*-12-*-*-*-m-*-*-*
X**	text 1:	 -adobe-courier-bold-o-normal--12-120-75-75-m-70-iso8859-1
X**	text 2:	 -adobe-courier-bold-o-normal--12-120-75-75-p-70-iso8859-1
X**  Text 1 matches with 51 calls, while text 2 fails with 54 calls.  Without
X**  the ABORT, then it takes 22310 calls to fail.  Ugh.
X*/
X
X#define TRUE		1
X#define FALSE		0
X#define ABORT		-1
X
Xstatic int
XStar(s, p)
X    register char	*s;
X    register char	*p;
X{
X    while (wildmat(s, p) == FALSE)
X	if (*++s == '\0')
X	    return ABORT;
X    return TRUE;
X}
X
X
Xstatic int
XDoMatch(s, p)
X    register char	*s;
X    register char	*p;
X{
X    register int 	 last;
X    register int 	 matched;
X    register int 	 reverse;
X
X    for ( ; *p; s++, p++) {
X	if (*s == '\0')
X	    return ABORT;
X	switch (*p) {
X	case '\\':
X	    /* Literal match with following character. */
X	    p++;
X	    /* FALLTHROUGH */
X	default:
X	    if (*s != *p)
X		return FALSE;
X	    continue;
X	case '?':
X	    /* Match anything. */
X	    continue;
X	case '*':
X	    /* Trailing star matches everything. */
X	    return *++p ? Star(s, p) : TRUE;
X	case '[':
X	    /* [^....] means inverse character class. */
X	    if (reverse = p[1] == '^')
X		p++;
X	    for (last = 0400, matched = FALSE; *++p && *p != ']'; last = *p)
X		/* This next line requires a good C compiler. */
X		if (*p == '-' ? *s <= *++p && *s >= last : *s == *p)
X		    matched = TRUE;
X	    if (matched == reverse)
X		return FALSE;
X	    continue;
X	}
X    }
X
X    return *s == '\0';
X}
X
X
Xint
Xwildmat(s, p)
X    char	*s;
X    char	*p;
X{
X    return DoMatch(s, p) == TRUE;
X}
X
X
X
X#ifdef	TEST
X#include <stdio.h>
X
X/* Yes, we use gets not fgets.  Sue me. */
Xextern char	*gets();
X
X
Xmain()
X{
X    char	 pattern[80];
X    char	 text[80];
X
X    printf("Wildmat tester.  Enter pattern, then strings to test.\n");
X    printf("A blank line gets prompts for a new pattern; a blank pattern\n");
X    printf("exits the program.\n\n");
X
X    for ( ; ; ) {
X	printf("Enter pattern:  ");
X	if (gets(pattern) == NULL)
X	    break;
X	for ( ; ; ) {
X	    printf("Enter text:  ");
X	    if (gets(text) == NULL)
X		exit(0);
X	    if (text[0] == '\0')
X		/* Blank line; go back and get a new pattern. */
X		break;
X	    printf("      %s\n", wildmat(text, pattern) ? "YES" : "NO");
X	}
X    }
X
X    exit(0);
X    /* NOTREACHED */
X}
X#endif	/* TEST */
END_OF_FILE
  if test 2677 -ne `wc -c <'wildmat.c'`; then
    echo shar: \"'wildmat.c'\" unpacked with wrong size!
  fi
  # end of 'wildmat.c'
fi
echo shar: End of archive.
exit 0
-- 
Please send comp.sources.unix-related mail to rsalz@uunet.uu.net.
Use a domain-based address or give alternate paths, or you may lose out.

thorinn@diku.dk (Lars Henrik Mathiesen) (01/05/91)

rsalz@bbn.com (Rich Salz) writes:
>This is a revised version of my pattern-matching routine.  It has been
>posted to the net several times, and shows up in several places including
>Gilmore's public domain TAR.  You might want to test this, but I did
>and it seems solid.
>	/r$

>X**  Special thanks to Lars Mathiesen for the ABORT code.  This can greatly
>X**  speed up failing wildcard patterns.  For example:

>Xstatic int
>XStar(s, p)
>X    register char	*s;
>X    register char	*p;
>X{
>X    while (wildmat(s, p) == FALSE)
>X	if (*++s == '\0')
>X	    return ABORT;
>X    return TRUE;
>X}

I just thought I'd mention that in the patch I sent to Rich, the body
of the Star routine read like this:

{
    register int ret;

    while ((ret = DoMatch(s, p)) == FALSE)
	if (*++s == '\0')
	    return ABORT;
    return ret;
}

(Except that I used wildmatch for DoMatch.) Rich's version gives the
correct results, but the recursion goes through the wildmat function
maps all returns of ABORT to FALSE --- so the time complexity is still
the same as for traditional versions, with double the function call
overhead. (The whole point of the ABORT return code is that it is
supposed to propagate through the Star function, aborting the loop.)

I have just noted that the test on *++s in Star is repeated (with the
same result) at the top of the loop in DoMatch, so I suggest that it
should be eliminated as in the following version of Star:

static int
Star(s, p)
    register char       *s;
    register char       *p;
{
    register int ret;

    do
	ret = DoMatch(s++, p);
    while (ret == FALSE);
    return ret;
}

It may cause one more call of DoMatch in the failing case, but I think
it is clearer.

--
Lars Mathiesen, DIKU, U of Copenhagen, Denmark      [uunet!]mcsun!diku!thorinn
Institute of Datalogy -- we're scientists, not engineers.      thorinn@diku.dk

rsalz@bbn.com (Rich Salz) (01/05/91)

In <3154@litchi.bbn.com> rsalz@bbn.com (Rich Salz) writes:
>This is a revised version of my pattern-matching routine.  It has been
>posted to the net several times, and shows up in several places including
>Gilmore's public domain TAR.  You might want to test this, but I did
>and it seems solid.
>	/r$

Well, I didn't test it enough.  Thanks to David Polansky <dp@meson.cray.com>
for showing that "*test* fails to match "atest"

Use the old version instead:
#! /bin/sh
# This is a shell archive.  Remove anything before this line, then feed it
# into a shell via "sh file" or similar.  To overwrite existing files,
# type "sh file -c".
# The tool that generated this appeared in the comp.sources.unix newsgroup;
# send mail to comp-sources-unix@uunet.uu.net if you want that tool.
# Contents:  wildmat.c
# Wrapped by rsalz@litchi.bbn.com on Fri Jan  4 16:27:29 1991
PATH=/bin:/usr/bin:/usr/ucb ; export PATH
echo If this archive is complete, you will see the following message:
echo '          "shar: End of archive."'
if test -f 'wildmat.c' -a "${1}" != "-c" ; then 
  echo shar: Will not clobber existing file \"'wildmat.c'\"
else
  echo shar: Extracting \"'wildmat.c'\" \(2157 characters\)
  sed "s/^X//" >'wildmat.c' <<'END_OF_FILE'
X/*
X**  Do shell-style pattern matching for ?, \, [], and * characters.
X**  Might not be robust in face of malformed patterns; e.g., "foo[a-"
X**  could cause a segmentation violation.  I think it's 8bit clean.
X**
X**  Written by Rich $alz, mirror!rs, Wed Nov 26 19:03:17 EST 1986.
X*/
X
X#define TRUE		1
X#define FALSE		0
X
X
Xstatic int
XStar(text, p)
X    register char	*text;
X    register char	*p;
X{
X    while (wildmat(text, p) == FALSE)
X	if (*++text == '\0')
X	    return FALSE;
X    return TRUE;
X}
X
X
Xint
Xwildmat(text, p)
X    register char	*text;
X    register char	*p;
X{
X    register int	last;
X    register int	matched;
X    register int	reverse;
X
X    for ( ; *p; text++, p++)
X	switch (*p) {
X	case '\\':
X	    /* Literal match with following character. */
X	    p++;
X	    /* FALLTHROUGH */
X	default:
X	    if (*text != *p)
X		return FALSE;
X	    continue;
X	case '?':
X	    /* Match anything. */
X	    if (*text == '\0')
X		return FALSE;
X	    continue;
X	case '*':
X	    /* Trailing star matches everything. */
X	    return *++p ? Star(text, p) : TRUE;
X	case '[':
X	    /* [^....] means inverse character class. */
X	    if (reverse = p[1] == '^')
X		p++;
X	    for (last = 0400, matched = FALSE; *++p && *p != ']'; last = *p)
X		/* This next line requires a good C compiler. */
X		if (*p == '-' ? *text <= *++p && *text >= last : *text == *p)
X		    matched = TRUE;
X	    if (matched == reverse)
X		return FALSE;
X	    continue;
X	}
X
X    return *text == '\0';
X}
X
X
X#ifdef	TEST
X#include <stdio.h>
X
X/* Yes, we use gets not fgets.  Sue me. */
Xextern char	*gets();
X
X
Xmain()
X{
X    char	pattern[80];
X    char	text[80];
X
X    printf("Wildmat tester.  Enter pattern, then strings to test.\n");
X    printf("A blank line gets prompts for a new pattern; a blank pattern\n");
X    printf("exits the program.\n\n");
X
X    for ( ; ; ) {
X	printf("Enter pattern:  ");
X	if (gets(pattern) == NULL)
X	    break;
X	for ( ; ; ) {
X	    printf("Enter text:  ");
X	    if (gets(text) == NULL)
X		exit(0);
X	    if (text[0] == '\0')
X		/* Blank line; go back and get a new pattern. */
X		break;
X	    printf("      %s\n", wildmat(text, pattern) ? "YES" : "NO");
X	}
X    }
X
X    exit(0);
X    /* NOTREACHED */
X}
X#endif	/* TEST */
END_OF_FILE
  if test 2157 -ne `wc -c <'wildmat.c'`; then
    echo shar: \"'wildmat.c'\" unpacked with wrong size!
  fi
  # end of 'wildmat.c'
fi
echo shar: End of archive.
exit 0
-- 
Please send comp.sources.unix-related mail to rsalz@uunet.uu.net.
Use a domain-based address or give alternate paths, or you may lose out.