[comp.sources.misc] mytags - awk script to complement ctags.

tcjones@watdragon.waterloo.edu (Crocodile Dundee) (11/09/87)
Here is an awk script that complements ctags. It produces a tags file that
has tags for all #defines and also for the first occurrence of (well, almost)
all identifier names.

It is a little simple-minded about the way it collects identifier names; the
cases that are not handled are documented in the code.



Steve Hayman & Terry Jones

-------------------------------------------------------------------------------
             Department Of Computer Science, University Of Waterloo
			 Waterloo Ontario Canada N2L 3G1

{ihnp4,allegra,decvax,utzoo,utcsri,clyde}!watmath!watdragon!tcjones
tcjones@dragon.waterloo.{cdn,edu} tcjones@WATER.bitnet
tcjones%watdragon@waterloo.csnet [from oz, tcjones@dragon.waterloo.cdn@munnari]
-------------------------------------------------------------------------------




#!/bin/sh
# This is a shell archive.  Remove anything before this line,
# then unpack it by saving it in a file and typing "sh file".
#
# Wrapped by watdragon!tcjones on Sat Nov  7 04:39:02 EST 1987
# Contents:  mytags
 
# Announce the file being extracted, then write it out.
# The here-document delimiter is quoted, so the text that follows is copied
# without any shell expansion; sed removes one leading "@" from any line
# that starts with one.
echo x - mytags
sed 's/^@//' > "mytags" <<'@//E*O*F mytags//'
#!/bin/sh
PATH=/bin:/usr/ucb:/usr/bin
#
#   usage: mytags [source-files]
#   Enhanced version of ctags.
#   Merge standard "ctags" and create extra tags from #define statements
#   and declarations. 
#
#   Declaration cases not handled:
#   ==============================
#
#
#       - Repeated identifier names. 
#         ==========================
#           Only the first instance will be tagged.
#           Be careful about ^]'ing to tags that are in functions... you may
#           not get what you want. Worse, you might get put into another file
#           without getting what you want. You can always get back with ^^
#
#
#       - Lines of declarations that are continued with a comma e.g.
#         ==========================================================
#               int fred, harry, joe,
#                   mike, dick;
#           Will not try to produce tags for mike or dick.
#
#
#       - Declaration lines that do not start with a type name e.g.
#         =========================================================
#               /* silly comment in the way */   int fred;
#           Will not tag fred.
#
#
#       - Declarations in comment blocks will be tagged e.g.
#         ==================================================
#               /* start of comment
#                   int fred;
#                   int harry;
#               end of comment */
#           Will produce tags for fred and harry (if they don't already exist).
#
#   
#   Run ctags, create extra tags, sort.
#
#   Note that vi searches in NOMAGIC mode, meaning
#   only ^ and $ have any effect.  Thus we have
#   to escape these, and /\, but nothing else.
#   (Note also that due to a bug in vi you get left in
#    nomagic mode if the pattern isn't found)
#
#   Steve Hayman (MFCF)
#   Terry Jones  (F.U.N. Corporation)       18/10/87
#
#-------------------------------------------------------------------------------
#            Department Of Computer Science, University Of Waterloo
#	   	     Waterloo Ontario Canada N2L 3G1
#
#{ihnp4,allegra,decvax,utzoo,utcsri,clyde}!watmath!watdragon!tcjones
#tcjones@dragon.waterloo.{cdn,edu} tcjones@WATER.bitnet
#tcjones%watdragon@waterloo.csnet [from oz, tcjones@dragon.waterloo.cdn@munnari]
#-------------------------------------------------------------------------------
#
#

#
# At least one source file must be named.  The usage message is a
# diagnostic, so it goes to stderr; $0 is quoted so that a script path
# containing blanks still yields the right name.
#
if [ $# -eq 0 ]
then
    echo "usage: $(basename "$0") files" >&2
    exit 1
fi


#
# Make the standard tags file with ctags.
# "$@" hands every file name over exactly as it was given, so names
# containing blanks or glob characters are not split or expanded.
#

ctags -w -t "$@"


#
# Do the additional tags
#

awk '

    #
    # Lookup tables, built once before any input is read.
    #
    #   keywd  - words that may start a declaration (types, storage classes).
    #   follow - characters that end the interesting part of a declaration.
    #
    # Tags are collected in three parallel arrays indexed 1..total_tokens
    # (tags, files, patterns) and written out by the END clause.
    #

    BEGIN {
        keywd["char"] = 1
        keywd["int"] = 1
        keywd["long"] = 1
        keywd["double"] = 1
        keywd["float"] = 1
        keywd["short"] = 1
        keywd["register"] = 1
        keywd["static"] = 1
        keywd["void"] = 1
        keywd["unsigned"] = 1

        follow["["] = 1
        follow["="] = 1
        follow[";"] = 1
    }

    #
    # The #define grabber.
    #
    # The gap between "#" and "define" may hold spaces or tabs.
    #

    /^#[ \t]*define/ {

        #
        # "# define NAME" splits into "#", "define", "NAME" whereas
        # "#define NAME" splits into "#define", "NAME".
        #

        token = ($1 == "#") ? $3 : $2

        #
        # Careful with macro functions: keep only the name before the "(".
        #

        if ( (paren = index(token, "(")) > 0 )
            token = substr(token, 1, paren - 1)

        #
        # A bare "#define" with no name gives us nothing to tag.
        #

        if ( token == "" )
            next

        #
        # Set up these tags for later output (see END clause).
        #

        total_tokens++
        patterns[total_tokens] = $0
        files[total_tokens] = FILENAME
        tags[total_tokens] = token

        next
    }

    #
    # The declaration grabber.
    #
    # The first field must be a type name and there must be something
    # after it. The "in" operator is used for the lookups so that testing
    # a word never adds it to the keywd table.
    #

    NF > 1 && ($1 in keywd) {

        #
        # If the last field is a keyword then we must have something like
        #
        #   unsigned int
        #   silly()
        #
        # And so we should just continue to the next line.
        #

        if ( $NF in keywd )
            next

        #
        # Find the first word on the line that is not in the keywd table.
        # This must (famous last words) be the identifier we want.
        # The last field is known not to be a keyword, so the search
        # always stops at or before field NF.
        #

        for ( first_id = 2; first_id < NF; first_id++ )
            if ( !($first_id in keywd) )
                break

        #
        # Get the tail of the line, starting from the first identifier.
        #
        # The leading keyword fields are peeled off one at a time. Searching
        # the whole line for the text of the identifier is not safe: with
        #
        #   int n = 0;
        #
        # the first "n" on the line is the one inside "int".
        #

        line = $0
        for ( k = 1; k < first_id; k++ ) {
            sub(/^[ \t]+/, "", line)
            line = substr(line, length($k) + 1)
        }
        sub(/^[ \t]+/, "", line)

        #
        # Cut the line at the first ; or = or [ if present.
        #
        # *Dont* stop after the first one that is found: the order in which
        # the loop visits the follow table is not defined, and cutting at
        # every one that is present leaves the text before the earliest.
        #
        # We do this here since we want a line such as
        #
        # char *fred="this is fred" /* comment about fred the char* */
        #
        # to be cut off at the "=" instead of processing each of the fields
        # *fred="this, is, fred", /*, comment, about, fred, the, char* and */
        # to see if they are identifiers. This way we process only "*fred".
        #
        # (Dont take "," out at this stage, since we are going to split on ",")
        #

        for ( ch in follow )
            if ( (cut = index(line, ch)) > 0 )
                line = substr(line, 1, cut - 1)

        #
        # Split the line that remains on commas.
        #
        # split empties its target array first, so the pieces go into
        # identifiers while the names already tagged live in a separate
        # array (seen) that survives from one line to the next.
        #

        total_ids = split(line, identifiers, ",")

        #
        # Process each of the identifiers.
        #

        for ( n = 1; n <= total_ids; n++ ) {

            token = identifiers[n]

            #
            # If there is a "(" present then this must be a function name
            # as in
            #
            # int silly()
            #
            # so we just continue.
            #

            if ( index(token, "(") )
                continue

            #
            # Strip leading white space and * characters, and trailing
            # white space. Skip the piece if nothing is left.
            #

            sub(/^[ \t*]+/, "", token)
            sub(/[ \t]+$/, "", token)

            if ( token == "" )
                continue

            #
            # Otherwise lets assume we have an identifier.
            # Only the first occurrence of a name is tagged; later ones
            # are thrown away.
            #
            # (one alternative would be to prepend the function name (if there
            # is one) to the identifier name). But this is messy and probably
            # would never get used anyway.
            #

            if ( token in seen )
                continue

            seen[token] = 1

            #
            # And finally set up the arrays for later use.
            #

            total_tokens++
            patterns[total_tokens] = $0
            tags[total_tokens] = token
            files[total_tokens] = FILENAME
        }
    }


    #
    # Finally, process all of the tags that were collected.
    #
    # The search pattern is the entire line. Print a line that looks like
    #
    # token <tab> filename <tab> /^<appropriately-escaped-pattern>$/
    #
    # Only ^ $ / and \ are escaped (each gets a \ in front of it).
    #

    END {
        for ( t = 1; t <= total_tokens; t++ ) {

            pattern = patterns[t]

            printf "%s\t%s\t/^", tags[t], files[t]

            len = length(pattern)
            for ( i = 1; i <= len; i++ ) {
                c = substr(pattern, i, 1)
                if ( index("^$/\\", c) )
                    printf "\\"
                printf "%s", c
            }

            printf "$/\n"
        }
    }
#
# Send all of this into sort, merging the tags we created with ctags.
# The file names are passed as "$@" so that each one reaches awk intact.
#
' "$@" | sort -u -o tags - tags
@//E*O*F mytags//
# Make the extracted script executable: owner gets rwx, group and others r-x.
chmod u=rwx,g=rx,o=rx mytags
 
# End of the archive: stop here with a success status.
exit 0