[comp.lang.icon] in situ filename truncator for tar files

goer@sophist.uchicago.EDU (Richard Goerwitz) (02/28/90)

Having received a number of requests for this thing from a number
of places, it seemed prudent to clean it up, comment it, fix the
bugs, and repost.

I had no idea that anyone would really _use_ it.


#-------------------------------------------------------------------
#
#	MAPTARFILE 
#	
#	Map 15+ char. filenames in a tar archive to 14 chars.
#	Handles both the header blocks and the source itself.
#	Obviates the need for renaming files and directories
#	by hand, and for altering source and docs to refer to
#	the new file and directory names.
#
#	Richard L. Goerwitz, III
#
#	Last modified 2/27/90
#
#-------------------------------------------------------------------


global filenametbl, chunkset, short_chunkset   # see procedure mappiece(s)
record hblock(name,junk,size,mtime,chksum,
              linkflag,linkname,therest)       # see readtarhdr(s)


procedure main(a)

    usage := "usage:  maptarfile inputfile      # output goes to stdout"
    0 < *a < 2 | stop("Bad arg count.\n",usage)
    intext := open(a[1],"r") |
	stop("maptarfile:  can't open ",a[1])

    # Run through all the headers in the input file, filling
    # (global) filenametbl with the names of overlong files;
    # make_table_of_filenames fails if there are no such files.
    make_table_of_filenames(intext) |
	stop("maptarfile:  no overlong path names to map")
  
    # Now that a table of overlong filenames exists, go back
    # through the text, remapping all occurrences of these names
    # to new, 14-char values; also, reset header checksums, and
    # reformat text into correctly padded 512-byte blocks.  Ter-
    # minate output with 512 nulls.
    seek(intext,1)
    every writes(output_mapped_headers_and_texts(intext))

    close(intext)
    write_report()   # Record mapped file and dir names for future ref.
    exit(0)
    
end



procedure make_table_of_filenames(intext)

    local header # chunkset is global

    # search headers for overlong filenames; for now
    # ignore everything else
    while header := readtarhdr(reads(intext,512)) do {
	# tab upto the next header block
	tab_nxt_hdr(intext,trim_str(header.size),1)
	# record overlong filenames in several global tables, sets
	fixpath(trim_str(header.name))
    }
    *chunkset = 0 & fail
    return &null

end



procedure output_mapped_headers_and_texts(intext)

    # Remember that filenametbl, chunkset, and short_chunkset
    # (which are used by various procedures below) are global.
    local header, newtext, full_block, block, lastblock

    # Read in headers, one at a time.
    while header := readtarhdr(reads(intext,512)) do {

	# Replace overlong filenames with shorter ones, according to
	# the conversions specified in the global hash table filenametbl
	# (which were generated by fixpath() on the first pass).
      	header.name := left(map_filenams(header.name),100,"\x00")
	header.linkname := left(map_filenams(header.linkname),100,"\x00")

	# Use header.size field to determine the size of the subsequent text.
	# Read in the text as one string.  Map overlong filenames found in it
 	# to shorter names as specified in the global hash table filenamtbl.
	newtext := map_filenams(tab_nxt_hdr(intext,trim_str(header.size)))

	# Now, find the length of newtext, and insert it into the size field.
	header.size := right(exbase10(*newtext,8) || " ",12," ")

	# Calculate the checksum of the newly retouched header.
	header.chksum := right(exbase10(get_checksum(header),8)||"\x00 ",8," ")

	# Finally, join all the header fields into a new block and write it out
	full_block := ""; every full_block ||:= !header
	suspend left(full_block,512,"\x00")

	# Now we're ready to write out the text, padding the final block
	# out to an even 512 bytes if necessary; the next header must start
	# right at the beginning of a 512-byte block.
	newtext ? {
	    while block := move(512)
	    do suspend block
	    pos(0) & next
            lastblock := left(tab(0),512,"\x00")
	    suspend lastblock
	}
    }
    # Write out a final null-filled block.  Some tar programs will write
    # out 1024 nulls at the end.  Dunno why.
    return repl("\x00",512)

end



procedure trim_str(s)

    # Knock out spaces, nulls from those crazy tar header
    # block fields (some of which end in a space and a null,
    # some just a space, and some just a null [anyone know
    # why?]).
    return s ? {
	(tab(many(' ')) | &null) &
	    trim(tab(find("\x00")|0))
    } \ 1

end 



procedure tab_nxt_hdr(f,size_str,firstpass)

    # Tab upto the next header block.  Return the bypassed text
    # as a string (this value is not always used).

    local hs, next_header_offset

    hs := integer("8r" || size_str)
    next_header_offset := (hs / 512) * 512
    hs % 512 ~= 0 & next_header_offset +:= 512
    if 0 = next_header_offset then return ""
    else {
	# if this is pass no. 1 don't bother returning a value; we're
	# just collecting long filenames;
	if \firstpass then {
	    seek(f,where(f)+next_header_offset)
	    return
	}
	else {
	    return reads(f,next_header_offset)[1:hs+1] |
		stop("maptarfile:  error reading in ",
		     string(next_header_offset)," bytes.")
	}
    }

end



procedure fixpath(s)

    # Fixpath is a misnomer of sorts, since it is used on
    # the first pass only, and merely examines each filename
    # in a path, using the procedure mappiece to record any
    # overlong ones in the global table filenametbl and in
    # the global sets chunkset and short_chunkset; no fixing
    # is actually done here.

    s2 := ""
    s ? {
	while piece := tab(find("/")+1)
	do s2 ||:= mappiece(piece) 
	s2 ||:= mappiece(tab(0))
    }
    return s2

end



procedure mappiece(s)

    # Check s (the name of a file or dir as recorded in the tar header
    # being examined) to see if it is over 14 chars long.  If so,
    # generate a unique 14-char version of the name, and store
    # both values in the global hashtable filenametbl.  Also store
    # the original (overlong) file name in chunkset.  Store the
    # first fifteen chars of the original file name in short_chunkset.
    # Sorry about all of the tables and sets.  It actually makes for
    # a reasonably efficient program.  Doing away with both sets,
    # while possible, causes a tenfold drop in execution speed!
    
    # global filenametbl, chunkset, short_chunkset
    local j, ending

    initial {
	filenametbl := table()
	chunkset := set()
	short_chunkset := set()
    }
    
    chunk := trim(s,'/')
    if *chunk > 14 then {
	i := 0
	repeat {
	# if the file has already been looked at, continue
	    if \filenametbl[chunk] then next
	# else find a new unique 14-character name for it
	# preserve important suffixes like ".Z," ".c," etc.
	    if chunk ?
	       (tab(find(".")), ending := move(1) || tab(any(&ascii)), pos(0))
	    then lchunk := chunk[1:11] || right(string(i+:=1),2,"0") || ending
	    else lchunk := chunk[1:12] || right(string(i+:=1),3,"0")
	    if lchunk == !filenametbl
	    then next else break
	}
	# record filename in various global sets and tables
	filenametbl[chunk] := lchunk
	insert(chunkset,chunk)
	insert(short_chunkset,chunk[1:16])
    }
    else lchunk := chunk

    lchunk ||:= (s[-1] == "/")
    return lchunk

end



procedure readtarhdr(s)

    # Read the silly tar header into a record.  Note that, as was
    # complained about above, some of the fields end in a null, some
    # in a space, and some in a space and a null.  The procedure
    # trim_str() may (and in fact often _is_) used to remove this
    # extra garbage.

    this_block := hblock()
    s ? {
	this_block.name     := move(100)    # <- to be looked at later
	this_block.junk     := move(8+8+8)  # skip the permissions, uid, etc.
	this_block.size     := move(12)     # <- to be looked at later
	this_block.mtime    := move(12)
	this_block.chksum   := move(8)      # <- to be looked at later
	this_block.linkflag := move(1)
	this_block.linkname := move(100)    # <- to be looked at later
	this_block.therest  := tab(0)
    }
    integer(this_block.size) | fail  # If it's not an integer, we've hit
                                     # the final (null-filled) block.
    return this_block

end



procedure map_filenams(s)

    # Chunkset is global, and contains all the overlong filenames
    # found in the first pass through the input file; here the aim
    # is to map these filenames to the shortened variants as stored
    # in filenametbl (GLOBAL).

    local s2

    s2 := ""
    s ? {
	until pos(0) do {
	    # first narrow the possibilities, using short_chunkset
	    if member(short_chunkset,&subject[&pos:&pos+15])
            # then try to map from a long to a shorter 14-char filename
	    then s2 ||:= (filenametbl[=!chunkset] | move(1))
	    else s2 ||:= move(1)
	}
    }
    return s2

end


#  From the IPL.  Thanks, Ralph -
#  Author:  Ralph E. Griswold
#  Date:  June 10, 1988
#  exbase10(i,j) convert base-10 integer i to base j
#  The maximum base allowed is 36.

procedure exbase10(i,j)

   static digits
   local s, d, sign
   initial digits := &digits || &lcase
   if i = 0 then return 0
   if i < 0 then {
      sign := "-"
      i := -i
      }
   else sign := ""
   s := ""
   while i > 0 do {
      d := i % j
      if d > 9 then d := digits[d + 1]
      s := d || s
      i /:= j
      }
   return sign || s

end

# end IPL material


procedure get_checksum(r)
 
    # Calculates the new value of the checksum field for the
    # current header block.  Note that the specification say
    # that, when calculating this value, the chksum field must
    # be blank-filled.

    sum := 0
    r.chksum := "        "
    every field := !r
    do every sum +:= ord(!field)
    return sum

end



procedure write_report()

    # This procedure writes out a list of filenames which were
    # remapped (because they exceeded the SysV 14-char limit),
    # and then notifies the user of the existence of this file.

    local outtext, stbl, i

    (outtext := open(fname := "mapping.report","w")) |
	open(fname := "/tmp/mapping.report","w") |
	     stop("maptarfile:  Can't find a place to put mapping.report!")
    stbl := sort(filenametbl,3)
    every i := 1 to *stbl -1 by 2 do {
	write(outtext,left(stbl[i],35," ")," ",stbl[i+1])
    }
    write(&errout,"maptarfile:  ",fname," contains the list of changes")
    close(outtext)
    return &null

end