[comp.lang.perl] Multi line, flat file, record reader ...

wengland@stephsf.stephsf.com (Bill England) (01/10/91)

package rec_reader; 
## 
 #  Copyright (c) 1991, Stephen Software Systems, Inc.  
 #  All Rights Reserved.
 #
 #  Permission is granted to all interested parties to distribute 
 #  this perl library under the terms of the GNU PUBLIC LICENSE,
 #
 #  ( This is the same license included with most major GNU       )
 #  ( software packages.  Look for the file COPYING in the build  )
 #  ( directory of the GNU software.                              )
 #
 # This copyright and license notice must be retained. This 
 # program is distributed WITHOUT ANY WARRANTY and without even 
 # the implied warranty of MERCHANTABILITY or FITNESS FOR A 
 # PARTICULAR PURPOSE.
 #
 # # #
 #
 # This software system is currently under construction/design/hack,
 # Please don't mind the mess :-)   ...
 #
 # Formatting Notes:
 #     80 column width, 4 space tabs.
 #
 # Modification Notes:
 #
 #      Bill England, Sun Dec 02 15:00:03 PST 1990, 
 #		Library documentation.
 # 
 #      Bill England, November 1990
 #		Created.
 #
 # Please send problems, enhancements, and corrections to
 # support@stephsf.COM
##
##
 #  _Randomly arranged record parser_ library.  This library is
 #  used for parsing randomly arranged, multiple line, flat record 
 #  files where a single record is not necessarily on one line and 
 #  where record formats, and record/field separators differ from file 
 #  to file.
 #
 #  Routines;
 #      Parse_Index,
 #			Is used to find the set of lines that form the
 #			index of the file.  To use, feed each line of the
 #			file to Parse_Index.  When the complete index has
 #			been found and separated into field names the
 #			function returns 1 (TRUE); until then it returns 0.
 #
 #      Parse_Rec,
 #      Rtn_Last_Rec,
 #			Are used to collect multiple lines of the data file
 #			and return a completely found record in an associative
 #			array keyed by the names of the fields from the
 #			current file's index.
##
##
 #   As an example of how to use these routines the following 
 #   example is provided.
 # # # #
 #
 # $f_have_index = 0;
 # $last_in_file_name = 0;
 # while(<>){
 #		chop;
 # 
 #		if( $f_have_index == 0  ){
 #			$f_have_index = &Parse_Index($_);
 #		}else{
 #			if( $last_in_file_name eq $ARGV ){
 #				%rec = &Parse_Rec($_);
 #			}else{
 #				  #  When changing files force  out the last record
 #				  #  and reset the have index flag.
 #					%rec = &Rtn_Last_Rec;
 #					$f_have_index = &Parse_Index($_);
 #			}
 # 				&Your_Routine_That_Uses_The_Data( %rec ) if %rec;
 #		}
 #		$last_in_file_name = $ARGV;
 #	}
 #	die "Incomplete record file.\nAn index and records were not found.\n"
 #		unless $f_have_index;
 # 
 #  %rec = &Rtn_Last_Rec;
 #  &Your_Routine_That_Uses_The_Data( %rec ) if %rec;
 #
 # # # # # # # # #
 # An example data file follows; ( record separator = ! 
 #                                  and field separator = ~ )
 #
 #!~Company
 #~Name
 #~TitleName
 #~Address
 #~CityStateZip
 #
 #!~Washington State Employees Credit Union
 #~Jane Smith
 #~Ms. Smith
 #~P.O. Box WSECU
 #~Olympia, Washington    98507
 #
 #!~Weyerhaeuser Tacoma Credit Union
 #~Jane Smith
 #~Ms. Smith
 #~33615 First Way South
 #~Federal Way, Washington     98003
 #
 #!~Alaska Airlines Employee Federal Credit Union
 #~Jane Smith
 #~Ms. Smith
 #~19530 Pacific Hwy South #201
 #~Seattle, Washington       98188
##
##
 # Parse_Rec depends on having a valid index record found
 # at the beginning of a file.  The function Parse_Index
 # is required to run before Parse_Rec/Rtn_Last_Rec can be used.
 #
 # Global Vars ( Multi_Line_Rec library ):
 #   rec_sep     ... record separator.
 #   fld_sep	 ... field separator.
 #   rec_concat  ... is the string buffer for the current record.
 #   parse       ... T/F indicates if parse has started and triggers
 #                   above variable initializations.
##

# Package state shared by the routines below.
$parse      = 0;     # false until Parse_Index has seen the first line of a file
@name_index = ();    # field names, filled in by Parse_Index

1;                   # true value so that a require of this library succeeds
##
 # Parse_Rec, append one line of the data file to the record buffer
 # and return a record as soon as a complete one is available.
 #
 # Argument:
 #   $line_in ... next line of the data file.  Lines are concatenated
 #                as given, so the caller is expected to have removed
 #                the line terminator.
 #
 # Returns:
 #   An associative array that maps every name in @name_index to the
 #   corresponding field of the record, or the empty list () when
 #     - no complete record has been buffered yet, or
 #     - the number of fields in the record differs from the number
 #       of names in the index (a diagnostic is written to STDERR
 #       and the offending record is discarded).
 #
 # Dies unless $parse is true, i.e. unless Parse_Index has been called
 # for the current file.
 #
 # Notes:
 #   - A record is complete once the record separator that starts the
 #     next record, followed by at least one more character, is in the
 #     buffer.  The whole buffer is examined (as Parse_Index does), so
 #     a separator that ends one line is recognised when the following
 #     line arrives.
 #   - The separators are arbitrary characters read from the data file.
 #     They are backslash escaped before being used as patterns so that
 #     characters such as | . * or ^ are matched literally.
 #   - At most one record is returned per call.
##
sub main'Parse_Rec{
	local( $line_in ) = @_;
	local( $rec_pat, $fld_pat, $fld_data, $remain, $trash );
	local( $idx, $fld_count, @flds, $cnt, %ass_r );

	die "The Parse_Index function has not yet completed successfully.\n"
		unless  $parse;

	$rec_concat = $rec_concat.$line_in;

	# Literal-match versions of the two separators.
	($rec_pat = $rec_sep) =~ s/(\W)/\\$1/g;
	($fld_pat = $fld_sep) =~ s/(\W)/\\$1/g;

	# Split the buffer at the first record separator.  $remain is what
	# follows it, i.e. the beginning of the next record.
	#
	($fld_data, $remain) = split(/$rec_pat/, $rec_concat, 2);

	# No separator yet, or nothing after it: keep collecting lines.
	return () if $remain eq '';

	# $trash is the first character of the next record (its leading
	# field separator); everything after it stays buffered.
	#
	($trash, $rec_concat) = split(//, $remain, 2);

	@flds = split(/$fld_pat/, $fld_data);

	# If trailing fields are blank/null then @flds will be short
	# counted, so count the separators instead.  The substitution
	# replaces every separator with itself and yields the number of
	# separators; one more than that is the exact number of fields.
	#
	$fld_count = ($fld_data =~ s/$fld_pat/$fld_sep/g);
	$fld_count++;

	if( @name_index != $fld_count ){
		print STDERR "Number of fields in record does not match index.\n";
		print STDERR "$fld_data\n",
						join( "$fld_sep", @name_index ),"\n","\n";
		return ();
	}

	$cnt = 0;
	foreach $idx (@name_index){
		$ass_r{$idx} = $flds[$cnt++];
	}
	return %ass_r;
}
##
 # Rtn_Last_Rec, Return the last record of a file.
 #
 # The last record of a file is not followed by another record
 # separator, so Parse_Rec never returns it.  Call this routine at
 # end of file (or when switching files) to flush the record buffer.
 #
 # Takes no arguments.
 #
 # Returns:
 #   An associative array that maps every name in @name_index to the
 #   corresponding field of the buffered record, or the empty list ()
 #   when
 #     - nothing is buffered, or
 #     - the number of fields in the record differs from the number
 #       of names in the index (a diagnostic is written to STDERR).
 #
 # Side effects:
 #   $parse is reset to 0, so Parse_Index must find a new index before
 #   Parse_Rec may be used again, and the record buffer is emptied.
 #
 # The separators are backslash escaped before being used as patterns
 # so that characters such as | . * or ^ are matched literally.
##
sub main'Rtn_Last_Rec{
	local( $rec_pat, $fld_pat, $fld_data );
	local( $idx, $fld_count, @flds, $cnt, %ass_r );

	# Returning the last record implies that any existing index
	# is now garbage and that the index parse boolean is no longer
	# true.
	#
	$parse = 0;

	# An empty buffer is not a record with one empty field.
	return () if $rec_concat eq '';

	# Literal-match versions of the two separators.
	($rec_pat = $rec_sep) =~ s/(\W)/\\$1/g;
	($fld_pat = $fld_sep) =~ s/(\W)/\\$1/g;

	# Take the buffer up to the first record separator (normally the
	# whole buffer) and mark the buffer as consumed.
	#
	($fld_data) = split(/$rec_pat/, $rec_concat, 2);
	$rec_concat = '';

	@flds = split(/$fld_pat/, $fld_data);

	# If trailing fields are blank/null then @flds will be short
	# counted, so count the separators instead.  The substitution
	# replaces every separator with itself and yields the number of
	# separators; one more than that is the exact number of fields.
	#
	$fld_count = ($fld_data =~ s/$fld_pat/$fld_sep/g);
	$fld_count++;

	if( @name_index != $fld_count ){
		print STDERR "Number of fields in record does not match index.\n";
		print STDERR "$fld_data\n", join( "$fld_sep", @name_index ),"\n";
		return ();
	}

	$cnt = 0;
	foreach $idx (@name_index){
		$ass_r{$idx} = $flds[$cnt++];
	}
	return %ass_r;
}
##
 # Parse_Index, find the index (the first record) of a file.
 #
 # Feed every line of a file to this routine until it returns 1.
 #
 # Argument:
 #   $line_in ... next line of the file.  Lines are concatenated as
 #                given, so the caller is expected to have removed
 #                the line terminator.
 #
 # On the first call for a file ($parse is false):
 #   - the first character of the line becomes the record separator,
 #   - the second character becomes the field separator,
 #   - everything left is the beginning of the first field name.
 #
 # Returns:
 #   1 once the complete index has been seen, that is once the record
 #     separator that starts the first data record, followed by at
 #     least one more character, has been buffered.  @name_index then
 #     holds the lower-cased field names and the record buffer holds
 #     the beginning of the first data record.
 #   0 while more lines are needed.
 #
 # Dies when the two separators are identical, or when the first line
 # is too short to supply a field separator.  No package state is
 # changed in that case.
 #
 # The separators are backslash escaped before being used as patterns
 # so that characters such as | . * or ^ are matched literally.
##
sub main'Parse_Index {
	local( $line_in ) = @_;
	local( $rec_pat, $fld_pat, $fld_names, $remain, $trash );

	# First char is record seperator
	# Second char is  field seperator
	# Everything left is name of first field
	#
	if (!$parse){
		local( $new_rec_sep, $new_fld_sep, $rest ) = split(//, $line_in, 3);

		die "Identical Record and Field Seperators.\n"
			unless $new_rec_sep ne $new_fld_sep;
		die "Index line does not supply a field separator.\n"
			if $new_fld_sep eq '';

		# Only commit to the new file once the separators are known
		# to be usable.
		#
		($rec_sep, $fld_sep) = ($new_rec_sep, $new_fld_sep);
		$rec_concat = '';
		$parse      = 1;
		$line_in    = $rest;
	}

	$rec_concat = $rec_concat.$line_in;

	# Literal-match versions of the two separators.
	($rec_pat = $rec_sep) =~ s/(\W)/\\$1/g;
	($fld_pat = $fld_sep) =~ s/(\W)/\\$1/g;

	# Split the buffer at the first record separator.  $remain is what
	# follows it, i.e. the beginning of the first data record.
	#
	($fld_names, $remain) = split(/$rec_pat/, $rec_concat, 2);

	# No separator yet, or nothing after it: the index is incomplete.
	return 0 if $remain eq '';

	# $trash is the first fld_sep of the next record.
	# ( Works like  chop except it chops the first char
	#   instead of the last. )
	#
	($trash, $rec_concat) = split(//, $remain, 2);

	# Make the record names case insensitive,
	# others may want to change this ... a runtime option
	# could  be created, say -s (case-sensitive).
	#
	$fld_names =~ y/A-Z/a-z/;

	@name_index = split(/$fld_pat/, $fld_names);
	return 1;
} # end of parse label index
-- 
 +-  Bill England,  wengland@stephsf.COM -----------------------------------+
 |   * *      H -> He +24Mev                                                |
 |  * * * ... Oooo, we're having so much fun making itty bitty suns *       |
 |__ * * ___________________________________________________________________| 

wengland@stephsf.stephsf.com (Bill England) (01/11/91)

 Oops, I forgot to change the comments after merging some of the 
 functions together.  The comments under "Routines:" should look
 more like this;

 #  Routines;
 #      Parse_Index,
 #          Is used to find the set of lines that form
 #          the index to the file.  When the file index
 #          has been found the function returns 1 (TRUE).
 #
 #      Parse_Rec,
 #      Rtn_Last_Rec,
 #          Are used to collect multiple lines of the data file
 #          and return the record in an associative array.

  Just in case applications are not obvious, this library helps in
parsing files where all of the record elements may not be on one
line.  In particular some word processors accept files of the format
that these routines can process.

  I'd like to write a sort program that would sort files in this
type of format based upon a record's field name.  In the example, for
instance, to sort by "CityStateZip".  What would be the fastest
sorting method to use?


-- 
 +-  Bill England,  wengland@stephsf.COM -----------------------------------+
 |   * *      H -> He +24Mev                                                |
 |  * * * ... Oooo, we're having so much fun making itty bitty suns *       |
 |__ * * ___________________________________________________________________|