#!/usr/bin/perl
# awkm
# Prints awk style fields in formatted lines.
#
#
# Author: Eric Canzler          Create Date: 03/24/2016
#                               Change Date: 03/24/2016
# Review: Eric Canzler          Review Date: 03/24/2016
#
# Update: Fix ignore empty first field        09/01/2017
# Update: Fix parenthesis matching            09/03/2016
# Update: Change period matching              07/21/2016
# Update: added splitchars CLI option         07/20/2016
# Update: added join lines capability         03/30/2016
#
#
# Command line:
#   $ARGV[0]             the format string (always)
#   $ARGV[1], $ARGV[2]   a filename and/or a -F<splitchars> option, in
#                        either order; without a filename STDIN is read.
#
use strict;
use warnings;

# When true, each output record is followed by a space instead of a newline.
# There is no command line switch for this; it is a fixed setting.
my $onelineflag = 0;

# When true, runs of the split character are squeezed down to a single one and
# leading split characters are dropped, so no field is ever empty.  Reserved
# for a possible future switch that would keep empty fields.
my $compresschars = 1;

# Backslash escapes recognised in the format string and what they expand to.
my %unescaped = (
    'F' => 'F',
    'n' => "\n",
    't' => "\t",
    '$' => '$',
);

# Print the usage text to STDOUT.
sub print_help {
    print <<'EOF';
awkm: splits lines by whitespace into component character strings,
and prints the fields in the order specified by the command line
argument string.

Usage: awkm <formatstring> [filename] [-F<splitchars>]
Usage: cat filename | awkm <formatstring> [-F<splitchars>]

With awk, you print the specified fields with format strings in
between the fields.  With awkm, you print a single format string,
with fields specified in between the characters of the format string.

The formatstring is inside quote marks and fields are specified by Fn
where n is the field number.  Field numbers start from 1 and cannot
contain leading zeros.  Field numbers go up to F99.

Example: awkm "F1 is the first field, F2 is the second"
This prints out for each line:
<field1> is the first field, <field2> is the second

If a field does not exist, then F#n is printed to mark the place where
it would have been if it existed.  This can be turned off by putting
the letter "i" for ignore, as the very first character of the
formatstring.  If the very first real character of the format string
happens to be the letter "i" then it must be escaped with a backslash.

Field numbers start with 1, so the special field F0 means
"The whole line."

Cat any text file through "awkm F1" and you will see only the first
word of every line, unless it's a blank line and then you will see F#1.

A capital F that should be printed as-is must be escaped as "\F".

The format string does not need quotes if it contains no spaces, but
special characters will need to be escaped to prevent shell
interpolation.  But simple things like just F1 or F2F1 with no spaces
don't require quotes.

By default lines are split on whitespace.  Give -F followed by one or
more characters to split on those characters instead, for example
-F: or "-F,;".  Use \t for a tab.  If the list contains a space,
whitespace splitting stays on and the other listed characters are
treated as whitespace too.

awkm can split lines by inserting "\n" into the formatstring.
This can be used to turn a single line into two or more lines with
only the fields you specify in each resulting line.

awkm can also join lines by inserting "\j" in the formatstring.
To join two lines, we have one "\j" and everything before the "\j"
is applied to the first line, and everything after, is applied to the
second line read in.  If there are further joins another line is read
in, until the final join is complete.  The same process then continues
through end of input.

You can also combine joins and splits on lines producing interesting
results.
EOF
    return;
}

# Return the text that replaces one Fn specifier.
#   $spec   - the digits that followed the F ("0" .. "99")
#   $line   - the normalized input line (what F0 expands to)
#   $parts  - array ref of the fields of that line
#   $ignore - true to replace a missing field with nothing instead of F#n
sub field_value {
    my ($spec, $line, $parts, $ignore) = @_;

    # Fields are numbered from 1; F0 (index -1) means the whole line.
    my $index = int($spec) - 1;
    return $line            if $index < 0;
    return $parts->[$index] if $index < scalar @{$parts};
    return $ignore ? '' : "F#$spec";
}

# Expand one format segment against one input line and return the result.
# Escapes (\F \n \t \$) and field specifiers are handled in a single left to
# right pass, so inserted field data is never rescanned: an F followed by
# digits, or a literal backslash-n, inside the input comes out unchanged.
sub render {
    my ($format, $line, $parts, $ignore) = @_;

    # Two-digit specifiers win over one-digit ones because the second digit
    # is matched greedily.
    $format =~ s{\\([Fnt\$])|F([0-9][0-9]?)}{
        defined $1 ? $unescaped{$1}
                   : field_value($2, $line, $parts, $ignore)
    }ge;

    return $format;
}

# ---------------------------------------------------------------------------
# Command line handling
# ---------------------------------------------------------------------------

# Help is requested by -h... or a lone "h" in any of the first three
# arguments.  With no arguments at all there is no format string to apply,
# so show the help as well.
my $wants_help
    = grep { defined $_ && ($_ =~ /^-h/i || $_ =~ /^h$/i) } @ARGV[0 .. 2];
if (!@ARGV || $wants_help) {
    print_help();
    exit(1);
}

my $format    = $ARGV[0];
my $filename  = "";
my $splitspec = "";

if (@ARGV > 1) {
    # Anchored on purpose: a filename that merely contains "-F" somewhere
    # (my-File.txt) is a filename, not a split specification.
    if ($ARGV[1] =~ /^-F./) {
        $splitspec = $ARGV[1];
        $filename  = $ARGV[2] if @ARGV > 2;
    }
    else {
        $filename  = $ARGV[1];
        $splitspec = $ARGV[2] if @ARGV > 2;
    }
}

# Read from the named file if there is one, otherwise from STDIN.
my $in;
if ($filename ne "") {
    open($in, '<', $filename)
        or die "Can't open $filename for input: $!";
}
else {
    $in = \*STDIN;
}

# ---------------------------------------------------------------------------
# Split characters
# ---------------------------------------------------------------------------

# The default main split character is a space, which is treated as "any
# contiguous whitespace".  Extra split characters, if any, are converted to
# the main split character on every input line before splitting.
my $splitchar = " ";
my @replacechars;

if (length($splitspec) > 2) {
    $splitspec =~ s/^-F//;
    $splitspec =~ s/\\t/\t/g;    # an escaped t stands for a real tab
    $splitspec =~ s/\\//;        # drop the first remaining escape backslash

    if ($splitspec =~ / /) {
        # A space in the list keeps whitespace splitting; every other listed
        # character is turned into a space first.
        $splitspec =~ s/ //g;
        @replacechars = split(//, $splitspec);
    }
    elsif ($splitspec ne "") {
        # No space: the first listed character becomes the main split
        # character and the rest are converted to it.
        @replacechars = split(//, $splitspec);
        $splitchar    = shift(@replacechars);
    }
    # If nothing is left after removing escapes, keep the default space.
}

# quotemeta makes every split character literal, so parentheses, periods,
# brackets, backslashes and the like need no special casing.
my $split_re = qr/\Q$splitchar\E/;
my $extra_re;
if (@replacechars) {
    my $class = join('', map { quotemeta($_) } @replacechars);
    $extra_re = qr/[$class]/;
}

# ---------------------------------------------------------------------------
# Format string
# ---------------------------------------------------------------------------

# A leading "i" means: replace missing fields with nothing instead of F#n.
my $ignore = 0;
if ($format =~ s/^i//) {
    $ignore = 1;
}

# An escaped i as the first real character is a literal i for the output;
# only the backslash is removed.
$format =~ s/^\\(?=i)//;

# Each "\j" joins one more input line into the same output record, so the
# format is cut into one segment per input line of a record.  An empty format
# still needs one (empty) segment, otherwise no input would ever be consumed.
my @formatlines = split(/\\j/, $format);
@formatlines = ("") if !@formatlines;

# ---------------------------------------------------------------------------
# Main loop: one output record per group of scalar(@formatlines) input lines
# ---------------------------------------------------------------------------

my $alldone = 0;

until ($alldone) {
    my $finalprintstring = "";
    my $linesused        = 0;

    foreach my $formatline (@formatlines) {
        my $thisline = <$in>;
        if (!defined($thisline)) {
            $alldone = 1;
            last;
        }
        chomp $thisline;

        # Turn every extra split character into the main split character.
        if (defined($extra_re)) {
            $thisline =~ s/$extra_re/$splitchar/g;
        }

        # Squeeze runs of the split character to one and drop a leading one,
        # so the first field is never empty.
        if ($compresschars) {
            $thisline =~ s/(?:$split_re)+/$splitchar/g;
            $thisline =~ s/^$split_re//;
        }

        # A single space asks split for awk style whitespace splitting;
        # anything else is split on that literal character.
        my @parts = ($splitchar eq " ")
                  ? split(' ', $thisline)
                  : split($split_re, $thisline);

        $finalprintstring .= render($formatline, $thisline, \@parts, $ignore);
        $linesused++;
    }

    # Nothing was read for this record: input ended on a record boundary.
    last if !$linesused;

    # If input ended in the middle of a join group, the lines already read
    # are still printed rather than silently dropped.
    if ($onelineflag) {
        print "$finalprintstring ";
    }
    else {
        print "$finalprintstring\n";
    }
}

close($in) if $filename ne "";

exit(0);