Code Files

XTAB.AWK

Generated on Tue Dec 05 17:39:05 Eastern Standard Time 2006 from XTAB.AWK

# Program   : XTAB.AWK
# Purpose   : Create a cross-tab file from a .CSV file
# Author    : Bob Jonkman <bjonkman@sobac.com>

# Copyright 2008 Bob Jonkman and/or SOBAC Microcomputer Services

#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.

# Date      : 9 March 2004
# Modified  : 15 June 2005 (made generic based on XTAB-VERSION-PO.AWK)

# Usage     : gawk -f xtab.awk -f library.csv ([-v TIMESLOT=(0|1)] | [-v DATESLOT=(0|1)]) [-v XFIELDNUM=x | -v XFIELDNAME=xname] [-v YFIELDNUM=y | -v YFIELDNAME=yname]  inputfile.csv > outputfile.csv

# Note      : XFIELD is across, YFIELD is down

# Variables : XFIELDNAME = (optional) the fieldheader for column values
#             XFIELDNUM  = (optional) the fieldnumber for column values if there is no XFIELDNAME
#             YFIELDNAME = (optional) the fieldheader for row values
#             YFIELDNUM  = (optional) the fieldnumber for row values if there is no YFIELDNAME
#             TIMESLOT   = (optional) Treat the YFIELD as Datetime, slotted by hour
#             DATESLOT   = (optional) Treat the YFIELD as Datetime, slotted by date

BEGIN    {      ORS   = ""   ;
                FS    = ","  ;
                QUOTE = "\"" ;
                COMMA = ","  ;
    
                FIELD_DELIMITER = QUOTE ;
                FIELD_SEPARATOR = COMMA ;
}


##### Read the headers #####

(FNR == 1)    {     numfields = parsecsv($0,headers)
                    for (i in headers)
                    {
                        if(headers[i] == XFIELDNAME)
                           xfield = i ;

                        if(headers[i] == YFIELDNAME)
                           yfield = i ;
                    }
            
                    if(!xfield)                 # if XFIELDNAME doesn't exist
                        xfield = XFIELDNUM ;

                    if(!xfield)                 # if XFIELDNUM doesn't exist either
                        xfield = 1 ;

                    if(!yfield)                 # if YFIELDNAME doesn't exist
                        yfield = YFIELDNUM ;

                    if(!yfield)
                        yfield = 2 ;            # if YFIELDNUM doesn't exist either
        }


(!(NR % 10000))    {   print("NR=" NR " X=" seenx " Y=" seeny "\n" ) > "/dev/stderr"
}


(FNR != 1)    {     if($0 == "=====")   # Don't process CSV footers
                        nextfile ;

                    parsecsv($0, record)    ;

                    if (TIMESLOT || DATESLOT)     # truncate to hour or date
                    {
# print("##### DEBUG ##### NR=" NR " if(TIMESLOT || DATESLOT): TIMESLOT=" TIMESLOT " DATESLOT=" DATESLOT "\n") > "/dev/stderr"
                         # separate Date (datetime[1]) from Time (datetime[2])
                         split(record[yfield],datetime," ") ;

                         if (TIMESLOT)
                         {
# print("##### DEBUG ##### NR=" NR " if(TIMESLOT): TIMESLOT=" TIMESLOT " DATESLOT=" DATESLOT "\n") > "/dev/stderr"

                            # separate Time into Hour (time[1]), Minute (time[2]) and Second (time[3])
                            split(datetime[2],time,":") ;

                            # Reconstruct truncated field
                            record[yfield] = datetime[1] " " time[1] ":00"
                         }
                         else   # if (DATESLOT)
                         {
# print("##### DEBUG ##### NR=" NR " elseif (DATESLOT): TIMESLOT=" TIMESLOT " DATESLOT=" DATESLOT "\n") > "/dev/stderr"

                            record[yfield] = datetime[1]
                         }
                    }

# Count the number of X fields seen
                    if (!xcount[record[xfield]])        
                    {
                        seenx++;
                        xnames[seenx] = record[xfield];        # itemize the x names

                # print("seenx =====DEBUG=====", seenx, xnames[seenx] "\n" );
                    }
                    xcount[record[xfield]]++ ;            

# Count the number of Y fields seen
                    if (!ycount[record[yfield]])            
                    {
                        seeny++;
                        ynames[seeny] = record[yfield] ;

                # print("seeny =====DEBUG=====", seeny, ynames[seeny] "\n");
                    }
                    ycount[record[yfield]]++;

# count the entry            
                    crosstab[record[xfield],record[yfield]]++ ;        
}



# Post-process: print headers, totals, and the crosstab array

END {   
        asort(xnames) ;   # ensure the top headers print in alphabetical (or numerical) order

# print headers
        print(printcsv("'" headers[yfield] "' by '" headers[xfield] "'"));
        for (i=1; i <= seenx; i++)
            print(COMMA printcsv(xnames[i]));

        print(COMMA COMMA "Totals\n" );                # Newline to complete the headers

# print the crosstab
        for (i=1; i <= seeny; i++)
          {
            print(printcsv(ynames[i]))                          | "SORT" ;
            ytotal += ycount[ynames[i]];
        
            for (j=1; j <= seenx; j++)
                print(COMMA crosstab[xnames[j], ynames[i] ] )   | "SORT" ;

            print(COMMA COMMA ycount[ynames[i]])                | "SORT" ;    # Print blank column, row total
            print( "\n" )                                       | "SORT" ;            # Newline to complete this row
          }

        print("=====\n");
        print("Totals")
        for (j=1; j <= seenx; j++)
            print(COMMA xcount[xnames[j]]);
        print(COMMA COMMA ytotal);
        print("\n\n");
        print("Unique entries:\n")
        print(printcsv(headers[yfield]) COMMA seeny "\n")
        print(printcsv(headers[xfield]) COMMA seenx "\n")
    }

# EOF: XTAB.AWK
1 files processed.