parsetable.awk
Generated on Tue Dec 05 17:39:06 Eastern Standard Time 2006 from parsetable.awk
# One-time setup, run before any input is read.
BEGIN {
    # --- Input side ---------------------------------------------------
    # Each record is the text that follows a "<TR"; each field is the text
    # that follows a "<TD".  IGNORECASE (a gawk extension) makes these
    # separators, and the regex matches in the main rule, case-insensitive.
    RS = "<TR"
    FS = "<TD"
    IGNORECASE = 1

    # --- Output side --------------------------------------------------
    # ORS is empty so that print never appends anything on its own; the
    # main rule prints OFS and "\n" explicitly where it wants them.
    ORS = ""
    OFS = ","

    # Not referenced in the visible part of this file; presumably consumed
    # by printcsv() when it quotes a field -- confirm against its definition.
    FIELD_SEPARATOR = ","
    FIELD_DELIMITER = "\""
}
# Main rule: runs for every record (one per "<TR" in the input) and writes one
# CSV line built from fields 2..NF (one per "<TD").  Field 1 is the text before
# the first "<TD" and is never printed.
# printcsv() and trim() are defined outside this section of the file.
{
    # Replace form feed, newline, carriage return, tab and vertical tab with
    # a literal " " so a cell that spans several lines stays on one CSV line.
    gsub(/[\f\n\r\t\v]/, " ")

    # Eat the remaining '<TR ...' attributes up to and including the first '>'.
    # A record that contains no '>' at all is blanked.
    rightbracketpos = index($0, ">")
    if (rightbracketpos)
        $0 = substr($0, rightbracketpos + 1)
    else
        $0 = ""

    # Eat closing row/cell tags (case-insensitive because BEGIN sets IGNORECASE).
    gsub(/<\/tr>/, "")
    gsub(/<\/td>/, "")

    for (i = 2; i <= NF; i++) {
        # Eat the remaining '<TD ...' attributes up to and including the first
        # '>'; a field without any '>' becomes blank.
        rightbracketpos = index($i, ">")
        if (rightbracketpos)
            $i = substr($i, rightbracketpos + 1)
        else
            $i = ""

        # Remove every complete <...> tag left inside the cell.  A '<' with no
        # '>' after it is left in place.  This replaces a hand-written loop
        # that (a) stored the result of the comparison (0 or 1) in
        # rightbracketpos because '>' binds tighter than '=', so the wrong
        # span was cut, and (b) never terminated on a '<' without a later '>'.
        gsub(/<[^>]*>/, "", $i)

        # trim() removes spaces at the start and end of the field.
        print(printcsv(trim($i)))

        # ORS is empty, so the separator between cells is written by hand.
        if (i != NF)
            print(OFS)
    }

    # Terminate the CSV line (again by hand, since ORS is empty).
    print("\n")
}
1 files processed.