xmlparser.awk
Generated on Tue Dec 05 17:39:06 Eastern Standard Time 2006 from xmlparser.awk
# From arnold  Fri Oct 19 11:47:19 2001
# Return-Path: <arnold@skeeve.com>
# Received: (from arnold@localhost)
# 	by skeeve.com (8.11.0/8.11.0) id f9J9lJo25059
# 	for arnold; Fri, 19 Oct 2001 11:47:19 +0200
# Date: Fri, 19 Oct 2001 11:47:19 +0200
# From: Aharon Robbins <arnold@skeeve.com>
# Message-Id: <200110190947.f9J9lJo25059@skeeve.com>
# To: arnold@skeeve.com
# Subject: xml parser
# Status: R
# 
# Path: iad-read.news.verio.net!dfw-artgen.news.verio.net!dfw-peer.news.verio.net!news.verio.net!crtntx1-snh1.gtei.net!cpk-news-hub1.bbnplanet.com!news.gtei.net!nf3.bellglobal.com!sunqbc.risq.qc.ca!torn!qcarhaaa.nortelnetworks.com!bcarh189.ca.nortel.com!bcarh8ac.ca.nortel.com!bcarh8ab.ca.nortel.com!not-for-mail
# From: Steve Coile <scoile@csc.com>
# Newsgroups: comp.lang.awk
# Subject: XML parser for awk (was: awk newbie needs to parse xml file)
# Date: Wed, 17 Oct 2001 13:55:32 -0400
# Organization: Computer Sciences Corporation
# Lines: 547
# Message-ID: <3BCDC614.AC548AB8@csc.com>
# References: <3d68af91.0110151058.54e2d55b@posting.google.com>
# NNTP-Posting-Host: wpgcy0j6.us.nortel.com
# Mime-Version: 1.0
# Content-Type: text/plain; charset=us-ascii
# Content-Transfer-Encoding: 7bit
# X-Mailer: Mozilla 4.77 [en] (X11; U; Linux 2.4.3-12 i686)
# X-Accept-Language: en
# Xref: dfw-artgen.news.verio.net comp.lang.awk:10853
# 
# Reading the article about the individual wanting to process XML files
# with awk, I was inspired to try to write an XML parser in awk.  Below is
# the fruit of my effort.  Comments welcome.
# 
# ----- cut here -----
# $Id: xmlparse.awk,v 1.1 2001/10/17 17:49:33 scoile Exp $
##############################################################################
#
# xmlparse.awk - A simple XML parser for awk
#
# Author:	Steve Coile <scoile@csc.com>
#
# Version:	1.0 (20011017)
#
# Synopsis:
#
#	awk -f xmlparse.awk [FILESPEC]...
#
# Description:
#
#	This script is a simple XML parser for (modern variants of) awk.
#	Input in XML format is saved to two arrays, "type" and "item".
#
#	The term, "item", as used here, refers to a distinct XML element,
#	such as a tag, an attribute name, an attribute value, or data.
#
#	The indexes into the arrays are the sequence number that a
#	particular item was encountered.  For example, the third item's
#	type is described by type[3], and its value is stored in item[3].
#
#	The "type" array contains the type of the item encountered for
#	each sequence number.  Types are expressed as a single word:
#	"error" (invalid item or other error), "begin" (open tag),
#	"attrib" (attribute name), "value" (attribute value), "end"
#	(close tag), and "data" (data between tags).
#
#	The "item" array contains the value of the item encountered
#	for each sequence number.  For types "begin" and "end", the
#	item value is the name of the tag.  For "error", the value is
#	the text of the error message.	For "attrib", the value is the
#	attribute name.  For "value", the value is the attribute value.
#	For "data", the value is the raw data.
#
#	WARNING: XML-quoted values ("entities") in the data and attribute
#	values are *NOT* unquoted; they are stored as-is.
#
###############################################################################
BEGIN {
# In XML, literal "<" and ">" are only valid as tag delimiters;
# to include a "<" or ">" as data, they must be quoted: "<" and
# ">".  So we know that if we encounter a ">", we have reached the
# end of a tag.  This makes a convenient end-of-record marker, as the
# end-of-tag delimiter marks a special event, whereas a new-line is
# simply whitespace in XML.
	RS = ">";
	lineno = 1;
	sptr = 0;
}
# Count input lines.
{
	data = $0;
	lineno += gsub( /\n/, "", data );
	data = "";
}
# Special modes of operation.  These handle special XML sections, such
# as literal character data containing XML meta-characters ("cdata"
# sections), comments, and processing instructions ("pi") for other
# document processors.
# "Cdata" sections are teminated by the sequence, "]]>".
( mode == "cdata" ) {
	if ( $0 ~ /\]\]$/ ) {
		sub( /\]\]$/, "", $0 );
		mode = "";
	};
	item[idx] = item[idx] RS $0;
	next;
}
# Comment sections are terminated by the sequence, "-->".
( mode == "comment" ) {
	if ( $0 ~ /--$/ ) {
		sub( /--$/, "", $0 );
		mode = "";
	};
	item[idx] = item[idx] RS $0;
	next;
}
# Processing instruction sections are terminated by the sequence, "?>".
( mode == "pi" ) {
	if ( $0 ~ /\?$/ ) {
		sub( /\?$/, "", $0 );
		mode = "";
	};
	item[idx] = item[idx] RS $0;
	next;
}
( !mode ) {
	mline = 0;
# Our record separator is the end-of-tag marker, ">".  If we've
# encountered an end-of-tag marker, we should have a beginning-of-tag
# marker ("<") somewhere in the input record.  If not, either there
# is a spurious end-of-tag marker, or the record was terminated by
# the end-of-file.
	p = index( $0, "<" );
# Any data preceeding the beginning-of-tag marker is raw data.  If no
# beginning-of-tag marker is present, everything in the input is data.
	if ( !p || ( p > 1 )) {
		idx += 1;
		type[idx] = "data";
		item[idx] = ( p ? substr( $0, 1, ( p - 1 )) : $0 );
		if ( !p ) next;
		$0 = substr( $0, p );
	};
# Recognize special XML sections.  Sections are not processed as XML,
# but handled specially.  If the section end with the current input
# record, we continue processing XML in the next record; otherwise,
# we enter a special mode and perform special processing.
# Character data ("cdata") sections contain literal character data
# containing XML meta-characters that should not be processed. 
Character
# data sections begin with the sequence, "<![CDATA[" and end with "]]>".
# This section may span input records.
	if ( $0 ~ /^<!\[[Cc][Dd][Aa][Tt][Aa]\[/ ) {
		idx += 1;
		type[idx] = "cdata";
		$0 = substr( $0, 10 );
		if ( $0 ~ /\]\]$/ ) sub( /\]\]$/, "", $0 );
		else {
			mode = "cdata";
			mline = lineno;
		};
		item[idx] = $0;
		next;
	}
# Comments begin with the sequence, "<!--" and end with "-->".
# This section may span input records.
	else if ( $0 ~ /^<!--/ ) {
		idx += 1;
		type[idx] = "comment";
		$0 = substr( $0, 5 );
		if ( $0 ~ /--$/ ) sub( /--$/, "", $0 );
		else {
			mode = "comment";
			mline = lineno;
		};
		item[idx] = $0;
		next;
	}
# Declarations begin with the sequence, "<!" and end with ">".
# This section may *NOT* span input records.
	else if ( $0 ~ /^<!/ ) {
		idx += 1;
		type[idx] = "decl";
		$0 = substr( $0, 3 );
		item[idx] = $0;
		next;
	}
# Processing instructions ("pi") begin with the sequence, "<?" and end
# with "?>".  This section may span input records.
	else if ( $0 ~ /^<\?/ ) {
		idx += 1;
		type[idx] = "pi";
		$0 = substr( $0, 3 );
		if ( $0 ~ /\?$/ ) sub( /\?$/, "", $0 );
		else {
			mode = "pi";
			mline = lineno;
		};
		item[idx] = $0;
		next;
	};
# Beyond this point, we're dealing strictly with a tag.
	idx += 1;
# A tag that begins with "</" (e.g. as in "</p>") is a close tag:
# it closes a tag-enclosed block.
	if ( substr( $0, 1, 2 ) == "</" ) {
		type[idx] = "end";
		tag = $0 = substr( $0, 3 );
	}
# A tag that begins simply with "<" (e.g. as in "<p>") is an open
# tag: it starts a tag-enclosed block.  Note that a stand-alone tag
# (e.g. "<data/>") will be handled later, and will appear as an open
# tag and close tag, with no data between.
	else {
		type[idx] = "begin";
		tag = $0 = substr( $0, 2 );
	};
# The tag name is saved in "tag" so that we can retreive it later should
# we find that the tag is stand-alone and need to save a close tag item.
	sub( /[ \n\t/].*$/, "", tag );
	tag = toupper( tolower( tag ));
	item[idx] = tag;
# Validate the tag name.  If invalid, indicate so and exit.
	if ( tag !~ /^[A-Za-z][-+_.:0-9A-Za-z]*$/ )
	{
		type[idx] = "error";
		item[idx] = "line " lineno ": " tag ": invalid tag name";
		exit( 1 );
	}
# If an open tag is encountered, its name is recorded on the stack.
# If a close tag is encountered, its name is compared against the name
# on the top of the stack.  If the names differ, an error is generated
# (XML does not allow overlapping tags).
	if ( type[idx] == "begin" ) {
		sptr += 1;
		lstack[sptr] = lineno;
		tstack[sptr] = tag;
	}
	else if ( type[idx] == "end" ) {
		if ( tag != tstack[sptr] ) {
			type[idx] = "error";
			item[idx] = "line " lineno ": " tag \
					": unexpected close tag, expecting " \
					tstack[sptr];
			exit( 1 );
		};
		delete tstack[sptr];
		sptr -= 1;
	};
	sub( /[^ \n\t/]*[ \n\t]*/, "", $0 );
# Beyond this point, we're dealing with the tag attributes, if any,
# and/or the stand-alone end-of-tag marker.
	while ( $0 ) {
# If $0 contains only a slash (/), then the tag we're processing is
# stand-alone (e.g. "<data/>"), so we generate a close tag, but no data
# between the open and close tags.
		if ( $0 == "/" )
		{
			idx += 1;
			type[idx] = "end";
			item[idx] = tag;
			delete lstack[sptr];
			delete tstack[sptr];
			sptr -= 1;
			break;
		};
# The attribute name is determined.  Note that the attribute name is
also
# saved to "attrib" so that we can reference it should the attribute
# not include a value.  If the attribute does not include a value,
# it's name is given as its value.
		idx += 1;
		type[idx] = "attrib";
		attrib = $0;
		sub( /=.*$/, "", attrib );
		attrib = tolower( attrib );
		item[idx] = attrib;
# Validate the attribute name.  If invalid, indicate so and exit.
		if ( attrib !~ /^[A-Za-z][-+_0-9A-Za-z]*$/ )
		{
			type[idx] = "error";
			item[idx] = "line " lineno ": " attrib \
					": invalid attribute name";
			exit( 1 );
		}
		sub( /^[^=]*/, "", $0 );
# Each attribute must have a value.  If one isn't explicit in the input,
# we assign it one equal to the name of the attribute itself.  Attribute
# values in the input may be in one of three forms: enclosed in double
# quotes ("), enclosed in single quotes/apostrophes ('), or a single
word.
		idx += 1;
		type[idx] = "value";
		if ( substr( $0, 1, 1 ) == "=" ) {
			if ( substr( $0, 2, 1 ) == "\"" ) {
				item[idx] = substr( $0, 3 );
				sub( /".*$/, "", item[idx] );
				sub( /^="[^"]*"/, "", $0 );
			}
			else if ( substr( $0, 2, 1 ) == "'" ) {
				item[idx] = substr( $0, 3 );
				sub( /'.*$/, "", item[idx] );
				sub( /^='[^']*'/, "", $0 );
			}
			else {
				item[idx] = $0;
				sub( /[ \n\t/]*.$/, "", item[idx] );
				sub( /^=[^ \n\t/]*/, "", $0 );
			};
		}
		else item[idx] = attrib;
		sub( /^[ \n\t]*/, "", $0 );
	};
	attrib = "";
	tag = "";
	next;
}
END {
# If mode is defined, the input stream ended without terminating an
# XML section.  Thus, the input contains invalid XML.
	if ( mode ) {
		idx += 1;
		type[idx] = "error";
		if ( mode == "cdata" ) mode = "character data";
		else if ( mode == "pi" ) mode = "processing instruction";
		item[idx] = "line " mline ": unterminated " mode;
	};
# If an open tag occured with no corresponding close tag, we have
# invalid XML.
	for ( n = sptr; n; n -= 1 ) {
		idx += 1;
		type[idx] = "error";
		item[idx] = "line " lstack[n] ": " \
				tstack[n] ": unclosed tag";
	};
}
# The following simple examples demonstrate the use of the accumulated
# data from the XML input stream.
END {
# If errors occured, generate appropriate messages and exit without
# further processing.
	if ( type[idx] == "error" ) {
		for ( n = idx; n && ( type[n] == "error" ); n -= 1 );
		for ( n += 1; n <= idx; n += 1 ) print "ERROR:", item[n];
		exit 1;
	};
## Print simplified XML.  If output completes successfully and the stack
## is not empty, close tags are generated for each tag on the stack.
#
#	in_tag = 0;
#
#	for ( n = 1; n <= idx; n += 1 ) {
#
#		if ( type[n] == "attrib" ) printf( " %s", item[n] );
#
#		else if ( type[n] == "begin" ) {
#			if ( in_tag ) printf( ">" );
#			else in_tag = 1;
#			printf( "<%s", item[n] );
#		}
#
#		else if ( type[n] == "cdata" ) {
#			if ( in_tag ) {
#				printf( ">" );
#				in_tag = 0;
#			};
#			printf( "<![CDATA[%s]]>", item[n] );
#		}
#
#		else if ( type[n] == "comment" ) {
#			if ( in_tag ) {
#				printf( ">" );
#				in_tag = 0;
#			};
#			printf( "<!--%s-->", item[n] );
#		}
#
#		else if ( type[n] == "data" ) {
#			if ( in_tag ) {
#				printf( ">" );
#				in_tag = 0;
#			};
#			printf( "%s", item[n] );
#		}
#
#		else if ( type[n] == "decl" ) {
#			if ( in_tag ) {
#				printf( ">" );
#				in_tag = 0;
#			}
#			printf( "<!%s>", item[n] );
#		}
#
#		else if ( type[n] == "end" ) {
#			if ( in_tag ) {
#				printf( "/>" );
#				in_tag = 0;
#			}
#			else printf( "</%s>", item[n] );
#		}
#
#		else if ( type[n] == "error" ) {
#			if ( in_tag ) {
#				printf( ">" );
#				in_tag = 0;
#			};
#			print "";
#			print "<!-- ERROR:", item[n], "-->";
#			break;
#		}
#
#		else if ( type[n] == "pi" ) {
#			if ( in_tag ) {
#				printf( ">" );
#				in_tag = 0;
#			};
#			printf( "<?%s?>", item[n] );
#		}
#
#		else if ( type[n] == "value" ) {
#			if ( item[n] ~ /"/ ) printf( "='%s'", item[n] );
#			else printf( "=\"%s\"", item[n] );
#		};
#	};
#
#	if ( in_tag ) printf( "\>" );
#
#	for ( n = sptr; n; n -= 1 ) printf( "</%s>", tstack[n] );
## Print an object tree, identifying tags and attributes.  Nesting is
## emphasized by indenting.
#
#	indent = "";
#	for ( n = 1; n <= idx; n += 1 ) {
#		if ( type[n] == "attrib" ) print indent "attrib", item[n];
#		else if ( type[n] == "begin" ) {
#			print indent "begin", item[n];
#			indent = indent "  ";
#		}
#		else if ( type[n] == "end" ) {
#			indent = substr( indent, 3 );
#			print indent "end", item[n];
#		}
#		else if ( type[n] == "error" ) print "ERROR:", item[n];
#		else print indent type[n];
#	};
# Print in a linear format suitable for parsing by shell scripts.
# Multi-line values have the new-lines replaced with the character
# sequence, "\n" (backslash, n) to ensure the entire name/value pair
# occurs on a single line.  All occurances of backslashes (\) in the
# original value are themselves backslash quoted.
	for ( n = 1; n <= idx; n += 1 ) {
		value = item[n];
		gsub( /\\/, "\\\\", value );
		gsub( /\n/, "\\n", value );
		print type[n], value;
	};
	for ( n = sptr; n; n -= 1 ) print "end", tstack[n];
## Print attribute values and data in a linear format suitable for
## searching (e.g. with grep).  Attributes are representd as:
##
##	[TAG/]...TAG/ATTRIB=VALUE
##
## Data is represented as:
##
##	[TAG/]...TAG: DATA
##
## Note that all tag names are displayed in upper-case.  All attribute
## names are displayed in lower-case.
##
## Multi-line values have the new-lines replaced with the character
## sequence, "\n" (backslash, n) to ensure the entire name/value pair
## occurs on a single line.  All occurances of backslashes (\) in the
## original value are themselves backslash quoted.
#
#	sptr = 0;
#	for ( n = 1; n <= idx; n += 1 ) {
#		if ( type[n] == "attrib" ) {
#			lead = stack[1];
#			for ( m = 2; m <= sptr; m += 1 ) \
#				lead = lead "/" stack[m];
#			lead = lead "/" item[n] "=";
#		}
#		else if ( type[n] == "begin" ) stack[++sptr] = item[n];
#		else if (( type[n] == "cdata" ) || ( type[n] == "data" )) {
#			lead = stack[1];
#			for ( m = 2; m <= sptr; m += 1 ) \
#				lead = lead "/" stack[m];
#			lead = lead ": ";
#		}
#		else if ( type[n] == "end" ) sptr -= 1;
#		if (( type[n] == "data" ) || ( type[n] == "value" )) {
#			value = item[n];
#			gsub( /\\/, "\\\\", value );
#			gsub( /\n/, "\\n", value );
#			print lead value;
#		};
#	};
}
# ----- cut here -----
# 
# -- 
# Steve Coile
# 
   1 files processed.