#
# Copyright (C) 1999 Ken MacLeod
# XML::Parser::PerlSAX is free software; you can redistribute it and/or
# modify it under the same terms as Perl itself.
#
# $Id: PerlSAX.pm,v 1.7 1999/12/22 21:15:00 kmacleod Exp $
#

use strict;

package XML::Parser::PerlSAX;

use XML::Parser;
use UNIVERSAL;
use vars qw{ $VERSION $name_re };

# will be substituted by make-rel script
$VERSION = "0.08";

# FIXME I doubt this is a correct Perl RE for productions [4] and
# [5] in the XML 1.0 specification, especially considering Unicode chars
$name_re = '[A-Za-z_:][A-Za-z0-9._:-]*';

sub new {
    my $type = shift;
    my $self = (@_ == 1) ? shift : { @_ };

    return bless $self, $type;
}

sub parse {
    my $self = shift;

    die "XML::Parser::PerlSAX: parser instance ($self) already parsing\n"
	if (defined $self->{ParseOptions});

    # If there's one arg and it has no ref, it's a string
    my $args;
    if (scalar (@_) == 1 && !ref($_[0])) {
	$args = { Source => { String => shift } };
    } else {
	$args = (scalar (@_) == 1) ? shift : { @_ };
    }

    my $parse_options = { %$self, %$args };
    $self->{ParseOptions} = $parse_options;

    # ensure that we have at least one source
    if (!defined $parse_options->{Source}
	|| !(defined $parse_options->{Source}{String}
	     || defined $parse_options->{Source}{ByteStream}
	     || defined $parse_options->{Source}{SystemId})) {
	die "XML::Parser::PerlSAX: no source defined for parse\n";
    }

    # assign default Handler to any undefined handlers
    if (defined $parse_options->{Handler}) {
	$parse_options->{DocumentHandler} = $parse_options->{Handler}
	    if (!defined $parse_options->{DocumentHandler});
	$parse_options->{DTDHandler} = $parse_options->{Handler}
	    if (!defined $parse_options->{DTDHandler});
	$parse_options->{EntityResolver} = $parse_options->{Handler}
	    if (!defined $parse_options->{EntityResolver});
    }

    my @handlers;
    if (defined $parse_options->{DocumentHandler}) {
	# cache DocumentHandler in self for callbacks
	$self->{DocumentHandler} = $parse_options->{DocumentHandler};

	my $doc_h = $parse_options->{DocumentHandler};

	push (@handlers, Init => sub { $self->_handle_init(@_) } )
	    if (UNIVERSAL::can($doc_h, 'start_document'));
	push (@handlers, Final => sub { $self->_handle_final(@_) } )
	    if (UNIVERSAL::can($doc_h, 'end_document'));
	push (@handlers, Start => sub { $self->_handle_start(@_) } )
	    if (UNIVERSAL::can($doc_h, 'start_element'));
	push (@handlers, End => sub { $self->_handle_end(@_) } )
	    if (UNIVERSAL::can($doc_h, 'end_element'));
	push (@handlers, Char => sub { $self->_handle_char(@_) } )
	    if (UNIVERSAL::can($doc_h, 'characters'));
	push (@handlers, Proc => sub { $self->_handle_proc(@_) } )
	    if (UNIVERSAL::can($doc_h, 'processing_instruction'));
	push (@handlers, Comment => sub { $self->_handle_comment(@_) } )
	    if (UNIVERSAL::can($doc_h, 'comment'));
	push (@handlers, CdataStart => sub { $self->_handle_cdatastart(@_) } )
	    if (UNIVERSAL::can($doc_h, 'start_cdata'));
	push (@handlers, CdataEnd => sub { $self->_handle_cdataend(@_) } )
	    if (UNIVERSAL::can($doc_h, 'end_cdata'));
	if (UNIVERSAL::can($doc_h, 'entity_reference')) {
	    push (@handlers, Default => sub { $self->_handle_default(@_) } );
	    $self->{UseEntRefs} = 1;
	}
    }

    if (defined $parse_options->{DTDHandler}) {
	# cache DTDHandler in self for callbacks
	$self->{DTDHandler} = $parse_options->{DTDHandler};

	my $dtd_h = $parse_options->{DTDHandler};

	push (@handlers, Notation => sub { $self->_handle_notation(@_) } )
	    if (UNIVERSAL::can($dtd_h, 'notation_decl'));
	push (@handlers, Unparsed => sub { $self->_handle_unparsed(@_) } )
	    if (UNIVERSAL::can($dtd_h, 'unparsed_entity_decl'));
	push (@handlers, Entity => sub { $self->_handle_entity(@_) } )
	    if ($self->{UseEntRefs}
		|| UNIVERSAL::can($dtd_h, 'entity_decl'));
	push (@handlers, Element => sub { $self->_handle_element(@_) } )
	    if (UNIVERSAL::can($dtd_h, 'element_decl'));
	push (@handlers, Attlist => sub { $self->_handle_attlist(@_) } )
	    if (UNIVERSAL::can($dtd_h, 'attlist_decl'));
	push (@handlers, Doctype => sub { $self->_handle_doctype(@_) } )
	    if (UNIVERSAL::can($dtd_h, 'doctype_decl'));
	push (@handlers, XMLDecl => sub { $self->_handle_xmldecl(@_) } )
	    if (UNIVERSAL::can($dtd_h, 'xml_decl'));
    }

    
    if (defined $parse_options->{EntityResolver}) {
	# cache EntityResolver in self for callbacks
	$self->{EntityResolver} = $parse_options->{EntityResolver};

	my $er = $parse_options->{EntityResolver};

	push (@handlers, ExternEnt => sub { $self->_handle_extern_ent(@_) } )
	    if (UNIVERSAL::can($er, 'resolve_entity'));
    }

    my @xml_parser_options;
    if ($self->{UseEntRefs}) {
	@xml_parser_options = ( NoExpand => 1,
				Handlers => { @handlers } );
    } else {
	@xml_parser_options = ( Handlers => { @handlers } );
    }

    push (@xml_parser_options,
	  ProtocolEncoding => $self->{ParseOptions}{Source}{Encoding})
	if (defined $self->{ParseOptions}{Source}{Encoding});

    my $parser = new XML::Parser(@xml_parser_options);
    my $result;

    if (defined $self->{ParseOptions}{Source}{ByteStream}) {
	$result = $parser->parse($self->{ParseOptions}{Source}{ByteStream});
    } elsif (defined $self->{ParseOptions}{Source}{String}) {
	$result = $parser->parse($self->{ParseOptions}{Source}{String});
    } elsif (defined $self->{ParseOptions}{Source}{SystemId}) {
	$result = $parser->parsefile($self->{ParseOptions}{Source}{SystemId});
    }

    # clean up parser instance
    delete $self->{ParseOptions};
    delete $self->{DocumentHandler};
    delete $self->{DTDHandler};
    delete $self->{EntityResolver};
    delete $self->{Expat};

    return $result;
}

sub location {
    my $self = shift;

    my $expat = $self->{Expat};

    my @properties = ( ColumnNumber => $expat->current_column,
		       LineNumber => $expat->current_line,
		       BytePosition => $expat->current_byte,
		       Base => $expat->base );

    # FIXME these locations change while parsing external entities
    push (@properties, PublicId => $self->{Source}{PublicId})
	if (defined $self->{Source}{PublicId});
    push (@properties, SystemId => $self->{Source}{SystemId})
	if (defined $self->{Source}{SystemId});

    return { @properties };
}

###
### DocumentHandler methods
###

sub _handle_init {
    my $self = shift;
    my $expat = shift;

    $self->{Expat} = $expat;

    if ($self->{DocumentHandler}->can('set_document_locator')) {
	$self->{DocumentHandler}->set_document_locator( { Locator => $self } );
    }
    $self->{DocumentHandler}->start_document( { } );
}

sub _handle_final {
    my $self = shift;

    delete $self->{UseEntRefs};
    delete $self->{EntRefs};
    return $self->{DocumentHandler}->end_document( { } );
}

sub _handle_start {
    my $self = shift;
    my $expat = shift;
    my $element = shift;

    my @properties;
    if ($self->{ParseOptions}{UseAttributeOrder}) {
	# Capture order and defined() status for attributes
	my $ii;

	my $order = [];
	for ($ii = 0; $ii < $#_; $ii += 2) {
	    push @$order, $_[$ii];
	}

	push @properties, 'AttributeOrder', $order;

	# Divide by two because XML::Parser counts both attribute name
	# and value within it's index
	push @properties, 'Defaulted', ($expat->specified_attr() / 2);
    }

    $self->{DocumentHandler}->start_element( { Name => $element,
					       Attributes => { @_ },
					       @properties } );
}

sub _handle_end {
    my $self = shift;
    my $expat = shift;
    my $element = shift;

    $self->{DocumentHandler}->end_element( { Name => $element } );
}

sub _handle_char {
    my $self = shift;
    my $expat = shift;
    my $string = shift;

    $self->{DocumentHandler}->characters( { Data => $string } );
}

sub _handle_proc {
    my $self = shift;
    my $expat = shift;
    my $target = shift;
    my $data = shift;

    $self->{DocumentHandler}->processing_instruction( { Target => $target,
							Data => $data } );
}

sub _handle_comment {
    my $self = shift;
    my $expat = shift;
    my $data = shift;

    $self->{DocumentHandler}->comment( { Data => $data } );
}

sub _handle_cdatastart {
    my $self = shift;
    my $expat = shift;

    $self->{DocumentHandler}->start_cdata( { } );
}

sub _handle_cdataend {
    my $self = shift;
    my $expat = shift;

    $self->{DocumentHandler}->end_cdata( { } );
}

# Default receives all characters that aren't handled by some other
# handler, this means a lot of stuff goes through here.  All we're
# looking for are `&NAME;' entity reference sequences
sub _handle_default {
    my $self = shift;
    my $expat = shift;
    my $string = shift;

    if ($string =~ /^&($name_re);$/) {
	my $ent_ref = $self->{EntRefs}{$1};
	if (!defined $ent_ref) {
	    $ent_ref = { Name => $1 };
	}
	$self->{DocumentHandler}->entity_reference($ent_ref);
    }
}

###
### DTDHandler methods
###

sub _handle_notation {
    my $self = shift;
    my $expat = shift;
    my $notation = shift;
    my $base = shift;
    my $sysid = shift;
    my $pubid = shift;
    my @properties = (Name => $notation);

    push (@properties, Base => $base)
	if (defined $base);
    push (@properties, SystemId => $sysid)
	if (defined $sysid);
    push (@properties, PublicId => $pubid)
	if (defined $pubid);


    $self->{DTDHandler}->notation_decl( { @properties } );
}

sub _handle_unparsed {
    my $self = shift;
    my $expat = shift;
    my $entity = shift;
    my $base = shift;
    my $sysid = shift;
    my $pubid = shift;
    my @properties = (Name => $entity, SystemId => $sysid);

    push (@properties, Base => $base)
	if (defined $base);
    push (@properties, PublicId => $pubid)
	if (defined $pubid);

    $self->{DTDHandler}->unparsed_entity_decl( { @properties } );
}

sub _handle_entity {
    my $self = shift;
    my $expat = shift;
    my $name = shift;
    my $val = shift;
    my $sysid = shift;
    my $pubid = shift;
    my $ndata = shift;
    my @properties = (Name => $name);

    push (@properties, Value => $val)
	if (defined $val);
    push (@properties, PublicId => $pubid)
	if (defined $pubid);
    push (@properties, SystemId => $sysid)
	if (defined $sysid);
    push (@properties, Notation => $ndata)
	if (defined $ndata);

    my $properties = { @properties };
    if ($self->{UseEntRefs}) {
	$self->{EntRefs}{$name} = $properties;
    }
    if ($self->{DTDHandler}->can('entity_decl')) {
	$self->{DTDHandler}->entity_decl( $properties );
    }
}

sub _handle_element {
    my $self = shift;
    my $expat = shift;
    my $name = shift;
    my $model = shift;

    $self->{DTDHandler}->element_decl( { Name => $name,
					 Model => $model } );
}

sub _handle_attlist {
    my $self = shift;
    my $expat = shift;
    my $elname = shift;
    my $attname = shift;
    my $type = shift;
    my $default = shift;
    my $fixed = shift;

    $self->{DTDHandler}->attlist_decl( { ElementName => $elname,
					 AttributeName => $attname,
					 Type => $type,
					 Default => $default,
					 Fixed => $fixed } );
}

sub _handle_doctype {
    my $self = shift;
    my $expat = shift;
    my $name = shift;
    my $sysid = shift;
    my $pubid = shift;
    my $internal = shift;
    my @properties = (Name => $name);

    push (@properties, SystemId => $sysid)
	if (defined $sysid);
    push (@properties, PublicId => $pubid)
	if (defined $pubid);
    push (@properties, Internal => $internal)
	if (defined $internal);

    $self->{DTDHandler}->doctype_decl( { @properties } );
}

sub _handle_xmldecl {
    my $self = shift;
    my $expat = shift;
    my $version = shift;
    my $encoding = shift;
    my $standalone = shift;
    my @properties = (Version => $version);

    push (@properties, Encoding => $encoding)
	if (defined $encoding);
    push (@properties, Standalone => $standalone)
	if (defined $standalone);

    $self->{DTDHandler}->xml_decl( { @properties } );
}

###
### EntityResolver methods
###

sub _handle_extern_ent {
    my $self = shift;
    my $expat = shift;
    my $base = shift;
    my $sysid = shift;
    my $pubid = shift;
    my @properties = (SystemId => $sysid);

    push (@properties, Base => $base)
	if (defined $base);
    push (@properties, PublicId => $pubid)
	if (defined $pubid);

    my $result = $self->{EntityResolver}->resolve_entity( { @properties } );

    if (UNIVERSAL::isa($result, 'HASH')) {
	if ($result->{ByteStream}) {
	    return $result->{ByteStream};
	} elsif ($result->{String}) {
	    return $result->{String};
	} elsif ($result->{SystemId}) {
	    # FIXME must be able to resolve SystemIds, XML::Parser's
	    # default can :-(
	    die "PerlSAX: automatic opening of SystemIds from \`resolve_entity' not implemented, contact the author\n";
	} else {
	    # FIXME
	    die "PerlSAX: invalid source returned from \`resolve_entity'\n";
	}
    }

    return undef;
}

1;

__END__

=head1 NAME

XML::Parser::PerlSAX - Perl SAX parser using XML::Parser

=head1 SYNOPSIS

 use XML::Parser::PerlSAX;

 $parser = XML::Parser::PerlSAX->new( [OPTIONS] );
 $result = $parser->parse( [OPTIONS] );

 $result = $parser->parse($string);

=head1 DESCRIPTION

C<XML::Parser::PerlSAX> is a PerlSAX parser using the XML::Parser
module.  This man page summarizes the specific options, handlers, and
properties supported by C<XML::Parser::PerlSAX>; please refer to the
PerlSAX standard in `C<PerlSAX.pod>' for general usage information.

=head1 METHODS

=over 4

=item new

Creates a new parser object.  Default options for parsing, described
below, are passed as key-value pairs or as a single hash.  Options may
be changed directly in the parser object unless stated otherwise.
Options passed to `C<parse()>' override the default options in the
parser object for the duration of the parse.

=item parse

Parses a document.  Options, described below, are passed as key-value
pairs or as a single hash.  Options passed to `C<parse()>' override
default options in the parser object.

=item location

Returns the location as a hash:

  ColumnNumber    The column number of the parse.
  LineNumber      The line number of the parse.
  BytePosition    The current byte position of the parse.
  PublicId        A string containing the public identifier, or undef
                  if none is available.
  SystemId        A string containing the system identifier, or undef
                  if none is available.
  Base            The current value of the base for resolving relative
                  URIs.

ALPHA WARNING: The `C<SystemId>' and `C<PublicId>' properties returned
are the system and public identifiers of the document passed to
`C<parse()>', not the identifiers of the currently parsing external
entity.  The column, line, and byte positions I<are> of the current
entity being parsed.

=head1 OPTIONS

The following options are supported by C<XML::Parser::PerlSAX>:

 Handler          default handler to receive events
 DocumentHandler  handler to receive document events
 DTDHandler       handler to receive DTD events
 ErrorHandler     handler to receive error events
 EntityResolver   handler to resolve entities
 Locale           locale to provide localisation for errors
 Source           hash containing the input source for parsing
 UseAttributeOrder set to true to provide AttributeOrder and Defaulted
                   properties in `start_element()'

If no handlers are provided then all events will be silently ignored,
except for `C<fatal_error()>' which will cause a `C<die()>' to be
called after calling `C<end_document()>'.

If a single string argument is passed to the `C<parse()>' method, it
is treated as if a `C<Source>' option was given with a `C<String>'
parameter.

The `C<Source>' hash may contain the following parameters:

 ByteStream       The raw byte stream (file handle) containing the
                  document.
 String           A string containing the document.
 SystemId         The system identifier (URI) of the document.
 PublicId         The public identifier.
 Encoding         A string describing the character encoding.

If more than one of `C<ByteStream>', `C<String>', or `C<SystemId>',
then preference is given first to `C<ByteStream>', then `C<String>',
then `C<SystemId>'.

=head1 HANDLERS

The following handlers and properties are supported by
C<XML::Parser::PerlSAX>:

=head2 DocumentHandler methods

=over 4

=item start_document

Receive notification of the beginning of a document.

No properties defined.

=item end_document

Receive notification of the end of a document.

No properties defined.

=item start_element

Receive notification of the beginning of an element.

 Name             The element type name.
 Attributes       A hash containing the attributes attached to the
                  element, if any.

The `C<Attributes>' hash contains only string values.

If the `C<UseAttributeOrder>' parser option is true, the following
properties are also passed to `C<start_element>':

 AttributeOrder   An array of attribute names in the order they were
                  specified, followed by the defaulted attribute
                  names.
 Defaulted        The index number of the first defaulted attribute in
                  `AttributeOrder.  If this index is equal to the
                  length of `AttributeOrder', there were no defaulted
                  values.

Note to C<XML::Parser> users:  `C<Defaulted>' will be half the value of
C<XML::Parser::Expat>'s `C<specified_attr()>' function because only
attribute names are provided, not their values.


=item end_element

Receive notification of the end of an element.

 Name             The element type name.

=item characters

Receive notification of character data.

 Data             The characters from the XML document.

=item processing_instruction

Receive notification of a processing instruction. 

 Target           The processing instruction target. 
 Data             The processing instruction data, if any.

=item comment

Receive notification of a comment.

 Data             The comment data, if any.

=item start_cdata

Receive notification of the start of a CDATA section.

No properties defined.

=item end_cdata

Receive notification of the end of a CDATA section.

No properties defined.

=item entity_reference

Receive notification of an internal entity reference.  If this handler
is defined, internal entities will not be expanded and not passed to
the `C<characters()>' handler.  If this handler is not defined,
internal entities will be expanded if possible and passed to the
`C<characters()>' handler.

 Name             The entity reference name
 Value            The entity reference value

=back

=head2 DTDHandler methods

=over 4

=item notation_decl

Receive notification of a notation declaration event.

 Name             The notation name.
 PublicId         The notation's public identifier, if any.
 SystemId         The notation's system identifier, if any.
 Base             The base for resolving a relative URI, if any.

=item unparsed_entity_decl

Receive notification of an unparsed entity declaration event.

 Name             The unparsed entity's name.
 SystemId         The entity's system identifier.
 PublicId         The entity's public identifier, if any.
 Base             The base for resolving a relative URI, if any.

=item entity_decl

Receive notification of an entity declaration event.

 Name             The entity name.
 Value            The entity value, if any.
 PublicId         The notation's public identifier, if any.
 SystemId         The notation's system identifier, if any.
 Notation         The notation declared for this entity, if any.

For internal entities, the `C<Value>' parameter will contain the value
and the `C<PublicId>', `C<SystemId>', and `C<Notation>' will be
undefined.  For external entities, the `C<Value>' parameter will be
undefined, the `C<SystemId>' parameter will have the system id, the
`C<PublicId>' parameter will have the public id if it was provided (it
will be undefined otherwise), the `C<Notation>' parameter will contain
the notation name for unparsed entities.  If this is a parameter entity
declaration, then a '%' will be prefixed to the entity name.

Note that `C<entity_decl()>' and `C<unparsed_entity_decl()>' overlap.
If both methods are implemented by a handler, then this handler will
not be called for unparsed entities.

=item element_decl

Receive notification of an element declaration event.

 Name             The element type name.
 Model            The content model as a string.

=item attlist_decl

Receive notification of an attribute list declaration event.

This handler is called for each attribute in an ATTLIST declaration
found in the internal subset. So an ATTLIST declaration that has
multiple attributes will generate multiple calls to this handler.

 ElementName      The element type name.
 AttributeName    The attribute name.
 Type             The attribute type.
 Fixed            True if this is a fixed attribute.

The default for `C<Type>' is the default value, which will either be
"#REQUIRED", "#IMPLIED" or a quoted string (i.e. the returned string
will begin and end with a quote character).

=item doctype_decl

Receive notification of a DOCTYPE declaration event.

 Name             The document type name.
 SystemId         The document's system identifier.
 PublicId         The document's public identifier, if any.
 Internal         The internal subset as a string, if any.

Internal will contain all whitespace, comments, processing
instructions, and declarations seen in the internal subset. The
declarations will be there whether or not they have been processed by
another handler (except for unparsed entities processed by the
Unparsed handler).  However, comments and processing instructions will
not appear if they've been processed by their respective handlers.

=item xml_decl

Receive notification of an XML declaration event.

 Version          The version.
 Encoding         The encoding string, if any.
 Standalone       True, false, or undefined if not declared.

=back

=head2 EntityResolver

=over 4

=item resolve_entity

Allow the handler to resolve external entities.

 Name             The notation name.
 SystemId         The notation's system identifier.
 PublicId         The notation's public identifier, if any.
 Base             The base for resolving a relative URI, if any.

`C<resolve_entity()>' should return undef to request that the parser
open a regular URI connection to the system identifier or a hash
describing the new input source.  This hash has the same properties as
the `C<Source>' parameter to `C<parse()>':

  PublicId    The public identifier of the external entity being
              referenced, or undef if none was supplied. 
  SystemId    The system identifier of the external entity being
              referenced.
  String      String containing XML text
  ByteStream  An open file handle.
  CharacterStream
              An open file handle.
  Encoding    The character encoding, if known.

=back

=head1 AUTHOR

Ken MacLeod, ken@bitsko.slc.ut.us

=head1 SEE ALSO

perl(1), PerlSAX.pod(3)

 Extensible Markup Language (XML) <http://www.w3c.org/XML/>
 SAX 1.0: The Simple API for XML <http://www.megginson.com/SAX/>

=cut