package HTML::Doctype;
use 5.008;
use strict;
use warnings;

our $VERSION = '0.02';

package HTML::Doctype::Detector;
use strict;
use warnings;

sub new
{
    my $class = shift;
    my $p = shift;
    bless { p => $p }, $class;
}

sub _type
{
    my $self = shift;
    my $type = shift;
    my $doct = shift;
    
    my $p = $self->{p};
    
    $self->{type} = $type;
    $self->{location} = $p->get_location;
    $self->{doctype} = $doct;
    
    $p->halt;
    return;
}

sub public_id { shift->{doctype}{ExternalId}{PublicId} }
sub system_id { shift->{doctype}{ExternalId}{SystemId} }

sub has_doctype { defined $_[0]->{doctype} }
sub is_xhtml    { defined $_[0]->{type} and $_[0]->{type} eq "XHTML" }

# fails if
#
#   * the first [ in the decl does not open the internal subset
#   * the internal subset contains ]>
#   * comments outside the internal subset contain >
#   * the internal subset is not closed with ]>
#
sub doctype_length
{
    my $self = shift;
    my $document = shift;
    my $doctyped = $self->{doctype};
    my $location = $self->{location};
    my $gtpos = 0;
    my $gtskip = 0;
    
    # no document type declaration
    return $gtpos unless defined $doctyped;
    
    # < of <!DOCTYPE ...
    my $ltpos = $location->{EntityOffset} - 9;

    $gtpos = index $document, ">", $ltpos;
    
    # malformed doctype, missing >
    return if $gtpos < 0;
    
    $gtskip += $doctyped->{ExternalId}{PublicId} =~ />/g
      if defined $doctyped and exists $doctyped->{ExternalId}{PublicId};
    
    $gtskip += $doctyped->{ExternalId}{SystemId} =~ />/g
      if defined $doctyped and exists $doctyped->{ExternalId}{SystemId};
    
    $gtpos = index $document, ">", $gtpos # +1 ?
      while $gtskip--;
    
    # malformed doctype, missing proper >
    return if $gtpos < 0;
    
    # extract possible doctype
    my $text = substr $document, $ltpos, $gtpos - $ltpos + 1;

    # look for ]> if suspected internal subset
    if (index($text, "[") >= 0)
    {
        my $gtpos2 = index $document, "]>", $ltpos;
        $gtpos = $gtpos2 + 1 if $gtpos2 >= 0;
    }
    
    return $gtpos - $ltpos + 1
}

sub start_dtd
{
    my $self = shift;
    my $doct = shift;
    
    # ignore specified document type declarations without
    # public or system identifier and implied document type
    # declarations (which have just a GeneratedSystemId key)
    return unless exists $doct->{ExternalId}{PublicId} or
                  exists $doct->{ExternalId}{SystemId};
    
    my $puid = $doct->{ExternalId}{PublicId};
    
    # no public identifier means HTML
    return $self->_type("HTML", $doct) unless defined $puid;
    
    # split public identifier at //
    my @comp = split(/\/\//, $puid);
    
    # malformed public identifiers mean HTML
    return $self->_type("HTML", $doct) unless @comp > 2;
    
    # we might want something different than \s and \S here
    # but it is not clear to me what exactly we should expect
    return $self->_type("HTML", $doct) unless $comp[2] =~ /^DTD\s+(\S+)/;
    
    # the first token of the public text description must include
    # the string "XHTML", see XHTML M12N section 3.1, and see also
    # http://w3.org/mid/41584c61.156809450@smtp.bjoern.hoehrmann.de
    return $self->_type("HTML", $doct) unless $1 =~ /XHTML/;
    
    # otherwise considers this document XHTML
    return $self->_type("XHTML", $doct)
}

sub start_element
{
    my $self = shift;
    my $elem = shift;
    
    # no xmlns attribute means HTML
    return $self->_type("HTML") unless exists $elem->{Attributes}{XMLNS};
    
    my $xmlns = $elem->{Attributes}{XMLNS};
    
    # this should use the corresponding helper function to deal
    # with some potential edge cases but it is not in CVS yet
    return $self->_type("HTML") unless $xmlns->{Defaulted} eq "specified";
    
    # see above
    # return $self->_type("HTML") unless "http://www.w3.org/1999/xhtml" eq
    # join '', map { $_->{Data} } @{$xmlns->{CdataChunks}};
    
    return $self->_type("XHTML")
}

1;

__END__

=pod

=head1 NAME

HTML::Doctype - HTML/XHTML/XML Doctype Operations

=head1 SYNOPSIS

  use HTML::Doctype;
  ...

=head1 DESCRIPTION

Experimental module to perform some document type declaration 
related operations. It currently depends on SGML::Parser::OpenSP
for which it provides a handler HTML::Doctype::Detector which can
be used to detect document type declarations.

  my $p = SGML::Parser::OpenSP->new;
  my $h = HTML::Doctype::Detector->new($p);
  $p->handler($h)

  # ...
  $p->parse_string("...");

  if ($h->is_xhtml)
  {
    # ...
  }

...

Future versions may offer additional functionality.

=head1 AUTHOR / COPYRIGHT / LICENSE

  Copyright (c) 2004-2008 Bjoern Hoehrmann <bjoern@hoehrmann.de>.
  This module is licensed under the same terms as Perl itself.

=cut