# Copyright 2022 Jeffrey Kegler # This file is part of Marpa::R2. Marpa::R2 is free software: you can # redistribute it and/or modify it under the terms of the GNU Lesser # General Public License as published by the Free Software Foundation, # either version 3 of the License, or (at your option) any later version. # # Marpa::R2 is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser # General Public License along with Marpa::R2. If not, see # http://www.gnu.org/licenses/. package Marpa::R2::HTML; use 5.010001; use strict; use warnings; use vars qw( $VERSION $STRING_VERSION ); $VERSION = '12.000000'; $STRING_VERSION = $VERSION; ## no critic (BuiltinFunctions::ProhibitStringyEval) $VERSION = eval $VERSION; ## use critic our @EXPORT_OK; use base qw(Exporter); BEGIN { @EXPORT_OK = qw(html); } package Marpa::R2::HTML::Internal; # Data::Dumper is used in tracing use Data::Dumper; use Marpa::R2::HTML::Internal; use Marpa::R2::HTML::Config; use Carp (); use HTML::Parser 3.69; use HTML::Entities qw(decode_entities); # versions below must be coordinated with # those required in Build.PL use English qw( -no_match_vars ); use Marpa::R2; { my $submodule_version = $Marpa::R2::VERSION; die 'Marpa::R2::VERSION not defined' if not defined $submodule_version; die "Marpa::R2::VERSION ($submodule_version) does not match Marpa::R2::HTML::VERSION ", $Marpa::R2::HTML::VERSION if $submodule_version != $Marpa::R2::HTML::VERSION; } use Marpa::R2::Thin::Trace; # constants use constant PHYSICAL_TOKEN => 42; use constant RUBY_SLIPPERS_TOKEN => 43; our @LIBMARPA_ERROR_NAMES = Marpa::R2::Thin::error_names(); our $UNEXPECTED_TOKEN_ID; our $NO_MARPA_ERROR; ERROR: for my $error_number ( 0 .. $#LIBMARPA_ERROR_NAMES ) { my $error_name = $LIBMARPA_ERROR_NAMES[$error_number]; if ( $error_name eq 'MARPA_ERR_UNEXPECTED_TOKEN_ID' ) { $UNEXPECTED_TOKEN_ID = $error_number; next ERROR; } if ( $error_name eq 'MARPA_ERR_NONE' ) { $NO_MARPA_ERROR = $error_number; next ERROR; } } ## end ERROR: for my $error_number ( 0 .. $#LIBMARPA_ERROR_NAMES ) use Marpa::R2::HTML::Callback; { my $submodule_version = $Marpa::R2::HTML::Callback::VERSION; die 'Marpa::R2::HTML::Callback::VERSION not defined' if not defined $submodule_version; die "Marpa::R2::HTML::Callback::VERSION ($submodule_version) does not match Marpa::R2::HTML::VERSION ", $Marpa::R2::HTML::VERSION if $submodule_version != $Marpa::R2::HTML::VERSION; } sub earleme_to_linecol { my ( $self, $earleme ) = @_; my $html_parser_tokens = $self->{tokens}; my $html_token_ix = $self->{earleme_to_html_token_ix}->[$earleme] + 1; die if not defined $html_token_ix; return @{ $html_parser_tokens->[$html_token_ix] }[ Marpa::R2::HTML::Internal::Token::LINE, Marpa::R2::HTML::Internal::Token::COLUMN, ]; } ## end sub earleme_to_linecol sub earleme_to_offset { my ( $self, $earleme ) = @_; my $html_parser_tokens = $self->{tokens}; my $html_token_ix = $self->{earleme_to_html_token_ix}->[$earleme] + 1; die if not defined $html_token_ix; return $html_parser_tokens->[$html_token_ix] ->[Marpa::R2::HTML::Internal::Token::END_OFFSET]; } ## end sub earleme_to_offset sub add_handler { my ( $self, $handler_description ) = @_; my $ref_type = ref $handler_description || 'not a reference'; Marpa::R2::exception( "Long form handler description should be ref to hash, but it is $ref_type" ) if $ref_type ne 'HASH'; my $element = delete $handler_description->{element}; my $class = delete $handler_description->{class}; my $pseudoclass = delete $handler_description->{pseudoclass}; my $action = delete $handler_description->{action}; Marpa::R2::exception( 'Unknown option(s) in Long form handler description: ', ( join q{ }, keys %{$handler_description} ) ) if scalar keys %{$handler_description}; Marpa::R2::exception('Handler action must be CODE ref') if ref $action ne 'CODE'; if ( defined $pseudoclass ) { $self->{handler_by_species}->{$pseudoclass} = $action; return 1; } $element = q{*} if not $element; $element = lc $element; $class //= q{*}; $self->{handler_by_element_and_class}->{ join q{;}, $element, $class } = $action; return 1; } ## end sub add_handler sub add_handlers_from_hashes { my ( $self, $handler_specs ) = @_; my $ref_type = ref $handler_specs || 'not a reference'; Marpa::R2::exception( "handlers arg must must be ref to ARRAY, it is $ref_type") if $ref_type ne 'ARRAY'; for my $handler_spec ( keys %{$handler_specs} ) { add_handler( $self, $handler_spec ); } return 1; } ## end sub add_handlers_from_hashes sub add_handlers { my ( $self, $handler_specs ) = @_; HANDLER_SPEC: for my $specifier ( keys %{$handler_specs} ) { my ( $element, $class, $pseudoclass ); my $action = $handler_specs->{$specifier}; ( $element, $class ) = ( $specifier =~ /\A ([^.]*) [.] (.*) \z/oxms ) or ( $element, $pseudoclass ) = ( $specifier =~ /\A ([^:]*) [:] (.*) \z/oxms ) or $element = $specifier; state $allowed_pseudoclasses = { map { ( $_, 1 ) } qw(TOP PI DECL COMMENT PROLOG TRAILER WHITESPACE CDATA PCDATA CRUFT) }; if ( $pseudoclass and not exists $allowed_pseudoclasses->{$pseudoclass} ) { Marpa::R2::exception( qq{pseudoclass "$pseudoclass" is not known:\n}, "Specifier was $specifier\n" ); } ## end if ( $pseudoclass and not exists $allowed_pseudoclasses...) if ( $pseudoclass and $element ) { Marpa::R2::exception( qq{pseudoclass "$pseudoclass" may not have an element specified:\n}, "Specifier was $specifier\n" ); } ## end if ( $pseudoclass and $element ) add_handler( $self, { element => $element, class => $class, pseudoclass => $pseudoclass, action => $action } ); } ## end HANDLER_SPEC: for my $specifier ( keys %{$handler_specs} ) return 1; } ## end sub add_handlers # If we factor this package, this will be the constructor. ## no critic (Subroutines::RequireArgUnpacking) sub create { ## use critic my $self = {}; $self->{trace_fh} = \*STDERR; ARG: for my $arg (@_) { my $ref_type = ref $arg || 'not a reference'; if ( $ref_type eq 'HASH' ) { Marpa::R2::HTML::Internal::add_handlers( $self, $arg ); next ARG; } Marpa::R2::exception( "Argument must be hash or refs to hash: it is $ref_type") if $ref_type ne 'REF'; my $option_hash = ${$arg}; $ref_type = ref $option_hash || 'not a reference'; Marpa::R2::exception( "Argument must be hash or refs to hash: it is ref to $ref_type") if $ref_type ne 'HASH'; OPTION: for my $option ( keys %{$option_hash} ) { if ( $option eq 'handlers' ) { add_handlers_from_hashes( $self, $option_hash->{$option} ); } state $allowed_options = { map { ( $_, 1 ) } qw(trace_fh trace_values trace_handlers trace_conflicts trace_terminals trace_cruft dump_AHFA dump_config compile ) }; if ( not exists $allowed_options->{$option} ) { Marpa::R2::exception("unknown option: $option"); } $self->{$option} = $option_hash->{$option}; } ## end OPTION: for my $option ( keys %{$option_hash} ) } ## end ARG: for my $arg (@_) my $source_ref = $self->{compile}; if ( defined $source_ref ) { ref $source_ref eq 'SCALAR' or Marpa::R2::exception( qq{value of "compile" option must be a SCALAR}); $self->{config} = Marpa::R2::HTML::Config->new_from_compile($source_ref); } ## end if ( defined $source_ref ) else { $self->{config} = Marpa::R2::HTML::Config->new(); } return $self; } ## end sub create sub handler_find { my ( $self, $rule_id, $class ) = @_; my $trace_handlers = $self->{trace_handlers}; my $handler; $class //= q{*}; my $action = $self->{action_by_rule_id}->[$rule_id]; FIND_HANDLER: { last FIND_HANDLER if not defined $action; if ( index( $action, 'SPE_' ) == 0 ) { my $species = substr $action, 4; $handler = $self->{handler_by_species}->{$species}; say {*STDERR} qq{Rule $rule_id: Found handler by species: "$species"} or Carp::croak("Cannot print: $ERRNO") if $trace_handlers and defined $handler; last FIND_HANDLER; } ## end if ( index( $action, 'SPE_' ) == 0 ) ## At this point action always is defined ## and starts with 'ELE_' my $element = substr $action, 4; my @handler_keys = ( ( join q{;}, $element, $class ), ( join q{;}, q{*}, $class ), ( join q{;}, $element, q{*} ), ( join q{;}, q{*}, q{*} ), ); ($handler) = grep {defined} @{ $self->{handler_by_element_and_class} }{@handler_keys}; say {*STDERR} qq{Rule $rule_id: Found handler by action and class: "}, ( grep { defined $self->{handler_by_element_and_class}->{$_} } @handler_keys )[0], q{"} or Carp::croak("Cannot print: $ERRNO") if $trace_handlers and defined $handler; } ## end FIND_HANDLER: return $handler if defined $handler; say {*STDERR} qq{Rule $rule_id: Using default handler for action "}, ( $action // q{*} ), qq{" and class: "$class"} or Carp::croak("Cannot print: $ERRNO") if $trace_handlers; return 'default_handler'; } ## end sub handler_find # "Original" value of a token range -- that is, the corresponding # text of the original document, unchanged. # Returned as a reference, because it may be very long sub token_range_to_original { my ( $self, $first_token_ix, $last_token_ix ) = @_; return \q{} if not defined $first_token_ix; my $document = $self->{document}; my $tokens = $self->{tokens}; my $start_offset = $tokens->[$first_token_ix] ->[Marpa::R2::HTML::Internal::Token::START_OFFSET]; my $end_offset = $tokens->[$last_token_ix] ->[Marpa::R2::HTML::Internal::Token::END_OFFSET]; my $original = substr ${$document}, $start_offset, ( $end_offset - $start_offset ); return \$original; } ## end sub token_range_to_original # "Original" value of token -- that is, the corresponding # text of the original document, unchanged. # The empty string if there is no such text. # Returned as a reference, because it may be very long sub tdesc_item_to_original { my ( $self, $tdesc_item ) = @_; my $text = q{}; my $document = $self->{document}; my $tokens = $self->{tokens}; my $tdesc_item_type = $tdesc_item->[0]; return q{} if not defined $tdesc_item_type; if ( $tdesc_item_type eq 'PHYSICAL_TOKEN' ) { return token_range_to_original( $self, $tdesc_item->[Marpa::R2::HTML::Internal::TDesc::START_TOKEN], $tdesc_item->[Marpa::R2::HTML::Internal::TDesc::END_TOKEN], ); } ## end if ( $tdesc_item_type eq 'PHYSICAL_TOKEN' ) if ( $tdesc_item_type eq 'VALUED_SPAN' ) { return token_range_to_original( $self, $tdesc_item->[Marpa::R2::HTML::Internal::TDesc::START_TOKEN], $tdesc_item->[Marpa::R2::HTML::Internal::TDesc::END_TOKEN], ); } ## end if ( $tdesc_item_type eq 'VALUED_SPAN' ) return q{}; } ## end sub tdesc_item_to_original # Given a token range and a tdesc list, # return a reference to the literal value. sub range_and_values_to_literal { my ( $self, $next_token_ix, $final_token_ix, $tdesc_list ) = @_; my @flat_tdesc_list = (); TDESC_ITEM: for my $tdesc_item ( @{$tdesc_list} ) { my $type = $tdesc_item->[0]; next TDESC_ITEM if not defined $type; next TDESC_ITEM if $type eq 'ZERO_SPAN'; next TDESC_ITEM if $type eq 'RUBY_SLIPPERS_TOKEN'; if ( $type eq 'VALUES' ) { push @flat_tdesc_list, @{ $tdesc_item->[Marpa::R2::HTML::Internal::TDesc::VALUE] }; next TDESC_ITEM; } push @flat_tdesc_list, $tdesc_item; } ## end TDESC_ITEM: for my $tdesc_item ( @{$tdesc_list} ) my @literal_pieces = (); TDESC_ITEM: for my $tdesc_item (@flat_tdesc_list) { my ( $tdesc_item_type, $next_explicit_token_ix, $furthest_explicit_token_ix ) = @{$tdesc_item}; if ( not defined $next_explicit_token_ix ) { ## An element can contain no HTML tokens -- it may contain ## only Ruby Slippers tokens. ## Treat this as a special case. if ( $tdesc_item_type eq 'VALUED_SPAN' ) { my $value = $tdesc_item->[Marpa::R2::HTML::Internal::TDesc::VALUE] // q{}; push @literal_pieces, \( q{} . $value ); } ## end if ( $tdesc_item_type eq 'VALUED_SPAN' ) next TDESC_ITEM; } ## end if ( not defined $next_explicit_token_ix ) push @literal_pieces, token_range_to_original( $self, $next_token_ix, $next_explicit_token_ix - 1 ) if $next_token_ix < $next_explicit_token_ix; if ( $tdesc_item_type eq 'VALUED_SPAN' ) { my $value = $tdesc_item->[Marpa::R2::HTML::Internal::TDesc::VALUE]; if ( defined $value ) { push @literal_pieces, \( q{} . $value ); $next_token_ix = $furthest_explicit_token_ix + 1; next TDESC_ITEM; } ## FALL THROUGH } ## end if ( $tdesc_item_type eq 'VALUED_SPAN' ) push @literal_pieces, token_range_to_original( $self, $next_explicit_token_ix, $furthest_explicit_token_ix ) if $next_explicit_token_ix <= $furthest_explicit_token_ix; $next_token_ix = $furthest_explicit_token_ix + 1; } ## end TDESC_ITEM: for my $tdesc_item (@flat_tdesc_list) return \( join q{}, map { ${$_} } @literal_pieces ); } ## end sub range_and_values_to_literal sub symbol_names_by_rule_id { my ( $self, $rule_id ) = @_; my $tracer = $self->{tracer}; my $grammar = $tracer->grammar(); my $rule_length = $grammar->rule_length($rule_id); return if not defined $rule_length; my @symbol_ids = ( $grammar->rule_lhs($rule_id) ); push @symbol_ids, map { $grammar->rule_rhs( $rule_id, $_ ) } ( 0 .. $rule_length - 1 ); return map { $tracer->symbol_name($_) } @symbol_ids; } ## end sub symbol_names_by_rule_id sub parse { my ( $self, $document_ref ) = @_; my %tags = (); Marpa::R2::exception( "parse() already run on this object\n", 'For a new parse, create a new object' ) if $self->{document}; my $trace_cruft = $self->{trace_cruft}; my $trace_terminals = $self->{trace_terminals} // 0; my $trace_conflicts = $self->{trace_conflicts}; my $trace_handlers = $self->{trace_handlers}; my $trace_values = $self->{trace_values}; my $trace_fh = $self->{trace_fh}; my $ref_type = ref $document_ref; Marpa::R2::exception('Arg to parse() must be ref to string') if not $ref_type or $ref_type ne 'SCALAR' or not defined ${$document_ref}; my $document = $self->{document} = $document_ref; my ($core_rules, $runtime_tag, $rank_by_name, $is_empty_element, $primary_group_by_tag ) = $self->{config}->contents(); $self->{is_empty_element} = $is_empty_element; if ($self->{dump_config}) { return $self->{config}->as_string(); } my @action_by_rule_id = (); $self->{action_by_rule_id} = \@action_by_rule_id; my $thin_grammar = Marpa::R2::Thin::G->new( { if => 1 } ); my $tracer = Marpa::R2::Thin::Trace->new($thin_grammar); $self->{tracer} = $tracer; RULE: for my $rule ( @{$core_rules} ) { my $lhs = $rule->{lhs}; my $rhs = $rule->{rhs}; my $min = $rule->{min}; my $action = $rule->{action}; my @symbol_ids = (); for my $symbol_name ( $lhs, @{$rhs} ) { push @symbol_ids, $tracer->symbol_by_name($symbol_name) // $tracer->symbol_new($symbol_name); } my ($lhs_id, @rhs_ids) = @symbol_ids; my $rule_id; if ( defined $min ) { $rule_id = $thin_grammar->sequence_new( $lhs_id, $rhs_ids[0], { min => $min } ); } else { $rule_id = $thin_grammar->rule_new( $lhs_id, \@rhs_ids ); } $action_by_rule_id[$rule_id] = $action; } ## end RULE: for my $rule ( @{$core_rules} ) # Some constants that we will use a lot my $SYMID_CRUFT = $tracer->symbol_by_name('CRUFT'); my $SYMID_CDATA = $tracer->symbol_by_name('CDATA'); my $SYMID_PCDATA = $tracer->symbol_by_name('PCDATA'); my $SYMID_WHITESPACE = $tracer->symbol_by_name('WHITESPACE'); my $SYMID_PI = $tracer->symbol_by_name('PI'); my $SYMID_C = $tracer->symbol_by_name('C'); my $SYMID_D = $tracer->symbol_by_name('D'); my $SYMID_EOF = $tracer->symbol_by_name('EOF'); my @raw_tokens = (); my $p = HTML::Parser->new( api_version => 3, start_h => [ \@raw_tokens, q{tagname,'S',line,column,offset,offset_end,is_cdata,attr} ], end_h => [ \@raw_tokens, q{tagname,'E',line,column,offset,offset_end,is_cdata} ], text_h => [ \@raw_tokens, qq{'$SYMID_WHITESPACE','T',line,column,offset,offset_end,is_cdata} ], comment_h => [ \@raw_tokens, qq{'$SYMID_C','C',line,column,offset,offset_end,is_cdata} ], declaration_h => [ \@raw_tokens, qq{'$SYMID_D','D',line,column,offset,offset_end,is_cdata} ], process_h => [ \@raw_tokens, qq{'$SYMID_PI','PI',line,column,offset,offset_end,is_cdata} ], unbroken_text => 1 ); $p->parse( ${$document} ); $p->eof; my @html_parser_tokens = (); HTML_PARSER_TOKEN: for my $raw_token (@raw_tokens) { my ( undef, $token_type, $line, $column, $offset, $offset_end, $is_cdata, $attr ) = @{$raw_token}; PROCESS_TOKEN_TYPE: { if ($is_cdata) { $raw_token->[Marpa::R2::HTML::Internal::Token::TOKEN_ID] = $SYMID_CDATA; last PROCESS_TOKEN_TYPE; } if ( $token_type eq 'T' ) { # White space as defined in HTML 4.01 # space (x20); ASCII tab (x09); ASCII form feed (x0C;); Zero-width space (x200B) # and the two characters which appear in line breaks: # carriage return (x0D) and line feed (x0A) # I avoid the Perl character codes because I do NOT want # localization $raw_token->[Marpa::R2::HTML::Internal::Token::TOKEN_ID] = $SYMID_PCDATA if substr( ${$document}, $offset, ( $offset_end - $offset ) ) =~ / [^\x09\x0A\x0C\x0D\x20\x{200B}] /oxms; last PROCESS_TOKEN_TYPE; } ## end if ( $token_type eq 'T' ) if ( $token_type eq 'E' or $token_type eq 'S' ) { # If it's a virtual token from HTML::Parser, # pretend it never existed. # HTML::Parser supplies missing # end tags for title elements, but for no # others. # This is not helpful and we need to special-case # these zero-length tags and throw them away. next HTML_PARSER_TOKEN if $offset_end <= $offset; my $tag_name = $raw_token ->[Marpa::R2::HTML::Internal::Token::TAG_NAME]; my $terminal = $token_type . q{_} . $tag_name; my $terminal_id = $tracer->symbol_by_name($terminal); if ( not defined $terminal_id ) { my $group_symbol = $primary_group_by_tag->{$tag_name} // 'GRP_anywhere'; my $contents = $runtime_tag->{$tag_name} // 'FLO_mixed'; my @symbol_names = ( $group_symbol, 'ELE_' . $tag_name, 'S_' . $tag_name, $contents, 'E_' . $tag_name ); my @symbol_ids = (); SYMBOL: for my $symbol_name (@symbol_names) { my $symbol_id = $tracer->symbol_by_name($symbol_name); if ( not defined $symbol_id ) { $symbol_id = $tracer->symbol_new($symbol_name); } push @symbol_ids, $symbol_id; } ## end SYMBOL: for my $symbol_name (@symbol_names) my ( $top_id, $lhs_id, @rhs_ids ) = @symbol_ids; $thin_grammar->rule_new( $top_id, [$lhs_id] ); my $element_rule_id = $thin_grammar->rule_new( $lhs_id, \@rhs_ids ); $action_by_rule_id[$element_rule_id] = 'ELE_' . $tag_name; $terminal_id = $tracer->symbol_by_name($terminal); } ## end if ( not defined $terminal_id ) $raw_token->[Marpa::R2::HTML::Internal::Token::TOKEN_ID] = $terminal_id; last PROCESS_TOKEN_TYPE; } ## end if ( $token_type eq 'E' or $token_type eq 'S' ) } ## end PROCESS_TOKEN_TYPE: push @html_parser_tokens, $raw_token; } ## end HTML_PARSER_TOKEN: for my $raw_token (@raw_tokens) # Points AFTER the last HTML # Parser token. # The other logic needs to be ready for this. { my $document_length = length ${$document}; my $last_token = $html_parser_tokens[-1]; push @html_parser_tokens, [ $SYMID_EOF, 'EOF', @{$last_token}[ Marpa::R2::HTML::Internal::Token::LINE, Marpa::R2::HTML::Internal::Token::COLUMN ], $document_length, $document_length ]; } # conserve memory $p = undef; @raw_tokens = (); $thin_grammar->start_symbol_set( $tracer->symbol_by_name('document') ); $thin_grammar->precompute(); if ($self->{dump_AHFA}) { return \$tracer->show_AHFA(); } # Memoize these -- we use highest symbol a lot my $highest_symbol_id = $thin_grammar->highest_symbol_id(); my $highest_rule_id = $thin_grammar->highest_rule_id(); # For the Ruby Slippers engine # We need to know quickly if a symbol is a start tag; my @is_start_tag = (); # Find Ruby slippers ranks, by symbol ID my @ruby_rank_by_id = (); { my @non_final_end_tag_ids = (); SYMBOL: for my $symbol_id ( 0 .. $highest_symbol_id ) { my $symbol_name = $tracer->symbol_name($symbol_id); next SYMBOL if not 0 == index $symbol_name, 'E_'; next SYMBOL if $symbol_name eq 'E_body' or $symbol_name eq 'E_html'; push @non_final_end_tag_ids, $symbol_id; } ## end SYMBOL: for my $symbol_id ( 0 .. $highest_symbol_id ) my %ruby_vectors = (); for my $rejected_symbol_name ( keys %{$rank_by_name} ) { my @ruby_vector_by_id = ( (0) x ( $highest_symbol_id + 1 ) ); my $rank_by_candidate_name = $rank_by_name->{$rejected_symbol_name}; CANDIDATE: for my $candidate_name ( keys %{$rank_by_candidate_name} ) { my $rank = $rank_by_candidate_name->{$candidate_name}; if ( $candidate_name eq '' ) { $ruby_vector_by_id[$_] = $rank for @non_final_end_tag_ids; next CANDIDATE; } my $candidate_id = $tracer->symbol_by_name($candidate_name); die "Unknown ruby slippers candidate name: $candidate_name" if not defined $candidate_id; $ruby_vector_by_id[$candidate_id] = $rank for @non_final_end_tag_ids; } ## end CANDIDATE: for my $candidate_name ( keys %{...}) $ruby_vectors{$rejected_symbol_name} = \@ruby_vector_by_id; } ## end for my $rejected_symbol_name ( keys %{$rank_by_name} ) my @no_ruby_slippers_vector = ( (0) x ( $highest_symbol_id + 1 ) ); SYMBOL: for my $rejected_symbol_id ( 0 .. $highest_symbol_id ) { if ( not $thin_grammar->symbol_is_terminal($rejected_symbol_id) ) { $ruby_rank_by_id[$rejected_symbol_id] = \@no_ruby_slippers_vector; next SYMBOL; } ## end if ( not $thin_grammar->symbol_is_terminal(...)) my $rejected_symbol_name = $tracer->symbol_name($rejected_symbol_id); my $placement; FIND_PLACEMENT: { my $prefix = substr $rejected_symbol_name, 0, 2; if ( $prefix eq 'S_' ) { $placement = ''; $is_start_tag[$rejected_symbol_id] = 1; last FIND_PLACEMENT; } if ( $prefix eq 'E_' ) { $placement = '/'; } } ## end FIND_PLACEMENT: my $ruby_vector = $ruby_vectors{$rejected_symbol_name}; if ( defined $ruby_vector ) { $ruby_rank_by_id[$rejected_symbol_id] = $ruby_vector; next SYMBOL; } if ( not defined $placement ) { if ( $rejected_symbol_name eq 'CRUFT' ) { $ruby_rank_by_id[$rejected_symbol_id] = \@no_ruby_slippers_vector; next SYMBOL; } $ruby_rank_by_id[$rejected_symbol_id] = $ruby_vectors{'!non_element'} // \@no_ruby_slippers_vector; next SYMBOL; } ## end if ( not defined $placement ) my $tag = substr $rejected_symbol_name, 2; my $primary_group = $primary_group_by_tag->{$tag}; my $element_type = defined $primary_group ? (substr $primary_group, 4) : 'anywhere'; $ruby_vector = $ruby_vectors{ q{<} . $placement . q{%} . $element_type . q{>} }; if ( defined $ruby_vector ) { $ruby_rank_by_id[$rejected_symbol_id] = $ruby_vector; next SYMBOL; } $ruby_vector = $ruby_vectors{ q{<} . $placement . q{*>} }; if ( defined $ruby_vector ) { $ruby_rank_by_id[$rejected_symbol_id] = $ruby_vector; next SYMBOL; } $ruby_rank_by_id[$rejected_symbol_id] = \@no_ruby_slippers_vector; } ## end SYMBOL: for my $rejected_symbol_id ( 0 .. $highest_symbol_id ) } my @empty_element_end_tag = (); { TAG: for my $tag (keys %{$is_empty_element}) { my $start_tag_id = $tracer->symbol_by_name('S_' . $tag); next TAG if not defined $start_tag_id; my $end_tag_id = $tracer->symbol_by_name('E_' . $tag); $empty_element_end_tag[$start_tag_id] = $end_tag_id; } } my $recce = Marpa::R2::Thin::R->new($thin_grammar); $recce->start_input(); $self->{recce} = $recce; $self->{tokens} = \@html_parser_tokens; $self->{earleme_to_html_token_ix} = [-1]; # These variables track virtual start tokens as # a protection against infinite loops. my %start_virtuals_used = (); my $earleme_of_last_start_virtual = -1; # first token is a dummy, so that ix is never 0 # this is done because 0 has a special meaning as a Libmarpa # token value my $latest_html_token = -1; my $token_number = 0; my $token_count = scalar @html_parser_tokens; # this array track the last token number (location) at which # the symbol with this number was last read. It's used # to prevent the same Ruby Slippers token being added # at the same location more than once. # If allowed, this could cause an infinite loop. # Note that only start tags are tracked -- the rest of the # array stays at -1. my @terminal_last_seen = ( (-1) x ( $highest_symbol_id + 1 ) ); $thin_grammar->throw_set(0); my $empty_element_end_tag; RECCE_RESPONSE: while ( $token_number < $token_count ) { if ( defined $empty_element_end_tag ) { my $read_result = $recce->alternative( $empty_element_end_tag, RUBY_SLIPPERS_TOKEN, 1 ); if ( $read_result != $NO_MARPA_ERROR ) { die $thin_grammar->error(); } if ($trace_terminals) { say {$trace_fh} 'Virtual end tag accepted: ', $tracer->symbol_name($empty_element_end_tag) or Carp::croak("Cannot print: $ERRNO"); } if ( $recce->earleme_complete() < 0 ) { die $thin_grammar->error(); } my $current_earleme = $recce->current_earleme(); die $thin_grammar->error() if not defined $current_earleme; $self->{earleme_to_html_token_ix}->[$current_earleme] = $latest_html_token; $empty_element_end_tag = undef; next RECCE_RESPONSE; } ## end if ( defined $empty_element_end_tag ) my $token = $html_parser_tokens[$token_number]; my $attempted_symbol_id = $token ->[Marpa::R2::HTML::Internal::Token::TOKEN_ID]; my $read_result = $recce->alternative( $attempted_symbol_id, PHYSICAL_TOKEN, 1 ); if ( $read_result != $UNEXPECTED_TOKEN_ID ) { if ( $read_result != $NO_MARPA_ERROR ) { die $thin_grammar->error(); } if ($trace_terminals) { say {$trace_fh} 'Token accepted: ', $tracer->symbol_name($attempted_symbol_id) or Carp::croak("Cannot print: $ERRNO"); } if ( $recce->earleme_complete() < 0 ) { die $thin_grammar->error(); } my $last_html_token_of_marpa_token = $token_number; $token_number++; if ( defined $last_html_token_of_marpa_token ) { $latest_html_token = $last_html_token_of_marpa_token; } my $current_earleme = $recce->current_earleme(); die $thin_grammar->error() if not defined $current_earleme; $self->{earleme_to_html_token_ix}->[$current_earleme] = $latest_html_token; $empty_element_end_tag = $empty_element_end_tag[$attempted_symbol_id]; next RECCE_RESPONSE; } ## end if ( $read_result != $UNEXPECTED_TOKEN_ID ) if ($trace_terminals) { say {$trace_fh} 'Literal Token not accepted: ', $tracer->symbol_name($attempted_symbol_id) or Carp::croak("Cannot print: $ERRNO"); } my $highest_candidate_rank = 0; my $virtual_terminal_to_add; my $ruby_vector = $ruby_rank_by_id[$attempted_symbol_id]; my @terminals_expected = $recce->terminals_expected(); die $thin_grammar->error() if not defined $terminals_expected[0]; CANDIDATE: for my $candidate_id (@terminals_expected) { my $this_candidate_rank = $ruby_vector->[$candidate_id]; if ($trace_terminals) { say {$trace_fh} 'Considering candidate: ', $tracer->symbol_name($candidate_id), "; rank is $this_candidate_rank; highest rank so far is $highest_candidate_rank" or Carp::croak("Cannot print: $ERRNO"); } ## end if ($trace_terminals) if ( $this_candidate_rank > $highest_candidate_rank ) { if ($trace_terminals) { say {$trace_fh} 'Considering candidate: ', $tracer->symbol_name($candidate_id), '; last seen at ', $terminal_last_seen[$candidate_id], "; current token number is $token_number" or Carp::croak("Cannot print: $ERRNO"); } ## end if ($trace_terminals) next CANDIDATE if $terminal_last_seen[$candidate_id] == $token_number; if ($trace_terminals) { say {$trace_fh} 'Current best candidate: ', $tracer->symbol_name($candidate_id), or Carp::croak("Cannot print: $ERRNO"); } $highest_candidate_rank = $this_candidate_rank; $virtual_terminal_to_add = $candidate_id; } ## end if ( $this_candidate_rank > $highest_candidate_rank ) } ## end CANDIDATE: for my $candidate_id (@terminals_expected) if ( defined $virtual_terminal_to_add ) { if ($trace_terminals) { say {$trace_fh} 'Adding Ruby Slippers token: ', $tracer->symbol_name($virtual_terminal_to_add), or Carp::croak("Cannot print: $ERRNO"); } my $ruby_slippers_result = $recce->alternative( $virtual_terminal_to_add, RUBY_SLIPPERS_TOKEN, 1 ); if ( $ruby_slippers_result != $NO_MARPA_ERROR ) { die $thin_grammar->error(); } if ( $recce->earleme_complete() < 0 ) { die $thin_grammar->error(); } # Only keep track of start tags. We need to be able to add end # tags repeatedly. # Adding end tags cannot cause an infinite loop, because each # one ends an element and only a finite number of elements # can have been started. $terminal_last_seen[$virtual_terminal_to_add] = $token_number if $is_start_tag[$virtual_terminal_to_add]; my $current_earleme = $recce->current_earleme(); die $thin_grammar->error() if not defined $current_earleme; $self->{earleme_to_html_token_ix}->[$current_earleme] = $latest_html_token; $empty_element_end_tag = $empty_element_end_tag[$virtual_terminal_to_add]; next RECCE_RESPONSE; } ## end if ( defined $virtual_terminal_to_add ) # If we didn't find a token to add, add the # current physical token as CRUFT. if ($trace_terminals) { say {$trace_fh} 'Adding rejected token as cruft: ', $tracer->symbol_name($attempted_symbol_id) or Carp::croak("Cannot print: $ERRNO"); } my $fatal_cruft_error = $token->[Marpa::R2::HTML::Internal::Token::TOKEN_ID] == $SYMID_CRUFT ? 1 : 0; if ( $trace_cruft or $fatal_cruft_error ) { my $current_earleme = $recce->current_earleme(); die $thin_grammar->error() if not defined $current_earleme; my ( $line, $col ) = earleme_to_linecol( $self, $current_earleme ); # HTML::Parser uses one-based line numbers, # but zero-based column numbers # The convention (in vi and cut) is that # columns are also one-based. $col++; say {$trace_fh} qq{Cruft at line $line, column $col: "}, ${ token_range_to_original( $self, $token_number, $token_number ) }, q{"} or Carp::croak("Cannot print: $ERRNO"); die 'Internal error: cruft token was rejected' if $fatal_cruft_error; } ## end if ( $trace_cruft or $fatal_cruft_error ) # Cruft tokens are not virtual. # They are the real things, hacked up. $token->[Marpa::R2::HTML::Internal::Token::TOKEN_ID] = $SYMID_CRUFT; } ## end RECCE_RESPONSE: while ( $token_number < $token_count ) $thin_grammar->throw_set(1); if ($trace_terminals) { say {$trace_fh} 'at end of tokens' or Carp::croak("Cannot print: $ERRNO"); } $Marpa::R2::HTML::INSTANCE = $self; local $Marpa::R2::HTML::Internal::PARSE_INSTANCE = $self; my $latest_earley_set_ID = $recce->latest_earley_set(); my $bocage = Marpa::R2::Thin::B->new( $recce, $latest_earley_set_ID ); my $order = Marpa::R2::Thin::O->new($bocage); my $tree = Marpa::R2::Thin::T->new($order); $tree->next(); my @stack = (); local $Marpa::R2::HTML::Internal::STACK = \@stack; my %memoized_handlers = (); my $valuator = Marpa::R2::Thin::V->new($tree); local $Marpa::R2::HTML::Internal::RECCE = $recce; local $Marpa::R2::HTML::Internal::VALUATOR = $valuator; for my $rule_id ( grep { $thin_grammar->rule_length($_); } 0 .. $thin_grammar->highest_rule_id() ) { $valuator->rule_is_valued_set( $rule_id, 1 ); } STEP: while (1) { my ( $type, @step_data ) = $valuator->step(); last STEP if not defined $type; if ( $type eq 'MARPA_STEP_TOKEN' ) { say {*STDERR} join q{ }, $type, @step_data, $tracer->symbol_name( $step_data[0] ) or Carp::croak("Cannot print: $ERRNO") if $trace_values; my ( undef, $token_value, $arg_n ) = @step_data; if ( $token_value eq RUBY_SLIPPERS_TOKEN ) { $stack[$arg_n] = ['RUBY_SLIPPERS_TOKEN']; say {*STDERR} "Stack:\n", Data::Dumper::Dumper( \@stack ) or Carp::croak("Cannot print: $ERRNO") if $trace_values; next STEP; } ## end if ( $token_value eq RUBY_SLIPPERS_TOKEN ) my ( $start_earley_set_id, $end_earley_set_id ) = $valuator->location(); my $start_earleme = $recce->earleme($start_earley_set_id); my $start_html_token_ix = $self->{earleme_to_html_token_ix}->[$start_earleme]; my $end_earleme = $recce->earleme($end_earley_set_id); my $end_html_token_ix = $self->{earleme_to_html_token_ix}->[$end_earleme]; $stack[$arg_n] = [ 'PHYSICAL_TOKEN' => $start_html_token_ix + 1, $end_html_token_ix, ]; say {*STDERR} "Stack:\n", Data::Dumper::Dumper( \@stack ) or Carp::croak("Cannot print: $ERRNO") if $trace_values; next STEP; } ## end if ( $type eq 'MARPA_STEP_TOKEN' ) if ( $type eq 'MARPA_STEP_RULE' ) { say {*STDERR} join q{ }, ( $type, @step_data ) or Carp::croak("Cannot print: $ERRNO") if $trace_values; my ( $rule_id, $arg_0, $arg_n ) = @step_data; my $attributes = undef; my $class = undef; my $action = $action_by_rule_id[$rule_id]; local $Marpa::R2::HTML::Internal::START_TAG_IX = undef; local $Marpa::R2::HTML::Internal::END_TAG_IX_REF = undef; local $Marpa::R2::HTML::Internal::ELEMENT = undef; local $Marpa::R2::HTML::Internal::SPECIES = q{}; if ( defined $action and ( index $action, 'ELE_' ) == 0 ) { $Marpa::R2::HTML::Internal::SPECIES = $Marpa::R2::HTML::Internal::ELEMENT = substr $action, 4; my $start_tag_marpa_token = $stack[$arg_0]; my $start_tag_type = $start_tag_marpa_token ->[Marpa::R2::HTML::Internal::TDesc::TYPE]; if ( defined $start_tag_type and $start_tag_type eq 'PHYSICAL_TOKEN' ) { my $start_tag_ix = $start_tag_marpa_token->[1]; my $start_tag_token = $html_parser_tokens[$start_tag_ix]; if ( $start_tag_token ->[Marpa::R2::HTML::Internal::Token::TYPE] eq 'S' ) { $Marpa::R2::HTML::Internal::START_TAG_IX = $start_tag_ix; $attributes = $start_tag_token ->[Marpa::R2::HTML::Internal::Token::ATTR]; } ## end if ( $start_tag_token->[...]) } ## end if ( defined $start_tag_type and $start_tag_type eq ...) } ## end if ( defined $action and ( index $action, 'ELE_' ) ==...) if ( defined $action and ( index $action, 'SPE_' ) == 0 ) { $Marpa::R2::HTML::Internal::SPECIES = q{:} . substr $action, 4; } local $Marpa::R2::HTML::Internal::ATTRIBUTES = $attributes; $class = $attributes->{class} // q{*}; local $Marpa::R2::HTML::Internal::CLASS = $class; local $Marpa::R2::HTML::Internal::ARG_0 = $arg_0; local $Marpa::R2::HTML::Internal::ARG_N = $arg_n; my ( $start_earley_set_id, $end_earley_set_id ) = $valuator->location(); my $start_earleme = $recce->earleme($start_earley_set_id); my $start_html_token_ix = $self->{earleme_to_html_token_ix}->[$start_earleme] + 1; my $end_earleme = $recce->earleme($end_earley_set_id); my $end_html_token_ix = $self->{earleme_to_html_token_ix}->[$end_earleme]; if ( $start_html_token_ix > $end_html_token_ix ) { $start_html_token_ix = $end_html_token_ix = undef; } local $Marpa::R2::HTML::Internal::START_HTML_TOKEN_IX = $start_html_token_ix; local $Marpa::R2::HTML::Internal::END_HTML_TOKEN_IX = $end_html_token_ix; my $handler_key = $rule_id . q{;} . $Marpa::R2::HTML::Internal::CLASS; my $handler = $memoized_handlers{$handler_key}; $trace_handlers and $handler and say {*STDERR} qq{Found memoized handler for rule $rule_id, class "}, ( $class // q{*} ), q{"}; if ( not defined $handler ) { $handler = $memoized_handlers{$handler_key} = handler_find( $self, $rule_id, $class ); } COMPUTE_VALUE: { if ( ref $handler ) { $stack[$arg_0] = [ VALUED_SPAN => $start_html_token_ix, $end_html_token_ix, ( scalar $handler->() ), $rule_id ]; last COMPUTE_VALUE; } ## end if ( ref $handler ) my @flat_tdesc_list = (); STACK_IX: for my $stack_ix ( $Marpa::R2::HTML::Internal::ARG_0 .. $Marpa::R2::HTML::Internal::ARG_N ) { my $tdesc_item = $Marpa::R2::HTML::Internal::STACK->[$stack_ix]; my $tdesc_type = $tdesc_item->[0]; next STACK_IX if not defined $tdesc_type; if ( $tdesc_type eq 'VALUES' ) { push @flat_tdesc_list, @{ $tdesc_item ->[Marpa::R2::HTML::Internal::TDesc::VALUE] }; next STACK_IX; } ## end if ( $tdesc_type eq 'VALUES' ) next STACK_IX if $tdesc_type ne 'VALUED_SPAN'; push @flat_tdesc_list, $tdesc_item; } ## end STACK_IX: for my $stack_ix ( $Marpa::R2::HTML::Internal::ARG_0...) if ( scalar @flat_tdesc_list <= 1 ) { $stack[$arg_0] = [ VALUED_SPAN => $start_html_token_ix, $end_html_token_ix, $flat_tdesc_list[0] ->[Marpa::R2::HTML::Internal::TDesc::VALUE], $rule_id ]; last COMPUTE_VALUE; } ## end if ( scalar @flat_tdesc_list <= 1 ) $stack[$arg_0] = [ VALUES => $start_html_token_ix, $end_html_token_ix, \@flat_tdesc_list, $rule_id ]; } ## end COMPUTE_VALUE: if ($trace_values) { say {*STDERR} "rule $rule_id: ", join q{ }, symbol_names_by_rule_id( $self, $rule_id ) or Carp::croak("Cannot print: $ERRNO"); say {*STDERR} "Stack:\n", Data::Dumper::Dumper( \@stack ) or Carp::croak("Cannot print: $ERRNO"); } ## end if ($trace_values) next STEP; } ## end if ( $type eq 'MARPA_STEP_RULE' ) if ( $type eq 'MARPA_STEP_NULLING_SYMBOL' ) { my ( $symbol_id, $arg_n ) = @step_data; $stack[$arg_n] = ['ZERO_SPAN']; if ($trace_values) { say {*STDERR} join q{ }, $type, @step_data, $tracer->symbol_name($symbol_id) or Carp::croak("Cannot print: $ERRNO"); say {*STDERR} "Stack:\n", Data::Dumper::Dumper( \@stack ) or Carp::croak("Cannot print: $ERRNO"); } ## end if ($trace_values) next STEP; } ## end if ( $type eq 'MARPA_STEP_NULLING_SYMBOL' ) die "Unexpected step type: $type"; } ## end STEP: while (1) my $result = $stack[0]; Marpa::R2::exception('No parse: evaler returned undef') if not defined $result; if ( ref $self->{handler_by_species}->{TOP} ) { ## This is a user-defined handler. We assume it returns ## a VALUED_SPAN. $result = $result->[Marpa::R2::HTML::Internal::TDesc::VALUE]; } else { ## The TOP handler was the default handler. ## We now want to "literalize" its result. FIND_LITERALIZEABLE: { my $type = $result->[Marpa::R2::HTML::Internal::TDesc::TYPE]; if ( $type eq 'VALUES' ) { $result = $result->[Marpa::R2::HTML::Internal::TDesc::VALUE]; last FIND_LITERALIZEABLE; } if ( $type eq 'VALUED_SPAN' ) { $result = [$result]; last FIND_LITERALIZEABLE; } die 'Internal: TOP result is not literalize-able'; } ## end FIND_LITERALIZEABLE: $result = range_and_values_to_literal( $self, 0, $#html_parser_tokens, $result ); } ## end else [ if ( ref $self->{handler_by_species}->{TOP} ) ] return $result; } ## end sub parse sub Marpa::R2::HTML::html { my ( $document_ref, @args ) = @_; my $html = Marpa::R2::HTML::Internal::create(@args); return Marpa::R2::HTML::Internal::parse( $html, $document_ref ); } 1; # vim: set expandtab shiftwidth=4: