### # CSS::SAC - a SAC implementation for Perl # Robin Berjon # 17/08/2001 - bugfixes... # 23/04/2001 - more enhancements # 19/03/2001 - second version, various suggestions and enhancements # 24/02/2001 - prototype mark I of the new model ### package CSS::SAC; use strict; use vars qw( $VERSION $RE_STRING $RE_NAME $RE_IDENT $RE_RANGE $RE_NUM %DIM_MAP %FUNC_MAP ); $VERSION = '0.08'; use CSS::SAC::ConditionFactory qw(); use CSS::SAC::SelectorFactory qw(); use CSS::SAC::LexicalUnit qw(:constants); use CSS::SAC::Selector::Sibling qw(:constants); use CSS::SAC::SelectorList qw(); use Text::Balanced qw(); use constant DEBUG => 0; #---------------------------------------------------------------------# # build a few useful regexen and maps #---------------------------------------------------------------------# # matches a quoted string $RE_STRING = Text::Balanced::gen_delimited_pat(q{'"}); #" $RE_STRING = qr/$RE_STRING/s; # matches a name token $RE_NAME = qr/ (?:(?:\\(?:(?:[a-fA-F0-9]{1,6}[\t\x20])|[\x32-\xff]))|[a-zA-Z\x80-\xff0-9-])+ /xs; # matches a valid CSS ident (this may be wrong, needs testing) $RE_IDENT = qr/ (?:(?:\\(?:(?:[a-fA-F0-9]{1,6}[\t\x20])|[ \x32-\xff]))|[a-zA-Z\x80-\xff]) (?:(?:\\(?:(?:[a-fA-F0-9]{1,6}[\t\x20])|[ \x32-\xff]))|[a-zA-Z\x80-\xff0-9_-])* /xs; # matches a unicode range $RE_RANGE = qr/(?: (?:U\+) (?: (?:[0-9a-fA-F]{1,6}-[0-9a-fA-F]{1,6}) | (?:\?{1,6}) | (?:[0-9a-fA-F](?: (?:\?{0,5}|[0-9a-fA-F])(?: (?:\?{0,4}|[0-9a-fA-F])(?: (?:\?{0,3}|[0-9a-fA-F])(?: (?:\?{0,2}|[0-9a-fA-F])(?: (?:\?{0,1}|[0-9a-fA-F]))))))) ) ) /xs; # matches a number $RE_NUM = qr/(?:(?:[0-9]*\.[0-9]+)|(?:[0-9]+))/; # maps a length or assoc value to it's constant %DIM_MAP = ( em => EM, ex => EX, px => PIXEL, cm => CENTIMETER, mm => MILLIMETER, in => INCH, pt => POINT, pc => PICA, deg => DEGREE, rad => RADIAN, grad => GRADIAN, ms => MILLISECOND, s => SECOND, hz => HERTZ, khz => KILOHERTZ, '%' => PERCENTAGE, ); # maps a length or assoc value to it's constant %FUNC_MAP = ( attr => ATTR, counter => COUNTER_FUNCTION, counters => COUNTERS_FUNCTION, rect => RECT_FUNCTION, url => URI, rgb => RGBCOLOR, ); #---------------------------------------------------------------------# #---------------------------------------------------------------------# # build the fields for an array based object #---------------------------------------------------------------------# use Class::ArrayObjects define => { fields => [qw( _cf_ _sf_ _dh_ _eh_ _dh_can_ _allow_charset_ _ns_map_ _tmp_media_ )], }; #---------------------------------------------------------------------# ### Constructor ####################################################### # # # # #---------------------------------------------------------------------# # CSS::SAC->new(\%options) # creates a new sac parser #---------------------------------------------------------------------# sub new { my $class = shift; my $options = shift || {}; # set our options my $self = bless [], $class; $self->[_cf_] = $options->{ConditionFactory} || CSS::SAC::ConditionFactory->new; $self->[_sf_] = $options->{SelectorFactory} || CSS::SAC::SelectorFactory->new; $self->[_eh_] = $options->{ErrorHandler} || CSS::SAC::DefaultErrorHandler->new; $self->[_dh_can_] = {}; $self->DocumentHandler($options->{DocumentHandler}) if $options->{DocumentHandler}; return $self; } #---------------------------------------------------------------------# # # # # ### Constructor ####################################################### ### Accessors ######################################################### # # # # #---------------------------------------------------------------------# # $sac->ParserVersion # returns the supported CSS version #---------------------------------------------------------------------# sub ParserVersion { # IMP # this should perhaps return http://www.w3.org/TR/REC-CSS{1,2,3} # as per http://www.w3.org/TR/SAC, but it's tricky for CSS3 which # is modularized return 'CSS3'; } #---------------------------------------------------------------------# *CSS::SAC::getParserVersion = \&ParserVersion; #---------------------------------------------------------------------# # my $cf = $sac->ConditionFactory # $sac->ConditionFactory($cf) # get/set the ConditionFactory that we use #---------------------------------------------------------------------# sub ConditionFactory { (@_==2) ? $_[0]->[_cf_] = $_[1] : $_[0]->[_cf_]; } #---------------------------------------------------------------------# *CSS::SAC::setConditionFactory = \&ConditionFactory; #---------------------------------------------------------------------# # my $sf = $sac->SelectorFactory # $sac->SelectorFactory($sf) # get/set the SelectorFactory that we use #---------------------------------------------------------------------# sub SelectorFactory { (@_==2) ? $_[0]->[_sf_] = $_[1] : $_[0]->[_sf_]; } #---------------------------------------------------------------------# *CSS::SAC::setSelectorFactory = \&SelectorFactory; #---------------------------------------------------------------------# # my $dh = $sac->DocumentHandler # $sac->DocumentHandler($dh) # get/set the DocumentHandler that we use #---------------------------------------------------------------------# sub DocumentHandler { my $sac = shift; my $dh = shift; # set the doc handler, and see what it can do if ($dh) { $sac->[_dh_] = $dh; my @dh_methods = qw( comment charset end_document end_font_face end_media end_page end_selector ignorable_at_rule import_style namespace_declaration property start_document start_font_face start_media start_page start_selector ); for my $meth (@dh_methods) { $sac->[_dh_can_]->{$meth} = $dh->can($meth); } } return $sac->[_dh_]; } #---------------------------------------------------------------------# *CSS::SAC::setDocumentHandler = \&DocumentHandler; #---------------------------------------------------------------------# # my $eh = $sac->ErrorHandler # $sac->ErrorHandler($eh) # get/set the ErrorHandler that we use #---------------------------------------------------------------------# sub ErrorHandler { (@_==2) ? $_[0]->[_eh_] = $_[1] : $_[0]->[_eh_]; } #---------------------------------------------------------------------# *CSS::SAC::setErrorHandler = \&ErrorHandler; # # # # ### Accessors ######################################################### ### Parsing Methods ################################################### # # # # #---------------------------------------------------------------------# # $sac->parse(\%options) # parses a style sheet #---------------------------------------------------------------------# sub parse { my $sac = shift; my $options = shift; # we always load the style sheet into memory because style sheets # are usually small. If this is a problem we'll change that. my $css; if ($options->{string}) { $css = $options->{string}; } elsif ($options->{ioref}) { my $io = $options->{ioref}; local $/ = undef; $css = <$io>; close $io; } elsif ($options->{filename}) { open CSS, "$options->{filename}" or die $!; local $/ = undef; $css = ; close CSS; } else { return undef; } ### look at the other options # charsets are forbidden in embedded style sheets if ($options->{embedded}) { $sac->[_allow_charset_] = 0; } else { $sac->[_allow_charset_] = 1; } #---> Start Parsing <---------------------------------------------# # start doc warn "[SAC] start parsing\n" if DEBUG; $sac->[_dh_]->start_document if $sac->[_dh_can_]->{start_document}; # before anything else occurs there can be a charset warn "[SAC] parsing charset\n" if DEBUG; $sac->parse_charset(\$css); $sac->[_allow_charset_] = 0; # remove an eventual HTML open comment (not reported to handler) warn "[SAC] removing HTML comments\n" if DEBUG; $css =~ s/^\s*\s*//) { # we don't do anything with those presently warn "[SAC] removing HTML comments\n" if DEBUG; } # we have selectors elsif (my $sel_list = $sac->parse_selector_list(\$css)) { warn "[SAC] parsed selectors\n" if DEBUG; next unless @$sel_list; # callbacks $sac->[_dh_]->start_selector($sel_list) if $sac->[_dh_can_]->{start_selector}; # parse the rule my $rule; warn "[SAC] parsing rule\n" if DEBUG; ### BUG # The Text::Balanced extractions below are not correct since they don't take # comments into account. With the first one, it'll fail on apostrophes in # comments, with the latter, on unbalanced apos and } in comments and # apos-strings. The latter is used currently because it is less likely to fail, # but what is needed is a real parser that steps inside the black parsing out # comments and property values. The good news is that we have most of the bits # to do that right already. #($rule,$css,undef) = Text::Balanced::extract_bracketed($css,q/{}'"/,qr/\s*/); #" ($rule,$css,undef) = Text::Balanced::extract_bracketed($css,q/{}"/,qr/\s*/); #" $sac->parse_rule(\$rule); # end of the rule $sac->[_dh_]->end_selector($sel_list) if $sac->[_dh_can_]->{end_selector}; } # trailing whitespace, should only happen at the very end elsif ($css =~ s/^\s+//) { # do nothing warn "[SAC] just whitespace\n" if DEBUG; } # error else { last if ! length $css; $sac->[_eh_]->fatal_error('Unknown trailing tokens in style sheet: "' . $css . '"'); last; } } # end doc warn "[SAC] end of document\n" if DEBUG; $sac->[_dh_]->end_document if $sac->[_dh_can_]->{end_document}; #---> Finish Parsing <--------------------------------------------# } #---------------------------------------------------------------------# *CSS::SAC::parseStyleSheet = \&parse; #---------------------------------------------------------------------# # $sac->parse_charset($string_ref) # parses a charset #---------------------------------------------------------------------# sub parse_charset { my $sac = shift; my $css = shift; # we don't remove leading ws, the charset must come first return unless $$css =~ s/^\@charset\s+//i; # extract the string if ($$css =~ s/^($RE_STRING)\s*;//) { my $charset = $1; $charset =~ s/^(?:'|")//; #" $charset =~ s/(?:'|")$//; #' $sac->[_dh_]->charset($charset) if $sac->[_dh_can_]->{charset}; } else { if ($$css =~ s/[^;]*;//) { $sac->[_eh_]->warning('Unknown token in charset declaration'); } else { $sac->[_eh_]->fatal_error('Unknown token in charset declaration'); } } } #---------------------------------------------------------------------# #---------------------------------------------------------------------# # $sac->parse_imports($string_ref) # parses import rules at the beginning #---------------------------------------------------------------------# sub parse_imports { my $sac = shift; my $css = shift; # we may have several imports separated by comments while ($$css =~ s/^\s*\@import\s+//i) { # first get the uri my $uri; if ($$css =~ s/^url\(//) { $$css =~ s/^((?:$RE_STRING)|([^\)]*))\s*//; $uri = $1; $uri =~ s/^(?:'|")//; # " $uri =~ s/(?:'|")$//; # ' $$css =~ s/^\)//; } else { $$css =~ s/^($RE_STRING)//; $uri = $1; $uri =~ s/^(?:'|")//; #" $uri =~ s/(?:'|")$//; #' } # a possible medialist my $medialist = $sac->parse_medialist($css); # we must have a terminating token now if ($$css =~ s/^\s*;//) { $sac->[_dh_]->import_style($uri,$medialist) if $sac->[_dh_can_]->{import_style}; } else { if ($$css =~ s/[^;]*;//) { $sac->[_eh_]->warning('Unknown token in import rule'); } else { $sac->[_eh_]->fatal_error('Unknown token in import rule'); } } # remove comments and run again $sac->parse_comments($css); } } #---------------------------------------------------------------------# #---------------------------------------------------------------------# # $sac->parse_namespace_declarations($string_ref) # parses ns declarations #---------------------------------------------------------------------# sub parse_namespace_declarations { my $sac = shift; my $css = shift; # we may have several ns decls separated by comments while ($$css =~ s/^\s*\@namespace\s+//i) { my ($prefix,$uri); # first get the prefix if ($$css !~ /^url\(/ and $$css =~ s/^($RE_IDENT)\s+//) { $prefix = $1; } # then get the uri if ($$css =~ s/^url\(//) { $$css =~ s/^((?:$RE_STRING)|([^\)]*))\s*//; $uri = $1; $uri =~ s/^(?:'|")//; # " $uri =~ s/(?:'|")$//; # ' $$css =~ s/^\)//; } else { $$css =~ s/^($RE_STRING)//; $uri = $1; $uri =~ s/^(?:'|")//; #" $uri =~ s/(?:'|")$//; #' } # we must have a terminating token now if ($$css =~ s/^\s*;//) { # store the prefix-ns in our ns map my $map_prefix = $prefix; $map_prefix = '#default' unless $prefix; $sac->[_ns_map_]->{$map_prefix} = $uri; # throw a callback $prefix ||= ''; $sac->[_dh_]->namespace_declaration($prefix,$uri) if $sac->[_dh_can_]->{namespace_declaration}; } else { if ($$css =~ s/[^;]*;//) { $sac->[_eh_]->warning('Unknown token in namespace declaration'); } else { $sac->[_eh_]->fatal_error('Unknown token in namespace declaration'); } } # remove comments and run again $sac->parse_comments($css); } } #---------------------------------------------------------------------# #---------------------------------------------------------------------# # $sac->parse_medialist($string_ref) # parses a list of media values # returns that list as an arrayref #---------------------------------------------------------------------# sub parse_medialist { my $sac = shift; my $css = shift; # test for the right content and return a list if found return [] unless $$css =~ s/^\s*($RE_IDENT(?:\s*,\s*$RE_IDENT)*)//; return [map { $_ =~ s/^\s+//; $_ =~ s/\s+$//; $_; } split /,/, $1]; } #---------------------------------------------------------------------# #---------------------------------------------------------------------# # $sac->parse_comments($string_ref) # parses as many comments as there are at the beginning of the string #---------------------------------------------------------------------# sub parse_comments { my $sac = shift; my $css = shift; # we may have several comments in a row my $ws; while ($$css =~ s|^(\s*)/\*||) { $ws .= $1; if ($$css =~ s{^((?:(?:\\\\)|(?:\\[^\*])|(?:\\\*)|[^\\])*?)\*/}{}) { $sac->[_dh_]->comment($1) if $sac->[_dh_can_]->{comment}; } else { if ($$css =~ s/.*\*\///) { $sac->[_eh_]->warning('Strange comment token, guessing the parse'); } else { $sac->[_eh_]->fatal_error('Unterminated comment: unrecoverable'); } } } # we need to keep the whitespace around for certain # occurences of comments, it may be significant $$css = $ws . $$css if defined $ws and defined $$css; } #---------------------------------------------------------------------# #---------------------------------------------------------------------# # $sac->parse_selector_list($string_ref) # parses a list of selectors # returns an array ref of selectors #---------------------------------------------------------------------# sub parse_selector_list { my $sac = shift; my $css = shift; # this is a long and hairy process my @sels; $$css =~ s/^\s*//; while (1) { # we've reached the rule, or there isn't anything left to parse if ($$css =~ m/^\s*\{/) { if (!@sels) { @sels = ($sac->[_sf_]->create_element_selector(undef,undef)); } last; } elsif (!length $$css) { last; } # a simple selector elsif (my $sel = $sac->parse_simple_selector($css)) { push @sels, $sel; # delete the rest $sac->parse_comments($css); $$css =~ s/^\s*,\s*//; $sac->parse_comments($css); $$css =~ s/^\s*//; } # error else { # something wrong must have happened if ($$css =~ s/[^{]*//) { $sac->[_eh_]->warning('Unknown token in selector list'); } else { $sac->[_eh_]->fatal_error('Unknown token in selector list'); } } } return unless @sels; # this returns nothing, there were no selectors (needed for parse) return CSS::SAC::SelectorList->new(\@sels); } #---------------------------------------------------------------------# *CSS::SAC::parseSelectors = \&parse_selector_list; #---------------------------------------------------------------------# # $sac->parse_simple_selector($string_ref) # parses a simple selector # returns the selector object #---------------------------------------------------------------------# sub parse_simple_selector { my $sac = shift; my $css = shift; $$css =~ s/^\s*//; ### eat the content piece by piece my @tokens; my ($attr,$func,$args); while (1) { $sac->parse_comments($css); # end of selector if ($$css =~ m/^\s*(?:,|{)/ or !length $$css) { last; } # element elsif ($$css =~ s/^(?:($RE_IDENT|\*)?(\|))?($RE_IDENT|\*)//) { # create element selector my ($ns,$lname); $lname = ($3 eq '*')?undef:$3; if (defined $2 and $2 eq '|') { if (!$1) { $ns = ''; # |E matches elements in no namespace } elsif ($1 eq '*') { $ns = undef; # undef means all, '' means default } else { $ns = $sac->[_ns_map_]->{$1}; } } else { # E matches elements in the default namespace or # any namespace if no default namespace is declared $ns = $sac->[_ns_map_]->{'#default'} || undef; } # push it push @tokens, $sac->[_sf_]->create_element_selector($ns,$lname); } # hash id elsif ($$css =~ s/^#($RE_NAME)//) { push @tokens, $sac->[_cf_]->create_id_condition($1); } # dot class elsif ($$css =~ s/^\.($RE_IDENT)//) { push @tokens, $sac->[_cf_]->create_class_condition(undef,$1); } # CSS3 pseudo-elements elsif ($$css =~ s/^::($RE_IDENT)//) { push @tokens, $sac->[_sf_]->create_pseudo_element_selector(undef,$1); } # [attr] elsif ( (($attr,$$css,undef) = Text::Balanced::extract_bracketed($$css,q/[]'"/,qr/\s*/)) and length $attr ) { $attr =~ s/^\[\s*//; $attr =~ s/\s*\]$//; # get the attr lname and ns $attr =~ s/^(?:($RE_IDENT|\*)?(\|))?($RE_IDENT|\*)//; my ($ns,$lname); $lname = ($3 eq '*')?undef:$3; if (defined $2 and $2 eq '|') { if (!$1) { $ns = '' # [|a] matches attributes in no namespace; } elsif ($1 eq '*') { $ns = undef; # undef means all, '' means default } else { $ns = $sac->[_ns_map_]->{$1}; } } else { $ns = ''; # [a] is equivalent to [|a] } # if there's more, parse on my ($op,$value); if (length $attr) { if ($attr =~ s/^((?:\^|\$|\*|\~|\|)?=)//) { $op = $1; $attr =~ s/^(?:'|")//; #" $attr =~ s/(?:'|")$//; #" $value = $attr; } else { $sac->[_eh_]->warning('Unknown token in attribute condition'); if ($$css =~ s/[^;]*;//) { $sac->[_eh_]->warning('Unknown token in import rule'); } else { $sac->[_eh_]->fatal_error('Unknown token in import rule'); } } } # create the right condition my $acond; if (!$op or $op eq '=') { my $spec = (defined $value)?1:0; $acond = $sac->[_cf_]->create_attribute_condition($lname,$ns,$spec,$value); } elsif ($op eq '^=') { $acond = $sac->[_cf_]->create_starts_with_attribute_condition($lname,$ns,1,$value); } elsif ($op eq '$=') { $acond = $sac->[_cf_]->create_ends_with_attribute_condition($lname,$ns,1,$value); } elsif ($op eq '*=') { $acond = $sac->[_cf_]->create_contains_attribute_condition($lname,$ns,1,$value); } elsif ($op eq '~=') { $acond = $sac->[_cf_]->create_one_of_attribute_condition($lname,$ns,1,$value); } elsif ($op eq '|=') { $acond = $sac->[_cf_]->create_begin_hyphen_attribute_condition($lname,$ns,1,$value); } push @tokens, $acond; } # :pseudo() elsif ( ($args,$$css,$func) = Text::Balanced::extract_bracketed($$css,q/()'"/,qr/:$RE_IDENT/) and length $func ) { # cleanup the func and args $func =~ s/^://; $args =~ s/^\(\s*//; $args =~ s/\s*\)$//; $args =~ s/^(?:'|")//; #" $args =~ s/(?:'|")$//; #" # lang(lang_tag) if (lc ($func) eq 'lang') { push @tokens, $sac->[_cf_]->create_lang_condition($args); } # contains("text") elsif (lc ($func) eq 'contains') { push @tokens, $sac->[_cf_]->create_content_condition($args); } # not(selector) elsif (lc ($func) eq 'not') { my $sel = $sac->parse_simple_selector(\$args); # push @tokens, $sac->[_sf_]->create_negative_selector($sel); push @tokens, $sac->[_cf_]->create_negative_condition($sel); } # positional: nth-child, nth-last-child, nth-of-type, nth-last-of-type, elsif ($func =~ m/^nth-(last-)?((?:child)|(?:of-type))$/i) { my $pos = $args; my $of_type = (lc($2) eq 'of-type')?1:0; $pos = (lc($1) eq 'last-')?("-$pos"):($pos); # PositionalCondition will take care of parsing # the expressions, and will provide appropriate accessors push @tokens, $sac->[_cf_]->create_positional_condition($pos,$of_type,0); } # something else we don't know about else { push @tokens, $sac->[_cf_]->create_pseudo_class_condition(undef,$func); } } # :pseudo (not a function) elsif ($$css =~ s/^\:($RE_IDENT)//) { # root if (lc($1) eq 'root') { push @tokens, $sac->[_cf_]->create_is_root_condition; } # empty elsif (lc($1) eq 'empty') { push @tokens, $sac->[_cf_]->create_is_empty_condition; } # only-child elsif (lc($1) eq 'only-child') { my $fcond = $sac->[_cf_]->create_positional_condition(1,0,0); my $lcond = $sac->[_cf_]->create_positional_condition(-1,0,0); my $ocond = $sac->[_cf_]->create_and_condition($fcond,$lcond); push @tokens, $ocond; } # only-of-type elsif (lc($1) eq 'only-of-type') { my $fcond = $sac->[_cf_]->create_positional_condition(1,1,0); my $lcond = $sac->[_cf_]->create_positional_condition(-1,1,0); my $ocond = $sac->[_cf_]->create_and_condition($fcond,$lcond); push @tokens, $ocond; } # first-child elsif (lc($1) eq 'first-child') { push @tokens, $sac->[_cf_]->create_positional_condition(1,0,0); } # last-child elsif (lc($1) eq 'last-child') { push @tokens, $sac->[_cf_]->create_positional_condition(-1,0,0); } # first-of-type elsif (lc($1) eq 'first-of-type') { push @tokens, $sac->[_cf_]->create_positional_condition(1,1,0); } # last-of-type elsif (lc($1) eq 'last-of-type') { push @tokens, $sac->[_cf_]->create_positional_condition(-1,1,0); } # pseudo-elements in disguise elsif ( lc($1) eq 'first-line' or lc($1) eq 'first-letter' or lc($1) eq 'selection' or lc($1) eq 'before' or lc($1) eq 'after' ) { push @tokens, $sac->[_sf_]->create_pseudo_element_selector(undef,lc($1)); } # regular: link, visited, hover, active, focus, target, enabled, disabled, # checked, indeterminate else { push @tokens, $sac->[_cf_]->create_pseudo_class_condition(undef,$1); } } # combinators elsif ($$css =~ s/^\s*((?:\+|>|~))\s*//) { push @tokens, $1; } # special case empty combinator elsif ($$css =~ s/^\s+//) { push @tokens, ' '; } # an error else { if (s/^.*?(,|{)/$1/) { $sac->[_eh_]->warning('Unknown token in simple selector'); } else { $sac->[_eh_]->fatal_error('Unknown token in simple selector'); } } } ### process the tokens list # if the first token isn't an element selector then create a *|* one # evaling is lame, but it's the only test I could think of eval { $tokens[0]->SelectorType }; if ($@) { unshift @tokens, $sac->[_sf_]->create_element_selector(undef,undef); } # start looping over the tokens to reduce the list my $selector = shift @tokens; eval { $selector->SelectorType }; if ($@) { # this is a serious exception $sac->[_eh_]->fatal_error('Really weird input in simple selector'); } # here we need to check whether the next token is also a selector # if it is, we need to make an AND_CONDITION containing the two selectors # and to attach it to a universal selector # then we'll have to mix it into the $cond below. if (@tokens) { eval { $tokens[0]->SelectorType }; if (!$@) { my $and_cond = $sac->[_cf_]->create_and_condition($selector,shift @tokens); $selector = $sac->[_sf_]->create_element_selector(undef,undef); } } # create a conditional selector with all conditions my $cond = $sac->build_condition(\@tokens); if ($cond) { $selector = $sac->[_sf_]->create_conditional_selector($selector,$cond); } while (@tokens) { # here there should be a combinator or nothing my $comb = shift @tokens; if ($comb) { # pretty serious error $sac->[_eh_]->fatal_error('Really weird input in simple selector') if ref $comb; # get the next selector my $new_selector = shift @tokens; eval { $new_selector->SelectorType }; if ($@) { # last unless length $new_selector; if (ref $new_selector) { unshift @tokens, $new_selector; $new_selector = $sac->[_sf_]->create_element_selector(undef,undef); } else { # this is a serious exception (we don't know what's here) $sac->[_eh_]->fatal_error('Really weird input in simple selector: "' . $$css . '"'); } } # create a conditional selector with all conditions my $cond = $sac->build_condition(\@tokens); if ($cond) { $new_selector = $sac->[_sf_]->create_conditional_selector($new_selector,$cond); } # various possible combinators if ($comb eq ' ') { $selector = $sac->[_sf_]->create_descendant_selector($selector,$new_selector); } elsif ($comb eq '>') { $selector = $sac->[_sf_]->create_child_selector($selector,$new_selector); } elsif ($comb eq '+') { $selector = $sac->[_sf_]->create_direct_adjacent_selector( ELEMENT_NODE, $selector, $new_selector ); } elsif ($comb eq '~') { $selector = $sac->[_sf_]->create_indirect_adjacent_selector( ELEMENT_NODE, $selector, $new_selector ); } } } return $selector; } #---------------------------------------------------------------------# #---------------------------------------------------------------------# # $sac->build_condition(\@tokens) # helper to build conditions #---------------------------------------------------------------------# sub build_condition { my $sac = shift; my $tokens = shift; # get all conditions my @conditions; while (@$tokens) { # eval { $tokens->[0]->SelectorType }; # if (not $@) { # $sac->[_eh_]->fatal_error('Really weird input in simple selector'); # } last if ! ref $tokens->[0]; push @conditions, shift @$tokens; } # build a single condition out of the others my $cond; if (@conditions) { $cond = shift @conditions; for my $c (@conditions) { $cond = $sac->[_cf_]->create_and_condition($cond,$c); } } return $cond; } #---------------------------------------------------------------------# #---------------------------------------------------------------------# # $sac->parse_rule($string_ref) # parses a rule (with { and }) #---------------------------------------------------------------------# sub parse_rule { my $sac = shift; my $css = shift; return unless defined $$css; # remove { and }, and parse the content $$css =~ s/^\s*{//; $$css =~ s/}\s*$//; warn "[SAC] removed curlies\n" if DEBUG; $sac->parse_style_declaration($css); } #---------------------------------------------------------------------# *CSS::SAC::parseRule = \&parse_rule; #---------------------------------------------------------------------# # $sac->parse_style_declaration($string_ref) # same as parse_rule, but without the { and }. Cool for HTML, SVG... #---------------------------------------------------------------------# sub parse_style_declaration { my $sac = shift; my $css = shift; # take those prop-val one by one $sac->parse_comments($css); $$css =~ s/^\s*//; while (length $$css) { # the property $$css =~ s/^(-?$RE_IDENT)\s*//; # includes the - prefix my $prop = $1; $sac->parse_comments($css); # the separator $$css =~ s/^\s*:\s*//; $sac->parse_comments($css); # the value my $lu = $sac->parse_property_value($css); if (!@$lu) { last unless length $$css; if ($$css =~ s/[^;}]*(?:;|\})?//) { # this is a bit dodgy... $sac->[_eh_]->warning('Unknown token in style declaration'); } else { $sac->[_eh_]->fatal_error('Unknown token in style declaration: "' . $$css . '"'); } next; } $sac->parse_comments($css); # the priority my $prio = $sac->parse_priority($css); $sac->parse_comments($css); # the terminator $$css =~ s/^\s*;\s*//; # callback $prio ||= 0; $sac->[_dh_]->property($prop,$lu,$prio) if $sac->[_dh_can_]->{property}; # remove cruft $sac->parse_comments($css); $$css =~ s/^\s*//; } } #---------------------------------------------------------------------# *CSS::SAC::parseStyleDeclaration = \&parse_style_declaration; #---------------------------------------------------------------------# # $sac->parse_property_value($string_ref) # parses a value # returns an array ref of lexical units #---------------------------------------------------------------------# sub parse_property_value { my $sac = shift; my $css = shift; my $att = shift || 0; $$css =~ s/^\s*//; # parse it by value chunks my @lus; while (1) { my ($type,$text,$value); $sac->parse_comments($css); # exit conditions if (! length($$css) or $$css =~ m/^\s*(?:;|!)/ or ($att and $$css =~ s/^\s*(?:\))//)) { last; } # ops elsif ($$css =~ s{^\s*(,|/)\s*}{}) { $value = $1; if ($value eq ',') { $type = OPERATOR_COMMA; $text = 'comma'; } else { $type = OPERATOR_SLASH; $text = 'slash'; } } # special case empty op elsif ($$css =~ s{^\s+}{}) { next; } # inherit elsif ($$css =~ s/^inherit//) { $type = INHERIT; $text = 'inherit'; $value = undef; } # lengths and assoc elsif ($$css =~ s/^((?:\+|-)?$RE_NUM) (em|ex|px|cm|mm|in|pt|pc|deg|rad|grad|ms|s|hz|khz|%) //xi) { $value = $1; $text = lc $2; $type = $DIM_MAP{$text}; } # dimension elsif ($$css =~ s/^((?:\+|-)?$RE_NUM)($RE_IDENT)//) { $value = $1; $text = lc $2; $type = DIMENSION; } # number elsif ($$css =~ s/^((?:\+|-)?$RE_NUM)//) { $value = $1; $text = 'number'; if ($value =~ m/\./) { $type = REAL; } else { $type = INTEGER; } } # unicode range elsif ($$css =~ s/^($RE_RANGE)//) { $value = $1; $text = 'unicode-range'; $type = UNICODERANGE; } # hex rgb elsif ($$css =~ s/^#([0-9a-fA-F]{6}|[0-9a-fA-F]{3})//) { $value = $1; $text = '#'; $type = RGBCOLOR; } # functions # elsif ( # ($value,$$css,$text) = Text::Balanced::extract_bracketed($$css,q/()'"/,qr/$RE_IDENT/) # and # length $text # ) { elsif ($$css =~ s/^($RE_IDENT)\(//) { # cleanup the func and args # $text = lc $text; # $value =~ s/^\(\s*//; # $value =~ s/\s*\)$//; # $value =~ s/^(?:"|')//; #" # $value =~ s/(?:"|')$//; #" $text = lc $1; $value = $sac->parse_property_value($css, 1); # get the appropriate type if ($FUNC_MAP{$text}) { $type = $FUNC_MAP{$text}; } else { $type = FUNCTION; } } # ident elsif ($$css =~ s/^($RE_IDENT)//) { $value = $1; $text = 'ident'; $type = IDENT; } # string elsif ($$css =~ s/^($RE_STRING)//) { $value = $1; $value =~ s/^(?:"|')//; #" $value =~ s/(?:"|')$//; #" $text = 'string'; $type = STRING_VALUE; } # error else { return []; } # add a lu push @lus, CSS::SAC::LexicalUnit->new($type,$text,$value); } return \@lus; } #---------------------------------------------------------------------# *CSS::SAC::parsePropertyValue = \&parse_property_value; #---------------------------------------------------------------------# # $sac->parse_priority($string_ref) # parses a priority # returns true if there is a priority value there #---------------------------------------------------------------------# sub parse_priority { my $sac = shift; my $css = shift; return 1 if $$css =~ s/^\s*!\s*important//i; } #---------------------------------------------------------------------# *CSS::SAC::parsePriority = \&parse_priority; # # # # ### Parsing Methods ################################################### ### Default Error Handler ############################################# # # # # # This is pretty much a non package, it is just there to provide the # default error handler. package CSS::SAC::DefaultErrorHandler; sub new { return bless [], __PACKAGE__; } sub warning { warn "[warning] $_[1] (line " . (caller)[2] . ")"; } sub error { warn "[error] $_[1] (line " . (caller)[2] . ")"; } sub fatal_error { die "[fatal] $_[1] (line " . (caller)[2] . ")"; } # # # # ### Default Error Handler ############################################# 1; =pod =head1 NAME CSS::SAC - SAC CSS parser =head1 SYNOPSIS use CSS::SAC qw(); use My::SACHandler (); use My::SACErrors (); my $doc_handler = My::SACHandler->new; my $err_handler = My::SACErrors->new; my $sac = CSS::SAC->new({ DocumentHandler => $doc_handler, ErrorHandler => $err_handler, }); # generate a stream of events $sac->parse({ filename => 'foo.css' }); =head1 DESCRIPTION SAC (Simple API for CSS) is an event-based API much like SAX for XML. If you are familiar with the latter, you should have little trouble getting used to SAC. More information on SAC can be found online at http://www.w3.org/TR/SAC. CSS having more constructs than XML, core SAC is still more complex than core SAX. However, if you need to parse a CSS style sheet, SAC probably remains the easiest way to get it done. Most of the spec is presently implemented. The following interfaces are not yet there: Locator, CSSException, CSSParseException, ParserFactory. They may or may not be implemented at a later date (the most likely candidates are the exception classes, for which I still have to find an appropriate model). Some places differ slightly from what is in the spec. I have tried to keep those to a justified minimum and to flag them correctly. =head2 the CSS::SAC module itself The Parser class doesn't exist separately, it's defined in CSS::SAC. It doesn't expose the locale interface because we don't localize errors (yet). It also doesn't have C but rather C, which is more consistent with other Perl parsing interfaces. I have added the C callback to the DocumentHandler interface. There are valid reasons why it wasn't there (it can be trusted only ever so often, and one should look at the actual encoding instead) but given that it's a token in the grammar, I believe that there should still be a way to access it. =head1 METHODS =over 4 =item * CSS::SAC->new(\%options) or $sac->new(\%options) Constructs a new parser object. The options can be: - ConditionFactory and SelectorFactory the factory classes used to build selector and condition objects. See CSS::SAC::{Condition,Selector}Factory for more details on the interfaces those classes must expose. - DocumentHandler and ErrorHandler the handler classes used as sinks for the event stream received from a SAC Driver. See CSS::SAC::{Document,Error}Factory for more details on the interfaces those classes must expose. Methods will be called on whatever it is you pass as values to those options. Thus, you may pass in objects as well as class names (I haven't tested this yet, there may be a problem). NOTE: an error handler should implement all callbacks, while a document handler may only implement those it is interested in. There is a default error handler (which dies and warns depending on the type of error) but not default document handler. =item * $sac->ParserVerion or $sac->getParserVerion Returns the supported CSS version. Requesting this parser's ParserVersion will return the string 'CSS3'. While that is (modulo potential bugs of course) believed to be generally true, several caveats apply: To begin with, CSS3 has been modularised, and various modules are at different stages of development. Evolving modules may require evolving this parser. I hesitated between making ParserVersion return CSS2, CSS3-pre, or simply CSS3. I chose the latter because I intend to update it as I become aware of the necessity of changes to accommodate new CSS3 stuff, and because it already supports a number of constructs alien to CSS2 (of which namespaces is imho important enough to justify a CSS3 tag). If you are aware of incompatibilities, please contact me. More importantly, it is now considered wrong for a parser to return CSSx as its version and instead it is expected to return an uri corresponding to the uri of the CSS version that it supports. However, there is no uri for CSS3, but instead one uri per module. While this issue hasn't been resolved by the WG, I will stick to returning CSS3. However, B in the future, so please avoid relying on it. =item * $cf = $sac->ConditionFactory =item * $sac->ConditionFactory($cf) or $sac->setConditionFactory($cf) =item * $cf = $sac->SelectorFactory =item * $sac->SelectorFactory($sf) or $sac->setSelectorFactory($sf) =item * $cf = $sac->DocumentHandler =item * $sac->DocumentHandler($dh) or $sac->setDocumentHandler($dh) =item * $cf = $sac->ErrorHandler =item * $sac->ErrorHandler($eh) or $sac->setErrorHandler($eh) get/set the ConditionFactory, SelectorFactory, DocumentHandler, ErrorHandler that we use =item * $sac->parse(\%options) =item * $sac->parseStyleSheet(\%options) parses a style sheet and sends events to the defined handlers. The options that you can use are: =over 8 =item * string =item * ioref =item * filename passes either a string, an open filehandle, or a filename to read the stylesheet from =item * embedded tells whether the stylesheet is embedded or not. This is most of the time useless but it will influence the interpretation of @charset rules. The latter being forbidden in embedded style sheets they will generate an ignorable_style_sheet event instead of a charset event if embedded is set to a true value. =back =item * $sac->parse_rule($string_ref) =item * $sac->parseRule($string_ref) parses a rule (with { and }). You probably don't need this one. It returns nothing, but generates the events. =item * $sac->parse_style_declaration($string_ref) =item * $sac->parseStyleDeclaration($string_ref) same as parse_rule, but without the { and }. This is useful when you want to parse style declarations embedded using style attributes in HTML, SVG, etc... It returns nothing, but generates the events. =item * $sac->parse_property_value($string_ref) =item * $sac->parsePropertyValue($string_ref) parses a property value and returns an array ref of lexical units (see CSS::SAC::LexicalUnit) =item * $sac->parse_priority($string_ref) =item * $sac->parsePriority($string_ref) parses a priority and returns true if there is a priority value there. =item * $sac->parse_selector_list($string_ref) =item * $sac->parseSelectors($string_ref) parses a list of selectors and returns an array ref of selectors =back =head1 OTHER METHODS Methods in this section are of relevance mostly to the internal workings of the parser. I document them here but I don't really consider them part of the interface, and thus may change them if need be. If you are using them directly tell me about it and I will "officialize" them. These have no Java style equivalent. =over 4 =item * $sac->parse_charset($string_ref) parses a charset. It returns nothing, but generates the events. =item * $sac->parse_imports($string_ref) parses import rules. It returns nothing, but generates the events. =item * $sac->parse_namespace_declarations($string_ref) parses ns declarations. It returns nothing, but generates the events. =item * $sac->parse_medialist($string_ref) parses a list of media values and returns that list as an arrayref =item * $sac->parse_comments($string_ref) parses as many comments as there are at the beginning of the string. It returns nothing, but generates the events. =item * $sac->parse_simple_selector($string_ref) parses a simple selector and returns the selector object =item * $sac->build_condition(\@tokens) helper to build conditions (you probably don't want to use this at all...) =back =head1 CSS::SAC::DefaultErrorHandler This is pretty much a non package, it is just there to provide the default error handler if you are too lazy to provide one yourself. All it does is pretty simple. There are three error levels: C, C, and C. What it does is warn on the two first and die on the last. Yes, it ain't fancy but then you can plug anything more intelligent into it at any moment. =head1 CSS3 ISSUES One problem is that I have modelled this parser after existing SAC implementations that do not take into account as much of CSS3 as it is possible to. Some parts of that are trivial, and I have provided support on my own in this module. Other parts though are more important and I believe that coordination between the SAC authors would be beneficial on these points (once the relevant CSS3 modules will have moved to REC). =over 4 =item * new attribute conditions CSS3-selectors introduces a bunch of new things, including new attribute conditions ^= (starts with), $= (ends with) and *= (contains). There are no corresponding constants for conditions, so I suggested SAC_STARTS_WITH_ATTRIBUTE_CONDITION, SAC_ENDS_WITH_ATTRIBUTE_CONDITION, SAC_CONTAINS_ATTRIBUTE_CONDITION. Note that these constants have been added, together with the corresponding factory methods. However, they will remain undocumented and considered experimental until some consensus is reached on the matter. =item * :root condition The :root token confuses some people because they think it is equivalent to XPath's / root step. That is not so. XPath's root selects "above" the document element. CSS's :root tests whether an element is the document element, there is nothing above a document element. Thus :root on its own is equivalent to *:root. It's a condition, not a selector. E:root matches the E element that is also the document element (if there is one). Thus, SAC_ROOT_NODE_SELECTOR does not apply and we need a new SAC_IS_ROOT_CONDITION constant. Note that this constant has been added, together with the corresponding factory method. However, it will remain undocumented and considered experimental until some consensus is reached on the matter. =item * other new pseudo-classes :empty definitely needs a constant too I'd say. Note that this constant has been added, together with the corresponding factory method. However, it will remain undocumented and considered experimental until some consensus is reached on the matter. =item * an+b syntax in positional conditions There is new syntax that allows for very customisable positional selecting. PositionalCondition needs to be updated to deal with that. =back =head1 BUGS - the problem with attaching pseudo-elements to elements as coselectors. I'm not sure which is the right representation. Don't forget to update CSS::SAC::Writer too so that it writes it out properly. - see Bjoern's list =head1 ACKNOWLEDGEMENTS - Bjoern Hoehrmann for his immediate reaction and much valuable feedback and suggestions. It's certainly much harder to type with all those fingers that all those Mafia padres have cut off, but at least I get work done much faster than before. And also those nasty bugs he kindly uncovered. - Steffen Goeldner for spotting bugs and providing patches. - Ian Hickson for very very very kind testing support, and all sorts of niceties. - Manos Batsis for starting a very long discussion on this that eventually deviated into other very interesting topics, and for giving me some really weird style sheets to feed into this module. - Simon St.Laurent for posting this on xmlhack.com and thus pointing a lot of people to this module (as seen in my referer logs). And of course all the other people that have sent encouragement notes and feature requests. =head1 TODO - add a pointer to the SAC W3 page - create the Exception classes - update PositionalCondition to include logic that can normalize the an+n notation and add a method that given a position will return a boolean indicating whether it matches the condition. - add stringify overloading to all classes so that they may be printed directly - have parser version return an overloaded object that circumvents the current problems - add docs on how to write a {Document,Error}Handler, right now there is example code in Writer, but it isn't all clearly explained. - find a way to make the '-' prefix to properties optional - add a filter that switches events to spec names, and that can be used directly through an option - add DOM-like hasFeature support (in view of SAC 3) - prefix all constants with SAC_. Keep the old ones around for a few versions, importable with :old-constants. - update docs =head1 AUTHOR Robin Berjon This module is licensed under the same terms as Perl itself. =cut