package MARC::Indexer::Config;
use strict;
use warnings;
use constant SPACE => 1 << 0;
use constant PLAIN => 1 << 1;
use constant QUOTE => 1 << 2;
use constant GROUP => 1 << 3;
use constant COMMA => 1 << 4;
use constant SEMIC => 1 << 5;
use constant ASTER => 1 << 6;
use constant NEWLN => 1 << 7;
use constant REGEX => 1 << 8;
sub new {
my ($cls, $f) = @_;
bless {
'file' => $f,
}, $cls;
}
sub parse {
my ($self, $f) = @_;
$f = $self->{'file'} || die "No file given"
if !defined $f;
my $fh;
if (ref $f) {
$fh = $f;
}
else {
open $fh, '<', $f or die "Can't open file $f: $!";
}
local $/;
my $src = $self->{'source'} = <$fh>;
$src =~ s/^\s*#.*//mg;
my @tok = _tokenize($src);
my %term;
$self->{'tokens'} = \@tok;
$self->{'terms'} = \%term;
while (@tok) {
_wsp(\@tok);
_docid(\@tok, \$self->{'docid'})
|| _defaults(\@tok, $self)
|| _term(\@tok, \%term)
|| _fatal("Unparseable", $tok[0][2]);
}
return $self;
}
sub _wspafter {
my ($tok) = @_;
my $n = 0;
$n++, pop @$tok while @$tok && $tok->[-1][0] == SPACE;
$n;
}
sub _wsp {
my ($tok) = @_;
my $n = 0;
$n++, shift @$tok while @$tok && $tok->[0][0] == SPACE;
$n;
}
sub _last {
my ($tok, $typ, $val) = @_;
_wspafter($tok);
return if !@$tok
|| defined($typ) && !($tok->[-1][0] & $typ)
|| ref($val) && $tok->[-1][1] !~ $val
|| defined($val) && $tok->[-1][1] ne $val;
return @{ pop @$tok };
}
sub _next {
my ($tok, $typ, $val) = @_;
_wsp($tok);
return if !@$tok
|| defined($typ) && !($tok->[0][0] & $typ)
|| ref($val) && $tok->[0][1] !~ $val
|| defined($val) && $tok->[0][1] ne $val;
return @{ shift @$tok };
}
sub _docid {
# docid FIELD;
my ($tok, $ref) = @_;
return if @$tok < 3;
my ($typ, $val, $lno);
($typ, $val, $lno) = _next($tok, PLAIN, 'docid') or return;
($typ, $val, $lno) = _next($tok, PLAIN) or return;
$$ref = $val;
($typ, $val, $lno) = _next($tok, SEMIC) or fatal("Junk after docid declaration");
}
sub _defaults {
# defaults { foo; bar; }
my ($tok, $self) = @_;
return if @$tok < 3;
my ($typ, $val, $lno, %def);
($typ, $val, $lno) = _next($tok, PLAIN, 'defaults') or return;
$self->{'defaults'} = \%def;
_term_body($tok, \%def);
1;
}
sub _term {
# term * { foo; bar; }
# term foo "bar" { baz; qux; }
my ($tok, $terms) = @_;
return if @$tok < 4;
my ($typ, $val, $lno);
($typ, $val, $lno) = _next($tok, PLAIN, 'term') or return;
($typ, $val, $lno) = _next($tok, ASTER|PLAIN ) or return;
my %term;
$terms->{$val} = \%term;
$term{'description'} = $val if ($typ, $val, $lno) = _next($tok, QUOTE);
_term_body($tok, \%term);
1;
}
sub _term_body {
my ($tok, $term) = @_;
_wsp($tok);
my ($typ, $val, $lno);
($typ, $val, $lno) = _next($tok, GROUP, '{') or die;
my @prop;
while (1) {
_wsp($tok);
($typ, $val, $lno) = _next($tok) or die;
if ($typ == GROUP && $val eq '}') {
if (@prop) {
my ($pkey, $pval) = _mkprop(@prop);
$term->{$pkey} = $pval;
}
last;
}
elsif ($typ == SEMIC) {
next if !@prop;
my ($pkey, $pval) = _mkprop(@prop);
$term->{$pkey} = $pval;
@prop = ();
}
elsif (!@prop && $typ == PLAIN) {
push @prop, $val;
}
elsif (@prop) {
push @prop, [$typ, $val];
}
else {
die;
}
}
$term->{'norm'} = [ list($term->{'norm'}) ];
$term->{'prefix'} = [ list($term->{'prefix'}) ];
$term->{'match'} = qr/$term->{'match'}/ if defined $term->{'match'};
}
sub _mkprop {
my $pkey = shift;
if (!@_) {
return ($pkey, '') if $pkey =~ s/^no//;
return ($pkey, 1);
}
if (_next(\@_, GROUP, '[')) {
_last(\@_, GROUP, ']') or _fatal("Unclosed list");
my @list;
while (1) {
_wsp(\@_);
my ($typ, $val, $lno) = _next(\@_, PLAIN|QUOTE)
or _fatal("Unrecognized list element");
push @list, $val;
_next(\@_, COMMA) or last;
}
_wsp(\@_);
_fatal("Junk at end of list") if @_;
return ($pkey, \@list);
}
return ($pkey, join(' ', map { $_->[1] } @_));
}
sub _fatal {
my ($err, $lno) = @_;
print STDERR "FATAL: $err at line $lno\n";
exit 2;
}
sub _tokenize {
local $_ = shift;
my @tok;
my $lno = 1;
while (!/\G\z/gc) {
$lno++, next if /\G\n/gc;
push @tok,
/\G(\s+)/gc ? [ SPACE, $1, $lno ] :
/\G([:.\w][-:.\$\/\w]*)/gc ? [ PLAIN, $1, $lno ] :
/\G"((?:\\.|[^"])*)"/gc ? [ QUOTE, _unesc($1), $lno ] :
/\G(,)/gc ? [ COMMA, $1, $lno ] :
/\G(;)/gc ? [ SEMIC, $1, $lno ] :
/\G(\*)/gc ? [ ASTER, $1, $lno ] :
/\G([(){}\[\]])/gc ? [ GROUP, $1, $lno ] :
/\G\/((?:\\.|[^\/])+)\//gc ? [ REGEX, $1, $lno ] :
_fatal("Unrecognized", $lno)
;
}
return @tok;
}
sub _unesc {
local $_ = shift;
s/\\(.)|(.)/defined($1) ? $1 : $2/eg;
return $_;
}
sub list {
my ($v) = @_;
return if !defined $v;
my $r = ref $v;
return @$v if $r eq 'ARRAY';
return %$v if $r eq 'HASH';
return $v;
}
1;