use strict;
use warnings;
package KinoSearch::Analysis::PolyAnalyzer;
use KinoSearch::Util::ToolSet;
use base qw( KinoSearch::Analysis::Analyzer );
our %instance_vars = (
# inherited
language => '',
# constructor params / members
analyzers => undef,
);
use KinoSearch::Analysis::LCNormalizer;
use KinoSearch::Analysis::Tokenizer;
use KinoSearch::Analysis::Stemmer;
sub init_instance {
my $self = shift;
my $language = $self->{language} = lc( $self->{language} );
# create a default set of analyzers if language was specified
if ( !defined $self->{analyzers} ) {
confess("Must specify either 'language' or 'analyzers'")
unless $language;
$self->{analyzers} = [
KinoSearch::Analysis::LCNormalizer->new( language => $language ),
KinoSearch::Analysis::Tokenizer->new( language => $language ),
KinoSearch::Analysis::Stemmer->new( language => $language ),
];
}
}
sub analyze_batch {
my ( $self, $token_batch ) = @_;
# iterate through each of the analyzers in order
$token_batch = $_->analyze_batch($token_batch)
for @{ $self->{analyzers} };
return $token_batch;
}
sub analyze_text {
my $self = $_[0];
my $analyzers = $self->{analyzers};
if ( !@$analyzers ) {
return KinoSearch::Analysis::TokenBatch->new( text => $_[1] );
}
elsif ( @$analyzers == 1 ) {
return $analyzers->[0]->analyze_text( $_[1] );
}
else {
my $batch = $analyzers->[0]->analyze_text( $_[1] );
$batch = $_->analyze_batch($batch)
for @{$analyzers}[ 1 .. $#$analyzers ];
return $batch;
}
}
sub analyze_field {
my $analyzers = $_[0]->{analyzers};
if ( !@$analyzers ) {
return KinoSearch::Analysis::TokenBatch->new(
text => $_[1]->{ $_[2] } );
}
elsif ( @$analyzers == 1 ) {
return $analyzers->[0]->analyze_field( $_[1], $_[2] );
}
else {
my $batch = $analyzers->[0]->analyze_field( $_[1], $_[2] );
$batch = $_->analyze_batch($batch)
for @{$analyzers}[ 1 .. $#$analyzers ];
return $batch;
}
}
1;
__END__
=head1 NAME
KinoSearch::Analysis::PolyAnalyzer - Multiple analyzers in series.
=head1 SYNOPSIS
my $analyzer = KinoSearch::Analysis::PolyAnalyzer->new(
language => 'es',
);
# or...
my $lc_normalizer = KinoSearch::Analysis::LCNormalizer->new;
my $tokenizer = KinoSearch::Analysis::Tokenizer->new;
my $stemmer = KinoSearch::Analysis::Stemmer->new( language => 'en' );
my $polyanalyzer = KinoSearch::Analysis::PolyAnalyzer->new(
analyzers => [
$lc_normalizer,
$whitespace_tokenizer,
$stemmer,
],
);
=head1 DESCRIPTION
A PolyAnalyzer is a series of Analyzers -- objects which inherit from
L<KinoSearch::Analysis::Analyzer> -- each of which will be called upon to
"analyze" text in turn. You can either provide the Analyzers yourself, or you
can specify a supported language, in which case a PolyAnalyzer consisting of
an L<LCNormalizer|KinoSearch::Analysis::LCNormalizer>, a
L<Tokenizer|KinoSearch::Analysis::Tokenizer>, and a
L<Stemmer|KinoSearch::Analysis::Stemmer> will be generated for you.
Supported languages:
en => English,
da => Danish,
de => German,
es => Spanish,
fi => Finnish,
fr => French,
it => Italian,
nl => Dutch,
no => Norwegian,
pt => Portuguese,
ru => Russian,
sv => Swedish,
=head1 METHODS
=head2 new()
Constructor. Takes two possible hash-style parameters. If the parameter
C<analyzers> is specified, it will override C<language> and no attempt will be
made to generate a default set of Analyzers.
=over
=item
B<language> - Must be an ISO code from the list of supported languages.
=item
B<analyzers> - Must be an arrayref. Each element in the array must inherit
from KinoSearch::Analysis::Analyzer. The order of the analyzers matters.
Don't put a Stemmer before a Tokenizer (can't stem whole documents or
paragraphs -- just individual words), or a Stopalizer after a Stemmer (stemmed
words, e.g. "themselv", will not appear in a stoplist). In general, the
sequence should be: normalize, tokenize, stopalize, stem.
=back
=head1 COPYRIGHT
Copyright 2005-2007 Marvin Humphrey
=head1 LICENSE, DISCLAIMER, BUGS, etc.
See L<KinoSearch> version 0.20.
=cut