use strict;
use warnings;
package KinoSearch::Highlight::Highlighter;
use KinoSearch::Util::ToolSet;
use base qw( KinoSearch::Util::Class );
our %instance_vars = (
# members
default_encoder => undef,
default_formatter => undef,
specs => undef,
terms => undef,
limit => undef,
token_re => qr/\b\w+(?:'\w+)?\b/,
);
BEGIN { __PACKAGE__->ready_get_set(qw( terms )) }
use KinoSearch::Highlight::SimpleHTMLFormatter;
use KinoSearch::Highlight::SimpleHTMLEncoder;
sub init_instance {
my $self = shift;
$self->{specs} = [];
$self->{terms} = [];
# create shared default encoder and formatter
$self->{default_encoder} = KinoSearch::Highlight::SimpleHTMLEncoder->new;
$self->{default_formatter}
= KinoSearch::Highlight::SimpleHTMLFormatter->new(
pre_tag => '<strong>',
post_tag => '</strong>',
);
}
my %add_spec_defaults = (
field => undef,
excerpt_length => 200,
formatter => undef,
encoder => undef,
name => undef,
);
sub add_spec {
my $self = shift;
confess kerror() unless verify_args( \%add_spec_defaults, @_ );
my %spec = ( %add_spec_defaults, @_ );
confess("Missing required param 'field'")
unless defined $spec{field};
# assume HTML
if ( !defined $spec{encoder} ) {
$spec{encoder} = $self->{default_encoder};
}
if ( !defined $spec{formatter} ) {
$spec{formatter} = $self->{default_formatter};
}
# scoring window is 1.66 * excerpt_length, with the loc in the middle
$spec{limit} = int( $spec{excerpt_length} / 3 );
# use field name as key unless specified
$spec{name} = $spec{field} unless defined $spec{name};
push @{ $self->{specs} }, \%spec;
}
sub generate_excerpts {
my ( $self, $doc, $doc_vector ) = @_;
# create an excerpt for each spec
my %excerpts;
for my $spec ( @{ $self->{specs} } ) {
$excerpts{ $spec->{name} }
= $self->_gen_excerpt( $doc, $doc_vector, $spec );
}
return \%excerpts;
}
sub _gen_excerpt {
my ( $self, $doc, $doc_vector, $spec ) = @_;
my $excerpt_field = $spec->{field};
my $excerpt_length = $spec->{excerpt_length};
my $limit = $spec->{limit};
my $token_re = $self->{token_re};
# retrieve the text from the chosen field
my $text = $doc->{$excerpt_field};
return unless defined $text;
my $text_length = length $text;
my $orig_length = $text_length;
return '' unless $text_length;
# determine the rough boundaries of the excerpt
my $posits = $self->_starts_and_ends( $doc_vector, $excerpt_field );
my $best_location = $self->_calc_best_location( $posits, $limit );
my $top = $best_location - $limit;
# expand the excerpt if the best location is near the end
$top
= $text_length - $excerpt_length < $top
? $text_length - $excerpt_length
: $top;
# if the best starting point is the very beginning, cool...
if ( $top <= 0 ) {
$top = 0;
}
# ... otherwise ...
else {
# lop off $top characters
$text = substr( $text, $top );
# try to start the excerpt at a sentence boundary
if ($text =~ s/
\A
(
.{0,$limit}?
\.\s+
)
//xsm
)
{
$top += length($1);
}
# no sentence boundary, so we'll need an ellipsis
else {
# skip past possible partial tokens, prepend an ellipsis
if ($text =~ s/
\A
(
.{0,$limit}? # don't go outside the window
$token_re # match possible partial token
.*? # ... and any junk following that token
)
(?=$token_re) # just before the start of a full token...
/... /xsm # ... insert an ellipsis
)
{
$top += length($1);
$top -= 4 # three dots and a space
}
}
}
# remove possible partial tokens from the end of the excerpt
$text = substr( $text, 0, $excerpt_length + 1 );
if ( length($text) > $excerpt_length ) {
my $extra_char = chop $text;
# if the extra char wasn't part of a token, we aren't splitting one
if ( $extra_char =~ $token_re ) {
$text =~ s/$token_re$//; # if this is unsuccessful, that's fine
}
}
# if the excerpt doesn't end with a full stop, end with an an ellipsis
if ( $orig_length > length($text) and $text !~ /\.\s*\Z/xsm ) {
$text =~ s/\W+\Z//xsm;
while ( length($text) + 4 > $excerpt_length ) {
my $extra_char = chop $text;
if ( $extra_char =~ $token_re ) {
$text =~ s/\W+$token_re\Z//xsm; # if unsuccessful, that's fine
}
$text =~ s/\W+\Z//xsm;
}
$text .= ' ...';
}
# remap locations now that we know the starting and ending bytes
$text_length = length($text);
my @relative_starts = map { $_->[0] - $top } @$posits;
my @relative_ends = map { $_->[1] - $top } @$posits;
# get rid of pairs with at least one member outside the text
while ( @relative_starts and $relative_starts[0] < 0 ) {
shift @relative_starts;
shift @relative_ends;
}
while ( @relative_ends and $relative_ends[-1] > $text_length ) {
pop @relative_starts;
pop @relative_ends;
}
# insert highlight tags
my $formatter = $spec->{formatter};
my $encoder = $spec->{encoder};
my $output_text = '';
my ( $start, $end, $last_start, $last_end ) = ( undef, undef, 0, 0 );
while (@relative_starts) {
$end = shift @relative_ends;
$start = shift @relative_starts;
$output_text .= $encoder->encode(
substr( $text, $last_end, $start - $last_end ) );
$output_text .= $formatter->highlight(
$encoder->encode( substr( $text, $start, $end - $start ) ) );
$last_end = $end;
}
$output_text .= $encoder->encode( substr( $text, $last_end ) );
return $output_text;
}
=for comment
Find all points in the text where a relevant term begins and ends. For terms
that are part of a phrase, only include points that are part of the phrase.
=cut
sub _starts_and_ends {
my ( $self, $doc_vector, $excerpt_field ) = @_;
my @posits;
my %done;
TERM: for my $term ( @{ $self->{terms} } ) {
if ( a_isa_b( $term, 'KinoSearch::Index::Term' ) ) {
next if $term->get_field ne $excerpt_field;
my $term_text = $term->get_text;
next TERM if $done{$term_text};
$done{$term_text} = 1;
# add all starts and ends
my $term_vector
= $doc_vector->term_vector( $excerpt_field, $term_text );
next TERM unless defined $term_vector;
my $starts = $term_vector->get_start_offsets;
my $ends = $term_vector->get_end_offsets;
while (@$starts) {
push @posits, [ shift @$starts, shift @$ends, 1 ];
}
}
# intersect positions for phrase terms
else {
# if not a Term, it's an array of Terms representing a phrase
next if $term->[0]->get_field ne $excerpt_field;
my @term_texts = map { $_->get_text } @$term;
my $phrase_text = join( ' ', @term_texts );
next TERM if $done{$phrase_text};
$done{$phrase_text} = 1;
my $posit_vec = KinoSearch::Util::BitVector->new;
my @term_vectors
= map { $doc_vector->term_vector( $excerpt_field, $_ ) }
@term_texts;
# make sure all terms are present
next TERM unless scalar @term_vectors == scalar @term_texts;
my $i = 0;
for my $tv (@term_vectors) {
# one term missing, ergo no phrase
next TERM unless defined $tv;
if ( $i == 0 ) {
$posit_vec->set( @{ $tv->get_positions } );
}
else {
# filter positions using logical "and"
my $other_posit_vec = KinoSearch::Util::BitVector->new;
$other_posit_vec->set(
grep { $_ >= 0 }
map { $_ - $i } @{ $tv->get_positions }
);
$posit_vec->AND($other_posit_vec);
}
$i++;
}
# add only those starts/ends that belong to a valid position
my $tv_start_positions = $term_vectors[0]->get_positions;
my $tv_starts = $term_vectors[0]->get_start_offsets;
my $tv_end_positions = $term_vectors[-1]->get_positions;
my $tv_ends = $term_vectors[-1]->get_end_offsets;
$i = 0;
my $j = 0;
my $last_token_index = $#term_vectors;
for my $valid_position ( @{ $posit_vec->to_arrayref } ) {
while ( $i <= $#$tv_start_positions ) {
last if ( $tv_start_positions->[$i] >= $valid_position );
$i++;
}
$valid_position += $last_token_index;
while ( $j <= $#$tv_end_positions ) {
last if ( $tv_end_positions->[$j] >= $valid_position );
$j++;
}
push @posits,
[ $tv_starts->[$i], $tv_ends->[$j], scalar @$term ];
$i++;
$j++;
}
}
}
@posits = sort { $a->[0] <=> $b->[0] || $b->[1] <=> $a->[1] } @posits;
my @unique;
my $last = ~0;
for (@posits) {
push @unique, $_ if $_->[0] != $last;
$last = $_->[0];
}
return \@unique;
}
# Select the character position representing the greatest keyword density.
sub _calc_best_location {
my ( $self, $posits, $limit ) = @_;
my $window = $limit * 2;
# if there aren't any keywords, take the excerpt from the top of the text
return 0 unless @$posits;
my %locations = map { ( $_->[0] => 0 ) } @$posits;
# if another keyword is in close proximity, add to the loc's score
for my $loc_index ( 0 .. $#$posits ) {
# only score positions that are in range
my $location = $posits->[$loc_index][0];
my $other_loc_index = $loc_index - 1;
while ( $other_loc_index > 0 ) {
my $diff = $location - $posits->[$other_loc_index][0];
last if $diff > $window;
my $num_tokens_at_pos = $posits->[$other_loc_index][2];
$locations{$location}
+= ( 1 / ( 1 + log($diff) ) ) * $num_tokens_at_pos;
--$other_loc_index;
}
$other_loc_index = $loc_index + 1;
while ( $other_loc_index <= $#$posits ) {
my $diff = $posits->[$other_loc_index] - $location;
last if $diff > $window;
my $num_tokens_at_pos = $posits->[$other_loc_index][2];
$locations{$location}
+= ( 1 / ( 1 + log($diff) ) ) * $num_tokens_at_pos;
++$other_loc_index;
}
}
# return the highest scoring position
return ( sort { $locations{$b} <=> $locations{$a} } keys %locations )[0];
}
1;
__END__
=head1 NAME
KinoSearch::Highlight::Highlighter - Create and highlight excerpts.
=head1 SYNOPSIS
my $highlighter = KinoSearch::Highlight::Highlighter->new;
$highlighter->add_spec( field => 'body' );
$hits->create_excerpts( highlighter => $highlighter );
=head1 DESCRIPTION
KinoSearch's Highlighter can be used to select relevant snippets from a
document, and to surround search terms with highlighting tags. It handles
both stems and phrases correctly and efficiently, using special-purpose data
generated at index-time.
=head1 METHODS
=head2 new
my $highlighter = KinoSearch::Highlight::Highlighter->new;
Constructor. Takes no arguments.
=head2 add_spec
$highlighter->add_spec(
field => 'content', # required
excerpt_length => 150, # default: 200
formatter => $formatter, # default: SimpleHTMLFormatter
encoder => $encoder, # default: SimpleHTMLEncoder
name => 'blurb' # default: value of field param
);
Add a spec for a single highlighted excerpt. Takes hash-style parameters:
=over
=item *
B<field> - the name of the field from which to draw the excerpt. The field
must be C<vectorized> (which is the default -- see
L<FieldSpec|KinoSearch::FieldSpec>).
=item *
B<excerpt_length> - the maximum length of the excerpt, in characters.
=item *
B<formatter> - an object which isa L<KinoSearch::Highlight::Formatter>. Used
to perform the actual highlighting.
=item *
B<encoder> - an object which isa L<KinoSearch::Highlight::Encoder>. All
excerpt text gets passed through the encoder, including highlighted terms. By
default, this is a SimpleHTMLEncoder, which encodes HTML entities.
=item *
B<name> - the key which will identify the excerpt in the excerpts hash.
Multiple excerpts with different specifications can be created from the same
source field using different values for C<name>.
=back
=head1 COPYRIGHT
Copyright 2005-2007 Marvin Humphrey
=head1 LICENSE, DISCLAIMER, BUGS, etc.
See L<KinoSearch> version 0.20.
=cut