package KinoSearch::Search::MultiSearcher;
use strict;
use warnings;
use KinoSearch::Util::ToolSet;
use base qw( KinoSearch::Searcher );

BEGIN {
    __PACKAGE__->init_instance_vars(
        # members / constructor args
        searchables => undef,
        # members
        starts  => undef,
        max_doc => undef,
    );
}

use KinoSearch::Search::Similarity;

# Finish construction: compute the combined max_doc and, for every
# searchable, the offset at which its documents begin in the combined
# document-number space.  Installs a default Similarity if none was given.
sub init_instance {
    my $self = shift;
    $self->{field_sims} = {};

    # derive max_doc, relative start offsets
    my $max_doc = 0;
    my @starts;
    for my $searchable ( @{ $self->{searchables} } ) {
        # each searchable starts where the previous ones left off
        push @starts, $max_doc;
        $max_doc += $searchable->max_doc;
    }
    $self->{max_doc} = $max_doc;
    $self->{starts}  = \@starts;

    # default similarity
    $self->{similarity} = KinoSearch::Search::Similarity->new
        unless defined $self->{similarity};
}

# Return an array ref holding the union of the field names reported by all
# searchables.  Each name appears once; the order is that of a hash's keys
# and is therefore unspecified.
sub get_field_names {
    my $self = shift;
    my %field_names;
    for my $searchable ( @{ $self->{searchables} } ) {
        my $sub_field_names = $searchable->get_field_names;
        @field_names{@$sub_field_names} = (1) x scalar @$sub_field_names;
    }
    return [ keys %field_names ];
}

# Combined max_doc of all searchables, as computed by init_instance().
sub max_doc { shift->{max_doc} }

# Intentionally a no-op.
sub close { }

# Return the index (into searchables/starts) of the last searchable whose
# start offset is not greater than $doc_num.  Returns -1 when no start
# offset qualifies (no searchables, or $doc_num below the first offset).
sub subsearcher {
    my ( $self, $doc_num ) = @_;
    my $i = -1;
    for ( @{ $self->{starts} } ) {
        last if $_ > $doc_num;
        $i++;
    }
    return $i;
}

# Sum of doc_freq($term) over all searchables.
sub doc_freq {
    my ( $self, $term ) = @_;
    my $doc_freq = 0;
    $doc_freq += $_->doc_freq($term) for @{ $self->{searchables} };
    return $doc_freq;
}

# Fetch a document by its combined document number: locate the owning
# searchable, translate the number into that searchable's local numbering
# by subtracting its start offset, and delegate.
sub fetch_doc {
    my ( $self, $doc_num ) = @_;
    my $i          = $self->subsearcher($doc_num);
    my $searchable = $self->{searchables}[$i];
    $doc_num -= $self->{starts}[$i];
    return $searchable->fetch_doc($doc_num);
}

my %search_hit_collector_args = (
    hit_collector => undef,
    weight        => undef,
    filter        => undef,
    sort_spec     => undef,
);

# Run the search against every searchable in turn.  Each searchable gets
# the caller's arguments unchanged except for hit_collector, which is
# replaced by an OffsetCollector built from the caller's collector and that
# searchable's start offset (presumably so that reported doc numbers land
# in the combined numbering -- confirm against OffsetCollector).
sub search_hit_collector {
    my $self = shift;
    confess kerror()
        unless verify_args( \%search_hit_collector_args, @_ );
    my %args = ( %search_hit_collector_args, @_ );
    my ( $searchables, $starts ) = @{$self}{qw( searchables starts )};

    for my $i ( 0 .. $#$searchables ) {
        my $searchable = $searchables->[$i];
        my $start      = $starts->[$i];
        my $collector  = KinoSearch::Search::OffsetCollector->new(
            hit_collector => $args{hit_collector},
            offset        => $start
        );
        # the later hit_collector pair overrides the one inside %args
        $searchable->search_hit_collector( %args,
            hit_collector => $collector );
    }
}

# Currently returns the query untouched.
sub rewrite {
    my ( $self, $orig_query ) = @_;

    # not necessary to rewrite until we add query types that need it
    return $orig_query;

    #my @queries = map { $_->rewrite($orig_query) } @{ $self->{searchables} };
    #my $combined = $queries->[0]->combine(\@queries);
    #return $combined;
}

# Build a Weight for $query using document frequencies aggregated across
# all searchables.  The aggregated figures are handed to the query through
# a CacheDFSource, which also carries this searcher's max_doc and
# similarity.
sub create_weight {
    my ( $self, $query ) = @_;
    my $searchables = $self->{searchables};

    my $rewritten_query = $self->rewrite($query);

    # generate an array of unique terms
    my @terms = $rewritten_query->extract_terms;
    my %unique_terms;
    for my $term (@terms) {
        if ( a_isa_b( $term, "KinoSearch::Index::Term" ) ) {
            $unique_terms{ $term->to_string } = $term;
        }
        else {
            # PhraseQuery returns an array of terms
            $unique_terms{ $_->to_string } = $_ for @$term;
        }
    }

    # keys and values of an unmodified hash come back in matching order, so
    # $stringified[$j] is the string form of $terms[$j]
    @terms = values %unique_terms;
    my @stringified = keys %unique_terms;

    # get an aggregated doc_freq for each term
    my @aggregated_doc_freqs = (0) x scalar @terms;
    for my $i ( 0 .. $#$searchables ) {
        my $doc_freqs = $searchables->[$i]->doc_freqs( \@terms );
        for my $j ( 0 .. $#terms ) {
            $aggregated_doc_freqs[$j] += $doc_freqs->[$j];
        }
    }

    # prepare a hashmap of stringified_term => doc_freq pairs.
    my %doc_freq_map;
    @doc_freq_map{@stringified} = @aggregated_doc_freqs;

    my $cache_df_source = KinoSearch::Search::CacheDFSource->new(
        doc_freq_map => \%doc_freq_map,
        max_doc      => $self->max_doc,
        similarity   => $self->get_similarity,
    );

    return $rewritten_query->to_weight($cache_df_source);
}

package KinoSearch::Search::CacheDFSource;
use strict;
use warnings;
use KinoSearch::Util::ToolSet;
use base qw( KinoSearch::Search::Searchable );

BEGIN {
    __PACKAGE__->init_instance_vars(
        doc_freq_map => {},
        max_doc      => undef,
    );
    __PACKAGE__->ready_get(qw( max_doc ));
}

sub init_instance { }

# Return the cached document frequency for $term, looked up by the term's
# to_string() form.  Dies with a stack trace if no frequency was cached for
# the term.
sub doc_freq {
    my ( $self, $term ) = @_;
    my $term_string = $term->to_string;
    my $df          = $self->{doc_freq_map}{$term_string};
    confess("df for $term_string not available") unless defined $df;

    # Explicit return: without it the sub would yield the value of the
    # "defined" test above rather than the frequency itself.
    return $df;
}

# Return an array ref of cached document frequencies, one per term, in the
# order given.  Terms may be passed either as a single reference to an
# array of terms -- the form MultiSearcher::create_weight uses when calling
# doc_freqs on its searchables -- or as a flat list.
sub doc_freqs {
    my $self = shift;

    # Plain ref() on purpose: only an unblessed array ref is treated as a
    # container of terms; a blessed term object never is.
    my @terms = ( @_ == 1 && ref( $_[0] ) eq 'ARRAY' ) ? @{ $_[0] } : @_;
    my @doc_freqs = map { $self->doc_freq($_) } @terms;
    return \@doc_freqs;
}

# max_doc as supplied to the constructor.
sub max_doc { shift->{max_doc} }

# Returns the query unchanged.
sub rewrite {
    return $_[1];
}

=for comment
Dummy class, only here to support initialization of Weights from Queries.

=cut

1;

__END__

=head1 NAME

KinoSearch::Search::MultiSearcher - Aggregate results from multiple searchers.

=head1 SYNOPSIS

    for my $server_name (@server_names) {
        push @searchers, KinoSearch::Search::SearchClient->new(
            peer_address => "$server_name:$port",
            analyzer     => $analyzer,
            password     => $pass,
        );
    }
    my $multi_searcher = KinoSearch::Search::MultiSearcher->new(
        searchables => \@searchers,
        analyzer    => $analyzer,
    );
    my $hits = $multi_searcher->search( query => $query );

=head1 DESCRIPTION

Aside from the arguments to its constructor, MultiSearcher looks and acts just
like a L<KinoSearch::Searcher> object.

The primary use for MultiSearcher is to aggregate results from several remote
searchers via L<KinoSearch::Search::SearchClient>, diffusing the cost of
searching a large corpus over multiple machines.

=head1 METHODS

=head2 new

Constructor.  Takes two hash-style parameters, both of which are required.

=over

=item *

B<analyzer> - an item which subclasses L<KinoSearch::Analysis::Analyzer>.

=item *

B<searchables> - a reference to an array of searchers.

=back

=head1 COPYRIGHT

Copyright 2006-2007 Marvin Humphrey

=head1 LICENSE, DISCLAIMER, BUGS, etc.

See L<KinoSearch> version 0.163.

=cut