package KinoSearch::Index::SegWriter;
use strict;
use warnings;
use KinoSearch::Util::ToolSet;
use base qw( KinoSearch::Util::Class );
use KinoSearch::Analysis::TokenBatch;
use KinoSearch::Index::FieldsWriter;
use KinoSearch::Index::PostingsWriter;
use KinoSearch::Index::CompoundFileWriter;
use KinoSearch::Index::IndexFileNames qw( @COMPOUND_EXTENSIONS );
use KinoSearch::Search::Similarity;
# Member defaults for SegWriter objects.  The list is handed to the
# init_instance_vars class method (not defined in this file) and whatever it
# returns is kept in the package variable %instance_vars.
our %instance_vars = __PACKAGE__->init_instance_vars(
# constructor params / members
# invindex: object used to open, rename and delete the segment's files.
invindex => undef,
# seg_name: prefix of every file name this writer creates.
seg_name => undef,
# finfos: field infos; queried via get_infos, serialized via write_infos.
finfos => undef,
# similarity: supplies lengthnorm and encode_norm for norm bytes.
similarity => undef,
# members
# norm_outstreams: norms outstreams, indexed by field number.
# NOTE(review): this [] is evaluated once, when the module is loaded.
# Whether each object gets its own copy depends on how the base class
# constructor copies defaults -- confirm.
norm_outstreams => [],
fields_writer => undef,
postings_writer => undef,
# doc_count: number of documents added so far; add_doc increments it.
doc_count => 0,
);
# Prepare a freshly constructed SegWriter for use.
#
# Reads the invindex, seg_name and finfos members, opens one norms outstream
# named "$seg_name.f$field_num" for every indexed field reported by finfos,
# and creates the FieldsWriter and PostingsWriter for this segment.
sub init_instance {
    my $self = shift;
    my ( $invindex, $seg_name, $finfos )
        = @{$self}{ 'invindex', 'seg_name', 'finfos' };

    # Give every object its own array.  The [] default in %instance_vars is
    # evaluated only once, at load time, so filling that array in place could
    # let outstreams opened for one segment show up in another.
    my $norm_outstreams = $self->{norm_outstreams} = [];

    # init norms: one outstream per indexed field, slotted by field number
    my @indexed_field_nums = map { $_->get_field_num }
        grep { $_->get_indexed } $finfos->get_infos;
    for my $field_num (@indexed_field_nums) {
        $norm_outstreams->[$field_num]
            = $invindex->open_outstream("$seg_name.f$field_num");
    }

    # init FieldsWriter
    $self->{fields_writer} = KinoSearch::Index::FieldsWriter->new(
        invindex => $invindex,
        seg_name => $seg_name,
    );

    # init PostingsWriter
    $self->{postings_writer} = KinoSearch::Index::PostingsWriter->new(
        invindex => $invindex,
        seg_name => $seg_name,
    );
}
# Accessor: the seg_name member of this writer.
sub get_seg_name {
    my $self = shift;
    return $self->{seg_name};
}
# Accessor: the doc_count member of this writer.
sub get_doc_count {
    my $self = shift;
    return $self->{doc_count};
}
# Add a document to the segment.
#
# For every indexed field of $doc this builds a TokenBatch (seeded with the
# whole field value when it is non-empty, then run through the field's
# analyzer if the field is analyzed), turns it into a posting list for the
# current document number, writes one encoded norm to the field's norms
# outstream and passes the postings to the PostingsWriter.  Afterwards the
# document is handed to the FieldsWriter and doc_count is incremented.
sub add_doc {
    my ( $self, $doc ) = @_;
    my $norm_outstreams = $self->{norm_outstreams};
    my $similarity      = $self->{similarity};
    my $doc_boost       = $doc->get_boost;

    for my $indexed_field ( grep { $_->get_indexed } $doc->get_fields ) {
        my $field_num   = $indexed_field->get_field_num;
        my $token_batch = KinoSearch::Analysis::TokenBatch->new;

        # an empty value contributes no token at all
        if ( $indexed_field->get_value_len ) {
            $token_batch->add_token( $indexed_field->get_value, 0,
                $indexed_field->get_value_len );
        }
        if ( $indexed_field->get_analyzed ) {
            $token_batch
                = $indexed_field->get_analyzer()->analyze($token_batch);
        }

        # doc_count has not been incremented yet, so it is passed as the
        # number of the document being added
        $token_batch->build_posting_list( $self->{doc_count}, $field_num );
        if ( $indexed_field->get_vectorized and $indexed_field->get_stored ) {
            $indexed_field->set_tv_string( $token_batch->get_tv_string );
        }

        # encode a norm into a byte, write it to an outstream
        my $norm_val = $doc_boost * $indexed_field->get_boost
            * $similarity->lengthnorm( $token_batch->get_size );
        my $outstream = $norm_outstreams->[$field_num];
        $outstream->lu_write( 'a', $similarity->encode_norm($norm_val) );

        # feed PostingsWriter
        $self->{postings_writer}->add_postings( $token_batch->get_postings );
    }

    # store fields
    $self->{fields_writer}->add_doc($doc);
    $self->{doc_count}++;
}
# Finish writing the segment.
#
# Steps, in order: have the PostingsWriter write its postings, write the
# field infos to "$seg_name.fnm", finish the postings and fields writers,
# close every norms outstream that was opened, and finally (unless the
# _dont_use_comp_file member is set) pack the segment's files into
# "$seg_name.cfs" and delete the originals.
sub finish {
    my $self     = shift;
    my $invindex = $self->{invindex};
    my $seg_name = $self->{seg_name};

    # write Term Dictionary, positions.
    $self->{postings_writer}->write_postings;

    # write FieldInfos
    my $fnm_outstream = $invindex->open_outstream("$seg_name.fnm");
    $self->{finfos}->write_infos($fnm_outstream);
    $fnm_outstream->close;

    # close down all the writers, so we can open the files they've finished.
    $self->{postings_writer}->finish;
    $self->{fields_writer}->finish;
    for my $norm_outstream ( @{ $self->{norm_outstreams} } ) {
        next unless defined $norm_outstream;    # slot of an unindexed field
        $norm_outstream->close;
    }

    # consolidate compound file
    unless ( $self->{_dont_use_comp_file} ) {   # testing hack - always runs
        my $cf_writer = KinoSearch::Index::CompoundFileWriter->new(
            invindex => $invindex,
            filename => "$seg_name.tmp",
        );

        # fixed-extension files first, then one norms file per indexed field
        my @to_pack;
        for my $extension (@COMPOUND_EXTENSIONS) {
            push @to_pack, "$seg_name.$extension";
        }
        for my $finfo ( $self->{finfos}->get_infos ) {
            next unless $finfo->get_indexed;
            push @to_pack, "$seg_name.f" . $finfo->get_field_num;
        }

        for my $file (@to_pack) {
            $cf_writer->add_file($file);
        }
        $cf_writer->finish;

        # the compound file is written under a temporary name and renamed
        # before the files it contains are deleted
        $invindex->rename_file( "$seg_name.tmp", "$seg_name.cfs" );
        for my $file (@to_pack) {
            $invindex->delete_file($file);
        }
    }
}
1;
__END__
=begin devdocs
=head1 NAME
KinoSearch::Index::SegWriter - write one segment of an invindex
=head1 DESCRIPTION
SegWriter is a conduit through which information fed to InvIndexer passes on
its way to low-level writers such as FieldsWriter and TermInfosWriter.
=head1 COPYRIGHT
Copyright 2005-2006 Marvin Humphrey
=head1 LICENSE, DISCLAIMER, BUGS, etc.
See L<KinoSearch|KinoSearch> version 0.08.
=end devdocs
=cut