package Dezi::Lucy::Indexer;
use Moose;
extends 'Dezi::Indexer';
use Dezi::Lucy::InvIndex;
use Lucy::Index::Indexer;
use Lucy::Plan::Schema;
use Lucy::Plan::FullTextType;
use Lucy::Plan::StringType;
use Lucy::Analysis::PolyAnalyzer;
use Carp;
use SWISH::3 qw( :constants );
use Scalar::Util qw( blessed );
use Data::Dump qw( dump );
use Search::Tools::UTF8;
use Path::Class::File::Lockable;
use Sys::Hostname qw( hostname );
use Digest::MD5 ();

our $VERSION = '0.016';

has 'highlightable_fields' =>
    ( is => 'rw', isa => 'Bool', default => sub {0} );

# map of built-in property names => SWISH::3 doc accessor names
my $BUILT_IN_PROPS = SWISH_DOC_PROP_MAP();

=head1 NAME

Dezi::Lucy::Indexer - Dezi::App Apache Lucy indexer

=head1 SYNOPSIS

 use Dezi::Lucy::Indexer;
 my $indexer = Dezi::Lucy::Indexer->new(
    config               => Dezi::Indexer::Config->new(),
    invindex             => Dezi::Lucy::InvIndex->new(),
    highlightable_fields => 0,
 );

=head1 DESCRIPTION

Dezi::Lucy::Indexer is an Apache Lucy based indexer
class based on L<Dezi::Indexer>.

=head1 CONSTANTS

All the L<SWISH::3> constants are imported into this namespace,
including:

=over

=item SWISH_DOC_PROP_MAP

=item SWISH_INDEX_STEMMER_LANG

=item SWISH_INDEX_NAME

=item SWISH_INDEX_FORMAT

=back

=head1 METHODS

Only new and overridden methods are documented here. See
the L<Dezi::Indexer> documentation.

=head2 BUILD

Implements basic object set up. Called internally by new().

In addition to the attributes documented in Dezi::Indexer,
this class implements the following attributes:

=over

=item highlightable_fields

Value should be 0 or 1. Default is 0. Passed directly to the
constructor for Lucy::Plan::FullTextType objects as the value for the
C<highlightable> option.

=back

=cut

sub BUILD {
    my $self = shift;

    # coerce our invindex into our format subclass
    unless ( $self->invindex->isa('Dezi::Lucy::InvIndex') ) {
        $self->invindex(
            Dezi::Lucy::InvIndex->new( path => $self->invindex->path ) );
    }

    $self->_build_lucy_delegates();
}

# Builds the analyzers, the Lucy schema and the field definition cache
# from the SWISH::3 config, then creates the Lucy::Index::Indexer
# (unless one is already present in $self->{lucy}).
sub _build_lucy_delegates {
    my $self     = shift;
    my $s3config = $self->swish3->config;
    my $lang     = $s3config->get_index->get( SWISH_INDEX_STEMMER_LANG() )
        || 'none';
    $self->{_lang} = $lang;    # cache for finish()

    my $schema             = Lucy::Plan::Schema->new();
    my $analyzers          = {};
    my $case_folder        = Lucy::Analysis::CaseFolder->new;
    my $tokenizer          = Lucy::Analysis::RegexTokenizer->new;
    my $multival_tokenizer = Lucy::Analysis::RegexTokenizer->new(
        pattern => '[^' . SWISH_TOKENPOS_BUMPER() . ']+' );

    # mimic StringType fields that require case and/or multival parsing.
    $analyzers->{store_lc} = Lucy::Analysis::PolyAnalyzer->new(
        analyzers => [ $multival_tokenizer, $case_folder ] );
    $analyzers->{store} = $multival_tokenizer;

    # stemming means we fold case and tokenize too.
    if ( $lang and $lang =~ m/^\w\w$/ ) {
        my $stemmer
            = Lucy::Analysis::SnowballStemmer->new( language => $lang );
        $analyzers->{fulltext_lc} = Lucy::Analysis::PolyAnalyzer->new(
            analyzers => [
                $multival_tokenizer, $case_folder, $tokenizer, $stemmer
            ]
        );
        $analyzers->{fulltext} = Lucy::Analysis::PolyAnalyzer->new(
            analyzers => [ $multival_tokenizer, $tokenizer, $stemmer ] );
    }
    else {
        $analyzers->{fulltext_lc} = Lucy::Analysis::PolyAnalyzer->new(
            analyzers => [ $multival_tokenizer, $case_folder, $tokenizer, ],
        );
        $analyzers->{fulltext} = Lucy::Analysis::PolyAnalyzer->new(
            analyzers => [ $multival_tokenizer, $tokenizer ] );
    }

    # cache our objects for later
    $self->{__lucy}->{analyzers} = $analyzers;
    $self->{__lucy}->{schema}    = $schema;

    # build the Lucy fields, which are a merger of MetaNames+PropertyNames
    my %fields;

    my $metanames     = $s3config->get_metanames;
    my $meta_keys     = $metanames->keys;
    my $properties    = $s3config->get_properties;
    my $property_keys = $properties->keys;

    # merge first by name so we pair correctly in _create_field_def()
    my %tmpfields;
    for my $name (@$meta_keys) {
        my $mn = $metanames->get($name);
        $tmpfields{$name}->{meta} = $mn;
    }
    for my $name (@$property_keys) {
        if ( exists $BUILT_IN_PROPS->{$name} ) {
            confess
                "$name is a built-in PropertyName and should not be defined in config";
        }
        my $pr = $properties->get($name);
        $tmpfields{$name}->{prop} = $pr;
    }

    # build out field definitions
    for my $n ( keys %tmpfields ) {
        my %fdef = $self->_create_field_def( $tmpfields{$n}->{meta},
            $tmpfields{$n}->{prop} );
        $fields{ $fdef{name} } = $fdef{def};
    }

    $self->{_fields} = \%fields;

    for my $name ( keys %fields ) {
        my $def = $fields{$name};

        # if a field is purely an alias, skip it.
        if (    defined $def->{is_meta_alias}
            and defined $def->{is_prop_alias} )
        {
            $def->{store_as}->{ $def->{is_meta_alias} } = 1;
            $def->{store_as}->{ $def->{is_prop_alias} } = 1;
            next;
        }

        # no type means the field is an alias and gets no schema entry
        my $type = $self->_get_lucy_field_type($def) or next;

        $schema->spec_field( name => $name, type => $type );
        $def->{store_as}->{$name} = 1;
    }

    # build in the built-ins
    $self->debug and warn dump \%fields;
    for my $name ( keys %$BUILT_IN_PROPS ) {
        if ( exists $fields{$name} ) {
            my $def = $fields{$name};

            #carp "found $name in built-in props: " . dump($field);

            # in theory this should never happen.
            if ( !$def->{is_prop} ) {
                confess
                    "$name is a built-in PropertyName but not defined as a PropertyName in config";
            }
        }

        # default property
        else {
            $schema->spec_field(
                name => $name,
                type => Lucy::Plan::StringType->new( sortable => 1, )
            );
        }
    }

    #dump( \%fields );

    # TODO can pass lucy in? make 'lucy' attribute public?
    my $hostname = hostname() or confess "Can't get unique hostname";
    my $manager  = Lucy::Index::IndexManager->new( host => $hostname );
    $self->{lucy} ||= Lucy::Index::Indexer->new(
        schema  => $schema,
        index   => $self->invindex->path . "",
        create  => 1,
        manager => $manager,
    );
}

# Returns a Lucy::Plan::FullTextType for the field definition $def,
# or nothing (bare return) when the field is only an alias for another
# field. As a side effect, records alias targets in $def->{store_as}.
# Callers should invoke this in scalar context.
sub _get_lucy_field_type {
    my ( $self, $def ) = @_;

    my ( $type, $key );
    my $analyzers = $self->{__lucy}->{analyzers};

    # MetaName==yes, PropertyName==no
    if ( $def->{is_meta} and !$def->{is_prop} ) {
        if ( defined $def->{is_meta_alias} ) {
            $key = $def->{is_meta_alias};
            $def->{store_as}->{$key} = 1;
            return;
        }

        #warn "spec meta $name";
        $type = Lucy::Plan::FullTextType->new(
            analyzer      => $analyzers->{fulltext_lc},
            stored        => 0,
            boost         => $def->{bias} || 1.0,
            highlightable => $self->highlightable_fields,
        );
    }

    # MetaName==yes, PropertyName==yes
    # this is the trickiest case, because the field
    # is both prop+meta and could be an alias for one
    # and a real for the other.
    # **NOTE** we must have already eliminated the case where
    # the field is an alias for both.
    elsif ( $def->{is_meta} and $def->{is_prop} ) {
        if ( defined $def->{is_meta_alias} ) {
            $key = $def->{is_meta_alias};
            $def->{store_as}->{$key} = 1;
        }
        elsif ( defined $def->{is_prop_alias} ) {
            $key = $def->{is_prop_alias};
            $def->{store_as}->{$key} = 1;
        }

        my $analyzer = $analyzers->{fulltext_lc};
        if ( !$def->{ignore_case} ) {
            $analyzer = $analyzers->{fulltext};
        }

        #warn "spec meta+prop $name";
        $type = Lucy::Plan::FullTextType->new(
            analyzer      => $analyzer,
            highlightable => $self->highlightable_fields,
            sortable      => $def->{sortable},
            boost         => $def->{bias} || 1.0,
        );
    }

    # MetaName==no, PropertyName==yes
    elsif ( !$def->{is_meta} and $def->{is_prop} ) {
        if ( defined $def->{is_prop_alias} ) {
            $key = $def->{is_prop_alias};
            $def->{store_as}->{$key} = 1;
            return;
        }

        #warn "spec prop !sort $name";
        my $analyzer_key = 'store';
        if ( $def->{ignore_case} ) {
            $analyzer_key = 'store_lc';
        }
        $type = Lucy::Plan::FullTextType->new(
            analyzer      => $analyzers->{$analyzer_key},
            highlightable => $self->highlightable_fields,
            sortable      => $def->{sortable},
            boost         => $def->{bias} || 1.0,
        );
    }

    # $type is still undef if $def is neither meta nor prop;
    # avoid an uninitialized-value warning in the debug message.
    $self->debug
        and warn sprintf( "field def %s => field type %s",
        dump($def), ( defined $type ? $type : '[undef]' ) );

    return $type;
}

# Builds a field definition from a MetaName object and/or a PropertyName
# object (at least one is required; if both are given their names must
# match). Returns a flat list: ( name => $name, def => \%field_def ).
sub _create_field_def {
    my ( $self, $metaname, $propname ) = @_;

    if ( !$metaname and !$propname ) {
        confess "Must have one of metaname or propname objects";
    }

    my $name = $metaname ? $metaname->name : $propname->name;

    my %field_def = ();

    if ($metaname) {
        if ( $metaname->name ne $name ) {
            confess "Mismatched metaname for '$name': " . $metaname->name;
        }
        my $alias = $metaname->alias_for;
        $field_def{is_meta}           = 1;
        $field_def{is_meta_alias}     = $alias;
        $field_def{bias}              = $metaname->bias;
        $field_def{store_as}->{$name} = 1;

        # allow for aliases to built-ins
        if ( exists $BUILT_IN_PROPS->{$name} ) {
            $field_def{is_prop}  = 1;
            $field_def{sortable} = 1;
        }
    }

    if ($propname) {
        if ( $propname->name ne $name ) {
            confess "Mismatched propname for '$name': " . $propname->name;
        }
        my $prop_alias = $propname->alias_for;
        $field_def{is_prop}       = 1;
        $field_def{is_prop_alias} = $prop_alias;
        if ( $propname->sort ) {
            $field_def{sortable} = 1;
        }
        for my $attr (qw( ignore_case verbatim max )) {
            $field_def{$attr} = $propname->$attr;
        }
    }

    return ( name => $name, def => \%field_def );
}

# Registers a field discovered at index time: caches its definition in
# $self->{_fields} (unless one is already cached under that name) and
# adds it to the Lucy schema. Returns the new definition hashref.
sub _add_new_field {
    my ( $self, $metaname, $propname ) = @_;
    my $fields    = $self->{_fields};
    my %field_def = $self->_create_field_def( $metaname, $propname );
    my $name      = $field_def{name};
    my $def       = $field_def{def};
    $fields->{$name} ||= $def;

    # _get_lucy_field_type() uses a bare return for alias-only fields.
    # Resolve it in scalar context so an empty list can never unbalance
    # the spec_field() named arguments, and skip the schema entry when
    # there is no type, as _build_lucy_delegates() does.
    my $type = $self->_get_lucy_field_type($def);
    if ($type) {
        $self->{__lucy}->{schema}->spec_field(
            name => $name,
            type => $type,
        );
    }

    return $def;
}

=head2 swish3_handler( I<data> )

Called by the SWISH::3::handler() function for every document being
indexed.

=cut

sub swish3_handler {
    my ( $self, $data ) = @_;
    my $config     = $data->config;
    my $conf_props = $config->get_properties;
    my $conf_metas = $config->get_metanames;

    # will hold all the parsed text, keyed by field name
    my %doc;

    my $docinfo = $data->doc;

    # Swish built-in fields first
    for my $propname ( keys %$BUILT_IN_PROPS ) {
        my $attr = $BUILT_IN_PROPS->{$propname};
        $doc{$propname} = [ $docinfo->$attr ];
    }

    # fields parsed from document
    my $props = $data->properties;
    my $metas = $data->metanames;

    # field def cache
    my $fields = $self->{_fields};

    # may need to add newly-discovered fields from $metas
    # that were added via UndefinedMetaTags e.g.
    for my $mname ( keys %$metas ) {
        if ( !exists $fields->{$mname} ) {

            #warn "New field: $mname\n";
            my $prop;
            if ( exists $props->{$mname} ) {
                $prop = $conf_props->get($mname);
            }
            $self->_add_new_field( $conf_metas->get($mname), $prop );
        }
    }

    #dump $fields;
    #dump $props;
    #dump $metas;

    for my $fname ( sort keys %$fields ) {
        my $field = $self->{_fields}->{$fname};
        next if $field->{is_prop_alias};
        next if $field->{is_meta_alias};

        my @keys = keys %{ $field->{store_as} };

        for my $key (@keys) {

            # prefer properties over metanames because
            # properties have verbatim flag, which affects
            # the stored whitespace.
            # The "|| []" guards against a document that has no entry
            # for a configured field, which would otherwise die under
            # strict refs.

            if ( $field->{is_prop} and !exists $BUILT_IN_PROPS->{$fname} ) {
                push( @{ $doc{$key} }, @{ $props->{$fname} || [] } );
            }
            elsif ( $field->{is_meta} ) {
                push( @{ $doc{$key} }, @{ $metas->{$fname} || [] } );
            }
            else {
                croak "field '$fname' is neither a PropertyName nor MetaName";
            }
        }
    }

    # serialize the doc with our tokenpos_bump char
    for my $k ( keys %doc ) {
        $doc{$k}
            = to_utf8( join( SWISH_TOKENPOS_BUMPER(), @{ $doc{$k} } ) );
    }

    $self->debug and carp dump \%doc;

    # make sure we delete any existing doc with same URI
    $self->{lucy}->delete_by_term(
        field => 'swishdocpath',
        term  => $doc{swishdocpath}
    );

    $self->{lucy}->add_doc( \%doc );
}

=head2 finish

Calls commit() on the internal Lucy::Indexer object,
writes the index header file and calls the superclass finish() method.

Returns the total number of documents in the index, or 0 if finish()
has already been called on this object.

=cut

# character pool for the "poor man's uuid" in _finish_lucy()
my @chars = ( 'a' .. 'z', 'A' .. 'Z', 0 .. 9 );

around finish => sub {
    my $super_method = shift;
    my $self         = shift;

    return 0 if $self->{_is_finished};

    my $doc_count = $self->_finish_lucy();

    $super_method->( $self, @_ );

    $self->{_is_finished} = 1;

    return $doc_count;
};

# Commits the Lucy index and writes the header file while holding a lock
# on the header file. Croaks if the header is already locked. Rethrows
# any error raised while the lock is held, after releasing the lock.
# Invalidates the swish3 attribute and returns the index doc count.
sub _finish_lucy {
    my $self = shift;

    # get a lock on our header file till
    # this entire transaction is complete.
    # Note that we trust the Lucy locking feature
    # to have prevented any other process
    # from getting a lock on the invindex itself,
    # but we want to make sure nothing interrupts
    # us from writing our own header after calling ->commit().
    my $invindex  = $self->invindex;
    my $header    = $invindex->header_file->stringify;
    my $lock_file = Path::Class::File::Lockable->new($header);
    if ( $lock_file->locked ) {
        croak "Lock file found on $header -- cannot commit indexing changes";
    }
    $lock_file->lock;

    my ( $doc_count, $uuid );

    # Everything done while holding the lock runs inside eval so that
    # a failure cannot leave a stale lock file behind, which would
    # block every subsequent commit.
    my $ok = eval {

        # commit our changes
        $self->{lucy}->commit();

        # get total doc count
        my $polyreader
            = Lucy::Index::PolyReader->open( index => "$invindex", );
        $doc_count = $polyreader->doc_count();

        # write header
        # the current config should contain any existing header + runtime config
        my $idx_cfg = $self->swish3->config->get_index;

        # poor man's uuid
        $uuid = Digest::MD5::md5_hex( time()
                . join( "", @chars[ map { rand @chars } ( 1 .. 24 ) ] ) );

        $idx_cfg->set( SWISH_INDEX_NAME(),         "$invindex" );
        $idx_cfg->set( SWISH_INDEX_FORMAT(),       'Lucy' );
        $idx_cfg->set( SWISH_INDEX_STEMMER_LANG(), $self->{_lang} );
        $idx_cfg->set( 'DeziVersion',              $invindex->version );
        $idx_cfg->set( "DocCount",                 $doc_count );
        $idx_cfg->set( "UUID",                     $uuid );

        $self->swish3->config->write($header);

        1;
    };

    if ( !$ok ) {
        my $err = $@;

        # best-effort unlock; never mask the original error
        eval { $lock_file->unlock };
        die( $err || "Failed to commit indexing changes for $header" );
    }

    # transaction complete
    $lock_file->unlock;

    $self->debug and carp "wrote $header with uuid $uuid";
    $self->debug and carp "$doc_count docs indexed";
    $self->swish3(undef);    # invalidate this indexer

    return $doc_count;
}

=head2 get_lucy

Returns the internal Lucy::Index::Indexer object.

=cut

sub get_lucy {
    return shift->{lucy};
}

=head2 abort

Sets the internal Lucy::Index::Indexer to undef,
which should release any locks on the index.
Also flags the Dezi::Lucy::Indexer object as stale.

=cut

sub abort {
    my $self = shift;
    $self->{lucy}         = undef;
    $self->{_is_finished} = 1;
    $self->swish3(undef);
}

__PACKAGE__->meta->make_immutable;

1;

__END__

=head2 MetaNames and PropertyNames

Some implementation notes about MetaNames and PropertyNames.
See also L<SWISH::3>.

=over

=item

A field defined as either a MetaName, PropertyName or both, can be searched.

=item

Fields are matched against tag names in your XML/HTML documents.
See also the TagAlias, UndefinedMetaTags, UndefinedXMLAttributes,
and XMLClassAttributes directives.

=item

You can alias field names with MetaNamesAlias and PropertyNamesAlias.

=item

MetaNames are tokenized and case-insensitive and (optionally, with
FuzzyIndexingMode) stemmed.

=item

PropertyNames are stored, case-sensitive strings.

=item

If a field is defined as both a MetaName and PropertyName, then it will
be tokenized.

=item

If a field is defined only as a MetaName, it will be parsed but not stored.
That means you can search on the field but when you try and retrieve
the field's value from the results, it will cause a fatal error.

=item

If a field is defined only as a PropertyName, it will be parsed and stored,
but it will not be tokenized. That means the field's contents are stored
without being split up into words.

=item

You can control the parsing and storage of PropertyName-only fields
with the following additional directives:

=over

=item PropertyNamesCompareCase

case sensitive search

=item PropertyNamesIgnoreCase

case insensitive search (default)

=item PropertyNamesNoStripChars

preserve whitespace

=back

=item

There are two default MetaNames defined: swishdefault and swishtitle.

=item

There are two default PropertyNames defined: swishtitle and swishdescription.

=item

The libswish3 XML and HTML parsers will automatically treat a <title> tag
as swishtitle. Likewise they will treat a <body> tag as swishdescription.

=item

Things get complicated quickly when defining fields. Experiment with
small test cases to arrive at the configuration that works best with
your application.

=back

=head1 AUTHOR

Peter Karman, E<lt>karpet@dezi.orgE<gt>

=head1 BUGS

Please report any bugs or feature requests to
C<bug-dezi-app at rt.cpan.org>, or through the web interface at
L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Dezi-App>.
I will be notified, and then you'll automatically be notified of
progress on your bug as I make changes.

=head1 SUPPORT

You can find documentation for this module with the perldoc command.

    perldoc Dezi::App

You can also look for information at:

=over 4

=item * Website

L<http://dezi.org/>

=item * IRC

#dezisearch at freenode

=item * Mailing list

L<https://groups.google.com/forum/#!forum/dezi-search>

=item * RT: CPAN's request tracker

L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=Dezi-App>

=item * AnnoCPAN: Annotated CPAN documentation

L<http://annocpan.org/dist/Dezi-App>

=item * CPAN Ratings

L<http://cpanratings.perl.org/d/Dezi-App>

=item * Search CPAN

L<https://metacpan.org/dist/Dezi-App/>

=back

=head1 COPYRIGHT AND LICENSE

Copyright 2018 by Peter Karman

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=head1 SEE ALSO

L<http://dezi.org/>, L<http://swish-e.org/>, L<http://lucy.apache.org/>