package Dezi::Lucy::Indexer;
use Moose;
extends 'Dezi::Indexer';

use Dezi::Lucy::InvIndex;

use Lucy::Index::Indexer;
use Lucy::Plan::Schema;
use Lucy::Plan::FullTextType;
use Lucy::Plan::StringType;
use Lucy::Analysis::PolyAnalyzer;

use Carp;
use SWISH::3 qw( :constants );
use Scalar::Util qw( blessed );
use Data::Dump qw( dump );
use Search::Tools::UTF8;
use Path::Class::File::Lockable;
use Sys::Hostname qw( hostname );
use Digest::MD5 ();

our $VERSION = '0.016';

# Boolean flag handed to every Lucy::Plan::FullTextType created by this
# class as the value of its "highlightable" option. Defaults to 0.
has 'highlightable_fields' =>
    ( is => 'rw', isa => 'Bool', default => sub {0} );

# Value of the SWISH_DOC_PROP_MAP constant imported from SWISH::3.
# It is used in this file as a hash ref: the keys are the built-in
# PropertyNames and the values are method names that are called on the
# per-document info object in swish3_handler().
my $BUILT_IN_PROPS = SWISH_DOC_PROP_MAP();

=head1 NAME

Dezi::Lucy::Indexer - Dezi::App Apache Lucy indexer

=head1 SYNOPSIS

 use Dezi::Lucy::Indexer;
 my $indexer = Dezi::Lucy::Indexer->new(
    config               => Dezi::Indexer::Config->new(),
    invindex             => Dezi::Lucy::InvIndex->new(),
    highlightable_fields => 0,
 );

=head1 DESCRIPTION

Dezi::Lucy::Indexer is an Apache Lucy based indexer
class based on L<SWISH::3>.

=head1 CONSTANTS

All the L<SWISH::3> constants are imported into this namespace,
including:

=over

=item SWISH_DOC_PROP_MAP

=item SWISH_INDEX_STEMMER_LANG

=item SWISH_INDEX_NAME

=item SWISH_INDEX_FORMAT

=back

=head1 METHODS

Only new and overridden methods are documented here. See
the L<Dezi::Indexer> documentation.

=head2 BUILD

Implements basic object set up. Called internally by new().

In addition to the attributes documented in Dezi::Indexer,
this class implements the following attributes:

=over

=item highlightable_fields

Value should be 0 or 1. Default is 0. Passed directly to the
constructor for Lucy::Plan::FullTextType objects as the value
for the C<highlightable> option.

=back

=cut

sub BUILD {
    my ($self) = @_;

    # The rest of this class expects a Dezi::Lucy::InvIndex. If some other
    # kind of invindex object was supplied, replace it with one of ours
    # pointing at the same path.
    if ( !$self->invindex->isa('Dezi::Lucy::InvIndex') ) {
        my $path    = $self->invindex->path;
        my $coerced = Dezi::Lucy::InvIndex->new( path => $path );
        $self->invindex($coerced);
    }

    # set up analyzers, schema, field definitions and the Lucy indexer
    $self->_build_lucy_delegates();
}

# Build and cache everything Lucy needs for this indexer: the analyzers,
# the schema (one Lucy field per configured MetaName/PropertyName, plus the
# SWISH::3 built-in properties) and the Lucy::Index::Indexer itself.
#
# Values written on $self:
#   {_lang}              stemmer language read from the index config
#   {__lucy}{analyzers}  hash ref of analyzers keyed by purpose
#   {__lucy}{schema}     the Lucy::Plan::Schema being populated
#   {_fields}            hash ref of field definitions keyed by field name
#   {lucy}               the Lucy::Index::Indexer (only if not already set)
#
# Dies (via confess) if the config defines a PropertyName that is one of
# the built-ins, or if hostname() returns a false value.
sub _build_lucy_delegates {
    my $self     = shift;
    my $s3config = $self->swish3->config;
    my $lang     = $s3config->get_index->get( SWISH_INDEX_STEMMER_LANG() )
        || 'none';
    $self->{_lang} = $lang;    # cache for finish()
    my $schema      = Lucy::Plan::Schema->new();
    my $analyzers   = {};
    my $case_folder = Lucy::Analysis::CaseFolder->new;
    my $tokenizer   = Lucy::Analysis::RegexTokenizer->new;

    # splits a serialized value on the SWISH_TOKENPOS_BUMPER character,
    # which swish3_handler() uses to join multiple values of one field.
    my $multival_tokenizer
        = Lucy::Analysis::RegexTokenizer->new(
        pattern => '[^' . SWISH_TOKENPOS_BUMPER() . ']+' );

    # mimic StringType fields that require case and/or multival parsing.
    $analyzers->{store_lc} = Lucy::Analysis::PolyAnalyzer->new(
        analyzers => [ $multival_tokenizer, $case_folder ] );
    $analyzers->{store} = $multival_tokenizer;

    # stemming means we fold case and tokenize too.
    # A stemmer is only added when the language is exactly two word
    # characters; anything else (including the 'none' default) gets the
    # non-stemming analyzers below.
    if ( $lang and $lang =~ m/^\w\w$/ ) {
        my $stemmer
            = Lucy::Analysis::SnowballStemmer->new( language => $lang );
        $analyzers->{fulltext_lc}
            = Lucy::Analysis::PolyAnalyzer->new( analyzers =>
                [ $multival_tokenizer, $case_folder, $tokenizer, $stemmer ] );
        $analyzers->{fulltext} = Lucy::Analysis::PolyAnalyzer->new(
            analyzers => [ $multival_tokenizer, $tokenizer, $stemmer ] );
    }
    else {
        $analyzers->{fulltext_lc}
            = Lucy::Analysis::PolyAnalyzer->new(
            analyzers => [ $multival_tokenizer, $case_folder, $tokenizer, ],
            );
        $analyzers->{fulltext} = Lucy::Analysis::PolyAnalyzer->new(
            analyzers => [ $multival_tokenizer, $tokenizer ] );
    }

    # cache our objects for later
    # (_get_lucy_field_type() and _add_new_field() read these)
    $self->{__lucy}->{analyzers} = $analyzers;
    $self->{__lucy}->{schema}    = $schema;

    # build the Lucy fields, which are a merger of MetaNames+PropertyNames
    my %fields;

    my $metanames     = $s3config->get_metanames;
    my $meta_keys     = $metanames->keys;
    my $properties    = $s3config->get_properties;
    my $property_keys = $properties->keys;

    # merge first by name so we pair correctly in _create_field_def()
    my %tmpfields;
    for my $name (@$meta_keys) {
        my $mn = $metanames->get($name);
        $tmpfields{$name}->{meta} = $mn;
    }
    for my $name (@$property_keys) {

        # built-in properties get their own schema fields further down,
        # so the config is not allowed to redefine them.
        if ( exists $BUILT_IN_PROPS->{$name} ) {
            confess
                "$name is a built-in PropertyName and should not be defined in config";
        }
        my $pr = $properties->get($name);
        $tmpfields{$name}->{prop} = $pr;
    }

    # build out field definitions
    for my $n ( keys %tmpfields ) {
        my %fdef = $self->_create_field_def( $tmpfields{$n}->{meta},
            $tmpfields{$n}->{prop} );
        $fields{ $fdef{name} } = $fdef{def};
    }

    $self->{_fields} = \%fields;

    # register each real (non-alias) field with the schema
    for my $name ( keys %fields ) {
        my $def = $fields{$name};

        # NOTE(review): $key is assigned but never read in this loop.
        my $key = $name;

        # if a field is purely an alias, skip it.
        # ("purely" == it is an alias both as a MetaName and as a
        # PropertyName; record both targets and define no Lucy field.)
        if (    defined $def->{is_meta_alias}
            and defined $def->{is_prop_alias} )
        {
            $def->{store_as}->{ $def->{is_meta_alias} } = 1;
            $def->{store_as}->{ $def->{is_prop_alias} } = 1;
            next;
        }

        # no type is returned for fields that are only an alias,
        # in which case there is nothing to add to the schema.
        my $type = $self->_get_lucy_field_type($def) or next;

        $schema->spec_field( name => $name, type => $type );

        $def->{store_as}->{$name} = 1;
    }

    # build in the built-ins
    $self->debug and warn dump \%fields;

    for my $name ( keys %$BUILT_IN_PROPS ) {
        if ( exists $fields{$name} ) {
            my $def = $fields{$name};

            #carp "found $name in built-in props: " . dump($field);

            # in theory this should never happen.
            if ( !$def->{is_prop} ) {
                confess
                    "$name is a built-in PropertyName but not defined as a PropertyName in config";
            }
        }

        # default property
        else {
            $schema->spec_field(
                name => $name,
                type => Lucy::Plan::StringType->new( sortable => 1, )
            );
        }
    }

    #dump( \%fields );

    # TODO can pass lucy in? make 'lucy' attribute public?
    # The hostname is passed to the IndexManager as its "host" value.
    # An existing $self->{lucy} is left untouched (||=).
    my $hostname = hostname() or confess "Can't get unique hostname";
    my $manager = Lucy::Index::IndexManager->new( host => $hostname );
    $self->{lucy} ||= Lucy::Index::Indexer->new(
        schema  => $schema,
        index   => $self->invindex->path . "",
        create  => 1,
        manager => $manager,
    );

}

# Map a field definition (as built by _create_field_def()) to a Lucy
# field type object.
#
# Returns a Lucy::Plan::FullTextType, or nothing (bare return) when the
# field is only an alias for another MetaName or PropertyName. Returns
# undef if the definition is neither a MetaName nor a PropertyName.
#
# Side effect: any alias target found is recorded in $def->{store_as}.
sub _get_lucy_field_type {
    my ( $self, $def ) = @_;
    my $analyzers = $self->{__lucy}->{analyzers};
    my $is_meta   = $def->{is_meta};
    my $is_prop   = $def->{is_prop};
    my $type;

    # Both MetaName and PropertyName. This is the trickiest case, because
    # the field could be an alias on one side and real on the other.
    # **NOTE** callers must already have handled the case where the
    # field is an alias on both sides.
    if ( $is_meta and $is_prop ) {
        my $alias
            = defined $def->{is_meta_alias}
            ? $def->{is_meta_alias}
            : $def->{is_prop_alias};
        if ( defined $alias ) {
            $def->{store_as}->{$alias} = 1;
        }

        my $analyzer
            = $def->{ignore_case}
            ? $analyzers->{fulltext_lc}
            : $analyzers->{fulltext};

        $type = Lucy::Plan::FullTextType->new(
            analyzer      => $analyzer,
            highlightable => $self->highlightable_fields,
            sortable      => $def->{sortable},
            boost         => $def->{bias} || 1.0,
        );
    }

    # MetaName only: searchable, case-folded, not stored.
    elsif ($is_meta) {
        if ( defined $def->{is_meta_alias} ) {
            $def->{store_as}->{ $def->{is_meta_alias} } = 1;
            return;
        }

        $type = Lucy::Plan::FullTextType->new(
            analyzer      => $analyzers->{fulltext_lc},
            stored        => 0,
            boost         => $def->{bias} || 1.0,
            highlightable => $self->highlightable_fields,
        );
    }

    # PropertyName only: split on the multi-value separator only,
    # optionally case-folded.
    elsif ($is_prop) {
        if ( defined $def->{is_prop_alias} ) {
            $def->{store_as}->{ $def->{is_prop_alias} } = 1;
            return;
        }

        my $which = $def->{ignore_case} ? 'store_lc' : 'store';

        $type = Lucy::Plan::FullTextType->new(
            analyzer      => $analyzers->{$which},
            highlightable => $self->highlightable_fields,
            sortable      => $def->{sortable},
            boost         => $def->{bias} || 1.0,
        );
    }

    if ( $self->debug ) {
        warn sprintf( "field def %s => field type %s", dump($def), $type );
    }

    return $type;
}

# Build a field definition hash from a MetaName object, a PropertyName
# object, or both (when the same name is configured as both).
#
# Arguments:
#   $metaname  MetaName config object, or a false value
#   $propname  PropertyName config object, or a false value
#
# Returns a two-pair list: ( name => $field_name, def => \%field_def ).
# %field_def may contain:
#   is_meta, is_meta_alias, bias            from the MetaName
#   is_prop, is_prop_alias, sortable,
#   ignore_case, verbatim, max              from the PropertyName
#   store_as                                hash ref of field names to fill
#
# Dies (via confess) if neither object is given, or if both are given and
# their names differ.
sub _create_field_def {
    my ( $self, $metaname, $propname ) = @_;
    if ( !$metaname and !$propname ) {
        confess "Must have one of metaname or propname objects";
    }

    # the MetaName, when present, decides the field name
    my $name = $metaname ? $metaname->name : $propname->name;
    my %field_def = ();

    if ($metaname) {
        $field_def{is_meta}           = 1;
        $field_def{is_meta_alias}     = $metaname->alias_for;
        $field_def{bias}              = $metaname->bias;
        $field_def{store_as}->{$name} = 1;

        # allow for aliases to built-ins
        if ( exists $BUILT_IN_PROPS->{$name} ) {
            $field_def{is_prop}  = 1;
            $field_def{sortable} = 1;
        }
    }

    if ($propname) {

        # only reachable when both objects were passed with different names
        if ( $propname->name ne $name ) {
            confess "Mismatched propname for '$name': " . $propname->name;
        }
        $field_def{is_prop}       = 1;
        $field_def{is_prop_alias} = $propname->alias_for;
        if ( $propname->sort ) {
            $field_def{sortable} = 1;
        }
        for my $attr (qw( ignore_case verbatim max )) {
            $field_def{$attr} = $propname->$attr;
        }
    }

    return ( name => $name, def => \%field_def );
}

# Register a field that was discovered while indexing (i.e. one that was
# not known when the schema was first built).
#
# Arguments are the same as for _create_field_def(): a MetaName object
# and/or a PropertyName object.
#
# The new definition is cached in $self->{_fields} unless one already
# exists under that name, and the field is added to the Lucy schema.
# Fields for which _get_lucy_field_type() yields no type (pure aliases)
# are cached but not added to the schema.
#
# Returns the field definition hash ref that was just created.
sub _add_new_field {
    my ( $self, $metaname, $propname ) = @_;
    my $fields    = $self->{_fields};
    my %field_def = $self->_create_field_def( $metaname, $propname );
    my $name      = $field_def{name};
    my $def       = $field_def{def};
    $fields->{$name} ||= $def;

    # Call in scalar context on purpose: _get_lucy_field_type() uses a bare
    # "return" for alias fields, which would collapse to an empty list
    # inside the spec_field() argument list and leave "type" without a value.
    my $type = $self->_get_lucy_field_type($def);
    if ($type) {
        $self->{__lucy}->{schema}->spec_field(
            name => $name,
            type => $type,
        );
    }
    return $def;
}

=head2 swish3_handler( I<swish3_data> )

Called by the SWISH::3::handler() function for every document being
indexed.

=cut

sub swish3_handler {
    my ( $self, $data ) = @_;
    my $s3config   = $data->config;
    my $conf_props = $s3config->get_properties;
    my $conf_metas = $s3config->get_metanames;
    my $docinfo    = $data->doc;

    # All values for the Lucy document, keyed by field name. Each value is
    # an array ref until it is serialized below. Start with the Swish
    # built-in fields, read from the doc info object.
    my %doc = map {
        my $accessor = $BUILT_IN_PROPS->{$_};
        ( $_ => [ $docinfo->$accessor ] )
    } keys %$BUILT_IN_PROPS;

    # values parsed out of the document itself
    my $props = $data->properties;
    my $metas = $data->metanames;

    # cached field definitions
    my $fields = $self->{_fields};

    # The parser may report MetaNames we have no definition for yet
    # (added via UndefinedMetaTags e.g.); define those on the fly.
    for my $mname ( keys %$metas ) {
        next if exists $fields->{$mname};

        my $prop
            = exists $props->{$mname}
            ? $conf_props->get($mname)
            : undef;
        $self->_add_new_field( $conf_metas->get($mname), $prop );
    }

    for my $fname ( sort keys %$fields ) {
        my $field = $fields->{$fname};

        # aliases never carry values of their own
        next if $field->{is_prop_alias} or $field->{is_meta_alias};

        # Prefer properties over metanames because properties have the
        # verbatim flag, which affects the stored whitespace.
        my $use_props
            = $field->{is_prop} && !exists $BUILT_IN_PROPS->{$fname};

        for my $key ( keys %{ $field->{store_as} } ) {
            if ($use_props) {
                push @{ $doc{$key} }, @{ $props->{$fname} };
            }
            elsif ( $field->{is_meta} ) {
                push @{ $doc{$key} }, @{ $metas->{$fname} };
            }
            else {
                croak "field '$fname' is neither a PropertyName nor MetaName";
            }
        }
    }

    # flatten each value list into a single string, using the tokenpos
    # bump character as the separator
    for my $field_name ( keys %doc ) {
        my $values = $doc{$field_name};
        $doc{$field_name}
            = to_utf8( join( SWISH_TOKENPOS_BUMPER(), @{$values} ) );
    }

    carp dump \%doc if $self->debug;

    # replace, rather than duplicate, any doc already indexed at this URI
    my $lucy = $self->{lucy};
    $lucy->delete_by_term(
        field => 'swishdocpath',
        term  => $doc{swishdocpath}
    );

    $lucy->add_doc( \%doc );
}

=head2 finish

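Calls commit() on the internal Lucy::Index::Indexer object,
writes the C<swish.xml> header file and calls the superclass finish()
method. Returns the total number of documents in the index, or 0 if
the indexer was already finished.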

=cut

# character pool used by _finish_lucy() when generating the index UUID
my @chars = ( 'a' .. 'z', 'A' .. 'Z', 0 .. 9 );

around finish => sub {
    my ( $orig, $self, @args ) = @_;

    # a finished (or aborted) indexer has nothing left to commit
    if ( $self->{_is_finished} ) {
        return 0;
    }

    # commit to Lucy and write the header first, then let the
    # superclass do its own wrap-up
    my $total_docs = $self->_finish_lucy();
    $self->$orig(@args);
    $self->{_is_finished} = 1;

    return $total_docs;
};

# Commit the pending Lucy changes and write the index header file.
#
# The header file is locked for the duration of the commit + header write.
# If any step inside that window dies, the lock is released before the
# original error is re-thrown, so a failed run does not leave a stale lock
# behind that would block every later run.
#
# Returns the total document count reported by a freshly opened reader.
# Croaks if the header file is already locked.
sub _finish_lucy {
    my $self = shift;

    # get a lock on our header file till
    # this entire transaction is complete.
    # Note that we trust the Lucy locking feature
    # to have prevented any other process
    # from getting a lock on the invindex itself,
    # but we want to make sure nothing interrupts
    # us from writing our own header after calling ->commit().
    my $invindex  = $self->invindex;
    my $header    = $invindex->header_file->stringify;
    my $lock_file = Path::Class::File::Lockable->new($header);
    if ( $lock_file->locked ) {
        croak "Lock file found on $header -- cannot commit indexing changes";
    }
    $lock_file->lock;

    my ( $doc_count, $uuid );
    my $ok = eval {

        # commit our changes
        $self->{lucy}->commit();

        # get total doc count
        my $polyreader
            = Lucy::Index::PolyReader->open( index => "$invindex", );
        $doc_count = $polyreader->doc_count();

        # write header
        # the current config should contain any existing header
        # + runtime config
        my $idx_cfg = $self->swish3->config->get_index;

        # poor man's uuid
        $uuid = Digest::MD5::md5_hex(
            time() . join( "", @chars[ map { rand @chars } ( 1 .. 24 ) ] ) );

        $idx_cfg->set( SWISH_INDEX_NAME(),         "$invindex" );
        $idx_cfg->set( SWISH_INDEX_FORMAT(),       'Lucy' );
        $idx_cfg->set( SWISH_INDEX_STEMMER_LANG(), $self->{_lang} );
        $idx_cfg->set( 'DeziVersion',              $invindex->version );
        $idx_cfg->set( "DocCount",                 $doc_count );
        $idx_cfg->set( "UUID",                     $uuid );

        $self->swish3->config->write($header);
        1;
    };

    if ( !$ok ) {

        # save the error before anything else can clobber $@
        my $err = $@;

        # best effort: the original failure is the one worth reporting
        eval { $lock_file->unlock };
        die( $err || "Unknown error while committing index $invindex" );
    }

    # transaction complete
    $lock_file->unlock;

    $self->debug and carp "wrote $header with uuid $uuid";
    $self->debug and carp "$doc_count docs indexed";
    $self->swish3(undef);    # invalidate this indexer

    return $doc_count;
}

=head2 get_lucy

Returns the internal Lucy::Index::Indexer object.

=cut

sub get_lucy {
    my ($self) = @_;

    # undef once abort() has been called
    return $self->{lucy};
}

=head2 abort

Sets the internal Lucy::Index::Indexer to undef,
which should release any locks on the index.
Also flags the Dezi::Lucy::Indexer object
as stale.

=cut

sub abort {
    my ($self) = @_;

    # drop the Lucy indexer and mark this object as finished so that a
    # later finish() is a no-op
    @{$self}{qw( lucy _is_finished )} = ( undef, 1 );

    # without a swish3 object this indexer can no longer be used
    return $self->swish3(undef);
}

# Make the Moose metaclass immutable now that all attributes, methods and
# the finish() modifier above have been declared.
__PACKAGE__->meta->make_immutable;

1;

__END__

=head2 MetaNames and PropertyNames

Some implementation notes about MetaNames and PropertyNames.
See also L<http://dezi.org/2014/07/18/metanames-and-propertynames/>.

=over

=item

A field defined as either a MetaName, PropertyName or both, can be searched.

=item

Fields are matched against tag names in your XML/HTML documents. See also the TagAlias, UndefinedMetaTags, UndefinedXMLAttributes, and XMLClassAttributes directives.

=item

You can alias field names with MetaNamesAlias and PropertyNamesAlias.

=item

MetaNames are tokenized and case-insensitive and (optionally, with FuzzyIndexingMode) stemmed.

=item

PropertyNames are stored, case-sensitive strings.

=item

If a field is defined as both a MetaName and PropertyName, then it will be tokenized.

=item

If a field is defined only as a MetaName, it will be parsed but not stored. That means you can search on the field, but when you try to retrieve the field's value from the results, it will cause a fatal error.

=item

If a field is defined only as a PropertyName, it will be parsed and stored, but it will not be tokenized. That means the field's contents are stored without being split up into words.

=item

You can control the parsing and storage of PropertyName-only fields with the following additional directives:

=over

=item PropertyNamesCompareCase

case sensitive search

=item PropertyNamesIgnoreCase

case insensitive search (default)

=item PropertyNamesNoStripChars

preserve whitespace

=back

=item

There are two default MetaNames defined: swishdefault and swishtitle.

=item

There are two default PropertyNames defined: swishtitle and swishdescription.

=item

The libswish3 XML and HTML parsers will automatically treat a <title> tag as swishtitle. Likewise, they will treat a <body> tag as swishdescription.

=item

Things get complicated quickly when defining fields. Experiment with small test cases to arrive at the configuration that works best with your application.

=back

=head1 AUTHOR

Peter Karman, E<lt>karpet@dezi.orgE<gt>

=head1 BUGS

Please report any bugs or feature requests to C<bug-dezi-app at rt.cpan.org>, or through
the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Dezi-App>.
I will be notified, and then you'll automatically be notified of progress on your bug as I make changes.

=head1 SUPPORT

You can find documentation for this module with the perldoc command.

    perldoc Dezi::App

You can also look for information at:

=over 4

=item * Website

L<http://dezi.org/>

=item * IRC

#dezisearch at freenode

=item * Mailing list

L<https://groups.google.com/forum/#!forum/dezi-search>

=item * RT: CPAN's request tracker

L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=Dezi-App>

=item * AnnoCPAN: Annotated CPAN documentation

L<http://annocpan.org/dist/Dezi-App>

=item * CPAN Ratings

L<http://cpanratings.perl.org/d/Dezi-App>

=item * Search CPAN

L<https://metacpan.org/dist/Dezi-App/>

=back

=head1 COPYRIGHT AND LICENSE

Copyright 2018 by Peter Karman

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=head1 SEE ALSO

L<http://dezi.org/>, L<http://swish-e.org/>, L<http://lucy.apache.org/>