use strict;
use warnings;

package KinoSearch::Analysis::Tokenizer;
use KinoSearch::Util::ToolSet;
use base qw( KinoSearch::Analysis::Analyzer );

our %instance_vars = (
    # inherited (and useless)
    language => '',

    # constructor params / members
    token_re => qr/\w+(?:'\w+)*/,
);

use KinoSearch::Analysis::Token;
use KinoSearch::Analysis::TokenBatch;

1;

__END__

__XS__

MODULE = KinoSearch  PACKAGE = KinoSearch::Analysis::Tokenizer

kino_TokenBatch*
_do_analyze(self_hv, batch_or_text_sv, ...)
    HV *self_hv;
    SV *batch_or_text_sv;
ALIAS:
    analyze_batch = 1
    analyze_text  = 2
    analyze_field = 3
CODE:
{
    kino_TokenBatch *batch           = NULL;
    SV              *token_re        = extract_sv(self_hv, SNL("token_re"));
    MAGIC           *mg              = NULL;
    REGEXP          *rx              = NULL;
    chy_u32_t        num_code_points = 0;
    SV              *wrapper         = sv_newmortal();
    char            *string          = NULL;
    STRLEN           string_len      = 0;
    RETVAL                           = kino_TokenBatch_new(NULL);
    
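    /* The ALIAS directive above maps each Perl-level method onto this one
     * XSUB; 'ix' records which alias was invoked:
     * 1 == analyze_batch, 2 == analyze_text, 3 == analyze_field. */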
    if (ix == 1) {
        if (items != 2)
            CONFESS("usage: $batch = $analyzer->analyze_batch($batch)");
        EXTRACT_STRUCT( batch_or_text_sv, batch, kino_TokenBatch*,
            "KinoSearch::Analysis::TokenBatch");
    }
    else if (ix == 2) {
        if (items != 2)
            CONFESS("usage: $batch = $analyzer->analyze_text($text)");
        string = SvPVutf8( ST(1), string_len );
    }
    else if (ix == 3) {
        STRLEN   len;
        SV      *string_sv;
        char    *field_name;

        if (items != 3)
            CONFESS("analyze_field() takes 2 arguments, got %d", items - 1);
        if (!SvROK(batch_or_text_sv))
            CONFESS("first argument to analyze_field() must be hash ref");
            
        field_name  = SvPV(ST(2), len);
        string_sv   = extract_sv( (HV*)SvRV(batch_or_text_sv), 
                        field_name, len);
        string      = SvPVutf8(string_sv, string_len);
    }

    /* extract regexp struct from qr// entity */
    if (SvROK(token_re)) {
        SV *sv = SvRV(token_re);
        if (SvMAGICAL(sv))
            mg = mg_find(sv, PERL_MAGIC_qr); 
    }
    if (!mg)
        CONFESS("not a qr// entity");
    rx = (REGEXP*)mg->mg_obj;

    /* Fake up an SV wrapper to feed to the regex engine.  The wrapper only
     * borrows each string: an SvLEN of 0 tells Perl the SV doesn't own the
     * buffer, and READONLY keeps the regex engine from modifying it. */
    sv_upgrade(wrapper, SVt_PV);
    SvREADONLY_on(wrapper);
    SvLEN(wrapper) = 0;
    SvUTF8_on(wrapper);

    while (1) {
        char   *string_beg;
        char   *string_end;
        char   *string_arg;

        if (ix == 1) {
            kino_Token *token = Kino_TokenBatch_Next(batch);
            if (token == NULL)
                break;
            string_len   = token->len;
            string_beg   = token->text;
            string_end   = string_beg + string_len;
            string_arg   = string_beg;
        }
        else {
            string_beg   = string;
            string_end   = string_beg + string_len;
            string_arg   = string_beg;
        }

        /* wrap the string in an SV to please the regex engine */
        SvPVX(wrapper) = string_beg;
        SvCUR_set(wrapper, string_len);
        SvPOK_on(wrapper);

        while (
            pregexec(rx, string_arg, string_end, string_arg, 1, wrapper, 1)
        ) {
#if ((PERL_VERSION > 9) || (PERL_VERSION == 9 && PERL_SUBVERSION >= 5))
            char *const start_ptr = string_arg + rx->offs[0].start;
            char *const end_ptr   = string_arg + rx->offs[0].end;
#else 
            char *const start_ptr = string_arg + rx->startp[0];
            char *const end_ptr   = string_arg + rx->endp[0];
#endif
            chy_u32_t start, end;
            kino_Token *new_token;

            /* get start and end offsets in Unicode code points */
            for( ; string_arg < start_ptr; num_code_points++) {
                string_arg += KINO_STRHELP_UTF8_SKIP[(chy_u8_t)*string_arg];
                if (string_arg > string_end)
                    CONFESS("scanned past end of '%s'", string_beg);
            }
            start = num_code_points;
            for( ; string_arg < end_ptr; num_code_points++) {
                string_arg += KINO_STRHELP_UTF8_SKIP[(chy_u8_t)*string_arg];
                if (string_arg > string_end)
                    CONFESS("scanned past end of '%s'", string_beg);
            }
            end = num_code_points;

            /* add a token to the new_batch */
            new_token = kino_Token_new(
                start_ptr,
                (end_ptr - start_ptr),
                start,
                end,
                1.0f, /* boost always 1 for now */
                1     /* position increment */
            );
            Kino_TokenBatch_Append(RETVAL, new_token);
            REFCOUNT_DEC(new_token);
        }

        if (ix > 1) /* analyze_text and analyze_field only run one loop iter */
            break;
    }
}
OUTPUT: RETVAL
    

__POD__

=head1 NAME

KinoSearch::Analysis::Tokenizer - Customizable tokenizing.

=head1 SYNOPSIS

    my $whitespace_tokenizer
        = KinoSearch::Analysis::Tokenizer->new( token_re => qr/\S+/, );

    # or...
    my $word_char_tokenizer
        = KinoSearch::Analysis::Tokenizer->new( token_re => qr/\w+/, );

    # or...
    my $apostrophising_tokenizer = KinoSearch::Analysis::Tokenizer->new;

    # then... once you have a tokenizer, put it into a PolyAnalyzer
    my $polyanalyzer = KinoSearch::Analysis::PolyAnalyzer->new(
        analyzers => [ $lc_normalizer, $word_char_tokenizer, $stemmer ], );


=head1 DESCRIPTION

Generically, "tokenizing" is the process of breaking up a string into an
array of "tokens".

    # before:
    my $string = "three blind mice";

    # after:
    @tokens = qw( three blind mice );

KinoSearch::Analysis::Tokenizer decides where it should break up the text
based on the value of C<token_re>.

    # before:
    my $string = "Eats, Shoots and Leaves.";

    # tokenized by $whitespace_tokenizer
    @tokens = qw( Eats, Shoots and Leaves. );

    # tokenized by $word_char_tokenizer
    @tokens = qw( Eats Shoots and Leaves   );

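To get a rough feel for how the default C<token_re> carves up text, here is
a small standalone sketch in plain Perl.  It reuses the default pattern, but
none of the Tokenizer internals:

    my $text     = "O'Henry's cat won't budge";
    my $token_re = qr/\w+(?:'\w+)*/;    # same pattern as the default
    my @tokens;
    push @tokens, $1 while $text =~ /($token_re)/g;
    # @tokens now holds: O'Henry's  cat  won't  budge
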
=head1 METHODS

=head2 new

    # match "it's" as well as "it" and "O'Henry's" as well as "Henry"
    my $token_re = qr/
            \w+       # Match word chars.
            (?:       # Group, but don't capture...
               '\w+   # ... an apostrophe plus word chars.
            )*        # Matching the apostrophe group is optional.
        /xsm;
    my $tokenizer = KinoSearch::Analysis::Tokenizer->new(
        token_re => $token_re, # default: what you see above
    );

Constructor.  Takes one hash-style parameter.

=over

=item *

B<token_re> - must be a pre-compiled regular expression matching one token.
A variation on the default is sketched below.

=back
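
For instance, a tokenizer that keeps only runs of ASCII letters (a
hypothetical variation, not a KinoSearch default) could be built like so:

    my $letters_only_tokenizer = KinoSearch::Analysis::Tokenizer->new(
        token_re => qr/[a-zA-Z]+/,    # digits and apostrophes now split tokens
    );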

=head1 COPYRIGHT

Copyright 2005-2007 Marvin Humphrey

=head1 LICENSE, DISCLAIMER, BUGS, etc.

See L<KinoSearch> version 0.20.

=cut