package KinoSearch::Analysis::Token;
1;
__END__
__H__
#ifndef H_KINOSEARCH_ANALYSIS_TOKEN
#define H_KINOSEARCH_ANALYSIS_TOKEN 1
#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"
#include "KinoSearchUtilMemManager.h"
/* One unit of analyzed text plus the data locating it in a larger text.
 * Instances are created by Kino_Token_new and released by
 * Kino_Token_destroy.
 */
typedef struct token {
    char         *text;          /* token text; set from Kino_savepvn() in Kino_Token_new */
    STRLEN        len;           /* length supplied together with text */
    I32           start_offset;  /* where the token starts within the larger text */
    I32           end_offset;    /* where the token ends within the larger text */
    I32           pos_inc;       /* position increment */
    struct token *next;          /* link to another token; NULL after Kino_Token_new */
    struct token *prev;          /* link to another token; NULL after Kino_Token_new */
} Token;
/* Allocate a new Token.  The text field is set from Kino_savepvn(text, len)
 * (presumably a private copy of the buffer -- confirm against
 * KinoSearchUtilMemManager.h); len, start_offset, end_offset and pos_inc are
 * stored as given, and the next/prev links start out NULL.  Release the
 * result with Kino_Token_destroy.
 */
Token* Kino_Token_new(char* text, STRLEN len, I32 start_offset,
I32 end_offset, I32 pos_inc);
/* Free a Token's text and then the Token itself. */
void Kino_Token_destroy(Token*);
#endif /* include guard */
__C__
#include "KinoSearchAnalysisToken.h"
/* Construct a Token.
 *
 * text, len     - passed to Kino_savepvn(); the result becomes token->text
 *                 and is later freed by Kino_Token_destroy.
 * start_offset,
 * end_offset    - stored unchanged.
 * pos_inc       - position increment, stored unchanged.
 *
 * Returns the new Token with both list links set to NULL.
 */
Token*
Kino_Token_new(char* text, STRLEN len, I32 start_offset, I32 end_offset,
               I32 pos_inc) {
    Token *self;

    Kino_New(0, self, 1, Token);

    /* a fresh token is not linked to any neighbours */
    self->prev = NULL;
    self->next = NULL;

    /* positional data is taken verbatim from the caller */
    self->pos_inc      = pos_inc;
    self->end_offset   = end_offset;
    self->start_offset = start_offset;

    /* the token keeps what Kino_savepvn() hands back, along with the length */
    self->len  = len;
    self->text = Kino_savepvn(text, len);

    return self;
}
/* Release a Token: frees token->text, then the Token struct itself.
 *
 * Passing NULL is a no-op, mirroring free(NULL), so cleanup paths can call
 * this unconditionally.  The next/prev links are not followed; only this one
 * Token is released.
 */
void
Kino_Token_destroy(Token *token) {
    /* without this guard, token->text below would dereference NULL */
    if (token == NULL) {
        return;
    }
    Kino_Safefree(token->text);
    Kino_Safefree(token);
}
__POD__
=head1 NAME
KinoSearch::Analysis::Token - unit of text
=head1 SYNOPSIS
# private class - no public API
=head1 PRIVATE CLASS
You can't actually instantiate a Token object at the Perl level -- however,
you can affect individual Tokens within a TokenBatch by way of TokenBatch's
(experimental) API.
=head1 DESCRIPTION
Token is the fundamental unit used by KinoSearch's Analyzer subclasses. Each
Token has 4 attributes: text, start_offset, end_offset, and pos_inc (for
position increment).
The text of a token is a string.
A Token's start_offset and end_offset locate it within a larger text, even if
the Token's text attribute gets modified -- by stemming, for instance. The
Token for "beating" in the text "beating a dead horse" begins life with a
start_offset of 0 and an end_offset of 7; after stemming, the text is "beat",
but the end_offset is still 7.
The position increment, which defaults to 1, is an advanced tool for
manipulating phrase matching. Ordinarily, Tokens are assigned consecutive
position numbers: 0, 1, and 2 for "three blind mice". However, if you set the
position increment for "blind" to, say, 1000, then the three tokens will end
up assigned to positions 0, 1, and 1001 -- and will no longer produce a phrase
match for the query '"three blind mice"'.
=head1 COPYRIGHT
Copyright 2006 Marvin Humphrey
=head1 LICENSE, DISCLAIMER, BUGS, etc.
See L<KinoSearch|KinoSearch> version 0.12.
=cut