use strict;
use warnings;
package KinoSearch::Analysis::Token;
use KinoSearch::Util::ToolSet;
use base qw( KinoSearch::Util::Obj );
# Register this class's named parameters together with their default values.
# The call sits inside BEGIN, so it executes while the file is being compiled.
# NOTE(review): init_instance_vars is inherited/imported from elsewhere;
# presumably it populates the "KinoSearch::Analysis::Token::instance_vars"
# hash that the XS constructor below names when building its argument hash --
# confirm against KinoSearch::Util::Obj.
BEGIN {
__PACKAGE__->init_instance_vars(
# The token's text; no default value.
text => undef,
# Where the token starts; no default value.
start_offset => undef,
# Where the token ends; no default value.
end_offset => undef,
# Per-token weight; defaults to 1.0.
boost => 1.0,
# Position increment; defaults to 1.
pos_inc => 1,
);
}
1;
__END__
__XS__
MODULE = KinoSearch PACKAGE = KinoSearch::Analysis::Token
kino_Token*
new(...)
CODE:
{
    /* Constructor.  Gathers the hash-style arguments on the Perl stack into
     * a single hash (keyed against the class's instance_vars), pulls each
     * attribute out of it, and hands the values to kino_Token_new(). */
    HV *const params = build_args_hash( &(ST(0)), 1, items,
        "KinoSearch::Analysis::Token::instance_vars");
    SV         *text_sv;
    STRLEN      text_len;
    char       *text_ptr;
    kino_u32_t  start_offset;
    kino_u32_t  end_offset;
    float       boost;
    kino_i32_t  pos_inc;

    /* SvPVutf8 yields the string's UTF-8 bytes and stores the byte length
     * in text_len. */
    text_sv      = extract_sv(params, SNL("text"));
    text_ptr     = SvPVutf8(text_sv, text_len);

    /* Remaining attributes, extracted in the same order as before. */
    start_offset = extract_uv(params, SNL("start_offset"));
    end_offset   = extract_uv(params, SNL("end_offset"));
    boost        = extract_nv(params, SNL("boost"));
    pos_inc      = extract_iv(params, SNL("pos_inc"));

    RETVAL = kino_Token_new(text_ptr, text_len, start_offset, end_offset,
        boost, pos_inc);
}
OUTPUT: RETVAL
void
_set_or_get(self, ...)
    kino_Token *self;
ALIAS:
    set_text         = 1
    get_text         = 2
    set_start_offset = 3
    get_start_offset = 4
    set_end_offset   = 5
    get_end_offset   = 6
    set_boost        = 7
    get_boost        = 8
    set_pos_inc      = 9
    get_pos_inc      = 10
PPCODE:
{
    /* Shared implementation for every accessor.  The ALIAS index selects the
     * attribute: odd values are setters (new value in ST(1)), even values are
     * getters (result placed in retval).  The switch header and footer come
     * from the START_/END_SET_OR_GET_SWITCH macros, which are defined outside
     * this file. */
    START_SET_OR_GET_SWITCH
    case 1:  {
                 STRLEN len;
                 char *str  = SvPVutf8( ST(1), len);
                 /* Copy the new text before releasing the old buffer, so
                  * that self->text never points at freed memory should
                  * SvPVutf8 croak (e.g. via magic or overloading). */
                 char *copy = kino_StrHelp_strndup(str, len);
                 free(self->text);
                 self->text = copy;
                 self->len  = len;
             }
             /* Setters must not fall through into the getter: doing so
              * would allocate a fresh SV for a value the setter does not
              * return.  All other setters below break the same way. */
             break;
    case 2:  retval = newSVpvn(self->text, self->len);
             /* The stored bytes are UTF-8; flag the SV accordingly. */
             SvUTF8_on(retval);
             break;
    case 3:  self->start_offset = SvUV( ST(1) );
             break;
    case 4:  retval = newSVuv(self->start_offset);
             break;
    case 5:  self->end_offset = SvUV( ST(1) );
             break;
    case 6:  retval = newSVuv(self->end_offset);
             break;
    case 7:  self->boost = SvNV( ST(1) );
             break;
    case 8:  retval = newSVnv(self->boost);
             break;
    case 9:  self->pos_inc = SvIV( ST(1) );
             break;
    case 10: retval = newSViv(self->pos_inc);
             break;
    END_SET_OR_GET_SWITCH
}
__POD__
=head1 NAME
KinoSearch::Analysis::Token - Unit of text.
=head1 SYNOPSIS
my $token = KinoSearch::Analysis::Token->new(
text => 'horses',
start_offset => 0,
end_offset => 6,
);
$token->set_text('hors');
=head1 DESCRIPTION
Token is the fundamental unit used by KinoSearch's Analyzer subclasses. Each
Token has 5 attributes:
=over
=item *
B<text> - a UTF-8 string.
=item *
B<start_offset> - The start point of the token text, measured in UTF-8
characters from the top of the stored field. C<start_offset> and C<end_offset>
locate the Token within a larger context, even if the Token's text attribute
gets modified -- by stemming, for instance. The Token for "beating" in the
text "beating a dead horse" begins life with a start_offset of 0 and an
end_offset of 7; after stemming, the text is "beat", but the start_offset is
still 0 and the end_offset is still 7. This allows "beating" to be
highlighted correctly after a search matches "beat".
=item *
B<end_offset> - The end of the token text, measured in UTF-8 characters from
the top of the field.
=item *
B<boost> - a per-token weight. Use this when you want to assign more or less
importance to a particular token, as you might for emboldened text within an
HTML document, for example. (Note: The field this token belongs to must be
spec'd to C<store_pos_boost>.)
=item *
B<pos_inc> - POSition INCrement, measured in Tokens. This attribute, which
defaults to 1, is an advanced tool for manipulating phrase matching.
Ordinarily, Tokens are assigned consecutive position numbers: 0, 1, and 2 for
"three blind mice". However, if you set the position increment for "blind"
to, say, 1000, then the three tokens will end up assigned to positions 0, 1,
and 1001 -- and will no longer produce a phrase match for the query '"three
blind mice"'.
=back
=head1 METHODS
=head2 new
my $token = KinoSearch::Analysis::Token->new(
text => $text, # required
start_offset => 0, # required
end_offset => length($text), # required
boost => 100.0, # default 1.0
pos_inc => 0, # default 1
);
Constructor. Takes hash-style parameters, corresponding to the token's
attributes.
=head2 Accessors
Token provides these set/get methods:
=over 4
=item set_text
=item get_text
=item set_start_offset
=item get_start_offset
=item set_end_offset
=item get_end_offset
=item set_boost
=item get_boost
=item set_pos_inc
=item get_pos_inc
=back
=head1 COPYRIGHT
Copyright 2006-2007 Marvin Humphrey
=head1 LICENSE, DISCLAIMER, BUGS, etc.
See L<KinoSearch> version 0.20.
=cut