package KinoSearch::Store::InStream; use base qw( KinoSearch::Util::CClass ); use strict; use warnings; use KinoSearch::Util::ToolSet; sub close { CORE::close shift->get_fh } =for comment Dupe the filehandle and create a new object around the dupe. Seek the dupe to the same spot as the original. =cut sub clone_stream { my $self = shift; open( my $duped_fh, '<&=', $self->get_fh ) or confess("Couldn't dupe filehandle: $!"); my $evil_twin = __PACKAGE__->new( $duped_fh, $self->get_offset, $self->length, ); $evil_twin->seek( $self->tell ); return $evil_twin; } 1; __END__ __XS__ MODULE = KinoSearch PACKAGE = KinoSearch::Store::InStream =begin comment my $instream = KinoSearch::Store::Instream->new( $filehandle, $offset, $length ); Constructor. Takes 1-3 arguments, and unlike most classes in the KinoSearch suite, the arguments to the constructor are not labeled parameters. The second argument, an offset, defaults to 0 if not supplied. Non-zero offsets get factored in when calling seek and tell. The last argument, a length, is the length of the "file" in bytes. Supplying an explicit value is only essential for InStreams which are assigned to read a portion of a compound file -- otherwise, the length gets auto-calculated correctly. =end comment =cut InStream* new(class, fh_sv, ...) char *class; SV *fh_sv; PREINIT: double offset = 0; double len = -1; CODE: if (items > 2) { SV* offset_sv; offset_sv = ST(2); if (SvOK(offset_sv)) offset = SvNV(offset_sv); } if (items > 3) { SV *len_sv; len_sv = ST(3); if (SvOK(len_sv)) len = SvNV(len_sv); } RETVAL = Kino_InStream_new(class, fh_sv, offset, len); OUTPUT: RETVAL =for comment Seek to target plus the object's start offset. =cut void seek(instream, target) InStream *instream; double target; PPCODE: instream->seek(instream, target); =for comment Return the filehandle's position minus the offset. =cut double tell(instream) InStream *instream; CODE: RETVAL = instream->tell(instream); OUTPUT: RETVAL =for comment Return the length of the "file" in bytes, factoring in the offset. =cut double length(instream) InStream *instream; CODE: RETVAL = instream->len; OUTPUT: RETVAL =begin comment @items = $instream->lu_read( TEMPLATE ); Read the items specified by TEMPLATE from the InStream. =end comment =cut SV* _set_or_get(instream, ...) InStream *instream; ALIAS: set_len = 1 get_len = 2 set_offset = 3 get_offset = 4 set_fh = 5 get_fh = 6 CODE: { /* if called as a setter, make sure the extra arg is there */ if (ix % 2 == 1 && items != 2) Kino_confess("usage: $term_info->set_xxxxxx($val)"); switch (ix) { case 1: instream->len = SvNV( ST(1) ); /* fall through */ case 2: RETVAL = newSVnv(instream->len); break; case 3: instream->offset = SvNV( ST(1) ); /* fall through */ case 4: RETVAL = newSVnv(instream->offset); break; case 5: Kino_confess("Can't set_fh"); /* fall through */ case 6: RETVAL = newSVsv(instream->fh_sv); break; } } OUTPUT: RETVAL void lu_read (instream, template_sv) InStream *instream; SV *template_sv PREINIT: STRLEN tpt_len; /* bytelength of template */ char *template; /* ptr to a spot in the template */ char *tpt_end; /* ptr to the end of the template */ int repeat_count; /* number of times to repeat sym */ char sym; /* the current symbol in the template */ char countsym; /* used when calculating repeat counts */ IV aIV; SV *aSV; char aChar; char* string; STRLEN len; PPCODE: { /* prepare template string pointers */ template = SvPV(template_sv, tpt_len); tpt_end = SvEND(template_sv); repeat_count = 0; while (1) { if (repeat_count == 0) { /* fast-forward past space characters */ while (*template == ' ' && template < tpt_end) { template++; } /* break out of the loop if we've exhausted the template */ if (template == tpt_end) { break; } /* derive the current symbol and a possible digit repeat sym */ sym = *template++; countsym = *template; if (template == tpt_end) { /* sym is last char in template, so process once */ repeat_count = 1; } else if (countsym >= '0' && countsym <= '9') { /* calculate numerical repeat count */ repeat_count = countsym - KINO_NUM_CHAR_OFFSET; countsym = *(++template); while ( template <= tpt_end && countsym >= '0' && countsym <= '9' ) { repeat_count = (repeat_count * 10) + (countsym - KINO_NUM_CHAR_OFFSET); countsym = *(++template); } } else { /* no numeric repeat count, so process sym only once */ repeat_count = 1; } } /* thwart potential infinite loop */ if (repeat_count < 1) Kino_confess( "invalid repeat_count: %d", repeat_count); switch(sym) { case 'a': /* arbitrary binary data */ len = repeat_count; repeat_count = 1; aSV = newSV(len + 1); SvCUR_set(aSV, len); SvPOK_on(aSV); string = SvPVX(aSV); instream->read_bytes(instream, string, len); break; case 'b': /* signed byte */ case 'B': /* unsigned byte */ aChar = instream->read_byte(instream); if (sym == 'b') aIV = aChar; else aIV = (unsigned char)aChar; aSV = newSViv(aIV); break; case 'i': /* signed 32-bit integer */ aSV = newSViv( (I32)instream->read_int(instream) ); break; case 'I': /* unsigned 32-bit integer */ aSV = newSVuv( instream->read_int(instream) ); break; case 'Q': /* unsigned "64-bit integer" */ aSV = newSVnv( instream->read_long(instream) ); break; case 'T': /* string */ len = instream->read_vint(instream); aSV = newSV(len + 1); SvCUR_set(aSV, len); SvPOK_on(aSV); string = SvPVX(aSV); instream->read_chars(instream, string, 0, len); break; case 'V': /* VInt */ aSV = newSVuv( instream->read_vint(instream) ); break; case 'W': /* VLong */ aSV = newSVnv( instream->read_vlong(instream) ); break; default: Kino_confess("Invalid type in template: '%c'", sym); } /* Put a scalar on the stack, use up one symbol or repeater */ XPUSHs( sv_2mortal(aSV) ); repeat_count -= 1; } } void DESTROY(instream) InStream *instream; PPCODE: Kino_InStream_destroy(instream); __H__ #ifndef H_KINOSEARCH_STORE_INSTREAM #define H_KINOSEARCH_STORE_INSTREAM 1 #include "EXTERN.h" #include "perl.h" #include "XSUB.h" #include "KinoSearchUtilCarp.h" #include "KinoSearchUtilMathUtils.h" /* Detect whether we're on an ASCII or EBCDIC machine. */ #if '0' == 240 #define KINO_NUM_CHAR_OFFSET 240 #else #define KINO_NUM_CHAR_OFFSET 48 #endif #define KINO_IO_STREAM_BUF_SIZE 1024 typedef struct instream { PerlIO *fh; SV *fh_sv; double offset; double len; char *buf; Off_t buf_start; /* file position of start of buffer */ int buf_len; /* number of valid bytes in the buffer */ int buf_pos; /* next byte to read */ void (*seek)(struct instream*, double); double (*tell)(struct instream*); char (*read_byte)(struct instream*); void (*read_bytes)(struct instream*, char*, STRLEN); void (*read_chars)(struct instream*, char*, STRLEN, STRLEN); U32 (*read_int)(struct instream*); double (*read_long)(struct instream*); U32 (*read_vint)(struct instream*); double (*read_vlong)(struct instream*); } InStream; InStream* Kino_InStream_new (char*, SV*, U32, double); void Kino_InStream_seek (InStream*, double); double Kino_InStream_tell (InStream*); void Kino_InStream_refill (InStream*); char Kino_InStream_read_byte (InStream*); void Kino_InStream_read_bytes (InStream*, char*, STRLEN); void Kino_InStream_read_chars (InStream*, char*, STRLEN, STRLEN); U32 Kino_InStream_read_int (InStream*); double Kino_InStream_read_long (InStream*); U32 Kino_InStream_decode_vint(char**); U32 Kino_InStream_read_vint (InStream*); double Kino_InStream_read_vlong (InStream*); void Kino_InStream_destroy (InStream*); #endif /* include guard */ __C__ #include "KinoSearchStoreInStream.h" InStream* Kino_InStream_new(char *class, SV *fh_sv, U32 offset, double len ) { InStream *instream; /* allocate */ Kino_New(0, instream, 1, InStream); /* assign */ instream->fh_sv = newSVsv(fh_sv); instream->fh = IoIFP( sv_2io(fh_sv) ); instream->offset = offset; /* init buffer */ instream->buf = NULL; instream->buf_start = 0; instream->buf_len = 0; instream->buf_pos = 0; /* seek */ if (offset != 0) { PerlIO_seek(instream->fh, offset, 0); } /* calculate len if an (intentionally) invalid value was supplied */ if (len < 0.0) { double bookmark = PerlIO_tell(instream->fh); PerlIO_seek(instream->fh, 0, 2); len = PerlIO_tell(instream->fh); PerlIO_seek(instream->fh, bookmark, 0); } instream->len = len; /* assign methods */ instream->seek = Kino_InStream_seek; instream->tell = Kino_InStream_tell; instream->read_byte = Kino_InStream_read_byte; instream->read_bytes = Kino_InStream_read_bytes; instream->read_chars = Kino_InStream_read_chars; instream->read_int = Kino_InStream_read_int; instream->read_long = Kino_InStream_read_long; instream->read_vint = Kino_InStream_read_vint; instream->read_vlong = Kino_InStream_read_vlong; return instream; } void Kino_InStream_seek(InStream *instream, double target) { /* seek within buffer if possible */ if ( (target >= instream->buf_start) && (target < (instream->buf_start + instream->buf_pos)) ) { instream->buf_pos = target - instream->buf_start; } /* nope, not possible, so seek within file and prepare to refill */ else { instream->buf_start = target; instream->buf_pos = 0; instream->buf_len = 0; PerlIO_seek(instream->fh, target + instream->offset, 0); } } double Kino_InStream_tell(InStream *instream) { return instream->buf_start + instream->buf_pos; } void Kino_InStream_refill(InStream *instream) { int check_val; /* wait to allocate buffer until it's needed */ if (instream->buf == NULL) Kino_New(0, instream->buf, KINO_IO_STREAM_BUF_SIZE, char); /* add bytes read to file position, reset */ instream->buf_start += instream->buf_pos; instream->buf_pos = 0; /* calculate the number of bytes to read */ if (KINO_IO_STREAM_BUF_SIZE < instream->len - instream->buf_start) instream->buf_len = KINO_IO_STREAM_BUF_SIZE; else instream->buf_len = instream->len - instream->buf_start; /* perform the file operations */ PerlIO_seek(instream->fh, 0, 1); check_val = PerlIO_seek(instream->fh, (instream->buf_start + instream->offset), 0); if (check_val == -1) Kino_confess("refill: PerlIO_seek failed: %d", errno); check_val = PerlIO_read(instream->fh, instream->buf, instream->buf_len); if (check_val != instream->buf_len) Kino_confess("refill: tried to read %d bytes, got %d: %d", instream->buf_len, check_val, errno); } char Kino_InStream_read_byte(InStream *instream) { if (instream->buf_pos >= instream->buf_len) Kino_InStream_refill(instream); return instream->buf[ instream->buf_pos++ ]; } void Kino_InStream_read_bytes (InStream *instream, char* buf, STRLEN len) { if (instream->buf_pos + len < instream->buf_len) { /* request is entirely within buffer, so copy */ Copy((instream->buf + instream->buf_pos), buf, len, char); instream->buf_pos += len; } else { /* get the request from the file and reset buffer */ int check_val; Off_t start; start = instream->tell(instream); check_val = PerlIO_seek(instream->fh, (start + instream->offset), 0); if (check_val == -1) Kino_confess("read_bytes: PerlIO_seek failed: %d", errno ); check_val = PerlIO_read(instream->fh, buf, len); if (check_val < len) Kino_confess("read_bytes: tried to read %"UVuf" bytes, got %d", (UV)len, check_val); /* reset vars and refill if there's more in the file */ instream->buf_start = start + len; instream->buf_pos = 0; instream->buf_len = 0; if (instream->buf_start < instream->len) Kino_InStream_refill(instream); } } /* This is just a wrapper for read_bytes, but that may change. It should * be used whenever Lucene character data is being read, typically after * read_vint as part of a String read. If and when a change does come, it will * be a lot easier to track down all the relevant code fragments if read_chars * gets used consistently. */ void Kino_InStream_read_chars(InStream *instream, char *buf, STRLEN start, STRLEN len) { buf += start; instream->read_bytes(instream, buf, len); } U32 Kino_InStream_read_int (InStream *instream) { unsigned char buf[4]; instream->read_bytes(instream, (char*)buf, 4); return Kino_decode_bigend_U32(buf); } double Kino_InStream_read_long (InStream *instream) { unsigned char buf[8]; double aDouble; /* get 8 bytes from the stream */ instream->read_bytes(instream, (char*)buf, 8); /* get high 4 bytes, multiply by 2**32 */ aDouble = Kino_decode_bigend_U32(buf); aDouble = aDouble * pow(2.0, 32.0); /* decode low four bytes as unsigned int and add to total */ aDouble += Kino_decode_bigend_U32(&buf[4]); return aDouble; } /* read in a Variable INTeger, stored in 1-5 bytes */ U32 Kino_InStream_read_vint (InStream *instream) { unsigned char aUChar; int bitshift; U32 aU32; /* start by reading one byte; use the lower 7 bits */ aUChar = (unsigned char)instream->read_byte(instream); aU32 = aUChar & 0x7f; /* keep reading and shifting as long as the high bit is set */ for (bitshift = 7; (aUChar & 0x80) != 0; bitshift += 7) { aUChar = (unsigned char)instream->read_byte(instream); aU32 |= (aUChar & 0x7f) << bitshift; } return aU32; } U32 Kino_InStream_decode_vint(char **source_ptr) { char *source; int bitshift; U32 aU32; source = *source_ptr; aU32 = (unsigned char)*source & 0x7f; for (bitshift = 7; (*source & 0x80) != 0; bitshift += 7) { source++; aU32 |= ((unsigned char)*source & 0x7f) << bitshift; } source++; *source_ptr = source; return aU32; } double Kino_InStream_read_vlong (InStream *instream) { unsigned char aUChar; int bitshift; double aDouble; aUChar = (unsigned char)instream->read_byte(instream); aDouble = aUChar & 0x7f; for (bitshift = 7; (aUChar & 0x80) != 0; bitshift += 7) { aUChar = (unsigned char)instream->read_byte(instream); aDouble += (aUChar & 0x7f) * pow(2, bitshift); } return aDouble; } void Kino_InStream_destroy(InStream* instream) { SvREFCNT_dec(instream->fh_sv); Kino_Safefree(instream->buf); Kino_Safefree(instream); } __POD__ =begin devdocs =head1 NAME KinoSearch::Store::InStream - filehandles for reading invindexes =head1 SYNOPSIS # isa blessed filehandle my $instream = $invindex->open_instream( $filehandle, $offset, $length ); my @ten_vints = $instream->lu_read('V10'); =head1 DESCRIPTION The InStream class abstracts out all input operations to KinoSearch. InStream is implemented as a inside-out object around a blessed filehandle. It would almost be possible to use an ordinary filehandle, but the objectification is necessary because InStreams have to be capable of pretending that they are acting upon a distinct file when in reality they may be reading only a portion of a compound file. For the template used by lu_read, see InStream's companion, L. =head1 COPYRIGHT Copyright 2005-2006 Marvin Humphrey =head1 LICENSE, DISCLAIMER, BUGS, etc. See L version 0.09. =end devdocs =cut