KinoSearch::Store::InStream - filehandles for reading invindexes


InStream*
new(class, fh_sv, ...)
    char *class;
    SV   *fh_sv;
PREINIT:
    double offset = 0;   /* used when no defined third argument is given */
    double len    = -1;  /* used when no defined fourth argument is given */
CODE:
    /* Constructor.  The third and fourth arguments are optional and
     * supply offset and len respectively; an argument that is absent or
     * undefined leaves the default above in place.  Everything is then
     * handed to Kino_InStream_new. */
    if (items > 2 && SvOK( ST(2) ))
        offset = SvNV( ST(2) );
    if (items > 3 && SvOK( ST(3) ))
        len = SvNV( ST(3) );
    RETVAL = Kino_InStream_new(class, fh_sv, offset, len);
OUTPUT: RETVAL

SV*
_set_or_get(instream, ...)
    InStream *instream;
ALIAS:
    set_len    = 1
    get_len    = 2
    set_offset = 3
    get_offset = 4
    set_fh     = 5
    get_fh     = 6
CODE:
{
    /* Shared body for all of the aliased accessors.  The two KINO_*
     * macros bracket the case labels; they are defined outside this file.
     * Each setter stores ST(1) and then returns the stored value, exactly
     * as the matching getter would. */
    KINO_START_SET_OR_GET_SWITCH

    case 1:  /* set_len */
             instream->len = SvNV( ST(1) );
             RETVAL = newSVnv(instream->len);
             break;
    case 2:  /* get_len */
             RETVAL = newSVnv(instream->len);
             break;

    case 3:  /* set_offset */
             instream->offset = SvNV( ST(1) );
             RETVAL = newSVnv(instream->offset);
             break;
    case 4:  /* get_offset */
             RETVAL = newSVnv(instream->offset);
             break;

    case 5:  /* set_fh is not supported */
             Kino_confess("Can't set_fh");
             /* fall through */
    case 6:  /* get_fh: hand back a copy of the stored filehandle scalar */
             RETVAL = newSVsv(instream->fh_sv);
             break;

    KINO_END_SET_OR_GET_SWITCH
}
OUTPUT: RETVAL

void
lu_read (instream, template_sv)
    InStream *instream;
    SV       *template_sv;
PREINIT:
    STRLEN  tpt_len;      /* bytelength of template */
    char   *template;     /* ptr to a spot in the template */
    char   *tpt_end;      /* ptr to the end of the template */
    int     repeat_count; /* number of times to repeat sym */
    char    sym = '\0';   /* the current symbol in the template */
    char    countsym;     /* used when calculating repeat counts */
    IV      aIV;
    SV     *aSV;
    char    aChar;
    char   *string;
    STRLEN  len;
PPCODE:
{
    /* Read items from the stream as directed by a template string and
     * push one mortal scalar per item onto the Perl stack.
     *
     * The template is a sequence of one-character symbols, each optionally
     * followed by a decimal count; spaces before a symbol are skipped.
     *   a     - arbitrary binary data; the count is the byte length
     *   b, B  - signed / unsigned byte
     *   i, I  - signed / unsigned 32-bit integer
     *   Q     - unsigned "64-bit integer", returned as an NV
     *   T     - string: a VInt length followed by that many bytes
     *   V, W  - VInt / VLong
     * For every symbol other than 'a' the count is a repeat count.
     * An unknown symbol or a count below 1 is fatal (Kino_confess).
     */

    /* prepare template string pointers */
    template = SvPV(template_sv, tpt_len);
    tpt_end  = SvEND(template_sv);

    repeat_count = 0;
    while (1) {
        if (repeat_count == 0) {
            /* fast-forward past space characters; the bound is tested
             * first so the byte at tpt_end is never examined */
            while (template < tpt_end && *template == ' ') {
                template++;
            }
            /* break out of the loop if we've exhausted the template */
            if (template == tpt_end) {
                break;
            }

            /* derive the current symbol and a possible digit repeat sym */
            sym = *template++;
            if (template == tpt_end) {
                /* sym is last char in template, so process once */
                repeat_count = 1;
            }
            else {
                countsym = *template;
                if (countsym >= '0' && countsym <= '9') {
                    /* calculate numerical repeat count, never reading at
                     * or beyond tpt_end */
                    repeat_count = countsym - KINO_NUM_CHAR_OFFSET;
                    template++;
                    while (template < tpt_end) {
                        countsym = *template;
                        if (countsym < '0' || countsym > '9')
                            break;
                        repeat_count = (repeat_count * 10)
                            + (countsym - KINO_NUM_CHAR_OFFSET);
                        template++;
                    }
                }
                else { /* no numeric repeat count, so process sym once */
                    repeat_count = 1;
                }
            }
        }
        /* thwart potential infinite loop */
        if (repeat_count < 1)
            Kino_confess( "invalid repeat_count: %d", repeat_count);

        switch(sym) {
        case 'a': /* arbitrary binary data */
            /* here the count is a byte length, not a repetition */
            len = repeat_count;
            repeat_count = 1;
            aSV = newSV(len + 1);
            SvCUR_set(aSV, len);
            SvPOK_on(aSV);
            string = SvPVX(aSV);
            instream->read_bytes(instream, string, len);
            /* Perl expects string buffers to be NUL-terminated */
            *SvEND(aSV) = '\0';
            break;
        case 'b': /* signed byte */
        case 'B': /* unsigned byte */
            aChar = instream->read_byte(instream);
            /* plain char has implementation-defined signedness, so be
             * explicit in both directions */
            if (sym == 'b')
                aIV = (signed char)aChar;
            else
                aIV = (unsigned char)aChar;
            aSV = newSViv(aIV);
            break;
        case 'i': /* signed 32-bit integer */
            aSV = newSViv( (I32)instream->read_int(instream) );
            break;

        case 'I': /* unsigned 32-bit integer */
            aSV = newSVuv( instream->read_int(instream) );
            break;
        case 'Q': /* unsigned "64-bit integer" */
            aSV = newSVnv( instream->read_long(instream) );
            break;
        case 'T': /* string */
            len = instream->read_vint(instream);
            aSV = newSV(len + 1);
            SvCUR_set(aSV, len);
            SvPOK_on(aSV);
            string = SvPVX(aSV);
            instream->read_chars(instream, string, 0, len);
            /* Perl expects string buffers to be NUL-terminated */
            *SvEND(aSV) = '\0';
            break;
        case 'V': /* VInt */
            aSV = newSVuv( instream->read_vint(instream) );
            break;
        case 'W': /* VLong */
            aSV = newSVnv( instream->read_vlong(instream) );
            break;
        default:
            aSV = NULL; /* suppress unused var compiler warning */
            Kino_confess("Invalid type in template: '%c'", sym);
        }
        /* Put a scalar on the stack, use up one symbol or repeater */
        XPUSHs( sv_2mortal(aSV) );
        repeat_count -= 1;
    }
}

void
DESTROY(instream)
    InStream *instream;
PPCODE:
    /* Hand the struct to the C-level destructor; nothing is returned. */
    Kino_InStream_destroy(instream);

__H__

#ifndef H_KINOSEARCH_STORE_INSTREAM #define H_KINOSEARCH_STORE_INSTREAM 1

#include ``EXTERN.h'' #include ``perl.h'' #include ``XSUB.h'' #include ``KinoSearchUtilCarp.h'' #include ``KinoSearchUtilMathUtils.h''

/* Detect whether we're on an ASCII or EBCDIC machine. */ #if '0' == 240 #define KINO_NUM_CHAR_OFFSET 240 #else #define KINO_NUM_CHAR_OFFSET 48 #endif

#define KINO_IO_STREAM_BUF_SIZE 1024

/* State and method table for one buffered input stream.  Stream
 * positions are relative to `offset` within the underlying filehandle. */
typedef struct instream {
    PerlIO *fh;         /* handle obtained from fh_sv by Kino_InStream_new */
    SV     *fh_sv;      /* the stream's own copy of the filehandle scalar */
    double  offset;     /* added to stream positions before seeking in fh */
    double  len;        /* stream length; refill reads no further than this */
    char   *buf;        /* read buffer, allocated on the first refill */
    Off_t   buf_start;  /* file position of start of buffer */
    int     buf_len;    /* number of valid bytes in the buffer */
    int     buf_pos;    /* next byte to read */

    /* methods, assigned by Kino_InStream_new */
    void   (*seek)      (struct instream*, double);
    double (*tell)      (struct instream*);
    char   (*read_byte) (struct instream*);
    void   (*read_bytes)(struct instream*, char*, STRLEN);
    void   (*read_chars)(struct instream*, char*, STRLEN, STRLEN);
    U32    (*read_int)  (struct instream*);
    double (*read_long) (struct instream*);
    U32    (*read_vint) (struct instream*);
    double (*read_vlong)(struct instream*);
} InStream;

InStream* Kino_InStream_new (char*, SV*, double, double); void Kino_InStream_seek (InStream*, double); double Kino_InStream_tell (InStream*); void Kino_InStream_refill (InStream*); char Kino_InStream_read_byte (InStream*); void Kino_InStream_read_bytes (InStream*, char*, STRLEN); void Kino_InStream_read_chars (InStream*, char*, STRLEN, STRLEN); U32 Kino_InStream_read_int (InStream*); double Kino_InStream_read_long (InStream*); U32 Kino_InStream_decode_vint(char**); U32 Kino_InStream_read_vint (InStream*); double Kino_InStream_read_vlong (InStream*); void Kino_InStream_destroy (InStream*);

#endif /* include guard */

__C__

#include ``KinoSearchStoreInStream.h''

/* Allocate and initialise an InStream around the filehandle in fh_sv.
 *
 * class   - not used by this function
 * fh_sv   - Perl filehandle scalar; the stream keeps its own copy
 * offset  - stored in the stream; when non-zero the handle is first
 *           positioned there
 * len     - stream length; a negative value means "work it out", in
 *           which case the position at the end of the handle is used
 *
 * Returns the new stream, which the caller releases with
 * Kino_InStream_destroy.
 *
 * NOTE(review): the computed len is the absolute end position of the
 * handle and does not subtract offset -- confirm that no caller combines
 * a non-zero offset with a negative len.
 */
InStream*
Kino_InStream_new(char *class, SV *fh_sv, double offset, double len ) {
    InStream *instream;
    PerlIO   *fh;

    Kino_New(0, instream, 1, InStream);

    /* filehandle and position within it */
    instream->fh_sv  = newSVsv(fh_sv);
    fh               = IoIFP( sv_2io(fh_sv) );
    instream->fh     = fh;
    instream->offset = offset;

    /* the buffer is created lazily, so start out empty */
    instream->buf       = NULL;
    instream->buf_start = 0;
    instream->buf_len   = 0;
    instream->buf_pos   = 0;

    if (offset != 0) {
        PerlIO_seek(fh, offset, SEEK_SET);
    }

    /* a negative len is a request to measure the handle: remember where
     * we are, jump to the end, note the position, jump back */
    if (len < 0.0) {
        double bookmark = PerlIO_tell(fh);
        PerlIO_seek(fh, 0, SEEK_END);
        len = PerlIO_tell(fh);
        PerlIO_seek(fh, bookmark, SEEK_SET);
    }
    instream->len = len;

    /* method table */
    instream->seek       = Kino_InStream_seek;
    instream->tell       = Kino_InStream_tell;
    instream->read_byte  = Kino_InStream_read_byte;
    instream->read_bytes = Kino_InStream_read_bytes;
    instream->read_chars = Kino_InStream_read_chars;
    instream->read_int   = Kino_InStream_read_int;
    instream->read_long  = Kino_InStream_read_long;
    instream->read_vint  = Kino_InStream_read_vint;
    instream->read_vlong = Kino_InStream_read_vlong;

    return instream;
}

/* Move the read position to `target`, a position relative to the start of
 * the stream (the stream's offset is added when the file itself is
 * repositioned).
 *
 * If the target lies anywhere inside the bytes currently held in the
 * buffer, only buf_pos is adjusted.  The upper bound is the number of
 * valid buffered bytes (buf_len), so forward seeks inside already-buffered
 * data do not discard the buffer.
 */
void
Kino_InStream_seek(InStream *instream, double target) {
    /* seek within buffer if possible */
    if (   (target >= instream->buf_start)
        && (target <  (instream->buf_start + instream->buf_len))
    ) {
        instream->buf_pos = target - instream->buf_start;
    }
    /* nope, not possible, so seek within file and prepare to refill */
    else {
        instream->buf_start = target;
        instream->buf_pos   = 0;
        instream->buf_len   = 0;
        PerlIO_seek(instream->fh, target + instream->offset, SEEK_SET);
    }
}

/* Report the current read position: the file position of the start of
 * the buffer plus the index of the next byte to read within it. */
double
Kino_InStream_tell(InStream *instream) {
    Off_t position = instream->buf_start + instream->buf_pos;
    return position;
}

/* Discard the consumed part of the buffer and load the next stretch of
 * the stream, up to KINO_IO_STREAM_BUF_SIZE bytes and never beyond
 * instream->len.
 *
 * Fatal (Kino_confess) if the read position is already beyond the end of
 * the stream, if the seek fails, or if fewer bytes than requested arrive.
 */
void
Kino_InStream_refill(InStream *instream) {
    int    check_val;
    double remaining;

    /* wait to allocate buffer until it's needed */
    if (instream->buf == NULL)
        Kino_New(0, instream->buf, KINO_IO_STREAM_BUF_SIZE, char);

    /* add bytes read to file position, reset */
    instream->buf_start += instream->buf_pos;
    instream->buf_pos = 0;

    /* calculate the number of bytes to read */
    remaining = instream->len - instream->buf_start;
    if (remaining < 0) {
        /* A negative count would be converted to a huge unsigned size by
         * PerlIO_read and could overrun buf, so refuse outright. */
        instream->buf_len = 0;
        Kino_confess("refill: attempt to read past end of stream");
    }
    if (KINO_IO_STREAM_BUF_SIZE < remaining)
        instream->buf_len = KINO_IO_STREAM_BUF_SIZE;
    else
        instream->buf_len = remaining;
    /* NOTE(review): remaining == 0 still performs a zero-byte read, as
     * before; callers that then consume a byte get stale buffer data. */

    /* perform the file operations */
    /* zero-distance relative seek kept from the original code; its purpose
     * is not evident from this file -- TODO confirm before removing */
    PerlIO_seek(instream->fh, 0, SEEK_CUR);
    check_val = PerlIO_seek(instream->fh,
        (instream->buf_start + instream->offset), SEEK_SET);
    if (check_val == -1)
        Kino_confess("refill: PerlIO_seek failed: %d", errno);
    check_val = PerlIO_read(instream->fh, instream->buf, instream->buf_len);
    if (check_val != instream->buf_len)
        Kino_confess("refill: tried to read %d bytes, got %d: %d",
            instream->buf_len, check_val, errno);
}

/* Return the next byte of the stream, refilling the buffer first when
 * every buffered byte has already been consumed. */
char
Kino_InStream_read_byte(InStream *instream) {
    char next_byte;

    if (instream->buf_pos >= instream->buf_len) {
        Kino_InStream_refill(instream);
    }
    next_byte = instream->buf[ instream->buf_pos ];
    instream->buf_pos++;
    return next_byte;
}

/* Copy the next `len` bytes of the stream into `buf`, which must have
 * room for them.
 *
 * A request that ends before the end of the buffered data is served from
 * the buffer.  Anything else is read straight from the filehandle into
 * `buf`, after which the internal buffer is emptied and, if the stream
 * has more data, refilled from the new position.
 *
 * Fatal (Kino_confess) if the seek fails, or if the read fails or comes
 * up short.
 */
void
Kino_InStream_read_bytes (InStream *instream, char* buf, STRLEN len) {
    if (instream->buf_pos + len < instream->buf_len) {
        /* request is entirely within buffer, so copy */
        Copy((instream->buf + instream->buf_pos), buf, len, char);
        instream->buf_pos += len;
    }
    else {
        /* get the request from the file and reset buffer */
        int   check_val;
        Off_t start;

        start = instream->tell(instream);
        check_val = PerlIO_seek(instream->fh, (start + instream->offset),
            SEEK_SET);
        if (check_val == -1)
            Kino_confess("read_bytes: PerlIO_seek failed: %d", errno );
        check_val = PerlIO_read(instream->fh, buf, len);
        /* Test for a negative (error) return before the unsigned
         * comparison: otherwise -1 is converted to a huge STRLEN and the
         * failure goes unnoticed. */
        if (check_val < 0 || (STRLEN)check_val < len)
            Kino_confess("read_bytes: tried to read %"UVuf" bytes, got %d",
                (UV)len, check_val);

        /* reset vars and refill if there's more in the file */
        instream->buf_start = start + len;
        instream->buf_pos   = 0;
        instream->buf_len   = 0;
        if (instream->buf_start < instream->len)
            Kino_InStream_refill(instream);
    }
}

/* Read `len` bytes of character data into `buf`, beginning `start` bytes
 * into it.
 *
 * For now this only forwards to read_bytes, but that may change.  Use it
 * wherever Lucene character data is read -- typically right after a
 * read_vint, as part of reading a String -- so that all such call sites
 * are easy to find if the implementation ever has to diverge.
 */
void
Kino_InStream_read_chars(InStream *instream, char *buf, STRLEN start,
                         STRLEN len) {
    char *destination = buf + start;
    instream->read_bytes(instream, destination, len);
}

/* Read four bytes from the stream and decode them with
 * Kino_decode_bigend_U32. */
U32
Kino_InStream_read_int (InStream *instream) {
    unsigned char raw[4];

    instream->read_bytes(instream, (char*)raw, 4);
    return Kino_decode_bigend_U32(raw);
}

/* Read eight bytes from the stream and return them as a double: the
 * first four bytes, decoded with Kino_decode_bigend_U32 and scaled by
 * 2**32, plus the last four bytes decoded the same way. */
double
Kino_InStream_read_long (InStream *instream) {
    unsigned char raw[8];
    double        total;

    instream->read_bytes(instream, (char*)raw, 8);

    /* high word first, shifted up by 32 bits' worth */
    total  = Kino_decode_bigend_U32(raw);
    total *= pow(2.0, 32.0);

    /* then the low word */
    total += Kino_decode_bigend_U32(raw + 4);

    return total;
}

/* Read in a Variable INTeger, stored in 1-5 bytes.
 *
 * Each byte contributes its low 7 bits, least significant group first; a
 * set high bit means another byte follows.  Shifting is done on a U32 so
 * that values of 2**31 and above do not overflow a signed int.  If the
 * data carries more continuation bytes than fit in 32 bits, they are
 * still consumed but their bits are discarded, not shifted by an
 * out-of-range amount.
 */
U32
Kino_InStream_read_vint (InStream *instream) {
    unsigned char aUChar;
    int           bitshift;
    U32           aU32;

    /* start by reading one byte; use the lower 7 bits */
    aUChar = (unsigned char)instream->read_byte(instream);
    aU32 = aUChar & 0x7f;
    /* keep reading and shifting as long as the high bit is set */
    for (bitshift = 7; (aUChar & 0x80) != 0; bitshift += 7) {
        aUChar = (unsigned char)instream->read_byte(instream);
        if (bitshift < 32)
            aU32 |= (U32)(aUChar & 0x7f) << bitshift;
    }
    return aU32;
}

U32 Kino_InStream_decode_vint(char **source_ptr) { char *source; int bitshift; U32 aU32;


    source = *source_ptr;
    aU32 = (unsigned char)*source & 0x7f;
    for (bitshift = 7; (*source & 0x80) != 0; bitshift += 7) {
        source++;
         aU32 |= ((unsigned char)*source & 0x7f) << bitshift;
    }
    source++;
    *source_ptr = source;
    return aU32;
}

/* Read a variable-length integer and return it as a double.
 *
 * The layout matches read_vint -- 7 payload bits per byte, least
 * significant group first, high bit set while more bytes follow -- but
 * the value is accumulated in floating point, so it is not limited to
 * 32 bits.
 */
double
Kino_InStream_read_vlong (InStream *instream) {
    unsigned char octet;
    int           shift;
    double        total;

    octet = (unsigned char)instream->read_byte(instream);
    total = octet & 0x7f;

    shift = 7;
    while ((octet & 0x80) != 0) {
        octet  = (unsigned char)instream->read_byte(instream);
        total += (octet & 0x7f) * pow(2, shift);
        shift += 7;
    }

    return total;
}

/* Release everything the stream owns: drop its reference to the
 * filehandle scalar, then free the read buffer and the struct itself. */
void
Kino_InStream_destroy(InStream* instream) {
    SvREFCNT_dec(instream->fh_sv);
    Kino_Safefree(instream->buf);
    Kino_Safefree(instream);
}

__POD__

Back to Top

 KinoSearch::Store::InStream - filehandles for reading invindexes