/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/* Copyright (c) 1988 AT&T */
/* All Rights Reserved */
/*
* Copyright 2004 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <ctype.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/byteorder.h>
#if SHARE
#include <sys/ipc.h>
#include <sys/shm.h>
#define ERR -1
#endif
#include "invlib.h"
#include "library.h"
#define DEBUG 0 /* debugging code and realloc messages */
#define BLOCKSIZE 2 * BUFSIZ /* logical block size */
#define LINEMAX 1000 /* sorted posting line max size */
#define POSTINC 10000 /* posting buffer size increment */
#define SEP ' ' /* sorted posting field separator */
#define SETINC 100 /* posting set size increment */
#define STATS 0 /* print statistics */
#define SUPERINC 10000 /* super index size increment */
#define TERMMAX 512 /* term max size */
#define VERSION 1 /* inverted index format version */
#define ZIPFSIZE 200 /* zipf curve size */
#define FREAD "r" /* fopen for reading */
#define FREADP "r+" /* fopen for update */
#define FWRITE "w" /* fopen truncate or create for writing */
#define FWRITEP "w+" /* fopen truncate or create for update */
extern char *argv0; /* command name (must be set in main function) */
int invbreak;
#if STATS
int showzipf; /* show postings per term distribution */
#endif
static POSTING *item, *enditem, *item1 = NULL, *item2 = NULL;
static unsigned setsize1, setsize2;
static long numitems, totterm, zerolong;
static char *indexfile, *postingfile;
static FILE *outfile, *fpost;
static unsigned supersize = SUPERINC, supintsize;
static int numpost, numlogblk, amtused, nextpost,
lastinblk, numinvitems;
static POSTING *POST, *postptr;
static unsigned long *SUPINT, *supint, nextsupfing;
static char *SUPFING, *supfing;
static char thisterm[TERMMAX];
static union {
long invblk[BLOCKSIZE / sizeof (long)];
char chrblk[BLOCKSIZE];
} logicalblk;
#if DEBUG || STATS
static long totpost;
#endif
#if STATS
static int zipf[ZIPFSIZE + 1];
#endif
static void invcannotalloc(size_t n);
static void invcannotopen(char *file);
static void invcannotwrite(char *file);
static int invnewterm(void);
static int boolready(void);
long
invmake(char *invname, char *invpost, FILE *infile)
{
unsigned char *s;
long num;
int i;
long fileindex;
unsigned postsize = POSTINC * sizeof (POSTING);
unsigned long *intptr;
char line[LINEMAX];
long tlong;
PARAM param;
POSTING posting;
#if STATS
int j;
unsigned maxtermlen = 0;
#endif
/* output file */
if ((outfile = vpfopen(invname, FWRITEP)) == NULL) {
invcannotopen(invname);
return (0);
}
indexfile = invname;
(void) fseek(outfile, (long)BUFSIZ, 0);
/* posting file */
if ((fpost = vpfopen(invpost, FWRITE)) == NULL) {
invcannotopen(invpost);
return (0);
}
postingfile = invpost;
nextpost = 0;
/* get space for the postings list */
if ((POST = (POSTING *)malloc(postsize)) == NULL) {
invcannotalloc(postsize);
return (0);
}
postptr = POST;
/* get space for the superfinger (superindex) */
if ((SUPFING = malloc(supersize)) == NULL) {
invcannotalloc(supersize);
return (0);
}
supfing = SUPFING;
supintsize = supersize / 40;
/* also for the superfinger index */
if ((SUPINT = malloc(supintsize * sizeof (long))) == NULL) {
invcannotalloc(supintsize * sizeof (long));
return (0);
}
supint = SUPINT;
supint++; /* leave first term open for a count */
/* initialize using an empty term */
(void) strcpy(thisterm, "");
*supint++ = 0;
*supfing++ = ' ';
*supfing++ = '\0';
nextsupfing = 2;
#if DEBUG || STATS
totpost = 0L;
#endif
totterm = 0L;
numpost = 1;
/*
* set up as though a block had come and gone, i.e., set up for
* new block
*/
amtused = 16; /* leave no space - init 3 words + one for luck */
numinvitems = 0;
numlogblk = 0;
lastinblk = BLOCKSIZE;
/* now loop as long as more to read (till eof) */
while (fgets(line, LINEMAX, infile) != NULL) {
#if DEBUG || STATS
++totpost;
#endif
s = (unsigned char *) strchr(line, SEP);
if (s == NULL) /* where did this line come from ??? */
continue; /* workaround: just skip it */
*s = '\0';
#if STATS
if ((i = strlen(line)) > maxtermlen) {
maxtermlen = i;
}
#endif
#if DEBUG
(void) printf("%ld: %s ", totpost, line);
(void) fflush(stdout);
#endif
if (strcmp(thisterm, line) == 0) {
if (postptr + 10 > POST + postsize / sizeof (POSTING)) {
i = postptr - POST;
postsize += POSTINC * sizeof (POSTING);
if ((POST = realloc(POST, postsize)) == NULL) {
invcannotalloc(postsize);
return (0);
}
postptr = i + POST;
#if DEBUG
(void) printf("reallocated post space to %u, "
"totpost=%ld\n", postsize, totpost);
#endif
}
numpost++;
} else {
/* have a new term */
if (!invnewterm()) {
return (0);
}
(void) strcpy(thisterm, line);
numpost = 1;
postptr = POST;
fileindex = 0;
}
/* get the new posting */
num = *++s - '!';
i = 1;
do {
num = BASE * num + *++s - '!';
} while (++i < PRECISION);
posting.lineoffset = num;
while (++fileindex < nsrcoffset && num > srcoffset[fileindex]) {
;
}
posting.fileindex = --fileindex;
posting.type = *++s;
num = *++s - '!';
if (*s != '\n') {
num = *++s - '!';
while (*++s != '\n') {
num = BASE * num + *s - '!';
}
posting.fcnoffset = num;
} else {
posting.fcnoffset = 0;
}
*postptr++ = posting;
#if DEBUG
(void) printf("%ld %ld %ld %ld\n", posting.fileindex,
posting.fcnoffset, posting.lineoffset, posting.type);
(void) fflush(stdout);
#endif
}
if (!invnewterm()) {
return (0);
}
/* now clean up final block */
logicalblk.invblk[0] = numinvitems;
/* loops pointer around to start */
logicalblk.invblk[1] = 0;
logicalblk.invblk[2] = numlogblk - 1;
if (fwrite((char *)&logicalblk, BLOCKSIZE, 1, outfile) == 0) {
goto cannotwrite;
}
numlogblk++;
/* write out block to save space. what in it doesn't matter */
if (fwrite((char *)&logicalblk, BLOCKSIZE, 1, outfile) == 0) {
goto cannotwrite;
}
/* finish up the super finger */
*SUPINT = numlogblk;
/* add to the offsets the size of the offset pointers */
intptr = (SUPINT + 1);
i = (char *)supint - (char *)SUPINT;
while (intptr < supint)
*intptr++ += i;
/* write out the offsets (1 for the N at start) and the super finger */
if (fwrite((char *)SUPINT, sizeof (*SUPINT), numlogblk + 1,
outfile) == 0 ||
fwrite(SUPFING, 1, supfing - SUPFING, outfile) == 0) {
goto cannotwrite;
}
/* save the size for reference later */
nextsupfing = sizeof (long) + sizeof (long) * numlogblk +
(supfing - SUPFING);
/*
* make sure the file ends at a logical block boundary. This is
* necessary for invinsert to correctly create extended blocks
*/
i = nextsupfing % BLOCKSIZE;
/* write out junk to fill log blk */
if (fwrite(SUPFING, BLOCKSIZE - i, 1, outfile) == 0 ||
fflush(outfile) == EOF) {
/* rewind doesn't check for write failure */
goto cannotwrite;
}
/* write the control area */
rewind(outfile);
param.version = VERSION;
param.filestat = 0;
param.sizeblk = BLOCKSIZE;
param.startbyte = (numlogblk + 1) * BLOCKSIZE + BUFSIZ;
param.supsize = nextsupfing;
param.cntlsize = BUFSIZ;
param.share = 0;
if (fwrite((char *)&param, sizeof (param), 1, outfile) == 0) {
goto cannotwrite;
}
for (i = 0; i < 10; i++) /* for future use */
if (fwrite((char *)&zerolong, sizeof (zerolong),
1, outfile) == 0) {
goto cannotwrite;
}
/* make first block loop backwards to last block */
if (fflush(outfile) == EOF) {
/* fseek doesn't check for write failure */
goto cannotwrite;
}
/* get to second word first block */
(void) fseek(outfile, (long)BUFSIZ + 8, 0);
tlong = numlogblk - 1;
if (fwrite((char *)&tlong, sizeof (tlong), 1, outfile) == 0 ||
fclose(outfile) == EOF) {
cannotwrite:
invcannotwrite(invname);
return (0);
}
if (fclose(fpost) == EOF) {
invcannotwrite(postingfile);
return (0);
}
--totterm; /* don't count null term */
#if STATS
(void) printf("logical blocks = %d, postings = %ld, terms = %ld, "
"max term length = %d\n", numlogblk, totpost, totterm, maxtermlen);
if (showzipf) {
(void) printf(
"\n************* ZIPF curve ****************\n");
for (j = ZIPFSIZE; j > 1; j--)
if (zipf[j])
break;
for (i = 1; i < j; ++i) {
(void) printf("%3d -%6d ", i, zipf[i]);
if (i % 6 == 0) (void) putchar('\n');
}
(void) printf(">%d-%6d\n", ZIPFSIZE, zipf[0]);
}
#endif
/* free all malloc'd memory */
free(POST);
free(SUPFING);
free(SUPINT);
return (totterm);
}
/* add a term to the data base */
static int
invnewterm(void)
{
int backupflag, i, j, maxback, holditems, gooditems, howfar;
int len, numwilluse, wdlen;
char *tptr, *tptr2, *tptr3;
union {
unsigned long packword[2];
ENTRY e;
} iteminfo;
totterm++;
#if STATS
/* keep zipfian info on the distribution */
if (numpost <= ZIPFSIZE)
zipf[numpost]++;
else
zipf[0]++;
#endif
len = strlen(thisterm);
wdlen = (len + (sizeof (long) - 1)) / sizeof (long);
numwilluse = (wdlen + 3) * sizeof (long);
/* new block if at least 1 item in block */
if (numinvitems && numwilluse + amtused > BLOCKSIZE) {
/* set up new block */
if (supfing + 500 > SUPFING + supersize) {
i = supfing - SUPFING;
supersize += 20000;
if ((SUPFING = realloc(SUPFING, supersize)) == NULL) {
invcannotalloc(supersize);
return (0);
}
supfing = i + SUPFING;
#if DEBUG
(void) printf("reallocated superfinger space to %d, "
"totpost=%ld\n", supersize, totpost);
#endif
}
/* check that room for the offset as well */
if ((numlogblk + 10) > supintsize) {
i = supint - SUPINT;
supintsize += SUPERINC;
if ((SUPINT = realloc((char *)SUPINT,
supintsize * sizeof (long))) == NULL) {
invcannotalloc(supintsize * sizeof (long));
return (0);
}
supint = i + SUPINT;
#if DEBUG
(void) printf("reallocated superfinger offset to %d, "
"totpost = %ld\n", supintsize * sizeof (long),
totpost);
#endif
}
/* See if backup is efficatious */
backupflag = 0;
maxback = strlen(thisterm) / 10;
holditems = numinvitems;
if (maxback > numinvitems)
maxback = numinvitems - 2;
howfar = 0;
while (--maxback > 0) {
howfar++;
iteminfo.packword[0] =
logicalblk.invblk[--holditems * 2 +
(sizeof (long) - 1)];
if ((i = iteminfo.e.size / 10) < maxback) {
maxback = i;
backupflag = howfar;
gooditems = holditems;
tptr2 = logicalblk.chrblk + iteminfo.e.offset;
}
}
/* see if backup will occur */
if (backupflag) {
numinvitems = gooditems;
}
logicalblk.invblk[0] = numinvitems;
/* set forward pointer pointing to next */
logicalblk.invblk[1] = numlogblk + 1;
/* set back pointer to last block */
logicalblk.invblk[2] = numlogblk - 1;
if (fwrite((char *)logicalblk.chrblk, 1,
BLOCKSIZE, outfile) == 0) {
invcannotwrite(indexfile);
return (0);
}
amtused = 16;
numlogblk++;
/* check if had to back up, if so do it */
if (backupflag) {
/* find out where the end of the new block is */
iteminfo.packword[0] =
logicalblk.invblk[numinvitems * 2 + 1];
tptr3 = logicalblk.chrblk + iteminfo.e.offset;
/* move the index for this block */
for (i = 3; i <= (backupflag * 2 + 2); i++) {
logicalblk.invblk[i] =
logicalblk.invblk[numinvitems * 2+i];
}
/* move the word into the super index */
iteminfo.packword[0] = logicalblk.invblk[3];
iteminfo.packword[1] = logicalblk.invblk[4];
tptr2 = logicalblk.chrblk + iteminfo.e.offset;
(void) strncpy(supfing, tptr2, (int)iteminfo.e.size);
*(supfing + iteminfo.e.size) = '\0';
#if DEBUG
(void) printf("backup %d at term=%s to term=%s\n",
backupflag, thisterm, supfing);
#endif
*supint++ = nextsupfing;
nextsupfing += strlen(supfing) + 1;
supfing += strlen(supfing) + 1;
/* now fix up the logical block */
tptr = logicalblk.chrblk + lastinblk;
lastinblk = BLOCKSIZE;
tptr2 = logicalblk.chrblk + lastinblk;
j = tptr3 - tptr;
while (tptr3 > tptr)
*--tptr2 = *--tptr3;
lastinblk -= j;
amtused += (8 * backupflag + j);
for (i = 3; i < (backupflag * 2 + 2); i += 2) {
iteminfo.packword[0] = logicalblk.invblk[i];
iteminfo.e.offset += (tptr2 - tptr3);
logicalblk.invblk[i] = iteminfo.packword[0];
}
numinvitems = backupflag;
} else { /* no backup needed */
numinvitems = 0;
lastinblk = BLOCKSIZE;
/* add new term to superindex */
(void) strcpy(supfing, thisterm);
supfing += strlen(thisterm) + 1;
*supint++ = nextsupfing;
nextsupfing += strlen(thisterm) + 1;
}
}
lastinblk -= (numwilluse - 8);
iteminfo.e.offset = lastinblk;
iteminfo.e.size = (char)len;
iteminfo.e.space = 0;
iteminfo.e.post = numpost;
(void) strncpy(logicalblk.chrblk + lastinblk, thisterm, len);
amtused += numwilluse;
logicalblk.invblk[(lastinblk/sizeof (long))+wdlen] = nextpost;
if ((i = postptr - POST) > 0) {
if (fwrite((char *)POST, sizeof (POSTING), i, fpost) == 0) {
invcannotwrite(postingfile);
return (0);
}
nextpost += i * sizeof (POSTING);
}
logicalblk.invblk[3+2*numinvitems++] = iteminfo.packword[0];
logicalblk.invblk[2+2*numinvitems] = iteminfo.packword[1];
return (1);
}
static void
swap_ints(void *p, size_t sz)
{
uint32_t *s;
uint32_t *e = (uint32_t *)p + (sz / sizeof (uint32_t));
for (s = p; s < e; s++)
*s = BSWAP_32(*s);
}
static void
write_param(INVCONTROL *invcntl)
{
if (invcntl->swap)
swap_ints(&invcntl->param, sizeof (invcntl->param));
rewind(invcntl->invfile);
(void) fwrite((char *)&invcntl->param, sizeof (invcntl->param), 1,
invcntl->invfile);
if (invcntl->swap)
swap_ints(&invcntl->param, sizeof (invcntl->param));
}
static void
read_superfinger(INVCONTROL *invcntl)
{
size_t count;
(void) fseek(invcntl->invfile, invcntl->param.startbyte, SEEK_SET);
(void) fread(invcntl->iindex, (int)invcntl->param.supsize,
1, invcntl->invfile);
if (invcntl->swap) {
/*
* The superfinger consists of a count, followed by
* count offsets, followed by a string table (which
* the offsets reference).
*
* We need to swap the count and the offsets.
*/
count = 1 + BSWAP_32(*(uint32_t *)invcntl->iindex);
swap_ints(invcntl->iindex, count * sizeof (unsigned long));
}
}
static void
read_logblock(INVCONTROL *invcntl, int block)
{
/* note always fetch it if the file is busy */
if ((block != invcntl->numblk) ||
(invcntl->param.filestat >= INVBUSY)) {
(void) fseek(invcntl->invfile,
(block * invcntl->param.sizeblk) + invcntl->param.cntlsize,
SEEK_SET);
invcntl->numblk = block;
(void) fread(invcntl->logblk, (int)invcntl->param.sizeblk,
1, invcntl->invfile);
if (invcntl->swap) {
size_t count;
ENTRY *ecur, *eend;
uint32_t *postptr;
/*
* A logblock consists of a count, a next block id,
* and a previous block id, followed by count
* ENTRYs, followed by alternating strings and
* offsets.
*/
swap_ints(invcntl->logblk, 3 * sizeof (unsigned long));
count = *(uint32_t *)invcntl->logblk;
ecur = (ENTRY *)((uint32_t *)invcntl->logblk + 3);
eend = ecur + count;
for (; ecur < eend; ecur++) {
ecur->offset = BSWAP_16(ecur->offset);
ecur->post = BSWAP_32(ecur->post);
/*
* After the string is the posting offset.
*/
postptr = (uint32_t *)(invcntl->logblk +
ecur->offset +
P2ROUNDUP(ecur->size, sizeof (long)));
*postptr = BSWAP_32(*postptr);
}
}
}
}
void
read_next_posting(INVCONTROL *invcntl, POSTING *posting)
{
(void) fread((char *)posting, sizeof (*posting), 1, invcntl->postfile);
if (invcntl->swap) {
posting->lineoffset = BSWAP_32(posting->lineoffset);
posting->fcnoffset = BSWAP_32(posting->fcnoffset);
/*
* fileindex is a 24-bit field, so shift it before swapping
*/
posting->fileindex = BSWAP_32(posting->fileindex << 8);
}
}
int
invopen(INVCONTROL *invcntl, char *invname, char *invpost, int stat)
{
int read_index;
if ((invcntl->invfile = vpfopen(invname,
((stat == 0) ? FREAD : FREADP))) == NULL) {
invcannotopen(invname);
return (-1);
}
if (fread((char *)&invcntl->param, sizeof (invcntl->param), 1,
invcntl->invfile) == 0) {
(void) fprintf(stderr, "%s: empty inverted file\n", argv0);
goto closeinv;
}
if (invcntl->param.version != VERSION &&
BSWAP_32(invcntl->param.version) != VERSION) {
(void) fprintf(stderr,
"%s: cannot read old index format; use -U option to "
"force database to rebuild\n", argv0);
goto closeinv;
}
invcntl->swap = (invcntl->param.version != VERSION);
if (invcntl->swap)
swap_ints(&invcntl->param, sizeof (invcntl->param));
if (stat == 0 && invcntl->param.filestat == INVALONE) {
(void) fprintf(stderr, "%s: inverted file is locked\n", argv0);
goto closeinv;
}
if ((invcntl->postfile = vpfopen(invpost,
((stat == 0) ? FREAD : FREADP))) == NULL) {
invcannotopen(invpost);
goto closeinv;
}
/* allocate core for a logical block */
if ((invcntl->logblk = malloc(invcntl->param.sizeblk)) == NULL) {
invcannotalloc((unsigned)invcntl->param.sizeblk);
goto closeboth;
}
/* allocate for and read in superfinger */
read_index = 1;
invcntl->iindex = NULL;
#if SHARE
if (invcntl->param.share == 1) {
key_t ftok(), shm_key;
struct shmid_ds shm_buf;
char *shmat();
int shm_id;
/* see if the shared segment exists */
shm_key = ftok(invname, 2);
shm_id = shmget(shm_key, 0, 0);
/*
* Failure simply means (hopefully) that segment doesn't
* exist
*/
if (shm_id == -1) {
/*
* Have to give general write permission due to AMdahl
* not having protected segments
*/
shm_id = shmget(shm_key,
invcntl->param.supsize + sizeof (long),
IPC_CREAT | 0666);
if (shm_id == -1)
perror("Could not create shared "
"memory segment");
} else
read_index = 0;
if (shm_id != -1) {
invcntl->iindex = shmat(shm_id, 0,
((read_index) ? 0 : SHM_RDONLY));
if (invcntl->iindex == (char *)ERR) {
(void) fprintf(stderr,
"%s: shared memory link failed\n", argv0);
invcntl->iindex = NULL;
read_index = 1;
}
}
}
#endif
if (invcntl->iindex == NULL)
invcntl->iindex = malloc(invcntl->param.supsize + 16);
if (invcntl->iindex == NULL) {
invcannotalloc((unsigned)invcntl->param.supsize);
free(invcntl->logblk);
goto closeboth;
}
if (read_index) {
read_superfinger(invcntl);
}
invcntl->numblk = -1;
if (boolready() == -1) {
closeboth:
(void) fclose(invcntl->postfile);
closeinv:
(void) fclose(invcntl->invfile);
return (-1);
}
/* write back out the control block if anything changed */
invcntl->param.filestat = stat;
if (stat > invcntl->param.filestat)
write_param(invcntl);
return (1);
}
/* invclose must be called to wrap things up and deallocate core */
void
invclose(INVCONTROL *invcntl)
{
/* write out the control block in case anything changed */
if (invcntl->param.filestat > 0) {
invcntl->param.filestat = 0;
write_param(invcntl);
}
(void) fclose(invcntl->invfile);
(void) fclose(invcntl->postfile);
#if SHARE
if (invcntl->param.share > 0) {
shmdt(invcntl->iindex);
invcntl->iindex = NULL;
}
#endif
if (invcntl->iindex != NULL)
free(invcntl->iindex);
free(invcntl->logblk);
}
/* invstep steps the inverted file forward one item */
void
invstep(INVCONTROL *invcntl)
{
if (invcntl->keypnt < (*(int *)invcntl->logblk - 1)) {
invcntl->keypnt++;
return;
}
/* move forward a block else wrap */
read_logblock(invcntl, *(int *)(invcntl->logblk + sizeof (long)));
invcntl->keypnt = 0;
}
/* invforward moves forward one term in the inverted file */
int
invforward(INVCONTROL *invcntl)
{
invstep(invcntl);
/* skip things with 0 postings */
while (((ENTRY *)(invcntl->logblk + 12) + invcntl->keypnt)->post == 0) {
invstep(invcntl);
}
/* Check for having wrapped - reached start of inverted file! */
if ((invcntl->numblk == 0) && (invcntl->keypnt == 0))
return (0);
return (1);
}
/* invterm gets the present term from the present logical block */
int
invterm(INVCONTROL *invcntl, char *term)
{
ENTRY * entryptr;
entryptr = (ENTRY *)(invcntl->logblk + 12) + invcntl->keypnt;
(void) strncpy(term, entryptr->offset + invcntl->logblk,
(int)entryptr->size);
*(term + entryptr->size) = '\0';
return (entryptr->post);
}
/* invfind searches for an individual item in the inverted file */
long
invfind(INVCONTROL *invcntl, char *searchterm)
{
int imid, ilow, ihigh;
long num;
int i;
unsigned long *intptr, *intptr2;
ENTRY *entryptr;
/* make sure it is initialized via invready */
if (invcntl->invfile == 0)
return (-1L);
/* now search for the appropriate finger block */
intptr = (unsigned long *)invcntl->iindex;
ilow = 0;
ihigh = *intptr++ - 1;
while (ilow <= ihigh) {
imid = (ilow + ihigh) / 2;
intptr2 = intptr + imid;
i = strcmp(searchterm, (invcntl->iindex + *intptr2));
if (i < 0)
ihigh = imid - 1;
else if (i > 0)
ilow = ++imid;
else {
ilow = imid + 1;
break;
}
}
/* be careful about case where searchterm is after last in this block */
imid = (ilow) ? ilow - 1 : 0;
/* fetch the appropriate logical block if not in core */
read_logblock(invcntl, imid);
srch_ext:
/* now find the term in this block. tricky this */
intptr = (unsigned long *)invcntl->logblk;
ilow = 0;
ihigh = *intptr - 1;
intptr += 3;
num = 0;
while (ilow <= ihigh) {
imid = (ilow + ihigh) / 2;
entryptr = (ENTRY *)intptr + imid;
i = strncmp(searchterm, (invcntl->logblk + entryptr->offset),
(int)entryptr->size);
if (i == 0)
i = strlen(searchterm) - entryptr->size;
if (i < 0)
ihigh = imid - 1;
else if (i > 0)
ilow = ++imid;
else {
num = entryptr->post;
break;
}
}
/* be careful about case where searchterm is after last in this block */
if (imid >= *(long *)invcntl->logblk) {
invcntl->keypnt = *(long *)invcntl->logblk;
invstep(invcntl);
/* note if this happens the term could be in extended block */
if (invcntl->param.startbyte <
invcntl->numblk * invcntl->param.sizeblk)
goto srch_ext;
} else
invcntl->keypnt = imid;
return (num);
}
#if DEBUG
/* invdump dumps the block the term parameter is in */
void
invdump(INVCONTROL *invcntl, char *term)
{
long i, j, n, *longptr;
ENTRY * entryptr;
char temp[512], *ptr;
/* dump superindex if term is "-" */
if (*term == '-') {
j = atoi(term + 1);
longptr = (long *)invcntl->iindex;
n = *longptr++;
(void) printf("Superindex dump, num blocks=%ld\n", n);
longptr += j;
while ((longptr <= ((long *)invcntl->iindex) + n) &&
invbreak == 0) {
(void) printf("%2ld %6ld %s\n", j++, *longptr,
invcntl->iindex + *longptr);
longptr++;
}
return;
} else if (*term == '#') {
j = atoi(term + 1);
/* fetch the appropriate logical block */
read_logblock(invcntl, j);
} else
i = abs((int)invfind(invcntl, term));
longptr = (long *)invcntl->logblk;
n = *longptr++;
(void) printf("Entry term to invdump=%s, postings=%ld, "
"forward ptr=%ld, back ptr=%ld\n", term, i, *(longptr),
*(longptr + 1));
entryptr = (ENTRY *)(invcntl->logblk + 12);
(void) printf("%ld terms in this block, block=%ld\n", n,
invcntl->numblk);
(void) printf("\tterm\t\t\tposts\tsize\toffset\tspace\t1st word\n");
for (j = 0; j < n && invbreak == 0; j++) {
ptr = invcntl->logblk + entryptr->offset;
(void) strncpy(temp, ptr, (int)entryptr->size);
temp[entryptr->size] = '\0';
ptr += (sizeof (long) *
(long)((entryptr->size +
(sizeof (long) - 1)) / sizeof (long)));
(void) printf("%2ld %-24s\t%5ld\t%3d\t%d\t%d\t%ld\n", j, temp,
entryptr->post, entryptr->size, entryptr->offset,
entryptr->space, *(long *)ptr);
entryptr++;
}
}
#endif
static int
boolready(void)
{
numitems = 0;
if (item1 != NULL)
free(item1);
setsize1 = SETINC;
if ((item1 = (POSTING *)malloc(SETINC * sizeof (POSTING))) == NULL) {
invcannotalloc(SETINC);
return (-1);
}
if (item2 != NULL)
free(item2);
setsize2 = SETINC;
if ((item2 = (POSTING *)malloc(SETINC * sizeof (POSTING))) == NULL) {
invcannotalloc(SETINC);
return (-1);
}
item = item1;
enditem = item;
return (0);
}
void
boolclear(void)
{
numitems = 0;
item = item1;
enditem = item;
}
POSTING *
boolfile(INVCONTROL *invcntl, long *num, int bool)
{
ENTRY *entryptr;
FILE *file;
char *ptr;
unsigned long *ptr2;
POSTING *newitem;
POSTING posting;
unsigned u;
POSTING *newsetp, *set1p;
long newsetc, set1c, set2c;
entryptr = (ENTRY *) (invcntl->logblk + 12) + invcntl->keypnt;
ptr = invcntl->logblk + entryptr->offset;
ptr2 = ((unsigned long *)ptr) +
(entryptr->size + (sizeof (long) - 1)) / sizeof (long);
*num = entryptr->post;
switch (bool) {
case OR:
case NOT:
if (*num == 0) {
*num = numitems;
return (item);
}
}
/* make room for the new set */
u = 0;
switch (bool) {
case AND:
case NOT:
newsetp = set1p = item;
break;
case OR:
u = enditem - item;
/* FALLTHROUGH */
case REVERSENOT:
u += *num;
if (item == item2) {
if (u > setsize1) {
u += SETINC;
if ((item1 = (POSTING *) realloc(item1,
u * sizeof (POSTING))) == NULL) {
goto cannotalloc;
}
setsize1 = u;
}
newitem = item1;
} else {
if (u > setsize2) {
u += SETINC;
if ((item2 = (POSTING *)realloc(item2,
u * sizeof (POSTING))) == NULL) {
cannotalloc:
invcannotalloc(u * sizeof (POSTING));
(void) boolready();
*num = -1;
return (NULL);
}
setsize2 = u;
}
newitem = item2;
}
set1p = item;
newsetp = newitem;
}
file = invcntl->postfile;
(void) fseek(file, (long)*ptr2, SEEK_SET);
read_next_posting(invcntl, &posting);
newsetc = 0;
switch (bool) {
case OR:
/* while something in both sets */
set1p = item;
newsetp = newitem;
for (set1c = 0, set2c = 0;
set1c < numitems && set2c < *num; newsetc++) {
if (set1p->lineoffset < posting.lineoffset) {
*newsetp++ = *set1p++;
set1c++;
} else if (set1p->lineoffset > posting.lineoffset) {
*newsetp++ = posting;
read_next_posting(invcntl, &posting);
set2c++;
} else if (set1p->type < posting.type) {
*newsetp++ = *set1p++;
set1c++;
} else if (set1p->type > posting.type) {
*newsetp++ = posting;
read_next_posting(invcntl, &posting);
set2c++;
} else { /* identical postings */
*newsetp++ = *set1p++;
set1c++;
read_next_posting(invcntl, &posting);
set2c++;
}
}
/* find out what ran out and move the rest in */
if (set1c < numitems) {
newsetc += numitems - set1c;
while (set1c++ < numitems) {
*newsetp++ = *set1p++;
}
} else {
while (set2c++ < *num) {
*newsetp++ = posting;
newsetc++;
read_next_posting(invcntl, &posting);
}
}
item = newitem;
break; /* end of OR */
#if 0
case AND:
set1c = 0;
set2c = 0;
while (set1c < numitems && set2c < *num) {
if (set1p->lineoffset < posting.lineoffset) {
set1p++;
set1c++;
} else if (set1p->lineoffset > posting.lineoffset) {
read_next_posting(invcntl, &posting);
set2c++;
} else if (set1p->type < posting.type) {
*set1p++;
set1c++;
} else if (set1p->type > posting.type) {
read_next_posting(invcntl, &posting);
set2c++;
} else { /* identical postings */
*newsetp++ = *set1p++;
newsetc++;
set1c++;
read_next_posting(invcntl, &posting);
set2c++;
}
}
break; /* end of AND */
case NOT:
set1c = 0;
set2c = 0;
while (set1c < numitems && set2c < *num) {
if (set1p->lineoffset < posting.lineoffset) {
*newsetp++ = *set1p++;
newsetc++;
set1c++;
} else if (set1p->lineoffset > posting.lineoffset) {
read_next_posting(invcntl, &posting);
set2c++;
} else if (set1p->type < posting.type) {
*newsetp++ = *set1p++;
newsetc++;
set1c++;
} else if (set1p->type > posting.type) {
read_next_posting(invcntl, &posting);
set2c++;
} else { /* identical postings */
set1c++;
set1p++;
read_next_posting(invcntl, &posting);
set2c++;
}
}
newsetc += numitems - set1c;
while (set1c++ < numitems) {
*newsetp++ = *set1p++;
}
break; /* end of NOT */
case REVERSENOT: /* core NOT incoming set */
set1c = 0;
set2c = 0;
while (set1c < numitems && set2c < *num) {
if (set1p->lineoffset < posting.lineoffset) {
set1p++;
set1c++;
} else if (set1p->lineoffset > posting.lineoffset) {
*newsetp++ = posting;
read_next_posting(invcntl, &posting);
set2c++;
} else if (set1p->type < posting.type) {
set1p++;
set1c++;
} else if (set1p->type > posting.type) {
*newsetp++ = posting;
read_next_posting(invcntl, &posting);
set2c++;
} else { /* identical postings */
set1c++;
set1p++;
read_next_posting(invcntl, &posting);
set2c++;
}
}
while (set2c++ < *num) {
*newsetp++ = posting;
newsetc++;
read_next_posting(invcntl, &posting);
}
item = newitem;
break; /* end of REVERSENOT */
#endif
}
numitems = newsetc;
*num = newsetc;
enditem = (POSTING *)newsetp;
return ((POSTING *)item);
}
#if 0
POSTING *
boolsave(int clear)
{
int i;
POSTING *ptr;
POSTING *oldstuff, *newstuff;
if (numitems == 0) {
if (clear)
boolclear();
return (NULL);
}
/*
* if clear then give them what we have and use (void)
* boolready to realloc
*/
if (clear) {
ptr = item;
/* free up the space we didn't give them */
if (item == item1)
item1 = NULL;
else
item2 = NULL;
(void) boolready();
return (ptr);
}
i = (enditem - item) * sizeof (POSTING) + 100;
if ((ptr = (POSTING *)malloc(i))r == NULL) {
invcannotalloc(i);
return (ptr);
}
/* move present set into place */
oldstuff = item;
newstuff = ptr;
while (oldstuff < enditem)
*newstuff++ = *oldstuff++;
return (ptr);
}
#endif
static void
invcannotalloc(size_t n)
{
(void) fprintf(stderr, "%s: cannot allocate %u bytes\n", argv0, n);
}
static void
invcannotopen(char *file)
{
(void) fprintf(stderr, "%s: cannot open file %s\n", argv0, file);
}
static void
invcannotwrite(char *file)
{
(void) perror(argv0); /* must be first to preserve errno */
(void) fprintf(stderr, "%s: write to file %s failed\n", argv0, file);
}