/*
**
** This program and library is free software; you can redistribute it and/or
** modify it under the terms of the GNU (Library) General Public License
** as published by the Free Software Foundation; either version 2
** of the License, or any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU (Library) General Public License for more details.
**
** You should have received a copy of the GNU (Library) General Public License
** along with this program; if not, write to the Free Software
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
**
**
**
** 2001-05-07 jmruiz init coding
**
*/

#include "swish.h"
#include "mem.h"
#include "string.h"
#include "index.h"
#include "hash.h"
#include "date_time.h"
#include "compress.h"
#include "error.h"
#include "metanames.h"
#include "db.h"
#include "db_native.h"
// #include "db_berkeley_db.h"

#ifndef min
/* Fully parenthesized so the macro is safe inside larger expressions.
** Note: each argument may still be evaluated twice. */
#define min(a, b) ((a) < (b) ? (a) : (b))
#endif


/*
  -- init structures for this module
  -- Only the native backend is wired in; the Berkeley DB call is disabled.
*/
void initModule_DB (SWISH *sw)
{
    /* Allocate structure */
    initModule_DBNative(sw);
    // initModule_DB_db(sw);
    return;
}


/*
  -- release all wired memory for this module
*/
void freeModule_DB (SWISH *sw)
{
    freeModule_DBNative(sw);
    // freeModule_DB_db(sw);
    return;
}


/* ---------------------------------------------- */

/*
  -- Config Directives
  -- Configuration directives for this Module
  -- return: 0/1 = none/config applied
  --
  -- This module currently recognizes no directives, so the routine
  -- always reports "none applied" (0).
*/
int configModule_DB (SWISH *sw, StringList *sl)
{
    //struct MOD_DB *DB = sw->Db;
    // char *w0 = sl->word[0];

    (void) sw;   /* unused until a directive is implemented */
    (void) sl;

    return 0;    /* tmp due to empty routine */
}


/* General write DB routines - Common to all DB */

/* Header routines */

/*
** write_header_int:  pass one number through PACKLONG and write it as a
**                    header record of sizeof(long) bytes.
** write_header_int2: same for two numbers stored back to back.
**
** Both are wrapped in do { } while (0) so they behave as a single
** statement (safe after an unbraced if/else).
*/
#define write_header_int(sw,id,num,DB) \
    do { \
        unsigned long itmp = (num); \
        itmp = PACKLONG(itmp); \
        DB_WriteHeaderData((sw), (id), (unsigned char *)&itmp, sizeof(long), (DB)); \
    } while (0)

#define write_header_int2(sw,id,num1,num2,DB) \
    do { \
        unsigned long itmp[2]; \
        itmp[0] = (num1); \
        itmp[1] = (num2); \
        itmp[0] = PACKLONG(itmp[0]); \
        itmp[1] = PACKLONG(itmp[1]); \
        DB_WriteHeaderData((sw), (id), (unsigned char *)itmp, sizeof(long) * 2, (DB)); \
    } while (0)


/*
** Writes the complete index header to DB.
**
**   header     - in-memory header whose settings are serialized
**   filename   - path the index is saved as; only the part after the last
**                '/' is stored (the whole string when there is no '/' or
**                when the '/' is the final character)
**   totalwords - stored together with totalfiles in the counts record
**   totalfiles - also the element count of the TotalWordsPerFile table
**   merged     - stored as-is in the MERGED_ID record
**
** Strings are written including their trailing '\0'.
*/
void write_header(SWISH *sw, INDEXDATAHEADER * header, void * DB, char *filename, int totalwords, int totalfiles, int merged)
{
    char   *c,
           *tmp;

    /* Strip the directory part of the file name, if any */
    c = (char *) strrchr(filename, '/');
    if (!c || !*(c + 1))
        c = filename;
    else
        c += 1;

    DB_InitWriteHeader(sw, DB);

    DB_WriteHeaderData(sw, INDEXHEADER_ID, (unsigned char *)INDEXHEADER, strlen(INDEXHEADER) + 1, DB);
    DB_WriteHeaderData(sw, INDEXVERSION_ID, (unsigned char *)INDEXVERSION, strlen(INDEXVERSION) + 1, DB);
    write_header_int(sw, MERGED_ID, merged, DB);
    DB_WriteHeaderData(sw, NAMEHEADER_ID, (unsigned char *)header->indexn, strlen(header->indexn) + 1, DB);
    DB_WriteHeaderData(sw, SAVEDASHEADER_ID, (unsigned char *)c, strlen(c) + 1, DB);
    write_header_int2(sw, COUNTSHEADER_ID, totalwords, totalfiles, DB);

    /* The date string is released with efree() once written */
    tmp = getTheDateISO();
    DB_WriteHeaderData(sw, INDEXEDONHEADER_ID, (unsigned char *)tmp, strlen(tmp) + 1, DB);
    efree(tmp);

    DB_WriteHeaderData(sw, DESCRIPTIONHEADER_ID, (unsigned char *)header->indexd, strlen(header->indexd) + 1, DB);
    DB_WriteHeaderData(sw, POINTERHEADER_ID, (unsigned char *)header->indexp, strlen(header->indexp) + 1, DB);
    DB_WriteHeaderData(sw, MAINTAINEDBYHEADER_ID, (unsigned char *)header->indexa, strlen(header->indexa) + 1, DB);
    write_header_int(sw, DOCPROPENHEADER_ID, 1, DB);

    write_header_int(sw, FUZZYMODEHEADER_ID, header->fuzzy_mode, DB);
    write_header_int(sw, IGNORETOTALWORDCOUNTWHENRANKING_ID, header->ignoreTotalWordCountWhenRanking, DB);
    DB_WriteHeaderData(sw, WORDCHARSHEADER_ID, (unsigned char *)header->wordchars, strlen(header->wordchars) + 1, DB);
    write_header_int(sw, MINWORDLIMHEADER_ID, header->minwordlimit, DB);
    write_header_int(sw, MAXWORDLIMHEADER_ID, header->maxwordlimit, DB);
    DB_WriteHeaderData(sw, BEGINCHARSHEADER_ID, (unsigned char *)header->beginchars, strlen(header->beginchars) + 1, DB);
    DB_WriteHeaderData(sw, ENDCHARSHEADER_ID, (unsigned char *)header->endchars, strlen(header->endchars) + 1, DB);
    DB_WriteHeaderData(sw, IGNOREFIRSTCHARHEADER_ID, (unsigned char *)header->ignorefirstchar, strlen(header->ignorefirstchar) + 1, DB);
    DB_WriteHeaderData(sw, IGNORELASTCHARHEADER_ID, (unsigned char *)header->ignorelastchar, strlen(header->ignorelastchar) + 1, DB);

    /* Removed - Patents
    write_header_int(FILEINFOCOMPRESSION_ID, header->applyFileInfoCompression, DB);
    */

    /* Jose Ruiz 06/00 Added this line to delimit the header */
    write_integer_table_to_header(sw, TRANSLATECHARTABLE_ID, header->translatecharslookuptable, sizeof(header->translatecharslookuptable) / sizeof(int), DB);

    /* Other header stuff */

    /* StopWords */
    write_words_to_header(sw, STOPWORDS_ID, header->hashstoplist, DB);

    /* Metanames */
    write_MetaNames(sw, METANAMES_ID, header, DB);

    /* BuzzWords */
    write_words_to_header(sw, BUZZWORDS_ID, header->hashbuzzwordlist, DB);

#ifndef USE_BTREE
    /* Write the total words per file array, if used */
    if ( !header->ignoreTotalWordCountWhenRanking )
        write_integer_table_to_header(sw, TOTALWORDSPERFILE_ID, header->TotalWordsPerFile, totalfiles, DB);
#endif

    DB_EndWriteHeader(sw, DB);
}


/* Jose Ruiz 11/00
** Function to write a word to the index DB.
** Obtains a word ID from the backend, writes the word under that ID and
** remembers the ID in the entry.
*/
void write_word(SWISH * sw, ENTRY * ep, IndexFILE * indexf)
{
    long    wordID;

    wordID = DB_GetWordID(sw, indexf->DB);

    DB_WriteWord(sw, ep->word, wordID, indexf->DB);

    /* Store word offset for further hash computing */
    ep->u1.wordID = wordID;
}

#ifdef USE_BTREE
/* 04/2002 jmruiz
** Routine to update wordID: same as write_word() but goes through
** DB_UpdateWordID().
*/
void update_wordID(SWISH * sw, ENTRY * ep, IndexFILE * indexf)
{
    long    wordID;

    wordID = DB_GetWordID(sw, indexf->DB);

    DB_UpdateWordID(sw, ep->word, wordID, indexf->DB);

    /* Store word offset for further hash computing */
    ep->u1.wordID = wordID;
}

/* Forwards the removal of the data stored under wordID to the backend */
void delete_worddata(SWISH * sw, long wordID, IndexFILE * indexf)
{
    DB_DeleteWordData(sw, wordID, indexf->DB);
}

#endif


/* Jose Ruiz 11/00
** Function to build all of a word's data in sw->Index->worddata_buffer.
**
** Buffer layout produced here (before adjustWordPositions() is applied):
**
**   <tfrequency (compress3)>
**   for every distinct metaID, in the order found in the location list:
**       <metaID (compress3)>
**       <offset of the next metaID, packed long>
**       for every location chunk: <chunk bytes> <'\0'>
**
** The last metaID's offset is the offset of the end of the data.
** The buffer is grown with erealloc() as needed.  The final size is
** stored in sw->Index->sz_worddata_buffer.  Nothing is written to the DB
** here - see write_worddata().
*/
void build_worddata(SWISH * sw, ENTRY * ep, IndexFILE * indexf)
{
    int     i, j,
            curmetaID,
            sz_worddata;
    unsigned long tmp,
            curmetanamepos;
    int     metaID;
    int     bytes_size,
            chunk_size;
    unsigned char *compressed_data,
           *p, *q;
    LOCATION *l, *next;

    curmetaID = 0;
    curmetanamepos = 0L;
    q = sw->Index->worddata_buffer;

    /* Compute bytes required for chunk location size. Eg: 4096 -> 2 bytes, 65535 -> 2 bytes */
    for (bytes_size = 0, i = COALESCE_BUFFER_MAX_SIZE; i; i >>= 8)
        bytes_size++;

    /* Write tfrequency */
    q = compress3(ep->tfrequency, q);

    /* Write location list */
    for (l = ep->allLocationList; l; )
    {
        compressed_data = (unsigned char *) l;

        /* Get next element: each location starts with the link pointer */
        next = *(LOCATION **)compressed_data;

        /* Jump pointer to next element */
        p = compressed_data + sizeof(LOCATION *);

        metaID = uncompress2(&p);

        /* Chunk size is stored in bytes_size bytes, most significant first */
        for (chunk_size = 0, i = 0, j = bytes_size - 1; i < bytes_size; i++, j--)
            chunk_size |= p[i] << (j * 8);
        p += bytes_size;

        if (curmetaID != metaID)
        {
            if (curmetaID)
            {
                /* Write in previous meta (curmetaID)
                ** file offset to next meta */
                tmp = q - sw->Index->worddata_buffer;
                PACKLONG2(tmp, sw->Index->worddata_buffer + curmetanamepos);
            }

            /* Check for enough memory */
            /*
            ** MAXINTCOMPSIZE is for the worst case metaID
            **
            ** sizeof(long) is to leave room to
            ** store the offset of the next metaname
            ** (it will be 0 if no more metanames).
            **
            ** 1 is for the trailing '\0'
            */
            tmp = q - sw->Index->worddata_buffer;
            if ((long)(tmp + MAXINTCOMPSIZE + sizeof(long) + 1) >= (long)sw->Index->len_worddata_buffer)
            {
                sw->Index->len_worddata_buffer = sw->Index->len_worddata_buffer * 2 + MAXINTCOMPSIZE + sizeof(long) + 1;
                sw->Index->worddata_buffer = (unsigned char *) erealloc(sw->Index->worddata_buffer, sw->Index->len_worddata_buffer);
                q = sw->Index->worddata_buffer + tmp;   /* reassign pointer inside buffer */
            }

            /* store metaID in buffer */
            curmetaID = metaID;
            q = compress3(curmetaID, q);

            /* preserve position for offset to next
            ** metaname. We do not know its size
            ** so store it as a packed long */
            curmetanamepos = q - sw->Index->worddata_buffer;

            /* Store 0 and increase pointer */
            tmp = 0L;
            PACKLONG2(tmp, q);

            q += sizeof(unsigned long);
        }

        /* Store all data for this chunk */
        /* First check for enough space
        **
        ** 1 is for the trailing '\0'
        */
        tmp = q - sw->Index->worddata_buffer;
        if ((long)(tmp + chunk_size + 1) >= (long)sw->Index->len_worddata_buffer)
        {
            sw->Index->len_worddata_buffer = sw->Index->len_worddata_buffer * 2 + chunk_size + 1;
            sw->Index->worddata_buffer = (unsigned char *) erealloc(sw->Index->worddata_buffer, sw->Index->len_worddata_buffer);
            q = sw->Index->worddata_buffer + tmp;   /* reassign pointer inside buffer */
        }

        /* Copy it and advance pointer */
        memcpy(q, p, chunk_size);
        q += chunk_size;

        /* End of chunk mark -> Write trailing '\0' */
        *q++ = '\0';

        l = next;
    }

    /* Write in previous meta (curmetaID)
    ** file offset to end of metas */
    tmp = q - sw->Index->worddata_buffer;
    PACKLONG2(tmp, sw->Index->worddata_buffer + curmetanamepos);

    sz_worddata = q - sw->Index->worddata_buffer;

    /* Adjust word positions.
    ** if ignorelimit was set and some new stopwords were found, positions
    ** are recalculated
    ** Also call it even if we have not set IgnoreLimit to coalesce word chunks
    ** and remove trailing 0 from chunks to save some bytes
    */
    adjustWordPositions(sw->Index->worddata_buffer, &sz_worddata, sw->indexlist->header.totalfiles, sw->Index->IgnoreLimitPositionsArray);

    sw->Index->sz_worddata_buffer = sz_worddata;
}


/* 04/2002 jmruiz
** New simpler routine to write worddata:
** writes the buffer prepared by build_worddata()/add_worddata() under
** the word ID stored in the entry.
*/
void write_worddata(SWISH * sw, ENTRY * ep, IndexFILE * indexf )
{
    DB_WriteWordData(sw, ep->u1.wordID, sw->Index->worddata_buffer, sw->Index->sz_worddata_buffer, indexf->DB);
}


/* 04/2002 jmruiz
** Function to read all of a word's data from the index DB.
**
** Returns the word ID, or 0 when the word is not in the index.  In that
** case sw->lasterror is set to WORD_NOT_FOUND, *buffer to NULL and
** *sz_buffer to 0.  On success *buffer / *sz_buffer are filled in by
** DB_ReadWordData().
*/
long read_worddata(SWISH * sw, ENTRY * ep, IndexFILE * indexf, unsigned char **buffer, int *sz_buffer)
{
    long    wordID;
    char   *word = ep->word;

    DB_InitReadWords(sw, indexf->DB);
    DB_ReadWordHash(sw, word, &wordID, indexf->DB);

    if (!wordID)
    {
        DB_EndReadWords(sw, indexf->DB);
        sw->lasterror = WORD_NOT_FOUND;
        *buffer = NULL;
        *sz_buffer = 0;
        return 0L;
    }

    DB_ReadWordData(sw, wordID, buffer, sz_buffer, indexf->DB);
    DB_EndReadWords(sw, indexf->DB);

    return wordID;
}


/* Location frequencies up to this value use a stack array in add_worddata() */
#define POSDATA_STACK 2000

/* 04/2002 jmruiz
** Routine to merge two buffers of worddata.
**
**   olddata / sz_olddata - worddata previously stored for the word
**   sw->Index->worddata_buffer / sz_worddata_buffer
**                        - on entry: the new worddata
**                          on exit:  the merged worddata and its size
**
** Both inputs use the layout produced by build_worddata():
** <tfrequency> then, per metaID, <metaID><offset of next metaID><data>.
** metaIDs are merged in ascending order.  When both buffers hold the same
** metaID the old data is copied first and the new data is appended; the
** first filenum of the new data is rewritten relative to the last filenum
** of the old data.
**
** epi and indexf are not used.
*/
void add_worddata(SWISH *sw, ENTRY *epi, IndexFILE *indexf, unsigned char *olddata, int sz_olddata)
{
    int     maxtotsize;
    unsigned char stack_buffer[32000];   /* Just to avoid malloc/free fragmentation */
    unsigned char *newdata;
    int     sz_newdata;
    int     tfreq1, tfreq2;
    unsigned char *p1, *p2, *p;
    int     curmetaID_1, curmetaID_2;
    unsigned long nextposmetaname_1, nextposmetaname_2,
            curmetanamepos, curmetanamepos_1, tmp;
    int     last_filenum, filenum, tmpval, frequency, *posdata;
    int     stack_posdata[POSDATA_STACK];   /* Just to avoid the overhead of malloc/free */
    unsigned char r_flag, *w_flag;

    (void) epi;
    (void) indexf;

    /* First of all, check for size in buffer */
    maxtotsize = sw->Index->sz_worddata_buffer + sz_olddata;
    if (maxtotsize > sw->Index->len_worddata_buffer)
    {
        sw->Index->len_worddata_buffer = maxtotsize + 2000;
        sw->Index->worddata_buffer = (unsigned char *) erealloc(sw->Index->worddata_buffer, sw->Index->len_worddata_buffer);
    }

    /* Preserve new data in a local copy - sw->Index->worddata_buffer is the final destination
    ** of data
    */
    if (sw->Index->sz_worddata_buffer > sizeof(stack_buffer))
        newdata = (unsigned char *) emalloc(sw->Index->sz_worddata_buffer);
    else
        newdata = stack_buffer;
    sz_newdata = sw->Index->sz_worddata_buffer;
    memcpy(newdata, sw->Index->worddata_buffer, sz_newdata);

    /* Set pointers to all buffers */
    p1 = olddata;
    p2 = newdata;
    p = sw->Index->worddata_buffer;

    /* Now read tfrequency */
    tfreq1 = uncompress2(&p1);   /* tfrequency - number of files with this word */
    tfreq2 = uncompress2(&p2);   /* tfrequency - number of files with this word */

    /* Write tfrequency */
    p = compress3(tfreq1 + tfreq2, p);

    /* Now look for MetaIDs */
    curmetaID_1 = uncompress2(&p1);
    curmetaID_2 = uncompress2(&p2);
    nextposmetaname_1 = UNPACKLONG2(p1);
    p1 += sizeof(long);
    curmetanamepos_1 = p1 - olddata;
    nextposmetaname_2 = UNPACKLONG2(p2);
    p2 += sizeof(long);

    while (curmetaID_1 && curmetaID_2)
    {
        p = compress3(min(curmetaID_1, curmetaID_2), p);

        /* Reserve the "offset of next metaID" slot; patched at loop end */
        curmetanamepos = p - sw->Index->worddata_buffer;
        /* Store 0 and increase pointer */
        tmp = 0L;
        PACKLONG2(tmp, p);
        p += sizeof(unsigned long);

        if (curmetaID_1 == curmetaID_2)
        {
            /* Both buffers have the same metaID - In this case I have to know
            ** the number of the filenum of the last hit of the original buffer to adjust the
            ** filenum counter in the second buffer
            */
            last_filenum = 0;
            do
            {
                /* Read on all items */
                uncompress_location_values(&p1, &r_flag, &tmpval, &frequency);
                last_filenum += tmpval;

                if (frequency > POSDATA_STACK)
                    posdata = (int *) emalloc(frequency * sizeof(int));
                else
                    posdata = stack_posdata;

                /* Read and discard positions just to advance pointer */
                uncompress_location_positions(&p1, r_flag, frequency, posdata);
                if (posdata != stack_posdata)
                    efree(posdata);

                if ((p1 - olddata) == sz_olddata)
                {
                    curmetaID_1 = 0;   /* No more metaIDs for olddata */
                    break;             /* End of olddata */
                }

                if ((unsigned long)(p1 - olddata) == nextposmetaname_1)
                {
                    break;
                }
            } while (1);

            /* Copy the old data for this metaID unchanged */
            memcpy(p, olddata + curmetanamepos_1, p1 - (olddata + curmetanamepos_1));
            p += p1 - (olddata + curmetanamepos_1);

            /* Values for next metaID if exists */
            if (curmetaID_1)
            {
                curmetaID_1 = uncompress2(&p1);   /* Next metaID */
                nextposmetaname_1 = UNPACKLONG2(p1);
                p1 += sizeof(long);
                curmetanamepos_1 = p1 - olddata;
            }

            /* Now add the new values adjusting with last_filenum just the first
            ** filenum in newdata */
            /* Read first item */
            uncompress_location_values(&p2, &r_flag, &tmpval, &frequency);
            filenum = tmpval;   /* First filenum in chunk */
            if (frequency > POSDATA_STACK)
                posdata = (int *) emalloc(frequency * sizeof(int));
            else
                posdata = stack_posdata;

            /* Read positions */
            uncompress_location_positions(&p2, r_flag, frequency, posdata);

            compress_location_values(&p, &w_flag, filenum - last_filenum, frequency, posdata);
            compress_location_positions(&p, w_flag, frequency, posdata);

            if (posdata != stack_posdata)
                efree(posdata);

            /* Copy rest of data */
            memcpy(p, p2, nextposmetaname_2 - (p2 - newdata));
            p += nextposmetaname_2 - (p2 - newdata);
            p2 += nextposmetaname_2 - (p2 - newdata);

            if ((p2 - newdata) == sz_newdata)
            {
                curmetaID_2 = 0;   /* No more metaIDs for newdata */
            }

            /* Values for next metaID if exists */
            if (curmetaID_2)
            {
                curmetaID_2 = uncompress2(&p2);   /* Next metaID */
                nextposmetaname_2 = UNPACKLONG2(p2);
                p2 += sizeof(long);
            }
        }
        else if (curmetaID_1 < curmetaID_2)
        {
            /* metaID only in olddata so far: copy it verbatim */
            memcpy(p, p1, nextposmetaname_1 - (p1 - olddata));
            p += nextposmetaname_1 - (p1 - olddata);
            p1 = olddata + nextposmetaname_1;

            if ((p1 - olddata) == sz_olddata)
            {
                curmetaID_1 = 0;   /* No more metaIDs for olddata */
            }
            else
            {
                curmetaID_1 = uncompress2(&p1);   /* Next metaID */
                nextposmetaname_1 = UNPACKLONG2(p1);
                p1 += sizeof(long);
                curmetanamepos_1 = p1 - olddata;
            }
        }
        else   /* curmetaID_1 > curmetaID_2 */
        {
            /* metaID only in newdata so far: copy it verbatim */
            memcpy(p, p2, nextposmetaname_2 - (p2 - newdata));
            p += nextposmetaname_2 - (p2 - newdata);
            p2 = newdata + nextposmetaname_2;

            if ((p2 - newdata) == sz_newdata)
            {
                curmetaID_2 = 0;   /* No more metaIDs for newdata */
            }
            else
            {
                curmetaID_2 = uncompress2(&p2);   /* Next metaID */
                nextposmetaname_2 = UNPACKLONG2(p2);
                p2 += sizeof(long);
            }
        }

        /* Put nextmetaname offset */
        PACKLONG2(p - sw->Index->worddata_buffer, sw->Index->worddata_buffer + curmetanamepos);
    }   /* while */

    /* Add the rest of the data if exists */
    while (curmetaID_1)
    {
        p = compress3(curmetaID_1, p);

        curmetanamepos = p - sw->Index->worddata_buffer;
        /* Store 0 and increase pointer */
        tmp = 0L;
        PACKLONG2(tmp, p);
        p += sizeof(unsigned long);

        memcpy(p, p1, nextposmetaname_1 - (p1 - olddata));
        p += nextposmetaname_1 - (p1 - olddata);
        p1 = olddata + nextposmetaname_1;

        if ((p1 - olddata) == sz_olddata)
        {
            curmetaID_1 = 0;   /* No more metaIDs for olddata */
        }
        else
        {
            curmetaID_1 = uncompress2(&p1);   /* Next metaID */
            nextposmetaname_1 = UNPACKLONG2(p1);
            p1 += sizeof(long);
            curmetanamepos_1 = p1 - olddata;
        }

        /* Put nextmetaname offset */
        PACKLONG2(p - sw->Index->worddata_buffer, sw->Index->worddata_buffer + curmetanamepos);
    }

    while (curmetaID_2)
    {
        p = compress3(curmetaID_2, p);

        curmetanamepos = p - sw->Index->worddata_buffer;
        /* Store 0 and increase pointer */
        tmp = 0L;
        PACKLONG2(tmp, p);
        p += sizeof(unsigned long);

        memcpy(p, p2, nextposmetaname_2 - (p2 - newdata));
        p += nextposmetaname_2 - (p2 - newdata);
        p2 = newdata + nextposmetaname_2;

        if ((p2 - newdata) == sz_newdata)
        {
            curmetaID_2 = 0;   /* No more metaIDs for newdata */
        }
        else
        {
            curmetaID_2 = uncompress2(&p2);   /* Next metaID */
            nextposmetaname_2 = UNPACKLONG2(p2);
            p2 += sizeof(long);
        }

        /* Put nextmetaname offset.  This back-patch was missing here
        ** (the two loops above both do it), which left the reserved
        ** slot at 0 for metaIDs that only exist in the new data. */
        PACKLONG2(p - sw->Index->worddata_buffer, sw->Index->worddata_buffer + curmetanamepos);
    }

    if (newdata != stack_buffer)
        efree(newdata);

    /* Save the new size */
    sw->Index->sz_worddata_buffer = p - sw->Index->worddata_buffer;
}


/* Writes the list of metaNames into the DB index
**
** Record format:
**
**   <number of metanames>
**   then for each metaname:
**       <len> <metaName (len bytes, no '\0')> <metaID> <metaType>
**       <alias + 1> <rank_bias + RANK_BIAS_RANGE + 1>
**
** All numbers are written with compress3().
*/
void write_MetaNames(SWISH *sw, int id, INDEXDATAHEADER * header, void *DB)
{
    struct metaEntry *entry = NULL;
    int     i,
            sz_buffer,
            len;
    unsigned char *buffer, *s;
    int     fields;

    /* Use new metaType schema - see metanames.h */
    // Format of metaname is
    //   <len><metaName><metaID><metaType><alias><rank_bias>
    //   len, metaID, metaType, alias, and rank_bias are compressed numbers
    //   metaName is the ascii name of the metaname

    fields = 5;   // len, metaID, metaType, alias, rank_bias

    /* Compute buffer size */
    for (sz_buffer = 0, i = 0; i < header->metaCounter; i++)
    {
        entry = header->metaEntryArray[i];
        len = strlen(entry->metaName);
        /* compress can use MAXINTCOMPSIZE bytes in worst case */
        sz_buffer += len + fields * MAXINTCOMPSIZE;
    }

    sz_buffer += MAXINTCOMPSIZE;   /* Add extra MAXINTCOMPSIZE for the number of metanames */

    s = buffer = (unsigned char *) emalloc(sz_buffer);

    s = compress3(header->metaCounter, s);   /* store the number of metanames */

    for (i = 0; i < header->metaCounter; i++)
    {
        entry = header->metaEntryArray[i];
        len = strlen(entry->metaName);
        s = compress3(len, s);
        memcpy(s, entry->metaName, len);
        s += len;
        s = compress3(entry->metaID, s);
        s = compress3(entry->metaType, s);
        s = compress3(entry->alias + 1, s);   /* keep zeros away from compress3, I believe */
        s = compress3(entry->rank_bias + RANK_BIAS_RANGE + 1, s);
    }

    DB_WriteHeaderData(sw, id, buffer, s - buffer, DB);
    efree(buffer);
}


/* Write the hashlist of words into the index header file (used by stopwords and buzzwords).
**
** Record format: <number of words> then <len><word bytes> for each word,
** numbers written with compress3().  Nothing is written when the hash is
** empty.  Always returns 0.
*/
int write_words_to_header(SWISH *sw, int header_ID, struct swline **hash, void *DB)
{
    int     hashval,
            len,
            num_words,
            sz_buffer;
    char   *buffer, *s;
    struct swline *sp = NULL;

    /* Let's count the words */
    for (sz_buffer = 0, num_words = 0, hashval = 0; hashval < HASHSIZE; hashval++)
    {
        sp = hash[hashval];
        while (sp != NULL)
        {
            num_words++;
            sz_buffer += MAXINTCOMPSIZE + strlen(sp->line);
            sp = sp->next;
        }
    }

    if (num_words)
    {
        sz_buffer += MAXINTCOMPSIZE;   /* Add MAXINTCOMPSIZE for the number of words */

        s = buffer = (char *) emalloc(sz_buffer);

        s = (char *) compress3(num_words, (unsigned char *)s);

        for (hashval = 0; hashval < HASHSIZE; hashval++)
        {
            sp = hash[hashval];
            while (sp != NULL)
            {
                len = strlen(sp->line);
                s = (char *) compress3(len, (unsigned char *)s);
                memcpy(s, sp->line, len);
                s += len;
                sp = sp->next;
            }
        }
        DB_WriteHeaderData(sw, header_ID, (unsigned char *)buffer, s - buffer, DB);
        efree(buffer);
    }
    return 0;
}


/* Writes an int table as a header record: <table_size> followed by each
** element stored as (value + 1), all through compress3().
** Always returns 0.
*/
int write_integer_table_to_header(SWISH *sw, int id, int table[], int table_size, void *DB)
{
    int     i,
            tmp;
    char   *s;
    char   *buffer;

    s = buffer = (char *) emalloc((table_size + 1) * MAXINTCOMPSIZE);

    s = (char *) compress3(table_size, (unsigned char *)s);   /* Put the number of elements */
    for (i = 0; i < table_size; i++)
    {
        tmp = table[i] + 1;
        s = (char *) compress3(tmp, (unsigned char *)s);   /* Put all the elements */
    }

    DB_WriteHeaderData(sw, id, (unsigned char *)buffer, s - buffer, DB);

    efree(buffer);
    return 0;
}


/* General read DB routines - Common to all DB */

/* Reads the header of an index file.
** Also reads the information in the header (wordchars, beginchars, etc)
*/

// $$$ to be rewritten as function = smaller code (rasc)

/* Read one / two packed longs from a header record buffer */
#define parse_int_from_buffer(num,s) ((num) = UNPACKLONG2((s)))
#define parse_int2_from_buffer(num1,num2,s) \
    do { \
        (num1) = UNPACKLONG2((s)); \
        (num2) = UNPACKLONG2((s) + sizeof(long)); \
    } while (0)

/*
** Reads header records from DB until a record with id 0 is returned and
** stores each one in the matching field of header.  Every record buffer
** is released with efree() after it has been processed.  An unknown id
** is reported through progerr().
*/
void read_header(SWISH *sw, INDEXDATAHEADER *header, void *DB)
{
    int     id,
            len;
    unsigned long tmp, tmp1, tmp2;
    unsigned char *buffer;

    DB_InitReadHeader(sw, DB);

    DB_ReadHeaderData(sw, &id, &buffer, &len, DB);

    while (id)
    {
        switch (id)
        {
        /* Records that are stored but not loaded into the header */
        case INDEXHEADER_ID:
        case INDEXVERSION_ID:
        case MERGED_ID:
        case DOCPROPENHEADER_ID:
            break;

        /* Character classes: copy, sort and rebuild the lookup table */
        case WORDCHARSHEADER_ID:
            header->wordchars = SafeStrCopy(header->wordchars, (char *)buffer, &header->lenwordchars);
            sortstring(header->wordchars);
            makelookuptable(header->wordchars, header->wordcharslookuptable);
            break;
        case BEGINCHARSHEADER_ID:
            header->beginchars = SafeStrCopy(header->beginchars, (char *)buffer, &header->lenbeginchars);
            sortstring(header->beginchars);
            makelookuptable(header->beginchars, header->begincharslookuptable);
            break;
        case ENDCHARSHEADER_ID:
            header->endchars = SafeStrCopy(header->endchars, (char *)buffer, &header->lenendchars);
            sortstring(header->endchars);
            makelookuptable(header->endchars, header->endcharslookuptable);
            break;
        case IGNOREFIRSTCHARHEADER_ID:
            header->ignorefirstchar = SafeStrCopy(header->ignorefirstchar, (char *)buffer, &header->lenignorefirstchar);
            sortstring(header->ignorefirstchar);
            makelookuptable(header->ignorefirstchar, header->ignorefirstcharlookuptable);
            break;
        case IGNORELASTCHARHEADER_ID:
            header->ignorelastchar = SafeStrCopy(header->ignorelastchar, (char *)buffer, &header->lenignorelastchar);
            sortstring(header->ignorelastchar);
            makelookuptable(header->ignorelastchar, header->ignorelastcharlookuptable);
            break;

        /* replaced by fuzzy_mode Aug 20, 2002
        case STEMMINGHEADER_ID:
            parse_int_from_buffer(tmp,buffer);
            header-> = tmp;
            break;
        case SOUNDEXHEADER_ID:
            parse_int_from_buffer(tmp,buffer);
            header->applySoundexRules = tmp;
            break;
        */

        case FUZZYMODEHEADER_ID:
            parse_int_from_buffer(tmp, buffer);
            header->fuzzy_mode = tmp;
            break;
        case IGNORETOTALWORDCOUNTWHENRANKING_ID:
            parse_int_from_buffer(tmp, buffer);
            header->ignoreTotalWordCountWhenRanking = tmp;
            break;
        case MINWORDLIMHEADER_ID:
            parse_int_from_buffer(tmp, buffer);
            header->minwordlimit = tmp;
            break;
        case MAXWORDLIMHEADER_ID:
            parse_int_from_buffer(tmp, buffer);
            header->maxwordlimit = tmp;
            break;

        case SAVEDASHEADER_ID:
            header->savedasheader = SafeStrCopy(header->savedasheader, (char *)buffer, &header->lensavedasheader);
            break;
        case NAMEHEADER_ID:
            header->indexn = SafeStrCopy(header->indexn, (char *)buffer, &header->lenindexn);
            break;
        case DESCRIPTIONHEADER_ID:
            header->indexd = SafeStrCopy(header->indexd, (char *)buffer, &header->lenindexd);
            break;
        case POINTERHEADER_ID:
            header->indexp = SafeStrCopy(header->indexp, (char *)buffer, &header->lenindexp);
            break;
        case MAINTAINEDBYHEADER_ID:
            header->indexa = SafeStrCopy(header->indexa, (char *)buffer, &header->lenindexa);
            break;
        case INDEXEDONHEADER_ID:
            header->indexedon = SafeStrCopy(header->indexedon, (char *)buffer, &header->lenindexedon);
            break;

        case COUNTSHEADER_ID:
            parse_int2_from_buffer(tmp1, tmp2, buffer);
            header->totalwords = tmp1;
            header->totalfiles = tmp2;
            break;

        /* removed due to patents problems
        case FILEINFOCOMPRESSION_ID:
            ReadHeaderInt(itmp, fp);
            header->applyFileInfoCompression = itmp;
            break;
        */

        case TRANSLATECHARTABLE_ID:
            parse_integer_table_from_buffer(header->translatecharslookuptable, sizeof(header->translatecharslookuptable) / sizeof(int), (char *)buffer);
            break;
        case STOPWORDS_ID:
            parse_stopwords_from_buffer(header, (char *)buffer);
            break;
        case METANAMES_ID:
            parse_MetaNames_from_buffer(header, (char *)buffer);
            break;
        case BUZZWORDS_ID:
            parse_buzzwords_from_buffer(header, (char *)buffer);
            break;

#ifndef USE_BTREE
        case TOTALWORDSPERFILE_ID:
            /* Sized from header->totalfiles, i.e. relies on the counts
            ** record having been read before this one (write_header()
            ** emits them in that order). */
            if ( !header->ignoreTotalWordCountWhenRanking )
            {
                header->TotalWordsPerFile = emalloc( header->totalfiles * sizeof(int) );
                parse_integer_table_from_buffer(header->TotalWordsPerFile, header->totalfiles, (char *)buffer);
            }
            break;
#endif

        default:
            progerr("Severe index error in header");
            break;
        }
        efree(buffer);
        DB_ReadHeaderData(sw, &id, &buffer, &len, DB);
    }
    DB_EndReadHeader(sw, DB);
}


/* Reads the metaNames from the index.
** Existing meta entries of the header are freed first; the record is the
** one produced by write_MetaNames().
*/
void parse_MetaNames_from_buffer(INDEXDATAHEADER *header, char *buffer)
{
    int     len;
    int     num_metanames;
    int     metaType,
            i,
            alias,
            bias,
            metaID;
    char   *word;
    unsigned char *s = (unsigned char *)buffer;
    struct metaEntry *m;

    /* First clear out the default metanames */
    freeMetaEntries( header );

    num_metanames = uncompress2(&s);

    for (i = 0; i < num_metanames; i++)
    {
        /* Name is stored as <len><bytes> without a terminator */
        len = uncompress2(&s);
        word = emalloc(len + 1);
        memcpy(word, s, len);
        s += len;
        word[len] = '\0';

        /* Read metaID */
        metaID = uncompress2(&s);

        /* metaType is stored as-is (no offset) */
        metaType = uncompress2(&s);

        /* alias and rank_bias were stored with an offset - undo it */
        alias = uncompress2(&s) - 1;
        bias = uncompress2(&s) - RANK_BIAS_RANGE - 1;

        /* add the meta tag */
        if ( !(m = addNewMetaEntry(header, word, metaType, metaID)))
            progerr("failed to add new meta entry '%s:%d'", word, metaID );

        m->alias = alias;
        m->rank_bias = bias;

        efree(word);
    }
}


/* Reads the stopwords in the index file.
** Each word is added to both the stop list and the stop hash.
*/
void parse_stopwords_from_buffer(INDEXDATAHEADER *header, char *buffer)
{
    int     len;
    int     num_words;
    int     i;
    char   *word = NULL;
    unsigned char *s = (unsigned char *)buffer;

    num_words = uncompress2(&s);

    for (i = 0; i < num_words; i++)
    {
        len = uncompress2(&s);
        word = emalloc(len + 1);
        memcpy(word, s, len);
        s += len;
        word[len] = '\0';
        addStopList(header, word);
        addstophash(header, word);
        efree(word);
    }
}


/* read the buzzwords from the index file */
void parse_buzzwords_from_buffer(INDEXDATAHEADER *header, char *buffer)
{
    int     len;
    int     num_words;
    int     i;
    char   *word = NULL;
    unsigned char *s = (unsigned char *)buffer;

    num_words = uncompress2(&s);

    for (i = 0; i < num_words; i++)
    {
        len = uncompress2(&s);
        word = emalloc(len + 1);
        memcpy(word, s, len);
        s += len;
        word[len] = '\0';
        addbuzzwordhash(header, word);
        efree(word);
    }
}


/* Reads table_size ints written by write_integer_table_to_header().
** The element count stored in the record is read and skipped; exactly
** table_size elements are decoded, so the caller must pass the same size
** that was used when writing.
*/
void parse_integer_table_from_buffer(int table[], int table_size, char *buffer)
{
    int     tmp,
            i;
    unsigned char *s = (unsigned char *)buffer;

    tmp = uncompress2(&s);   /* Jump the number of elements */
    for (i = 0; i < table_size; i++)
    {
        tmp = uncompress2(&s);   /* Get all the elements */
        table[i] = tmp - 1;      /* values were stored as value + 1 */
    }
}


/* 11/00 Function to read all words starting with a character
**
** Returns a buffer holding every index word whose first byte is
** (unsigned char)c, each terminated by '\0', with an extra '\0' closing
** the list.  The buffer is cached in indexf->keywords[] and returned
** directly on later calls.
**
** Returns "" when c is 0, or when the backend finds no word (in which
** case sw->lasterror is set to WORD_NOT_FOUND and nothing is cached).
*/
char   *getfilewords(SWISH * sw, int c, IndexFILE * indexf)
{
    int     j;
    int     wordlen;
    char   *buffer, *resultword;
    int     bufferpos, bufferlen;
    unsigned char word[2];
    long    wordID;

    if (!c)
        return "";

    /* Check if already read */
    j = (int) ((unsigned char) c);
    if (indexf->keywords[j])
        return (indexf->keywords[j]);

    DB_InitReadWords(sw, indexf->DB);

    word[0] = (unsigned char) c;
    word[1] = '\0';

    DB_ReadFirstWordInvertedIndex(sw, (char *)word, &resultword, &wordID, indexf->DB);
    if (!wordID)
    {
        DB_EndReadWords(sw, indexf->DB);
        sw->lasterror = WORD_NOT_FOUND;
        return "";
    }

    wordlen = strlen(resultword);
    bufferlen = wordlen + MAXSTRLEN * 10;
    bufferpos = 0;
    buffer = emalloc(bufferlen + 1);
    buffer[0] = '\0';

    memcpy(buffer, resultword, wordlen);
    efree(resultword);

    /* Compare as unsigned char (j): the raw int c can be negative when the
    ** caller passes a sign-extended char, and would then never match. */
    if (j != (int)((unsigned char) buffer[bufferpos]))
    {
        /* First word returned does not start with c: empty list */
        buffer[bufferpos] = '\0';
        indexf->keywords[j] = buffer;
        DB_EndReadWords(sw, indexf->DB);
        return (indexf->keywords[j]);
    }

    buffer[bufferpos + wordlen] = '\0';
    bufferpos += wordlen + 1;

    /* Look for occurrences */
    DB_ReadNextWordInvertedIndex(sw, (char *)word, &resultword, &wordID, indexf->DB);
    while (wordID)
    {
        wordlen = strlen(resultword);

        /* Room for the word, its '\0' and the final list terminator */
        if ((bufferpos + wordlen + 1 + 1) > bufferlen)
        {
            bufferlen += MAXSTRLEN + wordlen + 1 + 1;
            buffer = (char *) erealloc(buffer, bufferlen + 1);
        }

        memcpy(buffer + bufferpos, resultword, wordlen);
        efree(resultword);

        if (j != (int)((unsigned char) buffer[bufferpos]))
        {
            /* Reached the words of the next letter: stop */
            buffer[bufferpos] = '\0';
            break;
        }

        buffer[bufferpos + wordlen] = '\0';
        bufferpos += wordlen + 1;

        DB_ReadNextWordInvertedIndex(sw, (char *)word, &resultword, &wordID, indexf->DB);
    }

    /* Close the read session opened above on the success path too,
    ** as read_worddata() does. */
    DB_EndReadWords(sw, indexf->DB);

    buffer[bufferpos] = '\0';
    indexf->keywords[j] = buffer;
    return (indexf->keywords[j]);
}


/* Stores the word count of file number idx.
**
** With USE_BTREE the value goes straight to the DB.  Otherwise it is kept
** in header->TotalWordsPerFile, which is created on first use and grown in
** steps of 20000 entries until idx fits.  Entries that were never set are
** left uninitialized.
*/
void setTotalWordsPerFile(SWISH *sw, IndexFILE *indexf, int idx, int wordcount)
{
    INDEXDATAHEADER *header = &indexf->header;

#ifdef USE_BTREE
    DB_WriteTotalWordsPerFile(sw, idx, wordcount, indexf->DB);
#else

    if ( !header->TotalWordsPerFile || idx >= header->TotalWordsPerFileMax )
    {
        /* Grow until idx is inside the table: a single step is not
        ** enough when idx jumps by more than one chunk. */
        do
        {
            header->TotalWordsPerFileMax += 20000;   /* random guess -- could be a config setting */
        } while ( idx >= header->TotalWordsPerFileMax );

        if ( !header->TotalWordsPerFile )
            header->TotalWordsPerFile = emalloc( header->TotalWordsPerFileMax * sizeof(int) );
        else
            header->TotalWordsPerFile = erealloc( header->TotalWordsPerFile, header->TotalWordsPerFileMax * sizeof(int) );
    }

    header->TotalWordsPerFile[idx] = wordcount;
#endif
}


/* Retrieves into *wordcount the word count stored for file number idx.
** No bounds or NULL check is made on the in-memory table; the caller is
** responsible for passing an idx that was stored before.
*/
void getTotalWordsPerFile(SWISH *sw, IndexFILE *indexf, int idx, int *wordcount)
{
#ifdef USE_BTREE
    DB_ReadTotalWordsPerFile(sw, idx, wordcount, indexf->DB);
#else
    INDEXDATAHEADER *header = &indexf->header;

    *wordcount = header->TotalWordsPerFile[idx];
#endif
}


/*------------------------------------------------------*/
/*---------- General entry point of DB module ----------*/

/*
** Every DB_xxx() routine below forwards its arguments to the function
** pointer of the same name held in sw->Db and returns its result.
** Most backend functions do not receive sw itself.
*/

void   *DB_Create (SWISH *sw, char *dbname)
{
    return sw->Db->DB_Create(sw, dbname);
}

void   *DB_Open (SWISH *sw, char *dbname, int mode)
{
    return sw->Db->DB_Open(sw, dbname, mode);
}

void    DB_Close(SWISH *sw, void *DB)
{
    sw->Db->DB_Close(DB);
}

void    DB_Remove(SWISH *sw, void *DB)
{
    sw->Db->DB_Remove(DB);
}


/* --- Header --- */

int     DB_InitWriteHeader(SWISH *sw, void *DB)
{
    return sw->Db->DB_InitWriteHeader(DB);
}

int     DB_WriteHeaderData(SWISH *sw, int id, unsigned char *s, int len, void *DB)
{
    return sw->Db->DB_WriteHeaderData(id, s, len, DB);
}

int     DB_EndWriteHeader(SWISH *sw, void *DB)
{
    return sw->Db->DB_EndWriteHeader(DB);
}

int     DB_InitReadHeader(SWISH *sw, void *DB)
{
    return sw->Db->DB_InitReadHeader(DB);
}

int     DB_ReadHeaderData(SWISH *sw, int *id, unsigned char **s, int *len, void *DB)
{
    return sw->Db->DB_ReadHeaderData(id, s, len, DB);
}

int     DB_EndReadHeader(SWISH *sw, void *DB)
{
    return sw->Db->DB_EndReadHeader(DB);
}


/* --- Words --- */

int     DB_InitWriteWords(SWISH *sw, void *DB)
{
    return sw->Db->DB_InitWriteWords(DB);
}

long    DB_GetWordID(SWISH *sw, void *DB)
{
    return sw->Db->DB_GetWordID(DB);
}

int     DB_WriteWord(SWISH *sw, char *word, long wordID, void *DB)
{
    return sw->Db->DB_WriteWord(word, wordID, DB);
}

#ifdef USE_BTREE
int     DB_UpdateWordID(SWISH *sw, char *word, long wordID, void *DB)
{
    return sw->Db->DB_UpdateWordID(word, wordID, DB);
}

int     DB_DeleteWordData(SWISH *sw, long wordID, void *DB)
{
    return sw->Db->DB_DeleteWordData(wordID, DB);
}
#endif

int     DB_WriteWordHash(SWISH *sw, char *word, long wordID, void *DB)
{
    return sw->Db->DB_WriteWordHash(word, wordID, DB);
}

long    DB_WriteWordData(SWISH *sw, long wordID, unsigned char *worddata, int lendata, void *DB)
{
    return sw->Db->DB_WriteWordData(wordID, worddata, lendata, DB);
}

int     DB_EndWriteWords(SWISH *sw, void *DB)
{
    return sw->Db->DB_EndWriteWords(DB);
}

int     DB_InitReadWords(SWISH *sw, void *DB)
{
    return sw->Db->DB_InitReadWords(DB);
}

int     DB_ReadWordHash(SWISH *sw, char *word, long *wordID, void *DB)
{
    return sw->Db->DB_ReadWordHash(word, wordID, DB);
}

int     DB_ReadFirstWordInvertedIndex(SWISH *sw, char *word, char **resultword, long *wordID, void *DB)
{
    return sw->Db->DB_ReadFirstWordInvertedIndex(word, resultword, wordID, DB);
}

int     DB_ReadNextWordInvertedIndex(SWISH *sw, char *word, char **resultword, long *wordID, void *DB)
{
    return sw->Db->DB_ReadNextWordInvertedIndex(word, resultword, wordID, DB);
}

long    DB_ReadWordData(SWISH *sw, long wordID, unsigned char **worddata, int *lendata, void *DB)
{
    return sw->Db->DB_ReadWordData(wordID, worddata, lendata, DB);
}

int     DB_EndReadWords(SWISH *sw, void *DB)
{
    return sw->Db->DB_EndReadWords(DB);
}


/* --- Files --- */

int     DB_InitWriteFiles(SWISH *sw, void *DB)
{
    return sw->Db->DB_InitWriteFiles(DB);
}

int     DB_WriteFile(SWISH *sw, int filenum, unsigned char *filedata, int sz_filedata, void *DB)
{
    return sw->Db->DB_WriteFile(filenum, filedata, sz_filedata, DB);
}

int     DB_EndWriteFiles(SWISH *sw, void *DB)
{
    return sw->Db->DB_EndWriteFiles(DB);
}

int     DB_InitReadFiles(SWISH *sw, void *DB)
{
    return sw->Db->DB_InitReadFiles(DB);
}

int     DB_ReadFile(SWISH *sw, int filenum, unsigned char **filedata, int *sz_filedata, void *DB)
{
    return sw->Db->DB_ReadFile(filenum, filedata, sz_filedata, DB);
}

int     DB_EndReadFiles(SWISH *sw, void *DB)
{
    return sw->Db->DB_EndReadFiles(DB);
}


/* --- Sorted indexes --- */

#ifdef USE_BTREE
int     DB_InitWriteSortedIndex(SWISH *sw, void *DB, int n_props)
{
    return sw->Db->DB_InitWriteSortedIndex(DB, n_props);
}
#else
int     DB_InitWriteSortedIndex(SWISH *sw, void *DB)
{
    return sw->Db->DB_InitWriteSortedIndex(DB);
}
#endif

int     DB_WriteSortedIndex(SWISH *sw, int propID, unsigned char *data, int sz_data, void *DB)
{
    return sw->Db->DB_WriteSortedIndex(propID, data, sz_data, DB);
}

int     DB_EndWriteSortedIndex(SWISH *sw, void *DB)
{
    return sw->Db->DB_EndWriteSortedIndex(DB);
}

int     DB_InitReadSortedIndex(SWISH *sw, void *DB)
{
    return sw->Db->DB_InitReadSortedIndex(DB);
}

int     DB_ReadSortedIndex(SWISH *sw, int propID, unsigned char **data, int *sz_data, void *DB)
{
    return sw->Db->DB_ReadSortedIndex(propID, data, sz_data, DB);
}

int     DB_ReadSortedData(SWISH *sw, int *data, int index, int *value, void *DB)
{
    return sw->Db->DB_ReadSortedData(data, index, value, DB);
}

int     DB_EndReadSortedIndex(SWISH *sw, void *DB)
{
    return sw->Db->DB_EndReadSortedIndex(DB);
}


/* --- Properties --- */

void    DB_WriteProperty( SWISH *sw, IndexFILE *indexf, FileRec *fi, int propID, char *buffer, int buf_len, int uncompressed_len, void *db)
{
    sw->Db->DB_WriteProperty( indexf, fi, propID, buffer, buf_len, uncompressed_len, db );
}

void    DB_WritePropPositions(SWISH *sw, IndexFILE *indexf, FileRec *fi, void *db)
{
    sw->Db->DB_WritePropPositions( indexf, fi, db);
}

void    DB_ReadPropPositions(SWISH *sw, IndexFILE *indexf, FileRec *fi, void *db)
{
    sw->Db->DB_ReadPropPositions( indexf, fi, db);
}

char   *DB_ReadProperty(SWISH *sw, IndexFILE *indexf, FileRec *fi, int propID, int *buf_len, int *uncompressed_len, void *db)
{
    return sw->Db->DB_ReadProperty( indexf, fi, propID, buf_len, uncompressed_len, db );
}

void    DB_Reopen_PropertiesForRead(SWISH *sw, void *DB )
{
    sw->Db->DB_Reopen_PropertiesForRead(DB);
}


/* --- Total words per file (BTREE backend only; these receive sw) --- */

#ifdef USE_BTREE
int     DB_WriteTotalWordsPerFile(SWISH *sw, int idx, int wordcount, void *DB)
{
    return sw->Db->DB_WriteTotalWordsPerFile(sw, idx, wordcount, DB);
}

int     DB_ReadTotalWordsPerFile(SWISH *sw, int index, int *value, void *DB)
{
    return sw->Db->DB_ReadTotalWordsPerFile(sw, index, value, DB);
}
#endif