/* ** ** This program and library is free software; you can redistribute it and/or ** modify it under the terms of the GNU (Library) General Public License ** as published by the Free Software Foundation; either version 2 ** of the License, or any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU (Library) General Public License for more details. ** ** You should have received a copy of the GNU (Library) General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ** ** ** ** 2001-05-07 jmruiz init coding ** */ #include "swish.h" #include "merge.h" #include "docprop.h" #include "hash.h" #include "string.h" #include "mem.h" #include "db.h" #include "compress.h" #include "index.h" #include "search.h" #include "result_output.h" #include "metanames.h" #include "dump.h" void dump_index_file_list( SWISH *sw, IndexFILE *indexf ) { int i; int end = indexf->header.totalfiles; i = sw->Search->beginhits ? sw->Search->beginhits - 1 : 0; if ( i >= indexf->header.totalfiles ) { printf("Hey, there are only %d files\n", indexf->header.totalfiles ); exit(-1); } end = indexf->header.totalfiles; if ( sw->Search->maxhits > 0 ) { end = i + sw->Search->maxhits; if ( end > indexf->header.totalfiles ) end = indexf->header.totalfiles; } printf("\n\n-----> FILES in index %s <-----\n", indexf->line ); for (; i < end; i++) { FileRec fi; memset( &fi, 0, sizeof( FileRec ) ); fi.filenum = i+1; fflush(stdout); printf("Dumping File Properties for File Number: %d\n", i+1); dump_file_properties( indexf, &fi ); printf("\n"); printf("ReadAllDocProperties:\n"); fi.docProperties = ReadAllDocPropertiesFromDisk( sw, indexf, i+1 ); dump_file_properties( indexf, &fi ); freefileinfo( &fi ); printf("\n"); /* dump one at a time */ { propEntry *p; int j; struct metaEntry *meta_entry; INDEXDATAHEADER *header = &indexf->header; int count = header->property_count; printf("ReadSingleDocPropertiesFromDisk:\n"); for (j=0; j< count; j++) // just for testing { int metaID = header->propIDX_to_metaID[j]; if ( !(p = ReadSingleDocPropertiesFromDisk(sw, indexf, &fi, metaID, 0 )) ) continue; meta_entry = getPropNameByID( &indexf->header, metaID ); dump_single_property( p, meta_entry ); { // show compression char *buffer; int uncompressed_len; int buf_len; if ( (buffer = DB_ReadProperty( sw, indexf, &fi, meta_entry->metaID, &buf_len, &uncompressed_len, indexf->DB ))) { if ( uncompressed_len ) printf(" %20s: %d -> %d (%4.2f%%)\n", "**Compressed**", uncompressed_len , buf_len, (float)buf_len/(float)uncompressed_len * 100.00f ); efree(buffer); } } freeProperty( p ); } } printf("\n"); freefileinfo(&fi); } printf("\nNumber of File Entries: %d\n", indexf->header.totalfiles); fflush(stdout); } /* Prints out the data in an index DB */ void DB_decompress(SWISH * sw, IndexFILE * indexf) { int i, j, c, fieldnum, frequency, metaname, tmpval, filenum, *posdata; unsigned long nextposmetaname; char word[2]; char *resultword; unsigned char *worddata, *s, flag; int sz_worddata; long wordID; indexf->DB = DB_Open(sw, indexf->line,DB_READ); metaname = 0; nextposmetaname = 0L; c = 0; frequency = 0; /* Read header */ read_header(sw, &indexf->header, indexf->DB); if (DEBUG_MASK & (DEBUG_INDEX_ALL | DEBUG_INDEX_HEADER) ) resultPrintHeader(sw, 0, &indexf->header, indexf->line, 0); fieldnum = 0; /* Do metanames first as that will be helpful for decoding next */ if (DEBUG_MASK & (DEBUG_INDEX_ALL | DEBUG_INDEX_METANAMES) ) dump_metanames( sw, indexf, 1 ); if (DEBUG_MASK & DEBUG_INDEX_WORDS_ONLY) { DB_InitReadWords(sw, indexf->DB); for( j = 0; j < 256; j++ ) { word[0] = (unsigned char) j; word[1] = '\0'; DB_ReadFirstWordInvertedIndex(sw, word,&resultword,&wordID,indexf->DB); while(wordID) { printf("%s\n",resultword); efree(resultword); DB_ReadNextWordInvertedIndex(sw, word,&resultword,&wordID,indexf->DB); } } DB_EndReadWords(sw, indexf->DB); } else if (DEBUG_MASK & (DEBUG_INDEX_ALL | DEBUG_INDEX_WORDS | DEBUG_INDEX_WORDS_FULL | DEBUG_INDEX_WORDS_META) ) { int *meta_used; int end_meta = 0; printf("\n-----> WORD INFO in index %s <-----\n", indexf->line); for(i = 0; i < indexf->header.metaCounter; i++) if ( indexf->header.metaEntryArray[i]->metaID > end_meta ) end_meta = indexf->header.metaEntryArray[i]->metaID; meta_used = emalloc( sizeof(int) * ( end_meta + 1) ); /* _META only reports which tags the words are found in */ for(i = 0; i <= end_meta; i++) meta_used[i] = 0; DB_InitReadWords(sw, indexf->DB); for(j=1;j<256;j++) { word[0] = (unsigned char) j; word[1] = '\0'; DB_ReadFirstWordInvertedIndex(sw, word,&resultword,&wordID,indexf->DB); while(wordID && (((int)((unsigned char)resultword[0]))== j)) { printf("\n%s",resultword); /* Read Word's data */ DB_ReadWordData(sw, wordID, &worddata, &sz_worddata, indexf->DB); /* parse and print word's data */ s = worddata; tmpval = uncompress2(&s); /* tfrequency */ metaname = uncompress2(&s); /* metaname */ if (metaname) { nextposmetaname = UNPACKLONG2(s); s += sizeof(long); } filenum = 0; while(1) { /* Read on all items */ uncompress_location_values(&s,&flag,&tmpval,&frequency); filenum += tmpval; posdata = (int *) emalloc(frequency * sizeof(int)); uncompress_location_positions(&s,flag,frequency,posdata); // if (sw->verbose >= 4) if (DEBUG_MASK & (DEBUG_INDEX_ALL|DEBUG_INDEX_WORDS_FULL)) { struct metaEntry *m; printf("\n Meta:%d", metaname); /* Get path from property list */ if ( (m = getPropNameByName( &sw->indexlist->header, AUTOPROPERTY_DOCPATH )) ) { RESULT r; char *s; memset( &r, 0, sizeof( RESULT ) ); r.indexf = indexf; r.filenum = filenum; r.fi.filenum = filenum; s = getResultPropAsString( sw, &r, m->metaID); printf(" %s", s ); efree( s ); } else printf(" Failed to lookup meta entry"); printf(" Freq:%d", frequency); printf(" Pos/Struct:"); } else if ( DEBUG_MASK & DEBUG_INDEX_WORDS_META) meta_used[ metaname ]++; else { printf(" [%d", metaname); printf(" %d", filenum); printf(" %d (", frequency); } for (i = 0; i < frequency; i++) { if (DEBUG_MASK & (DEBUG_INDEX_ALL | DEBUG_INDEX_WORDS_FULL)) //if (sw->verbose >= 4) { if (i) printf(",%d/%x", GET_POSITION(posdata[i]),GET_STRUCTURE(posdata[i])); else printf("%d/%x", GET_POSITION(posdata[i]), GET_STRUCTURE(posdata[i])); } else if ( DEBUG_MASK & DEBUG_INDEX_WORDS) { if (i) printf(" %d/%x", GET_POSITION(posdata[i]),GET_STRUCTURE(posdata[i])); else printf("%d/%x", GET_POSITION(posdata[i]),GET_STRUCTURE(posdata[i])); } } efree(posdata); if ( DEBUG_MASK & DEBUG_INDEX_WORDS ) printf(")]"); if ((s - worddata) == sz_worddata) break; /* End of worddata */ if ((unsigned long)(s - worddata) == nextposmetaname) { filenum = 0; metaname = uncompress2(&s); if (metaname) { nextposmetaname = UNPACKLONG2(s); s += sizeof(long); } else nextposmetaname = 0L; } } if ( DEBUG_MASK & DEBUG_INDEX_WORDS_META) { for(i = 0; i <= end_meta; i++) { if ( meta_used[i] ) printf( "\t%d", i ); meta_used[i] = 0; } } if ( !( DEBUG_MASK & DEBUG_INDEX_WORDS_META )) printf("\n"); efree(worddata); efree(resultword); DB_ReadNextWordInvertedIndex(sw, word,&resultword,&wordID,indexf->DB); } } DB_EndReadWords(sw, indexf->DB); efree( meta_used ); } /* Decode Stop Words: All them are in just one line */ if (DEBUG_MASK & (DEBUG_INDEX_ALL | DEBUG_INDEX_STOPWORDS) ) { printf("\n\n-----> STOP WORDS in %s <-----\n" , indexf->line); for(i=0;iheader.stopPos;i++) printf("%s ",indexf->header.stopList[i]); printf("\n"); } /* Decode File Info */ if (DEBUG_MASK & (DEBUG_INDEX_ALL | DEBUG_INDEX_FILES) ) dump_index_file_list( sw, indexf ); DB_Close(sw, indexf->DB); } int check_sorted_index( SWISH *sw, IndexFILE *indexf, struct metaEntry *m ) { unsigned char *buffer; int sz_buffer; DB_InitReadSortedIndex(sw, indexf->DB); /* Get the sorted index of the property */ DB_ReadSortedIndex(sw, m->metaID, &buffer, &sz_buffer, indexf->DB); if ( sz_buffer ) efree( buffer ); /* Table doesn't exist */ return sz_buffer; } void dump_metanames( SWISH *sw, IndexFILE *indexf, int check_presorted ) { struct metaEntry *meta_entry; int i; printf("\n\n-----> METANAMES for %s <-----\n", indexf->line ); for(i = 0; i < indexf->header.metaCounter; i++) { meta_entry = indexf->header.metaEntryArray[i]; printf("%20s : id=%2d type=%2d ",meta_entry->metaName, meta_entry->metaID, meta_entry->metaType); if ( is_meta_index( meta_entry ) ) printf(" META_INDEX Rank Bias=%3d", meta_entry->rank_bias ); if ( is_meta_internal( meta_entry ) ) printf(" META_INTERNAL"); if ( is_meta_property( meta_entry ) ) { printf(" META_PROP:"); if ( is_meta_string(meta_entry) ) printf("STRING(case:%s)", is_meta_ignore_case(meta_entry)? "ignore" : "compare"); else if ( is_meta_date(meta_entry) ) printf("DATE"); else if ( is_meta_number(meta_entry) ) printf("NUMBER"); else printf("unknown!"); } if ( check_presorted && check_sorted_index( sw, indexf, meta_entry) ) printf(" *presorted*"); if ( meta_entry->alias ) { struct metaEntry *m = is_meta_index( meta_entry ) ? getMetaNameByID( &indexf->header, meta_entry->alias ) : getPropNameByID( &indexf->header, meta_entry->alias ); printf(" [Alias for %s (%d)]", m->metaName, m->metaID ); } printf("\n"); } printf("\n"); }