/*
**
** This program and library is free software; you can redistribute it and/or
** modify it under the terms of the GNU (Library) General Public License
** as published by the Free Software Foundation; either version 2
** of the License, or any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU (Library) General Public License for more details.
**
** You should have received a copy of the GNU (Library) General Public License
** along with this program; if not, write to the Free Software
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
**
**
**
** 2001-05-07 jmruiz init coding
**
*/
22 |
|
23 |
#include "swish.h" |
24 |
#include "merge.h" |
25 |
#include "docprop.h" |
26 |
#include "hash.h" |
27 |
#include "string.h" |
28 |
#include "mem.h" |
29 |
#include "db.h" |
30 |
#include "compress.h" |
31 |
#include "index.h" |
32 |
#include "search.h" |
33 |
#include "result_output.h" |
34 |
#include "metanames.h" |
35 |
#include "dump.h" |
36 |
|
37 |
|
38 |
|
39 |
|
40 |
/*
** Dump the stored properties of a range of files in an index.
**
** The range is taken from the search settings in sw:
**   sw->Search->beginhits - 1-based number of the first file to dump
**                           (zero means start with the first file)
**   sw->Search->maxhits   - when greater than zero, the maximum number
**                           of files to dump
**
** For every file the properties are printed three ways: from a FileRec
** that has only its file number set, after loading all properties with
** ReadAllDocPropertiesFromDisk(), and one property at a time with
** ReadSingleDocPropertiesFromDisk().  For each single property the
** stored and uncompressed sizes are shown when DB_ReadProperty() reports
** a non-zero uncompressed length.
**
** Prints a message and exits if the first requested file is past the end
** of the index.
*/
void dump_index_file_list( SWISH *sw, IndexFILE *indexf )
{
    INDEXDATAHEADER *hdr = &indexf->header;
    int     first;              /* zero-based index of first file to dump */
    int     last;               /* one past the zero-based index of the last file */
    int     filenum;            /* 1-based file number being dumped */

    first = 0;
    if ( sw->Search->beginhits )
        first = sw->Search->beginhits - 1;

    if ( first >= hdr->totalfiles )
    {
        printf("Hey, there are only %d files\n", hdr->totalfiles );
        exit(-1);
    }

    last = hdr->totalfiles;

    if ( sw->Search->maxhits > 0 )
    {
        last = first + sw->Search->maxhits;
        if ( last > hdr->totalfiles )
            last = hdr->totalfiles;
    }


    printf("\n\n-----> FILES in index %s <-----\n", indexf->line );

    for ( filenum = first + 1; filenum <= last; filenum++ )
    {
        FileRec fi;
        int     prop_idx;
        int     prop_total;

        memset( &fi, 0, sizeof( FileRec ) );
        fi.filenum = filenum;

        fflush(stdout);
        printf("Dumping File Properties for File Number: %d\n", filenum);

        /* First pass: only the file number is set in fi */
        dump_file_properties( indexf, &fi );
        printf("\n");

        /* Second pass: every property loaded in one call */
        printf("ReadAllDocProperties:\n");
        fi.docProperties = ReadAllDocPropertiesFromDisk( sw, indexf, filenum );
        dump_file_properties( indexf, &fi );
        freefileinfo( &fi );

        printf("\n");

        /*
        ** Third pass: read the properties one at a time.
        ** NOTE(review): fi is handed to the readers below after the
        ** freefileinfo() call above, which relies on freefileinfo()
        ** leaving the record reusable -- confirm.
        */
        prop_total = hdr->property_count;

        printf("ReadSingleDocPropertiesFromDisk:\n");

        for ( prop_idx = 0; prop_idx < prop_total; prop_idx++ )
        {
            struct metaEntry *meta_entry;
            propEntry        *prop;
            char             *raw;
            int               raw_len;
            int               uncompressed_len;
            int               metaID = hdr->propIDX_to_metaID[prop_idx];

            prop = ReadSingleDocPropertiesFromDisk( sw, indexf, &fi, metaID, 0 );
            if ( !prop )
                continue;

            meta_entry = getPropNameByID( &indexf->header, metaID );
            dump_single_property( prop, meta_entry );

            /* Show compression: stored size versus uncompressed size */
            raw = DB_ReadProperty( sw, indexf, &fi, meta_entry->metaID, &raw_len, &uncompressed_len, indexf->DB );
            if ( raw )
            {
                if ( uncompressed_len )
                    printf(" %20s: %d -> %d (%4.2f%%)\n", "**Compressed**", uncompressed_len , raw_len, (float)raw_len/(float)uncompressed_len * 100.00f );

                efree( raw );
            }

            freeProperty( prop );
        }

        printf("\n");

        freefileinfo( &fi );
    }

    printf("\nNumber of File Entries: %d\n", indexf->header.totalfiles);
    fflush(stdout);
}
136 |
|
137 |
|
138 |
|
139 |
|
140 |
/* Prints out the data in an index DB */ |
141 |
void DB_decompress(SWISH * sw, IndexFILE * indexf) |
142 |
{ |
143 |
int i, |
144 |
j, |
145 |
c, |
146 |
fieldnum, |
147 |
frequency, |
148 |
metaname, |
149 |
tmpval, |
150 |
filenum, |
151 |
*posdata; |
152 |
unsigned long nextposmetaname; |
153 |
char word[2]; |
154 |
char *resultword; |
155 |
unsigned char *worddata, *s, flag; |
156 |
int sz_worddata; |
157 |
long wordID; |
158 |
|
159 |
|
160 |
|
161 |
indexf->DB = DB_Open(sw, indexf->line,DB_READ); |
162 |
|
163 |
metaname = 0; |
164 |
|
165 |
nextposmetaname = 0L; |
166 |
|
167 |
c = 0; |
168 |
|
169 |
frequency = 0; |
170 |
|
171 |
/* Read header */ |
172 |
read_header(sw, &indexf->header, indexf->DB); |
173 |
|
174 |
|
175 |
if (DEBUG_MASK & (DEBUG_INDEX_ALL | DEBUG_INDEX_HEADER) ) |
176 |
resultPrintHeader(sw, 0, &indexf->header, indexf->line, 0); |
177 |
|
178 |
fieldnum = 0; |
179 |
|
180 |
|
181 |
/* Do metanames first as that will be helpful for decoding next */ |
182 |
if (DEBUG_MASK & (DEBUG_INDEX_ALL | DEBUG_INDEX_METANAMES) ) |
183 |
dump_metanames( sw, indexf, 1 ); |
184 |
|
185 |
if (DEBUG_MASK & DEBUG_INDEX_WORDS_ONLY) |
186 |
{ |
187 |
DB_InitReadWords(sw, indexf->DB); |
188 |
|
189 |
for( j = 0; j < 256; j++ ) |
190 |
{ |
191 |
word[0] = (unsigned char) j; |
192 |
word[1] = '\0'; |
193 |
DB_ReadFirstWordInvertedIndex(sw, word,&resultword,&wordID,indexf->DB); |
194 |
|
195 |
while(wordID) |
196 |
{ |
197 |
printf("%s\n",resultword); |
198 |
|
199 |
|
200 |
efree(resultword); |
201 |
DB_ReadNextWordInvertedIndex(sw, word,&resultword,&wordID,indexf->DB); |
202 |
|
203 |
} |
204 |
} |
205 |
DB_EndReadWords(sw, indexf->DB); |
206 |
} |
207 |
|
208 |
|
209 |
else if (DEBUG_MASK & (DEBUG_INDEX_ALL | DEBUG_INDEX_WORDS | DEBUG_INDEX_WORDS_FULL | DEBUG_INDEX_WORDS_META) ) |
210 |
{ |
211 |
int *meta_used; |
212 |
int end_meta = 0; |
213 |
|
214 |
printf("\n-----> WORD INFO in index %s <-----\n", indexf->line); |
215 |
|
216 |
for(i = 0; i < indexf->header.metaCounter; i++) |
217 |
if ( indexf->header.metaEntryArray[i]->metaID > end_meta ) |
218 |
end_meta = indexf->header.metaEntryArray[i]->metaID; |
219 |
|
220 |
meta_used = emalloc( sizeof(int) * ( end_meta + 1) ); |
221 |
|
222 |
/* _META only reports which tags the words are found in */ |
223 |
for(i = 0; i <= end_meta; i++) |
224 |
meta_used[i] = 0; |
225 |
|
226 |
|
227 |
DB_InitReadWords(sw, indexf->DB); |
228 |
|
229 |
for(j=1;j<256;j++) |
230 |
{ |
231 |
word[0] = (unsigned char) j; word[1] = '\0'; |
232 |
DB_ReadFirstWordInvertedIndex(sw, word,&resultword,&wordID,indexf->DB); |
233 |
|
234 |
while(wordID && (((int)((unsigned char)resultword[0]))== j)) |
235 |
{ |
236 |
printf("\n%s",resultword); |
237 |
|
238 |
/* Read Word's data */ |
239 |
DB_ReadWordData(sw, wordID, &worddata, &sz_worddata, indexf->DB); |
240 |
|
241 |
/* parse and print word's data */ |
242 |
s = worddata; |
243 |
|
244 |
tmpval = uncompress2(&s); /* tfrequency */ |
245 |
metaname = uncompress2(&s); /* metaname */ |
246 |
if (metaname) |
247 |
{ |
248 |
nextposmetaname = UNPACKLONG2(s); |
249 |
s += sizeof(long); |
250 |
} |
251 |
|
252 |
filenum = 0; |
253 |
while(1) |
254 |
{ /* Read on all items */ |
255 |
uncompress_location_values(&s,&flag,&tmpval,&frequency); |
256 |
filenum += tmpval; |
257 |
posdata = (int *) emalloc(frequency * sizeof(int)); |
258 |
uncompress_location_positions(&s,flag,frequency,posdata); |
259 |
|
260 |
|
261 |
// if (sw->verbose >= 4) |
262 |
if (DEBUG_MASK & (DEBUG_INDEX_ALL|DEBUG_INDEX_WORDS_FULL)) |
263 |
{ |
264 |
struct metaEntry *m; |
265 |
|
266 |
printf("\n Meta:%d", metaname); |
267 |
|
268 |
|
269 |
/* Get path from property list */ |
270 |
if ( (m = getPropNameByName( &sw->indexlist->header, AUTOPROPERTY_DOCPATH )) ) |
271 |
{ |
272 |
RESULT r; |
273 |
char *s; |
274 |
|
275 |
memset( &r, 0, sizeof( RESULT ) ); |
276 |
|
277 |
r.indexf = indexf; |
278 |
r.filenum = filenum; |
279 |
r.fi.filenum = filenum; |
280 |
|
281 |
s = getResultPropAsString( sw, &r, m->metaID); |
282 |
|
283 |
printf(" %s", s ); |
284 |
efree( s ); |
285 |
|
286 |
} |
287 |
else |
288 |
printf(" Failed to lookup meta entry"); |
289 |
|
290 |
|
291 |
printf(" Freq:%d", frequency); |
292 |
printf(" Pos/Struct:"); |
293 |
} |
294 |
else if ( DEBUG_MASK & DEBUG_INDEX_WORDS_META) |
295 |
meta_used[ metaname ]++; |
296 |
else |
297 |
{ |
298 |
printf(" [%d", metaname); |
299 |
printf(" %d", filenum); |
300 |
printf(" %d (", frequency); |
301 |
} |
302 |
|
303 |
|
304 |
for (i = 0; i < frequency; i++) |
305 |
{ |
306 |
if (DEBUG_MASK & (DEBUG_INDEX_ALL | DEBUG_INDEX_WORDS_FULL)) |
307 |
//if (sw->verbose >= 4) |
308 |
{ |
309 |
if (i) |
310 |
printf(",%d/%x", GET_POSITION(posdata[i]),GET_STRUCTURE(posdata[i])); |
311 |
else |
312 |
printf("%d/%x", GET_POSITION(posdata[i]), GET_STRUCTURE(posdata[i])); |
313 |
} |
314 |
else if ( DEBUG_MASK & DEBUG_INDEX_WORDS) |
315 |
{ |
316 |
if (i) |
317 |
printf(" %d/%x", GET_POSITION(posdata[i]),GET_STRUCTURE(posdata[i])); |
318 |
else |
319 |
printf("%d/%x", GET_POSITION(posdata[i]),GET_STRUCTURE(posdata[i])); |
320 |
} |
321 |
} |
322 |
|
323 |
efree(posdata); |
324 |
|
325 |
if ( DEBUG_MASK & DEBUG_INDEX_WORDS ) |
326 |
printf(")]"); |
327 |
|
328 |
if ((s - worddata) == sz_worddata) |
329 |
break; /* End of worddata */ |
330 |
|
331 |
if ((unsigned long)(s - worddata) == nextposmetaname) |
332 |
{ |
333 |
filenum = 0; |
334 |
metaname = uncompress2(&s); |
335 |
if (metaname) |
336 |
{ |
337 |
nextposmetaname = UNPACKLONG2(s); |
338 |
s += sizeof(long); |
339 |
} |
340 |
else |
341 |
nextposmetaname = 0L; |
342 |
} |
343 |
} |
344 |
|
345 |
if ( DEBUG_MASK & DEBUG_INDEX_WORDS_META) |
346 |
{ |
347 |
for(i = 0; i <= end_meta; i++) |
348 |
{ |
349 |
if ( meta_used[i] ) |
350 |
printf( "\t%d", i ); |
351 |
meta_used[i] = 0; |
352 |
} |
353 |
} |
354 |
|
355 |
|
356 |
if ( !( DEBUG_MASK & DEBUG_INDEX_WORDS_META )) |
357 |
printf("\n"); |
358 |
|
359 |
efree(worddata); |
360 |
efree(resultword); |
361 |
DB_ReadNextWordInvertedIndex(sw, word,&resultword,&wordID,indexf->DB); |
362 |
} |
363 |
} |
364 |
DB_EndReadWords(sw, indexf->DB); |
365 |
|
366 |
efree( meta_used ); |
367 |
} |
368 |
|
369 |
|
370 |
|
371 |
/* Decode Stop Words: All them are in just one line */ |
372 |
if (DEBUG_MASK & (DEBUG_INDEX_ALL | DEBUG_INDEX_STOPWORDS) ) |
373 |
{ |
374 |
printf("\n\n-----> STOP WORDS in %s <-----\n" , indexf->line); |
375 |
for(i=0;i<indexf->header.stopPos;i++) |
376 |
printf("%s ",indexf->header.stopList[i]); |
377 |
printf("\n"); |
378 |
} |
379 |
|
380 |
|
381 |
|
382 |
/* Decode File Info */ |
383 |
if (DEBUG_MASK & (DEBUG_INDEX_ALL | DEBUG_INDEX_FILES) ) |
384 |
dump_index_file_list( sw, indexf ); |
385 |
|
386 |
|
387 |
DB_Close(sw, indexf->DB); |
388 |
|
389 |
} |
390 |
|
391 |
|
392 |
/*
** Report whether a sorted index is stored for the given meta entry.
**
** Asks the DB layer for the sorted index of m->metaID and returns the
** size it reports.  A zero size is returned as-is; for a non-zero size
** the returned buffer is released before returning, since only the size
** is of interest here.  dump_metanames() treats a non-zero result as
** "presorted".
*/
int check_sorted_index( SWISH *sw, IndexFILE *indexf, struct metaEntry *m )
{
    unsigned char *sorted_data;
    int            sorted_len;

    DB_InitReadSortedIndex( sw, indexf->DB );

    /* Fetch the sorted index stored for this property */
    DB_ReadSortedIndex( sw, m->metaID, &sorted_data, &sorted_len, indexf->DB );

    /* Nothing was returned, so there is nothing to free */
    if ( sorted_len == 0 )
        return 0;

    efree( sorted_data );
    return sorted_len;
}
408 |
|
409 |
|
410 |
/*
** Print one line for every meta entry in the index header.
**
** Each line shows the entry's name, ID and type, followed by the flags
** that apply: META_INDEX (with rank bias), META_INTERNAL, META_PROP with
** its value kind, "*presorted*" and, for aliases, the entry the alias
** points to.
**
** sw, indexf      - the index whose header (indexf->header) is dumped
** check_presorted - when non-zero, check_sorted_index() is called for
**                   every entry to see whether a sorted index is stored
*/
void dump_metanames( SWISH *sw, IndexFILE *indexf, int check_presorted )
{
    struct metaEntry *meta_entry;
    int     i;

    printf("\n\n-----> METANAMES for %s <-----\n", indexf->line );

    for ( i = 0; i < indexf->header.metaCounter; i++ )
    {
        meta_entry = indexf->header.metaEntryArray[i];

        printf("%20s : id=%2d type=%2d ",meta_entry->metaName, meta_entry->metaID, meta_entry->metaType);

        if ( is_meta_index( meta_entry ) )
            printf(" META_INDEX Rank Bias=%3d", meta_entry->rank_bias );

        if ( is_meta_internal( meta_entry ) )
            printf(" META_INTERNAL");

        if ( is_meta_property( meta_entry ) )
        {
            printf(" META_PROP:");

            if ( is_meta_string(meta_entry) )
                printf("STRING(case:%s)", is_meta_ignore_case(meta_entry)? "ignore" : "compare");

            else if ( is_meta_date(meta_entry) )
                printf("DATE");

            else if ( is_meta_number(meta_entry) )
                printf("NUMBER");

            else
                printf("unknown!");
        }

        if ( check_presorted && check_sorted_index( sw, indexf, meta_entry) )
            printf(" *presorted*");

        if ( meta_entry->alias )
        {
            /* An alias is resolved among metanames or properties, matching the entry's own kind */
            struct metaEntry *m = is_meta_index( meta_entry )
                ? getMetaNameByID( &indexf->header, meta_entry->alias )
                : getPropNameByID( &indexf->header, meta_entry->alias );

            /*
            ** Guard against an alias ID that does not resolve (the lookups
            ** presumably return NULL in that case) instead of crashing
            ** the dump on a NULL dereference.
            */
            if ( m )
                printf(" [Alias for %s (%d)]", m->metaName, m->metaID );
            else
                printf(" [Alias for unknown id (%d)]", meta_entry->alias );
        }

        printf("\n");
    }

    printf("\n");
}
468 |
|
469 |
|