1 |
adcroft |
1.1 |
/* |
2 |
|
|
** |
3 |
|
|
** This program and library is free software; you can redistribute it and/or |
4 |
|
|
** modify it under the terms of the GNU (Library) General Public License |
5 |
|
|
** as published by the Free Software Foundation; either version 2 |
6 |
|
|
** of the License, or any later version. |
7 |
|
|
** |
8 |
|
|
** This program is distributed in the hope that it will be useful, |
9 |
|
|
** but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 |
|
|
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
11 |
|
|
** GNU (Library) General Public License for more details. |
12 |
|
|
** |
13 |
|
|
** You should have received a copy of the GNU (Library) General Public License |
14 |
|
|
** along with this program; if not, write to the Free Software |
15 |
|
|
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
16 |
|
|
** |
17 |
|
|
** |
18 |
|
|
** |
19 |
|
|
** 2001-05-07 jmruiz init coding |
20 |
|
|
** |
21 |
|
|
*/ |
22 |
|
|
|
23 |
|
|
#include "swish.h" |
24 |
|
|
#include "merge.h" |
25 |
|
|
#include "docprop.h" |
26 |
|
|
#include "hash.h" |
27 |
|
|
#include "string.h" |
28 |
|
|
#include "mem.h" |
29 |
|
|
#include "db.h" |
30 |
|
|
#include "compress.h" |
31 |
|
|
#include "index.h" |
32 |
|
|
#include "search.h" |
33 |
|
|
#include "result_output.h" |
34 |
|
|
#include "metanames.h" |
35 |
|
|
#include "dump.h" |
36 |
|
|
|
37 |
|
|
|
38 |
|
|
|
39 |
|
|
|
40 |
|
|
void dump_index_file_list( SWISH *sw, IndexFILE *indexf ) |
41 |
|
|
{ |
42 |
|
|
int i; |
43 |
|
|
int end = indexf->header.totalfiles; |
44 |
|
|
|
45 |
|
|
i = sw->Search->beginhits ? sw->Search->beginhits - 1 : 0; |
46 |
|
|
|
47 |
|
|
if ( i >= indexf->header.totalfiles ) |
48 |
|
|
{ |
49 |
|
|
printf("Hey, there are only %d files\n", indexf->header.totalfiles ); |
50 |
|
|
exit(-1); |
51 |
|
|
} |
52 |
|
|
|
53 |
|
|
end = indexf->header.totalfiles; |
54 |
|
|
|
55 |
|
|
if ( sw->Search->maxhits > 0 ) |
56 |
|
|
{ |
57 |
|
|
end = i + sw->Search->maxhits; |
58 |
|
|
if ( end > indexf->header.totalfiles ) |
59 |
|
|
end = indexf->header.totalfiles; |
60 |
|
|
} |
61 |
|
|
|
62 |
|
|
|
63 |
|
|
printf("\n\n-----> FILES in index %s <-----\n", indexf->line ); |
64 |
|
|
|
65 |
|
|
for (; i < end; i++) |
66 |
|
|
{ |
67 |
|
|
FileRec fi; |
68 |
|
|
|
69 |
|
|
memset( &fi, 0, sizeof( FileRec ) ); |
70 |
|
|
|
71 |
|
|
fi.filenum = i+1; |
72 |
|
|
|
73 |
|
|
fflush(stdout); |
74 |
|
|
printf("Dumping File Properties for File Number: %d\n", i+1); |
75 |
|
|
|
76 |
|
|
|
77 |
|
|
dump_file_properties( indexf, &fi ); |
78 |
|
|
printf("\n"); |
79 |
|
|
|
80 |
|
|
|
81 |
|
|
printf("ReadAllDocProperties:\n"); |
82 |
|
|
fi.docProperties = ReadAllDocPropertiesFromDisk( sw, indexf, i+1 ); |
83 |
|
|
dump_file_properties( indexf, &fi ); |
84 |
|
|
freefileinfo( &fi ); |
85 |
|
|
|
86 |
|
|
printf("\n"); |
87 |
|
|
|
88 |
|
|
|
89 |
|
|
/* dump one at a time */ |
90 |
|
|
{ |
91 |
|
|
propEntry *p; |
92 |
|
|
int j; |
93 |
|
|
struct metaEntry *meta_entry; |
94 |
|
|
INDEXDATAHEADER *header = &indexf->header; |
95 |
|
|
int count = header->property_count; |
96 |
|
|
|
97 |
|
|
printf("ReadSingleDocPropertiesFromDisk:\n"); |
98 |
|
|
|
99 |
|
|
for (j=0; j< count; j++) // just for testing |
100 |
|
|
{ |
101 |
|
|
int metaID = header->propIDX_to_metaID[j]; |
102 |
|
|
|
103 |
|
|
if ( !(p = ReadSingleDocPropertiesFromDisk(sw, indexf, &fi, metaID, 0 )) ) |
104 |
|
|
continue; |
105 |
|
|
|
106 |
|
|
meta_entry = getPropNameByID( &indexf->header, metaID ); |
107 |
|
|
dump_single_property( p, meta_entry ); |
108 |
|
|
|
109 |
|
|
{ // show compression |
110 |
|
|
char *buffer; |
111 |
|
|
int uncompressed_len; |
112 |
|
|
int buf_len; |
113 |
|
|
|
114 |
|
|
if ( (buffer = DB_ReadProperty( sw, indexf, &fi, meta_entry->metaID, &buf_len, &uncompressed_len, indexf->DB ))) |
115 |
|
|
{ |
116 |
|
|
if ( uncompressed_len ) |
117 |
|
|
printf(" %20s: %d -> %d (%4.2f%%)\n", "**Compressed**", uncompressed_len , buf_len, (float)buf_len/(float)uncompressed_len * 100.00f ); |
118 |
|
|
|
119 |
|
|
efree(buffer); |
120 |
|
|
} |
121 |
|
|
} |
122 |
|
|
|
123 |
|
|
|
124 |
|
|
|
125 |
|
|
freeProperty( p ); |
126 |
|
|
} |
127 |
|
|
} |
128 |
|
|
printf("\n"); |
129 |
|
|
|
130 |
|
|
|
131 |
|
|
freefileinfo(&fi); |
132 |
|
|
} |
133 |
|
|
printf("\nNumber of File Entries: %d\n", indexf->header.totalfiles); |
134 |
|
|
fflush(stdout); |
135 |
|
|
} |
136 |
|
|
|
137 |
|
|
|
138 |
|
|
|
139 |
|
|
|
140 |
|
|
/* Prints out the data in an index DB */ |
141 |
|
|
void DB_decompress(SWISH * sw, IndexFILE * indexf) |
142 |
|
|
{ |
143 |
|
|
int i, |
144 |
|
|
j, |
145 |
|
|
c, |
146 |
|
|
fieldnum, |
147 |
|
|
frequency, |
148 |
|
|
metaname, |
149 |
|
|
tmpval, |
150 |
|
|
filenum, |
151 |
|
|
*posdata; |
152 |
|
|
unsigned long nextposmetaname; |
153 |
|
|
char word[2]; |
154 |
|
|
char *resultword; |
155 |
|
|
unsigned char *worddata, *s, flag; |
156 |
|
|
int sz_worddata; |
157 |
|
|
long wordID; |
158 |
|
|
|
159 |
|
|
|
160 |
|
|
|
161 |
|
|
indexf->DB = DB_Open(sw, indexf->line,DB_READ); |
162 |
|
|
|
163 |
|
|
metaname = 0; |
164 |
|
|
|
165 |
|
|
nextposmetaname = 0L; |
166 |
|
|
|
167 |
|
|
c = 0; |
168 |
|
|
|
169 |
|
|
frequency = 0; |
170 |
|
|
|
171 |
|
|
/* Read header */ |
172 |
|
|
read_header(sw, &indexf->header, indexf->DB); |
173 |
|
|
|
174 |
|
|
|
175 |
|
|
if (DEBUG_MASK & (DEBUG_INDEX_ALL | DEBUG_INDEX_HEADER) ) |
176 |
|
|
resultPrintHeader(sw, 0, &indexf->header, indexf->line, 0); |
177 |
|
|
|
178 |
|
|
fieldnum = 0; |
179 |
|
|
|
180 |
|
|
|
181 |
|
|
/* Do metanames first as that will be helpful for decoding next */ |
182 |
|
|
if (DEBUG_MASK & (DEBUG_INDEX_ALL | DEBUG_INDEX_METANAMES) ) |
183 |
|
|
dump_metanames( sw, indexf, 1 ); |
184 |
|
|
|
185 |
|
|
if (DEBUG_MASK & DEBUG_INDEX_WORDS_ONLY) |
186 |
|
|
{ |
187 |
|
|
DB_InitReadWords(sw, indexf->DB); |
188 |
|
|
|
189 |
|
|
for( j = 0; j < 256; j++ ) |
190 |
|
|
{ |
191 |
|
|
word[0] = (unsigned char) j; |
192 |
|
|
word[1] = '\0'; |
193 |
|
|
DB_ReadFirstWordInvertedIndex(sw, word,&resultword,&wordID,indexf->DB); |
194 |
|
|
|
195 |
|
|
while(wordID) |
196 |
|
|
{ |
197 |
|
|
printf("%s\n",resultword); |
198 |
|
|
|
199 |
|
|
|
200 |
|
|
efree(resultword); |
201 |
|
|
DB_ReadNextWordInvertedIndex(sw, word,&resultword,&wordID,indexf->DB); |
202 |
|
|
|
203 |
|
|
} |
204 |
|
|
} |
205 |
|
|
DB_EndReadWords(sw, indexf->DB); |
206 |
|
|
} |
207 |
|
|
|
208 |
|
|
|
209 |
|
|
else if (DEBUG_MASK & (DEBUG_INDEX_ALL | DEBUG_INDEX_WORDS | DEBUG_INDEX_WORDS_FULL | DEBUG_INDEX_WORDS_META) ) |
210 |
|
|
{ |
211 |
|
|
int *meta_used; |
212 |
|
|
int end_meta = 0; |
213 |
|
|
|
214 |
|
|
printf("\n-----> WORD INFO in index %s <-----\n", indexf->line); |
215 |
|
|
|
216 |
|
|
for(i = 0; i < indexf->header.metaCounter; i++) |
217 |
|
|
if ( indexf->header.metaEntryArray[i]->metaID > end_meta ) |
218 |
|
|
end_meta = indexf->header.metaEntryArray[i]->metaID; |
219 |
|
|
|
220 |
|
|
meta_used = emalloc( sizeof(int) * ( end_meta + 1) ); |
221 |
|
|
|
222 |
|
|
/* _META only reports which tags the words are found in */ |
223 |
|
|
for(i = 0; i <= end_meta; i++) |
224 |
|
|
meta_used[i] = 0; |
225 |
|
|
|
226 |
|
|
|
227 |
|
|
DB_InitReadWords(sw, indexf->DB); |
228 |
|
|
|
229 |
|
|
for(j=1;j<256;j++) |
230 |
|
|
{ |
231 |
|
|
word[0] = (unsigned char) j; word[1] = '\0'; |
232 |
|
|
DB_ReadFirstWordInvertedIndex(sw, word,&resultword,&wordID,indexf->DB); |
233 |
|
|
|
234 |
|
|
while(wordID && (((int)((unsigned char)resultword[0]))== j)) |
235 |
|
|
{ |
236 |
|
|
printf("\n%s",resultword); |
237 |
|
|
|
238 |
|
|
/* Read Word's data */ |
239 |
|
|
DB_ReadWordData(sw, wordID, &worddata, &sz_worddata, indexf->DB); |
240 |
|
|
|
241 |
|
|
/* parse and print word's data */ |
242 |
|
|
s = worddata; |
243 |
|
|
|
244 |
|
|
tmpval = uncompress2(&s); /* tfrequency */ |
245 |
|
|
metaname = uncompress2(&s); /* metaname */ |
246 |
|
|
if (metaname) |
247 |
|
|
{ |
248 |
|
|
nextposmetaname = UNPACKLONG2(s); |
249 |
|
|
s += sizeof(long); |
250 |
|
|
} |
251 |
|
|
|
252 |
|
|
filenum = 0; |
253 |
|
|
while(1) |
254 |
|
|
{ /* Read on all items */ |
255 |
|
|
uncompress_location_values(&s,&flag,&tmpval,&frequency); |
256 |
|
|
filenum += tmpval; |
257 |
|
|
posdata = (int *) emalloc(frequency * sizeof(int)); |
258 |
|
|
uncompress_location_positions(&s,flag,frequency,posdata); |
259 |
|
|
|
260 |
|
|
|
261 |
|
|
// if (sw->verbose >= 4) |
262 |
|
|
if (DEBUG_MASK & (DEBUG_INDEX_ALL|DEBUG_INDEX_WORDS_FULL)) |
263 |
|
|
{ |
264 |
|
|
struct metaEntry *m; |
265 |
|
|
|
266 |
|
|
printf("\n Meta:%d", metaname); |
267 |
|
|
|
268 |
|
|
|
269 |
|
|
/* Get path from property list */ |
270 |
|
|
if ( (m = getPropNameByName( &sw->indexlist->header, AUTOPROPERTY_DOCPATH )) ) |
271 |
|
|
{ |
272 |
|
|
RESULT r; |
273 |
|
|
char *s; |
274 |
|
|
|
275 |
|
|
memset( &r, 0, sizeof( RESULT ) ); |
276 |
|
|
|
277 |
|
|
r.indexf = indexf; |
278 |
|
|
r.filenum = filenum; |
279 |
|
|
r.fi.filenum = filenum; |
280 |
|
|
|
281 |
|
|
s = getResultPropAsString( sw, &r, m->metaID); |
282 |
|
|
|
283 |
|
|
printf(" %s", s ); |
284 |
|
|
efree( s ); |
285 |
|
|
|
286 |
|
|
} |
287 |
|
|
else |
288 |
|
|
printf(" Failed to lookup meta entry"); |
289 |
|
|
|
290 |
|
|
|
291 |
|
|
printf(" Freq:%d", frequency); |
292 |
|
|
printf(" Pos/Struct:"); |
293 |
|
|
} |
294 |
|
|
else if ( DEBUG_MASK & DEBUG_INDEX_WORDS_META) |
295 |
|
|
meta_used[ metaname ]++; |
296 |
|
|
else |
297 |
|
|
{ |
298 |
|
|
printf(" [%d", metaname); |
299 |
|
|
printf(" %d", filenum); |
300 |
|
|
printf(" %d (", frequency); |
301 |
|
|
} |
302 |
|
|
|
303 |
|
|
|
304 |
|
|
for (i = 0; i < frequency; i++) |
305 |
|
|
{ |
306 |
|
|
if (DEBUG_MASK & (DEBUG_INDEX_ALL | DEBUG_INDEX_WORDS_FULL)) |
307 |
|
|
//if (sw->verbose >= 4) |
308 |
|
|
{ |
309 |
|
|
if (i) |
310 |
|
|
printf(",%d/%x", GET_POSITION(posdata[i]),GET_STRUCTURE(posdata[i])); |
311 |
|
|
else |
312 |
|
|
printf("%d/%x", GET_POSITION(posdata[i]), GET_STRUCTURE(posdata[i])); |
313 |
|
|
} |
314 |
|
|
else if ( DEBUG_MASK & DEBUG_INDEX_WORDS) |
315 |
|
|
{ |
316 |
|
|
if (i) |
317 |
|
|
printf(" %d/%x", GET_POSITION(posdata[i]),GET_STRUCTURE(posdata[i])); |
318 |
|
|
else |
319 |
|
|
printf("%d/%x", GET_POSITION(posdata[i]),GET_STRUCTURE(posdata[i])); |
320 |
|
|
} |
321 |
|
|
} |
322 |
|
|
|
323 |
|
|
efree(posdata); |
324 |
|
|
|
325 |
|
|
if ( DEBUG_MASK & DEBUG_INDEX_WORDS ) |
326 |
|
|
printf(")]"); |
327 |
|
|
|
328 |
|
|
if ((s - worddata) == sz_worddata) |
329 |
|
|
break; /* End of worddata */ |
330 |
|
|
|
331 |
|
|
if ((unsigned long)(s - worddata) == nextposmetaname) |
332 |
|
|
{ |
333 |
|
|
filenum = 0; |
334 |
|
|
metaname = uncompress2(&s); |
335 |
|
|
if (metaname) |
336 |
|
|
{ |
337 |
|
|
nextposmetaname = UNPACKLONG2(s); |
338 |
|
|
s += sizeof(long); |
339 |
|
|
} |
340 |
|
|
else |
341 |
|
|
nextposmetaname = 0L; |
342 |
|
|
} |
343 |
|
|
} |
344 |
|
|
|
345 |
|
|
if ( DEBUG_MASK & DEBUG_INDEX_WORDS_META) |
346 |
|
|
{ |
347 |
|
|
for(i = 0; i <= end_meta; i++) |
348 |
|
|
{ |
349 |
|
|
if ( meta_used[i] ) |
350 |
|
|
printf( "\t%d", i ); |
351 |
|
|
meta_used[i] = 0; |
352 |
|
|
} |
353 |
|
|
} |
354 |
|
|
|
355 |
|
|
|
356 |
|
|
if ( !( DEBUG_MASK & DEBUG_INDEX_WORDS_META )) |
357 |
|
|
printf("\n"); |
358 |
|
|
|
359 |
|
|
efree(worddata); |
360 |
|
|
efree(resultword); |
361 |
|
|
DB_ReadNextWordInvertedIndex(sw, word,&resultword,&wordID,indexf->DB); |
362 |
|
|
} |
363 |
|
|
} |
364 |
|
|
DB_EndReadWords(sw, indexf->DB); |
365 |
|
|
|
366 |
|
|
efree( meta_used ); |
367 |
|
|
} |
368 |
|
|
|
369 |
|
|
|
370 |
|
|
|
371 |
|
|
/* Decode Stop Words: All them are in just one line */ |
372 |
|
|
if (DEBUG_MASK & (DEBUG_INDEX_ALL | DEBUG_INDEX_STOPWORDS) ) |
373 |
|
|
{ |
374 |
|
|
printf("\n\n-----> STOP WORDS in %s <-----\n" , indexf->line); |
375 |
|
|
for(i=0;i<indexf->header.stopPos;i++) |
376 |
|
|
printf("%s ",indexf->header.stopList[i]); |
377 |
|
|
printf("\n"); |
378 |
|
|
} |
379 |
|
|
|
380 |
|
|
|
381 |
|
|
|
382 |
|
|
/* Decode File Info */ |
383 |
|
|
if (DEBUG_MASK & (DEBUG_INDEX_ALL | DEBUG_INDEX_FILES) ) |
384 |
|
|
dump_index_file_list( sw, indexf ); |
385 |
|
|
|
386 |
|
|
|
387 |
|
|
DB_Close(sw, indexf->DB); |
388 |
|
|
|
389 |
|
|
} |
390 |
|
|
|
391 |
|
|
|
392 |
|
|
/*
** Checks whether the index holds a sorted (presorted) table for the
** property described by "m".
**
** The table is read with DB_ReadSortedIndex() and released again right
** away; only its size is of interest.
**
** Returns the size reported for the table: non-zero when a table was
** read, 0 when there is none.
*/
int check_sorted_index( SWISH *sw, IndexFILE *indexf, struct metaEntry *m )
{
    unsigned char *sorted_table;
    int            sorted_size;

    DB_InitReadSortedIndex( sw, indexf->DB );

    /* Get the sorted index of the property */
    DB_ReadSortedIndex( sw, m->metaID, &sorted_table, &sorted_size, indexf->DB );

    /* Nothing was read, so there is nothing to release */
    if ( sorted_size == 0 )
        return 0;

    efree( sorted_table );
    return sorted_size;
}
408 |
|
|
|
409 |
|
|
|
410 |
|
|
/*
** Prints one line for every entry in the header's metaname table:
** name, id and type, followed by the attributes that apply
** (META_INDEX with its rank bias, META_INTERNAL, META_PROP with the
** property kind) and, for aliases, the name and id of the entry the
** alias refers to.
**
**   check_presorted - when non-zero, each entry is also checked with
**                     check_sorted_index() and flagged "*presorted*"
**                     if a sorted table is found for it
*/
void dump_metanames( SWISH *sw, IndexFILE *indexf, int check_presorted )
{
    INDEXDATAHEADER *hdr = &indexf->header;
    int              n;

    printf("\n\n-----> METANAMES for %s <-----\n", indexf->line );

    for ( n = 0; n < hdr->metaCounter; n++ )
    {
        struct metaEntry *entry = hdr->metaEntryArray[n];

        printf("%20s : id=%2d type=%2d ", entry->metaName, entry->metaID, entry->metaType);

        if ( is_meta_index( entry ) )
            printf(" META_INDEX Rank Bias=%3d", entry->rank_bias );

        if ( is_meta_internal( entry ) )
            printf(" META_INTERNAL");

        if ( is_meta_property( entry ) )
        {
            printf(" META_PROP:");

            if ( is_meta_string( entry ) )
            {
                const char *case_mode = is_meta_ignore_case( entry ) ? "ignore" : "compare";

                printf("STRING(case:%s)", case_mode);
            }
            else if ( is_meta_date( entry ) )
                printf("DATE");
            else if ( is_meta_number( entry ) )
                printf("NUMBER");
            else
                printf("unknown!");
        }

        /* the sorted-index check is only made when the caller asked for it */
        if ( check_presorted )
        {
            if ( check_sorted_index( sw, indexf, entry ) )
                printf(" *presorted*");
        }

        if ( entry->alias )
        {
            struct metaEntry *target;

            /* an alias is resolved among metanames or properties, depending on what the entry is */
            if ( is_meta_index( entry ) )
                target = getMetaNameByID( &indexf->header, entry->alias );
            else
                target = getPropNameByID( &indexf->header, entry->alias );

            /* NOTE(review): the lookup result is used without a NULL check -- confirm it cannot fail here */
            printf(" [Alias for %s (%d)]", target->metaName, target->metaID );
        }

        printf("\n");
    }

    printf("\n");
}
468 |
|
|
|
469 |
|
|
|