/*
** This program and library is free software; you can redistribute it and/or
** modify it under the terms of the GNU General Public License
** as published by the Free Software Foundation; either version 2
** of the License, or any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU (Library) General Public License for more details.
**
** You should have received a copy of the GNU (Library) General Public License
** along with this program; if not, write to the Free Software
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
**-----------------------------------------------------------------
**
** rewritten from scratch - moseley Oct 17, 2001
**
*/

#include <assert.h>   /* for bug hunting */
#include "swish.h"
#include "mem.h"
#include "string.h"
#include "merge.h"
#include "error.h"
#include "search.h"
#include "index.h"
#include "hash.h"
#include "file.h"
#include "docprop.h"
#include "list.h"
#include "compress.h"
#include "metanames.h"
#include "db.h"
#include "dump.h"
#include "result_sort.h"
#include "swish_qsort.h"
#include "result_output.h"
#include "parse_conffile.h"

static void dup_header( SWISH *sw_input, SWISH *sw_output );
static void check_header_match( IndexFILE *in_index, SWISH *sw_output );
static void make_meta_map( IndexFILE *in_index, SWISH *sw_output);
static void load_filename_sort( SWISH *sw, IndexFILE *cur_index );
static IndexFILE *get_next_file_in_order( SWISH *sw_input );
static void add_file( FILE *filenum_map, IndexFILE *cur_index, SWISH *sw_input, SWISH *sw_output );
static int *get_map( FILE *filenum_map, IndexFILE *cur_index );
static void dump_index_words(SWISH * sw, IndexFILE * indexf, SWISH *sw_output );
static void write_word_pos( SWISH *sw_input, IndexFILE *indexf, SWISH *sw_output, int *file_num_map, int filenum, ENTRY *e, int metaID, int posdata );


// #define DEBUG_MERGE

/****************************************************************************
*  merge_indexes -- reads the input indexes and writes a new merged index
*
*
*****************************************************************************/

void merge_indexes( SWISH *sw_input, SWISH *sw_output )
{
    IndexFILE  *cur_index;
    FILE       *filenum_map;
    char       *tmpfilename;
    struct MOD_Index *idx_output = sw_output->Index;
    ENTRY      *e;
    int         hash,
                sz_worddata,
                tmpval,
                filenum,
                metaID = 0,
                frequency,
                loc_count = 0,
                word_count = 0;
    long        wordID;
    unsigned long  nextposmetaID = 0L;
    unsigned char *worddata;
    unsigned char *s;
    unsigned char  flag;
    int         local_posdata[MAX_STACK_POSITIONS];
    int        *posdata;
    int         i;

    /*******************************************************************************
    *  Get ready to merge the indexes.  For each index:
    *    - check that it has the correct headers
    *    - create meta entries in the output index, and create a map to convert metas
    *    - load an array of file numbers sorted by filename so we can merge-sort the filenames
    *    - set some initial defaults.
    *********************************************************************************/

    cur_index = sw_input->indexlist;
    while( cur_index )
    {
        printf("Input index '%s' has %d files and %d words\n", cur_index->line, cur_index->header.totalfiles, cur_index->header.totalwords);

        if ( cur_index == sw_input->indexlist )
            /* Duplicate the first index's header into the output index */
            dup_header( sw_input, sw_output );
        else
            check_header_match( cur_index, sw_output );  // errors if headers don't match - no need to check the first one since it was the one that was dupped


        make_meta_map( cur_index, sw_output);            // add metas to the new index, and create a map

        load_filename_sort( sw_input, cur_index );       // so we can read in filename order

        cur_index->current_file = 0;
        cur_index->cur_prop = NULL;

#ifdef DEBUG_MERGE
        dump_metanames( sw_input, cur_index, 1 );
        dump_metanames( sw_output, sw_output->indexlist, 0 );
#endif

        cur_index = cur_index->next;
    }


#ifdef DEBUG_MERGE
    printf("----- Output Header ----------\n");
    resultPrintHeader(sw_output, 0, &sw_output->indexlist->header, sw_output->indexlist->line, 0);
#endif



    /****************************************************************************
    *  Now, read in filename order (so we can throw out duplicates)
    *   - read properties and write them out to the new index
    *   - write a temporary file of records to identify:
    *       - indexfile
    *       - old filenum to new filenum mapping
    *       - total words per file, if set
    ****************************************************************************/

    /* place to store the file number map and total words per file */
    filenum_map = create_tempfile(sw_input, F_WRITE_BINARY, "fnum", &tmpfilename, 0 );

    while( (cur_index = get_next_file_in_order( sw_input )) )
        add_file( filenum_map, cur_index, sw_input, sw_output );



    /* Don't need the pre-sorted indexes any more */
    for ( cur_index = sw_input->indexlist; cur_index; cur_index = cur_index->next )
    {
        efree( cur_index->path_order );
        cur_index->path_order = NULL;
    }

    fclose( filenum_map );

    if ( !(filenum_map = fopen( tmpfilename, F_READ_BINARY )) )
        progerrno("failed to reopen '%s' :", tmpfilename );



    /****************************************************************************
    *  Finally, read the indexes one-by-one to read word and position data
    *   - reads through the temp file for each index to build a file number map
    *
    ****************************************************************************/

    /* 08/2002 jmruiz
    ** First of all, get all the words
    */
    cur_index = sw_input->indexlist;
    while( cur_index )
    {
        dump_index_words(sw_input, cur_index, sw_output);
        /* Get the file_num_map for later processing */
        cur_index->merge_file_num_map = get_map( filenum_map, cur_index );
        cur_index = cur_index->next;
    }

    /* At this point we have all the words. Now we have to get the worddata
    ** and merge it
    */
    word_count = 0;
    printf("Processing words in index '%s': %6d words\r", sw_output->indexlist->line, word_count);
    fflush(stdout);
    /* walk the hash list to merge worddata */
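    /* dump_index_words() above has already added every word from every input
    ** index to the output index's in-memory hash, so walking the output hash
    ** visits the union of all words exactly once.  For each word we look it up
    ** in each input index and merge its location data into the output entry.
    */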
    for (hash = 0; hash < VERYBIGHASHSIZE; hash++)
    {
        if (idx_output->hashentriesdirty[hash])
        {
            idx_output->hashentriesdirty[hash] = 0;
            for (e = idx_output->hashentries[hash]; e; e = e->next)
            {
                word_count++;
                /* Look the word up in each input index and get its worddata */
                cur_index = sw_input->indexlist;
                while( cur_index )
                {
                    DB_ReadWordHash(sw_input, e->word, &wordID, cur_index->DB);
                    /* If the word exists in this index */
                    if(wordID)
                    {

                        DB_ReadWordData(sw_input, wordID, &worddata, &sz_worddata, cur_index->DB);
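                        /* worddata layout, as decoded below:
                        **   <tfrequency><metaID>[<offset of next metaID chunk (packed long)>]
                        **   followed, per file, by (flag, filenum delta, frequency) and then
                        **   'frequency' compressed positions.  A metaID chunk ends when the
                        **   parse offset reaches the stored offset; the buffer ends at
                        **   sz_worddata.
                        */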

                        /* Now, parse the word's data */
                        s = worddata;
                        tmpval = uncompress2(&s);     /* tfrequency */
                        metaID = uncompress2(&s);     /* metaID */

                        if (metaID)
                        {
                            nextposmetaID = UNPACKLONG2(s);
                            s += sizeof(long);
                        }

                        filenum = 0;

                        while(1)
                        {   /* Read on all items */
                            uncompress_location_values(&s,&flag,&tmpval,&frequency);
                            filenum += tmpval;
                            /* Use the stack array when possible to avoid malloc/free overhead */
                            if(frequency > MAX_STACK_POSITIONS)
                                posdata = (int *) emalloc(frequency * sizeof(int));
                            else
                                posdata = local_posdata;

                            /* Read the positions */
                            uncompress_location_positions(&s,flag,frequency,posdata);


                            /* now we have the word data */
                            for (i = 0; i < frequency; i++, loc_count++)
                                write_word_pos( sw_input, cur_index, sw_output, cur_index->merge_file_num_map, filenum, e, metaID, posdata[i]);

                            if(e->tfrequency)
                            {
                                /* 08/2002 jmruiz - Call CompressCurrentLocEntry from time
                                ** to time to help addentry.
                                ** If we did not do this, the addentry routine would have to walk
                                ** linked lists of positions with thousands of elements, which
                                ** makes the merge process very slow.
                                */
                                if(!(loc_count % 100))
                                    CompressCurrentLocEntry(sw_output, sw_output->indexlist, e);
                            }


                            if(posdata != local_posdata)
                                efree(posdata);

                            if ((s - worddata) == sz_worddata)
                                break;   /* End of worddata */

                            if ((unsigned long)(s - worddata) == nextposmetaID)
                            {
                                filenum = 0;
                                metaID = uncompress2(&s);
                                if (metaID)
                                {
                                    nextposmetaID = UNPACKLONG2(s);
                                    s += sizeof(long);
                                }
                                else
                                    nextposmetaID = 0L;
                            }
                        }

                        if(e->tfrequency)
                            CompressCurrentLocEntry(sw_output, sw_output->indexlist, e);

                        efree(worddata);
                    }
                    cur_index = cur_index->next;
                }
                /* Coalesce locations for each word to save memory.
                ** This makes use of the -e feature.
                ** Because we are processing one word at a time we can
                ** coalesce its data just once.
                */
                coalesce_word_locations(sw_output,sw_output->indexlist,e);

                if(!(word_count % 1000))
                {
                    /* Make the zone available for reuse and save memory */
                    Mem_ZoneReset(sw_output->Index->currentChunkLocZone);
                    sw_output->Index->freeLocMemChain = NULL;
                    printf("Processing words in index '%s': %6d words\r", sw_output->indexlist->line, word_count);
                }
            }
        }
    }

    printf("Processing words in index '%s': %6d words\n", sw_output->indexlist->line, word_count);
    fflush(stdout);

    cur_index = sw_input->indexlist;
    while( cur_index )
    {
        /* free the maps */
        efree( cur_index->merge_file_num_map );
        efree( cur_index->meta_map );
        cur_index->meta_map = NULL;
        cur_index = cur_index->next;
    }


#ifdef DEBUG_MERGE
    printf("----- Final Output Header ----------\n");
    resultPrintHeader(sw_output, 0, &sw_output->indexlist->header, sw_output->indexlist->line, 0);
#endif

    fclose( filenum_map );   /* done reading the temp file */
    remove( tmpfilename );
    efree( tmpfilename );
}

/****************************************************************************
*  dup_header -- duplicates a header
*
*  rereads the header from the database, and clears out some values
*
*****************************************************************************/

static void dup_header( SWISH *sw_input, SWISH *sw_output )
{
    INDEXDATAHEADER *out_header = &sw_output->indexlist->header;

    // probably need to free the sw_output header from what's created in swishnew.

    /* Read in the header from the first merge file and store it in the output file */
    read_header(sw_input, out_header, sw_input->indexlist->DB);

    out_header->totalfiles = 0;
    out_header->totalwords = 0;

    freeMetaEntries( out_header );

    if ( out_header->indexedon )
    {
        efree( out_header->indexedon );
        out_header->indexedon = NULL;
        out_header->lenindexedon = 0;
    }
}

/****************************************************************************
*  check_header_match -- makes sure that the important settings match
*
*
*****************************************************************************/

// This assumes that the size will always precede the content.
typedef struct
{
    int     len;
    char   *str;
} *HEAD_CMP;
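
/* compare_header() below relies on the header storing each of these character
** lists as an int length immediately followed by a string pointer; casting the
** address of the length field to HEAD_CMP lets one routine compare any pair.
*/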

static void compare_header( char *index, char *name, void *in, void *out )
{
    HEAD_CMP in_item  = (HEAD_CMP)in;
    HEAD_CMP out_item = (HEAD_CMP)out;

    if ( in_item->len != out_item->len )
        progerr("Header %s in index %s doesn't match the output header in length", name, index );

    if ( strcmp( (const char *)in_item->str, (const char *)out_item->str ))
        progerr("Header %s in index %s doesn't match output header", name, index );

    //if ( memcmp( (const void *)in_item->str, (const void *)out_item->str, in_item->len ) )
    //    progerr("Header %s in index %s doesn't match output header", name, index );

}


static void check_header_match( IndexFILE *in_index, SWISH *sw_output )
{
    INDEXDATAHEADER *out_header = &sw_output->indexlist->header;
    INDEXDATAHEADER *in_header  = &in_index->header;

    compare_header( in_index->line, "WordCharacters", &in_header->lenwordchars, &out_header->lenwordchars );
    compare_header( in_index->line, "BeginCharacters", &in_header->lenbeginchars, &out_header->lenbeginchars );
    compare_header( in_index->line, "EndCharacters", &in_header->lenendchars, &out_header->lenendchars );

    compare_header( in_index->line, "IgnoreLastChar", &in_header->lenignorelastchar, &out_header->lenignorelastchar );
    compare_header( in_index->line, "IgnoreFirstChar", &in_header->lenignorefirstchar, &out_header->lenignorefirstchar );

    compare_header( in_index->line, "BumpPositionChars", &in_header->lenbumpposchars, &out_header->lenbumpposchars );


    if ( in_header->fuzzy_mode != out_header->fuzzy_mode )
        progerr("FuzzyIndexingMode in index %s of '%s' doesn't match '%s'",
            in_index->line,
            fuzzy_mode_to_string( in_header->fuzzy_mode ),
            fuzzy_mode_to_string( out_header->fuzzy_mode ) );


    if ( in_header->ignoreTotalWordCountWhenRanking != out_header->ignoreTotalWordCountWhenRanking )
        progerr("ignoreTotalWordCountWhenRanking rules don't match for index %s", in_index->line );

    if ( memcmp( &in_header->translatecharslookuptable, &out_header->translatecharslookuptable, sizeof(in_header->translatecharslookuptable) ) )
        progerr("TranslateChars header doesn't match for index %s", in_index->line );


    //??? need to compare stopword lists

    //??? need to compare buzzwords

}

/****************************************************************************
*  make_meta_map - adds metanames to output index and creates map
*
*
*****************************************************************************/

static void make_meta_map( IndexFILE *in_index, SWISH *sw_output)
{
    INDEXDATAHEADER *out_header = &sw_output->indexlist->header;
    INDEXDATAHEADER *in_header  = &in_index->header;
    int i;
    struct metaEntry *in_meta;
    struct metaEntry *out_meta;
    int *meta_map;


    meta_map = emalloc( sizeof( int ) * (in_header->metaCounter + 1) );
    memset( meta_map, 0, sizeof( int ) * (in_header->metaCounter + 1) );
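
    /* meta_map is indexed by the input index's metaID and holds the matching
    ** metaID in the output index; a value of zero means "not mapped". */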

    for( i = 0; i < in_header->metaCounter; i++ )
    {
        in_meta = in_header->metaEntryArray[i];


        /* Try to see if it's an existing metaname */
        out_meta = is_meta_index( in_meta )
                   ? getMetaNameByNameNoAlias( out_header, in_meta->metaName )
                   : getPropNameByNameNoAlias( out_header, in_meta->metaName );

        /* if it's not found, then add it */
        if ( !out_meta )
            out_meta = addMetaEntry(out_header, in_meta->metaName, in_meta->metaType, 0);

        else if (out_meta->metaType != in_meta->metaType )
            progerr("meta name %s in index %s is a different type than in output index", in_meta->metaName, in_index->line );


        /* Now, save the mapping */
        meta_map[ in_meta->metaID ] = out_meta->metaID;


        /* now here's a pain, and lots of room for screw ups. */
        /* Basically, check for alias mappings, and that they are correct. */
        /* You can say title is an alias for swishtitle in one index, and then say */
        /* title is an alias for doctitle in another index. */

        /* If it's an alias, then make that mapping, too */
        if ( in_meta->alias )
        {
            struct metaEntry *in_alias;
            struct metaEntry *out_alias;

            /* Grab the alias meta entry so we can look it up in the out_header */

            in_alias = is_meta_index( in_meta )
                       ? getMetaNameByID( in_header, in_meta->alias )
                       : getPropNameByID( in_header, in_meta->alias );

            if ( !in_alias )
                progerr("Failed to lookup alias for %s in index %s", in_meta->metaName, in_index->line );


            /* now look up the alias in the out_header by name */
            out_alias = is_meta_index( in_alias )
                        ? getMetaNameByNameNoAlias( out_header, in_alias->metaName )
                        : getPropNameByNameNoAlias( out_header, in_alias->metaName );


            /* should be there, since it would have been added earlier - the real metas must be added before the aliases */
            if ( !out_alias )
                progerr("Failed to lookup alias for %s in output index", out_meta->metaName );


            /* If this is new (i.e. doesn't already point to an alias root), then just assign it */
            if ( !out_meta->alias )
                out_meta->alias = out_alias->metaID;

            /* else, if it is already an alias, but points someplace else, we have a problem */
            else if ( out_meta->alias != out_alias->metaID )
                progerr("In index %s metaname '%s' is an alias for '%s'(%d). But another index already mapped '%s' to ID# '%d'", in_index->line, in_meta->metaName, in_alias->metaName, in_alias->metaID, out_meta->metaName, out_meta->alias );
        }
    }

    in_index->meta_map = meta_map;


#ifdef DEBUG_MERGE
    printf(" %s -> %s ** Meta Map **\n", in_index->line, sw_output->indexlist->line );
    for ( i = 0; i < in_header->metaCounter + 1; i++)
        printf("%4d -> %3d\n", i, meta_map[i] );
#endif

}

/****************************************************************************
*  load_filename_sort - creates an array for reading in filename order
*
*
*****************************************************************************/
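/* sorted_data is file-scope because the qsort-style comparison callback
** (compnums, below) receives no user-data pointer; load_filename_sort points
** it at the current index's sorted path property array before sorting. */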

static int *sorted_data;

static int compnums(const void *s1, const void *s2)
{
    int a = *(int *)s1;    // filenumber passed from qsort
    int b = *(int *)s2;
    int v1 = sorted_data[ a-1 ];
    int v2 = sorted_data[ b-1 ];

    // return v1 <=> v2;

    if ( v1 < v2 )
        return -1;
    if ( v1 > v2 )
        return 1;

    return 0;
}


static void load_filename_sort( SWISH *sw, IndexFILE *cur_index )
{
    struct metaEntry *path_meta = getPropNameByName( &cur_index->header, AUTOPROPERTY_DOCPATH );
    int  i;
    int *sort_array;
    int  totalfiles = cur_index->header.totalfiles;

    if ( !path_meta )
        progerr("Can't merge index %s. It doesn't contain the property %s", cur_index->line, AUTOPROPERTY_DOCPATH );


    /* Save for looking up the pathname when sorting */
    cur_index->path_meta = path_meta;

    /* Case is important on most OSes when comparing file names */
    cur_index->path_meta->metaType &= ~META_IGNORE_CASE;



    cur_index->modified_meta = getPropNameByName( &cur_index->header, AUTOPROPERTY_LASTMODIFIED );


    if ( !LoadSortedProps( sw, cur_index, path_meta ) )
    {
        FileRec fi;
        memset( &fi, 0, sizeof( FileRec ));
        path_meta->sorted_data = CreatePropSortArray( sw, cur_index, path_meta, &fi, 1 );
    }


    /* So the qsort compare function can read it */
    sorted_data = path_meta->sorted_data;


    if ( !sorted_data )
        progerr("failed to load or create sorted properties for index %s", cur_index->line );


    sort_array = emalloc( totalfiles * sizeof( int ) );
    memset( sort_array, 0, totalfiles * sizeof( int ) );


    /* build an array of file numbers and sort it into filename order */
    for ( i = 0; i < totalfiles; i++ )
        sort_array[i] = i+1;    // file numbers start at one


    swish_qsort( sort_array, totalfiles, sizeof( int ), &compnums);
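
    /* sort_array[i] now holds the file number of the (i+1)-th file in filename
    ** order; it becomes this index's path_order lookup table. */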

    cur_index->path_order = sort_array;

    efree( path_meta->sorted_data );
    path_meta->sorted_data = NULL;
}

/****************************************************************************
*  get_next_file_in_order -- grabs the next file entry from all the indexes
*  in filename (and then modified date) order
*
*
*****************************************************************************/

/* This isn't really accurate, as some other file may come along and replace the newer one */

static void print_file_removed(IndexFILE *older, propEntry *op, IndexFILE *newer, propEntry *np )
{

    char *p1, *d1, *p2, *d2;
    p1 = DecodeDocProperty( older->path_meta, older->cur_prop );
    d1 = DecodeDocProperty( older->modified_meta, op );

    p2 = DecodeDocProperty( newer->path_meta, newer->cur_prop );
    d2 = DecodeDocProperty( newer->modified_meta, np );

    printf("Replaced file '%s %s' with '%s %s'\n", p1, d1, p2, d2);
}


static IndexFILE *get_next_file_in_order( SWISH *sw_input )
{
    IndexFILE *winner = NULL;
    IndexFILE *cur_index = sw_input->indexlist;
    FileRec    fi;
    int        ret;
    propEntry *wp, *cp;

    memset(&fi, 0, sizeof( FileRec ));
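
    /* Scan each index's next unread file (files are visited in the pre-sorted
    ** path_order).  The index whose current file has the smallest path wins;
    ** when two indexes hold the same path, the newer file (by last-modified
    ** date) wins and the older entry is skipped. */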

    for ( cur_index = sw_input->indexlist; cur_index; cur_index = cur_index->next )
    {
        /* don't use cached props, as they belong to a different index! */
        if ( fi.prop_index )
            efree( fi.prop_index );
        memset(&fi, 0, sizeof( FileRec ));

        /* still some to read in this index? */
        if ( cur_index->current_file >= cur_index->header.totalfiles )
            continue;



        /* get the file number from the lookup table */
        fi.filenum = cur_index->path_order[cur_index->current_file];

        if ( !cur_index->cur_prop )
            cur_index->cur_prop = ReadSingleDocPropertiesFromDisk(sw_input, cur_index, &fi, cur_index->path_meta->metaID, 0 );


        if ( !winner )
        {
            winner = cur_index;
            continue;
        }

        ret = Compare_Properties( cur_index->path_meta, cur_index->cur_prop, winner->cur_prop );

        if ( ret != 0 )
        {
            if ( ret < 0 )  /* take cur_index if it's smaller */
                winner = cur_index;

            continue;
        }



        /* if they have the same name, then take the newest, and skip past the older one */


        /* read the modified time for the current file */
        /* Use the same fi record, because it has the cached prop seek locations */
        cp = ReadSingleDocPropertiesFromDisk(sw_input, cur_index, &fi, cur_index->modified_meta->metaID, 0 );


        /* read the modified time for the current winner */
        if ( fi.prop_index )
            efree( fi.prop_index );
        memset(&fi, 0, sizeof( FileRec ));

        fi.filenum = winner->path_order[winner->current_file];
        wp = ReadSingleDocPropertiesFromDisk(sw_input, winner, &fi, cur_index->modified_meta->metaID, 0 );

        ret = Compare_Properties( cur_index->modified_meta, cp, wp );



        /* If current is greater (newer) then throw away winner */
        if ( ret > 0 )
        {
            print_file_removed( winner, wp, cur_index, cp);
            winner->current_file++;
            if ( winner->cur_prop )
                efree( winner->cur_prop );
            winner->cur_prop = NULL;
            winner = cur_index;
        }
        /* else, keep winner, and throw away current */
        else
        {
            print_file_removed(cur_index, cp, winner, wp );
            cur_index->current_file++;
            if ( cur_index->cur_prop )
                efree( cur_index->cur_prop );

            cur_index->cur_prop = NULL;
        }

        freeProperty( cp );
        freeProperty( wp );

    }

    if ( fi.prop_index )
        efree( fi.prop_index );


    if ( !winner )
        return NULL;


    winner->filenum = winner->path_order[winner->current_file++];

#ifdef DEBUG_MERGE
    printf(" Files in order: index %s file# %d winner\n", winner->line, winner->filenum );
#endif

    /* free prop, as it's not needed anymore */
    if ( winner->cur_prop )
        efree( winner->cur_prop );
    winner->cur_prop = NULL;


    return winner;
}


/****************************************************************************
*  add_file
*
*  Now, read in filename order (so we can throw out duplicates)
*   - read properties and write them out to the new index
*   - write a temporary file of records to identify:
*       - indexfile
*       - old filenum to new filenum mapping
*       - total words per file, if set
****************************************************************************/

static void add_file( FILE *filenum_map, IndexFILE *cur_index, SWISH *sw_input, SWISH *sw_output )
{
    FileRec     fi;
    IndexFILE  *indexf = sw_output->indexlist;
    struct MOD_Index *idx = sw_output->Index;
    docProperties *d;
    int         i;
    propEntry  *tmp;
    docProperties *docProperties = NULL;
    struct metaEntry meta_entry;


    meta_entry.metaName = "(default)";   /* for error messages, I think */


    memset( &fi, 0, sizeof( FileRec ));


#ifdef DEBUG_MERGE
    printf("Reading Properties from input index '%s' file %d\n", cur_index->line, cur_index->filenum);
#endif

    /* read the properties and map them as needed */
    d = ReadAllDocPropertiesFromDisk( sw_input, cur_index, cur_index->filenum );


#ifdef DEBUG_MERGE
    fi.docProperties = d;
    dump_file_properties( cur_index, &fi );
#endif



    /* all these off-by-one adjustments are a mess */

    /* read through all the property slots, and map them as needed */
    for ( i = 0; i < d->n; i++ )
        if ( (tmp = d->propEntry[i]) )
        {
            meta_entry.metaID = cur_index->meta_map[ i ];
            addDocProperty(&docProperties, &meta_entry, tmp->propValue, tmp->propLen, 1 );
        }

#ifdef DEBUG_MERGE
    printf(" after mapping file %s\n", indexf->line);
    fi.docProperties = docProperties;
    dump_file_properties( cur_index, &fi );
    printf("\n");
#endif


    /* Now bump the file counter */
    idx->filenum++;
    indexf->header.totalfiles++;

    if ( docProperties )   /* always true */
    {
        fi.filenum = idx->filenum;
        fi.docProperties = docProperties;

        WritePropertiesToDisk( sw_output , &fi );

        freeDocProperties( d );
    }




    /* now write out the data used to build the file number map for a given index */
    // compress1( cur_index->filenum, filenum_map, fputc );   // what file number this came from
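
    /* Each record written below is a pair: the file number within the source
    ** index followed by the pointer identifying that source index.  Records
    ** are written in merged (output) file order, which get_map() relies on
    ** when rebuilding the old -> new file number map. */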

    fwrite( &cur_index->filenum, sizeof(int), 1, filenum_map);
    fwrite( &cur_index, sizeof(IndexFILE *), 1, filenum_map);   // what index


    /* Save total words per file */
    if ( !indexf->header.ignoreTotalWordCountWhenRanking )
    {
        INDEXDATAHEADER *header = &indexf->header;
        int idx1 = fi.filenum - 1;

        if ( !header->TotalWordsPerFile || idx1 >= header->TotalWordsPerFileMax )
        {
            header->TotalWordsPerFileMax += 20000;   /* random guess -- could be a config setting */
            header->TotalWordsPerFile = erealloc( header->TotalWordsPerFile, header->TotalWordsPerFileMax * sizeof(int) );
        }

        header->TotalWordsPerFile[idx1] = cur_index->header.TotalWordsPerFile[cur_index->filenum-1];
    }
}

/****************************************************************************
*  Builds an old_filenum -> new_filenum map
*
*  This makes it so you can look up an old file number and map it to a new file number
*
****************************************************************************/

static int *get_map( FILE *filenum_map, IndexFILE *cur_index )
{
    int *array = emalloc( (cur_index->header.totalfiles+1) * sizeof( int ) );
    IndexFILE *idf;
    int filenum;
    int new_filenum = 0;



    memset( array, 0, (cur_index->header.totalfiles+1) * sizeof( int ) );


    clearerr( filenum_map );
    fseek( filenum_map, 0, SEEK_SET );   /* start at the beginning */
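
    /* Re-read the (old filenum, index pointer) records written by add_file().
    ** The n-th record corresponds to new file number n, so for records that
    ** belong to cur_index we map old filenum -> record position. */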

    while ( 1 )
    {
        new_filenum++;

        if (!fread( &filenum, sizeof(int), 1, filenum_map))
            break;


        if (!fread( &idf, sizeof(IndexFILE *), 1, filenum_map))
            break;

        if ( idf == cur_index )
            array[filenum] = new_filenum;

    }

    return array;
}

/****************************************************************************
*  Reads the index to get all the words
****************************************************************************/

static void dump_index_words(SWISH * sw, IndexFILE * indexf, SWISH *sw_output)
{
    int     j;
    int     word_count = 0;
    char    word[2];
    char   *resultword;
    long    wordID;

    DB_InitReadWords(sw, indexf->DB);
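
    /* Words are enumerated one leading byte at a time (0..255) with
    ** DB_ReadFirstWordInvertedIndex()/DB_ReadNextWordInvertedIndex(); each word
    ** found is added to the output index's in-memory hash via getentry(). */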


    printf("Getting words in index '%s': %3d words\r", indexf->line, word_count);
    fflush(stdout);

    for(j = 0; j < 256; j++)
    {

        word[0] = (unsigned char) j; word[1] = '\0';
        DB_ReadFirstWordInvertedIndex(sw, word,&resultword,&wordID,indexf->DB);

        while(wordID)
        {
            /* Add resultword to the output index */
            getentry(sw_output, resultword);
            efree(resultword);
            DB_ReadNextWordInvertedIndex(sw, word,&resultword,&wordID,indexf->DB);
            word_count++;
            if(!(word_count % 10000))
                printf("Getting words in index '%s': %3d words\r", indexf->line, word_count);
        }
    }
    printf("Getting words in index '%s': %6d words\n", indexf->line, word_count);

    DB_EndReadWords(sw, indexf->DB);

}

/****************************************************************************
*  Writes a word position out to the output index
*
*
*****************************************************************************/

static void write_word_pos( SWISH *sw_input, IndexFILE *indexf, SWISH *sw_output, int *file_num_map, int filenum, ENTRY *e, int metaID, int posdata )
{
    int new_file;
    int new_meta;
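
    /* posdata packs the structure flags and the word position into a single
    ** int; GET_STRUCTURE() and GET_POSITION() unpack them below. */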

#ifdef DEBUG_MERGE
    printf("\nindex %s '%s' Struct: %d Pos: %d",
            indexf->line, e->word, GET_STRUCTURE(posdata), GET_POSITION(posdata) );


    if ( !(new_file = file_num_map[ filenum ]) )
    {
        printf(" file: %d **File deleted!**\n", filenum);
        return;
    }

    if ( !(new_meta = indexf->meta_map[ metaID ] ))
    {
        printf(" file: %d **Failed to map meta ID **\n", filenum);
        return;
    }

    printf(" File: %d -> %d Meta: %d -> %d\n", filenum, new_file, metaID, new_meta );

    addentry( sw_output, e, new_file, GET_STRUCTURE(posdata), new_meta, GET_POSITION(posdata) );

    return;


#else


    if ( !(new_file = file_num_map[ filenum ]) )
        return;

    if ( !(new_meta = indexf->meta_map[ metaID ] ))
        return;

    addentry( sw_output, e, new_file, GET_STRUCTURE(posdata), new_meta, GET_POSITION(posdata) );

    return;

#endif


}