/[MITgcm]/mitgcm.org/devel/buildweb/pkg/swish-e/src/merge.c
ViewVC logotype

Contents of /mitgcm.org/devel/buildweb/pkg/swish-e/src/merge.c

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.1.1.1 - (show annotations) (download) (vendor branch)
Fri Sep 20 19:47:29 2002 UTC (22 years, 10 months ago) by adcroft
Branch: Import, MAIN
CVS Tags: baseline, HEAD
Changes since 1.1: +0 -0 lines
File MIME type: text/plain
Error occurred while calculating annotation data.
Importing web-site building process.

1 /*
2 ** This program and library is free software; you can redistribute it and/or
3 ** modify it under the terms of the GNU General Public License
4 ** as published by the Free Software Foundation; either version 2
5 ** of the License, or any later version.
6 **
7 ** This program is distributed in the hope that it will be useful,
8 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
9 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 ** GNU (Library) General Public License for more details.
11 **
12 ** You should have received a copy of the GNU (Library) General Public License
13 ** along with this program; if not, write to the Free Software
14 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15 **-----------------------------------------------------------------
16 **
17 ** rewritten from scratch - moseley Oct 17, 2001
18 **
19 */
20
21 #include <assert.h> /* for bug hunting */
22 #include "swish.h"
23 #include "mem.h"
24 #include "string.h"
25 #include "merge.h"
26 #include "error.h"
27 #include "search.h"
28 #include "index.h"
29 #include "hash.h"
30 #include "file.h"
31 #include "docprop.h"
32 #include "list.h"
33 #include "compress.h"
34 #include "metanames.h"
35 #include "db.h"
36 #include "dump.h"
37 #include "result_sort.h"
38 #include "swish_qsort.h"
39 #include "result_output.h"
40 #include "parse_conffile.h"
41 static void dup_header( SWISH *sw_input, SWISH *sw_output );
42 static void check_header_match( IndexFILE *in_index, SWISH *sw_output );
43 static void make_meta_map( IndexFILE *in_index, SWISH *sw_output);
44 static void load_filename_sort( SWISH *sw, IndexFILE *cur_index );
45 static IndexFILE *get_next_file_in_order( SWISH *sw_input );
46 static void add_file( FILE *filenum_map, IndexFILE *cur_index, SWISH *sw_input, SWISH *sw_output );
47 static int *get_map( FILE *filenum_map, IndexFILE *cur_index );
48 static void dump_index_words(SWISH * sw, IndexFILE * indexf, SWISH *sw_output );
49 static void write_word_pos( SWISH *sw_input, IndexFILE *indexf, SWISH *sw_output, int *file_num_map, int filenum, ENTRY *e, int metaID, int posdata );
50
51
52 // #define DEBUG_MERGE
53
54 /****************************************************************************
55 * merge_indexes -- reads from input indexes, and outputs a new index
56 *
57 *
58 *****************************************************************************/
59
60 void merge_indexes( SWISH *sw_input, SWISH *sw_output )
61 {
62 IndexFILE *cur_index;
63 FILE *filenum_map;
64 char *tmpfilename;
65 struct MOD_Index *idx_output = sw_output->Index;
66 ENTRY *e;
67 int hash,
68 sz_worddata,
69 tmpval,
70 filenum,
71 metaID = 0,
72 frequency,
73 loc_count = 0,
74 word_count = 0;
75 long wordID;
76 unsigned long nextposmetaID = 0L;
77 unsigned char *worddata;
78 unsigned char *s;
79 unsigned char flag;
80 int local_posdata[MAX_STACK_POSITIONS];
81 int *posdata;
82 int i;
83
84 /*******************************************************************************
85 * Get ready to merge the indexes. For each index:
86 * - check that it has the correct headers
87 * - create meta entries in output index, and create a map to convert metas
88 * - load an array of file numbers sorted by filename so can merge sort the filesnames
89 * - set some initial defaults.
90 *********************************************************************************/
91
92 cur_index = sw_input->indexlist;
93 while( cur_index )
94 {
95 printf("Input index '%s' has %d files and %d words\n", cur_index->line, cur_index->header.totalfiles, cur_index->header.totalwords);
96
97 if ( cur_index == sw_input->indexlist )
98 /* Duplicate the first index's header into the output index */
99 dup_header( sw_input, sw_output );
100 else
101 check_header_match( cur_index, sw_output ); // errors if headers don't match - don't really need to check first one since it was the one that was dupped
102
103
104 make_meta_map( cur_index, sw_output); // add metas to new index, and create map
105
106 load_filename_sort( sw_input, cur_index ); // so can read in filename order
107
108 cur_index->current_file = 0;
109 cur_index->cur_prop = NULL;
110
111 #ifdef DEBUG_MERGE
112 dump_metanames( sw_input, cur_index, 1 );
113 dump_metanames( sw_output, sw_output->indexlist, 0 );
114 #endif
115
116 cur_index = cur_index->next;
117 }
118
119
120 #ifdef DEBUG_MERGE
121 printf("----- Output Header ----------\n");
122 resultPrintHeader(sw_output, 0, &sw_output->indexlist->header, sw_output->indexlist->line, 0);
123 #endif
124
125
126
127 /****************************************************************************
128 * Now, read in filename order (so can throw out duplicates)
129 * - read properties and write out to new index
130 * - write a temporay of records to identify
131 * - indexfile
132 * - old filenum to new filenum mapping
133 * - total words per file, if set
134 ****************************************************************************/
135
136 /* place to store file number map and total words per file */
137 filenum_map = create_tempfile(sw_input, F_WRITE_BINARY, "fnum", &tmpfilename, 0 );
138
139 while( (cur_index = get_next_file_in_order( sw_input )) )
140 add_file( filenum_map, cur_index, sw_input, sw_output );
141
142
143
144 /* Don't need the pre-sorted indexes any more */
145 for ( cur_index = sw_input->indexlist; cur_index; cur_index = cur_index->next )
146 {
147 efree( cur_index->path_order );
148 cur_index->path_order = NULL;
149 }
150
151 fclose( filenum_map );
152
153 if ( !(filenum_map = fopen( tmpfilename, F_READ_BINARY )) )
154 progerrno("failed to reopen '%s' :", tmpfilename );
155
156
157
158 /****************************************************************************
159 * Finally, read the indexes one-by-one to read word and position data
160 * - reads through the temp file for each index to build a filenumber map
161 *
162 ****************************************************************************/
163
164 /* 08/2002 jmruiz
165 ** First of all, get all the words
166 */
167 cur_index = sw_input->indexlist;
168 while( cur_index )
169 {
170 dump_index_words(sw_input, cur_index, sw_output);
171 /* Get filr_num_map for later proccess */
172 cur_index->merge_file_num_map = get_map( filenum_map, cur_index );
173 cur_index = cur_index->next;
174 }
175
176 /* At this point we have all the words. Now we have to get worddata
177 * and merge it
178 */
179 word_count = 0;
180 printf("Processing words in index '%s': %6d words\r", sw_output->indexlist->line, word_count);
181 fflush(stdout);
182 /* walk the hash list to merge worddata */
183 for (hash = 0; hash < VERYBIGHASHSIZE; hash++)
184 {
185 if (idx_output->hashentriesdirty[hash])
186 {
187 idx_output->hashentriesdirty[hash] = 0;
188 for (e = idx_output->hashentries[hash]; e; e = e->next)
189 {
190 word_count++;
191 /* Search the word in all index and get worddata */
192 cur_index = sw_input->indexlist;
193 while( cur_index )
194 {
195 DB_ReadWordHash(sw_input, e->word, &wordID, cur_index->DB);
196 /* If word exits in the index */
197 if(wordID)
198 {
199
200 DB_ReadWordData(sw_input, wordID, &worddata, &sz_worddata, cur_index->DB);
201
202 /* Now, parse word's data */
203 s = worddata;
204 tmpval = uncompress2(&s); /* tfrequency */
205 metaID = uncompress2(&s); /* metaID */
206
207 if (metaID)
208 {
209 nextposmetaID = UNPACKLONG2(s);
210 s += sizeof(long);
211 }
212
213 filenum = 0;
214
215 while(1)
216 { /* Read on all items */
217 uncompress_location_values(&s,&flag,&tmpval,&frequency);
218 filenum += tmpval;
219 /* Use stack array when possible to avoid malloc/free overhead */
220 if(frequency > MAX_STACK_POSITIONS)
221 posdata = (int *) emalloc(frequency * sizeof(int));
222 else
223 posdata = local_posdata;
224
225 /* Read the positions */
226 uncompress_location_positions(&s,flag,frequency,posdata);
227
228
229 /* now we have the word data */
230 for (i = 0; i < frequency; i++, loc_count++)
231 write_word_pos( sw_input, cur_index, sw_output, cur_index->merge_file_num_map, filenum, e, metaID, posdata[i]);
232
233 if(e->tfrequency)
234 {
235 /* 08/2002 jmruiz - We will call CompressCurrentLocEntry from time
236 ** to time to help addentry.
237 ** If we do not do this, addentry routine will have to run linked lists
238 ** of positions with thousands of elements and makes the merge proccess
239 ** very slow
240 */
241 if(!(loc_count % 100))
242 CompressCurrentLocEntry(sw_output, sw_output->indexlist, e);
243 }
244
245
246 if(posdata != local_posdata)
247 efree(posdata);
248
249 if ((s - worddata) == sz_worddata)
250 break; /* End of worddata */
251
252 if ((unsigned long)(s - worddata) == nextposmetaID)
253 {
254 filenum = 0;
255 metaID = uncompress2(&s);
256 if (metaID)
257 {
258 nextposmetaID = UNPACKLONG2(s);
259 s += sizeof(long);
260 }
261 else
262 nextposmetaID = 0L;
263 }
264 }
265
266 if(e->tfrequency)
267 CompressCurrentLocEntry(sw_output, sw_output->indexlist, e);
268
269 efree(worddata);
270 }
271 cur_index = cur_index->next;
272 }
273 /* Let's coalesce locations for each word to save memory
274 ** This makes use of the -e feature
275 ** Because we are proccessing one word at a time we can
276 ** coalesce its data just once
277 */
278 coalesce_word_locations(sw_output,sw_output->indexlist,e);
279
280 if(!(word_count % 1000))
281 {
282 /* Make zone available for reuse and save memory */
283 Mem_ZoneReset(sw_output->Index->currentChunkLocZone);
284 sw_output->Index->freeLocMemChain = NULL;
285 printf("Processing words in index '%s': %6d words\r", sw_output->indexlist->line, word_count);
286 }
287 }
288 }
289 }
290
291 printf("Processing words in index '%s': %6d words\n", sw_output->indexlist->line, word_count);
292 fflush(stdout);
293
294 cur_index = sw_input->indexlist;
295 while( cur_index )
296 {
297 /* free the maps */
298 efree( cur_index->merge_file_num_map );
299 efree( cur_index->meta_map );
300 cur_index->meta_map = NULL;
301 cur_index = cur_index->next;
302 }
303
304
305 #ifdef DEBUG_MERGE
306 printf("----- Final Output Header ----------\n");
307 resultPrintHeader(sw_output, 0, &sw_output->indexlist->header, sw_output->indexlist->line, 0);
308 #endif
309
310 remove( tmpfilename );
311 efree( tmpfilename );
312 }
313
314 /****************************************************************************
315 * dup_header -- duplicates a header
316 *
317 * rereads the header from the data base, and clears out some values
318 *
319 *****************************************************************************/
320
321 static void dup_header( SWISH *sw_input, SWISH *sw_output )
322 {
323 INDEXDATAHEADER *out_header = &sw_output->indexlist->header;
324
325 // probably need to free the sw_output header from what's created in swishnew.
326
327 /* Read in the header from the first merge file and store in the output file */
328 read_header(sw_input, out_header, sw_input->indexlist->DB);
329
330 out_header->totalfiles = 0;
331 out_header->totalwords = 0;
332
333 freeMetaEntries( out_header );
334
335 if ( out_header->indexedon )
336 {
337 efree( out_header->indexedon );
338 out_header->indexedon = NULL;
339 out_header->lenindexedon = 0;
340 }
341 }
342
/****************************************************************************
* check_header_match -- makes sure that the important settings match
*
*
*****************************************************************************/
348
/*
 * View of one header setting as an (int length, char *string) pair.
 * This assumes that, in the header struct, the length field always
 * immediately precedes its string field.
 */
typedef struct
{
    int   len;
    char *str;
} *HEAD_CMP;

/*
 * compare_header -- abort via progerr() if one header setting differs
 *
 *   index - name of the input index (used in the error message)
 *   name  - name of the setting (used in the error message)
 *   in    - address of the length field of the setting in the input header
 *   out   - address of the length field of the setting in the output header
 *
 * The strings are compared with strcmp(), i.e. up to the first NUL byte.
 */
static void compare_header( char *index, char *name, void *in, void *out )
{
    HEAD_CMP lhs = (HEAD_CMP)in;
    HEAD_CMP rhs = (HEAD_CMP)out;

    if ( lhs->len != rhs->len )
        progerr("Header %s in index %s doesn't match length in length with output header", name, index );

    if ( strcmp( (const char *)lhs->str, (const char *)rhs->str ) != 0 )
        progerr("Header %s in index %s doesn't match output header", name, index );
}
374
375
376 static void check_header_match( IndexFILE *in_index, SWISH *sw_output )
377 {
378 INDEXDATAHEADER *out_header = &sw_output->indexlist->header;
379 INDEXDATAHEADER *in_header = &in_index->header;
380
381 compare_header( in_index->line, "WordCharacters", &in_header->lenwordchars, &out_header->lenwordchars );
382 compare_header( in_index->line, "BeginCharacters", &in_header->lenbeginchars, &out_header->lenbeginchars );
383 compare_header( in_index->line, "EndCharacters", &in_header->lenendchars, &out_header->lenendchars );
384
385 compare_header( in_index->line, "IgnoreLastChar", &in_header->lenignorelastchar, &out_header->lenignorelastchar );
386 compare_header( in_index->line, "IgnoreFirstChar", &in_header->lenignorefirstchar, &out_header->lenignorefirstchar );
387
388 compare_header( in_index->line, "BumpPositionChars", &in_header->lenbumpposchars, &out_header->lenbumpposchars );
389
390
391 if ( in_header->fuzzy_mode != out_header->fuzzy_mode )
392 progerr("FuzzyIndexingMode in index %s of '%s' doesn't match '%s'",
393 in_index->line,
394 fuzzy_mode_to_string( in_header->fuzzy_mode ),
395 fuzzy_mode_to_string( out_header->fuzzy_mode ) );
396
397
398 if ( in_header->ignoreTotalWordCountWhenRanking != out_header->ignoreTotalWordCountWhenRanking )
399 progerr("ignoreTotalWordCountWhenRanking Rules doesn't match for index %s", in_index->line );
400
401 if ( memcmp( &in_header->translatecharslookuptable, &out_header->translatecharslookuptable, sizeof(in_header->translatecharslookuptable) / sizeof( int ) ) )
402 progerr("TranslateChars header doesn't match for index %s", in_index->line );
403
404
405 //??? need to compare stopword lists
406
407 //??? need to compare buzzwords
408
409 }
410
411 /****************************************************************************
412 * make_meta_map - adds metanames to output index and creates map
413 *
414 *
415 *****************************************************************************/
416
417 static void make_meta_map( IndexFILE *in_index, SWISH *sw_output)
418 {
419 INDEXDATAHEADER *out_header = &sw_output->indexlist->header;
420 INDEXDATAHEADER *in_header = &in_index->header;
421 int i;
422 struct metaEntry *in_meta;
423 struct metaEntry *out_meta;
424 int *meta_map;
425
426
427 meta_map = emalloc( sizeof( int ) * (in_header->metaCounter + 1) );
428 memset( meta_map, 0, sizeof( int ) * (in_header->metaCounter + 1) );
429
430 for( i = 0; i < in_header->metaCounter; i++ )
431 {
432 in_meta = in_header->metaEntryArray[i];
433
434
435 /* Try to see if it's an existing metaname */
436 out_meta = is_meta_index( in_meta )
437 ? getMetaNameByNameNoAlias( out_header, in_meta->metaName )
438 : getPropNameByNameNoAlias( out_header, in_meta->metaName );
439
440 /* if it's not found, then add it */
441 if ( !out_meta )
442 out_meta = addMetaEntry(out_header, in_meta->metaName, in_meta->metaType, 0);
443 else
444 if (out_meta->metaType != in_meta->metaType )
445 progerr("meta name %s in index %s is different type than in output index", in_meta->metaName, in_index->line );
446
447
448 /* Now, save the mapping */
449 meta_map[ in_meta->metaID ] = out_meta->metaID;
450
451
452 /* now here's a pain, and lots of room for screw up. */
453 /* Basically, check for alias mappings, and that they are correct */
454 /* you can say title is an alias for swishtitle in one index, and then say */
455 /* title is an alias for doctitle in another index */
456
457 /* If it's an alias, then make that mapping, too */
458 if ( in_meta->alias )
459 {
460 struct metaEntry *in_alias;
461 struct metaEntry *out_alias;
462
463 /* Grab alias meta entry so we can look it up in the out_header */
464
465 in_alias = is_meta_index( in_meta )
466 ? getMetaNameByID( in_header, in_meta->alias )
467 : getPropNameByID( in_header, in_meta->alias );
468
469 if ( !in_alias )
470 progerr("Failed to lookup alias for %s in index %s", in_meta->metaName, in_index->line );
471
472
473 /* now lookup the alias in the out_header by name */
474 out_alias = is_meta_index( in_alias )
475 ? getMetaNameByNameNoAlias( out_header, in_alias->metaName )
476 : getPropNameByNameNoAlias( out_header, in_alias->metaName );
477
478
479 /* should be there, since it would have been added earlier - the real metas must be added before the aliases */
480 if ( !out_alias )
481 progerr("Failed to lookup alias for %s in output index", out_meta->metaName );
482
483
484 /* If this is new (or doesn't point to the alias root, then just assign it */
485 if ( !out_meta->alias )
486 out_meta->alias = out_alias->metaID;
487
488 /* else, if it is already an alias, but points someplace else, we have a problem */
489 else if ( out_meta->alias != out_alias->metaID )
490 progerr("In index %s metaname '%s' is an alias for '%s'(%d). But another index already mapped '%s' to ID# '%d'", in_index->line, in_meta->metaName, in_alias->metaName, in_alias->metaID, out_meta->metaName, out_meta->alias );
491 }
492 }
493
494 in_index->meta_map = meta_map;
495
496
497 #ifdef DEBUG_MERGE
498 printf(" %s -> %s ** Meta Map **\n", in_index->line, sw_output->indexlist->line );
499 for ( i=0; i<in_header->metaCounter + 1;i++)
500 printf("%4d -> %3d\n", i, meta_map[i] );
501 #endif
502
503 }
504
505 /****************************************************************************
506 * load_filename_sort - creates an array for reading in filename order
507 *
508 *
509 *****************************************************************************/
510
/* Lookup table read by compnums(); must be set before sorting starts. */
static int *sorted_data;

/*
 * qsort-style comparator.  Both arguments point to 1-based file numbers;
 * the order is decided by the values stored for those files in sorted_data.
 * Returns -1, 0 or 1.
 */
static int compnums(const void *s1, const void *s2)
{
    const int *left  = (const int *)s1;
    const int *right = (const int *)s2;
    int        lhs   = sorted_data[ *left  - 1 ];
    int        rhs   = sorted_data[ *right - 1 ];

    /* three-way compare without subtraction, so it cannot overflow */
    return (lhs > rhs) - (lhs < rhs);
}
529
530
531 static void load_filename_sort( SWISH *sw, IndexFILE *cur_index )
532 {
533 struct metaEntry *path_meta = getPropNameByName( &cur_index->header, AUTOPROPERTY_DOCPATH );
534 int i;
535 int *sort_array;
536 int totalfiles = cur_index->header.totalfiles;
537
538 if ( !path_meta )
539 progerr("Can't merge index %s. It doesn't contain the property %s", cur_index->line, AUTOPROPERTY_DOCPATH );
540
541
542 /* Save for looking up pathname when sorting */
543 cur_index->path_meta = path_meta;
544
545 /* Case is important for most OS when comparing file names */
546 cur_index->path_meta->metaType &= ~META_IGNORE_CASE;
547
548
549
550 cur_index->modified_meta = getPropNameByName( &cur_index->header, AUTOPROPERTY_LASTMODIFIED );
551
552
553 if ( !LoadSortedProps( sw, cur_index, path_meta ) )
554 {
555 FileRec fi;
556 memset( &fi, 0, sizeof( FileRec ));
557 path_meta->sorted_data = CreatePropSortArray( sw, cur_index, path_meta, &fi, 1 );
558 }
559
560
561 /* So the qsort compare function can read it */
562 sorted_data = path_meta->sorted_data;
563
564
565 if ( !sorted_data )
566 progerr("failed to load or create sorted properties for index %s", cur_index->line );
567
568
569 sort_array = emalloc( totalfiles * sizeof( int ) );
570 memset( sort_array, 0, totalfiles * sizeof( int ) );
571
572
573 /* build an array with file numbers and sort into filename order */
574 for ( i = 0; i < totalfiles; i++ )
575 sort_array[i] = i+1; // filenumber starts a one
576
577
578 swish_qsort( sort_array, totalfiles, sizeof( int ), &compnums);
579
580 cur_index->path_order = sort_array;
581
582 efree( path_meta->sorted_data );
583 path_meta->sorted_data = NULL;
584 }
585
586 /****************************************************************************
587 * get_next_file_in_order -- grabs the next file entry from all the indexes
588 * in filename (and then modified date) order
589 *
590 *
591 *****************************************************************************/
592
593 /* This isn't really accurate, as some other file may come and replace the newer */
594
595 static void print_file_removed(IndexFILE *older, propEntry *op, IndexFILE *newer, propEntry *np )
596 {
597
598 char *p1, *d1, *p2, *d2;
599 p1 = DecodeDocProperty( older->path_meta, older->cur_prop );
600 d1 = DecodeDocProperty( older->modified_meta, op );
601
602 p2 = DecodeDocProperty( newer->path_meta, newer->cur_prop );
603 d2 = DecodeDocProperty( newer->modified_meta, np );
604
605 printf("Replaced file '%s %s' with '%s %s'\n", p1, d1, p2, d2);
606 }
607
608
609 static IndexFILE *get_next_file_in_order( SWISH *sw_input )
610 {
611 IndexFILE *winner = NULL;
612 IndexFILE *cur_index = sw_input->indexlist;
613 FileRec fi;
614 int ret;
615 propEntry *wp, *cp;
616
617 memset(&fi, 0, sizeof( FileRec ));
618
619 for ( cur_index = sw_input->indexlist; cur_index; cur_index = cur_index->next )
620 {
621 /* don't use cached props, as they belong to a different index! */
622 if ( fi.prop_index )
623 efree( fi.prop_index );
624 memset(&fi, 0, sizeof( FileRec ));
625
626 /* still some to read in this index? */
627 if ( cur_index->current_file >= cur_index->header.totalfiles )
628 continue;
629
630
631
632 /* get file number from lookup table */
633 fi.filenum = cur_index->path_order[cur_index->current_file];
634
635 if ( !cur_index->cur_prop )
636 cur_index->cur_prop = ReadSingleDocPropertiesFromDisk(sw_input, cur_index, &fi, cur_index->path_meta->metaID, 0 );
637
638
639 if ( !winner )
640 {
641 winner = cur_index;
642 continue;
643 }
644
645 ret = Compare_Properties( cur_index->path_meta, cur_index->cur_prop, winner->cur_prop );
646
647 if ( ret != 0 )
648 {
649 if ( ret < 0 ) /* take cur_index if it's smaller */
650 winner = cur_index;
651
652 continue;
653 }
654
655
656
657 /* if they are the same name, then take the newest, and increment the older one */
658
659
660 /* read the modified time for the current file */
661 /* Use the same fi record, because it has the cached prop seek locations */
662 cp = ReadSingleDocPropertiesFromDisk(sw_input, cur_index, &fi, cur_index->modified_meta->metaID, 0 );
663
664
665 /* read the modified time for the current winner */
666 if ( fi.prop_index )
667 efree( fi.prop_index );
668 memset(&fi, 0, sizeof( FileRec ));
669
670 fi.filenum = winner->path_order[winner->current_file];
671 wp = ReadSingleDocPropertiesFromDisk(sw_input, winner, &fi, cur_index->modified_meta->metaID, 0 );
672
673 ret = Compare_Properties( cur_index->modified_meta, cp, wp );
674
675
676
677 /* If current is greater (newer) then throw away winner */
678 if ( ret > 0 )
679 {
680 print_file_removed( winner, wp, cur_index, cp);
681 winner->current_file++;
682 if ( winner->cur_prop )
683 efree( winner->cur_prop );
684 winner->cur_prop = NULL;
685 winner = cur_index;
686 }
687 /* else, keep winner, and throw away current */
688 else
689 {
690 print_file_removed(cur_index, cp, winner, wp );
691 cur_index->current_file++;
692 if ( cur_index->cur_prop )
693 efree( cur_index->cur_prop );
694
695 cur_index->cur_prop = NULL;
696 }
697
698 freeProperty( cp );
699 freeProperty( wp );
700
701 }
702
703 if ( fi.prop_index )
704 efree( fi.prop_index );
705
706
707 if ( !winner )
708 return NULL;
709
710
711 winner->filenum = winner->path_order[winner->current_file++];
712
713 #ifdef DEBUG_MERGE
714 printf(" Files in order: index %s file# %d winner\n", winner->line, winner->filenum );
715 #endif
716
717 /* free prop, as it's not needed anymore */
718 if ( winner->cur_prop )
719 efree( winner->cur_prop );
720 winner->cur_prop = NULL;
721
722
723 return winner;
724 }
725
726
/****************************************************************************
*  add_file
*
*  Called once per file, in filename order (so duplicates can be thrown out)
*  - reads the file's properties and writes them out to the new index
*  - appends a record to a temporary file that identifies
*       - the old file number
*       - the index file it came from
*  - copies the total words per file, if set
****************************************************************************/
737
738 static void add_file( FILE *filenum_map, IndexFILE *cur_index, SWISH *sw_input, SWISH *sw_output )
739 {
740 FileRec fi;
741 IndexFILE *indexf = sw_output->indexlist;
742 struct MOD_Index *idx = sw_output->Index;
743 docProperties *d;
744 int i;
745 propEntry *tmp;
746 docProperties *docProperties=NULL;
747 struct metaEntry meta_entry;
748
749
750 meta_entry.metaName = "(default)"; /* for error message, I think */
751
752
753 memset( &fi, 0, sizeof( FileRec ));
754
755
756 #ifdef DEBUG_MERGE
757 printf("Reading Properties from input index '%s' file %d\n", cur_index->line, cur_index->filenum);
758 #endif
759
760 /* read the properties and map them as needed */
761 d = ReadAllDocPropertiesFromDisk( sw_input, cur_index, cur_index->filenum );
762
763
764 #ifdef DEBUG_MERGE
765 fi.docProperties = d;
766 dump_file_properties( cur_index, &fi );
767 #endif
768
769
770
771 /* all this off-by-one things are a mess */
772
773 /* read through all the property slots, and map them, as needed */
774 for ( i = 0; i < d->n; i++ )
775 if ( (tmp = d->propEntry[i]) )
776 {
777 meta_entry.metaID = cur_index->meta_map[ i ];
778 addDocProperty(&docProperties, &meta_entry, tmp->propValue, tmp->propLen, 1 );
779 }
780
781 #ifdef DEBUG_MERGE
782 printf(" after mapping file %s\n", indexf->line);
783 fi.docProperties = docProperties;
784 dump_file_properties( cur_index, &fi );
785 printf("\n");
786 #endif
787
788
789 /* Now bump the file counter */
790 idx->filenum++;
791 indexf->header.totalfiles++;
792
793 if ( docProperties ) /* always true */
794 {
795 fi.filenum = idx->filenum;
796 fi.docProperties = docProperties;
797
798 WritePropertiesToDisk( sw_output , &fi );
799
800 freeDocProperties( d );
801 }
802
803
804
805
806 /* now write out the data to be used for mapping file for a given index. */
807 // compress1( cur_index->filenum, filenum_map, fputc ); // what file number this came from
808
809 fwrite( &cur_index->filenum, sizeof(int), 1, filenum_map);
810 fwrite( &cur_index, sizeof(IndexFILE *), 1, filenum_map); // what index
811
812
813 /* Save total words per file */
814 if ( !indexf->header.ignoreTotalWordCountWhenRanking )
815 {
816 INDEXDATAHEADER *header = &indexf->header;
817 int idx1 = fi.filenum - 1;
818
819 if ( !header->TotalWordsPerFile || idx1 >= header->TotalWordsPerFileMax )
820 {
821 header->TotalWordsPerFileMax += 20000; /* random guess -- could be a config setting */
822 header->TotalWordsPerFile = erealloc( header->TotalWordsPerFile, header->TotalWordsPerFileMax * sizeof(int) );
823 }
824
825 header->TotalWordsPerFile[idx1] = cur_index->header.TotalWordsPerFile[cur_index->filenum-1];
826 }
827 }
828
/****************************************************************************
*  Builds an old_filenum -> new_filenum map.
*
*  This makes it possible to look up an old file number and map it to a new file number
*
****************************************************************************/
835
836 static int *get_map( FILE *filenum_map, IndexFILE *cur_index )
837 {
838 int *array = emalloc( (cur_index->header.totalfiles+1) * sizeof( int ) );
839 IndexFILE *idf;
840 int filenum;
841 int new_filenum = 0;
842
843
844
845 memset( array, 0, (cur_index->header.totalfiles+1) * sizeof( int ) );
846
847
848 clearerr( filenum_map );
849 fseek( filenum_map, 0, 0 ); /* start at beginning */
850
851 while ( 1 )
852 {
853 new_filenum++;
854
855 if (!fread( &filenum, sizeof(int), 1, filenum_map))
856 break;
857
858
859 if(!fread( &idf, sizeof(IndexFILE *), 1, filenum_map))
860 break;
861
862 if ( idf == cur_index )
863 array[filenum] = new_filenum;
864
865 }
866
867 return array;
868 }
869
870 /****************************************************************************
871 * Reads the index to get the all the words
872 ****************************************************************************/
873
874 static void dump_index_words(SWISH * sw, IndexFILE * indexf, SWISH *sw_output)
875 {
876 int j;
877 int word_count = 0;
878 char word[2];
879 char *resultword;
880 long wordID;
881
882 DB_InitReadWords(sw, indexf->DB);
883
884
885 printf("Getting words in index '%s': %3d words\r", indexf->line, word_count);
886 fflush(stdout);
887
888 for(j=0;j<256;j++)
889 {
890
891 word[0] = (unsigned char) j; word[1] = '\0';
892 DB_ReadFirstWordInvertedIndex(sw, word,&resultword,&wordID,indexf->DB);
893
894 while(wordID)
895 {
896 /* Add resultword to output */
897 getentry(sw_output, resultword);
898 efree(resultword);
899 DB_ReadNextWordInvertedIndex(sw, word,&resultword,&wordID,indexf->DB);
900 word_count++;
901 if(!word_count % 10000)
902 printf("Getting words in index '%s': %3d words\r", indexf->line, word_count);
903 }
904 }
905 printf("Getting words in index '%s': %6d words\n", indexf->line, word_count);
906
907 DB_EndReadWords(sw, indexf->DB);
908
909 }
910
911 /****************************************************************************
912 * Writes a word out to the index
913 *
914 *
915 ****************************************************************************/
916
917 static void write_word_pos( SWISH *sw_input, IndexFILE *indexf, SWISH *sw_output, int *file_num_map, int filenum, ENTRY *e, int metaID, int posdata )
918 {
919 int new_file;
920 int new_meta;
921
922 #ifdef DEBUG_MERGE
923 printf("\nindex %s '%s' Struct: %d Pos: %d",
924 indexf->line, e->word, structure, position );
925
926
927 if ( !(new_file = file_num_map[ filenum ]) )
928 {
929 printf(" file: %d **File deleted!**\n", filenum);
930 return;
931 }
932
933 if ( !(new_meta = indexf->meta_map[ metaID ] ))
934 {
935 printf(" file: %d **Failed to map meta ID **\n", filenum);
936 return;
937 }
938
939 printf(" File: %d -> %d Meta: %d -> %d\n", filenum, new_file, metaID, new_meta );
940
941 addentry( sw_output, e, new_file, structure, metaID, position );
942
943 return;
944
945
946 #else
947
948
949 if ( !(new_file = file_num_map[ filenum ]) )
950 return;
951
952 if ( !(new_meta = indexf->meta_map[ metaID ] ))
953 return;
954
955 addentry( sw_output, e, new_file, GET_STRUCTURE(posdata), metaID, GET_POSITION(posdata) );
956
957 return;
958
959 #endif
960
961
962 }
963

  ViewVC Help
Powered by ViewVC 1.1.22