Annotation of /mitgcm.org/devel/buildweb/pkg/swish-e/src/merge.c


Revision 1.1
Fri Sep 20 19:47:29 2002 UTC by adcroft
Branch point for: Import, MAIN
File MIME type: text/plain
Initial revision

/*
** This program and library is free software; you can redistribute it and/or
** modify it under the terms of the GNU General Public License
** as published by the Free Software Foundation; either version 2
** of the License, or any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU (Library) General Public License for more details.
**
** You should have received a copy of the GNU (Library) General Public License
** along with this program; if not, write to the Free Software
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
**-----------------------------------------------------------------
**
** rewritten from scratch - moseley Oct 17, 2001
**
*/
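
/*
** merge.c -- merge_indexes() combines the words, properties and metanames of
** several existing indexes into one output index, keeping only the newest
** copy of any file that appears in more than one input index.
**
** (Reached through swish-e's merge mode; the usual invocation is assumed to
** be something like "swish-e -M index1 index2 merged.index" -- the exact
** option handling lives in the calling code, not in this file.)
*/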

#include <assert.h> /* for bug hunting */
#include "swish.h"
#include "mem.h"
#include "string.h"
#include "merge.h"
#include "error.h"
#include "search.h"
#include "index.h"
#include "hash.h"
#include "file.h"
#include "docprop.h"
#include "list.h"
#include "compress.h"
#include "metanames.h"
#include "db.h"
#include "dump.h"
#include "result_sort.h"
#include "swish_qsort.h"
#include "result_output.h"
#include "parse_conffile.h"

static void dup_header( SWISH *sw_input, SWISH *sw_output );
static void check_header_match( IndexFILE *in_index, SWISH *sw_output );
static void make_meta_map( IndexFILE *in_index, SWISH *sw_output );
static void load_filename_sort( SWISH *sw, IndexFILE *cur_index );
static IndexFILE *get_next_file_in_order( SWISH *sw_input );
static void add_file( FILE *filenum_map, IndexFILE *cur_index, SWISH *sw_input, SWISH *sw_output );
static int *get_map( FILE *filenum_map, IndexFILE *cur_index );
static void dump_index_words( SWISH *sw, IndexFILE *indexf, SWISH *sw_output );
static void write_word_pos( SWISH *sw_input, IndexFILE *indexf, SWISH *sw_output, int *file_num_map, int filenum, ENTRY *e, int metaID, int posdata );


// #define DEBUG_MERGE
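/* Define DEBUG_MERGE to dump metanames, headers, meta/file maps and per-word
** data while merging -- handy when a merged index looks wrong. */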

/****************************************************************************
*  merge_indexes -- reads from input indexes, and outputs a new index
*
*
*****************************************************************************/

void merge_indexes( SWISH *sw_input, SWISH *sw_output )
{
    IndexFILE *cur_index;
    FILE      *filenum_map;
    char      *tmpfilename;
    struct MOD_Index *idx_output = sw_output->Index;
    ENTRY     *e;
    int        hash,
               sz_worddata,
               tmpval,
               filenum,
               metaID = 0,
               frequency,
               loc_count = 0,
               word_count = 0;
    long       wordID;
    unsigned long  nextposmetaID = 0L;
    unsigned char *worddata;
    unsigned char *s;
    unsigned char  flag;
    int        local_posdata[MAX_STACK_POSITIONS];
    int       *posdata;
    int        i;

    /*******************************************************************************
    *  Get ready to merge the indexes.  For each index:
    *    - check that it has the correct headers
    *    - create meta entries in the output index, and create a map to convert metas
    *    - load an array of file numbers sorted by filename so we can merge-sort the filenames
    *    - set some initial defaults.
    *********************************************************************************/

    cur_index = sw_input->indexlist;
    while( cur_index )
    {
        printf("Input index '%s' has %d files and %d words\n", cur_index->line, cur_index->header.totalfiles, cur_index->header.totalwords);

        if ( cur_index == sw_input->indexlist )
            /* Duplicate the first index's header into the output index */
            dup_header( sw_input, sw_output );
        else
            check_header_match( cur_index, sw_output );  // errors out if headers don't match - no need to check the first one since it was the one that was dupped


        make_meta_map( cur_index, sw_output );      // add metas to the new index, and create the map

        load_filename_sort( sw_input, cur_index );  // so we can read in filename order

        cur_index->current_file = 0;
        cur_index->cur_prop = NULL;

#ifdef DEBUG_MERGE
        dump_metanames( sw_input, cur_index, 1 );
        dump_metanames( sw_output, sw_output->indexlist, 0 );
#endif

        cur_index = cur_index->next;
    }


#ifdef DEBUG_MERGE
    printf("----- Output Header ----------\n");
    resultPrintHeader(sw_output, 0, &sw_output->indexlist->header, sw_output->indexlist->line, 0);
#endif



    /****************************************************************************
    *  Now, read in filename order (so we can throw out duplicates)
    *    - read properties and write them out to the new index
    *    - write a temporary file of records identifying:
    *        - the index file
    *        - the old filenum to new filenum mapping
    *        - total words per file, if set
    ****************************************************************************/

    /* place to store file number map and total words per file */
    filenum_map = create_tempfile(sw_input, F_WRITE_BINARY, "fnum", &tmpfilename, 0 );

    while( (cur_index = get_next_file_in_order( sw_input )) )
        add_file( filenum_map, cur_index, sw_input, sw_output );



    /* Don't need the pre-sorted indexes any more */
    for ( cur_index = sw_input->indexlist; cur_index; cur_index = cur_index->next )
    {
        efree( cur_index->path_order );
        cur_index->path_order = NULL;
    }

    fclose( filenum_map );

    if ( !(filenum_map = fopen( tmpfilename, F_READ_BINARY )) )
        progerrno("failed to reopen '%s' :", tmpfilename );



    /****************************************************************************
    *  Finally, read the indexes one-by-one to read word and position data
    *    - reads through the temp file for each index to build a filenumber map
    *
    ****************************************************************************/

    /* 08/2002 jmruiz
    ** First of all, get all the words
    */
    cur_index = sw_input->indexlist;
    while( cur_index )
    {
        dump_index_words(sw_input, cur_index, sw_output);

        /* Get the file_num_map for later processing */
        cur_index->merge_file_num_map = get_map( filenum_map, cur_index );

        cur_index = cur_index->next;
    }

    /* At this point we have all the words.  Now we have to get the worddata
    ** and merge it
    */
    word_count = 0;
    printf("Processing words in index '%s': %6d words\r", sw_output->indexlist->line, word_count);
    fflush(stdout);

    /* walk the hash list to merge worddata */
    for (hash = 0; hash < VERYBIGHASHSIZE; hash++)
    {
        if (idx_output->hashentriesdirty[hash])
        {
            idx_output->hashentriesdirty[hash] = 0;
            for (e = idx_output->hashentries[hash]; e; e = e->next)
            {
                word_count++;

                /* Search for the word in each input index and get its worddata */
                cur_index = sw_input->indexlist;
                while( cur_index )
                {
                    DB_ReadWordHash(sw_input, e->word, &wordID, cur_index->DB);

                    /* If the word exists in this index */
                    if(wordID)
                    {

                        DB_ReadWordData(sw_input, wordID, &worddata, &sz_worddata, cur_index->DB);

                        /* Now, parse the word's data */
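                        /* Layout of the worddata buffer, as suggested by the
                        ** uncompress calls below (db.c/compress.c are the
                        ** authoritative reference):
                        **   <tfrequency> <metaID> [<offset of next metaID block>]
                        ** followed, per file, by:
                        **   <flag> <filenum delta> <frequency> <positions...>
                        ** A new metaID block starts when the read offset
                        ** reaches nextposmetaID.
                        */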
                        s = worddata;

                        tmpval = uncompress2(&s);     /* tfrequency */
                        metaID = uncompress2(&s);     /* metaID */

                        if (metaID)
                        {
                            nextposmetaID = UNPACKLONG2(s);
                            s += sizeof(long);
                        }

                        filenum = 0;

                        while(1)
                        {   /* loop over all items */
                            uncompress_location_values(&s,&flag,&tmpval,&frequency);
                            filenum += tmpval;

                            /* Use the stack array when possible to avoid malloc/free overhead */
                            if(frequency > MAX_STACK_POSITIONS)
                                posdata = (int *) emalloc(frequency * sizeof(int));
                            else
                                posdata = local_posdata;

                            /* Read the positions */
                            uncompress_location_positions(&s,flag,frequency,posdata);


                            /* now we have the word data */
                            for (i = 0; i < frequency; i++, loc_count++)
                                write_word_pos( sw_input, cur_index, sw_output, cur_index->merge_file_num_map, filenum, e, metaID, posdata[i]);

                            if(e->tfrequency)
                            {
                                /* 08/2002 jmruiz - Call CompressCurrentLocEntry from time
                                ** to time to help addentry.
                                ** If we did not do this, addentry would have to walk linked
                                ** lists of positions with thousands of elements, which makes
                                ** the merge process very slow
                                */
                                if(!(loc_count % 100))
                                    CompressCurrentLocEntry(sw_output, sw_output->indexlist, e);
                            }


                            if(posdata != local_posdata)
                                efree(posdata);

                            if ((s - worddata) == sz_worddata)
                                break;   /* End of worddata */

                            if ((unsigned long)(s - worddata) == nextposmetaID)
                            {
                                filenum = 0;
                                metaID = uncompress2(&s);
                                if (metaID)
                                {
                                    nextposmetaID = UNPACKLONG2(s);
                                    s += sizeof(long);
                                }
                                else
                                    nextposmetaID = 0L;
                            }
                        }

                        if(e->tfrequency)
                            CompressCurrentLocEntry(sw_output, sw_output->indexlist, e);

                        efree(worddata);
                    }
                    cur_index = cur_index->next;
                }

                /* Coalesce locations for each word to save memory.
                ** This makes use of the -e feature.
                ** Because we are processing one word at a time we can
                ** coalesce its data just once.
                */
                coalesce_word_locations(sw_output,sw_output->indexlist,e);

                if(!(word_count % 1000))
                {
                    /* Make the zone available for reuse and save memory */
                    Mem_ZoneReset(sw_output->Index->currentChunkLocZone);
                    sw_output->Index->freeLocMemChain = NULL;

                    printf("Processing words in index '%s': %6d words\r", sw_output->indexlist->line, word_count);
                }
            }
        }
    }

    printf("Processing words in index '%s': %6d words\n", sw_output->indexlist->line, word_count);
    fflush(stdout);

    cur_index = sw_input->indexlist;
    while( cur_index )
    {
        /* free the maps */
        efree( cur_index->merge_file_num_map );
        efree( cur_index->meta_map );
        cur_index->meta_map = NULL;

        cur_index = cur_index->next;
    }


#ifdef DEBUG_MERGE
    printf("----- Final Output Header ----------\n");
    resultPrintHeader(sw_output, 0, &sw_output->indexlist->header, sw_output->indexlist->line, 0);
#endif

    remove( tmpfilename );
    efree( tmpfilename );
}

/****************************************************************************
*  dup_header -- duplicates a header
*
*  Rereads the header from the database, and clears out some values
*
*****************************************************************************/

static void dup_header( SWISH *sw_input, SWISH *sw_output )
{
    INDEXDATAHEADER *out_header = &sw_output->indexlist->header;

    // probably need to free the sw_output header from what's created in swishnew.

    /* Read in the header from the first merge file and store it in the output file */
    read_header(sw_input, out_header, sw_input->indexlist->DB);

    out_header->totalfiles = 0;
    out_header->totalwords = 0;

    freeMetaEntries( out_header );

    if ( out_header->indexedon )
    {
        efree( out_header->indexedon );
        out_header->indexedon = NULL;
        out_header->lenindexedon = 0;
    }
}

/****************************************************************************
*  check_header_match -- makes sure that the important settings match
*
*
*****************************************************************************/

// This assumes that the size will always precede the content.
typedef struct
{
    int   len;
    char *str;
} *HEAD_CMP;

static void compare_header( char *index, char *name, void *in, void *out )
{
    HEAD_CMP in_item  = (HEAD_CMP)in;
    HEAD_CMP out_item = (HEAD_CMP)out;

    if ( in_item->len != out_item->len )
        progerr("Header %s in index %s doesn't match the output header in length", name, index );

    if ( strcmp( (const char *)in_item->str, (const char *)out_item->str ))
        progerr("Header %s in index %s doesn't match output header", name, index );

    //if ( memcmp( (const void *)in_item->str, (const void *)out_item->str, in_item->len ) )
    //    progerr("Header %s in index %s doesn't match output header", name, index );
}


static void check_header_match( IndexFILE *in_index, SWISH *sw_output )
{
    INDEXDATAHEADER *out_header = &sw_output->indexlist->header;
    INDEXDATAHEADER *in_header  = &in_index->header;

    compare_header( in_index->line, "WordCharacters",  &in_header->lenwordchars,  &out_header->lenwordchars );
    compare_header( in_index->line, "BeginCharacters", &in_header->lenbeginchars, &out_header->lenbeginchars );
    compare_header( in_index->line, "EndCharacters",   &in_header->lenendchars,   &out_header->lenendchars );

    compare_header( in_index->line, "IgnoreLastChar",  &in_header->lenignorelastchar,  &out_header->lenignorelastchar );
    compare_header( in_index->line, "IgnoreFirstChar", &in_header->lenignorefirstchar, &out_header->lenignorefirstchar );

    compare_header( in_index->line, "BumpPositionChars", &in_header->lenbumpposchars, &out_header->lenbumpposchars );


    if ( in_header->fuzzy_mode != out_header->fuzzy_mode )
        progerr("FuzzyIndexingMode in index %s of '%s' doesn't match '%s'",
            in_index->line,
            fuzzy_mode_to_string( in_header->fuzzy_mode ),
            fuzzy_mode_to_string( out_header->fuzzy_mode ) );


    if ( in_header->ignoreTotalWordCountWhenRanking != out_header->ignoreTotalWordCountWhenRanking )
        progerr("ignoreTotalWordCountWhenRanking rules don't match for index %s", in_index->line );

    if ( memcmp( &in_header->translatecharslookuptable, &out_header->translatecharslookuptable, sizeof(in_header->translatecharslookuptable) / sizeof( int ) ) )
        progerr("TranslateChars header doesn't match for index %s", in_index->line );


    //??? need to compare stopword lists

    //??? need to compare buzzwords
}

/****************************************************************************
*  make_meta_map - adds metanames to output index and creates map
*
*
*****************************************************************************/

static void make_meta_map( IndexFILE *in_index, SWISH *sw_output)
{
    INDEXDATAHEADER *out_header = &sw_output->indexlist->header;
    INDEXDATAHEADER *in_header  = &in_index->header;
    int    i;
    struct metaEntry *in_meta;
    struct metaEntry *out_meta;
    int   *meta_map;


    meta_map = emalloc( sizeof( int ) * (in_header->metaCounter + 1) );
    memset( meta_map, 0, sizeof( int ) * (in_header->metaCounter + 1) );
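
    /* meta_map[ metaID in this input index ] -> metaID in the output index.
    ** Slot 0 appears to be unused (metaIDs seem to start at 1, hence the
    ** metaCounter + 1 sizing above).
    */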

    for( i = 0; i < in_header->metaCounter; i++ )
    {
        in_meta = in_header->metaEntryArray[i];


        /* Try to see if it's an existing metaname */
        out_meta = is_meta_index( in_meta )
                   ? getMetaNameByNameNoAlias( out_header, in_meta->metaName )
                   : getPropNameByNameNoAlias( out_header, in_meta->metaName );

        /* if it's not found, then add it */
        if ( !out_meta )
            out_meta = addMetaEntry(out_header, in_meta->metaName, in_meta->metaType, 0);

        else if (out_meta->metaType != in_meta->metaType )
            progerr("meta name %s in index %s is a different type than in the output index", in_meta->metaName, in_index->line );


        /* Now, save the mapping */
        meta_map[ in_meta->metaID ] = out_meta->metaID;


        /* Now here's a pain, and lots of room for screw-ups. */
        /* Basically, check for alias mappings, and that they are correct. */
        /* You can say title is an alias for swishtitle in one index, and then say */
        /* title is an alias for doctitle in another index. */

        /* If it's an alias, then make that mapping, too */
        if ( in_meta->alias )
        {
            struct metaEntry *in_alias;
            struct metaEntry *out_alias;

            /* Grab the alias meta entry so we can look it up in the out_header */

            in_alias = is_meta_index( in_meta )
                       ? getMetaNameByID( in_header, in_meta->alias )
                       : getPropNameByID( in_header, in_meta->alias );

            if ( !in_alias )
                progerr("Failed to lookup alias for %s in index %s", in_meta->metaName, in_index->line );


            /* now look up the alias in the out_header by name */
            out_alias = is_meta_index( in_alias )
                        ? getMetaNameByNameNoAlias( out_header, in_alias->metaName )
                        : getPropNameByNameNoAlias( out_header, in_alias->metaName );


            /* It should be there, since it would have been added earlier - the real metas must be added before the aliases */
            if ( !out_alias )
                progerr("Failed to lookup alias for %s in output index", out_meta->metaName );


            /* If this is new (or doesn't point to the alias root), then just assign it */
            if ( !out_meta->alias )
                out_meta->alias = out_alias->metaID;

            /* else, if it is already an alias, but points someplace else, we have a problem */
            else if ( out_meta->alias != out_alias->metaID )
                progerr("In index %s metaname '%s' is an alias for '%s'(%d). But another index already mapped '%s' to ID# '%d'", in_index->line, in_meta->metaName, in_alias->metaName, in_alias->metaID, out_meta->metaName, out_meta->alias );
        }
    }

    in_index->meta_map = meta_map;


#ifdef DEBUG_MERGE
    printf(" %s -> %s  ** Meta Map **\n", in_index->line, sw_output->indexlist->line );
    for ( i = 0; i < in_header->metaCounter + 1; i++ )
        printf("%4d -> %3d\n", i, meta_map[i] );
#endif
}

/****************************************************************************
*  load_filename_sort - creates an array for reading in filename order
*
*
*****************************************************************************/

static int *sorted_data;
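/* qsort comparison callbacks take no user-data argument, so the property
** sort table of the index currently being sorted is handed to compnums()
** through this file-scope pointer (set in load_filename_sort() below).
*/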

static int compnums(const void *s1, const void *s2)
{
    int a  = *(int *)s1;    // file number passed from qsort
    int b  = *(int *)s2;
    int v1 = sorted_data[ a-1 ];
    int v2 = sorted_data[ b-1 ];

    // return v1 <=> v2;

    if ( v1 < v2 )
        return -1;
    if ( v1 > v2 )
        return 1;

    return 0;
}


static void load_filename_sort( SWISH *sw, IndexFILE *cur_index )
{
    struct metaEntry *path_meta = getPropNameByName( &cur_index->header, AUTOPROPERTY_DOCPATH );
    int  i;
    int *sort_array;
    int  totalfiles = cur_index->header.totalfiles;

    if ( !path_meta )
        progerr("Can't merge index %s.  It doesn't contain the property %s", cur_index->line, AUTOPROPERTY_DOCPATH );


    /* Save for looking up the pathname when sorting */
    cur_index->path_meta = path_meta;

    /* Case is important on most operating systems when comparing file names */
    cur_index->path_meta->metaType &= ~META_IGNORE_CASE;



    cur_index->modified_meta = getPropNameByName( &cur_index->header, AUTOPROPERTY_LASTMODIFIED );


    if ( !LoadSortedProps( sw, cur_index, path_meta ) )
    {
        FileRec fi;
        memset( &fi, 0, sizeof( FileRec ));
        path_meta->sorted_data = CreatePropSortArray( sw, cur_index, path_meta, &fi, 1 );
    }


    /* So the qsort compare function can read it */
    sorted_data = path_meta->sorted_data;


    if ( !sorted_data )
        progerr("failed to load or create sorted properties for index %s", cur_index->line );


    sort_array = emalloc( totalfiles * sizeof( int ) );
    memset( sort_array, 0, totalfiles * sizeof( int ) );


    /* build an array of file numbers and sort it into filename order */
    for ( i = 0; i < totalfiles; i++ )
        sort_array[i] = i+1;   // file numbers start at one


    swish_qsort( sort_array, totalfiles, sizeof( int ), &compnums);

    cur_index->path_order = sort_array;

    efree( path_meta->sorted_data );
    path_meta->sorted_data = NULL;
}

/****************************************************************************
*  get_next_file_in_order -- grabs the next file entry from all the indexes
*  in filename (and then modified date) order
*
*
*****************************************************************************/

/* This isn't really accurate, as some other file may later come along and replace the newer one */

static void print_file_removed(IndexFILE *older, propEntry *op, IndexFILE *newer, propEntry *np )
{
    char *p1, *d1, *p2, *d2;

    p1 = DecodeDocProperty( older->path_meta, older->cur_prop );
    d1 = DecodeDocProperty( older->modified_meta, op );

    p2 = DecodeDocProperty( newer->path_meta, newer->cur_prop );
    d2 = DecodeDocProperty( newer->modified_meta, np );

    printf("Replaced file '%s %s' with '%s %s'\n", p1, d1, p2, d2);
}

static IndexFILE *get_next_file_in_order( SWISH *sw_input )
{
    IndexFILE *winner = NULL;
    IndexFILE *cur_index = sw_input->indexlist;
    FileRec    fi;
    int        ret;
    propEntry *wp, *cp;

    memset(&fi, 0, sizeof( FileRec ));

    for ( cur_index = sw_input->indexlist; cur_index; cur_index = cur_index->next )
    {
        /* don't use cached props, as they belong to a different index! */
        if ( fi.prop_index )
            efree( fi.prop_index );
        memset(&fi, 0, sizeof( FileRec ));

        /* still some to read in this index? */
        if ( cur_index->current_file >= cur_index->header.totalfiles )
            continue;


        /* get file number from lookup table */
        fi.filenum = cur_index->path_order[cur_index->current_file];

        if ( !cur_index->cur_prop )
            cur_index->cur_prop = ReadSingleDocPropertiesFromDisk(sw_input, cur_index, &fi, cur_index->path_meta->metaID, 0 );


        if ( !winner )
        {
            winner = cur_index;
            continue;
        }

        ret = Compare_Properties( cur_index->path_meta, cur_index->cur_prop, winner->cur_prop );

        if ( ret != 0 )
        {
            if ( ret < 0 )  /* take cur_index if it's smaller */
                winner = cur_index;

            continue;
        }


        /* if they are the same name, then take the newest, and increment the older one */


        /* read the modified time for the current file */
        /* Use the same fi record, because it has the cached prop seek locations */
        cp = ReadSingleDocPropertiesFromDisk(sw_input, cur_index, &fi, cur_index->modified_meta->metaID, 0 );


        /* read the modified time for the current winner */
        if ( fi.prop_index )
            efree( fi.prop_index );
        memset(&fi, 0, sizeof( FileRec ));

        fi.filenum = winner->path_order[winner->current_file];
        wp = ReadSingleDocPropertiesFromDisk(sw_input, winner, &fi, cur_index->modified_meta->metaID, 0 );

        ret = Compare_Properties( cur_index->modified_meta, cp, wp );


        /* If current is greater (newer) then throw away winner */
        if ( ret > 0 )
        {
            print_file_removed( winner, wp, cur_index, cp);

            winner->current_file++;
            if ( winner->cur_prop )
                efree( winner->cur_prop );
            winner->cur_prop = NULL;

            winner = cur_index;
        }
        /* else, keep winner, and throw away current */
        else
        {
            print_file_removed(cur_index, cp, winner, wp );

            cur_index->current_file++;
            if ( cur_index->cur_prop )
                efree( cur_index->cur_prop );

            cur_index->cur_prop = NULL;
        }

        freeProperty( cp );
        freeProperty( wp );
    }

    if ( fi.prop_index )
        efree( fi.prop_index );


    if ( !winner )
        return NULL;


    winner->filenum = winner->path_order[winner->current_file++];

#ifdef DEBUG_MERGE
    printf(" Files in order: index %s file# %d  winner\n", winner->line, winner->filenum );
#endif

    /* free prop, as it's not needed anymore */
    if ( winner->cur_prop )
        efree( winner->cur_prop );
    winner->cur_prop = NULL;


    return winner;
}


/****************************************************************************
*  add_file
*
*  Now, read in filename order (so we can throw out duplicates)
*    - read properties and write them out to the new index
*    - write a temporary file of records identifying:
*        - the index file
*        - the old filenum to new filenum mapping
*        - total words per file, if set
****************************************************************************/

static void add_file( FILE *filenum_map, IndexFILE *cur_index, SWISH *sw_input, SWISH *sw_output )
{
    FileRec    fi;
    IndexFILE *indexf = sw_output->indexlist;
    struct MOD_Index *idx = sw_output->Index;
    docProperties *d;
    int        i;
    propEntry *tmp;
    docProperties *docProperties = NULL;
    struct metaEntry meta_entry;


    meta_entry.metaName = "(default)";   /* for error messages, I think */


    memset( &fi, 0, sizeof( FileRec ));


#ifdef DEBUG_MERGE
    printf("Reading Properties from input index '%s' file %d\n", cur_index->line, cur_index->filenum);
#endif

    /* read the properties and map them as needed */
    d = ReadAllDocPropertiesFromDisk( sw_input, cur_index, cur_index->filenum );


#ifdef DEBUG_MERGE
    fi.docProperties = d;
    dump_file_properties( cur_index, &fi );
#endif



    /* all these off-by-one things are a mess */

    /* read through all the property slots, and map them, as needed */
    for ( i = 0; i < d->n; i++ )
        if ( (tmp = d->propEntry[i]) )
        {
            meta_entry.metaID = cur_index->meta_map[ i ];
            addDocProperty(&docProperties, &meta_entry, tmp->propValue, tmp->propLen, 1 );
        }

#ifdef DEBUG_MERGE
    printf(" after mapping file %s\n", indexf->line);
    fi.docProperties = docProperties;
    dump_file_properties( cur_index, &fi );
    printf("\n");
#endif


    /* Now bump the file counter */
    idx->filenum++;
    indexf->header.totalfiles++;

    if ( docProperties )  /* always true */
    {
        fi.filenum = idx->filenum;
        fi.docProperties = docProperties;

        WritePropertiesToDisk( sw_output , &fi );

        freeDocProperties( d );
    }




    /* now write out the data used to build the file number map for a given index */
    // compress1( cur_index->filenum, filenum_map, fputc );  // what file number this came from

    fwrite( &cur_index->filenum, sizeof(int), 1, filenum_map);
    fwrite( &cur_index, sizeof(IndexFILE *), 1, filenum_map);   // what index
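
    /* Each record in the temp file is one (old filenum, owning IndexFILE *)
    ** pair, written in output-file order; get_map() below replays the file
    ** and uses each record's position (1, 2, 3, ...) as the new file number.
    */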

    /* Save total words per file */
    if ( !indexf->header.ignoreTotalWordCountWhenRanking )
    {
        INDEXDATAHEADER *header = &indexf->header;
        int idx1 = fi.filenum - 1;

        if ( !header->TotalWordsPerFile || idx1 >= header->TotalWordsPerFileMax )
        {
            header->TotalWordsPerFileMax += 20000;   /* random guess -- could be a config setting */
            header->TotalWordsPerFile = erealloc( header->TotalWordsPerFile, header->TotalWordsPerFileMax * sizeof(int) );
        }

        header->TotalWordsPerFile[idx1] = cur_index->header.TotalWordsPerFile[cur_index->filenum-1];
    }
}

/****************************************************************************
*  Builds an old_filenum -> new_filenum map
*
*  This makes it so you can look up an old file number and map it to a new file number
*
****************************************************************************/

static int *get_map( FILE *filenum_map, IndexFILE *cur_index )
{
    int       *array = emalloc( (cur_index->header.totalfiles+1) * sizeof( int ) );
    IndexFILE *idf;
    int        filenum;
    int        new_filenum = 0;


    memset( array, 0, (cur_index->header.totalfiles+1) * sizeof( int ) );


    clearerr( filenum_map );
    fseek( filenum_map, 0, 0 );   /* start at beginning */

    while ( 1 )
    {
        new_filenum++;

        if (!fread( &filenum, sizeof(int), 1, filenum_map))
            break;

        if (!fread( &idf, sizeof(IndexFILE *), 1, filenum_map))
            break;

        if ( idf == cur_index )
            array[filenum] = new_filenum;
    }

    return array;
}

/****************************************************************************
*  Reads the index to get all the words
****************************************************************************/

static void dump_index_words(SWISH * sw, IndexFILE * indexf, SWISH *sw_output)
{
    int   j;
    int   word_count = 0;
    char  word[2];
    char *resultword;
    long  wordID;

    DB_InitReadWords(sw, indexf->DB);


    printf("Getting words in index '%s': %3d words\r", indexf->line, word_count);
    fflush(stdout);

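    /* Walk the inverted index one leading byte at a time (0..255):
    ** DB_ReadFirstWordInvertedIndex/DB_ReadNextWordInvertedIndex return every
    ** word starting with that byte, and getentry() looks up (or creates) the
    ** word's ENTRY in the output index's in-memory hash so its worddata can
    ** be merged onto it later.
    */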
    for (j = 0; j < 256; j++)
    {
        word[0] = (unsigned char) j;  word[1] = '\0';
        DB_ReadFirstWordInvertedIndex(sw, word, &resultword, &wordID, indexf->DB);

        while (wordID)
        {
            /* Add resultword to the output index */
            getentry(sw_output, resultword);
            efree(resultword);

            DB_ReadNextWordInvertedIndex(sw, word, &resultword, &wordID, indexf->DB);

            word_count++;
            if (!(word_count % 10000))
                printf("Getting words in index '%s': %3d words\r", indexf->line, word_count);
        }
    }
    printf("Getting words in index '%s': %6d words\n", indexf->line, word_count);

    DB_EndReadWords(sw, indexf->DB);
}

/****************************************************************************
*  Writes a word out to the index
*
*
****************************************************************************/

static void write_word_pos( SWISH *sw_input, IndexFILE *indexf, SWISH *sw_output, int *file_num_map, int filenum, ENTRY *e, int metaID, int posdata )
{
    int new_file;
    int new_meta;
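
    /* posdata packs the structural-context bits and the word position into a
    ** single int; GET_STRUCTURE() and GET_POSITION() below unpack them for
    ** addentry().
    */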

#ifdef DEBUG_MERGE
    printf("\nindex %s '%s' Struct: %d Pos: %d",
            indexf->line, e->word, GET_STRUCTURE(posdata), GET_POSITION(posdata) );


    if ( !(new_file = file_num_map[ filenum ]) )
    {
        printf("  file: %d **File deleted!**\n", filenum);
        return;
    }

    if ( !(new_meta = indexf->meta_map[ metaID ] ))
    {
        printf("  file: %d **Failed to map meta ID**\n", filenum);
        return;
    }

    printf("  File: %d -> %d  Meta: %d -> %d\n", filenum, new_file, metaID, new_meta );

    addentry( sw_output, e, new_file, GET_STRUCTURE(posdata), metaID, GET_POSITION(posdata) );

    return;


#else


    if ( !(new_file = file_num_map[ filenum ]) )
        return;

    if ( !(new_meta = indexf->meta_map[ metaID ] ))
        return;

    addentry( sw_output, e, new_file, GET_STRUCTURE(posdata), metaID, GET_POSITION(posdata) );

    return;

#endif
}

