/[MITgcm]/mitgcm.org/devel/buildweb/pkg/swish-e/src/index.c
ViewVC logotype

Annotation of /mitgcm.org/devel/buildweb/pkg/swish-e/src/index.c

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.1.1.1 - (hide annotations) (download) (vendor branch)
Fri Sep 20 19:47:29 2002 UTC (22 years, 10 months ago) by adcroft
Branch: Import, MAIN
CVS Tags: baseline, HEAD
Changes since 1.1: +0 -0 lines
File MIME type: text/plain
Importing web-site building process.

1 adcroft 1.1 /*
2     $Id: index.c,v 1.194 2002/08/29 13:59:48 jmruiz Exp $
3     **
4     ** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
5     ** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
6     **
7     ** This program and library is free software; you can redistribute it and/or
8     ** modify it under the terms of the GNU (Library) General Public License as published by the Free Software Foundation; either version 2
9     ** of the License, or any later version.
10     **
11     ** This program is distributed in the hope that it will be useful,
12     ** but WITHOUT ANY WARRANTY; without even the implied warranty of
13     ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14     ** GNU (Library) General Public License for more details.
15     **
16     ** You should have received a copy of the GNU (Library) General Public License
17     ** along with this program; if not, write to the Free Software
18     ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19     **--------------------------------------------------------------------
20     ** ** ** PATCHED 5/13/96, CJC
21     **
22     ** Added code to countwords and countwordstr to disregard the last char
23     ** if required by the config.h
24     ** G. Hill 3/12/97 ghill@library.berkeley.edu
25     **
26     ** Changed addentry, countwords, countwordstr, parsecomment, printindex
27     ** added createMetaEntryList, getMeta, parseMetaData
28     ** to support METADATA
29     ** G. Hill 3/18/97 ghill@library.berkeley.edu
30     **
31     ** Changed removestops to support printing of stop words
32     ** G. Hill 4/7/97
33     **
34     ** Changed countwords, countwordstr, and parseMetaData to disregard the
35     ** first char if required by the config.h
36     ** G.Hill 10/16/97 ghill@library.berkeley.edu
37     **
38     ** Added stripIgnoreLastChars and isIgnoreLastChar routines which iteratively
39     ** remove all ignore characters from the end of each word.
40     ** P. Bergner 10/5/97 bergner@lcse.umn.edu
41     **
42     ** Added stripIgnoreFirstChars and isIgnoreFirstChar to make stripping of
43     ** the ignore first chars iterative.
44     ** G. Hill 11/19/97 ghill@library.berkeley.edu
45     **
46     ** Added possibility of use of quotes and brackets in meta CONTENT countwords, parsemetadata
47     ** G. Hill 1/14/98
48     **
49     ** Added regex for replace rule G.Hill 1/98
50     **
51     ** REQMETANAME - don't index meta tags not specified in MetaNames
52     ** 10/11/99 - Bill Moseley
53     **
54     ** change sprintf to snprintf to avoid corruption, use MAXPROPLEN instead of literal "20",
55     ** added include of merge.h - missing declaration caused compile error in prototypes,
56     ** added word length arg to Stem() call for strcat overflow checking in stemmer.c
57     ** added safestrcpy() macro to avoid corruption from strcpy overflow
58     ** SRE 11/17/99
59     **
60     ** fixed misc problems pointed out by "gcc -Wall"
61     ** SRE 2/22/00
62     **
63     ** Added code for storing word positions in index file
64     ** Jose Ruiz 3/00 jmruiz@boe.es
65     **
66     ** 04/00 - Jose Ruiz
67     ** Added code for a hash table in index file for searching words
68     ** via getfileinfo in search.c (Lots of addons). Better performance
69     ** with big databases and/or heavy searches (a* or b* or c*)
70     **
71     ** 04/00 - Jose Ruiz
72     ** Improved number compression function (compress)
73     ** New number decompress function
74     ** Both converted into macros for better performance
75     **
76     ** 07/00 and 08/00 - Jose Ruiz
77     ** Many modifications to make some functions thread safe
78     **
79     ** 08/00 - Jose Ruiz
80     ** New function indexstring. Up to now there were 4 functions doing almost
81     ** the same thing: countwords, countwordstr, parseMetaData and parsecomment
82     ** From now on, these 4 functions call indexstring, which is the common part
83     ** to all of them. In fact, countwordstr, parseMetaData and parsecomment
84     ** are now simple frontends to indexstring
85     **
86     ** 2000-11 - rasc
87     ** some redesign, place common index code into a common routine
88     ** FileProp structures, routines
89     **
90     ** --
91     ** TODO
92     ** $$ there is still some redesign to be done.
93     ** $$ swish-e was originally designed to index html only. So the routines
94     ** $$ are scattered for historical reasons
95     ** $$ (e.g. isoktitle (), is ishtml() etc.)
96     **
97     ** 2000-12 Jose Ruiz
98     ** obsolete routine ishtml removed
99     ** isoktitle moved to html.c
100     **
101     ** 2001-03-02 rasc Header: write translatecharacters
102     ** 2001-03-14 rasc resultHeaderOutput -H n
103     ** 2001-03-24 rasc timeroutines rearranged
104     ** 2001-06-08 wsm Store word after ENTRY to save memory
105     ** 2001-08 jmruiz All locations stuff rewritten to save memory
106     **
107     */
108    
109     #include "swish.h"
110     #include "mem.h"
111     #include "string.h"
112     #include "index.h"
113     #include "hash.h"
114     #include "check.h"
115     #include "search.h"
116     #include "merge.h"
117     #include "docprop.h"
118     #include "stemmer.h"
119     #include "soundex.h"
120     #include "double_metaphone.h"
121     #include "error.h"
122     #include "file.h"
123     #include "compress.h"
124     /* Removed due to problems with patents
125     #include "deflate.h"
126     */
127     #include "html.h"
128     #include "xml.h"
129     #include "parser.h"
130     #include "txt.h"
131     #include "metanames.h"
132     #include "result_sort.h"
133     #include "result_output.h"
134     #include "filter.h"
135     #include "date_time.h"
136     #include "db.h"
137     #include "dump.h"
138     #include "swish_qsort.h"
139    
140     static void index_path_parts( SWISH *sw, char *path, path_extract_list *list, INDEXDATAHEADER *header, docProperties **properties );
141    
142    
143    
144     /*
145     -- init structures for this module
146     */
147    
148    
149     void initModule_Index (SWISH *sw)
150     {
151     int i;
152     struct MOD_Index *idx;
153    
154     idx = (struct MOD_Index *) emalloc(sizeof(struct MOD_Index));
155     memset( idx, 0, sizeof( struct MOD_Index ) );
156     sw->Index = idx;
157    
158     idx->filenum = 0;
159     idx->entryArray = NULL;
160    
161     idx->len_compression_buffer = MAXSTRLEN; /* For example */
162     idx->compression_buffer=(unsigned char *)emalloc(idx->len_compression_buffer);
163    
164     idx->len_worddata_buffer = MAXSTRLEN; /* For example */
165     idx->worddata_buffer=(unsigned char *)emalloc(idx->len_worddata_buffer);
166     idx->sz_worddata_buffer = 0;
167    
168     /* Init entries hash table */
169     for (i=0; i<VERYBIGHASHSIZE; i++)
170     {
171     idx->hashentries[i] = NULL;
172     idx->hashentriesdirty[i] = 0;
173     }
174    
175    
176     /* Economic flag and temp files*/
177     idx->swap_locdata = SWAP_LOC_DEFAULT;
178    
179    
180     for(i=0;i<BIGHASHSIZE;i++) idx->inode_hash[i]=NULL;
181    
182     /* initialize buffers used by indexstring */
183     idx->word = (char *) emalloc((idx->lenword = MAXWORDLEN) + 1);
184     idx->swishword = (char *) emalloc((idx->lenswishword = MAXWORDLEN) + 1);
185    
186     idx->plimit=PLIMIT;
187     idx->flimit=FLIMIT;
188     idx->nIgnoreLimitWords = 0;
189     idx->IgnoreLimitPositionsArray = NULL;
190    
191     /* Swapping access file functions */
192     idx->swap_tell = ftell;
193     idx->swap_write = fwrite;
194     idx->swap_close = fclose;
195     idx->swap_seek = fseek;
196     idx->swap_read = fread;
197     idx->swap_getc = fgetc;
198     idx->swap_putc = fputc;
199    
200     for( i = 0; i <MAX_LOC_SWAP_FILES ; i++)
201     {
202     idx->swap_location_name[i] = NULL;
203     idx->fp_loc_write[i] = NULL;
204     idx->fp_loc_read[i] = NULL;
205     }
206     /* Index in blocks of chunk_size documents */
207     idx->chunk_size = INDEX_DEFAULT_CHUNK_SIZE;
208    
209     /* Use this value to avoid using big zones just as a temporary location storage */
210     idx->optimalChunkLocZoneSize = INDEX_DEFAULT_OPTIMAL_CHUNK_ZONE_SIZE_FOR_LOCATIONS;
211    
212     idx->freeLocMemChain = NULL;
213    
214     /* memory zones for common structures */
215     idx->perDocTmpZone = Mem_ZoneCreate("Per Doc Temporal Zone", 0, 0);
216     idx->currentChunkLocZone = Mem_ZoneCreate("Current Chunk Locators", 0, 0);
217     idx->totalLocZone = Mem_ZoneCreate("All Locators", 0, 0);
218     idx->entryZone = Mem_ZoneCreate("struct ENTRY", 0, 0);
219    
220     /* table for storing which metaIDs to index */
221     idx->metaIDtable.max = 200; /* totally random guess */
222     idx->metaIDtable.num = 0;
223     idx->metaIDtable.array = (int *)emalloc( idx->metaIDtable.max * sizeof(int) );
224     idx->metaIDtable.defaultID = -1;
225    
226    
227     /* $$$ this is only a fix while http.c and httpserver.c still exist */
228     idx->tmpdir = estrdup(".");
229    
230     return;
231     }
232    
233    
234     /*
235     -- release all wired memory for this module
236     -- 2001-04-11 rasc
237     */
238    
239     void freeModule_Index (SWISH *sw)
240     {
241     struct MOD_Index *idx = sw->Index;
242     int i;
243    
244     /* we need to call the real free here */
245    
246     for( i = 0; i < MAX_LOC_SWAP_FILES ; i++)
247     {
248     if (idx->swap_location_name[i] && isfile(idx->swap_location_name[i]))
249     {
250     if (idx->fp_loc_read[i])
251     idx->swap_close(idx->fp_loc_read[i]);
252    
253     if (idx->fp_loc_write[i])
254     idx->swap_close(idx->fp_loc_write[i]);
255    
256     remove(idx->swap_location_name[i]);
257     }
258    
259    
260     if (idx->swap_location_name[i])
261     efree(idx->swap_location_name[i]);
262     }
263    
264     if(idx->tmpdir) efree(idx->tmpdir);
265    
266     /* Free compression buffer */
267     efree(idx->compression_buffer);
268     /* free worddata buffer */
269     efree(idx->worddata_buffer);
270    
271     /* free word buffers used by indexstring */
272     efree(idx->word);
273     efree(idx->swishword);
274    
275     /* free IgnoreLimit stuff */
276     if(idx->IgnoreLimitPositionsArray)
277     {
278     for(i=0; i<sw->indexlist->header.totalfiles; i++)
279     {
280     if(idx->IgnoreLimitPositionsArray[i])
281     {
282     efree(idx->IgnoreLimitPositionsArray[i]->pos);
283     efree(idx->IgnoreLimitPositionsArray[i]);
284     }
285     }
286     efree(idx->IgnoreLimitPositionsArray);
287     }
288    
289     /* should be free by now!!! But just in case... */
290     if (idx->entryZone)
291     Mem_ZoneFree(&idx->entryZone);
292    
293     if (idx->totalLocZone)
294     Mem_ZoneFree(&idx->totalLocZone);
295     if (idx->currentChunkLocZone)
296     Mem_ZoneFree(&idx->currentChunkLocZone);
297     if (idx->perDocTmpZone)
298     Mem_ZoneFree(&idx->perDocTmpZone);
299    
300    
301     if ( idx->entryArray )
302     efree( idx->entryArray);
303    
304    
305     efree( idx->metaIDtable.array );
306    
307     /* free module data */
308     efree (idx);
309     sw->Index = NULL;
310    
311    
312     return;
313     }
314    
315    
316     /*
317     ** ----------------------------------------------
318     **
319     ** Module config code starts here
320     **
321     ** ----------------------------------------------
322     */
323    
324    
325     /*
326     -- Config Directives
327     -- Configuration directives for this Module
328     -- return: 0/1 = none/config applied
329     */
330    
331     int configModule_Index (SWISH *sw, StringList *sl)
332    
333     {
334     struct MOD_Index *idx = sw->Index;
335     char *w0 = sl->word[0];
336     int retval = 1;
337     char *env_tmp = NULL;
338    
339     if (strcasecmp(w0, "tmpdir") == 0)
340     {
341     if (sl->n == 2)
342     {
343     idx->tmpdir = erealloc( idx->tmpdir, strlen( sl->word[1] ) + 1 );
344     strcpy( idx->tmpdir, sl->word[1] );
345     normalize_path( idx->tmpdir );
346    
347     if (!isdirectory(idx->tmpdir))
348     progerr("%s: %s is not a directory", w0, idx->tmpdir);
349    
350     if ( !( env_tmp = getenv("TMPDIR")) )
351     if ( !(env_tmp = getenv("TMP")) )
352     env_tmp = getenv("TEMP");
353    
354     if ( env_tmp )
355     progwarn("Configuration setting for TmpDir '%s' will be overridden by environment setting '%s'", idx->tmpdir, env_tmp );
356    
357    
358     }
359     else
360     progerr("%s: requires one value", w0);
361     }
362     else if (strcasecmp(w0, "IgnoreLimit") == 0)
363     {
364     if (sl->n == 3)
365     {
366     idx->plimit = atol(sl->word[1]);
367     idx->flimit = atol(sl->word[2]);
368     }
369     else
370     progerr("%s: requires two values", w0);
371     }
372     else
373     {
374     retval = 0; /* not a module directive */
375     }
376     return retval;
377     }
378    
379     /**************************************************************************
380     * Remove a file from the index. Used when the parser aborts
381     * while indexing. Typically because of FileRules.
382     *
383     **************************************************************************/
384    
385    
386     static void remove_last_file_from_list(SWISH * sw, IndexFILE * indexf)
387     {
388     struct MOD_Index *idx = sw->Index;
389     int i;
390     ENTRY *ep, *prev_ep;
391     LOCATION *l;
392    
393     /* Decrease filenum */
394     idx->filenum--;
395     indexf->header.totalfiles--;
396    
397     /* Should be removed */
398     if(idx->filenum < 0 || indexf->header.totalfiles < 0)
399     progerr("Internal error in remove_last_file_from_list");
400    
401    
402     /* walk the hash list to remove words */
403     for (i = 0; i < VERYBIGHASHSIZE; i++)
404     {
405     if (idx->hashentriesdirty[i])
406     {
407     idx->hashentriesdirty[i] = 0;
408     for (ep = idx->hashentries[i], prev_ep =NULL; ep; ep = ep->next)
409     {
410     if(ep->currentChunkLocationList)
411     {
412     /* First of all - Adjust tfrequency */
413     for(l = ep->currentChunkLocationList; l; l = l->next)
414     {
415     ep->tfrequency--;
416     }
417     /* Remove locations */
418     /* Do not use efree, locations uses a MemZone (currentChunkLocZone) */
419     /* Will be freed later */
420     ep->currentChunkLocationList = NULL;
421     ep->currentlocation = NULL;
422     /* If there is no locations we must also remove the word */
423     /* Do not call efree to remove the entry, entries use
424     ** a MemZone (perDocTmpZone) - Will be freed later */
425     if(!ep->allLocationList)
426     {
427     if(!prev_ep)
428     {
429     idx->hashentries[i] = ep->next;
430     }
431     else
432     {
433     prev_ep->next = ep->next;
434     }
435     /* Adjust word counters */
436     idx->entryArray->numWords--;
437     indexf->header.totalwords--;
438     }
439     }
440     else
441     {
442     prev_ep = ep;
443     }
444     }
445     }
446     }
447     }
448    
449    
450    
451     /**************************************************************************
452     * Index just the file name (or the title) for NoContents files
453     * $$$ this can be removed if libxml2 is used full time
454     **************************************************************************/
455     static int index_no_content(SWISH * sw, FileProp * fprop, FileRec *fi, char *buffer)
456     {
457     struct MOD_Index *idx = sw->Index;
458     char *title = "";
459     int n;
460     int position = 1; /* Position of word */
461     int metaID = 1; /* THIS ASSUMES that that's the default ID number */
462    
463    
464     /* Look for title if HTML document */
465    
466     if (fprop->doctype == HTML)
467     {
468     title = parseHTMLtitle( sw , buffer );
469    
470     if (!isoktitle(sw, title))
471     return -2; /* skipped because of title */
472     }
473    
474    
475     #ifdef HAVE_LIBXML2
476     if (fprop->doctype == HTML2)
477     return parse_HTML( sw, fprop, fi, buffer );
478     #endif
479    
480    
481     addCommonProperties( sw, fprop, fi, title, NULL, 0 );
482    
483    
484     n = indexstring( sw, *title == '\0' ? fprop->real_path : title , idx->filenum, IN_FILE, 1, &metaID, &position);
485    
486    
487     /** ??? $$$ doesn't look right -- check this ***/
488     if ( *title != '\0' )
489     efree( title );
490    
491     return n;
492     }
493    
494    
495     /*********************************************************************
496     ** 2001-08 jmruiz - A couple of specialized routines to be used with
497     ** locations and MemZones. The main goal is to avoid malloc/realloc/free,
498     ** which produce a lot of fragmentation.
499     **
500     ** The memory is allocated in blocks of LOC_BLOCK_SIZE bytes inside a zone.
501     ** (I have tried both 32 and 64; 32 looks fine.)
502     ** In this way, there is some overhead because when a new block is
503     ** requested from the MemZone, the space is not recovered. But this
504     ** is only true for the current document because the MemZone is reset
505     ** once the document is processed. Then, the space is recovered
506     ** after a MemZoneReset is issued.
507     **
508     ** 2001-09 jmruiz Improved. Now unused space is recovered when asking
509     ** for space. Free blocks are maintained using a linked list.
510     ********************************************************************/
511    
/* Granularity, in bytes, used when sizing LOCATION blocks. */
512     #define LOC_BLOCK_SIZE 32 /* Must be greater than sizeof(LOCATION) and a power of 2 */
/* sizeof(LOCATION) rounded up to the next multiple of LOC_BLOCK_SIZE
** (the mask trick relies on LOC_BLOCK_SIZE being a power of 2).
** This is the size new_location() requests for a fresh LOCATION. */
513     #define LOC_MIN_SIZE ((sizeof(LOCATION) + LOC_BLOCK_SIZE - 1) & (~(LOC_BLOCK_SIZE - 1)))
514    
/* Header written over a released LOCATION block while it sits on the
** free chain (idx->freeLocMemChain): add_position_location() fills it in,
** alloc_location() walks it. */
515     struct loc_chain {
516     struct loc_chain *next; /* next released block in the chain, or NULL */
517     int size; /* size of this released block, in bytes */
518     };
519    
520     /********************************************************************
521     ** 2001-08 jmruiz
522     ** Routine to allocate memory inside a zone for a LOCATION block of
523     ** `size` bytes. Callers in this file ask for LOC_MIN_SIZE (new_location)
524     ** or for twice the current size (add_position_location), so some of
525     ** the space is lost. The advantage is that we do not need to
526     ** reallocate so often: most realloc functions work this way, asking
527     ** for more memory to avoid the sequence malloc, memcpy, free.
**
** Strategy, in order:
**   1. walk idx->freeLocMemChain; a released block of exactly `size`
**      bytes is unlinked and returned;
**   2. while walking, a run of smaller blocks that are adjacent in
**      memory and add up to exactly `size` is unlinked and returned;
**   3. otherwise, if a larger released block was seen, it is halved
**      repeatedly and the upper half is returned once it is `size` bytes;
**   4. otherwise fresh memory is taken from idx->currentChunkLocZone.
**
**   idx  - indexing module state (free chain and zone)
**   size - number of bytes wanted
**
** Returns the block, cast to LOCATION *.  The content is not initialised
** by this function.
528     ********************************************************************/
529    
530     LOCATION *alloc_location(struct MOD_Index *idx,int size)
531     {
532     struct loc_chain *tmp = (struct loc_chain *) idx->freeLocMemChain;
533     struct loc_chain *big = NULL;
/* tmp2 is the predecessor of tmp in the chain (NULL while tmp is the head).
** It is typed LOCATION * and updated through tmp2->next; presumably LOCATION
** starts with its `next` pointer so that it overlays loc_chain.next - verify. */
534     LOCATION *tmp2 = NULL;
535     int avail = 0;
536     struct loc_chain *p_avail = NULL;
537    
538     /* Search for a previously freed location of the same size */
539     while(tmp)
540     {
541     if(tmp->size == size)
542     {
/* Exact fit: unlink tmp from the chain and hand it out */
543     if(!tmp2)
544     idx->freeLocMemChain = (LOCATION *)tmp->next;
545     else
546     tmp2->next = (LOCATION *)tmp->next;
547     return (LOCATION *)tmp;
548     }
549     else if(tmp->size > size)
550     {
551     /* Just reserve it to be used if we do not find a match */
/* (a later, larger block overwrites an earlier one) */
552     big = tmp;
553     }
554     else
555     {
/* Smaller block: remember where the run starts and how much it holds */
556     p_avail = tmp;
557     avail = tmp->size;
558     /* Check for consecutive blocks: the next chain entry must start
** exactly where this one ends in memory */
559     while(((unsigned char *)tmp + tmp->size) == (unsigned char *)tmp->next)
560     {
561     avail += tmp->next->size;
562     if(avail == size)
563     {
/* The run p_avail .. tmp->next adds up exactly: unlink the whole
** run (tmp2 still precedes p_avail) and return its first block */
564     if(!tmp2)
565     idx->freeLocMemChain = (LOCATION *)tmp->next->next;
566     else
567     tmp2->next = (LOCATION *)tmp->next->next;
568     return (LOCATION *)p_avail;
569     }
570     else if(avail > size)
571     {
/* The run overshoots - give up on it */
572     break;
573     }
574     else
575     {
/* Still too small - extend the run by one more block */
576     tmp = tmp->next;
577     }
578     }
579     }
/* Advance; tmp may have moved forward inside the run above */
580     tmp2 = (LOCATION *)tmp;
581     tmp = tmp->next;
582     }
583     /* Perhaps we have a block with greater size */
584     if(big)
585     {
586     /* Split it */
587     while(big->size > size)
588     {
/* Halve the block: `big` keeps the lower half, `tmp` is the upper half */
589     big->size >>= 1;
590     tmp = (struct loc_chain *) ((unsigned char *)big + big->size);
591     tmp->next = big->next;
592     tmp->size = big->size;
/* Upper half fits: return it without linking it into the chain;
** the lower half stays on the chain with its reduced size */
593     if(tmp->size == size)
594     return (LOCATION *)tmp;
/* Otherwise chain the upper half after the lower one and keep splitting it */
595     big->next = tmp;
596     big = tmp;
597     }
598     }
599     /* NO memory in free chain of the same size - Asks for size */
600     return (LOCATION *)Mem_ZoneAlloc(idx->currentChunkLocZone, size);
601     }
602    
603    
604     LOCATION *new_location(struct MOD_Index *idx)
605     {
606     return (LOCATION *)alloc_location(idx, LOC_MIN_SIZE);
607     }
608    
609    
610     int is_location_full(int size)
611     {
612     int i;
613    
614     /* Fast test. Since LOC_BLOCK_SIZE is the minimum size ... */
615     if(size % LOC_BLOCK_SIZE)
616     return 0; /* it is not a power of two */
617     /* Check if size is a power of 2 (32,64,128,256,...) in binary ..000100... */
618     for(i=LOC_BLOCK_SIZE;;i <<= 1)
619     {
620     if(size>i)
621     {
622     continue;
623     }
624     if((size & i) == size)
625     {
626     return 1;
627     }
628     else
629     {
630     break;
631     }
632     }
633     return 0;
634     }
635    
636     /********************************************************************
637     ** 2001-08 jmruiz
638     ** Routine to reallocate memory inside a zone for a previous allocated
639     ** LOCATION (frequency > 1).
640     ** A new block is allocated only if the previous becomes full
641     ********************************************************************/
642     LOCATION *add_position_location(void *oldp, struct MOD_Index *idx, int frequency)
643     {
644     LOCATION *newp = NULL;
645     struct loc_chain *tmp = NULL;
646     int oldsize;
647    
648     oldsize = sizeof(LOCATION) + (frequency - 1) * sizeof(int);
649    
650     /* Check for available size in block */
651     if(is_location_full(oldsize))
652     {
653     /* Not enough size - Allocate a new block. Size rounded to LOC_BLOCK_SIZE */
654     newp = (LOCATION *)alloc_location(idx,oldsize << 1);
655     memcpy((void *)newp,(void *)oldp,oldsize);
656     /* Add old zone to the free chain of blocks */
657     tmp = (struct loc_chain *)oldp;
658     tmp->next = (struct loc_chain *)idx->freeLocMemChain;
659     tmp->size = oldsize;
660     idx->freeLocMemChain = (LOCATION *) tmp;
661     }
662     else
663     /* Enough size */
664     newp = oldp;
665    
666     return newp;
667     }
668    
669     /***********************************************************************
670     -- Start the real indexing process for a file.
671     -- This routine will be called by the different indexing methods
672     -- (httpd, filesystem, etc.)
673     -- The indexed file may be the
674     -- - real file on filesystem
675     -- - tmpfile or work file (shadow of the real file)
676     -- Checks if the file has to be sent through a filter (file stream)
677     -- 2000-11-19 rasc
--
-- sw    - swish handle
-- fprop - properties of the file to index (paths, type, size, stream)
--
-- Steps: optional mtime check, open the input (filter, file, or an
-- already open stream), apply ReplaceRules to the path, read the content
-- unless a stream parser is used, pick the parser, bump the file counters,
-- parse, close/flush the input, then either undo the file (parser returned
-- -2 or -3), stop (any other negative count) or write properties, record
-- the word count and compress/coalesce the word locations.
678     ***********************************************************************/
679    
680     void do_index_file(SWISH * sw, FileProp * fprop)
681     {
/* Parser entry point selected below from fprop->doctype */
682     int (*countwords)(SWISH *sw,FileProp *fprop, FileRec *fi, char *buffer);
683     IndexFILE *indexf = sw->indexlist;
684     int wordcount;
685     char *rd_buffer = NULL; /* complete file read into buffer */
686     struct MOD_Index *idx = sw->Index;
687     char strType[30];
688     int i;
689     FileRec fi; /* place to hold doc properties */
690    
691     memset( &fi, 0, sizeof( FileRec ) );
692    
693    
694     wordcount = -1;
695    
696    
697    
698     /* skip the file if its last_mod date is older than the mtime limit */
699    
700     if (sw->mtime_limit && fprop->mtime < sw->mtime_limit)
701     {
702     if (sw->verbose >= 3)
703     progwarn("Skipping %s: last_mod date is too old\n", fprop->real_path);
704    
705     /* external program must seek past this data (fseek fails) */
706     if (fprop->fp)
707     flush_stream( fprop );
708    
709     return;
710     }
711    
712    
713     /* Upon entry, if fprop->fp is non-NULL then it's already opened and ready to be read from.
714     This is the case with "prog" external programs, *except* when a filter is selected for the file type.
715     If a filter is used with "prog" a temporary file was created (fprop->work_file), and
716     fprop->fp will be NULL (as is with http and fs access methods).
717     2001-05-13 moseley
718     */
719    
720    
721    
722     /* Get input file handle */
723     if (fprop->hasfilter)
724     {
725     fprop->fp = FilterOpen(fprop);
726    
727     /* This should be checked in filteropen because the popen probably won't fail */
728     if ( !fprop->fp )
729     progerr("Failed to open filter for file '%s'",fprop->real_path);
730     }
731    
732     else if ( !fprop->fp )
733     {
734     fprop->fp = fopen(fprop->work_path, F_READ_TEXT );
735    
736     if ( !fprop->fp )
737     {
738     progwarnno("Failed to open: '%s': ", fprop->work_path);
739     return;
740     }
741     }
742     else /* Already open - flag to prevent closing the stream used with "prog" */
743     fprop->external_program++;
744    
745    
746    
747    
748     /** Replace the path for ReplaceRules **/
749    
750     if ( sw->replaceRegexps )
751     {
/* `matched` is filled in by process_regex_list but not used afterwards */
752     int matched = 0;
753     fprop->real_path = process_regex_list( fprop->real_path, sw->replaceRegexps, &matched );
754     }
755    
756    
757    
758     /** Read the buffer, if not a stream parser **/
/* The libxml2 based types (HTML2, XML2, TXT2) get a NULL buffer */
759     #ifdef HAVE_LIBXML2
760     if ( fprop->doctype == HTML2 || fprop->doctype == XML2 || fprop->doctype == TXT2 )
761     rd_buffer = NULL;
762     else
763     #endif
764     /* -- Read all data (len = 0 if filtered...) */
/* NOTE(review): rd_buffer is not released anywhere in this function -
** presumably read_stream or the parser owns it; confirm. */
765     rd_buffer = read_stream(sw, fprop->real_path, fprop->fp, (fprop->hasfilter) ? 0 : fprop->fsize, sw->truncateDocSize);
766    
767    
768     /* just for fun so we can show total bytes shown */
769     sw->indexlist->total_bytes += fprop->fsize;
770    
771    
772     /* Set which parser to use */
/* strType is only the label printed at verbose >= 3 */
773    
774     switch (fprop->doctype)
775     {
776    
777     case TXT:
778     strcpy(strType,"TXT");
779     countwords = countwords_TXT;
780     break;
781    
782     case HTML:
783     strcpy(strType,"HTML");
784     countwords = countwords_HTML;
785     break;
786    
787     case XML:
788     strcpy(strType,"XML");
789     countwords = countwords_XML;
790     break;
791    
792     #ifdef HAVE_LIBXML2
793     case XML2:
794     strcpy(strType,"XML2");
795     countwords = parse_XML;
796     break;
797    
798     case HTML2:
799     strcpy(strType,"HTML2");
800     countwords = parse_HTML;
801     break;
802    
803     case TXT2:
804     strcpy(strType,"TXT2");
805     countwords = parse_TXT;
806     break;
807     #endif
808    
/* WML documents go through the HTML parser */
809     case WML:
810     strcpy(strType,"WML");
811     countwords = countwords_HTML;
812     break;
813    
/* Any other type falls back to the HTML parser */
814     default:
815     strcpy(strType,"DEFAULT (HTML)");
816     countwords = countwords_HTML;
817     break;
818     }
819    
820     if (sw->verbose >= 3)
821     printf(" - Using %s parser - ",strType);
822    
823    
824     /* Check for NoContents flag and just save the path name */
825     /* $$$ Note, really need to only read_stream if reading from a pipe. */
826     /* $$$ waste of disk IO and memory if reading from file system */
/* This overrides the parser chosen above (the printed label is unchanged) */
827    
828     if (fprop->index_no_content)
829     countwords = index_no_content;
830    
831    
832     /* Make sure all meta flags are cleared (in case a parser aborts) */
833     ClearInMetaFlags( &indexf->header );
834    
835    
836    
837    
838     /* Now bump the file counter */
/* remove_last_file_from_list() undoes both increments if the parser aborts */
839     idx->filenum++;
840     indexf->header.totalfiles++; /* why ??? is this needed */
841     fi.filenum = idx->filenum;
842    
843     /** PARSE **/
/* Result: number of words, or a negative code (see the messages printed below) */
844     wordcount = countwords(sw, fprop, &fi, rd_buffer);
845    
846    
847    
848    
849    
850     if (!fprop->external_program) /* external_program is not set if a filter is in use */
851     {
852     if (fprop->hasfilter)
853     FilterClose(fprop->fp); /* close filter pipe - should the filter be flushed? */
854     else
855     fclose(fprop->fp); /* close file */
856     }
857     /* Else, it's -S prog so make sure we read all the bytes we are supposed to read! */
858     /* Can remove the check for fprop->bytes_read once read_stream is no longer used */
859    
860     else if ( fprop->bytes_read && fprop->bytes_read < fprop->fsize )
861     flush_stream( fprop );
862    
863    
/* Report the outcome; the negative codes are explained by the messages */
864     if (sw->verbose >= 3)
865     {
866     if (wordcount > 0)
867     printf(" (%d words)\n", wordcount);
868     else if (wordcount == 0)
869     printf(" (no words indexed)\n");
870     else if (wordcount == -1)
871     printf(" (not opened)\n");
872     else if (wordcount == -2)
873     printf(" (Skipped due to 'FileRules title' setting)\n");
874     else if (wordcount == -3)
875     printf(" (Skipped due to Robots Excluion Rule in meta tag)\n");
876     fflush(stdout);
877     }
878    
879    
880     /* If indexing aborted, remove the last file entry */
/* NOTE(review): this return and the next one happen before perDocTmpZone
** is reset at the end of the function - confirm that is intended. */
881     if ( wordcount == -3 || wordcount == -2 )
882     {
883     remove_last_file_from_list( sw, indexf );
884     return;
885     }
886    
887    
888     /* Stop here if the file was not indexed (any other negative count);
** the file counters bumped above are left as they are */
889     if ( wordcount < 0 )
890     return;
891    
892    
893     if ( DEBUG_MASK & DEBUG_PROPERTIES )
894     dump_file_properties( indexf, &fi );
895    
896    
897     /* write properties to disk, and release docprop array (and the prop index array) */
898     /* Currently this just passes sw, and assumes only one index file when indexing */
899     WritePropertiesToDisk( sw , &fi );
900    
901    
902     /* Save total words per file */
903     if ( !indexf->header.ignoreTotalWordCountWhenRanking )
904     {
905    
/* filenum counts from 1 here (incremented before parsing), hence the - 1 */
906     setTotalWordsPerFile(sw, indexf, fi.filenum - 1,wordcount);
907     }
908    
909    
910    
911    
912     /* Compress the entries */
913     {
914     ENTRY *ep;
915    
916     /* walk the hash list, and compress entries */
/* Only buckets flagged dirty (by getentry) while parsing this file are visited */
917     for (i = 0; i < VERYBIGHASHSIZE; i++)
918     {
919     if (idx->hashentriesdirty[i])
920     {
921     idx->hashentriesdirty[i] = 0;
922     for (ep = idx->hashentries[i]; ep; ep = ep->next)
923     CompressCurrentLocEntry(sw, indexf, ep);
924     }
925     }
926    
927     /* Coalesce word positions into a more optimal schema to avoid keeping the location data contiguous */
/* Done every chunk_size files, or as soon as the current chunk zone grows
** beyond optimalChunkLocZoneSize; every bucket is visited, dirty or not */
928     if(idx->filenum && ((!(idx->filenum % idx->chunk_size)) || (Mem_ZoneSize(idx->currentChunkLocZone) > idx->optimalChunkLocZoneSize)))
929     {
930     for (i = 0; i < VERYBIGHASHSIZE; i++)
931     for (ep = idx->hashentries[i]; ep; ep = ep->next)
932     coalesce_word_locations(sw, indexf, ep);
933     /* Make zone available for reuse */
/* The free chain points into the zone just reset, so it is emptied as well */
934     Mem_ZoneReset(idx->currentChunkLocZone);
935     idx->freeLocMemChain = NULL;
936    
937     }
938     }
939    
940    
941     /* Make zone available for reuse */
942     Mem_ZoneReset(idx->perDocTmpZone);
943    
944    
945     return;
946     }
947    
948    
949     ENTRY *getentry(SWISH * sw, char *word)
950     {
951     IndexFILE *indexf = sw->indexlist;
952     struct MOD_Index *idx = sw->Index;
953     int hashval;
954     ENTRY *e;
955    
956     if (!idx->entryArray)
957     {
958     idx->entryArray = (ENTRYARRAY *) emalloc(sizeof(ENTRYARRAY));
959     idx->entryArray->numWords = 0;
960     idx->entryArray->elist = NULL;
961     }
962     /* Compute hash value of word */
963     hashval = verybighash(word);
964    
965    
966     /* Look for the word in the hash array */
967     for (e = idx->hashentries[hashval]; e; e = e->next)
968     if (strcmp(e->word, word) == 0)
969     break;
970    
971     /* flag hash entry used this file, so that the locations can be "compressed" in do_index_file */
972     idx->hashentriesdirty[hashval] = 1;
973    
974    
975     /* Word found, return it */
976     if (e)
977     return e;
978    
979     /* Word not found, so create a new word */
980    
981     e = (ENTRY *) Mem_ZoneAlloc(idx->entryZone, sizeof(ENTRY) + strlen(word));
982     strcpy(e->word, word);
983     e->next = idx->hashentries[hashval];
984     idx->hashentries[hashval] = e;
985    
986     /* Init values */
987     e->tfrequency = 0;
988     e->u1.last_filenum = 0;
989     e->currentlocation = NULL;
990     e->currentChunkLocationList = NULL;
991     e->allLocationList = NULL;
992    
993     idx->entryArray->numWords++;
994     indexf->header.totalwords++;
995    
996     return e;
997     }
998    
999     /* Adds a word to the master index tree.
1000     */
1001    
1002     void addentry(SWISH * sw, ENTRY *e, int filenum, int structure, int metaID, int position)
1003     {
1004     int found;
1005     LOCATION *tp, *newtp, *prevtp;
1006     IndexFILE *indexf = sw->indexlist;
1007     struct MOD_Index *idx = sw->Index;
1008    
1009    
1010     indexf->total_word_positions++;
1011    
1012     if ( DEBUG_MASK & DEBUG_WORDS )
1013     {
1014     struct metaEntry *m = getMetaNameByID(&indexf->header, metaID);
1015    
1016     printf(" Adding:[%d:%s(%d)] '%s' Pos:%d Stuct:0x%0X (", filenum, m ? m->metaName : "PROP_UNKNOWN", metaID, e->word, position, structure);
1017    
1018     if ( structure & IN_EMPHASIZED ) printf(" EM");
1019     if ( structure & IN_HEADER ) printf(" HEADING");
1020     if ( structure & IN_COMMENTS ) printf(" COMMENT");
1021     if ( structure & IN_META ) printf(" META");
1022     if ( structure & IN_BODY ) printf(" BODY");
1023     if ( structure & IN_HEAD ) printf(" HEAD");
1024     if ( structure & IN_TITLE ) printf(" TITLE");
1025     if ( structure & IN_FILE ) printf(" FILE");
1026     printf(" )\n");
1027     }
1028    
1029    
1030     /* Check for first time */
1031     if(!e->tfrequency)
1032     {
1033     /* create a location record */
1034     tp = (LOCATION *) new_location(idx);
1035     tp->filenum = filenum;
1036     tp->frequency = 1;
1037     tp->metaID = metaID;
1038     tp->posdata[0] = SET_POSDATA(position,structure);
1039     tp->next = NULL;
1040    
1041     e->currentChunkLocationList = tp;
1042     e->tfrequency = 1;
1043     e->u1.last_filenum = filenum;
1044    
1045     return;
1046     }
1047    
1048     /* Word found -- look for same metaID and filename */
1049     /* $$$ To do it right, should probably compare the structure, too */
1050     /* Note: filename not needed due to compress we are only looking at the current file */
1051     /* Oct 18, 2001 -- filename is needed since merge adds words in non-filenum order */
1052    
1053     tp = e->currentChunkLocationList;
1054     found = 0;
1055    
1056     while (tp != e->currentlocation)
1057     {
1058     if(tp->metaID == metaID && tp->filenum == filenum )
1059     {
1060     found =1;
1061     break;
1062     }
1063     tp = tp->next;
1064     }
1065    
1066     /* matching metaID NOT found. So, add a new LOCATION record onto the word */
1067     /* This expands the size of the location array for this word by one */
1068    
1069     if(!found)
1070     {
1071     /* create the new LOCATION entry */
1072     tp = (LOCATION *) new_location(idx);
1073     tp->filenum = filenum;
1074     tp->frequency = 1; /* count of times this word in this file:metaID */
1075     tp->metaID = metaID;
1076     tp->posdata[0] = SET_POSDATA(position,structure);
1077    
1078     /* add the new LOCATION onto the array */
1079     tp->next = e->currentChunkLocationList;
1080     e->currentChunkLocationList = tp;
1081    
1082     /* Count number of different files that this word is used in */
1083     if ( e->u1.last_filenum != filenum )
1084     {
1085     e->tfrequency++;
1086     e->u1.last_filenum = filenum;
1087     }
1088    
1089     return; /* all done */
1090     }
1091    
1092    
1093     /* Otherwise, found matching LOCATION record (matches filenum and metaID) */
1094     /* Just add the position number onto the end by expanding the size of the LOCATION record */
1095    
1096     /* 2001/08 jmruiz - Much better memory usage occurs if we use MemZones */
1097     /* MemZone will be reset when the doc is completely proccesed */
1098    
1099     newtp = add_position_location(tp, idx, tp->frequency);
1100    
1101     if(newtp != tp)
1102     {
1103     if(e->currentChunkLocationList == tp)
1104     e->currentChunkLocationList = newtp;
1105     else
1106     for(prevtp = e->currentChunkLocationList;;prevtp = prevtp->next)
1107     {
1108     if(prevtp->next == tp)
1109     {
1110     prevtp->next = newtp;
1111     break;
1112     }
1113     }
1114     tp = newtp;
1115     }
1116    
1117     tp->posdata[tp->frequency++] = SET_POSDATA(position,structure);
1118    
1119     }
1120    
1121    
1122     /*******************************************************************
1123     * Adds common file properties to the last entry in the file array
1124     * (which should be the current one)
1125     *
1126     *
1127     * Call with:
1128     * *SWISH - need for indexing words
1129     * *fprop
1130     * *fi
1131     * *summary - document summary (why here?)
1132     * start - start position of a sub-document
1133     * size - size in bytes of document
1134     *
1135     * Returns:
1136     * void
1137     *
1138     * Note:
1139     * Uses cached meta entries (created in metanames.c) to save the
1140     * metaEntry lookup by name costs
1141     *
1142     ********************************************************************/
1143    
1144     void addCommonProperties( SWISH *sw, FileProp *fprop, FileRec *fi, char *title, char *summary, int start )
1145     {
1146     struct metaEntry *q;
1147     docProperties **properties = &fi->docProperties;
1148     unsigned long tmp;
1149     int metaID;
1150     INDEXDATAHEADER *header = &sw->indexlist->header;
1151     char *filename = fprop->real_path; /* should always have a path */
1152     int filenum = fi->filenum;
1153    
1154    
1155    
1156     /* Check if filename is internal swish metadata -- should be! */
1157    
1158     if ((q = getPropNameByName(header, AUTOPROPERTY_DOCPATH)))
1159     addDocProperty( properties, q, (unsigned char *)filename, strlen(filename),0);
1160    
1161    
1162     /* Perhaps we want it to be indexed ... */
1163     if ((q = getMetaNameByName(header, AUTOPROPERTY_DOCPATH)))
1164     {
1165     int metaID,
1166     positionMeta;
1167    
1168     metaID = q->metaID;
1169     positionMeta = 1;
1170     indexstring(sw, filename, filenum, IN_FILE, 1, &metaID, &positionMeta);
1171     }
1172    
1173    
1174     /* This allows extracting out parts of a path and indexing as a separate meta name */
1175     if ( sw->pathExtractList )
1176     index_path_parts( sw, fprop->orig_path, sw->pathExtractList, header, properties );
1177    
1178    
1179    
1180     /* Check if title is internal swish metadata */
1181     if ( title )
1182     {
1183     if ( (q = getPropNameByName(header, AUTOPROPERTY_TITLE)))
1184     addDocProperty(properties, q, (unsigned char *)title, strlen(title),0);
1185    
1186    
1187     /* Perhaps we want it to be indexed ... */
1188     if ( (q = getMetaNameByName(header, AUTOPROPERTY_TITLE)))
1189     {
1190     int positionMeta;
1191    
1192     metaID = q->metaID;
1193     positionMeta = 1;
1194     indexstring(sw, title, filenum, IN_FILE, 1, &metaID, &positionMeta);
1195     }
1196     }
1197    
1198    
1199     if ( summary )
1200     {
1201     if ( (q = getPropNameByName(header, AUTOPROPERTY_SUMMARY)))
1202     addDocProperty(properties, q, (unsigned char *)summary, strlen(summary),0);
1203    
1204    
1205     if ( (q = getMetaNameByName(header, AUTOPROPERTY_SUMMARY)))
1206     {
1207     int metaID,
1208     positionMeta;
1209    
1210     metaID = q->metaID;
1211     positionMeta = 1;
1212     indexstring(sw, summary, filenum, IN_FILE, 1, &metaID, &positionMeta);
1213     }
1214     }
1215    
1216    
1217    
1218     /* Currently don't allow indexing by date or size or position */
1219    
1220     /* mtime is a time_t, but we don't have an entry for NOT A TIME. Does anyone care about the first second of 1970? */
1221    
1222     if ( fprop->mtime && (q = getPropNameByName(header, AUTOPROPERTY_LASTMODIFIED)))
1223     {
1224     tmp = (unsigned long) fprop->mtime;
1225     tmp = PACKLONG(tmp); /* make it portable */
1226     addDocProperty(properties, q, (unsigned char *) &tmp, sizeof(tmp),1);
1227     }
1228    
1229     if ( (q = getPropNameByName(header, AUTOPROPERTY_DOCSIZE)))
1230     {
1231     tmp = (unsigned long) fprop->fsize;
1232     tmp = PACKLONG(tmp); /* make it portable */
1233     addDocProperty(properties, q, (unsigned char *) &tmp, sizeof(tmp),1);
1234     }
1235    
1236    
1237     if ( (q = getPropNameByName(header, AUTOPROPERTY_STARTPOS)))
1238     {
1239     tmp = (unsigned long) start;
1240     tmp = PACKLONG(tmp); /* make it portable */
1241     addDocProperty(properties, q, (unsigned char *) &tmp, sizeof(tmp),1);
1242     }
1243    
1244     }
1245    
1246    
1247     /*******************************************************************
1248     * extracts out parts from a path name and indexes that part
1249     *
1250     ********************************************************************/
1251     static void index_path_parts( SWISH *sw, char *path, path_extract_list *list, INDEXDATAHEADER *header, docProperties **properties )
1252     {
1253     int metaID;
1254     int positionMeta = 1;
1255     int matched = 0; /* flag if any patterns matched */
1256    
1257     while ( list )
1258     {
1259     char *str = process_regex_list( estrdup(path), list->regex, &matched );
1260    
1261     if ( !matched )
1262     {
1263     /* use default? */
1264     if ( list->meta_entry->extractpath_default )
1265     {
1266     metaID = list->meta_entry->metaID;
1267     indexstring(sw, list->meta_entry->extractpath_default, sw->Index->filenum, IN_FILE, 1, &metaID, &positionMeta);
1268     }
1269     }
1270     else
1271     {
1272     struct metaEntry *q;
1273    
1274     metaID = list->meta_entry->metaID;
1275     indexstring(sw, str, sw->Index->filenum, IN_FILE, 1, &metaID, &positionMeta);
1276    
1277     if ((q = getPropNameByName(header, list->meta_entry->metaName )))
1278     addDocProperty( properties, q, (unsigned char *)str, strlen(str),0);
1279    
1280    
1281     efree( str );
1282     }
1283    
1284     matched = 0;
1285     list = list->next;
1286     }
1287     }
1288    
1289    
1290     /* Just goes through the master list of files and
1291     ** counts 'em.
1292     */
1293    
1294     int getfilecount(IndexFILE * indexf)
1295     {
1296     return indexf->header.totalfiles;
1297     }
1298    
1299    
1300    
1301     /* Removes words that occur in over _plimit_ percent of the files and
1302     ** that occur in over _flimit_ files (marks them as stopwords, that is).
1303     */
1304     /* 05/00 Jose Ruiz
1305     ** Recompute positions when a stopword is removed from lists
1306     ** This piece of code is terrifying because the first goal
1307     ** was getting the best possible performance. So, the code is not
1308     ** very clear.
1309     ** The main problem is to recalculate word positions for all
1310     ** the words after removing the automatic stop words. This means
1311     ** looking at all word's positions for each automatic stop word
1312     ** and decrement its position
1313     */
1314     /* 2001/02 jmruiz - rewritten - the whole process is done in one pass to achieve
1315     better performance */
1316     /* 2001-08 jmruiz - rewritten - adapted to new locations and zone schema */
1317     /* 2002-07 jmruiz - rewritten - adapted to new -e schema */
1318    
1319     int getNumberOfIgnoreLimitWords(SWISH *sw)
1320     {
1321     return sw->Index->nIgnoreLimitWords;
1322     }
1323    
1324     void getPositionsFromIgnoreLimitWords(SWISH * sw)
1325     {
1326     int i,
1327     j,
1328     k,
1329     m,
1330     stopwords,
1331     percent,
1332     bytes_size,
1333     chunk_size,
1334     metaID,
1335     frequency,
1336     tmpval,
1337     filenum;
1338     int *positions;
1339     int local_positions[MAX_STACK_POSITIONS];
1340    
1341     LOCATION *l, *next;
1342     ENTRY *ep,
1343     *ep2;
1344     ENTRY **estop = NULL;
1345     int estopsz = 0,
1346     estopmsz = 0;
1347     int totalwords;
1348     IndexFILE *indexf = sw->indexlist;
1349     int totalfiles = getfilecount(indexf);
1350     struct IgnoreLimitPositions **filepos = NULL;
1351     struct IgnoreLimitPositions *fpos;
1352     struct MOD_Index *idx = sw->Index;
1353     unsigned char *p, *q, *compressed_data, flag;
1354     int last_loc_swap;
1355    
1356     stopwords = 0;
1357     totalwords = indexf->header.totalwords;
1358    
1359     idx->nIgnoreLimitWords = 0;
1360     idx->IgnoreLimitPositionsArray = NULL;
1361    
1362     if (!totalwords || idx->plimit >= NO_PLIMIT)
1363     return;
1364    
1365     if (sw->verbose)
1366     {
1367     printf("\r Getting IgnoreLimit stopwords: ...");
1368     fflush(stdout);
1369     }
1370    
1371    
1372     if (!estopmsz)
1373     {
1374     estopmsz = 1;
1375     estop = (ENTRY **) emalloc(estopmsz * sizeof(ENTRY *));
1376     }
1377    
1378    
1379     /* this is the easy part: Remove the automatic stopwords from the hash array */
1380     /* Builds a list estop[] of ENTRY's that need to be removed */
1381    
1382     for (i = 0; i < VERYBIGHASHSIZE; i++)
1383     {
1384     for (ep2 = NULL, ep = sw->Index->hashentries[i]; ep; ep = ep->next)
1385     {
1386     percent = (ep->tfrequency * 100) / totalfiles;
1387     if (percent >= idx->plimit && ep->tfrequency >= idx->flimit)
1388     {
1389     addStopList(&indexf->header, ep->word); /* For printing list of words */
1390     addstophash(&indexf->header, ep->word); /* Lookup hash */
1391     stopwords++;
1392     /* unlink the ENTRY from the hash */
1393     if (ep2)
1394     ep2->next = ep->next;
1395     else
1396     sw->Index->hashentries[i] = ep->next;
1397    
1398     totalwords--;
1399     sw->Index->entryArray->numWords--;
1400     indexf->header.totalwords--;
1401    
1402     /* Reallocte if more space is needed */
1403     if (estopsz == estopmsz)
1404     {
1405     estopmsz *= 2;
1406     estop = (ENTRY **) erealloc(estop, estopmsz * sizeof(ENTRY *));
1407     }
1408    
1409     /* estop is an array of ENTRY's that need to be removed */
1410     estop[estopsz++] = ep;
1411     }
1412     else
1413     ep2 = ep;
1414     }
1415     }
1416    
1417    
1418    
1419     /* If we have automatic stopwords we have to recalculate word positions */
1420    
1421     if (estopsz)
1422     {
1423     /* Build an array with all the files positions to be removed */
1424     filepos = (struct IgnoreLimitPositions **) emalloc(totalfiles * sizeof(struct IgnoreLimitPositions *));
1425    
1426     for (i = 0; i < totalfiles; i++)
1427     filepos[i] = NULL;
1428    
1429     /* Compute bytes required for chunk location size. Eg: 4096 -> 2 bytes, 65535 -> 2 bytes */
1430     for(bytes_size = 0, i = COALESCE_BUFFER_MAX_SIZE; i; i >>= 8)
1431     bytes_size++;
1432    
1433     /* Process each automatic stop word */
1434     for (i = 0; i < estopsz; i++)
1435     {
1436     ep = estop[i];
1437    
1438     if (sw->verbose)
1439     {
1440     printf("\r Getting IgnoreLimit stopwords: %25s",ep->word);
1441     fflush(stdout);
1442     }
1443    
1444     if(sw->Index->swap_locdata)
1445     {
1446     /* jmruiz - Be careful with this lines!!!! If we have a lot of words,
1447     ** probably this code can be very slow and may be rethought.
1448     ** Fortunately, only a few words must usually raise a IgnoreLimit option
1449     */
1450     last_loc_swap = (verybighash(ep->word) * (MAX_LOC_SWAP_FILES - 1)) / (VERYBIGHASHSIZE - 1);
1451     unSwapLocData(sw, last_loc_swap, ep );
1452     }
1453    
1454     /* Run through location list to get positions */
1455     for(l=ep->allLocationList;l;)
1456     {
1457     compressed_data = (unsigned char *) l;
1458     /* Preserve next element */
1459     next = *(LOCATION **)compressed_data;
1460     /* Jump pointer to next element */
1461     p = compressed_data + sizeof(LOCATION *);
1462    
1463     metaID = uncompress2(&p);
1464    
1465     for(chunk_size = 0, k = 0, j = bytes_size - 1; k < bytes_size; k++, j--)
1466     chunk_size |= p[k] << (j * 8);
1467     p += bytes_size;
1468    
1469     filenum = 0;
1470     while(chunk_size)
1471     { /* Read on all items */
1472     q = p;
1473     uncompress_location_values(&p,&flag,&tmpval,&frequency);
1474     filenum += tmpval;
1475    
1476     if(frequency > MAX_STACK_POSITIONS)
1477     positions = (int *) emalloc(frequency * sizeof(int));
1478     else
1479     positions = local_positions;
1480    
1481     uncompress_location_positions(&p,flag,frequency,positions);
1482    
1483     chunk_size -= (p-q);
1484    
1485     /* Now build the list by filenum of meta/position info */
1486    
1487     if (!filepos[filenum - 1])
1488     {
1489     fpos = (struct IgnoreLimitPositions *) emalloc(sizeof(struct IgnoreLimitPositions));
1490     fpos->pos = (int *) emalloc(frequency * 2 * sizeof(int));
1491     fpos->n = 0;
1492     filepos[filenum - 1] = fpos;
1493     }
1494     else /* file exists in array. just append the meta and position data */
1495     {
1496     fpos = filepos[filenum - 1];
1497     fpos->pos = (int *) erealloc(fpos->pos, (fpos->n + frequency) * 2 * sizeof(int));
1498     }
1499    
1500     for (m = fpos->n * 2, k = 0; k < frequency; k++)
1501     {
1502     fpos->pos[m++] = metaID;
1503     fpos->pos[m++] = GET_POSITION(positions[k]);
1504     }
1505    
1506     fpos->n += frequency;
1507    
1508     if(positions != local_positions)
1509     efree(positions);
1510     }
1511     l = next;
1512     }
1513     if(sw->Index->swap_locdata)
1514     Mem_ZoneReset(idx->totalLocZone);
1515     }
1516    
1517     /* sort each file sort entries by metaname/position */
1518     for (i = 0; i < totalfiles; i++)
1519     {
1520     if (filepos[i])
1521     swish_qsort(filepos[i]->pos, filepos[i]->n, 2 * sizeof(int), &icomp2);
1522     }
1523     }
1524    
1525     idx->nIgnoreLimitWords = estopsz;
1526     idx->IgnoreLimitPositionsArray = filepos;
1527    
1528     if (sw->verbose)
1529     {
1530     printf("\r Getting IgnoreLimit stopwords: Complete \n");
1531     fflush(stdout);
1532     }
1533    
1534    
1535     }
1536    
1537     /* 2001-08 jmruiz - Adjust positions if there was IgnoreLimit stopwords
1538     ** In all cases, removes null end of chunk marks */
1539     void adjustWordPositions(unsigned char *worddata, int *sz_worddata, int n_files, struct IgnoreLimitPositions **ilp)
1540     {
1541     int frequency,
1542     metaID,
1543     tmpval,
1544     r_filenum,
1545     w_filenum,
1546     *posdata;
1547     int i,j,k;
1548     unsigned long r_nextposmeta;
1549     unsigned char *w_nextposmeta;
1550     int local_posdata[MAX_STACK_POSITIONS];
1551     unsigned char r_flag, *w_flag;
1552     unsigned char *p, *q;
1553    
1554     p = worddata;
1555    
1556     tmpval = uncompress2(&p); /* tfrequency */
1557     metaID = uncompress2(&p); /* metaID */
1558     r_nextposmeta = UNPACKLONG2(p);
1559     w_nextposmeta = p;
1560     p += sizeof(long);
1561    
1562     q = p;
1563     r_filenum = w_filenum = 0;
1564     while(1)
1565     { /* Read on all items */
1566     uncompress_location_values(&p,&r_flag,&tmpval,&frequency);
1567     r_filenum += tmpval;
1568    
1569     if(frequency <= MAX_STACK_POSITIONS)
1570     posdata = local_posdata;
1571     else
1572     posdata = (int *) emalloc(frequency * sizeof(int));
1573    
1574     uncompress_location_positions(&p,r_flag,frequency,posdata);
1575    
1576     if(n_files && ilp && ilp[r_filenum - 1])
1577     {
1578     for(i = 0; i < ilp[r_filenum - 1]->n; i++)
1579     {
1580     tmpval = ilp[r_filenum - 1]->pos[2 * i];
1581     if( tmpval >= metaID)
1582     break;
1583     }
1584     if(tmpval == metaID)
1585     {
1586     for(j = 0; j < frequency ; j++)
1587     {
1588     for(k = i; k < ilp[r_filenum - 1]->n ; k++)
1589     {
1590     if(ilp[r_filenum - 1]->pos[2 * k] != metaID ||
1591     ilp[r_filenum - 1]->pos[2 * k + 1] > GET_POSITION(posdata[j]))
1592     break; /* End */
1593     }
1594     posdata[j] = SET_POSDATA(GET_POSITION(posdata[j]) - (k-i), GET_STRUCTURE(posdata[j]));
1595     }
1596     }
1597     }
1598     /* Store the filenum incrementally to save space */
1599     compress_location_values(&q,&w_flag,r_filenum - w_filenum,frequency, posdata);
1600     w_filenum = r_filenum;
1601    
1602     /* store positions */
1603     compress_location_positions(&q,w_flag,frequency,posdata);
1604    
1605     if(posdata != local_posdata)
1606     efree(posdata);
1607    
1608     if(!p[0]) /* End of chunk mark */
1609     {
1610     r_filenum = 0; /* reset filenum */
1611     p++;
1612     }
1613     if ((p - worddata) == *sz_worddata)
1614     break; /* End of worddata */
1615    
1616     if ((unsigned long)(p - worddata) == r_nextposmeta)
1617     {
1618     if(q != p)
1619     PACKLONG2(q - worddata, w_nextposmeta);
1620    
1621     metaID = uncompress2(&p);
1622     q = compress3(metaID,q);
1623    
1624     r_nextposmeta = UNPACKLONG2(p);
1625     p += sizeof(long);
1626    
1627     w_nextposmeta = q;
1628     q += sizeof(long);
1629    
1630     w_filenum = 0;
1631     }
1632     }
1633     *sz_worddata = q - worddata;
1634     PACKLONG2(*sz_worddata, w_nextposmeta);
1635     }
1636    
1637    
1638    
1639     /*
1640     ** This is an all new ranking algorithm. I can't say it is based on anything,
1641     ** but it does seem to be better than what was used before!
1642     ** 2001/05 wsm
1643     **
1644     ** Parameters:
1645     ** sw
1646     ** Pointer to SWISH structure
1647     **
1648     ** freq
1649     ** Number of times this word appeared in this file
1650     **
1651     ** tfreq
1652     ** Number of files this word appeared in this index (not used for ranking)
1653     **
1654     ** words
1655     ** Number of words in this file
1656     **
1657     ** structure
1658     ** Bit mask of context where this word appeared
1659     **
1660     ** ignoreTotalWordCount
1661     ** Ignore total word count when ranking (config file parameter)
1662     */
1663    
1664    
1665    
1666     int entrystructcmp(const void *e1, const void *e2)
1667     {
1668     const ENTRY *ep1 = *(ENTRY * const *) e1;
1669     const ENTRY *ep2 = *(ENTRY * const *) e2;
1670    
1671     return (strcmp(ep1->word, ep2->word));
1672     }
1673    
1674    
1675     /* Sorts the words */
1676     void sort_words(SWISH * sw, IndexFILE * indexf)
1677     {
1678     int i,
1679     j;
1680     ENTRY *e;
1681    
1682    
1683     if (!sw->Index->entryArray || !sw->Index->entryArray->numWords)
1684     return;
1685    
1686    
1687     if (sw->verbose)
1688     {
1689     printf("Sorting %d words alphabetically\n", sw->Index->entryArray->numWords );
1690     fflush(stdout);
1691     }
1692    
1693     /* Build the array with the pointers to the entries */
1694     sw->Index->entryArray->elist = (ENTRY **) emalloc(sw->Index->entryArray->numWords * sizeof(ENTRY *));
1695    
1696     /* Fill the array with all the entries */
1697     for (i = 0, j = 0; i < VERYBIGHASHSIZE; i++)
1698     for (e = sw->Index->hashentries[i]; e; e = e->next)
1699     sw->Index->entryArray->elist[j++] = e;
1700    
1701     /* Sort them */
1702     swish_qsort(sw->Index->entryArray->elist, sw->Index->entryArray->numWords, sizeof(ENTRY *), &entrystructcmp);
1703     }
1704    
1705    
1706    
1707     /* Sort chunk locations of entry e by metaID, filenum */
1708     void sortChunkLocations(SWISH * sw, IndexFILE * indexf, ENTRY * e)
1709     {
1710     int i,
1711     j,
1712     k,
1713     filenum,metaID,frequency;
1714     unsigned char flag;
1715     unsigned char *ptmp,
1716     *ptmp2,
1717     *compressed_data;
1718     int *pi = NULL;
1719     LOCATION *l, *prev = NULL, **lp;
1720    
1721     /* Very trivial case */
1722     if (!e)
1723     return;
1724    
1725     if(!e->currentChunkLocationList)
1726     return;
1727    
1728     /* Get the number of locations in chunk */
1729     for(i = 0, l = e->currentChunkLocationList; l; i++)
1730     l=*(LOCATION **)l; /* Get next location */
1731    
1732     /* Compute array wide */
1733     j = 2 * sizeof(int) + sizeof(void *);
1734    
1735     /* Compute array size */
1736     ptmp = (void *) emalloc(j * i);
1737    
1738     /* Build an array with the elements to compare
1739     and pointers to data */
1740    
1741     for(l = e->currentChunkLocationList, ptmp2 = ptmp; l; )
1742     {
1743     pi = (int *) ptmp2;
1744    
1745     compressed_data = (unsigned char *)l;
1746     /* Jump next offset */
1747     compressed_data += sizeof(LOCATION *);
1748    
1749     metaID = uncompress2(&compressed_data);
1750     uncompress_location_values(&compressed_data,&flag,&filenum,&frequency);
1751     pi[0] = metaID;
1752     pi[1] = filenum;
1753     ptmp2 += 2 * sizeof(int);
1754    
1755     lp = (LOCATION **)ptmp2;
1756     *lp = l;
1757     ptmp2 += sizeof(void *);
1758     /* Get next location */
1759     l=*(LOCATION **)l; /* Get next location */
1760     }
1761    
1762     /* Sort them */
1763     swish_qsort(ptmp, i, j, &icomp2);
1764    
1765     /* Store results */
1766     for (k = 0, ptmp2 = ptmp; k < i; k++)
1767     {
1768     ptmp2 += 2 * sizeof(int);
1769    
1770     l = *(LOCATION **)ptmp2;
1771     if(!k)
1772     e->currentChunkLocationList = l;
1773     else
1774     prev->next =l;
1775     ptmp2 += sizeof(void *);
1776     prev = l;
1777     }
1778     l->next =NULL;
1779    
1780     /* Free the memory of the array */
1781     efree(ptmp);
1782     }
1783    
1784     void coalesce_all_word_locations(SWISH * sw, IndexFILE * indexf)
1785     {
1786     int i;
1787     ENTRY *epi;
1788    
1789     for (i = 0; i < VERYBIGHASHSIZE; i++)
1790     {
1791     if ((epi = sw->Index->hashentries[i]))
1792     {
1793     while (epi)
1794     {
1795     coalesce_word_locations(sw, indexf, epi);
1796     epi = epi->next;
1797     }
1798     }
1799     }
1800    
1801     }
1802    
1803     /* Write the index entries that hold the word, rank, and other information.
1804     */
1805    
1806    
1807     #ifndef USE_BTREE
1808     void write_index(SWISH * sw, IndexFILE * indexf)
1809     {
1810     int i;
1811     ENTRYARRAY *ep;
1812     ENTRY *epi;
1813     int totalwords;
1814     int percent, lastPercent, n;
1815     int last_loc_swap;
1816    
1817     #define DELTA 10
1818    
1819    
1820     if ( !(ep = sw->Index->entryArray ))
1821     return; /* nothing to do */
1822    
1823    
1824     totalwords = ep->numWords;
1825    
1826     DB_InitWriteWords(sw, indexf->DB);
1827    
1828     if (sw->verbose)
1829     {
1830     printf(" Writing word text: ...");
1831     fflush(stdout);
1832     }
1833    
1834     /* This is not longer needed. So free it as soon as possible */
1835     Mem_ZoneFree(&sw->Index->perDocTmpZone);
1836    
1837    
1838     /* This is not longer needed. So free it as soon as possible */
1839     Mem_ZoneFree(&sw->Index->currentChunkLocZone);
1840    
1841     /* If we are swaping locs to file, reset memory zone */
1842     if(sw->Index->swap_locdata)
1843     Mem_ZoneReset(sw->Index->totalLocZone);
1844    
1845     n = lastPercent = 0;
1846     for (i = 0; i < totalwords; i++)
1847     {
1848     if ( sw->verbose && totalwords > 10000 ) // just some random guess
1849     {
1850     n++;
1851     percent = (n * 100)/totalwords;
1852     if (percent - lastPercent >= DELTA )
1853     {
1854     printf("\r Writing word text: %3d%%", percent );
1855     fflush(stdout);
1856     lastPercent = percent;
1857     }
1858     }
1859    
1860     epi = ep->elist[i];
1861    
1862     /* why check for stopwords here? removestopwords could have remove them */
1863     if (!isstopword(&indexf->header, epi->word))
1864     {
1865     /* Write word to index file */
1866     write_word(sw, epi, indexf);
1867     }
1868     else
1869     epi->u1.wordID = -1; /* flag as a stop word */
1870     }
1871    
1872     if (sw->verbose)
1873     {
1874     printf("\r Writing word text: Complete\n" );
1875     printf(" Writing word hash: ...");
1876     fflush(stdout);
1877     }
1878    
1879    
1880    
1881     n = lastPercent = 0;
1882     for (i = 0; i < VERYBIGHASHSIZE; i++)
1883     {
1884     if ( sw->verbose )
1885     {
1886     n++;
1887     percent = (n * 100)/VERYBIGHASHSIZE;
1888     if (percent - lastPercent >= DELTA )
1889     {
1890     printf("\r Writing word hash: %3d%%", percent );
1891     fflush(stdout);
1892     lastPercent = percent;
1893     }
1894     }
1895    
1896    
1897     if ((epi = sw->Index->hashentries[i]))
1898     {
1899     while (epi)
1900     {
1901     /* If it is not a stopword write it */
1902     if (epi->u1.wordID > 0)
1903     DB_WriteWordHash(sw, epi->word,epi->u1.wordID,indexf->DB);
1904     epi = epi->next;
1905     }
1906     }
1907     }
1908    
1909     if (sw->verbose)
1910     {
1911     printf("\r Writing word hash: Complete\n" );
1912     printf(" Writing word data: ...");
1913     fflush(stdout);
1914     }
1915    
1916    
1917     n = lastPercent = last_loc_swap = -1;
1918     for (i = 0; i < VERYBIGHASHSIZE; i++)
1919     {
1920     /* If we are in economic mode -e restore locations */
1921     if(sw->Index->swap_locdata)
1922     {
1923     if (((i * (MAX_LOC_SWAP_FILES - 1)) / (VERYBIGHASHSIZE - 1)) != last_loc_swap)
1924     {
1925     /* Free not longer needed memory */
1926     Mem_ZoneReset(sw->Index->totalLocZone);
1927     last_loc_swap = (i * (MAX_LOC_SWAP_FILES - 1)) / (VERYBIGHASHSIZE - 1);
1928     unSwapLocData(sw, last_loc_swap, NULL );
1929     }
1930     }
1931     if ((epi = sw->Index->hashentries[i]))
1932     {
1933     while (epi)
1934     {
1935     /* If we are in economic mode -e we must sort locations by metaID, filenum */
1936     if(sw->Index->swap_locdata)
1937     {
1938     sortSwapLocData(sw, epi);
1939     }
1940     if ( sw->verbose && totalwords > 10000 ) // just some random guess
1941     {
1942     n++;
1943     percent = (n * 100)/totalwords;
1944     if (percent - lastPercent >= DELTA )
1945     {
1946     printf("\r Writing word data: %3d%%", percent );
1947     fflush(stdout);
1948     lastPercent = percent;
1949     }
1950     }
1951     if (epi->u1.wordID > 0) /* Not a stopword */
1952     {
1953     build_worddata(sw, epi, indexf);
1954     write_worddata(sw, epi, indexf);
1955     }
1956     epi = epi->next;
1957     }
1958     }
1959     }
1960     if (sw->verbose)
1961     printf("\r Writing word data: Complete\n" );
1962    
1963    
1964     DB_EndWriteWords(sw, indexf->DB);
1965    
1966     /* free all ENTRY structs at once */
1967     Mem_ZoneFree(&sw->Index->entryZone);
1968    
1969     /* free all location compressed data */
1970     Mem_ZoneFree(&sw->Index->totalLocZone);
1971    
1972     efree(ep->elist);
1973     }
1974    
1975     #else
1976    
1977     void write_index(SWISH * sw, IndexFILE * indexf)
1978     {
1979     int i;
1980     ENTRYARRAY *ep;
1981     ENTRY *epi;
1982     int totalwords;
1983     int percent, lastPercent, n;
1984     long old_wordid;
1985     unsigned char *buffer =NULL;
1986     int sz_buffer = 0;
1987     #define DELTA 10
1988    
1989    
1990     if ( !(ep = sw->Index->entryArray ))
1991     return; /* nothing to do */
1992    
1993     totalwords = ep->numWords;
1994    
1995    
1996     /* Write words */
1997     DB_InitWriteWords(sw, indexf->DB);
1998    
1999     if (sw->verbose)
2000     {
2001     printf(" Writing word text: ...");
2002     fflush(stdout);
2003     }
2004    
2005     /* This is not longer needed. So free it as soon as possible */
2006     Mem_ZoneFree(&sw->Index->perDocTmpZone);
2007    
2008    
2009     /* This is not longer needed. So free it as soon as possible */
2010     Mem_ZoneFree(&sw->Index->currentChunkLocZone);
2011    
2012     /* If we are swaping locs to file, reset memory zone */
2013     if(sw->Index->swap_locdata)
2014     Mem_ZoneReset(sw->Index->totalLocZone);
2015    
2016     n = lastPercent = 0;
2017     for (i = 0; i < totalwords; i++)
2018     {
2019     if ( sw->verbose && totalwords > 10000 ) // just some random guess
2020     {
2021     n++;
2022     percent = (n * 100)/totalwords;
2023     if (percent - lastPercent >= DELTA )
2024     {
2025     printf("\r Writing word text: %3d%%", percent );
2026     fflush(stdout);
2027     lastPercent = percent;
2028     }
2029     }
2030    
2031     epi = ep->elist[i];
2032    
2033     /* why check for stopwords here? removestopwords could have remove them */
2034     if (!isstopword(&indexf->header, epi->word))
2035     {
2036     /* Build worddata buffer */
2037     build_worddata(sw, epi, indexf);
2038     /* let's see if word is already in the index */
2039     old_wordid = read_worddata(sw, epi, indexf, &buffer, &sz_buffer);
2040     /* If exists, we have to add the new worddata buffer to the old one */
2041     if(old_wordid)
2042     {
2043     add_worddata(sw, epi, indexf, buffer, sz_buffer);
2044     efree(buffer);
2045     buffer = NULL;
2046     sz_buffer = 0;
2047     delete_worddata(sw, old_wordid, indexf);
2048     write_worddata(sw, epi, indexf);
2049     update_wordID(sw, epi, indexf);
2050     }
2051     else
2052     {
2053     /* Write word to index file */
2054     write_worddata(sw, epi, indexf);
2055     write_word(sw, epi, indexf);
2056     }
2057     }
2058     }
2059    
2060     if (sw->verbose)
2061     {
2062     printf("\r Writing word text: Complete\n" );
2063     fflush(stdout);
2064     }
2065    
2066    
2067     DB_EndWriteWords(sw, indexf->DB);
2068    
2069     /* free all ENTRY structs at once */
2070     Mem_ZoneFree(&sw->Index->entryZone);
2071    
2072     /* free all location compressed data */
2073     Mem_ZoneFree(&sw->Index->totalLocZone);
2074    
2075     efree(ep->elist);
2076     }
2077    
2078    
2079     #endif
2080    
2081    
2082    
2083    
2084     /* These 2 routines fix the problem when a word ends with mutiple
2085     ** IGNORELASTCHAR's (eg, qwerty'. ). The old code correctly deleted
2086     ** the ".", but didn't check if the new last character ("'") is also
2087     ** an ignore character.
2088     */
2089     void stripIgnoreLastChars(INDEXDATAHEADER *header, char *word)
2090     {
2091     int k,j,i = strlen(word);
2092    
2093     /* Get rid of specified last char's */
2094     /* for (i=0; word[i] != '\0'; i++); */
2095     /* Iteratively strip off the last character if it's an ignore character */
2096     while ((i > 0) && (isIgnoreLastChar(header, word[--i])))
2097     {
2098     word[i] = '\0';
2099    
2100     /* We must take care of the escaped characeters */
2101     /* Things like hello\c hello\\c hello\\\c can appear */
2102     for(j=0,k=i-1;k>=0 && word[k]=='\\';k--,j++);
2103    
2104     /* j contains the number of \ */
2105     if(j%2) /* Remove the escape if even */
2106     {
2107     word[--i]='\0';
2108     }
2109     }
2110     }
2111    
2112     void stripIgnoreFirstChars(INDEXDATAHEADER *header, char *word)
2113     {
2114     int j,
2115     k;
2116     int i = 0;
2117    
2118     /* Keep going until a char not to ignore is found */
2119     /* We must take care of the escaped characeters */
2120     /* Things like \chello \\chello can appear */
2121    
2122     while (word[i])
2123     {
2124     if(word[i]=='\\') /* Jump escape */
2125     k=i+1;
2126     else
2127     k=i;
2128     if(!word[k] || !isIgnoreFirstChar(header, word[k]))
2129     break;
2130     else
2131     i=k+1;
2132     }
2133    
2134     /* If all the char's are valid, just return */
2135     if (0 == i)
2136     return;
2137     else
2138     {
2139     for (k = i, j = 0; word[k] != '\0'; j++, k++)
2140     {
2141     word[j] = word[k];
2142     }
2143     /* Add the NULL */
2144     word[j] = '\0';
2145     }
2146     }
2147    
2148    
2149    
2150     static void addword( char *word, SWISH * sw, int filenum, int structure, int numMetaNames, int *metaID, int *word_position)
2151     {
2152     int i;
2153    
2154     /* Add the word for each nested metaname. */
2155     for (i = 0; i < numMetaNames; i++)
2156     (void) addentry(sw, getentry(sw,word), filenum, structure, metaID[i], *word_position);
2157    
2158     (*word_position)++;
2159     }
2160    
2161    
2162    
2163    
/*
** Gets the next white-space delimited word.
**
**   buf      in/out: scan position; advanced past leading white space
**            and past the word that was copied out.
**   word     in/out: destination buffer; may be reallocated (the new
**            pointer is stored back through this argument).
**   lenword  in/out: capacity of *word in characters, not counting the
**            terminating NUL; updated when the buffer grows.
**
** Returns 1 when a word was copied into *word (NUL terminated), or 0
** when only white space / end of string was left.
*/
int next_word( char **buf, char **word, int *lenword )
{
    const int min_capacity = 16;    /* used when the incoming capacity is not positive */
    int     len = 0;

    /* skip any whitespace */
    while ( **buf && isspace( (unsigned char) **buf) )
        (*buf)++;

    while ( **buf && !isspace( (unsigned char) **buf) )
    {
        /* Grow the buffer when full.  Doubling a capacity of zero would
        ** never grow it, so fall back to a minimum size in that case. */
        if ( len >= *lenword )
        {
            *lenword = (*lenword > 0) ? *lenword * 2 : min_capacity;
            *word = erealloc(*word, *lenword + 1);
        }

        (*word)[len++] = **buf;
        (*buf)++;
    }

    if ( !len )
        return 0;

    (*word)[len] = '\0';
    return 1;
}
2195    
2196     /* Gets the next non WordChars delimited word */
2197     /* Bumps position if needed */
2198     int next_swish_word(SWISH * sw, char **buf, char **word, int *lenword, int *word_position )
2199     {
2200     int i;
2201     IndexFILE *indexf = sw->indexlist;
2202     int bump_flag = 0;
2203    
2204     /* skip non-wordchars and check for bump chars */
2205     while ( **buf && !iswordchar(indexf->header, **buf ) )
2206     {
2207     if (!bump_flag && isBumpPositionCounterChar(&indexf->header, (int) **buf))
2208     bump_flag++;
2209    
2210     (*buf)++;
2211     }
2212    
2213     i = 0;
2214     while ( **buf && iswordchar(indexf->header, **buf) )
2215     {
2216     /* It doesn't really make sense to have a WordChar that's also a bump char */
2217     if (!bump_flag && isBumpPositionCounterChar(&indexf->header, (int) **buf))
2218     bump_flag++;
2219    
2220    
2221     /* reallocate buffer, if needed */
2222     if ( i == *lenword )
2223     {
2224     *lenword *= 2;
2225     *word = erealloc(*word, *lenword + 1);
2226     }
2227    
2228     (*word)[i++] = **buf;
2229     (*buf)++;
2230     }
2231    
2232     /* If any bump chars were found then bump to prevent phrase matching */
2233     if ( bump_flag )
2234     (*word_position)++;
2235    
2236     if ( i )
2237     {
2238     (*word)[i] = '\0';
2239     stripIgnoreLastChars(&indexf->header, *word);
2240     stripIgnoreFirstChars(&indexf->header, *word);
2241    
2242     return *word ? 1 : 0;
2243     }
2244    
2245     return 0;
2246     }
2247    
2248     /******************************************************************
2249     * Build the list of metaIDs that need to be indexed
2250     *
2251     * Returns number of IDs found
2252     *
2253     *
2254     ******************************************************************/
2255     static int build_metaID_list( SWISH *sw )
2256     {
2257     struct MOD_Index *idx = sw->Index;
2258     METAIDTABLE *metas = &idx->metaIDtable;
2259     IndexFILE *indexf = sw->indexlist;
2260     INDEXDATAHEADER *header = &indexf->header;
2261     struct metaEntry *m;
2262     int i;
2263    
2264    
2265     /* cache the default metaID for speed */
2266     if ( metas->defaultID == -1 )
2267     {
2268     m = getMetaNameByName( header, AUTOPROPERTY_DEFAULT );
2269     metas->defaultID = m ? m->metaID : 0;
2270     }
2271    
2272    
2273     metas->num = 0;
2274    
2275    
2276     /* Would be smart to track number of metas flagged so not to loop through all for every lookup */
2277    
2278     for ( i = 0; i < header->metaCounter; i++)
2279     {
2280     m = header->metaEntryArray[i];
2281    
2282     if ( (m->metaType & META_INDEX) && m->in_tag )
2283     {
2284     if ( ++metas->num > metas->max )
2285     metas->array = (int *)erealloc( metas->array, (metas->max = metas->num + 200) );
2286    
2287     metas->array[metas->num - 1] = m->metaID;
2288     }
2289     }
2290    
2291     /* If no metas found to index, then add default metaID */
2292     if ( !metas->num && metas->defaultID )
2293     metas->array[metas->num++] = metas->defaultID;
2294    
2295     return metas->num;
2296     }
2297    
2298    
2299     /******************************************************************
2300     * Index a string
2301     *
2302     *
2303     ******************************************************************/
2304    
2305     /* 05/2001 Jose Ruiz - Changed word and swishword buffers to make this routine ** thread safe */
2306    
2307    
2308     int indexstring(SWISH * sw, char *s, int filenum, int structure, int numMetaNames, int *metaID, int *position)
2309     {
2310     int wordcount = 0;
2311    
2312     IndexFILE *indexf = sw->indexlist;
2313    
2314     char *buf_pos; /* pointer to current position */
2315     char *cur_pos; /* pointer to position with a word */
2316    
2317     int stem_return; /* return value of stem operation */
2318    
2319     struct MOD_Index *idx = sw->Index;
2320    
2321     /* Assign word buffers */
2322     char *word = idx->word;
2323     int lenword = idx->lenword;
2324     char *swishword = idx->swishword;
2325     int lenswishword = idx->lenswishword;
2326    
2327    
2328    
2329     /* Generate list of metaIDs to index unless passed in */
2330     if ( !metaID )
2331     {
2332     if ( !(numMetaNames = build_metaID_list( sw )) )
2333     return 0;
2334     else
2335     metaID = idx->metaIDtable.array;
2336     }
2337    
2338     /* current pointer into buffer */
2339     buf_pos = s;
2340    
2341    
2342     /* get the next word as defined by whitespace */
2343     while ( next_word( &buf_pos, &word, &lenword ) )
2344     {
2345     if ( DEBUG_MASK & DEBUG_PARSED_WORDS )
2346     printf("White-space found word '%s'\n", word );
2347    
2348    
2349     strtolower(word);
2350    
2351     /* is this a useful feature? */
2352     if ( indexf->header.is_use_words_flag )
2353     {
2354     if ( isuseword(&indexf->header, word) )
2355     {
2356     addword(word, sw, filenum, structure, numMetaNames, metaID, position );
2357     wordcount++;
2358     }
2359    
2360     continue;
2361     }
2362    
2363    
2364     /* Check for buzzwords */
2365     if ( indexf->header.buzzwords_used_flag )
2366     {
2367     /* only strip when buzzwords are being used since stripped again as a "swish word" */
2368     stripIgnoreLastChars(&indexf->header, word);
2369     stripIgnoreFirstChars(&indexf->header, word);
2370     if ( !*word ) /* stripped clean? */
2371     continue;
2372    
2373    
2374     if ( isbuzzword(&indexf->header, word) )
2375     {
2376     addword(word, sw, filenum, structure, numMetaNames, metaID, position );
2377     wordcount++;
2378     continue;
2379     }
2380     }
2381    
2382    
2383    
2384    
2385    
2386     /* Translate chars */
2387     TranslateChars(indexf->header.translatecharslookuptable, (unsigned char *)word);
2388    
2389     cur_pos = word;
2390    
2391    
2392    
2393     /* Now split the word up into "swish words" */
2394    
2395     while ( next_swish_word( sw, &cur_pos, &swishword, &lenswishword, position ) )
2396     {
2397    
2398     /* Weed out Numbers - or anything that's all the listed chars */
2399     if ( indexf->header.numberchars_used_flag )
2400     {
2401     unsigned char *c = (unsigned char *)swishword;
2402    
2403     /* look for any char that's NOT in the lookup table */
2404     while ( *c ) {
2405     if ( !indexf->header.numbercharslookuptable[(int) *c ] )
2406     break;
2407     c++;
2408     }
2409    
2410     /* if got all the way through the string then it's only those chars */
2411     if ( !*c )
2412     continue; /* skip this word */
2413     }
2414    
2415    
2416     /* Check Begin & EndCharacters */
2417     if (!indexf->header.begincharslookuptable[(int) ((unsigned char) swishword[0])])
2418     continue;
2419    
2420     if (!indexf->header.endcharslookuptable[(int) ((unsigned char) swishword[strlen(swishword) - 1])])
2421     continue;
2422    
2423    
2424     /* limit by stopwords, min/max length, max number of digits, ... */
2425     if (!isokword(sw, swishword, indexf))
2426     continue;
2427    
2428     /* Now translate word if fuzzy mode */
2429    
2430     switch ( indexf->header.fuzzy_mode )
2431     {
2432     case FUZZY_NONE:
2433     addword(swishword, sw, filenum, structure, numMetaNames, metaID, position );
2434     wordcount++;
2435     break;
2436    
2437     case FUZZY_STEMMING:
2438     stem_return = Stem(&swishword, &lenswishword);
2439    
2440     /* ===
2441     if ( stem_return == STEM_NOT_ALPHA ) printf("Stem: not alpha in '%s'\n", swishword );
2442     if ( stem_return == STEM_TOO_SMALL ) printf("Stem: too small in '%s'\n", swishword );
2443     if ( stem_return == STEM_WORD_TOO_BIG ) printf("Stem: too big to stem in '%s'\n", swishword );
2444     if ( stem_return == STEM_TO_NOTHING ) printf("Stem: stems to nothing '%s'\n", swishword );
2445     === */
2446    
2447     addword(swishword, sw, filenum, structure, numMetaNames, metaID, position );
2448     wordcount++;
2449     break;
2450    
2451    
2452     case FUZZY_SOUNDEX:
2453     soundex(swishword);
2454     addword(swishword, sw, filenum, structure, numMetaNames, metaID, position );
2455     wordcount++;
2456     break;
2457    
2458     case FUZZY_METAPHONE:
2459     case FUZZY_DOUBLE_METAPHONE:
2460     {
2461     char *codes[2];
2462     DoubleMetaphone(swishword, codes);
2463    
2464     if ( !(*codes[0]) )
2465     {
2466     efree( codes[0] );
2467     efree( codes[1] );
2468     addword(swishword, sw, filenum, structure, numMetaNames, metaID, position );
2469     wordcount++;
2470     break;
2471     }
2472     addword(codes[0], sw, filenum, structure, numMetaNames, metaID, position );
2473     wordcount++;
2474    
2475     if ( indexf->header.fuzzy_mode == FUZZY_DOUBLE_METAPHONE && *(codes[1]) && strcmp(codes[0], codes[1]) )
2476     {
2477     (*position)--; /* at same position as first word */
2478     addword(codes[1], sw, filenum, structure, numMetaNames, metaID, position );
2479     wordcount++;
2480     }
2481    
2482     efree( codes[0] );
2483     efree( codes[1] );
2484     }
2485    
2486     break;
2487    
2488    
2489     default:
2490     progerr("Invalid FuzzyMode '%d'", (int)indexf->header.fuzzy_mode );
2491     }
2492     }
2493     }
2494    
2495     /* Buffers can be reallocated - So, reasign them */
2496     idx->word = word;
2497     idx->lenword = lenword;
2498     idx->swishword = swishword;
2499     idx->lenswishword = lenswishword;
2500    
2501     return wordcount;
2502     }
2503    
2504    
2505     /* Coalesce word current word location into the linked list */
2506     void add_coalesced(SWISH *sw, ENTRY *e, unsigned char *coalesced, int sz_coalesced, int metaID)
2507     {
2508     int tmp;
2509     LOCATION *tloc, *tprev;
2510     LOCATION **tmploc, **tmploc2;
2511     unsigned char *tp;
2512    
2513    
2514     /* Check for economic mode (-e) and swap data to disk */
2515     if(sw->Index->swap_locdata)
2516     {
2517     tmploc = (LOCATION **)coalesced;
2518     *tmploc = (LOCATION *)e; /* Preserve e in buffer */
2519     /* The cast is for avoiding the warning */
2520     SwapLocData(sw, e, coalesced, sz_coalesced);
2521     return;
2522     }
2523    
2524     /* Add to the linked list keeping the data sorted by metaname, filenum */
2525     for(tprev =NULL, tloc = e->allLocationList; tloc; )
2526     {
2527     tp = (unsigned char *)tloc + sizeof(void *);
2528     tmp = uncompress2(&tp); /* Read metaID */
2529     if(tmp > metaID)
2530     break;
2531     tprev = tloc;
2532     tmploc = (LOCATION **)tloc;
2533     tloc = *tmploc;
2534     }
2535    
2536     if(! tprev)
2537     {
2538     tmploc = (LOCATION **)coalesced;
2539     *tmploc = e->allLocationList;
2540     e->allLocationList = (LOCATION *)coalesced;
2541     }
2542     else
2543     {
2544     tmploc = (LOCATION **)coalesced;
2545     tmploc2 = (LOCATION **)tprev;
2546     *tmploc = *tmploc2;
2547     *tmploc2 = (LOCATION *)coalesced;
2548     }
2549     }
2550    
2551    
2552     void coalesce_word_locations(SWISH * sw, IndexFILE * indexf, ENTRY *e)
2553     {
2554     int curmetaID, metaID,
2555     curfilenum, filenum,
2556     frequency,
2557     num_locs,
2558     bytes_size,
2559     worst_case_size;
2560     int i, j, tmp;
2561     unsigned char *p, *q, *size_p = NULL;
2562     unsigned char uflag, *cflag;
2563     LOCATION *loc, *next;
2564     static unsigned char buffer[COALESCE_BUFFER_MAX_SIZE];
2565     unsigned char *coalesced_buffer;
2566     int *posdata;
2567     int local_posdata[MAX_STACK_POSITIONS];
2568    
2569    
2570     /* Check for new locations in the current chunk */
2571     if(!e->currentChunkLocationList)
2572     return;
2573    
2574     /* Compute bytes required for size. Eg: 4096 -> 2 bytes, 65535 -> 2 bytes */
2575     for(bytes_size = 0, tmp = COALESCE_BUFFER_MAX_SIZE; tmp; tmp >>= 8)
2576     bytes_size++;
2577    
2578     /* Sort all pending word locations by metaID, filenum */
2579     sortChunkLocations(sw, indexf, e);
2580    
2581     /* Init vars */
2582     curmetaID = 0;
2583     curfilenum = 0;
2584     q = buffer; /* Destination buffer */
2585     num_locs = 0; /* Number of coalesced LOCATIONS */
2586    
2587     /* Run on all locations */
2588     for(loc = e->currentChunkLocationList; loc; )
2589     {
2590     p = (unsigned char *) loc;
2591    
2592     /* get next LOCATION in linked list*/
2593     next = * (LOCATION **) loc;
2594     p += sizeof(LOCATION *);
2595    
2596     /* get metaID of LOCATION */
2597     metaID = uncompress2(&p);
2598    
2599     /* Check for new metaID */
2600     if(metaID != curmetaID)
2601     {
2602     /* If exits previous data add it to the linked list */
2603     if(curmetaID)
2604     {
2605     /* add to the linked list and reset values */
2606     /* Update the size of chunk's data in *size_p */
2607     tmp = q - (size_p + bytes_size); /* tmp contains the size */
2608     /* Write the size */
2609     for(i = 0, j = bytes_size - 1; i < bytes_size; i++, j--)
2610     size_p[i] = tmp >> (j * 8);
2611     /* Add to the linked list keeping the data sorted by metaname, filenum */
2612     /* Allocate memory space */
2613     coalesced_buffer = (unsigned char *)Mem_ZoneAlloc(sw->Index->totalLocZone,q-buffer);
2614     /* Copy content to it */
2615     memcpy(coalesced_buffer,buffer,q-buffer);
2616     /* Add to the linked list */
2617     add_coalesced(sw, e, coalesced_buffer, q - buffer, curmetaID);
2618     }
2619     /* Reset values */
2620     curfilenum = 0;
2621     curmetaID = metaID;
2622     q = buffer + sizeof(void *); /* Make room for linked list pointer */
2623     q = compress3(metaID,q); /* Add metaID */
2624     size_p = q; /* Preserve position for size */
2625     q += bytes_size; /* Make room for size */
2626     num_locs = 0;
2627     }
2628     uncompress_location_values(&p,&uflag,&filenum,&frequency);
2629     worst_case_size = sizeof(unsigned char *) + (3 + frequency) * MAXINTCOMPSIZE;
2630    
2631     while ((q + worst_case_size) - buffer > sizeof(buffer))
2632     {
2633     if(!num_locs)
2634     progerr("Buffer too short in coalesce_word_locations. Increase COALESCE_BUFFER_MAX_SIZE in config.h and rebuild.");
2635     /* add to the linked list and reset values */
2636     /* Update the size of chunk's data in *size_p */
2637     tmp = q - (size_p + bytes_size); /* tmp contains the size */
2638     /* Write the size */
2639     for(i = 0, j = bytes_size - 1; i < bytes_size; i++, j--)
2640     size_p[i] = tmp >> (j * 8);
2641     /* Add to the linked list keeping the data sorted by metaname, filenum */
2642     /* Allocate memory space */
2643     coalesced_buffer = (unsigned char *)Mem_ZoneAlloc(sw->Index->totalLocZone,q-buffer);
2644     /* Copy content to it */
2645     memcpy(coalesced_buffer,buffer,q-buffer);
2646     /* Add to the linked list */
2647     add_coalesced(sw, e, coalesced_buffer, q - buffer, curmetaID);
2648    
2649     /* Reset values */
2650     curfilenum = 0;
2651     curmetaID = metaID;
2652     q = buffer + sizeof(void *); /* Make room for linked list pointer */
2653     q = compress3(metaID,q);
2654     size_p = q; /* Preserve position for size */
2655     q += bytes_size; /* Make room for size */
2656     num_locs = 0;
2657     }
2658    
2659     if(frequency > MAX_STACK_POSITIONS)
2660     posdata = emalloc(frequency * sizeof(int));
2661     else
2662     posdata = local_posdata;
2663    
2664     uncompress_location_positions(&p,uflag,frequency,posdata);
2665    
2666     /* Store the filenum incrementally to save space */
2667     compress_location_values(&q,&cflag,filenum - curfilenum,frequency, posdata);
2668    
2669     curfilenum = filenum;
2670    
2671     compress_location_positions(&q,cflag,frequency,posdata);
2672    
2673     if(frequency > MAX_STACK_POSITIONS)
2674     efree(posdata);
2675    
2676     num_locs++;
2677    
2678     loc = next;
2679     }
2680     if (num_locs)
2681     {
2682     /* add to the linked list and reset values */
2683     /* Update the size of chunk's data in *size_p */
2684     tmp = q - (size_p + bytes_size); /* tmp contains the size */
2685     /* Write the size */
2686     for(i = 0, j = bytes_size - 1; i < bytes_size; i++, j--)
2687     size_p[i] = tmp >> (j * 8);
2688     /* Add to the linked list keeping the data sorted by metaname, filenum */
2689     /* Allocate memory space */
2690     coalesced_buffer = (unsigned char *)Mem_ZoneAlloc(sw->Index->totalLocZone,q-buffer);
2691     /* Copy content to it */
2692     memcpy(coalesced_buffer,buffer,q-buffer);
2693     /* Add to the linked list */
2694     add_coalesced(sw, e, coalesced_buffer, q - buffer, curmetaID);
2695     }
2696     e->currentChunkLocationList = NULL;
2697     e->currentlocation = NULL;
2698    
2699     /* If we are swaping locs to file, reset also correspondant memory zone */
2700     if(sw->Index->swap_locdata)
2701     Mem_ZoneReset(sw->Index->totalLocZone);
2702    
2703     }
2704    

  ViewVC Help
Powered by ViewVC 1.1.22