/[MITgcm]/mitgcm.org/devel/buildweb/pkg/swish-e/src/index.c
ViewVC logotype

Contents of /mitgcm.org/devel/buildweb/pkg/swish-e/src/index.c

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.1.1.1 - (show annotations) (download) (vendor branch)
Fri Sep 20 19:47:29 2002 UTC (22 years, 10 months ago) by adcroft
Branch: Import, MAIN
CVS Tags: baseline, HEAD
Changes since 1.1: +0 -0 lines
File MIME type: text/plain
Error occurred while calculating annotation data.
Importing web-site building process.

1 /*
2 $Id: index.c,v 1.194 2002/08/29 13:59:48 jmruiz Exp $
3 **
4 ** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
5 ** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
6 **
7 ** This program and library is free software; you can redistribute it and/or
8 ** modify it under the terms of the GNU (Library) General Public License as published by the Free Software Foundation; either version 2
9 ** of the License, or any later version.
10 **
11 ** This program is distributed in the hope that it will be useful,
12 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
13 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 ** GNU (Library) General Public License for more details.
15 **
16 ** You should have received a copy of the GNU (Library) General Public License
17 ** along with this program; if not, write to the Free Software
18 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 **--------------------------------------------------------------------
20 ** ** ** PATCHED 5/13/96, CJC
21 **
22 ** Added code to countwords and countwordstr to disregard the last char
23 ** if required by the config.h
24 ** G. Hill 3/12/97 ghill@library.berkeley.edu
25 **
26 ** Changed addentry, countwords, countwordstr, parsecomment, rintindex
27 ** added createMetaEntryList, getMeta, parseMetaData
28 ** to support METADATA
29 ** G. Hill 3/18/97 ghill@library.berkeley.edu
30 **
31 ** Changed removestops to support printing of stop words
32 ** G. Hill 4/7/97
33 **
34 ** Changed countwords, countwordstr, and parseMetaData to disregard the
35 ** first char if required by the config.h
36 ** G.Hill 10/16/97 ghill@library.berkeley.edu
37 **
38 ** Added stripIgnoreLastChars and isIgnoreLastChar routines which iteratively
39 ** remove all ignore characters from the end of each word.
40 ** P. Bergner 10/5/97 bergner@lcse.umn.edu
41 **
42 ** Added stripIgnoreFirstChars and isIgnoreFirstChar to make stripping of
43 ** the ignore first chars iterative.
44 ** G. Hill 11/19/97 ghill@library.berkeley.edu
45 **
46 ** Added possibility of use of quotes and brackets in meta CONTENT countwords, parsemetadata
47 ** G. Hill 1/14/98
48 **
49 ** Added regex for replace rule G.Hill 1/98
50 **
51 ** REQMETANAME - don't index meta tags not specified in MetaNames
52 ** 10/11/99 - Bill Moseley
53 **
54 ** change sprintf to snprintf to avoid corruption, use MAXPROPLEN instead of literal "20",
55 ** added include of merge.h - missing declaration caused compile error in prototypes,
56 ** added word length arg to Stem() call for strcat overflow checking in stemmer.c
57 ** added safestrcpy() macro to avoid corruption from strcpy overflow
58 ** SRE 11/17/99
59 **
60 ** fixed misc problems pointed out by "gcc -Wall"
61 ** SRE 2/22/00
62 **
63 ** Added code for storing word positions in index file
64 ** Jose Ruiz 3/00 jmruiz@boe.es
65 **
66 ** 04/00 - Jose Ruiz
67 ** Added code for a hash table in index file for searching words
68 ** via getfileinfo in search.c (Lots of addons). Better performance
69 ** with big databases and/or heavy searches (a* or b* or c*)
70 **
71 ** 04/00 - Jose Ruiz
72 ** Improved number compression function (compress)
73 ** New number decompress function
74 ** Both converted into macros for better performance
75 **
76 ** 07/00 and 08/00 - Jose Ruiz
77 ** Many modifications to make some functions thread safe
78 **
79 ** 08/00 - Jose Ruiz
80 ** New function indexstring. Up to now there were 4 functions doing almost
81 ** the same thing: countwords, countwordstr, parseMetaData and parsecomment
82 ** From now on, these 4 functions call indexstring, which is the common part
83 ** to all of them. In fact, countwordstr, parseMetaData and parsecomment
84 ** are now simple frontends to indexstring
85 **
86 ** 2000-11 - rasc
87 ** some redesign, place common index code into a common routine
88 ** FileProp structures, routines
89 **
90 ** --
91 ** TODO
92 ** $$ there still has to be some redesign done.
93 ** $$ swish-e was originally designed to index html only. So the routines
94 ** $$ are for historical reasons scattered
95 ** $$ (e.g. isoktitle (), is ishtml() etc.)
96 **
97 ** 2000-12 Jose Ruiz
98 ** obsolete routine ishtml removed
99 ** isoktitle moved to html.c
100 **
101 ** 2001-03-02 rasc Header: write translatecharacters
102 ** 2001-03-14 rasc resultHeaderOutput -H n
103 ** 2001-03-24 rasc timeroutines rearranged
104 ** 2001-06-08 wsm Store word after ENTRY to save memory
105 ** 2001-08 jmruiz All locations stuff rewritten to save memory
106 **
107 */
108
109 #include "swish.h"
110 #include "mem.h"
111 #include "string.h"
112 #include "index.h"
113 #include "hash.h"
114 #include "check.h"
115 #include "search.h"
116 #include "merge.h"
117 #include "docprop.h"
118 #include "stemmer.h"
119 #include "soundex.h"
120 #include "double_metaphone.h"
121 #include "error.h"
122 #include "file.h"
123 #include "compress.h"
124 /* Removed due to problems with patents
125 #include "deflate.h"
126 */
127 #include "html.h"
128 #include "xml.h"
129 #include "parser.h"
130 #include "txt.h"
131 #include "metanames.h"
132 #include "result_sort.h"
133 #include "result_output.h"
134 #include "filter.h"
135 #include "date_time.h"
136 #include "db.h"
137 #include "dump.h"
138 #include "swish_qsort.h"
139
140 static void index_path_parts( SWISH *sw, char *path, path_extract_list *list, INDEXDATAHEADER *header, docProperties **properties );
141
142
143
144 /*
145 -- init structures for this module
146 */
147
148
149 void initModule_Index (SWISH *sw)
150 {
151 int i;
152 struct MOD_Index *idx;
153
154 idx = (struct MOD_Index *) emalloc(sizeof(struct MOD_Index));
155 memset( idx, 0, sizeof( struct MOD_Index ) );
156 sw->Index = idx;
157
158 idx->filenum = 0;
159 idx->entryArray = NULL;
160
161 idx->len_compression_buffer = MAXSTRLEN; /* For example */
162 idx->compression_buffer=(unsigned char *)emalloc(idx->len_compression_buffer);
163
164 idx->len_worddata_buffer = MAXSTRLEN; /* For example */
165 idx->worddata_buffer=(unsigned char *)emalloc(idx->len_worddata_buffer);
166 idx->sz_worddata_buffer = 0;
167
168 /* Init entries hash table */
169 for (i=0; i<VERYBIGHASHSIZE; i++)
170 {
171 idx->hashentries[i] = NULL;
172 idx->hashentriesdirty[i] = 0;
173 }
174
175
176 /* Economic flag and temp files*/
177 idx->swap_locdata = SWAP_LOC_DEFAULT;
178
179
180 for(i=0;i<BIGHASHSIZE;i++) idx->inode_hash[i]=NULL;
181
182 /* initialize buffers used by indexstring */
183 idx->word = (char *) emalloc((idx->lenword = MAXWORDLEN) + 1);
184 idx->swishword = (char *) emalloc((idx->lenswishword = MAXWORDLEN) + 1);
185
186 idx->plimit=PLIMIT;
187 idx->flimit=FLIMIT;
188 idx->nIgnoreLimitWords = 0;
189 idx->IgnoreLimitPositionsArray = NULL;
190
191 /* Swapping access file functions */
192 idx->swap_tell = ftell;
193 idx->swap_write = fwrite;
194 idx->swap_close = fclose;
195 idx->swap_seek = fseek;
196 idx->swap_read = fread;
197 idx->swap_getc = fgetc;
198 idx->swap_putc = fputc;
199
200 for( i = 0; i <MAX_LOC_SWAP_FILES ; i++)
201 {
202 idx->swap_location_name[i] = NULL;
203 idx->fp_loc_write[i] = NULL;
204 idx->fp_loc_read[i] = NULL;
205 }
206 /* Index in blocks of chunk_size documents */
207 idx->chunk_size = INDEX_DEFAULT_CHUNK_SIZE;
208
209 /* Use this value to avoid using big zones just as a temporary location storage */
210 idx->optimalChunkLocZoneSize = INDEX_DEFAULT_OPTIMAL_CHUNK_ZONE_SIZE_FOR_LOCATIONS;
211
212 idx->freeLocMemChain = NULL;
213
214 /* memory zones for common structures */
215 idx->perDocTmpZone = Mem_ZoneCreate("Per Doc Temporal Zone", 0, 0);
216 idx->currentChunkLocZone = Mem_ZoneCreate("Current Chunk Locators", 0, 0);
217 idx->totalLocZone = Mem_ZoneCreate("All Locators", 0, 0);
218 idx->entryZone = Mem_ZoneCreate("struct ENTRY", 0, 0);
219
220 /* table for storing which metaIDs to index */
221 idx->metaIDtable.max = 200; /* totally random guess */
222 idx->metaIDtable.num = 0;
223 idx->metaIDtable.array = (int *)emalloc( idx->metaIDtable.max * sizeof(int) );
224 idx->metaIDtable.defaultID = -1;
225
226
227 /* $$$ this is only a fix while http.c and httpserver.c still exist */
228 idx->tmpdir = estrdup(".");
229
230 return;
231 }
232
233
234 /*
235 -- release all wired memory for this module
236 -- 2001-04-11 rasc
237 */
238
239 void freeModule_Index (SWISH *sw)
240 {
241 struct MOD_Index *idx = sw->Index;
242 int i;
243
244 /* we need to call the real free here */
245
246 for( i = 0; i < MAX_LOC_SWAP_FILES ; i++)
247 {
248 if (idx->swap_location_name[i] && isfile(idx->swap_location_name[i]))
249 {
250 if (idx->fp_loc_read[i])
251 idx->swap_close(idx->fp_loc_read[i]);
252
253 if (idx->fp_loc_write[i])
254 idx->swap_close(idx->fp_loc_write[i]);
255
256 remove(idx->swap_location_name[i]);
257 }
258
259
260 if (idx->swap_location_name[i])
261 efree(idx->swap_location_name[i]);
262 }
263
264 if(idx->tmpdir) efree(idx->tmpdir);
265
266 /* Free compression buffer */
267 efree(idx->compression_buffer);
268 /* free worddata buffer */
269 efree(idx->worddata_buffer);
270
271 /* free word buffers used by indexstring */
272 efree(idx->word);
273 efree(idx->swishword);
274
275 /* free IgnoreLimit stuff */
276 if(idx->IgnoreLimitPositionsArray)
277 {
278 for(i=0; i<sw->indexlist->header.totalfiles; i++)
279 {
280 if(idx->IgnoreLimitPositionsArray[i])
281 {
282 efree(idx->IgnoreLimitPositionsArray[i]->pos);
283 efree(idx->IgnoreLimitPositionsArray[i]);
284 }
285 }
286 efree(idx->IgnoreLimitPositionsArray);
287 }
288
289 /* should be free by now!!! But just in case... */
290 if (idx->entryZone)
291 Mem_ZoneFree(&idx->entryZone);
292
293 if (idx->totalLocZone)
294 Mem_ZoneFree(&idx->totalLocZone);
295 if (idx->currentChunkLocZone)
296 Mem_ZoneFree(&idx->currentChunkLocZone);
297 if (idx->perDocTmpZone)
298 Mem_ZoneFree(&idx->perDocTmpZone);
299
300
301 if ( idx->entryArray )
302 efree( idx->entryArray);
303
304
305 efree( idx->metaIDtable.array );
306
307 /* free module data */
308 efree (idx);
309 sw->Index = NULL;
310
311
312 return;
313 }
314
315
316 /*
317 ** ----------------------------------------------
318 **
319 ** Module config code starts here
320 **
321 ** ----------------------------------------------
322 */
323
324
325 /*
326 -- Config Directives
327 -- Configuration directives for this Module
328 -- return: 0/1 = none/config applied
329 */
330
331 int configModule_Index (SWISH *sw, StringList *sl)
332
333 {
334 struct MOD_Index *idx = sw->Index;
335 char *w0 = sl->word[0];
336 int retval = 1;
337 char *env_tmp = NULL;
338
339 if (strcasecmp(w0, "tmpdir") == 0)
340 {
341 if (sl->n == 2)
342 {
343 idx->tmpdir = erealloc( idx->tmpdir, strlen( sl->word[1] ) + 1 );
344 strcpy( idx->tmpdir, sl->word[1] );
345 normalize_path( idx->tmpdir );
346
347 if (!isdirectory(idx->tmpdir))
348 progerr("%s: %s is not a directory", w0, idx->tmpdir);
349
350 if ( !( env_tmp = getenv("TMPDIR")) )
351 if ( !(env_tmp = getenv("TMP")) )
352 env_tmp = getenv("TEMP");
353
354 if ( env_tmp )
355 progwarn("Configuration setting for TmpDir '%s' will be overridden by environment setting '%s'", idx->tmpdir, env_tmp );
356
357
358 }
359 else
360 progerr("%s: requires one value", w0);
361 }
362 else if (strcasecmp(w0, "IgnoreLimit") == 0)
363 {
364 if (sl->n == 3)
365 {
366 idx->plimit = atol(sl->word[1]);
367 idx->flimit = atol(sl->word[2]);
368 }
369 else
370 progerr("%s: requires two values", w0);
371 }
372 else
373 {
374 retval = 0; /* not a module directive */
375 }
376 return retval;
377 }
378
379 /**************************************************************************
380 * Remove a file from the index. Used when the parser aborts
381 * while indexing. Typically because of FileRules.
382 *
383 **************************************************************************/
384
385
386 static void remove_last_file_from_list(SWISH * sw, IndexFILE * indexf)
387 {
388 struct MOD_Index *idx = sw->Index;
389 int i;
390 ENTRY *ep, *prev_ep;
391 LOCATION *l;
392
393 /* Decrease filenum */
394 idx->filenum--;
395 indexf->header.totalfiles--;
396
397 /* Should be removed */
398 if(idx->filenum < 0 || indexf->header.totalfiles < 0)
399 progerr("Internal error in remove_last_file_from_list");
400
401
402 /* walk the hash list to remove words */
403 for (i = 0; i < VERYBIGHASHSIZE; i++)
404 {
405 if (idx->hashentriesdirty[i])
406 {
407 idx->hashentriesdirty[i] = 0;
408 for (ep = idx->hashentries[i], prev_ep =NULL; ep; ep = ep->next)
409 {
410 if(ep->currentChunkLocationList)
411 {
412 /* First of all - Adjust tfrequency */
413 for(l = ep->currentChunkLocationList; l; l = l->next)
414 {
415 ep->tfrequency--;
416 }
417 /* Remove locations */
418 /* Do not use efree, locations uses a MemZone (currentChunkLocZone) */
419 /* Will be freed later */
420 ep->currentChunkLocationList = NULL;
421 ep->currentlocation = NULL;
422 /* If there is no locations we must also remove the word */
423 /* Do not call efree to remove the entry, entries use
424 ** a MemZone (perDocTmpZone) - Will be freed later */
425 if(!ep->allLocationList)
426 {
427 if(!prev_ep)
428 {
429 idx->hashentries[i] = ep->next;
430 }
431 else
432 {
433 prev_ep->next = ep->next;
434 }
435 /* Adjust word counters */
436 idx->entryArray->numWords--;
437 indexf->header.totalwords--;
438 }
439 }
440 else
441 {
442 prev_ep = ep;
443 }
444 }
445 }
446 }
447 }
448
449
450
451 /**************************************************************************
452 * Index just the file name (or the title) for NoContents files
453 * $$$ this can be removed if libxml2 is used full time
454 **************************************************************************/
455 static int index_no_content(SWISH * sw, FileProp * fprop, FileRec *fi, char *buffer)
456 {
457 struct MOD_Index *idx = sw->Index;
458 char *title = "";
459 int n;
460 int position = 1; /* Position of word */
461 int metaID = 1; /* THIS ASSUMES that that's the default ID number */
462
463
464 /* Look for title if HTML document */
465
466 if (fprop->doctype == HTML)
467 {
468 title = parseHTMLtitle( sw , buffer );
469
470 if (!isoktitle(sw, title))
471 return -2; /* skipped because of title */
472 }
473
474
475 #ifdef HAVE_LIBXML2
476 if (fprop->doctype == HTML2)
477 return parse_HTML( sw, fprop, fi, buffer );
478 #endif
479
480
481 addCommonProperties( sw, fprop, fi, title, NULL, 0 );
482
483
484 n = indexstring( sw, *title == '\0' ? fprop->real_path : title , idx->filenum, IN_FILE, 1, &metaID, &position);
485
486
487 /** ??? $$$ doesn't look right -- check this ***/
488 if ( *title != '\0' )
489 efree( title );
490
491 return n;
492 }
493
494
495 /*********************************************************************
496 ** 2001-08 jmruiz - A couple of specialized routines to be used with
497 ** locations and MemZones. The main goal is avoid malloc/realloc/free
498 ** wich produces a lot of fragmentation
499 **
500 ** The memory will be allocated in blocks of LOC_BLOCK_SIZE bytes inside a zone.
501 ** (I have tried both 32 and 64; 32 looks fine.)
502 ** In this way, there is some overhead because when a new block is
503 ** requested from the MemZone, the space is not recovered. But this
504 ** is only true for the current document because the MemZone is reset
505 ** once the document is processed. Then, the space is recovered
506 ** after a MemZoneReset is issued
507 **
508 ** 2001-09 jmruiz Improved. Now unused space is recovered when asking
509 ** for space. Free blocks are maintained using a linked list
510 ********************************************************************/
511
/* Allocation granularity for LOCATION blocks.
** Must be greater than sizeof(LOCATION) and a power of 2 */
#define LOC_BLOCK_SIZE 32

/* sizeof(LOCATION) rounded up to the next multiple of LOC_BLOCK_SIZE */
#define LOC_MIN_SIZE (((sizeof(LOCATION) + LOC_BLOCK_SIZE - 1) / LOC_BLOCK_SIZE) * LOC_BLOCK_SIZE)

/* Node of the list of free location blocks. It is written over the
** start of a released block, so it needs no storage of its own. */
struct loc_chain {
    struct loc_chain *next;     /* next free block, NULL at the end */
    int size;                   /* size of this free block in bytes */
};
519
520 /********************************************************************
521 ** 2001-08 jmruiz
522 ** Routine to allocate `size` bytes for a LOCATION, reusing blocks from
523 ** the free chain (idx->freeLocMemChain) whenever possible.
524 **
525 ** Strategy, in order:
526 **  1. a free block of exactly `size` bytes is unlinked and returned;
527 **  2. a run of free blocks that are adjacent both in the chain and in
**     memory, and whose sizes add up to exactly `size`, is unlinked and
**     returned as one block;
**  3. the last larger free block seen is halved repeatedly until a
**     piece of exactly `size` bytes is produced;
**  4. otherwise new memory is taken from idx->currentChunkLocZone.
**
** Blocks are requested in multiples of LOC_BLOCK_SIZE, so some space is
** lost. The advantage is that we do not need to reallocate so often.
**
** Call with:
**   *idx - module data holding the free chain and the zone
**   size - number of bytes wanted
** Returns:
**   pointer to the block; its contents are not initialised here
528 ********************************************************************/
529
530 LOCATION *alloc_location(struct MOD_Index *idx,int size)
531 {
532 struct loc_chain *tmp = (struct loc_chain *) idx->freeLocMemChain;
533 struct loc_chain *big = NULL;
/* tmp2 is the chain node visited before tmp (NULL while tmp is the head) */
/* NOTE(review): it is declared LOCATION * but points at loc_chain nodes, so
** tmp2->next assumes LOCATION and loc_chain agree on `next` - confirm */
534 LOCATION *tmp2 = NULL;
535 int avail = 0;
536 struct loc_chain *p_avail = NULL;
537
538 /* Search for a previously freed location of the same size */
539 while(tmp)
540 {
541 if(tmp->size == size)
542 {
/* Exact fit: unlink the node from the chain and hand it out */
543 if(!tmp2)
544 idx->freeLocMemChain = (LOCATION *)tmp->next;
545 else
546 tmp2->next = (LOCATION *)tmp->next;
547 return (LOCATION *)tmp;
548 }
549 else if(tmp->size > size)
550 {
551 /* Just reserve it to be used if we do not find a match */
/* (a later, larger block replaces an earlier one) */
552 big = tmp;
553 }
554 else
555 {
/* Too small on its own: remember where the run starts */
556 p_avail = tmp;
557 avail = tmp->size;
558 /* Check for following chain nodes that are also contiguous in memory */
559 while(((unsigned char *)tmp + tmp->size) == (unsigned char *)tmp->next)
560 {
561 avail += tmp->next->size;
562 if(avail == size)
563 {
/* The run p_avail .. tmp->next is exactly `size` bytes:
** unlink the whole run and return its first block */
564 if(!tmp2)
565 idx->freeLocMemChain = (LOCATION *)tmp->next->next;
566 else
567 tmp2->next = (LOCATION *)tmp->next->next;
568 return (LOCATION *)p_avail;
569 }
570 else if(avail > size)
571 {
/* The run overshoots the request - give up on it */
572 break;
573 }
574 else
575 {
/* Still too small - extend the run by one more node */
576 tmp = tmp->next;
577 }
578 }
579 }
/* Advance; tmp may already have been moved forward by the run scan */
580 tmp2 = (LOCATION *)tmp;
581 tmp = tmp->next;
582 }
583 /* Perhaps we have a block with greater size */
584 if(big)
585 {
586 /* Split it */
587 while(big->size > size)
588 {
/* Halve the block: `big` keeps the lower half, tmp is the upper half */
589 big->size >>= 1;
590 tmp = (struct loc_chain *) ((unsigned char *)big + big->size);
591 tmp->next = big->next;
592 tmp->size = big->size;
/* The upper half fits: return it; it was never linked into the chain */
593 if(tmp->size == size)
594 return (LOCATION *)tmp;
/* Otherwise chain the upper half after the lower one and keep splitting it */
595 big->next = tmp;
596 big = tmp;
597 }
598 }
599 /* NO memory in free chain of the same size - Asks for size */
600 return (LOCATION *)Mem_ZoneAlloc(idx->currentChunkLocZone, size);
601 }
602
603
604 LOCATION *new_location(struct MOD_Index *idx)
605 {
606 return (LOCATION *)alloc_location(idx, LOC_MIN_SIZE);
607 }
608
609
610 int is_location_full(int size)
611 {
612 int i;
613
614 /* Fast test. Since LOC_BLOCK_SIZE is the minimum size ... */
615 if(size % LOC_BLOCK_SIZE)
616 return 0; /* it is not a power of two */
617 /* Check if size is a power of 2 (32,64,128,256,...) in binary ..000100... */
618 for(i=LOC_BLOCK_SIZE;;i <<= 1)
619 {
620 if(size>i)
621 {
622 continue;
623 }
624 if((size & i) == size)
625 {
626 return 1;
627 }
628 else
629 {
630 break;
631 }
632 }
633 return 0;
634 }
635
636 /********************************************************************
637 ** 2001-08 jmruiz
638 ** Routine to reallocate memory inside a zone for a previous allocated
639 ** LOCATION (frequency > 1).
640 ** A new block is allocated only if the previous becomes full
641 ********************************************************************/
642 LOCATION *add_position_location(void *oldp, struct MOD_Index *idx, int frequency)
643 {
644 LOCATION *newp = NULL;
645 struct loc_chain *tmp = NULL;
646 int oldsize;
647
648 oldsize = sizeof(LOCATION) + (frequency - 1) * sizeof(int);
649
650 /* Check for available size in block */
651 if(is_location_full(oldsize))
652 {
653 /* Not enough size - Allocate a new block. Size rounded to LOC_BLOCK_SIZE */
654 newp = (LOCATION *)alloc_location(idx,oldsize << 1);
655 memcpy((void *)newp,(void *)oldp,oldsize);
656 /* Add old zone to the free chain of blocks */
657 tmp = (struct loc_chain *)oldp;
658 tmp->next = (struct loc_chain *)idx->freeLocMemChain;
659 tmp->size = oldsize;
660 idx->freeLocMemChain = (LOCATION *) tmp;
661 }
662 else
663 /* Enough size */
664 newp = oldp;
665
666 return newp;
667 }
668
669 /***********************************************************************
670 -- Start the real indexing process for a file.
671 -- This routine will be called by the different indexing methods
672 -- (httpd, filesystem, etc.)
673 -- The indexed file may be the
674 -- - real file on filesystem
675 -- - tmpfile or work file (shadow of the real file)
676 -- Checks if file has to be sent through a filter (file stream)
677 -- 2000-11-19 rasc
--
-- Call with:
--   *sw    - swish handle; uses sw->Index and the index in sw->indexlist
--   *fprop - properties of the file (paths, doctype, stream, filter flag)
--
-- Steps: skip files older than sw->mtime_limit, open the input, apply the
-- ReplaceRules to the path, read the document, pick and run the parser,
-- close the input, report, then either roll the file back (parser
-- returned -2 or -3) or write its properties and compress its locations.
678 ***********************************************************************/
679
680 void do_index_file(SWISH * sw, FileProp * fprop)
681 {
/* parser entry point selected from fprop->doctype further down */
682 int (*countwords)(SWISH *sw,FileProp *fprop, FileRec *fi, char *buffer);
683 IndexFILE *indexf = sw->indexlist;
684 int wordcount;
685 char *rd_buffer = NULL; /* complete file read into buffer */
686 struct MOD_Index *idx = sw->Index;
/* parser name, only used for the verbose message */
687 char strType[30];
688 int i;
689 FileRec fi; /* place to hold doc properties */
690
691 memset( &fi, 0, sizeof( FileRec ) );
692
693
694 wordcount = -1;
695
696
697
698 /* skip the file if its last_mod date is older than the check date */
699
700 if (sw->mtime_limit && fprop->mtime < sw->mtime_limit)
701 {
702 if (sw->verbose >= 3)
703 progwarn("Skipping %s: last_mod date is too old\n", fprop->real_path);
704
705 /* external program must seek past this data (fseek fails) */
706 if (fprop->fp)
707 flush_stream( fprop );
708
709 return;
710 }
711
712
713 /* Upon entry, if fprop->fp is non-NULL then it's already opened and ready to be read from.
714 This is the case with "prog" external programs, *except* when a filter is selected for the file type.
715 If a filter is used with "prog" a temporary file was created (fprop->work_file), and
716 fprop->fp will be NULL (as is with http and fs access methods).
717 2001-05-13 moseley
718 */
719
720
721
722 /* Get input file handle */
723 if (fprop->hasfilter)
724 {
725 fprop->fp = FilterOpen(fprop);
726
727 /* This should be checked in filteropen because the popen probably won't fail */
728 if ( !fprop->fp )
729 progerr("Failed to open filter for file '%s'",fprop->real_path);
730 }
731
732 else if ( !fprop->fp )
733 {
734 fprop->fp = fopen(fprop->work_path, F_READ_TEXT );
735
736 if ( !fprop->fp )
737 {
738 progwarnno("Failed to open: '%s': ", fprop->work_path);
739 return;
740 }
741 }
742 else /* Already open - flag to prevent closing the stream used with "prog" */
743 fprop->external_program++;
744
745
746
747
748 /** Replace the path for ReplaceRules **/
749
750 if ( sw->replaceRegexps )
751 {
/* `matched` is required by the call but its value is not used here */
752 int matched = 0;
753 fprop->real_path = process_regex_list( fprop->real_path, sw->replaceRegexps, &matched );
754 }
755
756
757
758 /** Read the buffer, if not a stream parser **/
759 #ifdef HAVE_LIBXML2
760 if ( fprop->doctype == HTML2 || fprop->doctype == XML2 || fprop->doctype == TXT2 )
761 rd_buffer = NULL;
762 else
763 #endif
764 /* -- Read all data (len = 0 if filtered...) */
/* NOTE(review): rd_buffer is never released in this function; presumably
** read_stream or the parser owns it - confirm */
765 rd_buffer = read_stream(sw, fprop->real_path, fprop->fp, (fprop->hasfilter) ? 0 : fprop->fsize, sw->truncateDocSize);
766
767
768 /* just for fun so we can show total bytes shown */
769 sw->indexlist->total_bytes += fprop->fsize;
770
771
772 /* Set which parser to use */
/* (unknown document types fall back to the HTML parser) */
773
774 switch (fprop->doctype)
775 {
776
777 case TXT:
778 strcpy(strType,"TXT");
779 countwords = countwords_TXT;
780 break;
781
782 case HTML:
783 strcpy(strType,"HTML");
784 countwords = countwords_HTML;
785 break;
786
787 case XML:
788 strcpy(strType,"XML");
789 countwords = countwords_XML;
790 break;
791
792 #ifdef HAVE_LIBXML2
793 case XML2:
794 strcpy(strType,"XML2");
795 countwords = parse_XML;
796 break;
797
798 case HTML2:
799 strcpy(strType,"HTML2");
800 countwords = parse_HTML;
801 break;
802
803 case TXT2:
804 strcpy(strType,"TXT2");
805 countwords = parse_TXT;
806 break;
807 #endif
808
809 case WML:
810 strcpy(strType,"WML");
811 countwords = countwords_HTML;
812 break;
813
814 default:
815 strcpy(strType,"DEFAULT (HTML)");
816 countwords = countwords_HTML;
817 break;
818 }
819
820 if (sw->verbose >= 3)
821 printf(" - Using %s parser - ",strType);
822
823
824 /* Check for NoContents flag and just save the path name */
825 /* $$$ Note, really need to only read_stream if reading from a pipe. */
826 /* $$$ waste of disk IO and memory if reading from file system */
827
/* This replaces the parser chosen above; the verbose message printed
** earlier still names the doctype parser */
828 if (fprop->index_no_content)
829 countwords = index_no_content;
830
831
832 /* Make sure all meta flags are cleared (in case a parser aborts) */
833 ClearInMetaFlags( &indexf->header );
834
835
836
837
838 /* Now bump the file counter */
/* (undone by remove_last_file_from_list when the parser returns -2 or -3) */
839 idx->filenum++;
840 indexf->header.totalfiles++; /* why ??? is this needed */
841 fi.filenum = idx->filenum;
842
843 /** PARSE **/
/* >= 0 is the number of words indexed; negative values are the skip
** codes reported in the verbose block below */
844 wordcount = countwords(sw, fprop, &fi, rd_buffer);
845
846
847
848
849
850 if (!fprop->external_program) /* external_program is not set if a filter is in use */
851 {
852 if (fprop->hasfilter)
853 FilterClose(fprop->fp); /* close filter pipe - should the filter be flushed? */
854 else
855 fclose(fprop->fp); /* close file */
856 }
857 /* Else, it's -S prog so make sure we read all the bytes we are supposed to read! */
858 /* Can remove the check for fprop->bytes_read once read_stream is no longer used */
859
860 else if ( fprop->bytes_read && fprop->bytes_read < fprop->fsize )
861 flush_stream( fprop );
862
863
864 if (sw->verbose >= 3)
865 {
866 if (wordcount > 0)
867 printf(" (%d words)\n", wordcount);
868 else if (wordcount == 0)
869 printf(" (no words indexed)\n");
870 else if (wordcount == -1)
871 printf(" (not opened)\n");
872 else if (wordcount == -2)
873 printf(" (Skipped due to 'FileRules title' setting)\n");
874 else if (wordcount == -3)
875 printf(" (Skipped due to Robots Excluion Rule in meta tag)\n");
876 fflush(stdout);
877 }
878
879
880 /* If indexing aborted, remove the last file entry */
/* NOTE(review): this path returns without resetting perDocTmpZone -
** confirm that is intended */
881 if ( wordcount == -3 || wordcount == -2 )
882 {
883 remove_last_file_from_list( sw, indexf );
884 return;
885 }
886
887
888 /* Stop here if the file was not indexed */
/* NOTE(review): for other negative results (e.g. -1) the file counters
** bumped above are not rolled back - confirm that is intended */
889 if ( wordcount < 0 )
890 return;
891
892
893 if ( DEBUG_MASK & DEBUG_PROPERTIES )
894 dump_file_properties( indexf, &fi );
895
896
897 /* write properties to disk, and release docprop array (and the prop index array) */
898 /* Currently this just passes sw, and assumes only one index file when indexing */
899 WritePropertiesToDisk( sw , &fi );
900
901
902 /* Save total words per file */
903 if ( !indexf->header.ignoreTotalWordCountWhenRanking )
904 {
905
/* file numbers start at 1, hence the zero-based index filenum - 1 */
906 setTotalWordsPerFile(sw, indexf, fi.filenum - 1,wordcount);
907 }
908
909
910
911
912 /* Compress the entries */
913 {
914 ENTRY *ep;
915
916 /* walk the hash list, and compress entries */
/* only buckets flagged dirty (by getentry) while parsing this file are visited */
917 for (i = 0; i < VERYBIGHASHSIZE; i++)
918 {
919 if (idx->hashentriesdirty[i])
920 {
921 idx->hashentriesdirty[i] = 0;
922 for (ep = idx->hashentries[i]; ep; ep = ep->next)
923 CompressCurrentLocEntry(sw, indexf, ep);
924 }
925 }
926
927 /* Coalesce word positions into a more optimal schema so that the location data need not be kept contiguous */
/* done every chunk_size files, or earlier when the chunk zone has grown
** beyond optimalChunkLocZoneSize */
928 if(idx->filenum && ((!(idx->filenum % idx->chunk_size)) || (Mem_ZoneSize(idx->currentChunkLocZone) > idx->optimalChunkLocZoneSize)))
929 {
930 for (i = 0; i < VERYBIGHASHSIZE; i++)
931 for (ep = idx->hashentries[i]; ep; ep = ep->next)
932 coalesce_word_locations(sw, indexf, ep);
933 /* Make zone available for reuse */
934 Mem_ZoneReset(idx->currentChunkLocZone);
/* the free chain pointed into the zone just reset, so forget it */
935 idx->freeLocMemChain = NULL;
936
937 }
938 }
939
940
941 /* Make zone available for reuse */
942 Mem_ZoneReset(idx->perDocTmpZone);
943
944
945 return;
946 }
947
948
949 ENTRY *getentry(SWISH * sw, char *word)
950 {
951 IndexFILE *indexf = sw->indexlist;
952 struct MOD_Index *idx = sw->Index;
953 int hashval;
954 ENTRY *e;
955
956 if (!idx->entryArray)
957 {
958 idx->entryArray = (ENTRYARRAY *) emalloc(sizeof(ENTRYARRAY));
959 idx->entryArray->numWords = 0;
960 idx->entryArray->elist = NULL;
961 }
962 /* Compute hash value of word */
963 hashval = verybighash(word);
964
965
966 /* Look for the word in the hash array */
967 for (e = idx->hashentries[hashval]; e; e = e->next)
968 if (strcmp(e->word, word) == 0)
969 break;
970
971 /* flag hash entry used this file, so that the locations can be "compressed" in do_index_file */
972 idx->hashentriesdirty[hashval] = 1;
973
974
975 /* Word found, return it */
976 if (e)
977 return e;
978
979 /* Word not found, so create a new word */
980
981 e = (ENTRY *) Mem_ZoneAlloc(idx->entryZone, sizeof(ENTRY) + strlen(word));
982 strcpy(e->word, word);
983 e->next = idx->hashentries[hashval];
984 idx->hashentries[hashval] = e;
985
986 /* Init values */
987 e->tfrequency = 0;
988 e->u1.last_filenum = 0;
989 e->currentlocation = NULL;
990 e->currentChunkLocationList = NULL;
991 e->allLocationList = NULL;
992
993 idx->entryArray->numWords++;
994 indexf->header.totalwords++;
995
996 return e;
997 }
998
999 /* Adds a word to the master index tree.
1000 */
1001
1002 void addentry(SWISH * sw, ENTRY *e, int filenum, int structure, int metaID, int position)
1003 {
1004 int found;
1005 LOCATION *tp, *newtp, *prevtp;
1006 IndexFILE *indexf = sw->indexlist;
1007 struct MOD_Index *idx = sw->Index;
1008
1009
1010 indexf->total_word_positions++;
1011
1012 if ( DEBUG_MASK & DEBUG_WORDS )
1013 {
1014 struct metaEntry *m = getMetaNameByID(&indexf->header, metaID);
1015
1016 printf(" Adding:[%d:%s(%d)] '%s' Pos:%d Stuct:0x%0X (", filenum, m ? m->metaName : "PROP_UNKNOWN", metaID, e->word, position, structure);
1017
1018 if ( structure & IN_EMPHASIZED ) printf(" EM");
1019 if ( structure & IN_HEADER ) printf(" HEADING");
1020 if ( structure & IN_COMMENTS ) printf(" COMMENT");
1021 if ( structure & IN_META ) printf(" META");
1022 if ( structure & IN_BODY ) printf(" BODY");
1023 if ( structure & IN_HEAD ) printf(" HEAD");
1024 if ( structure & IN_TITLE ) printf(" TITLE");
1025 if ( structure & IN_FILE ) printf(" FILE");
1026 printf(" )\n");
1027 }
1028
1029
1030 /* Check for first time */
1031 if(!e->tfrequency)
1032 {
1033 /* create a location record */
1034 tp = (LOCATION *) new_location(idx);
1035 tp->filenum = filenum;
1036 tp->frequency = 1;
1037 tp->metaID = metaID;
1038 tp->posdata[0] = SET_POSDATA(position,structure);
1039 tp->next = NULL;
1040
1041 e->currentChunkLocationList = tp;
1042 e->tfrequency = 1;
1043 e->u1.last_filenum = filenum;
1044
1045 return;
1046 }
1047
1048 /* Word found -- look for same metaID and filename */
1049 /* $$$ To do it right, should probably compare the structure, too */
1050 /* Note: filename not needed due to compress we are only looking at the current file */
1051 /* Oct 18, 2001 -- filename is needed since merge adds words in non-filenum order */
1052
1053 tp = e->currentChunkLocationList;
1054 found = 0;
1055
1056 while (tp != e->currentlocation)
1057 {
1058 if(tp->metaID == metaID && tp->filenum == filenum )
1059 {
1060 found =1;
1061 break;
1062 }
1063 tp = tp->next;
1064 }
1065
1066 /* matching metaID NOT found. So, add a new LOCATION record onto the word */
1067 /* This expands the size of the location array for this word by one */
1068
1069 if(!found)
1070 {
1071 /* create the new LOCATION entry */
1072 tp = (LOCATION *) new_location(idx);
1073 tp->filenum = filenum;
1074 tp->frequency = 1; /* count of times this word in this file:metaID */
1075 tp->metaID = metaID;
1076 tp->posdata[0] = SET_POSDATA(position,structure);
1077
1078 /* add the new LOCATION onto the array */
1079 tp->next = e->currentChunkLocationList;
1080 e->currentChunkLocationList = tp;
1081
1082 /* Count number of different files that this word is used in */
1083 if ( e->u1.last_filenum != filenum )
1084 {
1085 e->tfrequency++;
1086 e->u1.last_filenum = filenum;
1087 }
1088
1089 return; /* all done */
1090 }
1091
1092
1093 /* Otherwise, found matching LOCATION record (matches filenum and metaID) */
1094 /* Just add the position number onto the end by expanding the size of the LOCATION record */
1095
1096 /* 2001/08 jmruiz - Much better memory usage occurs if we use MemZones */
1097 /* MemZone will be reset when the doc is completely proccesed */
1098
1099 newtp = add_position_location(tp, idx, tp->frequency);
1100
1101 if(newtp != tp)
1102 {
1103 if(e->currentChunkLocationList == tp)
1104 e->currentChunkLocationList = newtp;
1105 else
1106 for(prevtp = e->currentChunkLocationList;;prevtp = prevtp->next)
1107 {
1108 if(prevtp->next == tp)
1109 {
1110 prevtp->next = newtp;
1111 break;
1112 }
1113 }
1114 tp = newtp;
1115 }
1116
1117 tp->posdata[tp->frequency++] = SET_POSDATA(position,structure);
1118
1119 }
1120
1121
1122 /*******************************************************************
1123 * Adds common file properties to the last entry in the file array
1124 * (which should be the current one)
1125 *
1126 *
1127 * Call with:
1128 * *SWISH - need for indexing words
1129 * *fprop
1130 * *fi
1131 * *summary - document summary (why here?)
1132 * start - start position of a sub-document
1133 * size - size in bytes of document
1134 *
1135 * Returns:
1136 * void
1137 *
1138 * Note:
1139 * Uses cached meta entries (created in metanames.c) to save the
1140 * metaEntry lookup by name costs
1141 *
1142 ********************************************************************/
1143
1144 void addCommonProperties( SWISH *sw, FileProp *fprop, FileRec *fi, char *title, char *summary, int start )
1145 {
1146 struct metaEntry *q;
1147 docProperties **properties = &fi->docProperties;
1148 unsigned long tmp;
1149 int metaID;
1150 INDEXDATAHEADER *header = &sw->indexlist->header;
1151 char *filename = fprop->real_path; /* should always have a path */
1152 int filenum = fi->filenum;
1153
1154
1155
1156 /* Check if filename is internal swish metadata -- should be! */
1157
1158 if ((q = getPropNameByName(header, AUTOPROPERTY_DOCPATH)))
1159 addDocProperty( properties, q, (unsigned char *)filename, strlen(filename),0);
1160
1161
1162 /* Perhaps we want it to be indexed ... */
1163 if ((q = getMetaNameByName(header, AUTOPROPERTY_DOCPATH)))
1164 {
1165 int metaID,
1166 positionMeta;
1167
1168 metaID = q->metaID;
1169 positionMeta = 1;
1170 indexstring(sw, filename, filenum, IN_FILE, 1, &metaID, &positionMeta);
1171 }
1172
1173
1174 /* This allows extracting out parts of a path and indexing as a separate meta name */
1175 if ( sw->pathExtractList )
1176 index_path_parts( sw, fprop->orig_path, sw->pathExtractList, header, properties );
1177
1178
1179
1180 /* Check if title is internal swish metadata */
1181 if ( title )
1182 {
1183 if ( (q = getPropNameByName(header, AUTOPROPERTY_TITLE)))
1184 addDocProperty(properties, q, (unsigned char *)title, strlen(title),0);
1185
1186
1187 /* Perhaps we want it to be indexed ... */
1188 if ( (q = getMetaNameByName(header, AUTOPROPERTY_TITLE)))
1189 {
1190 int positionMeta;
1191
1192 metaID = q->metaID;
1193 positionMeta = 1;
1194 indexstring(sw, title, filenum, IN_FILE, 1, &metaID, &positionMeta);
1195 }
1196 }
1197
1198
1199 if ( summary )
1200 {
1201 if ( (q = getPropNameByName(header, AUTOPROPERTY_SUMMARY)))
1202 addDocProperty(properties, q, (unsigned char *)summary, strlen(summary),0);
1203
1204
1205 if ( (q = getMetaNameByName(header, AUTOPROPERTY_SUMMARY)))
1206 {
1207 int metaID,
1208 positionMeta;
1209
1210 metaID = q->metaID;
1211 positionMeta = 1;
1212 indexstring(sw, summary, filenum, IN_FILE, 1, &metaID, &positionMeta);
1213 }
1214 }
1215
1216
1217
1218 /* Currently don't allow indexing by date or size or position */
1219
1220 /* mtime is a time_t, but we don't have an entry for NOT A TIME. Does anyone care about the first second of 1970? */
1221
1222 if ( fprop->mtime && (q = getPropNameByName(header, AUTOPROPERTY_LASTMODIFIED)))
1223 {
1224 tmp = (unsigned long) fprop->mtime;
1225 tmp = PACKLONG(tmp); /* make it portable */
1226 addDocProperty(properties, q, (unsigned char *) &tmp, sizeof(tmp),1);
1227 }
1228
1229 if ( (q = getPropNameByName(header, AUTOPROPERTY_DOCSIZE)))
1230 {
1231 tmp = (unsigned long) fprop->fsize;
1232 tmp = PACKLONG(tmp); /* make it portable */
1233 addDocProperty(properties, q, (unsigned char *) &tmp, sizeof(tmp),1);
1234 }
1235
1236
1237 if ( (q = getPropNameByName(header, AUTOPROPERTY_STARTPOS)))
1238 {
1239 tmp = (unsigned long) start;
1240 tmp = PACKLONG(tmp); /* make it portable */
1241 addDocProperty(properties, q, (unsigned char *) &tmp, sizeof(tmp),1);
1242 }
1243
1244 }
1245
1246
1247 /*******************************************************************
1248 * extracts out parts from a path name and indexes that part
1249 *
1250 ********************************************************************/
1251 static void index_path_parts( SWISH *sw, char *path, path_extract_list *list, INDEXDATAHEADER *header, docProperties **properties )
1252 {
1253 int metaID;
1254 int positionMeta = 1;
1255 int matched = 0; /* flag if any patterns matched */
1256
1257 while ( list )
1258 {
1259 char *str = process_regex_list( estrdup(path), list->regex, &matched );
1260
1261 if ( !matched )
1262 {
1263 /* use default? */
1264 if ( list->meta_entry->extractpath_default )
1265 {
1266 metaID = list->meta_entry->metaID;
1267 indexstring(sw, list->meta_entry->extractpath_default, sw->Index->filenum, IN_FILE, 1, &metaID, &positionMeta);
1268 }
1269 }
1270 else
1271 {
1272 struct metaEntry *q;
1273
1274 metaID = list->meta_entry->metaID;
1275 indexstring(sw, str, sw->Index->filenum, IN_FILE, 1, &metaID, &positionMeta);
1276
1277 if ((q = getPropNameByName(header, list->meta_entry->metaName )))
1278 addDocProperty( properties, q, (unsigned char *)str, strlen(str),0);
1279
1280
1281 efree( str );
1282 }
1283
1284 matched = 0;
1285 list = list->next;
1286 }
1287 }
1288
1289
1290 /* Just goes through the master list of files and
1291 ** counts 'em.
1292 */
1293
1294 int getfilecount(IndexFILE * indexf)
1295 {
1296 return indexf->header.totalfiles;
1297 }
1298
1299
1300
1301 /* Removes words that occur in over _plimit_ percent of the files and
1302 ** that occur in over _flimit_ files (marks them as stopwords, that is).
1303 */
1304 /* 05/00 Jose Ruiz
1305 ** Recompute positions when a stopword is removed from the lists.
1306 ** This piece of code is terrifying because the first goal
1307 ** was getting the best possible performance. So, the code is not
1308 ** very clear.
1309 ** The main problem is to recalculate word positions for all
1310 ** the words after removing the automatic stop words. This means
1311 ** looking at every word's positions for each automatic stop word
1312 ** and decrementing those positions.
1313 */
1314 /* 2001/02 jmruiz - rewritten - all the proccess is made in one pass to achieve
1315 better performance */
1316 /* 2001-08 jmruiz - rewritten - adapted to new locations and zone schema */
1317 /* 2002-07 jmruiz - rewritten - adapted to new -e schema */
1318
1319 int getNumberOfIgnoreLimitWords(SWISH *sw)
1320 {
1321 return sw->Index->nIgnoreLimitWords;
1322 }
1323
1324 void getPositionsFromIgnoreLimitWords(SWISH * sw)
1325 {
1326 int i,
1327 j,
1328 k,
1329 m,
1330 stopwords,
1331 percent,
1332 bytes_size,
1333 chunk_size,
1334 metaID,
1335 frequency,
1336 tmpval,
1337 filenum;
1338 int *positions;
1339 int local_positions[MAX_STACK_POSITIONS];
1340
1341 LOCATION *l, *next;
1342 ENTRY *ep,
1343 *ep2;
1344 ENTRY **estop = NULL;
1345 int estopsz = 0,
1346 estopmsz = 0;
1347 int totalwords;
1348 IndexFILE *indexf = sw->indexlist;
1349 int totalfiles = getfilecount(indexf);
1350 struct IgnoreLimitPositions **filepos = NULL;
1351 struct IgnoreLimitPositions *fpos;
1352 struct MOD_Index *idx = sw->Index;
1353 unsigned char *p, *q, *compressed_data, flag;
1354 int last_loc_swap;
1355
1356 stopwords = 0;
1357 totalwords = indexf->header.totalwords;
1358
1359 idx->nIgnoreLimitWords = 0;
1360 idx->IgnoreLimitPositionsArray = NULL;
1361
1362 if (!totalwords || idx->plimit >= NO_PLIMIT)
1363 return;
1364
1365 if (sw->verbose)
1366 {
1367 printf("\r Getting IgnoreLimit stopwords: ...");
1368 fflush(stdout);
1369 }
1370
1371
1372 if (!estopmsz)
1373 {
1374 estopmsz = 1;
1375 estop = (ENTRY **) emalloc(estopmsz * sizeof(ENTRY *));
1376 }
1377
1378
1379 /* this is the easy part: Remove the automatic stopwords from the hash array */
1380 /* Builds a list estop[] of ENTRY's that need to be removed */
1381
1382 for (i = 0; i < VERYBIGHASHSIZE; i++)
1383 {
1384 for (ep2 = NULL, ep = sw->Index->hashentries[i]; ep; ep = ep->next)
1385 {
1386 percent = (ep->tfrequency * 100) / totalfiles;
1387 if (percent >= idx->plimit && ep->tfrequency >= idx->flimit)
1388 {
1389 addStopList(&indexf->header, ep->word); /* For printing list of words */
1390 addstophash(&indexf->header, ep->word); /* Lookup hash */
1391 stopwords++;
1392 /* unlink the ENTRY from the hash */
1393 if (ep2)
1394 ep2->next = ep->next;
1395 else
1396 sw->Index->hashentries[i] = ep->next;
1397
1398 totalwords--;
1399 sw->Index->entryArray->numWords--;
1400 indexf->header.totalwords--;
1401
1402 /* Reallocte if more space is needed */
1403 if (estopsz == estopmsz)
1404 {
1405 estopmsz *= 2;
1406 estop = (ENTRY **) erealloc(estop, estopmsz * sizeof(ENTRY *));
1407 }
1408
1409 /* estop is an array of ENTRY's that need to be removed */
1410 estop[estopsz++] = ep;
1411 }
1412 else
1413 ep2 = ep;
1414 }
1415 }
1416
1417
1418
1419 /* If we have automatic stopwords we have to recalculate word positions */
1420
1421 if (estopsz)
1422 {
1423 /* Build an array with all the files positions to be removed */
1424 filepos = (struct IgnoreLimitPositions **) emalloc(totalfiles * sizeof(struct IgnoreLimitPositions *));
1425
1426 for (i = 0; i < totalfiles; i++)
1427 filepos[i] = NULL;
1428
1429 /* Compute bytes required for chunk location size. Eg: 4096 -> 2 bytes, 65535 -> 2 bytes */
1430 for(bytes_size = 0, i = COALESCE_BUFFER_MAX_SIZE; i; i >>= 8)
1431 bytes_size++;
1432
1433 /* Process each automatic stop word */
1434 for (i = 0; i < estopsz; i++)
1435 {
1436 ep = estop[i];
1437
1438 if (sw->verbose)
1439 {
1440 printf("\r Getting IgnoreLimit stopwords: %25s",ep->word);
1441 fflush(stdout);
1442 }
1443
1444 if(sw->Index->swap_locdata)
1445 {
1446 /* jmruiz - Be careful with this lines!!!! If we have a lot of words,
1447 ** probably this code can be very slow and may be rethought.
1448 ** Fortunately, only a few words must usually raise a IgnoreLimit option
1449 */
1450 last_loc_swap = (verybighash(ep->word) * (MAX_LOC_SWAP_FILES - 1)) / (VERYBIGHASHSIZE - 1);
1451 unSwapLocData(sw, last_loc_swap, ep );
1452 }
1453
1454 /* Run through location list to get positions */
1455 for(l=ep->allLocationList;l;)
1456 {
1457 compressed_data = (unsigned char *) l;
1458 /* Preserve next element */
1459 next = *(LOCATION **)compressed_data;
1460 /* Jump pointer to next element */
1461 p = compressed_data + sizeof(LOCATION *);
1462
1463 metaID = uncompress2(&p);
1464
1465 for(chunk_size = 0, k = 0, j = bytes_size - 1; k < bytes_size; k++, j--)
1466 chunk_size |= p[k] << (j * 8);
1467 p += bytes_size;
1468
1469 filenum = 0;
1470 while(chunk_size)
1471 { /* Read on all items */
1472 q = p;
1473 uncompress_location_values(&p,&flag,&tmpval,&frequency);
1474 filenum += tmpval;
1475
1476 if(frequency > MAX_STACK_POSITIONS)
1477 positions = (int *) emalloc(frequency * sizeof(int));
1478 else
1479 positions = local_positions;
1480
1481 uncompress_location_positions(&p,flag,frequency,positions);
1482
1483 chunk_size -= (p-q);
1484
1485 /* Now build the list by filenum of meta/position info */
1486
1487 if (!filepos[filenum - 1])
1488 {
1489 fpos = (struct IgnoreLimitPositions *) emalloc(sizeof(struct IgnoreLimitPositions));
1490 fpos->pos = (int *) emalloc(frequency * 2 * sizeof(int));
1491 fpos->n = 0;
1492 filepos[filenum - 1] = fpos;
1493 }
1494 else /* file exists in array. just append the meta and position data */
1495 {
1496 fpos = filepos[filenum - 1];
1497 fpos->pos = (int *) erealloc(fpos->pos, (fpos->n + frequency) * 2 * sizeof(int));
1498 }
1499
1500 for (m = fpos->n * 2, k = 0; k < frequency; k++)
1501 {
1502 fpos->pos[m++] = metaID;
1503 fpos->pos[m++] = GET_POSITION(positions[k]);
1504 }
1505
1506 fpos->n += frequency;
1507
1508 if(positions != local_positions)
1509 efree(positions);
1510 }
1511 l = next;
1512 }
1513 if(sw->Index->swap_locdata)
1514 Mem_ZoneReset(idx->totalLocZone);
1515 }
1516
1517 /* sort each file sort entries by metaname/position */
1518 for (i = 0; i < totalfiles; i++)
1519 {
1520 if (filepos[i])
1521 swish_qsort(filepos[i]->pos, filepos[i]->n, 2 * sizeof(int), &icomp2);
1522 }
1523 }
1524
1525 idx->nIgnoreLimitWords = estopsz;
1526 idx->IgnoreLimitPositionsArray = filepos;
1527
1528 if (sw->verbose)
1529 {
1530 printf("\r Getting IgnoreLimit stopwords: Complete \n");
1531 fflush(stdout);
1532 }
1533
1534
1535 }
1536
1537 /* 2001-08 jmruiz - Adjust positions if there was IgnoreLimit stopwords
1538 ** In all cases, removes null end of chunk marks
**
** Rewrites the compressed data of one word in place.  Location items are
** read at p, optionally corrected, and written back at q, which starts
** level with p.
**
** Call with:
**   worddata    - buffer holding: compressed tfrequency, compressed metaID,
**                 a packed long with the offset of the next metaID section,
**                 then the location items
**   sz_worddata - in: number of bytes used in worddata
**                 out: number of bytes used after the rewrite
**   n_files     - only tested for non-zero before ilp is consulted
**   ilp         - array indexed by (filenum - 1); each non-NULL element
**                 holds n pairs pos[2*i] = metaID, pos[2*i+1] = position
**                 (presumably the array built by
**                 getPositionsFromIgnoreLimitWords() - confirm at caller).
**                 May be NULL.
**
** Returns:
**   void - worddata and *sz_worddata are updated
*/
1539 void adjustWordPositions(unsigned char *worddata, int *sz_worddata, int n_files, struct IgnoreLimitPositions **ilp)
1540 {
1541 int frequency,
1542 metaID,
1543 tmpval,
1544 r_filenum,
1545 w_filenum,
1546 *posdata;
1547 int i,j,k;
1548 unsigned long r_nextposmeta;
1549 unsigned char *w_nextposmeta;
1550 int local_posdata[MAX_STACK_POSITIONS];
1551 unsigned char r_flag, *w_flag;
1552 unsigned char *p, *q;
1553
1554 p = worddata;
1555
1556 tmpval = uncompress2(&p); /* tfrequency */
1557 metaID = uncompress2(&p); /* metaID */
/* r_nextposmeta: offset (from worddata) where the next metaID section starts */
/* w_nextposmeta: where that offset must be written back for this section */
1558 r_nextposmeta = UNPACKLONG2(p);
1559 w_nextposmeta = p;
1560 p += sizeof(long);
1561
/* q is the write cursor, p the read cursor */
1562 q = p;
1563 r_filenum = w_filenum = 0;
1564 while(1)
1565 { /* Read on all items */
1566 uncompress_location_values(&p,&r_flag,&tmpval,&frequency);
/* file numbers are stored as increments over the previous item */
1567 r_filenum += tmpval;
1568
/* small position lists use the stack buffer, larger ones are allocated */
1569 if(frequency <= MAX_STACK_POSITIONS)
1570 posdata = local_posdata;
1571 else
1572 posdata = (int *) emalloc(frequency * sizeof(int));
1573
1574 uncompress_location_positions(&p,r_flag,frequency,posdata);
1575
1576 if(n_files && ilp && ilp[r_filenum - 1])
1577 {
/* find the first pair whose metaID is not lower than the current one */
/* NOTE(review): this relies on the pairs being ordered by metaID - confirm */
1578 for(i = 0; i < ilp[r_filenum - 1]->n; i++)
1579 {
1580 tmpval = ilp[r_filenum - 1]->pos[2 * i];
1581 if( tmpval >= metaID)
1582 break;
1583 }
1584 if(tmpval == metaID)
1585 {
/* for each position of this word, count the pairs with the same metaID */
/* whose position is not greater, and move the word down by that count */
1586 for(j = 0; j < frequency ; j++)
1587 {
1588 for(k = i; k < ilp[r_filenum - 1]->n ; k++)
1589 {
1590 if(ilp[r_filenum - 1]->pos[2 * k] != metaID ||
1591 ilp[r_filenum - 1]->pos[2 * k + 1] > GET_POSITION(posdata[j]))
1592 break; /* End */
1593 }
1594 posdata[j] = SET_POSDATA(GET_POSITION(posdata[j]) - (k-i), GET_STRUCTURE(posdata[j]));
1595 }
1596 }
1597 }
1598 /* Store the filenum incrementally to save space */
1599 compress_location_values(&q,&w_flag,r_filenum - w_filenum,frequency, posdata);
1600 w_filenum = r_filenum;
1601
1602 /* store positions */
1603 compress_location_positions(&q,w_flag,frequency,posdata);
1604
1605 if(posdata != local_posdata)
1606 efree(posdata);
1607
/* the mark is skipped on input and not copied to the output; only the */
/* read side restarts its file number, the write side keeps counting   */
1608 if(!p[0]) /* End of chunk mark */
1609 {
1610 r_filenum = 0; /* reset filenum */
1611 p++;
1612 }
1613 if ((p - worddata) == *sz_worddata)
1614 break; /* End of worddata */
1615
/* the read cursor reached the start of the next metaID section */
1616 if ((unsigned long)(p - worddata) == r_nextposmeta)
1617 {
/* data moved: store the new end offset of the section just written */
1618 if(q != p)
1619 PACKLONG2(q - worddata, w_nextposmeta);
1620
1621 metaID = uncompress2(&p);
1622 q = compress3(metaID,q);
1623
1624 r_nextposmeta = UNPACKLONG2(p);
1625 p += sizeof(long);
1626
/* reserve room for this section's offset; it is filled in later */
1627 w_nextposmeta = q;
1628 q += sizeof(long);
1629
1630 w_filenum = 0;
1631 }
1632 }
/* report the new size and close the last section with it */
1633 *sz_worddata = q - worddata;
1634 PACKLONG2(*sz_worddata, w_nextposmeta);
1635 }
1636
1637
1638
1639 /*
1640 ** This is an all new ranking algorithm. I can't say it is based on anything,
1641 ** but it does seem to be better than what was used before!
1642 ** 2001/05 wsm
1643 **
1644 ** Parameters:
1645 ** sw
1646 ** Pointer to SWISH structure
1647 **
1648 ** freq
1649 ** Number of times this word appeared in this file
1650 **
1651 ** tfreq
1652 ** Number of files this word appeared in this index (not used for ranking)
1653 **
1654 ** words
1655 ** Number of words in this file
1656 **
1657 ** structure
1658 ** Bit mask of context where this word appeared
1659 **
1660 ** ignoreTotalWordCount
1661 ** Ignore total word count when ranking (config file parameter)
1662 */
1663
1664
1665
1666 int entrystructcmp(const void *e1, const void *e2)
1667 {
1668 const ENTRY *ep1 = *(ENTRY * const *) e1;
1669 const ENTRY *ep2 = *(ENTRY * const *) e2;
1670
1671 return (strcmp(ep1->word, ep2->word));
1672 }
1673
1674
1675 /* Sorts the words */
1676 void sort_words(SWISH * sw, IndexFILE * indexf)
1677 {
1678 int i,
1679 j;
1680 ENTRY *e;
1681
1682
1683 if (!sw->Index->entryArray || !sw->Index->entryArray->numWords)
1684 return;
1685
1686
1687 if (sw->verbose)
1688 {
1689 printf("Sorting %d words alphabetically\n", sw->Index->entryArray->numWords );
1690 fflush(stdout);
1691 }
1692
1693 /* Build the array with the pointers to the entries */
1694 sw->Index->entryArray->elist = (ENTRY **) emalloc(sw->Index->entryArray->numWords * sizeof(ENTRY *));
1695
1696 /* Fill the array with all the entries */
1697 for (i = 0, j = 0; i < VERYBIGHASHSIZE; i++)
1698 for (e = sw->Index->hashentries[i]; e; e = e->next)
1699 sw->Index->entryArray->elist[j++] = e;
1700
1701 /* Sort them */
1702 swish_qsort(sw->Index->entryArray->elist, sw->Index->entryArray->numWords, sizeof(ENTRY *), &entrystructcmp);
1703 }
1704
1705
1706
1707 /* Sort chunk locations of entry e by metaID, filenum */
1708 void sortChunkLocations(SWISH * sw, IndexFILE * indexf, ENTRY * e)
1709 {
1710 int i,
1711 j,
1712 k,
1713 filenum,metaID,frequency;
1714 unsigned char flag;
1715 unsigned char *ptmp,
1716 *ptmp2,
1717 *compressed_data;
1718 int *pi = NULL;
1719 LOCATION *l, *prev = NULL, **lp;
1720
1721 /* Very trivial case */
1722 if (!e)
1723 return;
1724
1725 if(!e->currentChunkLocationList)
1726 return;
1727
1728 /* Get the number of locations in chunk */
1729 for(i = 0, l = e->currentChunkLocationList; l; i++)
1730 l=*(LOCATION **)l; /* Get next location */
1731
1732 /* Compute array wide */
1733 j = 2 * sizeof(int) + sizeof(void *);
1734
1735 /* Compute array size */
1736 ptmp = (void *) emalloc(j * i);
1737
1738 /* Build an array with the elements to compare
1739 and pointers to data */
1740
1741 for(l = e->currentChunkLocationList, ptmp2 = ptmp; l; )
1742 {
1743 pi = (int *) ptmp2;
1744
1745 compressed_data = (unsigned char *)l;
1746 /* Jump next offset */
1747 compressed_data += sizeof(LOCATION *);
1748
1749 metaID = uncompress2(&compressed_data);
1750 uncompress_location_values(&compressed_data,&flag,&filenum,&frequency);
1751 pi[0] = metaID;
1752 pi[1] = filenum;
1753 ptmp2 += 2 * sizeof(int);
1754
1755 lp = (LOCATION **)ptmp2;
1756 *lp = l;
1757 ptmp2 += sizeof(void *);
1758 /* Get next location */
1759 l=*(LOCATION **)l; /* Get next location */
1760 }
1761
1762 /* Sort them */
1763 swish_qsort(ptmp, i, j, &icomp2);
1764
1765 /* Store results */
1766 for (k = 0, ptmp2 = ptmp; k < i; k++)
1767 {
1768 ptmp2 += 2 * sizeof(int);
1769
1770 l = *(LOCATION **)ptmp2;
1771 if(!k)
1772 e->currentChunkLocationList = l;
1773 else
1774 prev->next =l;
1775 ptmp2 += sizeof(void *);
1776 prev = l;
1777 }
1778 l->next =NULL;
1779
1780 /* Free the memory of the array */
1781 efree(ptmp);
1782 }
1783
1784 void coalesce_all_word_locations(SWISH * sw, IndexFILE * indexf)
1785 {
1786 int i;
1787 ENTRY *epi;
1788
1789 for (i = 0; i < VERYBIGHASHSIZE; i++)
1790 {
1791 if ((epi = sw->Index->hashentries[i]))
1792 {
1793 while (epi)
1794 {
1795 coalesce_word_locations(sw, indexf, epi);
1796 epi = epi->next;
1797 }
1798 }
1799 }
1800
1801 }
1802
1803 /* Write the index entries that hold the word, rank, and other information.
1804 */
1805
1806
1807 #ifndef USE_BTREE
/*
** Writes all words of the index to indexf->DB (variant built when USE_BTREE
** is not defined).  Three passes are made:
**
**   1. over ep->elist[0 .. numWords-1]: write_word() for every word that is
**      not a stopword; stopwords get u1.wordID = -1 instead
**   2. over the hash table: DB_WriteWordHash() for entries with wordID > 0
**   3. over the hash table: build_worddata() + write_worddata() for entries
**      with wordID > 0; with swap_locdata set, the location data of each
**      swap file is loaded first and every entry is passed to
**      sortSwapLocData()
**
** Returns at once when sw->Index->entryArray is NULL.
** Progress is printed only when sw->verbose is set.
**
** On exit the perDocTmpZone, currentChunkLocZone, entryZone and totalLocZone
** memory zones and ep->elist have been freed.
**
** NOTE(review): ep->elist is read without a check; presumably sort_words()
** has filled it before this is called - confirm at the caller.
*/
1808 void write_index(SWISH * sw, IndexFILE * indexf)
1809 {
1810 int i;
1811 ENTRYARRAY *ep;
1812 ENTRY *epi;
1813 int totalwords;
1814 int percent, lastPercent, n;
1815 int last_loc_swap;
1816
/* minimum change, in percent points, between two progress messages */
1817 #define DELTA 10
1818
1819
1820 if ( !(ep = sw->Index->entryArray ))
1821 return; /* nothing to do */
1822
1823
1824 totalwords = ep->numWords;
1825
1826 DB_InitWriteWords(sw, indexf->DB);
1827
1828 if (sw->verbose)
1829 {
1830 printf(" Writing word text: ...");
1831 fflush(stdout);
1832 }
1833
1834 /* This is not longer needed. So free it as soon as possible */
1835 Mem_ZoneFree(&sw->Index->perDocTmpZone);
1836
1837
1838 /* This is not longer needed. So free it as soon as possible */
1839 Mem_ZoneFree(&sw->Index->currentChunkLocZone);
1840
1841 /* If we are swaping locs to file, reset memory zone */
1842 if(sw->Index->swap_locdata)
1843 Mem_ZoneReset(sw->Index->totalLocZone);
1844
/* Pass 1: the word text, in elist order */
1845 n = lastPercent = 0;
1846 for (i = 0; i < totalwords; i++)
1847 {
1848 if ( sw->verbose && totalwords > 10000 ) // just some random guess
1849 {
1850 n++;
1851 percent = (n * 100)/totalwords;
1852 if (percent - lastPercent >= DELTA )
1853 {
1854 printf("\r Writing word text: %3d%%", percent );
1855 fflush(stdout);
1856 lastPercent = percent;
1857 }
1858 }
1859
1860 epi = ep->elist[i];
1861
1862 /* why check for stopwords here? removestopwords could have remove them */
1863 if (!isstopword(&indexf->header, epi->word))
1864 {
1865 /* Write word to index file */
1866 write_word(sw, epi, indexf);
1867 }
1868 else
1869 epi->u1.wordID = -1; /* flag as a stop word */
1870 }
1871
1872 if (sw->verbose)
1873 {
1874 printf("\r Writing word text: Complete\n" );
1875 printf(" Writing word hash: ...");
1876 fflush(stdout);
1877 }
1878
1879
1880
/* Pass 2: the word hash, in hash table order */
1881 n = lastPercent = 0;
1882 for (i = 0; i < VERYBIGHASHSIZE; i++)
1883 {
1884 if ( sw->verbose )
1885 {
1886 n++;
1887 percent = (n * 100)/VERYBIGHASHSIZE;
1888 if (percent - lastPercent >= DELTA )
1889 {
1890 printf("\r Writing word hash: %3d%%", percent );
1891 fflush(stdout);
1892 lastPercent = percent;
1893 }
1894 }
1895
1896
1897 if ((epi = sw->Index->hashentries[i]))
1898 {
1899 while (epi)
1900 {
1901 /* If it is not a stopword write it */
1902 if (epi->u1.wordID > 0)
1903 DB_WriteWordHash(sw, epi->word,epi->u1.wordID,indexf->DB);
1904 epi = epi->next;
1905 }
1906 }
1907 }
1908
1909 if (sw->verbose)
1910 {
1911 printf("\r Writing word hash: Complete\n" );
1912 printf(" Writing word data: ...");
1913 fflush(stdout);
1914 }
1915
1916
/* Pass 3: the word data, in hash table order */
/* last_loc_swap = -1 means that no swap file has been loaded yet */
/* NOTE(review): the chained assignment also starts n and lastPercent */
/* at -1 here, while the other passes start them at 0 */
1917 n = lastPercent = last_loc_swap = -1;
1918 for (i = 0; i < VERYBIGHASHSIZE; i++)
1919 {
1920 /* If we are in economic mode -e restore locations */
1921 if(sw->Index->swap_locdata)
1922 {
/* same bucket-to-swap-file formula as in the assignment below; */
/* a new file is loaded only when the computed number changes */
1923 if (((i * (MAX_LOC_SWAP_FILES - 1)) / (VERYBIGHASHSIZE - 1)) != last_loc_swap)
1924 {
1925 /* Free not longer needed memory */
1926 Mem_ZoneReset(sw->Index->totalLocZone);
1927 last_loc_swap = (i * (MAX_LOC_SWAP_FILES - 1)) / (VERYBIGHASHSIZE - 1);
1928 unSwapLocData(sw, last_loc_swap, NULL );
1929 }
1930 }
1931 if ((epi = sw->Index->hashentries[i]))
1932 {
1933 while (epi)
1934 {
1935 /* If we are in economic mode -e we must sort locations by metaID, filenum */
1936 if(sw->Index->swap_locdata)
1937 {
1938 sortSwapLocData(sw, epi);
1939 }
1940 if ( sw->verbose && totalwords > 10000 ) // just some random guess
1941 {
1942 n++;
1943 percent = (n * 100)/totalwords;
1944 if (percent - lastPercent >= DELTA )
1945 {
1946 printf("\r Writing word data: %3d%%", percent );
1947 fflush(stdout);
1948 lastPercent = percent;
1949 }
1950 }
1951 if (epi->u1.wordID > 0) /* Not a stopword */
1952 {
1953 build_worddata(sw, epi, indexf);
1954 write_worddata(sw, epi, indexf);
1955 }
1956 epi = epi->next;
1957 }
1958 }
1959 }
1960 if (sw->verbose)
1961 printf("\r Writing word data: Complete\n" );
1962
1963
1964 DB_EndWriteWords(sw, indexf->DB);
1965
1966 /* free all ENTRY structs at once */
1967 Mem_ZoneFree(&sw->Index->entryZone);
1968
1969 /* free all location compressed data */
1970 Mem_ZoneFree(&sw->Index->totalLocZone);
1971
1972 efree(ep->elist);
1973 }
1974
1975 #else
1976
/*
** Writes all words of the index to indexf->DB (variant built when USE_BTREE
** is defined).  A single pass is made over ep->elist[0 .. numWords-1]; for
** every word that is not a stopword the word data is built and then either
**
**   - merged with the data already stored for that word (read_worddata()
**     returned a non-zero word id): the old data is added, the old record
**     deleted, the merged data written and the word id updated, or
**   - written as a new word with write_worddata() and write_word()
**
** Returns at once when sw->Index->entryArray is NULL.
** Progress is printed only when sw->verbose is set.
**
** On exit the perDocTmpZone, currentChunkLocZone, entryZone and totalLocZone
** memory zones and ep->elist have been freed.
**
** NOTE(review): ep->elist is read without a check; presumably sort_words()
** has filled it before this is called - confirm at the caller.
*/
1977 void write_index(SWISH * sw, IndexFILE * indexf)
1978 {
1979 int i;
1980 ENTRYARRAY *ep;
1981 ENTRY *epi;
1982 int totalwords;
1983 int percent, lastPercent, n;
1984 long old_wordid;
/* buffer / sz_buffer receive the previously stored word data, if any */
1985 unsigned char *buffer =NULL;
1986 int sz_buffer = 0;
/* minimum change, in percent points, between two progress messages */
1987 #define DELTA 10
1988
1989
1990 if ( !(ep = sw->Index->entryArray ))
1991 return; /* nothing to do */
1992
1993 totalwords = ep->numWords;
1994
1995
1996 /* Write words */
1997 DB_InitWriteWords(sw, indexf->DB);
1998
1999 if (sw->verbose)
2000 {
2001 printf(" Writing word text: ...");
2002 fflush(stdout);
2003 }
2004
2005 /* This is not longer needed. So free it as soon as possible */
2006 Mem_ZoneFree(&sw->Index->perDocTmpZone);
2007
2008
2009 /* This is not longer needed. So free it as soon as possible */
2010 Mem_ZoneFree(&sw->Index->currentChunkLocZone);
2011
2012 /* If we are swaping locs to file, reset memory zone */
2013 if(sw->Index->swap_locdata)
2014 Mem_ZoneReset(sw->Index->totalLocZone);
2015
2016 n = lastPercent = 0;
2017 for (i = 0; i < totalwords; i++)
2018 {
2019 if ( sw->verbose && totalwords > 10000 ) // just some random guess
2020 {
2021 n++;
2022 percent = (n * 100)/totalwords;
2023 if (percent - lastPercent >= DELTA )
2024 {
2025 printf("\r Writing word text: %3d%%", percent );
2026 fflush(stdout);
2027 lastPercent = percent;
2028 }
2029 }
2030
2031 epi = ep->elist[i];
2032
2033 /* why check for stopwords here? removestopwords could have remove them */
2034 if (!isstopword(&indexf->header, epi->word))
2035 {
2036 /* Build worddata buffer */
2037 build_worddata(sw, epi, indexf);
2038 /* let's see if word is already in the index */
2039 old_wordid = read_worddata(sw, epi, indexf, &buffer, &sz_buffer);
2040 /* If exists, we have to add the new worddata buffer to the old one */
2041 if(old_wordid)
2042 {
2043 add_worddata(sw, epi, indexf, buffer, sz_buffer);
/* the old data is no longer needed once it has been added */
2044 efree(buffer);
2045 buffer = NULL;
2046 sz_buffer = 0;
2047 delete_worddata(sw, old_wordid, indexf);
2048 write_worddata(sw, epi, indexf);
2049 update_wordID(sw, epi, indexf);
2050 }
2051 else
2052 {
2053 /* Write word to index file */
2054 write_worddata(sw, epi, indexf);
2055 write_word(sw, epi, indexf);
2056 }
2057 }
2058 }
2059
2060 if (sw->verbose)
2061 {
2062 printf("\r Writing word text: Complete\n" );
2063 fflush(stdout);
2064 }
2065
2066
2067 DB_EndWriteWords(sw, indexf->DB);
2068
2069 /* free all ENTRY structs at once */
2070 Mem_ZoneFree(&sw->Index->entryZone);
2071
2072 /* free all location compressed data */
2073 Mem_ZoneFree(&sw->Index->totalLocZone);
2074
2075 efree(ep->elist);
2076 }
2077
2078
2079 #endif
2080
2081
2082
2083
2084 /* These 2 routines fix the problem when a word ends with mutiple
2085 ** IGNORELASTCHAR's (eg, qwerty'. ). The old code correctly deleted
2086 ** the ".", but didn't check if the new last character ("'") is also
2087 ** an ignore character.
2088 */
2089 void stripIgnoreLastChars(INDEXDATAHEADER *header, char *word)
2090 {
2091 int k,j,i = strlen(word);
2092
2093 /* Get rid of specified last char's */
2094 /* for (i=0; word[i] != '\0'; i++); */
2095 /* Iteratively strip off the last character if it's an ignore character */
2096 while ((i > 0) && (isIgnoreLastChar(header, word[--i])))
2097 {
2098 word[i] = '\0';
2099
2100 /* We must take care of the escaped characeters */
2101 /* Things like hello\c hello\\c hello\\\c can appear */
2102 for(j=0,k=i-1;k>=0 && word[k]=='\\';k--,j++);
2103
2104 /* j contains the number of \ */
2105 if(j%2) /* Remove the escape if even */
2106 {
2107 word[--i]='\0';
2108 }
2109 }
2110 }
2111
2112 void stripIgnoreFirstChars(INDEXDATAHEADER *header, char *word)
2113 {
2114 int j,
2115 k;
2116 int i = 0;
2117
2118 /* Keep going until a char not to ignore is found */
2119 /* We must take care of the escaped characeters */
2120 /* Things like \chello \\chello can appear */
2121
2122 while (word[i])
2123 {
2124 if(word[i]=='\\') /* Jump escape */
2125 k=i+1;
2126 else
2127 k=i;
2128 if(!word[k] || !isIgnoreFirstChar(header, word[k]))
2129 break;
2130 else
2131 i=k+1;
2132 }
2133
2134 /* If all the char's are valid, just return */
2135 if (0 == i)
2136 return;
2137 else
2138 {
2139 for (k = i, j = 0; word[k] != '\0'; j++, k++)
2140 {
2141 word[j] = word[k];
2142 }
2143 /* Add the NULL */
2144 word[j] = '\0';
2145 }
2146 }
2147
2148
2149
2150 static void addword( char *word, SWISH * sw, int filenum, int structure, int numMetaNames, int *metaID, int *word_position)
2151 {
2152 int i;
2153
2154 /* Add the word for each nested metaname. */
2155 for (i = 0; i < numMetaNames; i++)
2156 (void) addentry(sw, getentry(sw,word), filenum, structure, metaID[i], *word_position);
2157
2158 (*word_position)++;
2159 }
2160
2161
2162
2163
/* Gets the next white-space delimited word
**
** Call with:
**   buf     - address of the read pointer; advanced past leading white
**             space and past the word that was copied
**   word    - address of the output buffer; may be replaced by erealloc()
**   lenword - address of the buffer capacity in characters (the buffer
**             holds *lenword + 1 bytes); updated when the buffer grows
**
** Returns:
**   1 when a word was copied to *word (NUL terminated), 0 at end of input
**
** The capacity is doubled when the word does not fit.  A capacity of zero
** or less used to stay unchanged by the doubling, so writing continued
** past the buffer; such a capacity is now replaced by a small positive one.
*/
int     next_word( char **buf, char **word, int *lenword )
{
    int     len = 0;

    /* skip any whitespace */
    while ( **buf && isspace( (unsigned char) **buf) )
        (*buf)++;

    for ( ; **buf && !isspace( (unsigned char) **buf); (*buf)++ )
    {
        /* reallocate buffer, if needed */
        if ( len >= *lenword )
        {
            *lenword = ( *lenword > 0 ) ? *lenword * 2 : len + 16;
            *word = erealloc(*word, *lenword + 1);
        }

        (*word)[len++] = **buf;
    }

    if ( !len )
        return 0;

    (*word)[len] = '\0';
    return 1;
}
2195
2196 /* Gets the next non WordChars delimited word */
2197 /* Bumps position if needed */
2198 int next_swish_word(SWISH * sw, char **buf, char **word, int *lenword, int *word_position )
2199 {
2200 int i;
2201 IndexFILE *indexf = sw->indexlist;
2202 int bump_flag = 0;
2203
2204 /* skip non-wordchars and check for bump chars */
2205 while ( **buf && !iswordchar(indexf->header, **buf ) )
2206 {
2207 if (!bump_flag && isBumpPositionCounterChar(&indexf->header, (int) **buf))
2208 bump_flag++;
2209
2210 (*buf)++;
2211 }
2212
2213 i = 0;
2214 while ( **buf && iswordchar(indexf->header, **buf) )
2215 {
2216 /* It doesn't really make sense to have a WordChar that's also a bump char */
2217 if (!bump_flag && isBumpPositionCounterChar(&indexf->header, (int) **buf))
2218 bump_flag++;
2219
2220
2221 /* reallocate buffer, if needed */
2222 if ( i == *lenword )
2223 {
2224 *lenword *= 2;
2225 *word = erealloc(*word, *lenword + 1);
2226 }
2227
2228 (*word)[i++] = **buf;
2229 (*buf)++;
2230 }
2231
2232 /* If any bump chars were found then bump to prevent phrase matching */
2233 if ( bump_flag )
2234 (*word_position)++;
2235
2236 if ( i )
2237 {
2238 (*word)[i] = '\0';
2239 stripIgnoreLastChars(&indexf->header, *word);
2240 stripIgnoreFirstChars(&indexf->header, *word);
2241
2242 return *word ? 1 : 0;
2243 }
2244
2245 return 0;
2246 }
2247
2248 /******************************************************************
2249 * Build the list of metaIDs that need to be indexed
2250 *
2251 * Returns number of IDs found
2252 *
2253 *
2254 ******************************************************************/
2255 static int build_metaID_list( SWISH *sw )
2256 {
2257 struct MOD_Index *idx = sw->Index;
2258 METAIDTABLE *metas = &idx->metaIDtable;
2259 IndexFILE *indexf = sw->indexlist;
2260 INDEXDATAHEADER *header = &indexf->header;
2261 struct metaEntry *m;
2262 int i;
2263
2264
2265 /* cache the default metaID for speed */
2266 if ( metas->defaultID == -1 )
2267 {
2268 m = getMetaNameByName( header, AUTOPROPERTY_DEFAULT );
2269 metas->defaultID = m ? m->metaID : 0;
2270 }
2271
2272
2273 metas->num = 0;
2274
2275
2276 /* Would be smart to track number of metas flagged so not to loop through all for every lookup */
2277
2278 for ( i = 0; i < header->metaCounter; i++)
2279 {
2280 m = header->metaEntryArray[i];
2281
2282 if ( (m->metaType & META_INDEX) && m->in_tag )
2283 {
2284 if ( ++metas->num > metas->max )
2285 metas->array = (int *)erealloc( metas->array, (metas->max = metas->num + 200) );
2286
2287 metas->array[metas->num - 1] = m->metaID;
2288 }
2289 }
2290
2291 /* If no metas found to index, then add default metaID */
2292 if ( !metas->num && metas->defaultID )
2293 metas->array[metas->num++] = metas->defaultID;
2294
2295 return metas->num;
2296 }
2297
2298
2299 /******************************************************************
2300 * Index a string
2301 *
2302 *
2303 ******************************************************************/
2304
2305 /* 05/2001 Jose Ruiz - Changed word and swishword buffers to make this routine ** thread safe */
2306
2307
2308 int indexstring(SWISH * sw, char *s, int filenum, int structure, int numMetaNames, int *metaID, int *position)
2309 {
2310 int wordcount = 0;
2311
2312 IndexFILE *indexf = sw->indexlist;
2313
2314 char *buf_pos; /* pointer to current position */
2315 char *cur_pos; /* pointer to position with a word */
2316
2317 int stem_return; /* return value of stem operation */
2318
2319 struct MOD_Index *idx = sw->Index;
2320
2321 /* Assign word buffers */
2322 char *word = idx->word;
2323 int lenword = idx->lenword;
2324 char *swishword = idx->swishword;
2325 int lenswishword = idx->lenswishword;
2326
2327
2328
2329 /* Generate list of metaIDs to index unless passed in */
2330 if ( !metaID )
2331 {
2332 if ( !(numMetaNames = build_metaID_list( sw )) )
2333 return 0;
2334 else
2335 metaID = idx->metaIDtable.array;
2336 }
2337
2338 /* current pointer into buffer */
2339 buf_pos = s;
2340
2341
2342 /* get the next word as defined by whitespace */
2343 while ( next_word( &buf_pos, &word, &lenword ) )
2344 {
2345 if ( DEBUG_MASK & DEBUG_PARSED_WORDS )
2346 printf("White-space found word '%s'\n", word );
2347
2348
2349 strtolower(word);
2350
2351 /* is this a useful feature? */
2352 if ( indexf->header.is_use_words_flag )
2353 {
2354 if ( isuseword(&indexf->header, word) )
2355 {
2356 addword(word, sw, filenum, structure, numMetaNames, metaID, position );
2357 wordcount++;
2358 }
2359
2360 continue;
2361 }
2362
2363
2364 /* Check for buzzwords */
2365 if ( indexf->header.buzzwords_used_flag )
2366 {
2367 /* only strip when buzzwords are being used since stripped again as a "swish word" */
2368 stripIgnoreLastChars(&indexf->header, word);
2369 stripIgnoreFirstChars(&indexf->header, word);
2370 if ( !*word ) /* stripped clean? */
2371 continue;
2372
2373
2374 if ( isbuzzword(&indexf->header, word) )
2375 {
2376 addword(word, sw, filenum, structure, numMetaNames, metaID, position );
2377 wordcount++;
2378 continue;
2379 }
2380 }
2381
2382
2383
2384
2385
2386 /* Translate chars */
2387 TranslateChars(indexf->header.translatecharslookuptable, (unsigned char *)word);
2388
2389 cur_pos = word;
2390
2391
2392
2393 /* Now split the word up into "swish words" */
2394
2395 while ( next_swish_word( sw, &cur_pos, &swishword, &lenswishword, position ) )
2396 {
2397
2398 /* Weed out Numbers - or anything that's all the listed chars */
2399 if ( indexf->header.numberchars_used_flag )
2400 {
2401 unsigned char *c = (unsigned char *)swishword;
2402
2403 /* look for any char that's NOT in the lookup table */
2404 while ( *c ) {
2405 if ( !indexf->header.numbercharslookuptable[(int) *c ] )
2406 break;
2407 c++;
2408 }
2409
2410 /* if got all the way through the string then it's only those chars */
2411 if ( !*c )
2412 continue; /* skip this word */
2413 }
2414
2415
2416 /* Check Begin & EndCharacters */
2417 if (!indexf->header.begincharslookuptable[(int) ((unsigned char) swishword[0])])
2418 continue;
2419
2420 if (!indexf->header.endcharslookuptable[(int) ((unsigned char) swishword[strlen(swishword) - 1])])
2421 continue;
2422
2423
2424 /* limit by stopwords, min/max length, max number of digits, ... */
2425 if (!isokword(sw, swishword, indexf))
2426 continue;
2427
2428 /* Now translate word if fuzzy mode */
2429
2430 switch ( indexf->header.fuzzy_mode )
2431 {
2432 case FUZZY_NONE:
2433 addword(swishword, sw, filenum, structure, numMetaNames, metaID, position );
2434 wordcount++;
2435 break;
2436
2437 case FUZZY_STEMMING:
2438 stem_return = Stem(&swishword, &lenswishword);
2439
2440 /* ===
2441 if ( stem_return == STEM_NOT_ALPHA ) printf("Stem: not alpha in '%s'\n", swishword );
2442 if ( stem_return == STEM_TOO_SMALL ) printf("Stem: too small in '%s'\n", swishword );
2443 if ( stem_return == STEM_WORD_TOO_BIG ) printf("Stem: too big to stem in '%s'\n", swishword );
2444 if ( stem_return == STEM_TO_NOTHING ) printf("Stem: stems to nothing '%s'\n", swishword );
2445 === */
2446
2447 addword(swishword, sw, filenum, structure, numMetaNames, metaID, position );
2448 wordcount++;
2449 break;
2450
2451
2452 case FUZZY_SOUNDEX:
2453 soundex(swishword);
2454 addword(swishword, sw, filenum, structure, numMetaNames, metaID, position );
2455 wordcount++;
2456 break;
2457
2458 case FUZZY_METAPHONE:
2459 case FUZZY_DOUBLE_METAPHONE:
2460 {
2461 char *codes[2];
2462 DoubleMetaphone(swishword, codes);
2463
2464 if ( !(*codes[0]) )
2465 {
2466 efree( codes[0] );
2467 efree( codes[1] );
2468 addword(swishword, sw, filenum, structure, numMetaNames, metaID, position );
2469 wordcount++;
2470 break;
2471 }
2472 addword(codes[0], sw, filenum, structure, numMetaNames, metaID, position );
2473 wordcount++;
2474
2475 if ( indexf->header.fuzzy_mode == FUZZY_DOUBLE_METAPHONE && *(codes[1]) && strcmp(codes[0], codes[1]) )
2476 {
2477 (*position)--; /* at same position as first word */
2478 addword(codes[1], sw, filenum, structure, numMetaNames, metaID, position );
2479 wordcount++;
2480 }
2481
2482 efree( codes[0] );
2483 efree( codes[1] );
2484 }
2485
2486 break;
2487
2488
2489 default:
2490 progerr("Invalid FuzzyMode '%d'", (int)indexf->header.fuzzy_mode );
2491 }
2492 }
2493 }
2494
2495 /* Buffers can be reallocated - So, reasign them */
2496 idx->word = word;
2497 idx->lenword = lenword;
2498 idx->swishword = swishword;
2499 idx->lenswishword = lenswishword;
2500
2501 return wordcount;
2502 }
2503
2504
2505 /* Coalesce word current word location into the linked list */
2506 void add_coalesced(SWISH *sw, ENTRY *e, unsigned char *coalesced, int sz_coalesced, int metaID)
2507 {
2508 int tmp;
2509 LOCATION *tloc, *tprev;
2510 LOCATION **tmploc, **tmploc2;
2511 unsigned char *tp;
2512
2513
2514 /* Check for economic mode (-e) and swap data to disk */
2515 if(sw->Index->swap_locdata)
2516 {
2517 tmploc = (LOCATION **)coalesced;
2518 *tmploc = (LOCATION *)e; /* Preserve e in buffer */
2519 /* The cast is for avoiding the warning */
2520 SwapLocData(sw, e, coalesced, sz_coalesced);
2521 return;
2522 }
2523
2524 /* Add to the linked list keeping the data sorted by metaname, filenum */
2525 for(tprev =NULL, tloc = e->allLocationList; tloc; )
2526 {
2527 tp = (unsigned char *)tloc + sizeof(void *);
2528 tmp = uncompress2(&tp); /* Read metaID */
2529 if(tmp > metaID)
2530 break;
2531 tprev = tloc;
2532 tmploc = (LOCATION **)tloc;
2533 tloc = *tmploc;
2534 }
2535
2536 if(! tprev)
2537 {
2538 tmploc = (LOCATION **)coalesced;
2539 *tmploc = e->allLocationList;
2540 e->allLocationList = (LOCATION *)coalesced;
2541 }
2542 else
2543 {
2544 tmploc = (LOCATION **)coalesced;
2545 tmploc2 = (LOCATION **)tprev;
2546 *tmploc = *tmploc2;
2547 *tmploc2 = (LOCATION *)coalesced;
2548 }
2549 }
2550
2551
2552 void coalesce_word_locations(SWISH * sw, IndexFILE * indexf, ENTRY *e)
2553 {
2554 int curmetaID, metaID,
2555 curfilenum, filenum,
2556 frequency,
2557 num_locs,
2558 bytes_size,
2559 worst_case_size;
2560 int i, j, tmp;
2561 unsigned char *p, *q, *size_p = NULL;
2562 unsigned char uflag, *cflag;
2563 LOCATION *loc, *next;
2564 static unsigned char buffer[COALESCE_BUFFER_MAX_SIZE];
2565 unsigned char *coalesced_buffer;
2566 int *posdata;
2567 int local_posdata[MAX_STACK_POSITIONS];
2568
2569
2570 /* Check for new locations in the current chunk */
2571 if(!e->currentChunkLocationList)
2572 return;
2573
2574 /* Compute bytes required for size. Eg: 4096 -> 2 bytes, 65535 -> 2 bytes */
2575 for(bytes_size = 0, tmp = COALESCE_BUFFER_MAX_SIZE; tmp; tmp >>= 8)
2576 bytes_size++;
2577
2578 /* Sort all pending word locations by metaID, filenum */
2579 sortChunkLocations(sw, indexf, e);
2580
2581 /* Init vars */
2582 curmetaID = 0;
2583 curfilenum = 0;
2584 q = buffer; /* Destination buffer */
2585 num_locs = 0; /* Number of coalesced LOCATIONS */
2586
2587 /* Run on all locations */
2588 for(loc = e->currentChunkLocationList; loc; )
2589 {
2590 p = (unsigned char *) loc;
2591
2592 /* get next LOCATION in linked list*/
2593 next = * (LOCATION **) loc;
2594 p += sizeof(LOCATION *);
2595
2596 /* get metaID of LOCATION */
2597 metaID = uncompress2(&p);
2598
2599 /* Check for new metaID */
2600 if(metaID != curmetaID)
2601 {
2602 /* If exits previous data add it to the linked list */
2603 if(curmetaID)
2604 {
2605 /* add to the linked list and reset values */
2606 /* Update the size of chunk's data in *size_p */
2607 tmp = q - (size_p + bytes_size); /* tmp contains the size */
2608 /* Write the size */
2609 for(i = 0, j = bytes_size - 1; i < bytes_size; i++, j--)
2610 size_p[i] = tmp >> (j * 8);
2611 /* Add to the linked list keeping the data sorted by metaname, filenum */
2612 /* Allocate memory space */
2613 coalesced_buffer = (unsigned char *)Mem_ZoneAlloc(sw->Index->totalLocZone,q-buffer);
2614 /* Copy content to it */
2615 memcpy(coalesced_buffer,buffer,q-buffer);
2616 /* Add to the linked list */
2617 add_coalesced(sw, e, coalesced_buffer, q - buffer, curmetaID);
2618 }
2619 /* Reset values */
2620 curfilenum = 0;
2621 curmetaID = metaID;
2622 q = buffer + sizeof(void *); /* Make room for linked list pointer */
2623 q = compress3(metaID,q); /* Add metaID */
2624 size_p = q; /* Preserve position for size */
2625 q += bytes_size; /* Make room for size */
2626 num_locs = 0;
2627 }
2628 uncompress_location_values(&p,&uflag,&filenum,&frequency);
2629 worst_case_size = sizeof(unsigned char *) + (3 + frequency) * MAXINTCOMPSIZE;
2630
2631 while ((q + worst_case_size) - buffer > sizeof(buffer))
2632 {
2633 if(!num_locs)
2634 progerr("Buffer too short in coalesce_word_locations. Increase COALESCE_BUFFER_MAX_SIZE in config.h and rebuild.");
2635 /* add to the linked list and reset values */
2636 /* Update the size of chunk's data in *size_p */
2637 tmp = q - (size_p + bytes_size); /* tmp contains the size */
2638 /* Write the size */
2639 for(i = 0, j = bytes_size - 1; i < bytes_size; i++, j--)
2640 size_p[i] = tmp >> (j * 8);
2641 /* Add to the linked list keeping the data sorted by metaname, filenum */
2642 /* Allocate memory space */
2643 coalesced_buffer = (unsigned char *)Mem_ZoneAlloc(sw->Index->totalLocZone,q-buffer);
2644 /* Copy content to it */
2645 memcpy(coalesced_buffer,buffer,q-buffer);
2646 /* Add to the linked list */
2647 add_coalesced(sw, e, coalesced_buffer, q - buffer, curmetaID);
2648
2649 /* Reset values */
2650 curfilenum = 0;
2651 curmetaID = metaID;
2652 q = buffer + sizeof(void *); /* Make room for linked list pointer */
2653 q = compress3(metaID,q);
2654 size_p = q; /* Preserve position for size */
2655 q += bytes_size; /* Make room for size */
2656 num_locs = 0;
2657 }
2658
2659 if(frequency > MAX_STACK_POSITIONS)
2660 posdata = emalloc(frequency * sizeof(int));
2661 else
2662 posdata = local_posdata;
2663
2664 uncompress_location_positions(&p,uflag,frequency,posdata);
2665
2666 /* Store the filenum incrementally to save space */
2667 compress_location_values(&q,&cflag,filenum - curfilenum,frequency, posdata);
2668
2669 curfilenum = filenum;
2670
2671 compress_location_positions(&q,cflag,frequency,posdata);
2672
2673 if(frequency > MAX_STACK_POSITIONS)
2674 efree(posdata);
2675
2676 num_locs++;
2677
2678 loc = next;
2679 }
2680 if (num_locs)
2681 {
2682 /* add to the linked list and reset values */
2683 /* Update the size of chunk's data in *size_p */
2684 tmp = q - (size_p + bytes_size); /* tmp contains the size */
2685 /* Write the size */
2686 for(i = 0, j = bytes_size - 1; i < bytes_size; i++, j--)
2687 size_p[i] = tmp >> (j * 8);
2688 /* Add to the linked list keeping the data sorted by metaname, filenum */
2689 /* Allocate memory space */
2690 coalesced_buffer = (unsigned char *)Mem_ZoneAlloc(sw->Index->totalLocZone,q-buffer);
2691 /* Copy content to it */
2692 memcpy(coalesced_buffer,buffer,q-buffer);
2693 /* Add to the linked list */
2694 add_coalesced(sw, e, coalesced_buffer, q - buffer, curmetaID);
2695 }
2696 e->currentChunkLocationList = NULL;
2697 e->currentlocation = NULL;
2698
2699 /* If we are swaping locs to file, reset also correspondant memory zone */
2700 if(sw->Index->swap_locdata)
2701 Mem_ZoneReset(sw->Index->totalLocZone);
2702
2703 }
2704

  ViewVC Help
Powered by ViewVC 1.1.22