/[MITgcm]/mitgcm.org/devel/buildweb/pkg/swish-e/src/swish.h
ViewVC logotype

Contents of /mitgcm.org/devel/buildweb/pkg/swish-e/src/swish.h

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.1.1.1 - (show annotations) (download) (vendor branch)
Fri Sep 20 19:47:29 2002 UTC (22 years, 10 months ago) by adcroft
Branch: Import, MAIN
CVS Tags: baseline, HEAD
Changes since 1.1: +0 -0 lines
File MIME type: text/plain
Importing web-site building process.

1 /*
2 ** $Id: swish.h,v 1.157 2002/08/29 13:45:40 jmruiz Exp $
3 **
4 ** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
5 ** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
6 **
7 ** This program and library is free software; you can redistribute it and/or
8 ** modify it under the terms of the GNU (Library) General Public License
9 ** as published by the Free Software Foundation; either version 2
10 ** of the License, or any later version.
11 **
12 ** This program is distributed in the hope that it will be useful,
13 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 ** GNU (Library) General Public License for more details.
16 **
17 ** You should have received a copy of the GNU (Library) General Public License
18 ** along with this program; if not, write to the Free Software
19 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
20 **-------------------------------------------------
21 ** Added support for METADATA
22 ** G. Hill ghill@library.berkeley.edu 3/18/97
23 **
24 ** Added Document Properties support
25 ** Mark Gaulin gaulin@designinfo.com 11/24/98
26 **
27 ** Added safestrcpy() macro to avoid corruption from strcpy overflow
28 ** SRE 11/17/99
29 **
30 ** Added Document Filter support (e.g. PDF, Winword)
31 ** Rainer.Scherg@t-online.de (rasc) 1998-08-07, 1999-05-05, 1999-05-28
32 **
33 ** Added some definitions for phrase search
34 ** Structure location modified to add frequency and word positions
35 ** Structure entry modified to add link hash values for direct search
36 **
37 ** Jose Ruiz jmruiz@boe.es 04/04/00
38 **
39 ** 2000-11-15 Rainer Scherg (rasc) FileProp type and routines
40 **
41 ** 2001-01-01 Jose Ruiz Added ISOTime
42 **
43 ** 2001-01-xx Rainer Scherg (rasc) Added property type structures, etc.
44 ** 2001-01-xx Rainer Scherg (rasc) cmd-opt should be own structure in SWISH * (started)
45 **
46 ** 2001-02-xx rasc replaced ISOTime by binary value
47 ** removed SWISH.errorstr, etc.
48 ** ResultExtFmtStrList & var
49 **
50 ** 2001-02-28 rasc some cleanup, ANSI compliant
51 ** 2001-03-12 rasc logical search operators via config changeable
52 ** moved some parts to config.h
53 **
54 ** 2001-03-16 rasc truncateDocSize
55 ** 2001-03-17 rasc fprop enhanced by real_filename
56 ** 2001-04-09 rasc filters changed and enhanced
57 ** 2001-06-08 wsm Add word to end of ENTRY and propValue to end of docPropertyEntry
58 ** to save memory and less malloc/free
59 **
60 ** 2001-08-12 jmruiz ENTRY struct modified to index in chunks
61 **
62 */
63
64
65 #include <stdio.h>
66 #include <string.h>
67 #include <math.h>
68 #include <sys/types.h>
69 #include <sys/stat.h>
70 #include <locale.h>
71 #include <ctype.h>
72 #include <errno.h>
73 #include <time.h>
74 #ifdef HAVE_CONFIG_H
75 #include "acconfig.h" /* These are defines created by autoconf */
76 #endif
77 #include "config.h"
78
79 #ifdef NEXTSTEP
80 #include <sys/dir.h>
81 #else
82
83
84 #ifdef _WIN32
85 #include "win32/config.h"
86 #define strcasecmp stricmp
87 #elif defined(__VMS)
88 #include "vms/regex.h"
89 #include <dirent.h>
90 #include <stdarg.h>
91 extern int snprintf(char *, size_t, const char *, /*args */ ...);
92 extern int vsnprintf(char *, size_t, const char *, va_list);
93 #else
94 #include <dirent.h>
95 #include <regex.h>
96 #endif
97
98 #endif
99
100 #include <ctype.h>
101 #include <stdlib.h>
102 #include <time.h>
103 #include <setjmp.h>
104
105 #ifdef __cplusplus
106 extern "C" {
107 #endif
108
/* Magic number constant (NOTE(review): where it is written/checked is not visible in this header). */
#define SWISH_MAGIC 21076321L

/* Index file name constant; presumably the default when none is configured -- TODO confirm. */
#define INDEXFILE "index.swish-e"
112
113
/*
 * Base value for all header IDs; every *_ID macro is expressed as an
 * offset from BASEHEADER.
 *
 * INDEXHEADER and INDEXVERSION concatenate with SWISH_VERSION, which must
 * be a string literal supplied elsewhere (it is not defined in this file).
 *
 * Every ID expansion is fully parenthesized so it keeps its value when
 * used inside a larger expression (e.g. `INDEXHEADER_ID * 2`).
 */
#define BASEHEADER          1
#define INDEXHEADER         "# SWISH format: " SWISH_VERSION
#define INDEXHEADER_ID      (BASEHEADER + 1)
#define INDEXVERSION        "# Swish-e format: " SWISH_VERSION
#define INDEXVERSION_ID     (BASEHEADER + 2)
119
/* Admin header parameter names */
#define NAMEHEADERPARAMNAME         "IndexName"
#define DESCRIPTIONPARAMNAME        "IndexDescription"
#define POINTERPARAMNAME            "IndexPointer"
#define MAINTAINEDBYPARAMNAME       "IndexAdmin"


/* Other headers that can be looked up via the swish-e library */
#define INDEXEDONPARAMNAME          "IndexedOn"
#define WORDCHARSPARAMNAME          "WordCharacters"
#define BEGINCHARSPARAMNAME         "BeginCharacters"
#define ENDCHARSPARAMNAME           "EndCharacters"
#define IGNOREFIRSTCHARPARAMNAME    "IgnoreFirstChar"
#define IGNORELASTCHARPARAMNAME     "IgnoreLastChar"
#define STEMMINGPARAMNAME           "UseStemming"
#define SOUNDEXPARAMNAME            "UseSoundex"
#define FUZZYMODEPARAMNAME          "FuzzyIndexingMode"

#define FILECOUNTPARAMNAME          "FileCount"
139
140
/*
 * Headers for output, and their IDs.
 *
 * Each *_ID is an offset from BASEHEADER (defined earlier in this file).
 * Offsets 17 and 24 have no active macro here, and offset 18 is used by
 * FUZZYMODEHEADER_ID (the former STEMMINGHEADER_ID / SOUNDEXHEADER_ID
 * definitions at offsets 17 and 18 are disabled).
 */
#define NAMEHEADER                  "# Name:"
#define NAMEHEADER_ID               (BASEHEADER + 3)

#define SAVEDASHEADER               "# Saved as:"
#define SAVEDASHEADER_ID            (BASEHEADER + 4)

#define COUNTSHEADER                "# Counts:"
#define COUNTSHEADER_ID             (BASEHEADER + 5)

#define INDEXEDONHEADER             "# Indexed on:"
#define INDEXEDONHEADER_ID          (BASEHEADER + 6)

#define DESCRIPTIONHEADER           "# Description:"
#define DESCRIPTIONHEADER_ID        (BASEHEADER + 7)

#define POINTERHEADER               "# Pointer:"
#define POINTERHEADER_ID            (BASEHEADER + 8)

#define MAINTAINEDBYHEADER          "# Maintained by:"
#define MAINTAINEDBYHEADER_ID       (BASEHEADER + 9)

#define WORDCHARSHEADER             "# WordCharacters:"
#define WORDCHARSHEADER_ID          (BASEHEADER + 10)

#define MINWORDLIMHEADER            "# MinWordLimit:"
#define MINWORDLIMHEADER_ID         (BASEHEADER + 11)

#define MAXWORDLIMHEADER            "# MaxWordLimit:"
#define MAXWORDLIMHEADER_ID         (BASEHEADER + 12)

#define BEGINCHARSHEADER            "# BeginCharacters:"
#define BEGINCHARSHEADER_ID         (BASEHEADER + 13)

#define ENDCHARSHEADER              "# EndCharacters:"
#define ENDCHARSHEADER_ID           (BASEHEADER + 14)

#define IGNOREFIRSTCHARHEADER       "# IgnoreFirstChar:"
#define IGNOREFIRSTCHARHEADER_ID    (BASEHEADER + 15)

#define IGNORELASTCHARHEADER        "# IgnoreLastChar:"
#define IGNORELASTCHARHEADER_ID     (BASEHEADER + 16)

/* Header text only: STEMMINGHEADER_ID (BASEHEADER + 17) is disabled. */
#define STEMMINGHEADER              "# Stemming Applied:"

/* Header text only: SOUNDEXHEADER_ID (BASEHEADER + 18) is disabled. */
#define SOUNDEXHEADER               "# Soundex Applied:"

#define FUZZYMODE_HEADER            "# Fuzzy Indexing Mode:"
#define FUZZYMODEHEADER_ID          (BASEHEADER + 18)


#define MERGED_ID                   (BASEHEADER + 19)

/* vv not used vv */
#define DOCPROPHEADER               "# DocProperty"
#define DOCPROPHEADER_ID            (BASEHEADER + 20)
/* ^^ not used ^^ */

#define DOCPROPENHEADER             "# DocumentProperties:"
#define DOCPROPENHEADER_ID          (BASEHEADER + 21)

#define SORTDOCPROPHEADER_ID        (BASEHEADER + 22)

#define IGNORETOTALWORDCOUNTWHENRANKING             "# IgnoreTotalWordCountWhenRanking:"
#define IGNORETOTALWORDCOUNTWHENRANKINGPARAMNAME    "IgnoreTotalWordCountWhenRanking"
#define IGNORETOTALWORDCOUNTWHENRANKING_ID          (BASEHEADER + 23)

#define TRANSLATECHARTABLEHEADER    "# TranslateCharacterTable:"
#define TRANSLATECHARTABLEPARAMNAME "TranslateCharacterTable"
#define TRANSLATECHARTABLE_ID       (BASEHEADER + 25)

#define STOPWORDS_ID                (BASEHEADER + 26)
#define METANAMES_ID                (BASEHEADER + 27)
#define LOCATIONLOOKUPTABLE_ID      (BASEHEADER + 28)
#define BUZZWORDS_ID                (BASEHEADER + 29)   /* 2001-04-24 moseley */

/* Only defined when the build does not define USE_BTREE. */
#ifndef USE_BTREE
#define TOTALWORDSPERFILE_ID        (BASEHEADER + 30)   /* total words per file array */
#endif

/* -- end of headers */
224
/* Fixed length limits. */
#define MAXFILELEN      1000
#define MAXSTRLEN       2000
#define MAXWORDLEN      1000
#define MAXTITLELEN     300
#define MAXENTLEN       10

/*
 * Hash table sizes.
 * Previously 101 / 1009 / 10001; changed as suggested by
 * Jean-François PIÉRONNE <jfp@altavista.net>
 * on Fri, 28 Dec 2001 07:37:26 -0800 (PST)
 */
#define HASHSIZE        1009
#define BIGHASHSIZE     10001
#define VERYBIGHASHSIZE 100003


#define MAXPAR          10
#define MAXCHARDEFINED  256

/*
 * Initial read-buffer size; larger to avoid frequent reallocs (2001-03-16 rasc).
 * NOTE(review): 65356 looks like a transposition of 65536 (64 KiB); the
 * value is kept as-is -- confirm before changing.
 */
#define RD_BUFFER_SIZE  65356

#define NOWORD          "thisisnotaword"
#define SECSPERMIN      60
248
/* Bit positions of the document-structure flags. */
#define IN_FILE_BIT         0
#define IN_TITLE_BIT        1
#define IN_HEAD_BIT         2
#define IN_BODY_BIT         3
#define IN_COMMENTS_BIT     4
#define IN_HEADER_BIT       5
#define IN_EMPHASIZED_BIT   6
#define IN_META_BIT         7
#define STRUCTURE_END       7       /* same value as the highest bit position above */


/* Single-bit masks built from the bit positions above. */
#define IN_FILE         (1 << IN_FILE_BIT)
#define IN_TITLE        (1 << IN_TITLE_BIT)
#define IN_HEAD         (1 << IN_HEAD_BIT)
#define IN_BODY         (1 << IN_BODY_BIT)
#define IN_COMMENTS     (1 << IN_COMMENTS_BIT)
#define IN_HEADER       (1 << IN_HEADER_BIT)
#define IN_EMPHASIZED   (1 << IN_EMPHASIZED_BIT)
#define IN_META         (1 << IN_META_BIT)

/* Union of every structure mask (all eight low bits set). */
#define IN_ALL  (IN_FILE | IN_TITLE | IN_HEAD | IN_BODY | IN_COMMENTS | IN_HEADER | IN_EMPHASIZED | IN_META)

#define MAXLONGLEN 4
271
/* Document Types (values are consecutive, starting at BASEDOCTYPE == 0) */
enum {
    BASEDOCTYPE = 0,
    TXT,
    HTML,
    XML,
    WML,
    XML2,
    HTML2,
    TXT2
};

/* "No document type" shares the value of BASEDOCTYPE. */
#define NODOCTYPE BASEDOCTYPE
278
/*
 * A single property value, used to build the property that is read from /
 * written to disk.  It is declared here so the buffer can live between
 * writes.
 *
 * propValue is declared with one element; per the member comment the
 * actual value starts there (NOTE(review): presumably the struct is
 * over-allocated by propLen bytes -- confirm in the allocating code).
 */
typedef struct propEntry
{
    unsigned int  propLen;          /* Length of buffer */
    unsigned char propValue[1];     /* Actual property value starts here */
}
propEntry;
288
289
290
/*
 * Set of properties attached to one document.
 * The propEntry array is declared with one slot; n records a count.
 */
typedef struct docProperties
{
    int n;                              /* to be removed - can just use count of properties */
    struct propEntry *propEntry[1];     /* Array to hold properties */
}
docProperties;
297
#define RANK_BIAS_RANGE 10      /* max/min range ( -10 -> 10, with zero being no bias ) */

/* This structure is for storing both properties and metanames -- probably should be two lists */
struct metaEntry
{
    char   *metaName;               /* MetaName string */
    int     metaID;                 /* Meta ID */
    int     metaType;               /* See metanames.h for values */
    int    *inPropRange;            /* Used for limiting to a range */
    int     in_tag;                 /* Flag to indicate that we are within this tag */
    int     max_len;                /* If non-zero, limits properties to this length (for storedescription) */
    char   *extractpath_default;    /* String to index under this metaname if none found with ExtractPath */
    struct propEntry *loPropRange;  /* NOTE(review): presumably the low bound paired with inPropRange -- confirm */
    struct propEntry *hiPropRange;  /* NOTE(review): presumably the high bound paired with inPropRange -- confirm */
    int     alias;                  /* if non-zero, this is an alias to the listed metaID */
    int     rank_bias;              /* An integer used to bias hits on this metaname; 0 = no bias */
    int    *sorted_data;            /* Sorted data.  NULL if not read/done; */
                                    /* if 0, files are not sorted by this metaName/property */
};
317
318
319
/* These are used to build the table of seek pointers in the main index. */
typedef struct
{
    long length;
    long seek;
} PROP_LOCATION;


/* there used to be more in this structure ;) */
typedef struct
{
    PROP_LOCATION prop_position[1];     /* one for each property in the index. */
} PROP_INDEX;


/* Per-file record: file number plus its in-memory and on-disk property handles. */
typedef struct
{
    int                    filenum;
    struct docProperties  *docProperties;   /* list of document props in memory */
    PROP_INDEX            *prop_index;      /* pointers to properties on disk */
} FileRec;
340
341
/*
   -- FileProperties
   -- store for information about a file to be indexed...
   -- Unused items may be NULL (e.g. if File is not opened, fp == NULL)
   -- (2000-11 rasc)

   -- (2000-12 Jose Ruiz)
   -- Added StoreDescription
*/

typedef struct
{
    FILE   *fp;                 /* may be also a filter stream or NULL if not opened */
    char   *real_path;          /* path/URL to indexed file - may be modified by ReplaceRules */
    char   *orig_path;          /* original path provided to swish */
    char   *work_path;          /* path to file to index (may be tmpfile or real_path) */
    char   *real_filename;      /* basename() of real_path */
    long    fsize;              /* size of the original file (not filtered) */
    long    bytes_read;         /* Number of bytes read from the stream - important for sw->truncateDocSize and -S prog */
    int     done;               /* flag to read no more from this stream (truncate) */
    int     external_program;   /* Flag to only read fsize bytes from stream */
    time_t  mtime;              /* Date of last modification of the original file */
    int     doctype;            /* Type of document HTML, TXT, XML, ... */
    int     index_no_content;   /* Flag, index "filename/real_path" only! */
    struct StoreDescription *stordesc;  /* NULL if no description/summary */
    struct FilterList *hasfilter;       /* NULL if no filter for this file */
}
FileProp;
371
372
/*
 * One node in a singly linked list of word locations.
 * posdata is declared with one element; each element packs a position and
 * a structure byte (see the SET_POSDATA / GET_POSITION / GET_STRUCTURE
 * macros in this header).
 */
typedef struct LOCATION
{
    struct LOCATION *next;
    int     metaID;
    int     filenum;
    int     frequency;
    int     posdata[1];
}
LOCATION;
382
383
/*
 * 2002/01 jmruiz macros for accessing POSITION and structure.
 *
 * A posdata value keeps the structure flags in its low 8 bits and the word
 * position in the bits above them:
 *   SET_POSDATA(pos, str)  packs a position and a structure byte;
 *   GET_POSITION(pos)      extracts the position from a packed value;
 *   GET_STRUCTURE(pos)     extracts the low 8 bits from a packed value.
 * SET_POSDATA does not mask `str`, so callers must pass a value that fits
 * in 8 bits.
 */
#define SET_POSDATA(pos,str)    (((pos) << 8) | (str))
#define GET_POSITION(pos)       ((pos) >> 8)
#define GET_STRUCTURE(pos)      ((pos) & 0xff)
388
/*
 * One indexed word.  The word text is stored at the end of the structure
 * (word[] is declared with one element and the actual word starts there).
 */
typedef struct ENTRY
{
    struct ENTRY *next;
    int     tfrequency;

    /* Chunk's LOCATIONs go here */
    struct LOCATION *currentChunkLocationList;
    struct LOCATION *currentlocation;

    /* All locations go here */
    struct LOCATION *allLocationList;

    /*
     * NOTE: the original comment called this a memory-saving union, but it
     * is declared as a struct: wordID and last_filenum are stored
     * separately and do not overlap.
     */
    struct
    {
        long    wordID;
        int     last_filenum;
    }
    u1;

    char    word[1];        /* actual word starts here */
}
ENTRY;
409
/*
 * Node of a linked list of strings.
 * NOTE(review): nodep is presumably a tail/last-node pointer kept on the
 * list head -- confirm against the list helpers.
 */
struct swline
{
    struct swline *next;
    struct swline *nodep;
    char   *line;
};
416
417
/* Define types of word translations for fuzzy indexing */

typedef enum {
    FUZZY_NONE = 0,
    FUZZY_STEMMING,
    FUZZY_SOUNDEX,
    FUZZY_METAPHONE,
    FUZZY_DOUBLE_METAPHONE
} FuzzyIndexType;
427
428
429
430
431
432 typedef struct
433 {
434 /* vars for WordCharacters */
435 int lenwordchars;
436 char *wordchars;
437
438 /* vars for BeginCharacters */
439 int lenbeginchars;
440 char *beginchars;
441
442 /* vars for EndCharacters */
443 int lenendchars;
444 char *endchars;
445
446 /* vars for IgnoreLastChar */
447 int lenignorelastchar;
448 char *ignorelastchar;
449
450 /* vars for IgnoreFirstChar */
451 int lenignorefirstchar;
452 char *ignorefirstchar;
453
454 /* vars for bump position chars */
455 int lenbumpposchars;
456 char *bumpposchars;
457
458 /* vars for header values */
459 char *savedasheader;
460 int lensavedasheader;
461
462 /* vars for numberchars */ /* Not yet stored in the header. */
463 int lennumberchars; /* Probably don't need it for searching */
464 char *numberchars;
465 int numberchars_used_flag;
466
467
468 int lenindexedon;
469 char *indexedon;
470
471 int lenindexn;
472 char *indexn;
473
474 int lenindexd;
475 char *indexd;
476
477 int lenindexp;
478 char *indexp;
479
480 int lenindexa;
481 char *indexa;
482
483 int minwordlimit;
484 int maxwordlimit;
485
486 FuzzyIndexType fuzzy_mode;
487
488 /* Total files and words in index file */
489 int totalwords;
490 int totalfiles;
491
492 /* var to specify how to ranking while indexing */
493 int ignoreTotalWordCountWhenRanking; /* added 11/24/98 - MG */
494
495 int *TotalWordsPerFile;
496 int TotalWordsPerFileMax; /* max size of array - this isn't saved in the header */
497
498
499 /* Lookup tables for fast access */
500 int wordcharslookuptable[256];
501 int begincharslookuptable[256];
502 int endcharslookuptable[256];
503 int ignorefirstcharlookuptable[256];
504 int ignorelastcharlookuptable[256];
505 int bumpposcharslookuptable[256];
506 int translatecharslookuptable[256]; /* $$$ rasc 2001-02-21 */
507 int numbercharslookuptable[256]; /* Dec 12, 2001 - moseley -- mostly for ignoring numbers */
508
509 /* values for handling stopwords */
510 struct swline *hashstoplist[HASHSIZE];
511 char **stopList;
512 int stopMaxSize;
513 int stopPos;
514
515 /* This is an array of properties that are used */
516 /* These should not be in the header, rather in indexf as they are not written to disk */
517 int *propIDX_to_metaID;
518 int *metaID_to_PropIDX;
519 int property_count;
520
521
522
523 /* Buzzwords hash */
524 int buzzwords_used_flag; /* flag to indicate that buzzwords are being used */
525 struct swline *hashbuzzwordlist[HASHSIZE];
526
527 /* values for handling "use" words - > Unused in the search proccess */
528 int is_use_words_flag;
529 struct swline *hashuselist[HASHSIZE];
530
531 /* Values for fields (metanames) */
532 struct metaEntry **metaEntryArray;
533 int metaCounter; /* Number of metanames */
534
535 }
536 INDEXDATAHEADER;
537
538 typedef struct IndexFILE
539 {
540 struct IndexFILE *next;
541 struct IndexFILE *nodep;
542
543 char *line; /*Name of the index file */
544
545 unsigned long total_bytes; /* Just to show total size when indexing */
546 unsigned long total_word_positions;
547
548
549 char **prop_string_cache; /* place to cache a result's string properties */
550 /* so caller (library) won't need to free */
551
552 /* DB handle */
553 void *DB;
554
555 /* Header Info */
556 INDEXDATAHEADER header;
557
558 /* Pointer to cache the keywords */
559 char *keywords[256];
560
561
562 /* props IDs */
563 int *propIDToDisplay;
564 int *propIDToSort;
565
566
567 /* Support for merge */
568 int *meta_map; // maps metas from this index to the output index
569 int *path_order; // lists files in order of pathname
570 int current_file; // current file pointer, used for merged reading
571 struct metaEntry *path_meta; // meta entry for the path name
572 struct metaEntry *modified_meta;
573 propEntry *cur_prop; // last read pathname
574 int filenum; // current filenumber to use
575
576
577 /* Used by merge.c */
578 int *merge_file_num_map;
579 }
580 IndexFILE;
581
582
/* Head and tail of a list of RESULT nodes, plus the owning SWISH handle. */
typedef struct RESULT_LIST
{
    struct RESULT *head;
    struct RESULT *tail;
    struct SWISH  *sw;      /* ** This is a waste of memory, it's only here because    */
                            /* ** qsort only passes two values.  This can be fixed.    */
}
RESULT_LIST;
591
592 typedef struct RESULT
593 {
594 struct RESULT *next;
595
596 int count; /* result Entry-Counter */
597 int filenum; /* there's an extra four bytes we don't need */
598 FileRec fi; /* This is used to cache the properties and the seek index */
599 int rank;
600 int frequency;
601 int tfrequency; /* Total frequency of result */
602
603 /* file position where this document's properties are stored */
604 char **PropSort;
605 int *iPropSort; /* Used for presorted data */
606 IndexFILE *indexf;
607
608 RESULT_LIST *reslist; //* this is probably not needed, too.
609
610 int posdata[1];
611 }
612 RESULT;
613
/* Node of a linked list whose elements are themselves swline lists. */
struct multiswline
{
    struct multiswline *next;
    struct swline *list;
};
619
620
/* Array of ENTRY pointers together with its element count. */
typedef struct
{
    int     numWords;
    struct ENTRY **elist;   /* Sorted by word */
}
ENTRYARRAY;
627
628
629
/* Node of a singly linked list of URL strings. */
struct url_info
{
    struct url_info *next;
    char   *url;
};
635
/* Linked-list node pairing a document type with a list of patterns. */
struct IndexContents
{
    struct IndexContents *next;
    int     DocType;
    struct swline *patt;
};
642
/* Linked-list node holding a field name and a size for one document type. */
struct StoreDescription
{
    struct StoreDescription *next;
    int     DocType;
    char   *field;
    int     size;
};
650
651 /* These two structs are used for lookuptables in order to save memory */
652 /* Normally Metaname, frequency and structure are repetitive schemas */
653 /* and usually have also low values */
654 /* In this way three values can be fit in just one using a lookup table*/
655 /* Structure itself can use its own lookuptable */
656 struct int_st
657 {
658 struct int_st *next;
659 int index;
660 int val[1];
661 };
662
663 struct int_lookup_st
664 {
665 int n_entries;
666 struct int_st *hash_entries[HASHSIZE];
667 struct int_st *all_entries[1];
668 };
669
670 /* These two structs are used for lookuptables in order to save memory */
671 /* Normally part of the path/url are repetitive schemas */
672 /* and usually have also low values */
673 struct char_st
674 {
675 struct char_st *next;
676 int index;
677 char *val;
678 };
679
680 struct char_lookup_st
681 {
682 int n_entries;
683 struct char_st *hash_entries[HASHSIZE];
684 struct char_st *all_entries[1];
685 };
686
687
/* Place to store compiled regular expressions (singly linked list) */

typedef struct regex_list
{
    struct regex_list *next;
    regex_t re;
    char   *replace;
    int     replace_count;      /* number of pattern replacements - to estimate size of replacement string */
    int     replace_length;     /* newstr_max = replace_length + ( replace_count * search_str_len ) */
    int     global;             /* /g flag to repeat sub */
    int     negate;             /* Flag for matches if the match should be negated */
    char   *pattern;            /* keep string pattern around for debugging */
} regex_list;
701
/* Linked-list node pairing a metaEntry with a list of regular expressions. */
typedef struct path_extract_list
{
    struct path_extract_list *next;
    struct metaEntry *meta_entry;
    struct regex_list *regex;
} path_extract_list;
708
709
710
/* -- Property data types
   -- Result handling structures, (types storage, values)
   -- Warning! Changing types affects output routines, etc.
   -- 2001-01 rasc

   $$$ ToDo: data types are not yet fully supported by swish
   $$$ Future: to be part of module data_types.c/h
*/


typedef enum
{                               /* Property Datatypes */
    PROP_UNDEFINED = -1,
    PROP_UNKNOWN = 0,
    PROP_STRING,
    PROP_INTEGER,
    PROP_FLOAT,
    PROP_DATE,
    PROP_ULONG
}
PropType;
732
/* For undefined meta names */
typedef enum
{
    UNDEF_META_DISABLE = 0,     /* Only for XMLAttributes - don't even try with attributes */
    UNDEF_META_INDEX,           /* index as plain text */
    UNDEF_META_AUTO,            /* create metaname if doesn't exist */
    UNDEF_META_ERROR,           /* throw a nasty error */
    UNDEF_META_IGNORE           /* don't index */
}
UndefMetaFlag;
743
744
/* Storage of the PropertyValue: one member per supported representation. */
typedef union
{
    char   *v_str;              /* strings */
    int     v_int;              /* Integer */
    time_t  v_date;             /* Date */
    double  v_float;            /* Double Float */
    unsigned long v_ulong;      /* Unsigned long */
}
u_PropValue1;
754
755 typedef struct
756 { /* Propvalue with type info */
757 PropType datatype;
758 u_PropValue1 value;
759 int destroy; /* flag to destroy (free) any pointer type */
760 }
761 PropValue;
762
763
764
765 /* --------------------------------------- */
766
767
768
/* Structure to hold all results per index (one node per index, chained through `next`) */
struct DB_RESULTS
{
    struct DB_RESULTS *next;

    /* Values for handling results */
    struct RESULT_LIST *resultlist;
    struct RESULT *sortresultlist;
    struct RESULT *currentresult;
};
778
779 #define MAX_ERROR_STRING_LEN 500
780
781 typedef struct SWISH
782 {
783 /* New module design structure data */
784 struct MOD_SearchAlt *SearchAlt; /* search_alt module data */
785 struct MOD_ResultOutput *ResultOutput; /* result_output module data */
786 struct MOD_Filter *Filter; /* filter module data */
787 struct MOD_ResultSort *ResultSort; /* result_sort module data */
788 struct MOD_Entities *Entities; /* html entities module data */
789 struct MOD_DB *Db; /* DB module data */
790 struct MOD_Search *Search; /* Search module data */
791 struct MOD_Index *Index; /* Index module data */
792 struct MOD_FS *FS; /* FileSystem Index module data */
793 struct MOD_HTTP *HTTP; /* HTTP Index module data */
794 struct MOD_Swish_Words *SwishWords; /* For parsing into "swish words" */
795 struct MOD_Prog *Prog; /* For extprog.c */
796 struct MOD_PropLimit *PropLimit; /* For proplimit.c */
797
798
799 /** General Purpose **/
800
801 /* list of associated index files */
802 IndexFILE *indexlist;
803
804
805 unsigned char *Prop_IO_Buf; /* For compressing and uncompressing properties (static-like buffer) */
806 unsigned long PropIO_allocated;// total size of the structure
807 int PropCompressionLevel;
808
809
810 /* Total words and files in all index files */
811 int TotalWords;
812 int TotalFiles;
813
814 /* verbose flag */
815 int verbose;
816
817 /* Error vars */
818 int commonerror;
819 int lasterror;
820 char lasterrorstr[MAX_ERROR_STRING_LEN+1];
821
822
823 /* 06/00 Jose Ruiz */
824 int isvowellookuptable[256]; //??? is this used any place?
825
826
827 /********* Document Source info **********/
828
829 /* structure for handling all the directories/files (IndexDIR) while indexing */
830 struct swline *dirlist;
831
832 /* structure for handling IndexOnly config data while indexing */
833 struct swline *suffixlist;
834
835
836
837
838 /******** Structures for parsers **********/
839
840
841 /* Limit indexing by a file date */
842 time_t mtime_limit;
843
844 long truncateDocSize; /* size of doc, at which it will be truncated (2001-03-16 rasc) */
845
846
847 /* structure for handling replace config data while searching */
848 regex_list *replaceRegexps;
849
850
851 /* It's common to want to limit searches to areas of a file or web space */
852 /* This allow extraction of a substring out of a file path, and indexed as a metaname */
853 path_extract_list *pathExtractList;
854
855
856
857 /* structure for handling NoContents config data while searching */
858 struct swline *nocontentslist;
859
860 /* 08/00 Jose Ruiz Values for document type support */
861 int DefaultDocType;
862
863 /* maps file endings to document types */
864 struct IndexContents *indexcontents;
865
866
867 /* Should comments be indexed */
868 int indexComments;
869
870
871
872 /******** Variables used by the parsers *********/
873
874 /* 12/00 Jose Ruiz Values for summary support */
875 struct StoreDescription *storedescription;
876
877
878 /* structure to handle Ignoremeta metanames */
879 struct swline *ignoremetalist;
880
881
882 /* Structure for handling metatags from DontBumpPositionOnMetaTags */
883 struct swline *dontbumpstarttagslist;
884 struct swline *dontbumpendtagslist;
885
886
887 /* Undefined MetaName indexing options */
888 UndefMetaFlag UndefinedMetaTags;
889 UndefMetaFlag UndefinedXMLAttributes; // What to do with attributes libxml2 only
890
891
892
893 /*** libxml2 additions ***/
894
895 /* parser error warning level */
896 int parser_warn_level;
897
898 int obeyRobotsNoIndex;
899
900 /* for extracting links into a metaEntry */
901 struct metaEntry *links_meta;
902
903 /* for extracting image hrefs into a metaEntry */
904 struct metaEntry *images_meta;
905
906
907 /* if allocated the meta name to store alt tags as */
908 int IndexAltTag;
909 char *IndexAltTagMeta; // use this meta-tag, if set
910
911 /* for converting relative links in href's and img src tags absoulte */
912 int AbsoluteLinks;
913
914
915 /* structure to handle XMLClassAttributes - list of attributes to use content to make a metaname*/
916 /* <foo class="bar"> => generates a metaname foo.bar */
917 struct swline *XMLClassAttributes;
918
919 }
920 SWISH;
921
922
/* 06/00 Jose Ruiz
** Structure StringList. Stores n words in the `word` array.
*/
typedef struct {
    int     n;
    char  **word;
} StringList;
930
931 /*
932 * This structure defines all of the functions that need to
933 * be implemented to an Indexing Data Source.
934 * Right now there are two Indexing Data Source types:
935 * file-system based and an HTTP web crawler.
936 * Any Data Source can be created as long as all of the
937 * functions below are properly initialized.
938 */
939 struct _indexing_data_source_def
940 {
941 const char *IndexingDataSourceName; /* long name for data source */
942 const char *IndexingDataSourceId; /* short name for data source */
943 void (*indexpath_fn) (SWISH * sw, char *path); /* routine to index a "path" */
944 int (*parseconfline_fn) (SWISH * sw, StringList *l); /* parse config file lines */
945 };
946
947
948
949
/*
 * VAR is the storage-class prefix for globals declared in this header:
 * it expands to `extern` unless GLOBAL_VARS is defined before inclusion,
 * in which case it expands to nothing and the declaration below is no
 * longer `extern`.
 */
#ifndef GLOBAL_VARS
#define VAR extern
#else
#define VAR
#endif


/* Pointer to an indexing data source definition. */
VAR struct _indexing_data_source_def *IndexingDataSource;
958
959
960
961 void allocatedefaults(void);
962
963 int SwishAttach(SWISH *);
964 SWISH *SwishNew(void);
965 void SwishFree(SWISH *);
966
/*
 * safestrcpy(n, to, from): bounded string copy that always NUL-terminates.
 *
 *   n    - size of the destination buffer in bytes (including the NUL)
 *   to   - destination buffer
 *   from - source string
 *
 * strcpy doesn't check for overflow in the 'to' string, and strncpy
 * doesn't guarantee null byte termination, so the last byte of the
 * destination is always forced to '\0' (the copy is truncated if needed).
 * The length of 'from' is never taken separately because it is sometimes a
 * function call; each argument is evaluated exactly once.
 *
 * The expansion is wrapped in do { } while (0) so that
 * `if (c) safestrcpy(...); else ...` parses as a single statement.
 * When n is 0 nothing is written.
 */
#define safestrcpy(n,to,from) \
    do { \
        size_t      safestrcpy_n_    = (size_t)(n); \
        char       *safestrcpy_to_   = (to); \
        const char *safestrcpy_from_ = (from); \
        if (safestrcpy_n_ > 0) { \
            strncpy(safestrcpy_to_, safestrcpy_from_, safestrcpy_n_); \
            safestrcpy_to_[safestrcpy_n_ - 1] = '\0'; \
        } \
    } while (0)
971
/* Jose Ruiz 04/00
** Macro for copying positions between arrays of integers:
** copies num integers into dest (starting at index posdest) from
** orig (starting at index posorig).
**
** Implemented with memcpy, so the source and destination ranges must not
** overlap.
*/
#define CopyPositions(dest,posdest,orig,posorig,num) \
    (memcpy((int *)(dest) + (posdest), (int *)(orig) + (posorig), (num) * sizeof(int)))
983
984
/* Min macro: yields the smaller of a and b.  One argument is evaluated   */
/* twice, so do not pass expressions with side effects.                   */
#define Min(a,b) ((a) < (b) ? (a) : (b))
987
988
989
990 /* C library prototypes */
991 SWISH *SwishOpen(char *); // depreciated
992 SWISH *SwishInit(char *);
993 void SwishClose(SWISH *);
994 void SwishResetSearch(SWISH *);
995 RESULT *SwishNext(SWISH *);
996 int SwishSearch(SWISH *, char *, int, char *, char *);
997 int SwishSeek(SWISH * sw, int pos);
998 char *SwishResultPropertyStr(SWISH *sw, RESULT *result, char *pname);
999 unsigned long SwishResultPropertyULong(SWISH *sw, RESULT *result, char *pname);
1000
1001
1002
/* These are only checked in dump.c */
#define DEBUG_INDEX_HEADER      (1<<0)
#define DEBUG_INDEX_WORDS       (1<<1)
#define DEBUG_INDEX_WORDS_FULL  (1<<2)
#define DEBUG_INDEX_STOPWORDS   (1<<3)
#define DEBUG_INDEX_FILES       (1<<4)
#define DEBUG_INDEX_METANAMES   (1<<5)
#define DEBUG_INDEX_ALL         (1<<6)
#define DEBUG_INDEX_WORDS_ONLY  (1<<7)
#define DEBUG_INDEX_WORDS_META  (1<<8)

/* These are only checked while indexing (they reuse the low bits above) */
#define DEBUG_WORDS             (1<<0)
#define DEBUG_PARSED_WORDS      (1<<1)
#define DEBUG_PROPERTIES        (1<<2)
#define DEBUG_REGEX             (1<<3)
#define DEBUG_PARSED_TAGS       (1<<4)
#define DEBUG_PARSED_TEXT       (1<<5)

/* These are only checked while searching */

/* These are checked everywhere (can't share bits) */


/* Bit mask of the DEBUG_* flags above; defined in another translation unit. */
extern unsigned int DEBUG_MASK;
1028
1029 #ifdef __cplusplus
1030 }
1031 #endif /* __cplusplus */
1032

  ViewVC Help
Powered by ViewVC 1.1.22