/[MITgcm]/mitgcm.org/devel/buildweb/pkg/swish-e/src/swish.h
ViewVC logotype

Annotation of /mitgcm.org/devel/buildweb/pkg/swish-e/src/swish.h

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.1 - (hide annotations) (download)
Fri Sep 20 19:47:29 2002 UTC (22 years, 10 months ago) by adcroft
Branch point for: Import, MAIN
File MIME type: text/plain
Initial revision

1 adcroft 1.1 /*
2     ** $Id: swish.h,v 1.157 2002/08/29 13:45:40 jmruiz Exp $
3     **
4     ** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
5     ** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
6     **
7     ** This program and library is free software; you can redistribute it and/or
8     ** modify it under the terms of the GNU (Library) General Public License
9     ** as published by the Free Software Foundation; either version 2
10     ** of the License, or any later version.
11     **
12     ** This program is distributed in the hope that it will be useful,
13     ** but WITHOUT ANY WARRANTY; without even the implied warranty of
14     ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15     ** GNU (Library) General Public License for more details.
16     **
17     ** You should have received a copy of the GNU (Library) General Public License
18     ** along with this program; if not, write to the Free Software
19     ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
20     **-------------------------------------------------
21     ** Added support for METADATA
22     ** G. Hill ghill@library.berkeley.edu 3/18/97
23     **
24     ** Added Document Properties support
25     ** Mark Gaulin gaulin@designinfo.com 11/24/98
26     **
27     ** Added safestrcpy() macro to avoid corruption from strcpy overflow
28     ** SRE 11/17/99
29     **
30     ** Added Document Filter support (e.g. PDF, Winword)
31     ** Rainer.Scherg@t-online.de (rasc) 1998-08-07, 1999-05-05, 1999-05-28
32     **
33     ** Added some definitions for phrase search
34     ** Structure location modified to add frequency and word positions
35     ** Structure entry modified to add link hash values for direct search
36     **
37     ** Jose Ruiz jmruiz@boe.es 04/04/00
38     **
39     ** 2000-11-15 Rainer Scherg (rasc) FileProp type and routines
40     **
41     ** 2001-01-01 Jose Ruiz Added ISOTime
42     **
43     ** 2001-01-xx Rainer Scherg (rasc) Added property type structures, etc.
44     ** 2001-01-xx Rainer Scherg (rasc) cmd-opt should be own structure in SWISH * (started)
45     **
46     ** 2001-02-xx rasc replaced ISOTime by binary value
47     ** removed SWISH.errorstr, etc.
48     ** ResultExtFmtStrList & var
49     **
50     ** 2001-02-28 rasc some cleanup, ANSI compliant
51     ** 2001-03-12 rasc logical search operators changeable via config
52     ** moved some parts to config.h
53     **
54     ** 2001-03-16 rasc truncateDocSize
55     ** 2001-03-17 rasc fprop enhanced by real_filename
56     ** 2001-04-09 rasc filters changed and enhanced
57     ** 2001-06-08 wsm Add word to end of ENTRY and propValue to end of docPropertyEntry
58     ** to save memory and less malloc/free
59     **
60     ** 2001-08-12 jmruiz ENTRY struct modified to index in chunks
61     **
62     */
63    
64    
65     #include <stdio.h>
66     #include <string.h>
67     #include <math.h>
68     #include <sys/types.h>
69     #include <sys/stat.h>
70     #include <locale.h>
71     #include <ctype.h>
72     #include <errno.h>
73     #include <time.h>
74     #ifdef HAVE_CONFIG_H
75     #include "acconfig.h" /* These are defines created by autoconf */
76     #endif
77     #include "config.h"
78    
79     #ifdef NEXTSTEP
80     #include <sys/dir.h>
81     #else
82    
83    
84     #ifdef _WIN32
85     #include "win32/config.h"
86     #define strcasecmp stricmp
87     #elif defined(__VMS)
88     #include "vms/regex.h"
89     #include <dirent.h>
90     #include <stdarg.h>
91     extern int snprintf(char *, size_t, const char *, /*args */ ...);
92     extern int vsnprintf(char *, size_t, const char *, va_list);
93     #else
94     #include <dirent.h>
95     #include <regex.h>
96     #endif
97    
98     #endif
99    
100     #include <ctype.h>
101     #include <stdlib.h>
102     #include <time.h>
103     #include <setjmp.h>
104    
105     #ifdef __cplusplus
106     extern "C" {
107     #endif
108    
#define SWISH_MAGIC 21076321L

#define INDEXFILE "index.swish-e"


/*
 * Header IDs are expressed as offsets from BASEHEADER.
 *
 * Every *_ID replacement list is parenthesized so that the macro expands to
 * a single operand; without the parentheses an expression such as
 * INDEXHEADER_ID * 2 would parse as BASEHEADER + (1 * 2).
 *
 * SWISH_VERSION is not defined in this header; it has to be supplied by an
 * earlier include before INDEXHEADER or INDEXVERSION is expanded.
 */
#define BASEHEADER 1
#define INDEXHEADER "# SWISH format: " SWISH_VERSION
#define INDEXHEADER_ID (BASEHEADER + 1)
#define INDEXVERSION "# Swish-e format: " SWISH_VERSION
#define INDEXVERSION_ID (BASEHEADER + 2)
119    
/* Parameter names of the admin header entries */
#define NAMEHEADERPARAMNAME        "IndexName"
#define DESCRIPTIONPARAMNAME       "IndexDescription"
#define POINTERPARAMNAME           "IndexPointer"
#define MAINTAINEDBYPARAMNAME      "IndexAdmin"


/* Parameter names of the other headers that can be looked up through the swish-e library */
#define INDEXEDONPARAMNAME         "IndexedOn"
#define WORDCHARSPARAMNAME         "WordCharacters"
#define BEGINCHARSPARAMNAME        "BeginCharacters"
#define ENDCHARSPARAMNAME          "EndCharacters"
#define IGNOREFIRSTCHARPARAMNAME   "IgnoreFirstChar"
#define IGNORELASTCHARPARAMNAME    "IgnoreLastChar"
#define STEMMINGPARAMNAME          "UseStemming"
#define SOUNDEXPARAMNAME           "UseSoundex"
#define FUZZYMODEPARAMNAME         "FuzzyIndexingMode"

#define FILECOUNTPARAMNAME         "FileCount"
139    
140    
/*
 * Output header labels, each paired with its ID.
 * IDs are offsets from BASEHEADER, which must already be defined.
 */
#define NAMEHEADER                 "# Name:"
#define NAMEHEADER_ID              (BASEHEADER + 3)

#define SAVEDASHEADER              "# Saved as:"
#define SAVEDASHEADER_ID           (BASEHEADER + 4)

#define COUNTSHEADER               "# Counts:"
#define COUNTSHEADER_ID            (BASEHEADER + 5)

#define INDEXEDONHEADER            "# Indexed on:"
#define INDEXEDONHEADER_ID         (BASEHEADER + 6)

#define DESCRIPTIONHEADER          "# Description:"
#define DESCRIPTIONHEADER_ID       (BASEHEADER + 7)

#define POINTERHEADER              "# Pointer:"
#define POINTERHEADER_ID           (BASEHEADER + 8)

#define MAINTAINEDBYHEADER         "# Maintained by:"
#define MAINTAINEDBYHEADER_ID      (BASEHEADER + 9)

#define WORDCHARSHEADER            "# WordCharacters:"
#define WORDCHARSHEADER_ID         (BASEHEADER + 10)

#define MINWORDLIMHEADER           "# MinWordLimit:"
#define MINWORDLIMHEADER_ID        (BASEHEADER + 11)

#define MAXWORDLIMHEADER           "# MaxWordLimit:"
#define MAXWORDLIMHEADER_ID        (BASEHEADER + 12)

#define BEGINCHARSHEADER           "# BeginCharacters:"
#define BEGINCHARSHEADER_ID        (BASEHEADER + 13)

#define ENDCHARSHEADER             "# EndCharacters:"
#define ENDCHARSHEADER_ID          (BASEHEADER + 14)

#define IGNOREFIRSTCHARHEADER      "# IgnoreFirstChar:"
#define IGNOREFIRSTCHARHEADER_ID   (BASEHEADER + 15)

#define IGNORELASTCHARHEADER       "# IgnoreLastChar:"
#define IGNORELASTCHARHEADER_ID    (BASEHEADER + 16)

/*
 * The stemming and soundex labels no longer have IDs of their own:
 * STEMMINGHEADER_ID (BASEHEADER + 17) and SOUNDEXHEADER_ID (BASEHEADER + 18)
 * are disabled, and FUZZYMODEHEADER_ID now occupies BASEHEADER + 18.
 */
#define STEMMINGHEADER             "# Stemming Applied:"
/* #define STEMMINGHEADER_ID       (BASEHEADER + 17) */

#define SOUNDEXHEADER              "# Soundex Applied:"
/* #define SOUNDEXHEADER_ID        (BASEHEADER + 18) */

#define FUZZYMODE_HEADER           "# Fuzzy Indexing Mode:"
#define FUZZYMODEHEADER_ID         (BASEHEADER + 18)
192    
193    
#define MERGED_ID                                  (BASEHEADER + 19)

/* vv not used vv */
#define DOCPROPHEADER                              "# DocProperty"
#define DOCPROPHEADER_ID                           (BASEHEADER + 20)
/* ^^ not used ^^ */

#define DOCPROPENHEADER                            "# DocumentProperties:"
#define DOCPROPENHEADER_ID                         (BASEHEADER + 21)

#define SORTDOCPROPHEADER_ID                       (BASEHEADER + 22)

#define IGNORETOTALWORDCOUNTWHENRANKING            "# IgnoreTotalWordCountWhenRanking:"
#define IGNORETOTALWORDCOUNTWHENRANKINGPARAMNAME   "IgnoreTotalWordCountWhenRanking"
#define IGNORETOTALWORDCOUNTWHENRANKING_ID         (BASEHEADER + 23)

/* Note: offset 24 is not assigned to any ID in this header. */
#define TRANSLATECHARTABLEHEADER                   "# TranslateCharacterTable:"
#define TRANSLATECHARTABLEPARAMNAME                "TranslateCharacterTable"
#define TRANSLATECHARTABLE_ID                      (BASEHEADER + 25)

#define STOPWORDS_ID                               (BASEHEADER + 26)
#define METANAMES_ID                               (BASEHEADER + 27)
#define LOCATIONLOOKUPTABLE_ID                     (BASEHEADER + 28)
#define BUZZWORDS_ID                               (BASEHEADER + 29)   /* 2001-04-24 moseley */

/* The per-file word count array only gets an ID when USE_BTREE is not defined */
#if !defined(USE_BTREE)
#define TOTALWORDSPERFILE_ID                       (BASEHEADER + 30)   /* total words per file array */
#endif

/* -- end of headers */
224    
/* Fixed length limits */
#define MAXFILELEN        1000
#define MAXSTRLEN         2000
#define MAXWORDLEN        1000
#define MAXTITLELEN       300
#define MAXENTLEN         10

/*
 * Hash table sizes.
 *
 * Previous values, kept for reference:
 *     HASHSIZE 101, BIGHASHSIZE 1009, VERYBIGHASHSIZE 10001
 *
 * Changed as suggested by Jean-François PIÉRONNE <jfp@altavista.net>
 * on Fri, 28 Dec 2001 07:37:26 -0800 (PST)
 */
#define HASHSIZE          1009
#define BIGHASHSIZE       10001
#define VERYBIGHASHSIZE   100003


#define MAXPAR            10
#define MAXCHARDEFINED    256

/*
 * Initial read buffer size, chosen large to avoid frequent reallocs (2001-03-16 rasc).
 * NOTE(review): 65356 looks like a transposition of 65536 (64 KiB) -- confirm before changing.
 */
#define RD_BUFFER_SIZE    65356

#define NOWORD            "thisisnotaword"
#define SECSPERMIN        60
248    
/* Bit positions of the document structure flags */
#define IN_FILE_BIT         0
#define IN_TITLE_BIT        1
#define IN_HEAD_BIT         2
#define IN_BODY_BIT         3
#define IN_COMMENTS_BIT     4
#define IN_HEADER_BIT       5
#define IN_EMPHASIZED_BIT   6
#define IN_META_BIT         7
#define STRUCTURE_END       7   /* same value as the highest bit position above */


/* Mask for each structure flag, derived from its bit position */
#define IN_FILE         (1 << IN_FILE_BIT)
#define IN_TITLE        (1 << IN_TITLE_BIT)
#define IN_HEAD         (1 << IN_HEAD_BIT)
#define IN_BODY         (1 << IN_BODY_BIT)
#define IN_COMMENTS     (1 << IN_COMMENTS_BIT)
#define IN_HEADER       (1 << IN_HEADER_BIT)
#define IN_EMPHASIZED   (1 << IN_EMPHASIZED_BIT)
#define IN_META         (1 << IN_META_BIT)

/* All eight structure masks combined */
#define IN_ALL \
    (IN_FILE | IN_TITLE | IN_HEAD | IN_BODY | IN_COMMENTS | IN_HEADER | IN_EMPHASIZED | IN_META)
269    
#define MAXLONGLEN 4

/* Document types, numbered consecutively starting at BASEDOCTYPE */
enum {
    BASEDOCTYPE = 0,
    TXT         = 1,
    HTML        = 2,
    XML         = 3,
    WML         = 4,
    XML2        = 5,
    HTML2       = 6,
    TXT2        = 7
};

/* "No document type" shares the value of BASEDOCTYPE */
#define NODOCTYPE BASEDOCTYPE
278    
/*
 * Used to build a property that is read from / written to disk.
 * It is declared here so the buffer can live between writes.
 */
typedef struct propEntry {
    unsigned int  propLen;        /* length of the buffer */
    unsigned char propValue[1];   /* the property value itself begins here */
} propEntry;
288    
289    
290    
/* The properties attached to one document */
typedef struct docProperties {
    int               n;              /* to be removed - the count of properties can be used instead */
    struct propEntry *propEntry[1];   /* array holding the properties */
} docProperties;
297    
/* max/min range for rank_bias ( -10 -> 10, with zero being no bias ) */
#define RANK_BIAS_RANGE 10

/*
 * One entry describing a metaname or a property.
 * The same structure stores both -- these probably should be two lists.
 */
struct metaEntry {
    char             *metaName;              /* MetaName string */
    int               metaID;                /* Meta ID */
    int               metaType;              /* see metanames.h for values */
    int              *inPropRange;           /* used for limiting to a range */
    int               in_tag;                /* flag: we are currently within this tag */
    int               max_len;               /* if non-zero, limits properties to this length (for storedescription) */
    char             *extractpath_default;   /* string to index under this metaname if ExtractPath finds none */
    struct propEntry *loPropRange;
    struct propEntry *hiPropRange;
    int               alias;                 /* if non-zero, this is an alias to the listed metaID */
    int               rank_bias;             /* biases hits on this metaname; 0 = no bias */
    int              *sorted_data;           /* sorted data; NULL if not read/done.  If 0, files are */
                                             /* not sorted by this metaName/property */
};
317    
318    
319    
/* Building blocks for the table of seek pointers in the main index. */
typedef struct {
    long length;
    long seek;
} PROP_LOCATION;


/* There used to be more in this structure ;) */
typedef struct {
    PROP_LOCATION prop_position[1];   /* one for each property in the index */
} PROP_INDEX;
332    
333    
334     typedef struct
335     {
336     int filenum;
337     docProperties *docProperties; /* list of document props in memory */
338     PROP_INDEX *prop_index; /* pointers to properties on disk */
339     } FileRec;
340    
341    
/*
 * FileProperties
 *
 * Stores information about a file to be indexed.
 * Unused items may be NULL (e.g. if the file is not opened, fp == NULL).
 * (2000-11 rasc)
 *
 * (2000-12 Jose Ruiz) Added StoreDescription
 */
typedef struct {
    FILE                    *fp;                 /* may also be a filter stream, or NULL if not opened */
    char                    *real_path;          /* path/URL to indexed file - may be modified by ReplaceRules */
    char                    *orig_path;          /* original path provided to swish */
    char                    *work_path;          /* path to file to index (may be tmpfile or real_path) */
    char                    *real_filename;      /* basename() of real_path */
    long                     fsize;              /* size of the original file (not filtered) */
    long                     bytes_read;         /* bytes read from the stream - important for */
                                                 /* sw->truncateDocSize and -S prog */
    int                      done;               /* flag to read no more from this stream (truncate) */
    int                      external_program;   /* flag to only read fsize bytes from stream */
    time_t                   mtime;              /* date of last modification of the original file */
    int                      doctype;            /* type of document: HTML, TXT, XML, ... */
    int                      index_no_content;   /* flag: index "filename/real_path" only! */
    struct StoreDescription *stordesc;           /* NULL if no description/summary */
    struct FilterList       *hasfilter;          /* NULL if no filter for this file */
} FileProp;
371    
372    
/* Singly linked node holding a metaID, a file number, a frequency and position data */
typedef struct LOCATION {
    struct LOCATION *next;
    int              metaID;
    int              filenum;
    int              frequency;
    int              posdata[1];   /* declared with one element; see SET_POSDATA for the packing */
} LOCATION;
382    
383    
/*
 * 2002/01 jmruiz - macros for accessing POSITION and structure.
 * A posdata value keeps the structure bits in its low 8 bits and the
 * position in the bits above them.
 */
#define SET_POSDATA(pos,str)   (((pos) << 8) | (str))
#define GET_POSITION(pos)      ((pos) >> 8)
#define GET_STRUCTURE(pos)     ((pos) & 0xFF)
388    
/* One word entry, linked into a list, together with its LOCATION lists */
typedef struct ENTRY {
    struct ENTRY    *next;
    int              tfrequency;

    /* The current chunk's LOCATIONs go here */
    struct LOCATION *currentChunkLocationList;
    struct LOCATION *currentlocation;

    /* All locations go here */
    struct LOCATION *allLocationList;

    /*
     * The original note describes this member as a union used "just for
     * saving memory"; it is declared as a struct, so wordID and
     * last_filenum each have their own storage.
     */
    struct {
        long wordID;
        int  last_filenum;
    } u1;

    char             word[1];   /* the actual word starts here */
} ENTRY;
409    
/* Linked list node carrying one string */
struct swline {
    struct swline *next;
    struct swline *nodep;
    char          *line;
};
416    
417    
/* Types of word translation available for fuzzy indexing */
typedef enum {
    FUZZY_NONE             = 0,
    FUZZY_STEMMING         = 1,
    FUZZY_SOUNDEX          = 2,
    FUZZY_METAPHONE        = 3,
    FUZZY_DOUBLE_METAPHONE = 4
} FuzzyIndexType;
427    
428    
429    
430    
431    
432     typedef struct
433     {
434     /* vars for WordCharacters */
435     int lenwordchars;
436     char *wordchars;
437    
438     /* vars for BeginCharacters */
439     int lenbeginchars;
440     char *beginchars;
441    
442     /* vars for EndCharacters */
443     int lenendchars;
444     char *endchars;
445    
446     /* vars for IgnoreLastChar */
447     int lenignorelastchar;
448     char *ignorelastchar;
449    
450     /* vars for IgnoreFirstChar */
451     int lenignorefirstchar;
452     char *ignorefirstchar;
453    
454     /* vars for bump position chars */
455     int lenbumpposchars;
456     char *bumpposchars;
457    
458     /* vars for header values */
459     char *savedasheader;
460     int lensavedasheader;
461    
462     /* vars for numberchars */ /* Not yet stored in the header. */
463     int lennumberchars; /* Probably don't need it for searching */
464     char *numberchars;
465     int numberchars_used_flag;
466    
467    
468     int lenindexedon;
469     char *indexedon;
470    
471     int lenindexn;
472     char *indexn;
473    
474     int lenindexd;
475     char *indexd;
476    
477     int lenindexp;
478     char *indexp;
479    
480     int lenindexa;
481     char *indexa;
482    
483     int minwordlimit;
484     int maxwordlimit;
485    
486     FuzzyIndexType fuzzy_mode;
487    
488     /* Total files and words in index file */
489     int totalwords;
490     int totalfiles;
491    
492     /* var to specify how to ranking while indexing */
493     int ignoreTotalWordCountWhenRanking; /* added 11/24/98 - MG */
494    
495     int *TotalWordsPerFile;
496     int TotalWordsPerFileMax; /* max size of array - this isn't saved in the header */
497    
498    
499     /* Lookup tables for fast access */
500     int wordcharslookuptable[256];
501     int begincharslookuptable[256];
502     int endcharslookuptable[256];
503     int ignorefirstcharlookuptable[256];
504     int ignorelastcharlookuptable[256];
505     int bumpposcharslookuptable[256];
506     int translatecharslookuptable[256]; /* $$$ rasc 2001-02-21 */
507     int numbercharslookuptable[256]; /* Dec 12, 2001 - moseley -- mostly for ignoring numbers */
508    
509     /* values for handling stopwords */
510     struct swline *hashstoplist[HASHSIZE];
511     char **stopList;
512     int stopMaxSize;
513     int stopPos;
514    
515     /* This is an array of properties that are used */
516     /* These should not be in the header, rather in indexf as they are not written to disk */
517     int *propIDX_to_metaID;
518     int *metaID_to_PropIDX;
519     int property_count;
520    
521    
522    
523     /* Buzzwords hash */
524     int buzzwords_used_flag; /* flag to indicate that buzzwords are being used */
525     struct swline *hashbuzzwordlist[HASHSIZE];
526    
527     /* values for handling "use" words - > Unused in the search proccess */
528     int is_use_words_flag;
529     struct swline *hashuselist[HASHSIZE];
530    
531     /* Values for fields (metanames) */
532     struct metaEntry **metaEntryArray;
533     int metaCounter; /* Number of metanames */
534    
535     }
536     INDEXDATAHEADER;
537    
538     typedef struct IndexFILE
539     {
540     struct IndexFILE *next;
541     struct IndexFILE *nodep;
542    
543     char *line; /*Name of the index file */
544    
545     unsigned long total_bytes; /* Just to show total size when indexing */
546     unsigned long total_word_positions;
547    
548    
549     char **prop_string_cache; /* place to cache a result's string properties */
550     /* so caller (library) won't need to free */
551    
552     /* DB handle */
553     void *DB;
554    
555     /* Header Info */
556     INDEXDATAHEADER header;
557    
558     /* Pointer to cache the keywords */
559     char *keywords[256];
560    
561    
562     /* props IDs */
563     int *propIDToDisplay;
564     int *propIDToSort;
565    
566    
567     /* Support for merge */
568     int *meta_map; // maps metas from this index to the output index
569     int *path_order; // lists files in order of pathname
570     int current_file; // current file pointer, used for merged reading
571     struct metaEntry *path_meta; // meta entry for the path name
572     struct metaEntry *modified_meta;
573     propEntry *cur_prop; // last read pathname
574     int filenum; // current filenumber to use
575    
576    
577     /* Used by merge.c */
578     int *merge_file_num_map;
579     }
580     IndexFILE;
581    
582    
/* Head and tail pointers of a list of RESULTs */
typedef struct RESULT_LIST {
    struct RESULT *head;
    struct RESULT *tail;
    struct SWISH  *sw;   /* ** This is a waste of memory; it is only here because qsort */
                         /* ** only passes two values.  This can be fixed. */
} RESULT_LIST;
591    
592     typedef struct RESULT
593     {
594     struct RESULT *next;
595    
596     int count; /* result Entry-Counter */
597     int filenum; /* there's an extra four bytes we don't need */
598     FileRec fi; /* This is used to cache the properties and the seek index */
599     int rank;
600     int frequency;
601     int tfrequency; /* Total frequency of result */
602    
603     /* file position where this document's properties are stored */
604     char **PropSort;
605     int *iPropSort; /* Used for presorted data */
606     IndexFILE *indexf;
607    
608     RESULT_LIST *reslist; //* this is probably not needed, too.
609    
610     int posdata[1];
611     }
612     RESULT;
613    
/* Linked list whose nodes each point to a swline list */
struct multiswline {
    struct multiswline *next;
    struct swline      *list;
};
619    
620    
/* A counted array of pointers to ENTRY */
typedef struct {
    int            numWords;
    struct ENTRY **elist;   /* sorted by word */
} ENTRYARRAY;
627    
628    
629    
/* Linked list node carrying one URL string */
struct url_info {
    struct url_info *next;
    char            *url;
};
635    
/* Linked list node pairing a document type with a list of patterns */
struct IndexContents {
    struct IndexContents *next;
    int                   DocType;
    struct swline        *patt;
};
642    
/* Linked list node holding a document type, a field name and a size */
struct StoreDescription {
    struct StoreDescription *next;
    int                      DocType;
    char                    *field;
    int                      size;
};
650    
/*
 * int_st and int_lookup_st are used for lookup tables in order to save memory.
 * Normally metaname, frequency and structure are repetitive schemas and
 * usually also have low values.  In this way three values can be fitted in
 * just one using a lookup table.  Structure itself can use its own lookup table.
 */
struct int_st {
    struct int_st *next;
    int            index;
    int            val[1];
};
662    
663     struct int_lookup_st
664     {
665     int n_entries;
666     struct int_st *hash_entries[HASHSIZE];
667     struct int_st *all_entries[1];
668     };
669    
/*
 * char_st and char_lookup_st are used for lookup tables in order to save memory.
 * Normally parts of the path/url are repetitive schemas and usually also
 * have low values.
 */
struct char_st {
    struct char_st *next;
    int             index;
    char           *val;
};
679    
680     struct char_lookup_st
681     {
682     int n_entries;
683     struct char_st *hash_entries[HASHSIZE];
684     struct char_st *all_entries[1];
685     };
686    
687    
/* Place to store compiled regular expressions (linked list) */
typedef struct regex_list {
    struct regex_list *next;
    regex_t            re;
    char              *replace;
    int                replace_count;    /* number of pattern replacements - to estimate size of replacement string */
    int                replace_length;   /* newstr_max = replace_length + ( replace_count * search_str_len ) */
    int                global;           /* /g flag to repeat sub */
    int                negate;           /* flag: the match should be negated */
    char              *pattern;          /* string pattern kept around for debugging */
} regex_list;
701    
/* Linked list node pairing a meta entry with a regex list */
typedef struct path_extract_list {
    struct path_extract_list *next;
    struct metaEntry         *meta_entry;
    struct regex_list        *regex;
} path_extract_list;
708    
709    
710    
711     /* -- Property data types
712     -- Result handling structures, (types storage, values)
713     -- Warning! Changing types affects output routines, etc
714     -- 2001-01 rasc
715    
716     $$$ ToDO: data types are not yet fully supported by swish
717     $$$ Future: to be part of module data_types.c/h
718     */
719    
720    
/* Property datatypes */
typedef enum {
    PROP_UNDEFINED = -1,
    PROP_UNKNOWN   = 0,
    PROP_STRING    = 1,
    PROP_INTEGER   = 2,
    PROP_FLOAT     = 3,
    PROP_DATE      = 4,
    PROP_ULONG     = 5
} PropType;
732    
/* What to do with undefined meta names */
typedef enum {
    UNDEF_META_DISABLE = 0,   /* only for XMLAttributes - don't even try with attributes */
    UNDEF_META_INDEX   = 1,   /* index as plain text */
    UNDEF_META_AUTO    = 2,   /* create metaname if it doesn't exist */
    UNDEF_META_ERROR   = 3,   /* throw a nasty error */
    UNDEF_META_IGNORE  = 4    /* don't index */
} UndefMetaFlag;
743    
744    
/* Storage of the PropertyValue: one member per supported representation */
typedef union {
    char          *v_str;     /* strings */
    int            v_int;     /* integer */
    time_t         v_date;    /* date */
    double         v_float;   /* double float */
    unsigned long  v_ulong;   /* unsigned long */
} u_PropValue1;
754    
755     typedef struct
756     { /* Propvalue with type info */
757     PropType datatype;
758     u_PropValue1 value;
759     int destroy; /* flag to destroy (free) any pointer type */
760     }
761     PropValue;
762    
763    
764    
765     /* --------------------------------------- */
766    
767    
768    
/* Holds all results per index (linked list, one node per index) */
struct DB_RESULTS {
    struct DB_RESULTS  *next;

    /* Values for handling results */
    struct RESULT_LIST *resultlist;
    struct RESULT      *sortresultlist;
    struct RESULT      *currentresult;
};
778    
779     #define MAX_ERROR_STRING_LEN 500
780    
781     typedef struct SWISH
782     {
783     /* New module design structure data */
784     struct MOD_SearchAlt *SearchAlt; /* search_alt module data */
785     struct MOD_ResultOutput *ResultOutput; /* result_output module data */
786     struct MOD_Filter *Filter; /* filter module data */
787     struct MOD_ResultSort *ResultSort; /* result_sort module data */
788     struct MOD_Entities *Entities; /* html entities module data */
789     struct MOD_DB *Db; /* DB module data */
790     struct MOD_Search *Search; /* Search module data */
791     struct MOD_Index *Index; /* Index module data */
792     struct MOD_FS *FS; /* FileSystem Index module data */
793     struct MOD_HTTP *HTTP; /* HTTP Index module data */
794     struct MOD_Swish_Words *SwishWords; /* For parsing into "swish words" */
795     struct MOD_Prog *Prog; /* For extprog.c */
796     struct MOD_PropLimit *PropLimit; /* For proplimit.c */
797    
798    
799     /** General Purpose **/
800    
801     /* list of associated index files */
802     IndexFILE *indexlist;
803    
804    
805     unsigned char *Prop_IO_Buf; /* For compressing and uncompressing properties (static-like buffer) */
806     unsigned long PropIO_allocated;// total size of the structure
807     int PropCompressionLevel;
808    
809    
810     /* Total words and files in all index files */
811     int TotalWords;
812     int TotalFiles;
813    
814     /* verbose flag */
815     int verbose;
816    
817     /* Error vars */
818     int commonerror;
819     int lasterror;
820     char lasterrorstr[MAX_ERROR_STRING_LEN+1];
821    
822    
823     /* 06/00 Jose Ruiz */
824     int isvowellookuptable[256]; //??? is this used any place?
825    
826    
827     /********* Document Source info **********/
828    
829     /* structure for handling all the directories/files (IndexDIR) while indexing */
830     struct swline *dirlist;
831    
832     /* structure for handling IndexOnly config data while indexing */
833     struct swline *suffixlist;
834    
835    
836    
837    
838     /******** Structures for parsers **********/
839    
840    
841     /* Limit indexing by a file date */
842     time_t mtime_limit;
843    
844     long truncateDocSize; /* size of doc, at which it will be truncated (2001-03-16 rasc) */
845    
846    
847     /* structure for handling replace config data while searching */
848     regex_list *replaceRegexps;
849    
850    
851     /* It's common to want to limit searches to areas of a file or web space */
852     /* This allow extraction of a substring out of a file path, and indexed as a metaname */
853     path_extract_list *pathExtractList;
854    
855    
856    
857     /* structure for handling NoContents config data while searching */
858     struct swline *nocontentslist;
859    
860     /* 08/00 Jose Ruiz Values for document type support */
861     int DefaultDocType;
862    
863     /* maps file endings to document types */
864     struct IndexContents *indexcontents;
865    
866    
867     /* Should comments be indexed */
868     int indexComments;
869    
870    
871    
872     /******** Variables used by the parsers *********/
873    
874     /* 12/00 Jose Ruiz Values for summary support */
875     struct StoreDescription *storedescription;
876    
877    
878     /* structure to handle Ignoremeta metanames */
879     struct swline *ignoremetalist;
880    
881    
882     /* Structure for handling metatags from DontBumpPositionOnMetaTags */
883     struct swline *dontbumpstarttagslist;
884     struct swline *dontbumpendtagslist;
885    
886    
887     /* Undefined MetaName indexing options */
888     UndefMetaFlag UndefinedMetaTags;
889     UndefMetaFlag UndefinedXMLAttributes; // What to do with attributes libxml2 only
890    
891    
892    
893     /*** libxml2 additions ***/
894    
895     /* parser error warning level */
896     int parser_warn_level;
897    
898     int obeyRobotsNoIndex;
899    
900     /* for extracting links into a metaEntry */
901     struct metaEntry *links_meta;
902    
903     /* for extracting image hrefs into a metaEntry */
904     struct metaEntry *images_meta;
905    
906    
907     /* if allocated the meta name to store alt tags as */
908     int IndexAltTag;
909     char *IndexAltTagMeta; // use this meta-tag, if set
910    
911     /* for converting relative links in href's and img src tags absoulte */
912     int AbsoluteLinks;
913    
914    
915     /* structure to handle XMLClassAttributes - list of attributes to use content to make a metaname*/
916     /* <foo class="bar"> => generates a metaname foo.bar */
917     struct swline *XMLClassAttributes;
918    
919     }
920     SWISH;
921    
922    
/*
 * 06/00 Jose Ruiz
 * Structure StringList.  Stores words up to a number of n.
 */
typedef struct {
    int    n;
    char **word;
} StringList;
930    
931     /*
932     * This structure defines all of the functions that need to
933     * be implemented to an Indexing Data Source.
934     * Right now there are two Indexing Data Source types:
935     * file-system based and an HTTP web crawler.
936     * Any Data Source can be created as long as all of the
937     * functions below are properly initialized.
938     */
939     struct _indexing_data_source_def
940     {
941     const char *IndexingDataSourceName; /* long name for data source */
942     const char *IndexingDataSourceId; /* short name for data source */
943     void (*indexpath_fn) (SWISH * sw, char *path); /* routine to index a "path" */
944     int (*parseconfline_fn) (SWISH * sw, StringList *l); /* parse config file lines */
945     };
946    
947    
948    
949    
/*
 * VAR expands to nothing when GLOBAL_VARS is defined and to "extern"
 * otherwise, so the declaration below is only an extern reference unless
 * the including file defines GLOBAL_VARS first.
 */
#ifdef GLOBAL_VARS
#define VAR
#else
#define VAR extern
#endif


VAR struct _indexing_data_source_def *IndexingDataSource;
958    
959    
960    
961     void allocatedefaults(void);
962    
963     int SwishAttach(SWISH *);
964     SWISH *SwishNew(void);
965     void SwishFree(SWISH *);
966    
/*
 * safestrcpy(n, to, from) - bounded string copy that always NUL-terminates.
 *
 * strcpy doesn't check for overflow in the 'to' string, and strncpy doesn't
 * guarantee null byte termination.  The length of 'from' can't be checked
 * here since it is sometimes a function call, so 'from' is evaluated once.
 *
 *   n     size in bytes of the buffer 'to' (evaluated once)
 *   to    destination buffer (evaluated twice - must have no side effects)
 *   from  source string
 *
 * At most n-1 characters are kept and to[n-1] is always set to '\0'.
 * When n is 0 nothing is written (previously this stored to to[-1]).
 *
 * The body is wrapped in do { } while (0) so the macro behaves as a single
 * statement, e.g. in "if (c) safestrcpy(...); else ...".  It must be
 * followed by a semicolon like an ordinary function call.
 */
#define safestrcpy(n,to,from) \
    do { \
        size_t safestrcpy_n_ = (size_t)(n); \
        if (safestrcpy_n_ > 0) { \
            strncpy((to), (from), safestrcpy_n_); \
            (to)[safestrcpy_n_ - 1] = '\0'; \
        } \
    } while (0)
971    
/*
 * Jose Ruiz 04/00
 * Macro for copying positions between arrays of integers:
 * copies num integers into dest (starting at index posdest) from
 * orig (starting at index posorig).
 *
 * The earlier implementation was an element-by-element loop:
 *     {int i;for(i=0;i<num,i++) (dest)[i+(posdest)]=(orig)[i+(posorig)];}
 * It has been replaced by a single memcpy, so the two ranges must not overlap.
 */
#define CopyPositions(dest,posdest,orig,posorig,num) \
    memcpy((void *)&((int *)(dest))[(posdest)], \
           (const void *)&((int *)(orig))[(posorig)], \
           sizeof(int) * (num))
983    
984    
/* Min macro: yields the smaller argument; each argument may be evaluated more than once */
#define Min(a,b) (((b) > (a)) ? (a) : (b))
987    
988    
989    
990     /* C library prototypes */
991     SWISH *SwishOpen(char *); // depreciated
992     SWISH *SwishInit(char *);
993     void SwishClose(SWISH *);
994     void SwishResetSearch(SWISH *);
995     RESULT *SwishNext(SWISH *);
996     int SwishSearch(SWISH *, char *, int, char *, char *);
997     int SwishSeek(SWISH * sw, int pos);
998     char *SwishResultPropertyStr(SWISH *sw, RESULT *result, char *pname);
999     unsigned long SwishResultPropertyULong(SWISH *sw, RESULT *result, char *pname);
1000    
1001    
1002    
/* These are only checked in dump.c */
#define DEBUG_INDEX_HEADER       (1 << 0)
#define DEBUG_INDEX_WORDS        (1 << 1)
#define DEBUG_INDEX_WORDS_FULL   (1 << 2)
#define DEBUG_INDEX_STOPWORDS    (1 << 3)
#define DEBUG_INDEX_FILES        (1 << 4)
#define DEBUG_INDEX_METANAMES    (1 << 5)
#define DEBUG_INDEX_ALL          (1 << 6)
#define DEBUG_INDEX_WORDS_ONLY   (1 << 7)
#define DEBUG_INDEX_WORDS_META   (1 << 8)

/* These are only checked while indexing (they reuse the same bit positions as the dump.c group) */
#define DEBUG_WORDS              (1 << 0)
#define DEBUG_PARSED_WORDS       (1 << 1)
#define DEBUG_PROPERTIES         (1 << 2)
#define DEBUG_REGEX              (1 << 3)
#define DEBUG_PARSED_TAGS        (1 << 4)
#define DEBUG_PARSED_TEXT        (1 << 5)

/* These are only checked while searching */

/* These are checked everywhere (can't share bits) */


/* Declared here only; the definition lives in another translation unit */
extern unsigned int DEBUG_MASK;
1028    
1029     #ifdef __cplusplus
1030     }
1031     #endif /* __cplusplus */
1032    

  ViewVC Help
Powered by ViewVC 1.1.22