1 |
/* |
2 |
** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company |
3 |
** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94 |
4 |
** |
5 |
** This program and library is free software; you can redistribute it and/or |
6 |
** modify it under the terms of the GNU (Library) General Public License |
7 |
** as published by the Free Software Foundation; either version 2 |
8 |
** of the License, or any later version. |
9 |
** |
10 |
** This program is distributed in the hope that it will be useful, |
11 |
** but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 |
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 |
** GNU (Library) General Public License for more details. |
14 |
** |
15 |
** You should have received a copy of the GNU (Library) General Public License |
16 |
** along with this program; if not, write to the Free Software |
17 |
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
18 |
**-------------------------------------------------------------------------- |
19 |
** Config file edited by Roy Tennant 2/20/96 |
20 |
** Config file edited by Giulia Hill 2/27/97 to increase length of |
21 |
** words that are indexed |
22 |
** Added IGNORELASTCHAR |
23 |
** G. Hill 3/12/97 ghill@library.berkeley.edu |
24 |
** |
25 |
** Added OKNOMETA to allow no failing in case the META name is |
26 |
** not listed in the config.h |
27 |
** G. Hill 4/15/97 ghill@library.berkeley.edu |
28 |
** |
29 |
** Added IGNOREFIRSTCHAR |
30 |
** G.Hill 10/16/97 ghill@library.berkeley.edu |
31 |
**----------------------------------------------------------------------- |
32 |
** The following are user-definable options that you can change |
33 |
** to fine-tune SWISH's default options. |
34 |
** |
35 |
** 2001-03-13 rasc moved search boolean words from swish.h |
36 |
** |
37 |
** 2001-05-23 wsm added ranking weights |
38 |
** |
39 |
*/ |
40 |
|
41 |
|
42 |
/* File-name extensions for the files that accompany an index: the
** property file and, judging by the macro names, the word-data,
** presorted, btree and array files.
** VMS builds use an underscore-prefixed suffix in place of the
** dot-prefixed suffix used on every other platform.
*/
#ifdef __VMS
#define PROPFILE_EXTENSION  "_prop"
#define WORDDATA_EXTENSION  "_wdata"
#define PRESORTED_EXTENSION "_psort"
#define BTREE_EXTENSION     "_btree"
#define ARRAY_EXTENSION     "_array"
#else
#define PROPFILE_EXTENSION  ".prop"
#define WORDDATA_EXTENSION  ".wdata"
#define PRESORTED_EXTENSION ".psort"
#define BTREE_EXTENSION     ".btree"
#define ARRAY_EXTENSION     ".array"
#endif
55 |
|
56 |
/* MIN_PROP_COMPRESS_SIZE sets the limit for which properties are compressed |
57 |
* must be compiled with zlib. |
58 |
*/ |
59 |
#define MIN_PROP_COMPRESS_SIZE 100 |
60 |
|
61 |
/* This is the character used to replace UTF-8 characters that cannot be |
62 |
* converted to 8859-1 Latin-1 character |
63 |
*/ |
64 |
#define ENCODE_ERROR_CHAR ' ' |
65 |
|
66 |
/* Note: the file extension used on the property file is set by PROPFILE_EXTENSION. |
67 |
*/ |
68 |
|
69 |
#define MAX_SORT_STRING_LEN 50 |
70 |
|
71 |
/* MAX_SORT_STRING_LEN defines the max string length to use |
72 |
* for sorting properties. Should be long enough to sort ALL |
73 |
* file paths or URLs. Useful if using StoreDescription to store |
74 |
* a large amount of text. |
75 |
*/ |
76 |
|
77 |
#define USE_DOCPATH_AS_TITLE 1 |
78 |
|
79 |
/* If USE_DOCPATH_AS_TITLE is defined then documents that do not have |
80 |
* a title defined (xml and txt, and HTML documents without a title) |
81 |
* will display the document path as the title in results. |
82 |
* Documents without a title will sort as a blank title, and not |
83 |
* by the document path regardless of this setting. This is a change |
84 |
* from versions previous to 2.2. |
85 |
*/ |
86 |
|
87 |
/* Extension appended to index file names while indexing is in progress.
** VMS builds use "_temp"; all other platforms use ".temp".
*/
#ifdef __VMS
#define USE_TEMPFILE_EXTENSION "_temp"
#else
#define USE_TEMPFILE_EXTENSION ".temp"
#endif
92 |
|
93 |
/* If USE_TEMPFILE_EXTENSION is defined then swish will append the supplied |
94 |
* extension onto the index files during indexing, and when indexing is |
95 |
* complete will remove the extension by renaming the files. |
96 |
* This has two important uses when an index file already exists (and is in use): |
97 |
* 1) the old index can be used while indexing is running |
98 |
* 2) a failure during indexing will not destroy the existing index |
99 |
* |
100 |
* Note: This is used instead of a normal temporary file because of possible limitations |
101 |
* in renaming across file systems. Therefore, the temporary index files are |
102 |
* stored in the same directory as the final index files. |
103 |
*/ |
104 |
|
105 |
#define TEMP_FILE_PREFIX "swtmp" |
106 |
|
107 |
/* TEMP_FILE_PREFIX is prepended to all temporary files. Makes them |
108 |
* easier to find. |
109 |
*/ |
110 |
|
111 |
|
112 |
#define ALLOW_HTTP_INDEXING_DATA_SOURCE 1 |
113 |
#define ALLOW_FILESYSTEM_INDEXING_DATA_SOURCE 1 |
114 |
#define ALLOW_EXTERNAL_PROGRAM_DATA_SOURCE 1 |
115 |
|
116 |
/* These symbols allow compile-time elimination of indexing |
117 |
** data sources. Any Data Source that is allowed by these |
118 |
** symbols can be selected for indexing from the command line. |
119 |
** Comment out any options you do not want to support, but |
120 |
** be sure to leave at least one option. |
121 |
*/ |
122 |
|
123 |
#define INDEXPERMS 0644 |
124 |
|
125 |
/* After SWISH generates an index file, it changes the permissions |
126 |
** of the file to this mode. Change to the mode you like |
127 |
** (note that it must be an octal number). If you don't want |
128 |
** permissions to be changed for you, comment out this line. |
129 |
*/ |
130 |
|
131 |
#define NO_PLIMIT 101 |
132 |
|
133 |
#define PLIMIT NO_PLIMIT |
134 |
#define FLIMIT 10000 |
135 |
|
136 |
/* SWISH uses these parameters to automatically mark words as |
137 |
** being too common while indexing. For instance, if I defined PLIMIT |
138 |
** as 80 and FLIMIT as 256, SWISH would define a common word as |
139 |
** a word that occurs in over 80% of all indexed files and over |
140 |
** 256 files. Making these numbers lower will most likely make your |
141 |
** index files smaller. Making PLIMIT and FLIMIT small will also |
142 |
** ensure that searching consumes only so much CPU resources. |
143 |
*/ |
144 |
|
145 |
#define VERBOSE 1 |
146 |
|
147 |
/* You can define VERBOSE to be a number from 0 to 4. 0 is totally |
148 |
** silent operation. The default before swish 2.2 was 3 |
149 |
*/ |
150 |
|
151 |
#define _AND_WORD "and" |
152 |
#define _OR_WORD "or" |
153 |
#define _NOT_WORD "not" |
154 |
|
155 |
/* |
156 |
** these are the default boolean operator words used by swish search |
157 |
*/ |
158 |
|
159 |
#define DEFAULT_RULE AND_RULE |
160 |
|
161 |
/* If a list of search words is specified without booleans, |
162 |
** SWISH will assume they are connected by a default rule. |
163 |
** This can be AND_RULE or OR_RULE. |
164 |
*/ |
165 |
|
166 |
#define TITLETOPLINES 12 |
167 |
|
168 |
/* This is how many lines deep SWISH will look into an HTML file to |
169 |
** attempt to find a <TITLE> tag. This has no effect when using the libxml2 parser. |
170 |
*/ |
171 |
|
172 |
|
173 |
#define MINWORDLIMIT 1 |
174 |
|
175 |
/* This is the minimum length of a word. Anything shorter will not |
176 |
** be indexed. |
177 |
** Do not change it here. Use MinWordLimit in config file |
178 |
*/ |
179 |
|
180 |
#define MAXWORDLIMIT 40 |
181 |
|
182 |
/* This is the maximum length of a word. Anything longer will not |
183 |
** be indexed. |
184 |
** Do not change it here. Use MaxWordLimit in config file |
185 |
*/ |
186 |
|
187 |
#define CONVERTHTMLENTITIES 1 |
188 |
|
189 |
/* If defined as 1, all entities in indexed |
190 |
** words will be converted to an ASCII equivalent. For instance, |
191 |
** with this feature you can index the word "resum&eacute;" or |
192 |
** "resum&#233;" and it will be indexed as the word "resume". |
193 |
** 2001-01 Do not change it here. Use ConvertHTMLEntities Yes/No in |
194 |
** config file |
195 |
*/ |
196 |
|
197 |
#define IGNOREALLV 0 |
198 |
#define IGNOREALLC 0 |
199 |
#define IGNOREALLN 0 |
200 |
|
201 |
/* If IGNOREALLV is 1, words containing all vowels won't be indexed. |
202 |
** If IGNOREALLC is 1, words containing all consonants won't be indexed. |
203 |
** If IGNOREALLN is 1, words containing all digits won't be indexed. |
204 |
** Define as 0 to allow words with consistent characters. |
205 |
** Vowels are defined as "aeiou", digits are "0123456789". |
206 |
*/ |
207 |
|
208 |
#define IGNOREROWV 60 |
209 |
#define IGNOREROWC 60 |
210 |
#define IGNOREROWN 60 |
211 |
|
212 |
/* IGNOREROWV is the maximum number of consecutive vowels a word can have. |
213 |
** IGNOREROWC is the maximum number of consecutive consonants a word can have. |
214 |
** IGNOREROWN is the maximum number of consecutive digits a word can have. |
215 |
** Vowels are defined as "aeiou", digits are "0123456789". |
216 |
*/ |
217 |
|
218 |
#define IGNORESAME 100 |
219 |
|
220 |
/* IGNORESAME is the maximum times a character can repeat in a word. |
221 |
*/ |
222 |
/* Dec 6, 2001 - Grabbed "letters" from /usr/local/share/aspell/iso8859-1.dat (http://aspell.sf.net) - moseley */ |
223 |
#define WORDCHARS "0123456789abcdefghijklmnopqrstuvwxyzªµºÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ" |
224 |
|
225 |
/* |
226 |
#define WORDCHARS "abcdefghijklmnopqrstuvwxyzÁÂÃÈýÊËÌÐÝÞÍðÎÏÒÓÔÕØÙÛîèãõßø£ÜíÀ0123456789" |
227 |
*/ |
228 |
|
229 |
/* WORDCHARS is a string of characters which SWISH permits to |
230 |
** be in words. Words are defined by these characters. |
231 |
** |
232 |
** Also note that if you specify the backslash character (\) or |
233 |
** double quote (") you need to type a backslash before them to |
234 |
** make the compiler understand them. |
235 |
*/ |
236 |
|
237 |
#define BEGINCHARS "0123456789abcdefghijklmnopqrstuvwxyzªµºÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ" |
238 |
|
239 |
/* Of the characters that you decide can go into words, this is |
240 |
** a list of characters that words can begin with. It should be |
241 |
** a subset of (or equal to) WORDCHARS. |
242 |
*/ |
243 |
|
244 |
#define ENDCHARS "0123456789abcdefghijklmnopqrstuvwxyzªµºÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ" |
245 |
|
246 |
/* This is the same as BEGINCHARS, except you're testing for |
247 |
** valid characters at the ends of words. |
248 |
*/ |
249 |
|
250 |
#define IGNORELASTCHAR "" |
251 |
|
252 |
/* Array that contains the char that, if considered valid in the middle of |
253 |
** a word needs to be disregarded when at the end. It is important to also |
254 |
** set the given char's in the ENDCHARS array, otherwise the word will not |
255 |
** be indexed because considered invalid. |
256 |
** If none just leave the empty list "". Do not erase the line. |
257 |
*/ |
258 |
|
259 |
#define IGNOREFIRSTCHAR "" |
260 |
|
261 |
/* Array that contains the char that, if considered valid in the middle of |
262 |
** a word needs to be disregarded when at the beginning. It is important to also |
263 |
** set the given char's in the BEGINCHARS array, otherwise the word will not |
264 |
** be indexed because considered invalid. |
265 |
** If none just leave the empty list "". Do not erase the line. |
266 |
*/ |
267 |
|
268 |
#define IGNORE_STOPWORDS_IN_QUERY 1 |
269 |
|
270 |
/* Added JM 1/10/98. Setting this to 0 causes a stopword in |
271 |
** an AND_RULE search to create an empty result. Setting it to 1 (the value |
272 |
** defined above) simply ignores the stopwords and does a search on the remaining words. |
273 |
*/ |
274 |
|
275 |
#define INDEXTAGS 0 |
276 |
|
277 |
/* Normally, all data in tags in HTML files (except for words in |
278 |
** comments or meta tags) is ignored. If you want to index HTML files with the |
279 |
** text within tags and all, define this to be 1 and not 0. |
280 |
** NOTE: if you set it to 1 you will not be able to do context nor |
281 |
** metaNames searches, as tags are just plain text with no specific |
282 |
** meaning. |
283 |
*/ |
284 |
|
285 |
// #define BLANK_PROP_VALUE " *BLANK*" |
286 |
|
287 |
/* This affects how blank properties are stored |
288 |
** Normally, blank properties are treated as if they were not even contained in |
289 |
** the document. That is: |
290 |
** <meta name="author" content=""> |
291 |
** is ignored, and no "author" property is stored for that document. |
292 |
** If BLANK_PROP_VALUE is set, then blank properties will be stored |
293 |
** but using the string provided as the property value. |
294 |
** If you use a leading space, then these properties will sort |
295 |
** before other properties (since leading whitespace is removed from |
296 |
** properties), and after documents that do not include the property |
297 |
*/ |
298 |
|
299 |
#define RANK_TITLE 4 |
300 |
#define RANK_HEADER 3 |
301 |
#define RANK_META 3 |
302 |
#define RANK_COMMENTS 1 |
303 |
#define RANK_EMPHASIZED 0 |
304 |
|
305 |
/* These symbols affect the weights applied during ranking. Note that they are added |
306 |
** together and added to a base rank of 1.0 -- thus defining a rank with a value of |
307 |
** 2.0 really means it is ranked (1.0 + 2.0) times greater than normal. |
308 |
** A value of 0.0 applies no additional ranking boost. Note that RANK_COMMENTS only |
309 |
** applies if you are indexing comments. Be sure you understand how these interact |
310 |
** in getrank; don't just go changing these values! |
311 |
*/ |
312 |
|
313 |
#define SPIDERDIRECTORY "./" |
314 |
|
315 |
#define SWAP_LOC_DEFAULT 0 |
316 |
|
317 |
/* 2001/08 jmruiz -- Default chunk size - Index will work with blocks of files. This number specifies when to coalesce locations to save memory */ |
318 |
#define INDEX_DEFAULT_CHUNK_SIZE 10000 |
319 |
|
320 |
/* 2001/08 jmruiz -- Default optimal zone size for temporary storage of locations.
** 1<<23 is 8 MB.
** The replacement list is parenthesized so the macro expands as a single
** operand inside larger expressions (e.g. X + 1, X / 2).
*/
#define INDEX_DEFAULT_OPTIMAL_CHUNK_ZONE_SIZE_FOR_LOCATIONS (1 << 23)
323 |
|
324 |
/* 2002/06 Number of swap loc files (-e) */ |
325 |
#define MAX_LOC_SWAP_FILES 377 |
326 |
|
327 |
/* 2001/08 jmruiz -- To avoid emalloc/erealloc in some routines some stack arrays have been added. This is their default size */ |
328 |
#define MAX_STACK_POSITIONS 1024 |
329 |
|
330 |
/* 2001/08 jmruiz -- Do not change this (it must be an unsigned number).
** This is the maximum size of a block of coalesced locations.
** The replacement list is parenthesized so the macro expands as a single
** operand inside larger expressions (e.g. X - 1, X / 4).
*/
#define COALESCE_BUFFER_MAX_SIZE (1 << 18)   /* (256 KB) */
333 |
|
334 |
/* 2001/08 jmruiz -- File System sort flag - 0 means that filenames |
335 |
** will not be indexed - 1 means that filenames will be indexed */ |
336 |
#define SORT_FILENAMES 0 |
337 |
|
338 |
/* 2001/10 jmruiz -- Added BTREE schema to store words */ |
339 |
|
340 |
//#define USE_BTREE |
341 |
|
342 |
|
343 |
/* 09/00 Jose Ruiz. When set to 1 part of the info is swapped to disk |
344 |
** to save memory in the index process |
345 |
** Do not change it. You can activate this option through the command |
346 |
** line (option -e) |
347 |
*/ |
348 |
|
349 |
/* Set this to 1 if you are compiling under Win32 |
350 |
define _WIN32 1 |
351 |
*/ |
352 |
|
353 |
/* --- BEGIN PORTING-RELATED SYMBOLS --- */ |
354 |
|
355 |
/* Platform adjustments for Win32 and VMS builds. */
#ifdef _WIN32
#define NO_SYMBOLIC_FILE_LINKS  /* Win32 has no symbolic links */
#undef INDEXPERMS               /* Win32 version doesn't use chmod() */
typedef int pid_t;              /* process ID */
#endif

#ifdef __VMS
#define NO_SYMBOLIC_FILE_LINKS  /* VMS has no symbolic links */
#endif
370 |
|
371 |
//#ifdef _WIN32 |
372 |
//#define TMPDIR "c:\\windows\\temp" |
373 |
//#elif defined(__VMS) |
374 |
//#define TMPDIR "sys$scratch:" |
375 |
//#else |
376 |
//#define TMPDIR "/var/tmp" |
377 |
//#endif |
378 |
|
379 |
|
380 |
/* Default Delimiter of phrase search */ |
381 |
#define PHRASE_DELIMITER_CHAR '"' |
382 |
|
383 |
|
384 |
/*
 * fopen() mode strings.
 *
 * Binary files must be opened with the "b" option under Win32, so all
 * fopen() calls on index files have to use the *_BINARY mode strings
 * to keep the code portable.
 * Note: text files should be opened normally, without the "b" option,
 * otherwise end-of-line processing is not done correctly (on Win32).
 */
#define F_READ_BINARY           "rb"
#define F_WRITE_BINARY          "wb"
#define F_READWRITE_BINARY      "rb+"

#define F_READ_TEXT             "r"
#define F_WRITE_TEXT            "w"
#define F_READWRITE_TEXT        "r+"
398 |
|
399 |
|
400 |
|
401 |
/* #define NEXTSTEP */ |
402 |
|
403 |
/* You may need to define this if compiling on a NeXTstep machine. |
404 |
*/ |
405 |
|
406 |
/* --- END PORTING-RELATED SYMBOLS --- */ |
407 |
|